|
@@ -1,52 +1,83 @@
|
|
|
package luatask
|
|
|
|
|
|
import (
|
|
|
- "encoding/json"
|
|
|
+ "fmt"
|
|
|
"github.com/donnie4w/go-logger/logger"
|
|
|
+ "go.mongodb.org/mongo-driver/bson"
|
|
|
qu "qfw/util"
|
|
|
"sync"
|
|
|
"time"
|
|
|
"util"
|
|
|
)
|
|
|
|
|
|
+const NEWTASK_LISTERR, NEWTASK_DATAINFOERR, NEWTASK_RATEERR, NEWTASK_DOWNLOADERR, NEWTASK_DATAINFOWARN = "1", "2", "4", "5", "6"
|
|
|
+
|
|
|
var NewCodeInfoMap = map[string]*NewSpider{}
|
|
|
+var LuaErrTypeInfo = map[string]string{
|
|
|
+ NEWTASK_LISTERR: "列表页异常",
|
|
|
+ NEWTASK_DATAINFOERR: "数据异常错误",
|
|
|
+ NEWTASK_RATEERR: "采集频率异常",
|
|
|
+ NEWTASK_DOWNLOADERR: "下载异常",
|
|
|
+ NEWTASK_DATAINFOWARN: "数据异常警告",
|
|
|
+}
|
|
|
+var DataInfoErrMap = map[int]string{
|
|
|
+ 1: "Save Coll Error",
|
|
|
+ 2: "File Size Or Url Error",
|
|
|
+ 4: "Field Value Is Null",
|
|
|
+ 9: "Html Contains Temp Language",
|
|
|
+ 10: "Publishtime Is Error",
|
|
|
+ 11: "Publishtime Is Zero",
|
|
|
+ 12: "Field Type Error",
|
|
|
+}
|
|
|
+var DataInfoWarnMap = map[int]string{
|
|
|
+ 5: "Field Value Contains Random Code",
|
|
|
+ 6: "Field Value Not Contains Chinese",
|
|
|
+ 8: "Detail File Err",
|
|
|
+}
|
|
|
|
|
|
type NewSpider struct {
|
|
|
//爬虫基本信息
|
|
|
- Code string `json:"code"`
|
|
|
- Site string `json:"site"`
|
|
|
- Channel string `json:"channel"`
|
|
|
- Platform string `json:"platform"`
|
|
|
- Event int `json:"event"`
|
|
|
- PendState int `json:"pendstate"`
|
|
|
- ModifyUser string `json:"modifyuser"`
|
|
|
- ModifyId string `json:"modifyuserid"`
|
|
|
- ModifyTime int64 `json:"modifytime"`
|
|
|
- Model int `json:"model"`
|
|
|
- Working int `json:"working"`
|
|
|
- AuditTime int64 `json:"l_uploadtime"`
|
|
|
- ListIsFilter bool `json:"listisfilter"`
|
|
|
- TaskTags map[string]interface{} `json:"tasktags"`
|
|
|
+ Code string `bson:"code"`
|
|
|
+ Site string `bson:"site"`
|
|
|
+ Channel string `bson:"channel"`
|
|
|
+ Platform string `bson:"platform"`
|
|
|
+ Event int `bson:"event"`
|
|
|
+ PendState int `bson:"pendstate"`
|
|
|
+ ModifyUser string `bson:"modifyuser"`
|
|
|
+ ModifyId string `bson:"modifyuserid"`
|
|
|
+ ModifyTime int64 `bson:"modifytime"`
|
|
|
+ Model int `bson:"model"`
|
|
|
+ Working int `bson:"working"`
|
|
|
+ AuditTime int64 `bson:"l_uploadtime"`
|
|
|
+ ListIsFilter bool `bson:"listisfilter"`
|
|
|
+ TaskTags map[string]interface{} `bson:"tasktags"`
|
|
|
//统计信息
|
|
|
- Detail_DownloadNum int `json:"detail_downloadnum"`
|
|
|
- Detail_DownloadSuccessNum int `json:"detail_downloadsuccessnum"`
|
|
|
- Detail_DownloadFailNum int `json:"detail_downloadfailnum"`
|
|
|
- List_IsGetData bool `json:"list_isgetdata"`
|
|
|
- List_RunTimes int `json:"list_runtimes"`
|
|
|
- List_NoDataTimes int `json:"list_nodatatimes"`
|
|
|
- List_AllInTimes int `json:"list_allintimes"`
|
|
|
- WarnInfoMap map[int]*WarnInfo `json:"warninfo"`
|
|
|
+ Detail_DownloadNum int `bson:"detail_downloadnum"`
|
|
|
+ Detail_DownloadSuccessNum int `bson:"detail_downloadsuccessnum"`
|
|
|
+ Detail_DownloadFailNum int `bson:"detail_downloadfailnum"`
|
|
|
+ List_IsGetData bool `bson:"list_isgetdata"`
|
|
|
+ List_RunTimes int `bson:"list_runtimes"`
|
|
|
+ List_NoDataTimes int `bson:"list_nodatatimes"`
|
|
|
+ List_AllInTimes int `bson:"list_allintimes"`
|
|
|
+ WarnInfoMap map[int]*WarnInfo `bson:"warninfo"`
|
|
|
//python
|
|
|
- Py_TaskId string `json:"py_taskid"`
|
|
|
- Py_NodeName string `json:"py_nodename"`
|
|
|
+ Py_TaskId string `bson:"py_taskid"`
|
|
|
+ Py_NodeName string `bson:"py_nodename"`
|
|
|
+ Py_IsValid bool `bson:"py_isvalid"`
|
|
|
//补充信息
|
|
|
- Comeintime int64 `json:"comeintime"`
|
|
|
+ Comeintime int64 `bson:"comeintime"`
|
|
|
+ //异常汇总
|
|
|
+ //Error map[string]*ErrorInfo `json:"error"`
|
|
|
+ ErrType int `bson:"errtype"` //记录权重最高的异常类型
|
|
|
+ ErrTypeMap map[int]bool `bson:"errtypemap"` //记录所有异常
|
|
|
+ ErrDescription string `bson:"errdescription"` //异常描述
|
|
|
}
|
|
|
|
|
|
type WarnInfo struct {
|
|
|
- Info string `json:"info"`
|
|
|
- Num int `json:"num"`
|
|
|
- Fields map[string]int `json:"fields"`
|
|
|
+ Info string `bson:"info"`
|
|
|
+ Num int `bson:"num"`
|
|
|
+ Fields map[string]int `bson:"fields"`
|
|
|
+ Hrefs map[string]string `bson:"hrefs"`
|
|
|
}
|
|
|
|
|
|
func NewStartTask() {
|
|
@@ -55,8 +86,8 @@ func NewStartTask() {
|
|
|
getCodeBaseInfo() //获取爬虫基本信息
|
|
|
getPythonSummaryInfo() //获取python汇总信息
|
|
|
getLuaSummaryInfo() //获取lua汇总信息
|
|
|
- getWarnInfo() //异常信息汇总
|
|
|
- saveCodeInfo() //保存记录
|
|
|
+ getSpiderWarnInfo() //获取异常数据
|
|
|
+ saveCodeInfo() //汇总异常信息,产出任务
|
|
|
}
|
|
|
|
|
|
func getCodeBaseInfo() {
|
|
@@ -112,14 +143,20 @@ func getCodeBaseInfo() {
|
|
|
}()
|
|
|
sp := &NewSpider{
|
|
|
WarnInfoMap: map[int]*WarnInfo{},
|
|
|
+ //Error: map[string]*ErrorInfo{},
|
|
|
+ ErrType: -1,
|
|
|
+ ErrTypeMap: map[int]bool{},
|
|
|
}
|
|
|
- luaByte, _ := json.Marshal(tmp)
|
|
|
- if json.Unmarshal(luaByte, &sp) != nil {
|
|
|
+ luaByte, _ := bson.Marshal(tmp)
|
|
|
+ if bson.Unmarshal(luaByte, &sp) != nil {
|
|
|
qu.Info("初始化爬虫失败:", tmp["_id"])
|
|
|
return
|
|
|
}
|
|
|
sp.Working = util.CodeEventWorking[sp.Working]
|
|
|
sp.Model = util.CodeEventModel[sp.Event]
|
|
|
+ if sp.Platform == "python" {
|
|
|
+ sp.Model = 1
|
|
|
+ }
|
|
|
lock.Lock()
|
|
|
NewCodeInfoMap[sp.Code] = sp
|
|
|
lock.Unlock()
|
|
@@ -130,7 +167,7 @@ func getCodeBaseInfo() {
|
|
|
tmp = map[string]interface{}{}
|
|
|
}
|
|
|
wg.Wait()
|
|
|
- logger.Info("爬虫基本信息准备完成...", len(CodeInfoMap))
|
|
|
+ logger.Info("爬虫基本信息准备完成...", len(NewCodeInfoMap))
|
|
|
}
|
|
|
|
|
|
func getPythonSummaryInfo() {
|
|
@@ -156,12 +193,7 @@ func getPythonSummaryInfo() {
|
|
|
wg.Done()
|
|
|
}()
|
|
|
code := qu.ObjToString(tmp["code"])
|
|
|
- if is_valid, _ := tmp["is_valid"].(bool); !is_valid { //无效监控爬虫
|
|
|
- lock.Lock()
|
|
|
- delete(NewCodeInfoMap, code)
|
|
|
- lock.Unlock()
|
|
|
- return
|
|
|
- }
|
|
|
+ is_valid, _ := tmp["is_valid"].(bool) //无效监控爬虫
|
|
|
py_taskid := qu.ObjToString(tmp["py_taskid"])
|
|
|
py_nodename := qu.ObjToString(tmp["py_nodename"])
|
|
|
list_isgetdata, _ := tmp["list_isgetdata"].(bool)
|
|
@@ -175,6 +207,7 @@ func getPythonSummaryInfo() {
|
|
|
if sp := NewCodeInfoMap[code]; sp != nil {
|
|
|
sp.Py_TaskId = py_taskid
|
|
|
sp.Py_NodeName = py_nodename
|
|
|
+ sp.Py_IsValid = is_valid
|
|
|
sp.List_IsGetData = list_isgetdata
|
|
|
sp.List_AllInTimes = list_allintimes
|
|
|
sp.List_NoDataTimes = list_nodatatimes
|
|
@@ -196,13 +229,13 @@ func getPythonSummaryInfo() {
|
|
|
}
|
|
|
|
|
|
func getLuaSummaryInfo() {
|
|
|
- getCodeHeart() //获取心跳信息
|
|
|
+ getSpiderHeart() //获取心跳信息
|
|
|
getSpiderHighListDownloadNum() //获取下载量信息
|
|
|
getSpiderListDownloadNum() //获取下载量信息
|
|
|
- getSpiderDownloadRateDataNew() //获取下载详情
|
|
|
+ getSpiderDownloadRateData() //获取下载详情
|
|
|
}
|
|
|
|
|
|
-func getWarnInfo() {
|
|
|
+func getSpiderWarnInfo() {
|
|
|
defer qu.Catch()
|
|
|
sess := util.MgoS.GetMgoConn()
|
|
|
defer util.MgoS.DestoryMongoConn(sess)
|
|
@@ -241,15 +274,20 @@ func getWarnInfo() {
|
|
|
}
|
|
|
code := qu.ObjToString(tmp["code"])
|
|
|
info := qu.ObjToString(tmp["info"])
|
|
|
+ href := qu.ObjToString(tmp["href"])
|
|
|
lock.Lock()
|
|
|
if sp := NewCodeInfoMap[code]; sp != nil {
|
|
|
if wf := sp.WarnInfoMap[infotype]; wf != nil {
|
|
|
+ if wf.Fields[field] == 0 {
|
|
|
+ wf.Hrefs[field] = href
|
|
|
+ }
|
|
|
wf.Fields[field] += 1
|
|
|
} else {
|
|
|
sp.WarnInfoMap[infotype] = &WarnInfo{
|
|
|
Info: info,
|
|
|
Num: 1,
|
|
|
Fields: map[string]int{field: 1},
|
|
|
+ Hrefs: map[string]string{field: href},
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -264,7 +302,7 @@ func getWarnInfo() {
|
|
|
logger.Info("错误信息数据统计完成...")
|
|
|
}
|
|
|
|
|
|
-func getCodeHeart() {
|
|
|
+func getSpiderHeart() {
|
|
|
defer qu.Catch()
|
|
|
sess := util.MgoS.GetMgoConn()
|
|
|
defer util.MgoS.DestoryMongoConn(sess)
|
|
@@ -309,7 +347,7 @@ func getCodeHeart() {
|
|
|
logger.Info("lua统计心跳信息完成...")
|
|
|
}
|
|
|
|
|
|
-func getSpiderHighListDownloadNum() {
|
|
|
+func getSpiderHighListDownloadNum() { //竞品数据暂未统计(延迟采集)
|
|
|
defer qu.Catch()
|
|
|
sess := util.MgoS.GetMgoConn()
|
|
|
defer util.MgoS.DestoryMongoConn(sess)
|
|
@@ -322,6 +360,7 @@ func getSpiderHighListDownloadNum() {
|
|
|
fields := map[string]interface{}{
|
|
|
"spidercode": 1,
|
|
|
"state": 1,
|
|
|
+ "times": 1,
|
|
|
}
|
|
|
lock := &sync.Mutex{}
|
|
|
wg := &sync.WaitGroup{}
|
|
@@ -339,13 +378,17 @@ func getSpiderHighListDownloadNum() {
|
|
|
}()
|
|
|
code := qu.ObjToString(tmp["spidercode"])
|
|
|
state := qu.IntAll(tmp["state"])
|
|
|
+ times := tmp["times"]
|
|
|
lock.Lock()
|
|
|
if sp := NewCodeInfoMap[code]; sp != nil {
|
|
|
if state == 1 {
|
|
|
sp.Detail_DownloadSuccessNum++
|
|
|
- } else {
|
|
|
+ } else if state == -1 {
|
|
|
+ sp.Detail_DownloadFailNum++
|
|
|
+ } else if state == 0 && times != nil {
|
|
|
sp.Detail_DownloadFailNum++
|
|
|
}
|
|
|
+ //未统计未下载的数据量 state==0,times==nil
|
|
|
sp.Detail_DownloadNum++
|
|
|
}
|
|
|
lock.Unlock()
|
|
@@ -427,7 +470,7 @@ func getSpiderListDownloadNum() {
|
|
|
logger.Info("lua统计spider_listdata采集量完成...")
|
|
|
}
|
|
|
|
|
|
-func getSpiderDownloadRateDataNew() {
|
|
|
+func getSpiderDownloadRateData() {
|
|
|
defer qu.Catch()
|
|
|
sess := util.MgoS.GetMgoConn()
|
|
|
defer util.MgoS.DestoryMongoConn(sess)
|
|
@@ -437,9 +480,9 @@ func getSpiderDownloadRateDataNew() {
|
|
|
date := qu.FormatDateByInt64(&StartTime, qu.Date_Short_Layout)
|
|
|
query := map[string]interface{}{
|
|
|
"date": date,
|
|
|
- "event": map[string]interface{}{
|
|
|
- "$ne": 7000,
|
|
|
- },
|
|
|
+ //"event": map[string]interface{}{
|
|
|
+ // "$ne": 7000,
|
|
|
+ //},
|
|
|
}
|
|
|
fields := map[string]interface{}{
|
|
|
"spidercode": 1,
|
|
@@ -484,7 +527,8 @@ func saveCodeInfo() {
|
|
|
wg := &sync.WaitGroup{}
|
|
|
ch := make(chan bool, 5)
|
|
|
comeintime := time.Now().Unix()
|
|
|
- arr := []map[string]interface{}{}
|
|
|
+ codeInfoArr := []map[string]interface{}{} //爬虫下载详情
|
|
|
+ taskArr := [][]map[string]interface{}{} //任务更新集
|
|
|
for _, spider := range NewCodeInfoMap {
|
|
|
ch <- true
|
|
|
wg.Add(1)
|
|
@@ -493,29 +537,386 @@ func saveCodeInfo() {
|
|
|
<-ch
|
|
|
wg.Done()
|
|
|
}()
|
|
|
+ getAllErr(sp) //汇总异常
|
|
|
+ createTask(sp, &taskArr, lock) //
|
|
|
sp.Comeintime = comeintime
|
|
|
- spByte, err := json.Marshal(sp)
|
|
|
+ spByte, err := bson.Marshal(sp)
|
|
|
if err != nil {
|
|
|
logger.Info("Json Marshal Error", sp.Code)
|
|
|
return
|
|
|
}
|
|
|
+ lock.Lock()
|
|
|
+ defer lock.Unlock()
|
|
|
tmp := map[string]interface{}{}
|
|
|
- if json.Unmarshal(spByte, &tmp) == nil {
|
|
|
- lock.Lock()
|
|
|
- arr = append(arr, tmp)
|
|
|
- if len(arr) > 500 {
|
|
|
- util.MgoS.SaveBulk("spider_info", arr...)
|
|
|
- arr = []map[string]interface{}{}
|
|
|
- }
|
|
|
- lock.Unlock()
|
|
|
+ if bson.Unmarshal(spByte, &tmp) == nil {
|
|
|
+ codeInfoArr = append(codeInfoArr, tmp)
|
|
|
+ } else {
|
|
|
+ logger.Error("Json UnMarshal Error", sp.Code)
|
|
|
+ return
|
|
|
+ }
|
|
|
+ if len(codeInfoArr) > 500 {
|
|
|
+ util.MgoS.SaveBulk("spider_info", codeInfoArr...)
|
|
|
+ codeInfoArr = []map[string]interface{}{}
|
|
|
+ }
|
|
|
+ if len(taskArr) > 500 {
|
|
|
+ util.MgoEB.UpSertBulk("newtask", taskArr...)
|
|
|
+ taskArr = [][]map[string]interface{}{}
|
|
|
}
|
|
|
-
|
|
|
}(spider)
|
|
|
}
|
|
|
wg.Wait()
|
|
|
- if len(arr) > 0 {
|
|
|
- util.MgoS.SaveBulk("spider_info", arr...)
|
|
|
- arr = []map[string]interface{}{}
|
|
|
+ if len(codeInfoArr) > 0 {
|
|
|
+ util.MgoS.SaveBulk("spider_info", codeInfoArr...)
|
|
|
+ codeInfoArr = []map[string]interface{}{}
|
|
|
+ }
|
|
|
+ if len(taskArr) > 0 {
|
|
|
+ util.MgoEB.UpSertBulk("newtask", taskArr...)
|
|
|
+ taskArr = [][]map[string]interface{}{}
|
|
|
}
|
|
|
+ NewCodeInfoMap = map[string]*NewSpider{}
|
|
|
logger.Info("爬虫统计完成...")
|
|
|
}
|
|
|
+
|
|
|
+func createTask(sp *NewSpider, taskArr *[][]map[string]interface{}, lock *sync.Mutex) {
|
|
|
+ defer qu.Catch()
|
|
|
+ if sp.Event == 7000 {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ if sp.ErrType == -1 { //无异常
|
|
|
+ return
|
|
|
+ }
|
|
|
+ state_new := 0
|
|
|
+ if sp.ErrType == 2 { //数据异常错误类型,任务状态1
|
|
|
+ state_new = 1
|
|
|
+ }
|
|
|
+ //查询历史任务
|
|
|
+ query := map[string]interface{}{
|
|
|
+ "s_code": sp.Code,
|
|
|
+ "i_state": map[string]interface{}{
|
|
|
+ "$in": []int{0, 1, 2, 3, 5}, //查询现有正在维护的任务
|
|
|
+ },
|
|
|
+ }
|
|
|
+ fields := map[string]interface{}{
|
|
|
+ "i_state": 1,
|
|
|
+ "s_type": 1,
|
|
|
+ "s_descript": 1,
|
|
|
+ "i_times": 1,
|
|
|
+ }
|
|
|
+ list, _ := util.MgoEB.Find("newtask", query, nil, fields, false, -1, -1)
|
|
|
+ update := []map[string]interface{}{}
|
|
|
+ if list != nil && len(*list) > 0 { //已有任务
|
|
|
+ if len(*list) > 1 {
|
|
|
+ logger.Error("Code:", sp.Code, "任务异常")
|
|
|
+ util.MgoEB.Save("luacreatetaskerr", map[string]interface{}{
|
|
|
+ "code": sp.Code,
|
|
|
+ "comeintime": time.Now().Unix(),
|
|
|
+ "tasknum": len(*list),
|
|
|
+ })
|
|
|
+ return
|
|
|
+ }
|
|
|
+ task := (*list)[0] //唯一任务
|
|
|
+ state_old := qu.IntAll(task["i_state"]) //历史任务状态
|
|
|
+ times_old := qu.IntAll(task["i_times"]) //历史任务次数
|
|
|
+ type_old := qu.ObjToString(task["s_type"]) //历史任务异常类型
|
|
|
+ descript_old := qu.ObjToString(task["s_descript"]) //历史任务描述
|
|
|
+
|
|
|
+ result := map[string]interface{}{
|
|
|
+ "i_event": sp.Event,
|
|
|
+ "l_updatetime": time.Now().Unix(),
|
|
|
+ "i_times": times_old + 1,
|
|
|
+ "s_descript": descript_old + time.Now().Format(qu.Date_Short_Layout) + "追加描述:------------------------------\n" + sp.ErrDescription,
|
|
|
+ }
|
|
|
+ //任务状态
|
|
|
+ if state_old == 0 {
|
|
|
+ result["i_state"] = 1 //第二次任务,将历史待确认任务升级为待处理
|
|
|
+ }
|
|
|
+ //任务类型
|
|
|
+ if sp.ErrType < qu.IntAll(type_old) { //取优先级高者
|
|
|
+ result["s_type"] = fmt.Sprint(sp.ErrType)
|
|
|
+ }
|
|
|
+ update = append(update, map[string]interface{}{"_id": task["_id"]})
|
|
|
+ update = append(update, map[string]interface{}{"$set": result})
|
|
|
+ lock.Lock()
|
|
|
+ *taskArr = append(*taskArr, update)
|
|
|
+ lock.Unlock()
|
|
|
+ } else { //无历史任务
|
|
|
+ saveMap := map[string]interface{}{
|
|
|
+ "s_modify": sp.ModifyUser,
|
|
|
+ "s_modifyid": sp.ModifyId,
|
|
|
+ "s_code": sp.Code,
|
|
|
+ "s_site": sp.Site,
|
|
|
+ "s_channel": sp.Channel,
|
|
|
+ "i_event": sp.Event,
|
|
|
+ "i_state": state_new,
|
|
|
+ "s_source": "程序",
|
|
|
+ "s_type": fmt.Sprint(sp.ErrType),
|
|
|
+ "s_descript": sp.ErrDescription,
|
|
|
+ "i_times": 1,
|
|
|
+ "l_comeintime": time.Now().Unix(),
|
|
|
+ "l_complete": util.CompleteTime("1"),
|
|
|
+ //"s_urgency": "1",
|
|
|
+ "s_platform": sp.Platform,
|
|
|
+ }
|
|
|
+ update = append(update, query)
|
|
|
+ update = append(update, saveMap)
|
|
|
+ lock.Lock()
|
|
|
+ *taskArr = append(*taskArr, update)
|
|
|
+ lock.Unlock()
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func getAllErr(sp *NewSpider) {
|
|
|
+ listErr(sp) //列表页异常
|
|
|
+ dataInfoErr(sp) //数据异常错误
|
|
|
+ downloadRateErr(sp) //下载频率异常
|
|
|
+ downloadFailedErr(sp) //下载异常
|
|
|
+ dataInfoWarn(sp) //数据异常警告
|
|
|
+}
|
|
|
+
|
|
|
+func listErr(sp *NewSpider) {
|
|
|
+ defer qu.Catch()
|
|
|
+ if sp.Platform == "python" && !sp.Py_IsValid {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ if !sp.List_IsGetData || sp.List_RunTimes == 0 {
|
|
|
+ errFlag := false
|
|
|
+ if sp.TaskTags != nil {
|
|
|
+ tagTime, _ := sp.TaskTags[NEWTASK_LISTERR].(int64) //用struct接收,会转为floa64
|
|
|
+ if tagTime == 0 { //无列表标记
|
|
|
+ errFlag = true
|
|
|
+ } else if tagTime > 0 && tagTime <= util.GetTime(-7) { //标记失效
|
|
|
+ errFlag = true
|
|
|
+ }
|
|
|
+ } else { //无任何标记
|
|
|
+ errFlag = true
|
|
|
+ }
|
|
|
+ if errFlag {
|
|
|
+ //sp.Error[NEWTASK_LISTERR] = &ErrorInfo{
|
|
|
+ // ErrInfo: map[string]bool{LuaErrTypeInfo[NEWTASK_LISTERR]: true},
|
|
|
+ //}
|
|
|
+ sp.ErrType = qu.IntAll(NEWTASK_LISTERR)
|
|
|
+ sp.ErrTypeMap[qu.IntAll(NEWTASK_LISTERR)] = true
|
|
|
+ sp.ErrDescription += "列表页异常:\n 列表页共采集" + fmt.Sprint(sp.List_RunTimes) + "轮,其中有" + fmt.Sprint(sp.List_NoDataTimes) + "轮无数据\n"
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+func dataInfoErr(sp *NewSpider) {
|
|
|
+ defer qu.Catch()
|
|
|
+ if len(sp.WarnInfoMap) > 0 {
|
|
|
+ errFlag := false
|
|
|
+ resultDescription := ""
|
|
|
+ for err, _ := range DataInfoErrMap {
|
|
|
+ if wf := sp.WarnInfoMap[err]; wf != nil {
|
|
|
+ tmpDescription := ""
|
|
|
+ for field, href := range wf.Hrefs {
|
|
|
+ tmpDescription += " 字段" + field + ":" + href + "\n"
|
|
|
+ }
|
|
|
+ if tmpDescription != "" {
|
|
|
+ resultDescription += " " + wf.Info + "\n" + tmpDescription
|
|
|
+ }
|
|
|
+ errFlag = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if errFlag {
|
|
|
+ //sp.Error[NEWTASK_DATAINFOERR] = &ErrorInfo{
|
|
|
+ // ErrInfo: map[string]bool{LuaErrTypeInfo[NEWTASK_DATAINFOERR]: true},
|
|
|
+ //}
|
|
|
+ sp.ErrDescription += "数据异常错误:\n" + resultDescription
|
|
|
+ sp.ErrTypeMap[qu.IntAll(NEWTASK_DATAINFOERR)] = true
|
|
|
+ if sp.ErrType < 0 {
|
|
|
+ sp.ErrType = qu.IntAll(NEWTASK_DATAINFOERR)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+}
|
|
|
+func downloadRateErr(sp *NewSpider) {
|
|
|
+ defer qu.Catch()
|
|
|
+ if sp.Platform == "python" && !sp.Py_IsValid {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ if sp.List_AllInTimes > 0 {
|
|
|
+ errFlag := false
|
|
|
+ if sp.Model == 1 { //分开采集,直接记录异常
|
|
|
+ errFlag = true
|
|
|
+ } else { //顺序采集
|
|
|
+ if sp.TaskTags != nil {
|
|
|
+ tagTime, _ := sp.TaskTags[NEWTASK_RATEERR].(int64)
|
|
|
+ if tagTime == 0 { //无列表标记
|
|
|
+ errFlag = true
|
|
|
+ } else if tagTime > 0 && tagTime <= util.GetTime(-7) { //标记失效
|
|
|
+ errFlag = true
|
|
|
+ }
|
|
|
+ } else { //无标记,记录列表页异常
|
|
|
+ errFlag = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if errFlag {
|
|
|
+ //sp.Error[NEWTASK_RATEERR] = &ErrorInfo{
|
|
|
+ // ErrInfo: map[string]bool{LuaErrTypeInfo[NEWTASK_RATEERR]: true},
|
|
|
+ //}
|
|
|
+ sp.ErrTypeMap[qu.IntAll(NEWTASK_RATEERR)] = true
|
|
|
+ sp.ErrDescription += "采集频率异常:\n 列表页共采集" + fmt.Sprint(sp.List_RunTimes) + "轮,其中有" + fmt.Sprint(sp.List_AllInTimes) + "轮数据全采\n"
|
|
|
+ if sp.ErrType < 0 {
|
|
|
+ sp.ErrType = qu.IntAll(NEWTASK_RATEERR)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+func downloadFailedErr(sp *NewSpider) {
|
|
|
+ defer qu.Catch()
|
|
|
+ if sp.Platform == "python" && !sp.Py_IsValid {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ if sp.Detail_DownloadFailNum > 0 {
|
|
|
+ tagTime := int64(-1)
|
|
|
+ if sp.TaskTags != nil {
|
|
|
+ tagTime, _ = sp.TaskTags[NEWTASK_DOWNLOADERR].(int64)
|
|
|
+ } else { //无标记,记录列表页异常
|
|
|
+ tagTime = 0
|
|
|
+ }
|
|
|
+ if tagTime > -1 {
|
|
|
+ if sp.Model == 1 { //分开采集(python爬虫默认分开采集模式)
|
|
|
+ errFlag := false
|
|
|
+ if sp.Detail_DownloadNum < 100 { //下载总量小于100
|
|
|
+ if sp.Detail_DownloadFailNum >= 3 { //失败个数超过3个
|
|
|
+ errFlag = true //异常
|
|
|
+ } else if float64(sp.Detail_DownloadFailNum)/float64(sp.Detail_DownloadNum) >= 0.2 { //失败占比超过20%
|
|
|
+ errFlag = true //异常
|
|
|
+ }
|
|
|
+ } else if sp.Detail_DownloadFailNum >= 3 { //下载总量大于100,失败个数超过3个
|
|
|
+ if float64(sp.Detail_DownloadFailNum)/float64(sp.Detail_DownloadNum) >= 0.03 { //失败占比超过3%
|
|
|
+ errFlag = true //异常
|
|
|
+ } else {
|
|
|
+ if sp.Detail_DownloadFailNum >= 30 { //失败个数超过30个
|
|
|
+ errFlag = true //异常
|
|
|
+ } else {
|
|
|
+ tagFlag := tagTime == util.GetTime(-7) //上次标记时间是否是7天前当天
|
|
|
+ if tagTime == 0 || !tagFlag { //系统打标记
|
|
|
+ //系统打标记
|
|
|
+ } else if tagFlag {
|
|
|
+ errFlag = true //异常
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if errFlag {
|
|
|
+ q := map[string]interface{}{
|
|
|
+ "$or": []interface{}{
|
|
|
+ map[string]interface{}{ //state=-1下载失败
|
|
|
+ "spidercode": sp.Code,
|
|
|
+ "state": -1,
|
|
|
+ "comeintime": map[string]interface{}{
|
|
|
+ "$gte": StartTime,
|
|
|
+ "$lt": EndTime,
|
|
|
+ },
|
|
|
+ },
|
|
|
+ map[string]interface{}{ //state=0,times存在,前一天未下载成功的
|
|
|
+ "spidercode": sp.Code,
|
|
|
+ "state": 0,
|
|
|
+ "times": map[string]interface{}{
|
|
|
+ "$exists": true,
|
|
|
+ },
|
|
|
+ "comeintime": map[string]interface{}{
|
|
|
+ "$gte": StartTime,
|
|
|
+ "$lt": EndTime,
|
|
|
+ },
|
|
|
+ },
|
|
|
+ },
|
|
|
+ }
|
|
|
+ sp.getErrHrefs("spider_highlistdata", NEWTASK_DOWNLOADERR, q)
|
|
|
+ sp.ErrTypeMap[qu.IntAll(NEWTASK_DOWNLOADERR)] = true
|
|
|
+ if sp.ErrType < 0 {
|
|
|
+ sp.ErrType = qu.IntAll(NEWTASK_DOWNLOADERR)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else { //顺序采集
|
|
|
+ //查询有无第一次记录(count=0),且下载失败的数据(count>0的数据表示该数据已经在采集当天统计过,不再二次统计)
|
|
|
+ q := map[string]interface{}{
|
|
|
+ "spidercode": sp.Code,
|
|
|
+ "count": 0,
|
|
|
+ "state": -1,
|
|
|
+ "comeintime": map[string]interface{}{
|
|
|
+ "$gte": StartTime,
|
|
|
+ "$lt": EndTime,
|
|
|
+ },
|
|
|
+ }
|
|
|
+ count := sp.getErrHrefs("spider_listdata", NEWTASK_DOWNLOADERR, q)
|
|
|
+ if count > 0 {
|
|
|
+ sp.ErrTypeMap[qu.IntAll(NEWTASK_DOWNLOADERR)] = true
|
|
|
+ if sp.ErrType < 0 {
|
|
|
+ sp.ErrType = qu.IntAll(NEWTASK_DOWNLOADERR)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+func dataInfoWarn(sp *NewSpider) {
|
|
|
+ defer qu.Catch()
|
|
|
+ if len(sp.WarnInfoMap) > 0 {
|
|
|
+ tagTime := int64(-1)
|
|
|
+ if sp.TaskTags != nil {
|
|
|
+ tagTime, _ = sp.TaskTags[NEWTASK_DATAINFOWARN].(int64)
|
|
|
+ } else { //无标记,记录列表页异常
|
|
|
+ tagTime = 0
|
|
|
+ }
|
|
|
+ if tagTime > -1 { //标记时间超时或无标记
|
|
|
+ errFlag := false
|
|
|
+ resultDescription := ""
|
|
|
+ for err, _ := range DataInfoWarnMap {
|
|
|
+ if wf := sp.WarnInfoMap[err]; wf != nil {
|
|
|
+ tmpDescription := ""
|
|
|
+ for field, href := range wf.Hrefs {
|
|
|
+ tmpDescription += " 字段" + field + ":" + href + "\n"
|
|
|
+ }
|
|
|
+ if tmpDescription != "" {
|
|
|
+ resultDescription += " " + wf.Info + "\n" + tmpDescription
|
|
|
+ }
|
|
|
+ errFlag = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if errFlag {
|
|
|
+ //sp.Error[NEWTASK_DATAINFOWARN] = &ErrorInfo{
|
|
|
+ // ErrInfo: map[string]bool{LuaErrTypeInfo[NEWTASK_DATAINFOWARN]: true},
|
|
|
+ //}
|
|
|
+ sp.ErrDescription += "数据异常警告:\n" + resultDescription
|
|
|
+ sp.ErrTypeMap[qu.IntAll(NEWTASK_DATAINFOWARN)] = true
|
|
|
+ if sp.ErrType < 0 {
|
|
|
+ sp.ErrType = qu.IntAll(NEWTASK_DATAINFOWARN)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+}
|
|
|
+func (sp *NewSpider) getErrHrefs(coll, errType string, query map[string]interface{}) (count int) {
|
|
|
+ defer qu.Catch()
|
|
|
+ if coll == "spider_listdata" { //
|
|
|
+ count = util.MgoS.Count(coll, query)
|
|
|
+ } else {
|
|
|
+ count = sp.Detail_DownloadFailNum
|
|
|
+ }
|
|
|
+ if count == 0 {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ sp.ErrDescription += LuaErrTypeInfo[NEWTASK_DOWNLOADERR] + ":共下载" + fmt.Sprint(sp.Detail_DownloadNum) + "条,失败" + fmt.Sprint(sp.Detail_DownloadFailNum) + "条\n"
|
|
|
+ if sp.Platform != "golua平台" {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ list, _ := util.MgoS.Find(coll, query, nil, `{"href":1}`, false, 0, 3)
|
|
|
+ if len(*list) > 0 {
|
|
|
+ //errHrefs := []*ErrRemark{}
|
|
|
+ for _, l := range *list {
|
|
|
+ href := qu.ObjToString(l["href"])
|
|
|
+ //errHrefs = append(errHrefs, &ErrRemark{Href: href})
|
|
|
+ sp.ErrDescription += " " + href + "\n"
|
|
|
+ }
|
|
|
+ //sp.Error[errType] = &ErrorInfo{
|
|
|
+ // Num: sp.Detail_DownloadFailNum,
|
|
|
+ // Err: errHrefs,
|
|
|
+ // ErrInfo: map[string]bool{LuaErrTypeInfo[errType]: true},
|
|
|
+ //}
|
|
|
+ }
|
|
|
+ return
|
|
|
+}
|