Browse Source

新版爬虫维护任务流程

maxiaoshan 2 years ago
parent
commit
8d9c1349e9
2 changed files with 468 additions and 64 deletions
  1. 465 64
      src/luatask/newtask.go
  2. 3 0
      src/timetask/random.go

+ 465 - 64
src/luatask/newtask.go

@@ -1,52 +1,83 @@
 package luatask
 
 import (
-	"encoding/json"
+	"fmt"
 	"github.com/donnie4w/go-logger/logger"
+	"go.mongodb.org/mongo-driver/bson"
 	qu "qfw/util"
 	"sync"
 	"time"
 	"util"
 )
 
+const NEWTASK_LISTERR, NEWTASK_DATAINFOERR, NEWTASK_RATEERR, NEWTASK_DOWNLOADERR, NEWTASK_DATAINFOWARN = "1", "2", "4", "5", "6"
+
 var NewCodeInfoMap = map[string]*NewSpider{}
+var LuaErrTypeInfo = map[string]string{
+	NEWTASK_LISTERR:      "列表页异常",
+	NEWTASK_DATAINFOERR:  "数据异常错误",
+	NEWTASK_RATEERR:      "采集频率异常",
+	NEWTASK_DOWNLOADERR:  "下载异常",
+	NEWTASK_DATAINFOWARN: "数据异常警告",
+}
+var DataInfoErrMap = map[int]string{
+	1:  "Save Coll Error",
+	2:  "File Size Or Url Error",
+	4:  "Field Value Is Null",
+	9:  "Html Contains Temp Language",
+	10: "Publishtime Is Error",
+	11: "Publishtime Is Zero",
+	12: "Field Type Error",
+}
+var DataInfoWarnMap = map[int]string{
+	5: "Field Value Contains Random Code",
+	6: "Field Value Not Contains Chinese",
+	8: "Detail File Err",
+}
 
 type NewSpider struct {
 	//爬虫基本信息
-	Code         string                 `json:"code"`
-	Site         string                 `json:"site"`
-	Channel      string                 `json:"channel"`
-	Platform     string                 `json:"platform"`
-	Event        int                    `json:"event"`
-	PendState    int                    `json:"pendstate"`
-	ModifyUser   string                 `json:"modifyuser"`
-	ModifyId     string                 `json:"modifyuserid"`
-	ModifyTime   int64                  `json:"modifytime"`
-	Model        int                    `json:"model"`
-	Working      int                    `json:"working"`
-	AuditTime    int64                  `json:"l_uploadtime"`
-	ListIsFilter bool                   `json:"listisfilter"`
-	TaskTags     map[string]interface{} `json:"tasktags"`
+	Code         string                 `bson:"code"`
+	Site         string                 `bson:"site"`
+	Channel      string                 `bson:"channel"`
+	Platform     string                 `bson:"platform"`
+	Event        int                    `bson:"event"`
+	PendState    int                    `bson:"pendstate"`
+	ModifyUser   string                 `bson:"modifyuser"`
+	ModifyId     string                 `bson:"modifyuserid"`
+	ModifyTime   int64                  `bson:"modifytime"`
+	Model        int                    `bson:"model"`
+	Working      int                    `bson:"working"`
+	AuditTime    int64                  `bson:"l_uploadtime"`
+	ListIsFilter bool                   `bson:"listisfilter"`
+	TaskTags     map[string]interface{} `bson:"tasktags"`
 	//统计信息
-	Detail_DownloadNum        int               `json:"detail_downloadnum"`
-	Detail_DownloadSuccessNum int               `json:"detail_downloadsuccessnum"`
-	Detail_DownloadFailNum    int               `json:"detail_downloadfailnum"`
-	List_IsGetData            bool              `json:"list_isgetdata"`
-	List_RunTimes             int               `json:"list_runtimes"`
-	List_NoDataTimes          int               `json:"list_nodatatimes"`
-	List_AllInTimes           int               `json:"list_allintimes"`
-	WarnInfoMap               map[int]*WarnInfo `json:"warninfo"`
+	Detail_DownloadNum        int               `bson:"detail_downloadnum"`
+	Detail_DownloadSuccessNum int               `bson:"detail_downloadsuccessnum"`
+	Detail_DownloadFailNum    int               `bson:"detail_downloadfailnum"`
+	List_IsGetData            bool              `bson:"list_isgetdata"`
+	List_RunTimes             int               `bson:"list_runtimes"`
+	List_NoDataTimes          int               `bson:"list_nodatatimes"`
+	List_AllInTimes           int               `bson:"list_allintimes"`
+	WarnInfoMap               map[int]*WarnInfo `bson:"warninfo"`
 	//python
-	Py_TaskId   string `json:"py_taskid"`
-	Py_NodeName string `json:"py_nodename"`
+	Py_TaskId   string `bson:"py_taskid"`
+	Py_NodeName string `bson:"py_nodename"`
+	Py_IsValid  bool   `bson:"py_isvalid"`
 	//补充信息
-	Comeintime int64 `json:"comeintime"`
+	Comeintime int64 `bson:"comeintime"`
+	//异常汇总
+	//Error          map[string]*ErrorInfo `json:"error"`
+	ErrType        int          `bson:"errtype"`        //记录权重最高的异常类型
+	ErrTypeMap     map[int]bool `bson:"errtypemap"`     //记录所有异常
+	ErrDescription string       `bson:"errdescription"` //异常描述
 }
 
 type WarnInfo struct {
-	Info   string         `json:"info"`
-	Num    int            `json:"num"`
-	Fields map[string]int `json:"fields"`
+	Info   string            `bson:"info"`
+	Num    int               `bson:"num"`
+	Fields map[string]int    `bson:"fields"`
+	Hrefs  map[string]string `bson:"hrefs"`
 }
 
 func NewStartTask() {
@@ -55,8 +86,8 @@ func NewStartTask() {
 	getCodeBaseInfo()      //获取爬虫基本信息
 	getPythonSummaryInfo() //获取python汇总信息
 	getLuaSummaryInfo()    //获取lua汇总信息
-	getWarnInfo()          //异常信息汇总
-	saveCodeInfo()         //保存记录
+	getSpiderWarnInfo()    //获取异常数据
+	saveCodeInfo()         //汇总异常信息,产出任务
 }
 
 func getCodeBaseInfo() {
@@ -112,14 +143,20 @@ func getCodeBaseInfo() {
 			}()
 			sp := &NewSpider{
 				WarnInfoMap: map[int]*WarnInfo{},
+				//Error:       map[string]*ErrorInfo{},
+				ErrType:    -1,
+				ErrTypeMap: map[int]bool{},
 			}
-			luaByte, _ := json.Marshal(tmp)
-			if json.Unmarshal(luaByte, &sp) != nil {
+			luaByte, _ := bson.Marshal(tmp)
+			if bson.Unmarshal(luaByte, &sp) != nil {
 				qu.Info("初始化爬虫失败:", tmp["_id"])
 				return
 			}
 			sp.Working = util.CodeEventWorking[sp.Working]
 			sp.Model = util.CodeEventModel[sp.Event]
+			if sp.Platform == "python" {
+				sp.Model = 1
+			}
 			lock.Lock()
 			NewCodeInfoMap[sp.Code] = sp
 			lock.Unlock()
@@ -130,7 +167,7 @@ func getCodeBaseInfo() {
 		tmp = map[string]interface{}{}
 	}
 	wg.Wait()
-	logger.Info("爬虫基本信息准备完成...", len(CodeInfoMap))
+	logger.Info("爬虫基本信息准备完成...", len(NewCodeInfoMap))
 }
 
 func getPythonSummaryInfo() {
@@ -156,12 +193,7 @@ func getPythonSummaryInfo() {
 				wg.Done()
 			}()
 			code := qu.ObjToString(tmp["code"])
-			if is_valid, _ := tmp["is_valid"].(bool); !is_valid { //无效监控爬虫
-				lock.Lock()
-				delete(NewCodeInfoMap, code)
-				lock.Unlock()
-				return
-			}
+			is_valid, _ := tmp["is_valid"].(bool) //无效监控爬虫
 			py_taskid := qu.ObjToString(tmp["py_taskid"])
 			py_nodename := qu.ObjToString(tmp["py_nodename"])
 			list_isgetdata, _ := tmp["list_isgetdata"].(bool)
@@ -175,6 +207,7 @@ func getPythonSummaryInfo() {
 			if sp := NewCodeInfoMap[code]; sp != nil {
 				sp.Py_TaskId = py_taskid
 				sp.Py_NodeName = py_nodename
+				sp.Py_IsValid = is_valid
 				sp.List_IsGetData = list_isgetdata
 				sp.List_AllInTimes = list_allintimes
 				sp.List_NoDataTimes = list_nodatatimes
@@ -196,13 +229,13 @@ func getPythonSummaryInfo() {
 }
 
 func getLuaSummaryInfo() {
-	getCodeHeart()                 //获取心跳信息
+	getSpiderHeart()               //获取心跳信息
 	getSpiderHighListDownloadNum() //获取下载量信息
 	getSpiderListDownloadNum()     //获取下载量信息
-	getSpiderDownloadRateDataNew() //获取下载详情
+	getSpiderDownloadRateData()    //获取下载详情
 }
 
-func getWarnInfo() {
+func getSpiderWarnInfo() {
 	defer qu.Catch()
 	sess := util.MgoS.GetMgoConn()
 	defer util.MgoS.DestoryMongoConn(sess)
@@ -241,15 +274,20 @@ func getWarnInfo() {
 			}
 			code := qu.ObjToString(tmp["code"])
 			info := qu.ObjToString(tmp["info"])
+			href := qu.ObjToString(tmp["href"])
 			lock.Lock()
 			if sp := NewCodeInfoMap[code]; sp != nil {
 				if wf := sp.WarnInfoMap[infotype]; wf != nil {
+					if wf.Fields[field] == 0 {
+						wf.Hrefs[field] = href
+					}
 					wf.Fields[field] += 1
 				} else {
 					sp.WarnInfoMap[infotype] = &WarnInfo{
 						Info:   info,
 						Num:    1,
 						Fields: map[string]int{field: 1},
+						Hrefs:  map[string]string{field: href},
 					}
 				}
 			}
@@ -264,7 +302,7 @@ func getWarnInfo() {
 	logger.Info("错误信息数据统计完成...")
 }
 
-func getCodeHeart() {
+func getSpiderHeart() {
 	defer qu.Catch()
 	sess := util.MgoS.GetMgoConn()
 	defer util.MgoS.DestoryMongoConn(sess)
@@ -309,7 +347,7 @@ func getCodeHeart() {
 	logger.Info("lua统计心跳信息完成...")
 }
 
-func getSpiderHighListDownloadNum() {
+func getSpiderHighListDownloadNum() { //竞品数据暂未统计(延迟采集)
 	defer qu.Catch()
 	sess := util.MgoS.GetMgoConn()
 	defer util.MgoS.DestoryMongoConn(sess)
@@ -322,6 +360,7 @@ func getSpiderHighListDownloadNum() {
 	fields := map[string]interface{}{
 		"spidercode": 1,
 		"state":      1,
+		"times":      1,
 	}
 	lock := &sync.Mutex{}
 	wg := &sync.WaitGroup{}
@@ -339,13 +378,17 @@ func getSpiderHighListDownloadNum() {
 			}()
 			code := qu.ObjToString(tmp["spidercode"])
 			state := qu.IntAll(tmp["state"])
+			times := tmp["times"]
 			lock.Lock()
 			if sp := NewCodeInfoMap[code]; sp != nil {
 				if state == 1 {
 					sp.Detail_DownloadSuccessNum++
-				} else {
+				} else if state == -1 {
+					sp.Detail_DownloadFailNum++
+				} else if state == 0 && times != nil {
 					sp.Detail_DownloadFailNum++
 				}
+				//未统计未下载的数据量 state==0,times==nil
 				sp.Detail_DownloadNum++
 			}
 			lock.Unlock()
@@ -427,7 +470,7 @@ func getSpiderListDownloadNum() {
 	logger.Info("lua统计spider_listdata采集量完成...")
 }
 
-func getSpiderDownloadRateDataNew() {
+func getSpiderDownloadRateData() {
 	defer qu.Catch()
 	sess := util.MgoS.GetMgoConn()
 	defer util.MgoS.DestoryMongoConn(sess)
@@ -437,9 +480,9 @@ func getSpiderDownloadRateDataNew() {
 	date := qu.FormatDateByInt64(&StartTime, qu.Date_Short_Layout)
 	query := map[string]interface{}{
 		"date": date,
-		"event": map[string]interface{}{
-			"$ne": 7000,
-		},
+		//"event": map[string]interface{}{
+		//	"$ne": 7000,
+		//},
 	}
 	fields := map[string]interface{}{
 		"spidercode": 1,
@@ -484,7 +527,8 @@ func saveCodeInfo() {
 	wg := &sync.WaitGroup{}
 	ch := make(chan bool, 5)
 	comeintime := time.Now().Unix()
-	arr := []map[string]interface{}{}
+	codeInfoArr := []map[string]interface{}{} //爬虫下载详情
+	taskArr := [][]map[string]interface{}{}   //任务更新集
 	for _, spider := range NewCodeInfoMap {
 		ch <- true
 		wg.Add(1)
@@ -493,29 +537,386 @@ func saveCodeInfo() {
 				<-ch
 				wg.Done()
 			}()
+			getAllErr(sp)                  //汇总异常
+			createTask(sp, &taskArr, lock) //
 			sp.Comeintime = comeintime
-			spByte, err := json.Marshal(sp)
+			spByte, err := bson.Marshal(sp)
 			if err != nil {
 				logger.Info("Json Marshal Error", sp.Code)
 				return
 			}
+			lock.Lock()
+			defer lock.Unlock()
 			tmp := map[string]interface{}{}
-			if json.Unmarshal(spByte, &tmp) == nil {
-				lock.Lock()
-				arr = append(arr, tmp)
-				if len(arr) > 500 {
-					util.MgoS.SaveBulk("spider_info", arr...)
-					arr = []map[string]interface{}{}
-				}
-				lock.Unlock()
+			if bson.Unmarshal(spByte, &tmp) == nil {
+				codeInfoArr = append(codeInfoArr, tmp)
+			} else {
+				logger.Error("Json UnMarshal Error", sp.Code)
+				return
+			}
+			if len(codeInfoArr) > 500 {
+				util.MgoS.SaveBulk("spider_info", codeInfoArr...)
+				codeInfoArr = []map[string]interface{}{}
+			}
+			if len(taskArr) > 500 {
+				util.MgoEB.UpSertBulk("newtask", taskArr...)
+				taskArr = [][]map[string]interface{}{}
 			}
-
 		}(spider)
 	}
 	wg.Wait()
-	if len(arr) > 0 {
-		util.MgoS.SaveBulk("spider_info", arr...)
-		arr = []map[string]interface{}{}
+	if len(codeInfoArr) > 0 {
+		util.MgoS.SaveBulk("spider_info", codeInfoArr...)
+		codeInfoArr = []map[string]interface{}{}
+	}
+	if len(taskArr) > 0 {
+		util.MgoEB.UpSertBulk("newtask", taskArr...)
+		taskArr = [][]map[string]interface{}{}
 	}
+	NewCodeInfoMap = map[string]*NewSpider{}
 	logger.Info("爬虫统计完成...")
 }
+
+func createTask(sp *NewSpider, taskArr *[][]map[string]interface{}, lock *sync.Mutex) {
+	defer qu.Catch()
+	if sp.Event == 7000 {
+		return
+	}
+	if sp.ErrType == -1 { //无异常
+		return
+	}
+	state_new := 0
+	if sp.ErrType == 2 { //数据异常错误类型,任务状态1
+		state_new = 1
+	}
+	//查询历史任务
+	query := map[string]interface{}{
+		"s_code": sp.Code,
+		"i_state": map[string]interface{}{
+			"$in": []int{0, 1, 2, 3, 5}, //查询现有正在维护的任务
+		},
+	}
+	fields := map[string]interface{}{
+		"i_state":    1,
+		"s_type":     1,
+		"s_descript": 1,
+		"i_times":    1,
+	}
+	list, _ := util.MgoEB.Find("newtask", query, nil, fields, false, -1, -1)
+	update := []map[string]interface{}{}
+	if list != nil && len(*list) > 0 { //已有任务
+		if len(*list) > 1 {
+			logger.Error("Code:", sp.Code, "任务异常")
+			util.MgoEB.Save("luacreatetaskerr", map[string]interface{}{
+				"code":       sp.Code,
+				"comeintime": time.Now().Unix(),
+				"tasknum":    len(*list),
+			})
+			return
+		}
+		task := (*list)[0]                                 //唯一任务
+		state_old := qu.IntAll(task["i_state"])            //历史任务状态
+		times_old := qu.IntAll(task["i_times"])            //历史任务次数
+		type_old := qu.ObjToString(task["s_type"])         //历史任务异常类型
+		descript_old := qu.ObjToString(task["s_descript"]) //历史任务描述
+
+		result := map[string]interface{}{
+			"i_event":      sp.Event,
+			"l_updatetime": time.Now().Unix(),
+			"i_times":      times_old + 1,
+			"s_descript":   descript_old + time.Now().Format(qu.Date_Short_Layout) + "追加描述:------------------------------\n" + sp.ErrDescription,
+		}
+		//任务状态
+		if state_old == 0 {
+			result["i_state"] = 1 //第二次任务,将历史待确认任务升级为待处理
+		}
+		//任务类型
+		if sp.ErrType < qu.IntAll(type_old) { //取优先级高者
+			result["s_type"] = fmt.Sprint(sp.ErrType)
+		}
+		update = append(update, map[string]interface{}{"_id": task["_id"]})
+		update = append(update, map[string]interface{}{"$set": result})
+		lock.Lock()
+		*taskArr = append(*taskArr, update)
+		lock.Unlock()
+	} else { //无历史任务
+		saveMap := map[string]interface{}{
+			"s_modify":     sp.ModifyUser,
+			"s_modifyid":   sp.ModifyId,
+			"s_code":       sp.Code,
+			"s_site":       sp.Site,
+			"s_channel":    sp.Channel,
+			"i_event":      sp.Event,
+			"i_state":      state_new,
+			"s_source":     "程序",
+			"s_type":       fmt.Sprint(sp.ErrType),
+			"s_descript":   sp.ErrDescription,
+			"i_times":      1,
+			"l_comeintime": time.Now().Unix(),
+			"l_complete":   util.CompleteTime("1"),
+			//"s_urgency":    "1",
+			"s_platform": sp.Platform,
+		}
+		update = append(update, query)
+		update = append(update, saveMap)
+		lock.Lock()
+		*taskArr = append(*taskArr, update)
+		lock.Unlock()
+	}
+}
+
+func getAllErr(sp *NewSpider) {
+	listErr(sp)           //列表页异常
+	dataInfoErr(sp)       //数据异常错误
+	downloadRateErr(sp)   //下载频率异常
+	downloadFailedErr(sp) //下载异常
+	dataInfoWarn(sp)      //数据异常警告
+}
+
+func listErr(sp *NewSpider) {
+	defer qu.Catch()
+	if sp.Platform == "python" && !sp.Py_IsValid {
+		return
+	}
+	if !sp.List_IsGetData || sp.List_RunTimes == 0 {
+		errFlag := false
+		if sp.TaskTags != nil {
+			tagTime, _ := sp.TaskTags[NEWTASK_LISTERR].(int64) //用struct接收,会转为floa64
+			if tagTime == 0 {                                  //无列表标记
+				errFlag = true
+			} else if tagTime > 0 && tagTime <= util.GetTime(-7) { //标记失效
+				errFlag = true
+			}
+		} else { //无任何标记
+			errFlag = true
+		}
+		if errFlag {
+			//sp.Error[NEWTASK_LISTERR] = &ErrorInfo{
+			//	ErrInfo: map[string]bool{LuaErrTypeInfo[NEWTASK_LISTERR]: true},
+			//}
+			sp.ErrType = qu.IntAll(NEWTASK_LISTERR)
+			sp.ErrTypeMap[qu.IntAll(NEWTASK_LISTERR)] = true
+			sp.ErrDescription += "列表页异常:\n	列表页共采集" + fmt.Sprint(sp.List_RunTimes) + "轮,其中有" + fmt.Sprint(sp.List_NoDataTimes) + "轮无数据\n"
+		}
+	}
+}
+func dataInfoErr(sp *NewSpider) {
+	defer qu.Catch()
+	if len(sp.WarnInfoMap) > 0 {
+		errFlag := false
+		resultDescription := ""
+		for err, _ := range DataInfoErrMap {
+			if wf := sp.WarnInfoMap[err]; wf != nil {
+				tmpDescription := ""
+				for field, href := range wf.Hrefs {
+					tmpDescription += "		字段" + field + ":" + href + "\n"
+				}
+				if tmpDescription != "" {
+					resultDescription += "	" + wf.Info + "\n" + tmpDescription
+				}
+				errFlag = true
+			}
+		}
+		if errFlag {
+			//sp.Error[NEWTASK_DATAINFOERR] = &ErrorInfo{
+			//	ErrInfo: map[string]bool{LuaErrTypeInfo[NEWTASK_DATAINFOERR]: true},
+			//}
+			sp.ErrDescription += "数据异常错误:\n" + resultDescription
+			sp.ErrTypeMap[qu.IntAll(NEWTASK_DATAINFOERR)] = true
+			if sp.ErrType < 0 {
+				sp.ErrType = qu.IntAll(NEWTASK_DATAINFOERR)
+			}
+		}
+	}
+
+}
+func downloadRateErr(sp *NewSpider) {
+	defer qu.Catch()
+	if sp.Platform == "python" && !sp.Py_IsValid {
+		return
+	}
+	if sp.List_AllInTimes > 0 {
+		errFlag := false
+		if sp.Model == 1 { //分开采集,直接记录异常
+			errFlag = true
+		} else { //顺序采集
+			if sp.TaskTags != nil {
+				tagTime, _ := sp.TaskTags[NEWTASK_RATEERR].(int64)
+				if tagTime == 0 { //无列表标记
+					errFlag = true
+				} else if tagTime > 0 && tagTime <= util.GetTime(-7) { //标记失效
+					errFlag = true
+				}
+			} else { //无标记,记录列表页异常
+				errFlag = true
+			}
+		}
+		if errFlag {
+			//sp.Error[NEWTASK_RATEERR] = &ErrorInfo{
+			//	ErrInfo: map[string]bool{LuaErrTypeInfo[NEWTASK_RATEERR]: true},
+			//}
+			sp.ErrTypeMap[qu.IntAll(NEWTASK_RATEERR)] = true
+			sp.ErrDescription += "采集频率异常:\n	列表页共采集" + fmt.Sprint(sp.List_RunTimes) + "轮,其中有" + fmt.Sprint(sp.List_AllInTimes) + "轮数据全采\n"
+			if sp.ErrType < 0 {
+				sp.ErrType = qu.IntAll(NEWTASK_RATEERR)
+			}
+		}
+	}
+}
+func downloadFailedErr(sp *NewSpider) {
+	defer qu.Catch()
+	if sp.Platform == "python" && !sp.Py_IsValid {
+		return
+	}
+	if sp.Detail_DownloadFailNum > 0 {
+		tagTime := int64(-1)
+		if sp.TaskTags != nil {
+			tagTime, _ = sp.TaskTags[NEWTASK_DOWNLOADERR].(int64)
+		} else { //无标记,记录列表页异常
+			tagTime = 0
+		}
+		if tagTime > -1 {
+			if sp.Model == 1 { //分开采集(python爬虫默认分开采集模式)
+				errFlag := false
+				if sp.Detail_DownloadNum < 100 { //下载总量小于100
+					if sp.Detail_DownloadFailNum >= 3 { //失败个数超过3个
+						errFlag = true //异常
+					} else if float64(sp.Detail_DownloadFailNum)/float64(sp.Detail_DownloadNum) >= 0.2 { //失败占比超过20%
+						errFlag = true //异常
+					}
+				} else if sp.Detail_DownloadFailNum >= 3 { //下载总量大于100,失败个数超过3个
+					if float64(sp.Detail_DownloadFailNum)/float64(sp.Detail_DownloadNum) >= 0.03 { //失败占比超过3%
+						errFlag = true //异常
+					} else {
+						if sp.Detail_DownloadFailNum >= 30 { //失败个数超过30个
+							errFlag = true //异常
+						} else {
+							tagFlag := tagTime == util.GetTime(-7) //上次标记时间是否是7天前当天
+							if tagTime == 0 || !tagFlag {          //系统打标记
+								//系统打标记
+							} else if tagFlag {
+								errFlag = true //异常
+							}
+						}
+					}
+				}
+				if errFlag {
+					q := map[string]interface{}{
+						"$or": []interface{}{
+							map[string]interface{}{ //state=-1下载失败
+								"spidercode": sp.Code,
+								"state":      -1,
+								"comeintime": map[string]interface{}{
+									"$gte": StartTime,
+									"$lt":  EndTime,
+								},
+							},
+							map[string]interface{}{ //state=0,times存在,前一天未下载成功的
+								"spidercode": sp.Code,
+								"state":      0,
+								"times": map[string]interface{}{
+									"$exists": true,
+								},
+								"comeintime": map[string]interface{}{
+									"$gte": StartTime,
+									"$lt":  EndTime,
+								},
+							},
+						},
+					}
+					sp.getErrHrefs("spider_highlistdata", NEWTASK_DOWNLOADERR, q)
+					sp.ErrTypeMap[qu.IntAll(NEWTASK_DOWNLOADERR)] = true
+					if sp.ErrType < 0 {
+						sp.ErrType = qu.IntAll(NEWTASK_DOWNLOADERR)
+					}
+				}
+			} else { //顺序采集
+				//查询有无第一次记录(count=0),且下载失败的数据(count>0的数据表示该数据已经在采集当天统计过,不再二次统计)
+				q := map[string]interface{}{
+					"spidercode": sp.Code,
+					"count":      0,
+					"state":      -1,
+					"comeintime": map[string]interface{}{
+						"$gte": StartTime,
+						"$lt":  EndTime,
+					},
+				}
+				count := sp.getErrHrefs("spider_listdata", NEWTASK_DOWNLOADERR, q)
+				if count > 0 {
+					sp.ErrTypeMap[qu.IntAll(NEWTASK_DOWNLOADERR)] = true
+					if sp.ErrType < 0 {
+						sp.ErrType = qu.IntAll(NEWTASK_DOWNLOADERR)
+					}
+				}
+			}
+		}
+	}
+}
+func dataInfoWarn(sp *NewSpider) {
+	defer qu.Catch()
+	if len(sp.WarnInfoMap) > 0 {
+		tagTime := int64(-1)
+		if sp.TaskTags != nil {
+			tagTime, _ = sp.TaskTags[NEWTASK_DATAINFOWARN].(int64)
+		} else { //无标记,记录列表页异常
+			tagTime = 0
+		}
+		if tagTime > -1 { //标记时间超时或无标记
+			errFlag := false
+			resultDescription := ""
+			for err, _ := range DataInfoWarnMap {
+				if wf := sp.WarnInfoMap[err]; wf != nil {
+					tmpDescription := ""
+					for field, href := range wf.Hrefs {
+						tmpDescription += "		字段" + field + ":" + href + "\n"
+					}
+					if tmpDescription != "" {
+						resultDescription += "	" + wf.Info + "\n" + tmpDescription
+					}
+					errFlag = true
+				}
+			}
+			if errFlag {
+				//sp.Error[NEWTASK_DATAINFOWARN] = &ErrorInfo{
+				//	ErrInfo: map[string]bool{LuaErrTypeInfo[NEWTASK_DATAINFOWARN]: true},
+				//}
+				sp.ErrDescription += "数据异常警告:\n" + resultDescription
+				sp.ErrTypeMap[qu.IntAll(NEWTASK_DATAINFOWARN)] = true
+				if sp.ErrType < 0 {
+					sp.ErrType = qu.IntAll(NEWTASK_DATAINFOWARN)
+				}
+			}
+		}
+	}
+
+}
+func (sp *NewSpider) getErrHrefs(coll, errType string, query map[string]interface{}) (count int) {
+	defer qu.Catch()
+	if coll == "spider_listdata" { //
+		count = util.MgoS.Count(coll, query)
+	} else {
+		count = sp.Detail_DownloadFailNum
+	}
+	if count == 0 {
+		return
+	}
+	sp.ErrDescription += LuaErrTypeInfo[NEWTASK_DOWNLOADERR] + ":共下载" + fmt.Sprint(sp.Detail_DownloadNum) + "条,失败" + fmt.Sprint(sp.Detail_DownloadFailNum) + "条\n"
+	if sp.Platform != "golua平台" {
+		return
+	}
+	list, _ := util.MgoS.Find(coll, query, nil, `{"href":1}`, false, 0, 3)
+	if len(*list) > 0 {
+		//errHrefs := []*ErrRemark{}
+		for _, l := range *list {
+			href := qu.ObjToString(l["href"])
+			//errHrefs = append(errHrefs, &ErrRemark{Href: href})
+			sp.ErrDescription += "	" + href + "\n"
+		}
+		//sp.Error[errType] = &ErrorInfo{
+		//	Num:     sp.Detail_DownloadFailNum,
+		//	Err:     errHrefs,
+		//	ErrInfo: map[string]bool{LuaErrTypeInfo[errType]: true},
+		//}
+	}
+	return
+}

+ 3 - 0
src/timetask/random.go

@@ -239,6 +239,9 @@ func GetSpiderWarnData() {
 					if warnInfo.MaxLevel < level {
 						warnInfo.MaxLevel = level
 					}
+					if warnInfo.Entry && !entry { //已entry为false为优先级
+						warnInfo.Entry = entry
+					}
 				}
 				lock.Unlock()
 			}