|
@@ -88,6 +88,7 @@ func NewStartTask() {
|
|
|
getLuaSummaryInfo() //获取lua汇总信息
|
|
|
getSpiderWarnInfo() //获取异常数据
|
|
|
saveCodeInfo() //汇总异常信息,产出任务
|
|
|
+ closeTask()
|
|
|
}
|
|
|
|
|
|
func getCodeBaseInfo() {
|
|
@@ -246,7 +247,14 @@ func getSpiderWarnInfo() {
|
|
|
},
|
|
|
}
|
|
|
fields := map[string]interface{}{
|
|
|
- "data": 0,
|
|
|
+ "field": 1,
|
|
|
+ "level": 1,
|
|
|
+ "info": 1,
|
|
|
+ "code": 1,
|
|
|
+ "infotype": 1,
|
|
|
+ "href": 1,
|
|
|
+ "data.publishtime": 1,
|
|
|
+ "data.l_np_publishtime": 1,
|
|
|
}
|
|
|
it := sess.DB(util.MgoS.DbName).C("spider_warn").Find(&query).Select(&fields).Iter()
|
|
|
n := 0
|
|
@@ -272,6 +280,19 @@ func getSpiderWarnInfo() {
|
|
|
} else if infotype == 8 && field == "projectinfo" {
|
|
|
return
|
|
|
}
|
|
|
+ if infotype == 2 || infotype == 6 || infotype == 8 {
|
|
|
+ if data, ok := tmp["data"].(map[string]interface{}); ok {
|
|
|
+ var ptime int64
|
|
|
+ if l_np_publishtime := data["l_np_publishtime"]; l_np_publishtime != nil {
|
|
|
+ ptime = qu.Int64All(l_np_publishtime)
|
|
|
+ } else if publishtime := data["publishtime"]; publishtime != nil {
|
|
|
+ ptime = qu.Int64All(publishtime)
|
|
|
+ }
|
|
|
+ if ptime < time.Now().AddDate(0, -6, 0).Unix() { //半年内的异常数据有效
|
|
|
+ return
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
code := qu.ObjToString(tmp["code"])
|
|
|
info := qu.ObjToString(tmp["info"])
|
|
|
href := qu.ObjToString(tmp["href"])
|
|
@@ -330,11 +351,11 @@ func getSpiderHeart() {
|
|
|
findListHeart := qu.Int64All(tmp["findlist"])
|
|
|
lock.Lock()
|
|
|
if sp := NewCodeInfoMap[code]; sp != nil {
|
|
|
- limitDayNum := 0
|
|
|
- if sp.Event == 7520 { //由于7520节点爬虫循环一轮的时间较长,心跳有可能仍是前一天的
|
|
|
- limitDayNum = -1
|
|
|
- }
|
|
|
- sp.List_IsGetData = findListHeart > util.GetTime(limitDayNum)
|
|
|
+ //limitDayNum := 0
|
|
|
+ //if sp.Event == 7520 { //由于7520节点爬虫循环一轮的时间较长,心跳有可能仍是前一天的
|
|
|
+ // limitDayNum = -1
|
|
|
+ //}
|
|
|
+ sp.List_IsGetData = findListHeart > util.GetTime(0)-int64(12*3600) //前一天12点
|
|
|
}
|
|
|
lock.Unlock()
|
|
|
}(tmp)
|
|
@@ -696,7 +717,11 @@ func listErr(sp *NewSpider) {
|
|
|
//}
|
|
|
sp.ErrType = qu.IntAll(NEWTASK_LISTERR)
|
|
|
sp.ErrTypeMap[qu.IntAll(NEWTASK_LISTERR)] = true
|
|
|
- sp.ErrDescription += "列表页异常:\n 列表页共采集" + fmt.Sprint(sp.List_RunTimes) + "轮,其中有" + fmt.Sprint(sp.List_NoDataTimes) + "轮无数据\n"
|
|
|
+ if !sp.List_IsGetData {
|
|
|
+ sp.ErrDescription += "列表页异常:\n 无最新心跳\n"
|
|
|
+ } else if sp.List_RunTimes == 0 {
|
|
|
+ sp.ErrDescription += "列表页异常:\n 列表页共采集" + fmt.Sprint(sp.List_RunTimes) + "轮,其中有" + fmt.Sprint(sp.List_NoDataTimes) + "轮无数据\n"
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -921,6 +946,26 @@ func (sp *NewSpider) getErrHrefs(coll, errType string, query map[string]interfac
|
|
|
return
|
|
|
}
|
|
|
|
|
|
+// closeTask closes stale tasks: it sets l_closetime to the current Unix time on the "newtask" documents matched by the query below. NOTE(review): only l_closetime is written and i_state is left unchanged here; the trailing false/true arguments are presumably upsert/multi - confirm against util.MgoEB.Update.
|
|
|
+func closeTask() {
|
|
|
+ defer qu.Catch()
|
|
|
+ query := map[string]interface{}{ // Original author's note: close download-error and data-error warning tasks that have not been moved to "pending" within 7 days (matched below as l_comeintime <= util.GetTime(-7), i_state 0, s_type "5" or "6").
|
|
|
+ "l_comeintime": map[string]interface{}{
|
|
|
+ "$lte": util.GetTime(-7),
|
|
|
+ },
|
|
|
+ "i_state": 0,
|
|
|
+ "s_type": map[string]interface{}{
|
|
|
+ "$in": []string{"5", "6"},
|
|
|
+ },
|
|
|
+ }
|
|
|
+ set := map[string]interface{}{
|
|
|
+ "$set": map[string]interface{}{
|
|
|
+ "l_closetime": time.Now().Unix(),
|
|
|
+ },
|
|
|
+ }
|
|
|
+ util.MgoEB.Update("newtask", query, set, false, true)
|
|
|
+}
|
|
|
+
|
|
|
/*
|
|
|
1、列表页统计的是当天心跳,提前告警。如果当天心跳有问题呢?
|
|
|
2、下载异常由于原网站详情页无信息造成的,如何提高任务准确率?
|