瀏覽代碼

detail过滤

maxiaoshan 3 年之前
父節點
當前提交
f423dc0973
共有 1 個文件被更改,包括 50 次插入和 10 次删除
  1. 50 10
      src/spider/spider.go

+ 50 - 10
src/spider/spider.go

@@ -94,7 +94,6 @@ func DownloadHighDetail(code string) {
 	for {
 		//logger.Info("爬虫代码:", s.Code, "已下架:", s.Stop)
 		//if !s.Stop { //爬虫是运行状态
-		//TODO 延迟采集还未添加
 		/*
 			1、每轮开始先查询当天下载的数据
 			2、本次查询无数据依次向前推一天查询数据(暂定50条数据)
@@ -239,6 +238,18 @@ func DownloadHighDetail(code string) {
 							return
 						}
 					}
+					//正文、附件分析,下载异常数据重新下载
+					if AnalysisProjectInfo(data) {
+						times++
+						ss := map[string]interface{}{"times": times, "updatetime": time.Now().Unix()}
+						if times >= 3 { //3次下载失败今天不再下载,state置为-1
+							ss["state"] = -1
+							ss["detailfilerr"] = true
+						}
+						set := map[string]interface{}{"$set": ss}
+						MgoS.Update("spider_highlistdata", query, set, false, false)
+						return
+					}
 					t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
 					if t1 > time.Now().Unix() { //防止发布时间超前
 						data["publishtime"] = time.Now().Unix()
@@ -247,15 +258,15 @@ func DownloadHighDetail(code string) {
 					delete(data, "checkpublishtime")
 					data["comeintime"] = time.Now().Unix()
 					//计数
-					tmpsp1, b := Allspiders.Load(sp.Code)
-					if b {
-						sp1, ok := tmpsp1.(*Spider)
-						if ok {
-							atomic.AddInt32(&sp1.LastDowncount, 1)
-							atomic.AddInt32(&sp1.TodayDowncount, 1)
-							atomic.AddInt32(&sp1.TotalDowncount, 1)
-						}
-					}
+					//tmpsp1, b := Allspiders.Load(sp.Code)
+					//if b {
+					//	sp1, ok := tmpsp1.(*Spider)
+					//	if ok {
+					//		atomic.AddInt32(&sp1.LastDowncount, 1)
+					//		atomic.AddInt32(&sp1.TodayDowncount, 1)
+					//		atomic.AddInt32(&sp1.TotalDowncount, 1)
+					//	}
+					//}
 					data["spidercode"] = sp.Code
 					data["dataging"] = 0
 					data["iscompete"] = sp.IsCompete //2021-11-01以后新增的爬虫不在展示原文链接(保存服务判断)
@@ -272,6 +283,35 @@ func DownloadHighDetail(code string) {
 		}
 	}
 }
+
+// AnalysisProjectInfo reports whether data must not be counted as a successful download.
+// It returns true when detail is exactly one of the "详情请访问原网页!" placeholder strings
+// and projectinfo.attachments holds no entry with a non-empty "fid" (i.e. no attachment
+// was uploaded successfully). Any other detail value yields false.
+func AnalysisProjectInfo(data map[string]interface{}) bool {
+	defer qu.Catch()
+	switch qu.ObjToString(data["detail"]) {
+	case "详情请访问原网页!", "<br/>详情请访问原网页!":
+		// Exact match only, no containment check: some records are assembled from JSON
+		// with incomplete fields and merely have the placeholder text appended.
+	default:
+		return false
+	}
+	info, isMap := data["projectinfo"].(map[string]interface{})
+	if !isMap || len(info) == 0 {
+		return true
+	}
+	files, isMap := info["attachments"].(map[string]interface{})
+	if !isMap || len(files) == 0 {
+		return true
+	}
+	for _, item := range files {
+		entry, isEntry := item.(map[string]interface{})
+		if isEntry && qu.ObjToString(entry["fid"]) != "" { // attachment uploaded successfully
+			return false
+		}
+	}
+	return true
+}
+
 func FilterByDetail(href string, query, data map[string]interface{}) bool {
 	if data["delete"] != nil {
 		//增量