소스 검색

detail附件验证数据是否下载成功

maxiaoshan 3 년 전
부모
커밋
3a6a24b0ad
1개의 변경된 파일, 44개의 추가 그리고 4개의 삭제
  1. 44 4
      src/spider/spider.go

+ 44 - 4
src/spider/spider.go

@@ -1014,10 +1014,22 @@ func (s *Spider) DownloadDetail(stype string) {
 						updateArr = append(updateArr, update)
 						spLock.Unlock()
 						return
-						//deleteData := FilterByDetail(href, query, data) //针对列表页无法过滤需要在详情页过滤的数据,进行过滤处理
-						//if deleteData {
-						//	return
-						//}
+					}
+					//正文、附件分析,下载异常数据重新下载
+					if AnalysisProjectInfo(data) {
+						times++
+						ss := map[string]interface{}{"times": times, "updatetime": time.Now().Unix()}
+						if times >= 3 { //3次下载失败今天不再下载,state置为-1
+							ss["state"] = -1
+							ss["detailfilerr"] = true
+						}
+						set := map[string]interface{}{"$set": ss}
+						update = append(update, query)
+						update = append(update, set)
+						spLock.Lock()
+						updateArr = append(updateArr, update)
+						spLock.Unlock()
+						return
 					}
 					t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
 					if t1 > time.Now().Unix() { //防止发布时间超前
@@ -1087,6 +1099,34 @@ func NewSpiderByScript(num int, code string, info map[string]string, spChan chan
 	}
 }
 
+// AnalysisProjectInfo reports whether a record must NOT be counted as successfully downloaded: it returns true when detail is exactly one of the "详情请访问原网页" placeholder strings and projectinfo.attachments contains no entry with a non-empty fid; it returns false in every other case.
+func AnalysisProjectInfo(data map[string]interface{}) bool {
+	defer qu.Catch() // presumably recovers from panics (qu helper is not shown here) — TODO confirm
+	detail := qu.ObjToString(data["detail"]) // NOTE(review): of the two literals compared below, only the second ends with "!" — confirm both spellings are intended
+	if detail == "详情请访问原网页" || detail == "<br/>详情请访问原网页!" { // exact match instead of a substring check: some records are assembled from JSON with incomplete fields and get the placeholder text appended
+		if projectinfo, ok := data["projectinfo"].(map[string]interface{}); ok && len(projectinfo) > 0 {
+			if attachments, ok := projectinfo["attachments"].(map[string]interface{}); ok && len(attachments) > 0 {
+				fileOk := false // becomes true once any attachment entry carries a non-empty fid
+				for _, data := range attachments { // the loop variable data shadows the function parameter inside this loop
+					if d, ok := data.(map[string]interface{}); ok {
+						fid := qu.ObjToString(d["fid"])
+						if fid != "" { // attachment was uploaded successfully
+							fileOk = true
+							break
+						}
+					}
+				}
+				return !fileOk // failed only when no attachment entry had a fid
+			} else {
+				return true // attachments is missing, not a map, or empty
+			}
+		} else {
+			return true // projectinfo is missing, not a map, or empty
+		}
+	}
+	return false // detail is not a placeholder string, so no attachment check is made
+}
+
 //打印线程数
 func AllThreadLog() {
 	logger.Info("Detail Download All Thread:", AllThreadNum)