maxiaoshan пре 3 година
родитељ
комит
c8f350fcc7
1 измењених фајлова са 46 додато и 0 уклоњено
  1. 46 0
      src/spider/spider.go

+ 46 - 0
src/spider/spider.go

@@ -148,6 +148,24 @@ func (s *Spider) DownloadDetailItem(paramdata, tmp map[string]interface{}) {
 			util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
 		}
 	}
+	if data["delete"] != nil { //三级页过滤
+		//增量
+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
+		//全量
+		db := HexToBigIntMod(href)
+		hashHref := HexText(href)
+		util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
+		//更新数据状态
+		UpdateState(id, map[string]interface{}{"state": 3, "remark": "Detail Delete"})
+		//下载成功根据href更新spider_highlistdata中state
+		Mgo.Update("spider_highlistdata", map[string]interface{}{"href": tmp["href"]}, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "by": "regather", "updatetime": time.Now().Unix()}}, false, true)
+		Mgo.Update("spider_listdata", map[string]interface{}{"href": tmp["href"]}, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "by": "regather", "updatetime": time.Now().Unix()}}, false, true)
+		return
+	}
+	//正文、附件分析,下载异常数据重新下载
+	if AnalysisProjectInfo(data) {
+		return
+	}
 	//jsondata处理
 	if jsondata, ok := data["jsondata"].(string); ok {
 		jsondataMap := map[string]interface{}{}
@@ -201,6 +219,34 @@ func (s *Spider) DownloadDetailItem(paramdata, tmp map[string]interface{}) {
 	}
 }
 
+// AnalysisProjectInfo reports whether a downloaded record must NOT be counted
+// as a successful download. It returns true when data["detail"] is exactly one
+// of the two placeholder texts below and no entry under
+// data["projectinfo"]["attachments"] carries a non-empty "fid". It returns
+// false for any other detail text, or as soon as one attachment has a "fid".
+//
+// The visible caller (DownloadDetailItem) returns early when this is true.
+func AnalysisProjectInfo(data map[string]interface{}) bool {
+	defer qu.Catch() // NOTE(review): presumably recovers from panics — confirm in the qu package
+	// Exact match on purpose, not a substring test: some records are assembled
+	// from JSON with incomplete fields and get the placeholder text appended.
+	switch qu.ObjToString(data["detail"]) {
+	case "详情请访问原网页", "<br/>详情请访问原网页!":
+		// Placeholder-only detail: the verdict depends on the attachments below.
+	default:
+		return false
+	}
+	info, isMap := data["projectinfo"].(map[string]interface{})
+	if !isMap || len(info) == 0 {
+		return true
+	}
+	files, isMap := info["attachments"].(map[string]interface{})
+	if !isMap || len(files) == 0 {
+		return true
+	}
+	for _, item := range files {
+		entry, isEntry := item.(map[string]interface{})
+		// A non-empty fid is treated as "attachment uploaded successfully".
+		if isEntry && qu.ObjToString(entry["fid"]) != "" {
+			return false
+		}
+	}
+	return true
+}
+
 //下载解析内容页
 func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {
 	defer mu.Catch()