Parcourir la source

分开采集模式数据列表页redis判重修改

maxiaoshan il y a 2 ans
Parent
commit
eaee684674
3 fichiers modifiés avec 114 ajouts et 105 suppressions
  1. 83 83
      src/spider/history.go
  2. 16 9
      src/spider/spider.go
  3. 15 13
      src/spider/store.go

+ 83 - 83
src/spider/history.go

@@ -38,89 +38,89 @@ func (s *Spider) StartSpider() {
 }
 
 //加载应采集数据,进行采集
-func (s *Spider) DownloadHistoryDetail() {
-	defer qu.Catch()
-	q := map[string]interface{}{"spidercode": s.Code, "state": 0}
-	o := map[string]interface{}{"_id": 1}
-	f := map[string]interface{}{
-		"state":      0,
-		"comeintime": 0,
-		"event":      0,
-	}
-	//UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail") //记录采集三级页心跳
-	list, _ := MgoS.Find("spider_historydata", q, o, f, false, 0, 200)
-	if len(*list) == 0 { //数据量为0,表示无可下载数据,爬虫作废
-		s.Stop = true
-		return
-	}
-	//采集(目前未开多线程)
-	for _, tmp := range *list {
-		id := tmp["_id"]
-		href := qu.ObjToString(tmp["href"])
-		hashHref := sputil.HexText(href)
-		isExist := sputil.RedisClusterExists(hashHref) //全量href redis判重
-		if isExist {
-			set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "href", "updatetime": time.Now().Unix()}} //已存在state置为1
-			MgoS.UpdateById("spider_historydata", id, set)
-			return
-		}
-		success := true    //数据是否下载成功的标志
-		delete(tmp, "_id") //删除列表页信息无用字段_id
-		data := map[string]interface{}{}
-		for k, v := range tmp {
-			data[k] = v
-		}
-		//下载、解析、入库
-		data, err := s.DownloadDetailPage(tmp, data)
-		//UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute") //下载数据心跳
-		if err != nil || data == nil {
-			success = false
-			if err != nil {
-				logger.Error(s.Code, err, tmp)
-				//if len(tmp) > 0 {
-				//	SaveErrorData(s.MUserName, tmp, err) //保存错误信息
-				//}
-			} /*else if data == nil && times >= 3 { //下载问题,建editor任务
-				DownloadErrorData(s.Code, tmp)
-			}*/
-		} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
-			sputil.RedisClusterSet(hashHref, "", -1)
-		}
-		if !success { //下载失败
-			set := map[string]interface{}{"$set": map[string]interface{}{"state": -1, "updatetime": time.Now().Unix()}}
-			MgoS.UpdateById("spider_historydata", id, set)
-			return
-		} else if data["delete"] != nil { //三级页过滤
-			sputil.RedisClusterSet(hashHref, "", -1) //过滤掉的数据存入全量redis
-			//更新mgo 要删除的数据更新spider_historydata state=1不再下载,更新redis
-			set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true, "updatetime": time.Now().Unix()}}
-			MgoS.UpdateById("spider_historydata", id, set)
-			return
-		}
-		//正文、附件分析,下载异常数据重新下载
-		if AnalysisProjectInfo(data) {
-			set := map[string]interface{}{"$set": map[string]interface{}{"state": -1, "detailfilerr": true, "updatetime": time.Now().Unix()}}
-			MgoS.UpdateById("spider_historydata", id, set)
-			return
-		}
-		t1 := sputil.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
-		if t1 > time.Now().Unix() { //防止发布时间超前
-			data["publishtime"] = time.Now().Unix()
-		}
-		delete(data, "exit")
-		delete(data, "checkpublishtime")
-		data["comeintime"] = time.Now().Unix()
-		data["spidercode"] = s.Code
-		data["dataging"] = 0
-	data["iscompete"] = s.IsCompete //2021-11-01以后新增的爬虫不再展示原文链接(保存服务判断)
-		//发送保存服务
-		Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
-		set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}} //下载成功state置为1
-		MgoS.UpdateById("spider_historydata", id, set)
-	}
-	//采集完LoadScript
-	s.LoadScript(&s.Name, &s.Channel, &s.MUserName, s.Code, s.ScriptFile, true, false)
-}
+//func (s *Spider) DownloadHistoryDetail() {
+//	defer qu.Catch()
+//	q := map[string]interface{}{"spidercode": s.Code, "state": 0}
+//	o := map[string]interface{}{"_id": 1}
+//	f := map[string]interface{}{
+//		"state":      0,
+//		"comeintime": 0,
+//		"event":      0,
+//	}
+//	//UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail") //记录采集三级页心跳
+//	list, _ := MgoS.Find("spider_historydata", q, o, f, false, 0, 200)
+//	if len(*list) == 0 { //数据量为0,表示无可下载数据,爬虫作废
+//		s.Stop = true
+//		return
+//	}
+//	//采集(目前未开多线程)
+//	for _, tmp := range *list {
+//		id := tmp["_id"]
+//		href := qu.ObjToString(tmp["href"])
+//		hashHref := sputil.HexText(href)
+//		isExist := sputil.RedisClusterExists(hashHref) //全量href redis判重
+//		if isExist {
+//			set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "href", "updatetime": time.Now().Unix()}} //已存在state置为1
+//			MgoS.UpdateById("spider_historydata", id, set)
+//			return
+//		}
+//		success := true    //数据是否下载成功的标志
+//		delete(tmp, "_id") //删除列表页信息无用字段_id
+//		data := map[string]interface{}{}
+//		for k, v := range tmp {
+//			data[k] = v
+//		}
+//		//下载、解析、入库
+//		data, err := s.DownloadDetailPage(tmp, data)
+//		//UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute") //下载数据心跳
+//		if err != nil || data == nil {
+//			success = false
+//			if err != nil {
+//				logger.Error(s.Code, err, tmp)
+//				//if len(tmp) > 0 {
+//				//	SaveErrorData(s.MUserName, tmp, err) //保存错误信息
+//				//}
+//			} /*else if data == nil && times >= 3 { //下载问题,建editor任务
+//				DownloadErrorData(s.Code, tmp)
+//			}*/
+//		} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
+//			sputil.RedisClusterSet(hashHref, "", -1)
+//		}
+//		if !success { //下载失败
+//			set := map[string]interface{}{"$set": map[string]interface{}{"state": -1, "updatetime": time.Now().Unix()}}
+//			MgoS.UpdateById("spider_historydata", id, set)
+//			return
+//		} else if data["delete"] != nil { //三级页过滤
+//			sputil.RedisClusterSet(hashHref, "", -1) //过滤掉的数据存入全量redis
+//			//更新mgo 要删除的数据更新spider_historydata state=1不再下载,更新redis
+//			set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true, "updatetime": time.Now().Unix()}}
+//			MgoS.UpdateById("spider_historydata", id, set)
+//			return
+//		}
+//		//正文、附件分析,下载异常数据重新下载
+//		if AnalysisProjectInfo(data) {
+//			set := map[string]interface{}{"$set": map[string]interface{}{"state": -1, "detailfilerr": true, "updatetime": time.Now().Unix()}}
+//			MgoS.UpdateById("spider_historydata", id, set)
+//			return
+//		}
+//		t1 := sputil.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
+//		if t1 > time.Now().Unix() { //防止发布时间超前
+//			data["publishtime"] = time.Now().Unix()
+//		}
+//		delete(data, "exit")
+//		delete(data, "checkpublishtime")
+//		data["comeintime"] = time.Now().Unix()
+//		data["spidercode"] = s.Code
+//		data["dataging"] = 0
+//		data["iscompete"] = s.IsCompete //2021-11-01以后新增的爬虫不再展示原文链接(保存服务判断)
+//		//发送保存服务
+//		Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
+//		set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}} //下载成功state置为1
+//		MgoS.UpdateById("spider_historydata", id, set)
+//	}
+//	//采集完LoadScript
+//	s.LoadScript(&s.Name, &s.Channel, &s.MUserName, s.Code, s.ScriptFile, true, false)
+//}
 
 //定时检测数据集汇总爬虫
 func GetHistoryDownloadSpider() {

+ 16 - 9
src/spider/spider.go

@@ -101,6 +101,7 @@ var TimeChan = make(chan bool, 1)
 var Reg = regexp.MustCompile(`(http|https)://([\w]+\.)+[\w]+(/?)`)
 var DomainNameReg = regexp.MustCompile(`(?://).+?(?:)[::/]`)
 var RepDomainNameReg = regexp.MustCompile(`[::/]+`)
+var RestrictAccessReg = regexp.MustCompile(`访问被拒绝`)
 var Today string
 var SpiderFlowMap = sync.Map{} //code:{"2022-05-16":SpiderFlow}
 var AllThreadNum int64
@@ -538,6 +539,8 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
 		//更新spider_listdata中数据下载失败标记
 		MgoS.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": -1}})
 		return
+	} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
+		util.RedisClusterSet(hashHref, "", -1) //全量redis中存入列表页href
 	}
 	//详情页过滤数据
 	set := map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}
@@ -995,12 +998,12 @@ func (s *Spider) DownloadDetail(reload bool, isHistory bool) {
 						return
 					}
 					//正文、附件分析,下载异常数据重新下载
-					if AnalysisProjectInfo(data) {
+					if r := AnalysisProjectInfo(data); r != "" { //顺序采集暂不加此块判断(异常数据不会加redis,导致一直下载)
 						times++
 						ss := map[string]interface{}{"times": times, "updatetime": time.Now().Unix()}
 						if times >= 3 { //3次下载失败今天不再下载,state置为-1
 							ss["state"] = -1
-							ss["detailfilerr"] = true
+							ss["detailfilerr"] = r
 						}
 						set := map[string]interface{}{"$set": ss}
 						update = append(update, query)
@@ -1081,9 +1084,12 @@ func NewSpiderByScript(num int, code string, info map[string]string, spChan chan
 }
 
//detail含“详情请访问原网页!”且附件未下载成功的,不计入下载成功
-func AnalysisProjectInfo(data map[string]interface{}) bool {
+func AnalysisProjectInfo(data map[string]interface{}) string {
 	defer qu.Catch()
 	detail := qu.ObjToString(data["detail"])
+	if RestrictAccessReg.MatchString(detail) { //限制访问
+		return "ip"
+	}
 	if detail == "详情请访问原网页!" || detail == "<br/>详情请访问原网页!" { //不判断包含关系因为有些数据为json拼接,字段不全,会加“详情请访问原网页”
 		if projectinfo, ok := data["projectinfo"].(map[string]interface{}); ok && len(projectinfo) > 0 {
 			if attachments, ok := projectinfo["attachments"].(map[string]interface{}); ok && len(attachments) > 0 {
@@ -1092,20 +1098,21 @@ func AnalysisProjectInfo(data map[string]interface{}) bool {
 					if d, ok := data.(map[string]interface{}); ok {
 						fid := qu.ObjToString(d["fid"])
 						if fid != "" { //附件上传成功
-							fileOk = true
-							break
+							return ""
 						}
 					}
 				}
-				return !fileOk
+				if !fileOk {
+					return "detail_file"
+				}
 			} else {
-				return true
+				return "detail_file"
 			}
 		} else {
-			return true
+			return "detail_file"
 		}
 	}
-	return false
+	return ""
 }
 
 //打印线程数

+ 15 - 13
src/spider/store.go

@@ -238,21 +238,23 @@ func SaveErrorData(modifyuser string, pd map[string]interface{}, err interface{}
 //保存modal=1模式采集的列表页信息
 func SaveHighListPageData(tmp map[string]interface{}, code, hashHref string, num *int) {
 	//列表页href判重
-	redisCode := lu.RedisClusterGet("list_" + hashHref)
-	//此处区分历史节点(7000)和增量节点
-	if redisCode != "" {
-		if lu.Config.IsHistoryEvent || strings.Contains(redisCode, code) { //列表页数据已采集
-			*num++
-			return
-		} else {
-			lu.RedisClusterSet("list_"+hashHref, code+"+"+redisCode, 86400*365*2) //两年
-		}
+	isExist := lu.RedisClusterExists("list_" + hashHref)
+	if isExist {
+		*num++
+		return
 	} else {
-		lu.RedisClusterSet("list_"+hashHref, code+"+"+redisCode, 86400*365*2) //两年
+		lu.RedisClusterSet("list_"+hashHref, "", 86400*365*2) //不存在,存两年
 	}
-	//if redisCode != "" && strings.Contains(redisCode, code) { //相同爬虫采集且href相同,表示重复
-	//	*num++
-	//	return
+
+	//redisCode := lu.RedisClusterGet("list_" + hashHref)
+	////此处区分历史节点(7000)和增量节点
+	//if redisCode != "" {
+	//	if lu.Config.IsHistoryEvent || strings.Contains(redisCode, code) { //列表页数据已采集
+	//		*num++
+	//		return
+	//	} else {
+	//		lu.RedisClusterSet("list_"+hashHref, code+"+"+redisCode, 86400*365*2) //两年
+	//	}
 	//} else {
 	//	lu.RedisClusterSet("list_"+hashHref, code+"+"+redisCode, 86400*365*2) //两年
 	//}