
Adjust infinite pagination logic

maxiaoshan 3 years ago
parent
commit
33d8e6143a
5 changed files with 47 additions and 23 deletions
  1. src/config.json (+ 3 - 3)
  2. src/spider/handler.go (+ 3 - 0)
  3. src/spider/msgservice.go (+ 1 - 1)
  4. src/spider/spider.go (+ 39 - 18)
  5. src/spider/store.go (+ 1 - 1)

+ 3 - 3
src/config.json

@@ -6,10 +6,10 @@
     "editor_dbsize": 2,
     "editoraddr": "http://127.0.0.1:6011/spider/infos",
     "msgname": "爬虫监控中心_zjk",
-    "msgserveraddr": "spdata.jianyu360.com:801",
+    "msgserveraddr": "spdata.jianyu360.com:803",
     "msgserveraddrfile": "spdata.jianyu360.com:802",
 	"isdelay":false,
-    "working": 1,
+    "working": 0,
     "chansize": 4,
     "detailchansize": 20,
     "uploadevent": 7100,
@@ -19,7 +19,7 @@
     "ishistoryevent": false,
     "tesseractadd": "http://test.qmx.top:1688",
     "testdir": "res/test/spider_test.lua",
-    "redisservers": "title_repeat_judgement=192.168.3.207:1679,title_repeat_fulljudgement=192.168.3.207:1679,title_repeat_listpagehref=192.168.3.207:1679",
+    "redisservers": "title_repeat_judgement=192.168.3.207:2679,title_repeat_fulljudgement=192.168.3.207:2679,title_repeat_listpagehref=192.168.3.207:1679",
      "word":{
     	"keyword":"(抽签|中标|招标|成交|合同|中标候选人|资格预审|拟建|邀请|询价|比选|议价|竞价|磋商|采购|招投标|答疑|变更公告|更正公告|竞争性谈判|竞谈|意见征询|澄清|单一来源|流标|废标|验收公告|中止|终止|违规|处罚|征集公告|开标结果|评审结果|监理|招租|租赁|评判结果|项目|遴选|补遗|竞标|征求意见)",
     	"notkeyword":"(招聘|拍卖|出租|出让|使用权|资产)"

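The functional change in config.json is the Redis pool addresses: title_repeat_judgement and title_repeat_fulljudgement move from port 1679 to 2679 while title_repeat_listpagehref stays on 1679 (working is also switched from 1 to 0, and msgserveraddr from port 801 to 803). The redisservers value is a comma-separated list of name=host:port pairs; below is a minimal sketch of parsing that format, assuming a hypothetical parseRedisServers helper that is not part of this repo.

package main

import (
	"fmt"
	"strings"
)

// parseRedisServers is a hypothetical helper: it splits a
// "name=host:port,name=host:port" string into a name -> address map.
func parseRedisServers(s string) map[string]string {
	servers := map[string]string{}
	for _, pair := range strings.Split(s, ",") {
		if kv := strings.SplitN(pair, "=", 2); len(kv) == 2 {
			servers[kv[0]] = kv[1]
		}
	}
	return servers
}

func main() {
	conf := "title_repeat_judgement=192.168.3.207:2679,title_repeat_fulljudgement=192.168.3.207:2679,title_repeat_listpagehref=192.168.3.207:1679"
	fmt.Println(parseRedisServers(conf)["title_repeat_judgement"]) // 192.168.3.207:2679
}
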
+ 3 - 0
src/spider/handler.go

@@ -1232,10 +1232,12 @@ func SpiderInfoSend() {
 //Save heartbeat info
 func SaveHeartInfo() {
 	time.Sleep(30 * time.Second)
+	num := 0
 	SpiderHeart.Range(func(key, value interface{}) bool {
 		code := key.(string)
 		heart, ok := value.(*Heart)
 		if ok {
+			num++
 			update := []map[string]interface{}{}
 			update = append(update, map[string]interface{}{"code": code})
 			update = append(update, map[string]interface{}{"$set": map[string]interface{}{
@@ -1254,6 +1256,7 @@ func SaveHeartInfo() {
 		}
 		return true
 	})
+	logger.Info("更新心跳个数:", num)
 	time.AfterFunc(20*time.Minute, SaveHeartInfo)
 }
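
The handler.go change adds a counter so the heartbeat flusher reports how many spiders it touched on each pass. A minimal standalone sketch of that count-and-reschedule pattern around sync.Map.Range and time.AfterFunc follows (the 20-minute interval comes from the diff; the Mongo update itself is omitted):

package main

import (
	"log"
	"sync"
	"time"
)

// spiderHeart stands in for SpiderHeart: spider code -> last heartbeat.
var spiderHeart sync.Map

// saveHeartInfo walks the map, counts the entries it would persist,
// logs the total, then re-arms itself, mirroring SaveHeartInfo.
func saveHeartInfo() {
	num := 0
	spiderHeart.Range(func(key, value interface{}) bool {
		num++ // one persisted heartbeat per entry
		return true
	})
	log.Println("heartbeats updated:", num)
	time.AfterFunc(20*time.Minute, saveHeartInfo)
}

func main() {
	spiderHeart.Store("a01", time.Now())
	spiderHeart.Store("a02", time.Now())
	saveHeartInfo() // logs 2 and schedules the next pass
}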
 

+ 1 - 1
src/spider/msgservice.go

@@ -292,7 +292,7 @@ func SaveObj(event int, checkAtrr string, data map[string]interface{}, saveredis
 			}
 			//Data the save service fails to receive is stored in data_bak; preventing data loss relies on the resend program
 			if id != "" {
-				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
+				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
 				if !flag { //save service sent successfully
 					//full set (check existence first to avoid overwriting the id)
 					isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref)
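
Every PutRedis call on the url_repeat_ keys now uses 3600*24*365 instead of 3600*24*30, so the incremental dedup entries live for one year (31,536,000 s) rather than 30 days (2,592,000 s). A tiny sketch naming the two values (the constant names are illustrative only, not from the repo):

package main

import "fmt"

const (
	oldURLRepeatTTL = 3600 * 24 * 30  // previous lifetime: 30 days
	newURLRepeatTTL = 3600 * 24 * 365 // lifetime after this commit: one year
)

func main() {
	fmt.Println(oldURLRepeatTTL, newURLRepeatTTL) // 2592000 31536000
}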

+ 39 - 18
src/spider/spider.go

@@ -269,7 +269,10 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 			UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list") //record list-page heartbeats for all nodes
 		}
 		//qu.Debug("重复页:", repeatPageNum, "	配置最大页:", tmpMax, "	最终最大页:", max, "	当前页:", start, "重复次数:", repeatPageTimes)
-		if start > tmpMax && isRunRepeatList && repeatPageTimes >= 5 { //more than 5 repeats, stop paging
+		//if start > tmpMax && isRunRepeatList && repeatPageTimes >= 5 { //more than 5 repeats, stop paging
+		//	break
+		//}
+		if isRunRepeatList && repeatPageTimes >= 10 { //more than 10 repeats, stop paging
 			break
 		}
 		if err := s.L.CallByParam(lua.P{
@@ -278,9 +281,24 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 			Protect: true,
 		}, lua.LNumber(start)); err != nil {
 			//panic(s.Code + "," + err.Error())
-			log.Println(s.Code + "," + err.Error())
+			logger.Error("列表页采集报错", start, s.Code+","+err.Error())
 			errs = err.Error()
 			atomic.AddInt32(&s.Script.ErrorNum, 1)
+			//retry on list-page crawl error; once the retry limit is exceeded, treat the page as already crawled
+			if downtimes < 2 {
+				downtimes++
+				start--
+				//} else if start > tmpMax && isRunRepeatList { //retry limit exceeded, treat this page as repeated
+			} else if isRunRepeatList { //retry limit exceeded, treat this page as repeated
+				if repeatPageNum+1 == start {
+					repeatPageTimes++ //increment the count
+				} else {
+					repeatPageTimes = 0 //reset the repeat count to 0
+				}
+				repeatPageNum = start //record the page number
+				downtimes = 0
+			}
+			continue
 		}
 		lv := s.L.Get(-1)
 		s.L.Pop(1)
@@ -307,7 +325,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 								db := HexToBigIntMod(href) //select the Redis db from the href hash
 								hashHref := HexText(href)
 								//incremental (Redis default db0)
-								util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
+								util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
 								//full set (check existence to avoid overwriting the id)
 								isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref)
 								if !isExist {
@@ -320,11 +338,12 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 					s.HistoricalMendDownloadDetailItem(tmp) //historical backfill: download the detail page
 				}
 			}
-				if start <= tmpMax { //tally the counts
-					repeatAllNum += repeatListNum
-					downloadAllNum += tabLen
-				}
-				if start > tmpMax && isRunRepeatList { //run the consecutive-page dedup check
+				//if start <= tmpMax { //tally the counts
+				repeatAllNum += repeatListNum
+				downloadAllNum += tabLen
+				//}
+				//if start > tmpMax && isRunRepeatList { //run the consecutive-page dedup check
+				if isRunRepeatList { //run the consecutive-page dedup check
 					if repeatListNum >= tabLen { //all rows on the current list page have already been crawled
 						//qu.Debug("重复页:", repeatPageNum, "当前页:", start)
 						if repeatPageNum+1 == start || repeatPageNum == 0 {
@@ -348,7 +367,8 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 					downtimes++
 					start--
 					continue
-				} else if start > tmpMax && isRunRepeatList { //retry limit exceeded, treat this page as repeated
+					//} else if start > tmpMax && isRunRepeatList { //retry limit exceeded, treat this page as repeated
+				} else if isRunRepeatList { //retry limit exceeded, treat this page as repeated
 					if repeatPageNum+1 == start {
 						repeatPageTimes++ //increment the count
 					} else {
@@ -362,7 +382,8 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 				downtimes++
 				start--
 				continue
-			} else if start > tmpMax && isRunRepeatList { //retry limit exceeded, treat this page as repeated
+				//} else if start > tmpMax && isRunRepeatList { //retry limit exceeded, treat this page as repeated
+			} else if isRunRepeatList { //retry limit exceeded, treat this page as repeated
 				if repeatPageNum+1 == start {
 					repeatPageTimes++ //increment the count
 				} else {
@@ -554,7 +575,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 	if util.Config.Modal == 1 { //all nodes except 7000, 7500, 7700 only crawl list-page info
 		isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
 		if isExist { //refresh the redis TTL
-			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
+			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
 			*num++ //already crawled
 			return
 		}
@@ -566,7 +587,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 		}
 		isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
 		if isExist { //refresh the redis TTL
-			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
+			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
 			*num++ //already crawled
 			return
 		}
@@ -582,7 +603,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 		}
 		SaveListPageData(paramdata, &id, isEsRepeat) //save list-page data crawled by nodes 7000, 7410, 7500, 7700
 		if isEsRepeat {                              //competitor-like data: add the title dedup entry to redis
-			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
+			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
 			return
 		}
 	}
@@ -603,7 +624,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 	} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //detail-page href replacement makes the before/after href differ
 		log.Println("beforeHref:", href, "afterHref:", tmphref)
 		//incremental
-		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
 		//full set
 		db := HexToBigIntMod(href)
 		hashHref := HexText(href)
@@ -805,7 +826,7 @@ func (s *Spider) DownloadHighDetail() {
 						if count > 0 { //es already has data with this title; skip crawling and update the list table state
 							set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //already exists, set state to 1
 							Mgo.Update("spider_highlistdata", query, set, false, false)
-							util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
+							util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
 							continue
 						}
 					}
@@ -837,7 +858,7 @@ func (s *Spider) DownloadHighDetail() {
 					} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //detail-page href replacement makes the before/after href differ
 						log.Println("beforeHref:", href, "afterHref:", tmphref)
 						//incremental
-						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
+						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
 						//full set
 						db := HexToBigIntMod(href)
 						hashHref := HexText(href)
@@ -940,7 +961,7 @@ func (s *Spider) DownloadListDetail() {
 				if Es.Count(EsIndex, EsType, esQuery) > 0 { //es already has data with this title; skip crawling and update the list table state
 					set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //already exists, set state to 1
 					Mgo.Update("spider_highlistdata", query, set, false, false)
-					util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
+					util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
 					continue
 				}
 			}
@@ -972,7 +993,7 @@ func (s *Spider) DownloadListDetail() {
 			} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //detail-page href replacement makes the before/after href differ
 				log.Println("beforeHref:", href, "afterHref:", tmphref)
 				//incremental
-				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
+				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
 				//full set
 				db := HexToBigIntMod(href)
 				hashHref := HexText(href)
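
Taken together, the spider.go changes decouple the stop condition from tmpMax: every page (not only pages past the configured maximum) now counts toward the consecutive-repeat streak, list-page errors are retried twice before the page is treated as repeated, and the loop breaks after 10 consecutive repeated pages instead of 5. A simplified sketch of that loop, assuming a hypothetical fetchPage helper in place of the Lua call and omitting the error-retry branch:

package main

import "fmt"

// fetchPage is a hypothetical stand-in for the Lua list-page call: it returns
// how many rows the page yielded and how many of them were already crawled.
func fetchPage(page int) (tabLen, repeatListNum int) {
	if page > 3 {
		return 5, 5 // pages beyond 3 only contain already-seen rows
	}
	return 5, 0
}

func main() {
	isRunRepeatList := true
	repeatPageNum, repeatPageTimes := 0, 0
	for start := 1; ; start++ {
		// 10 consecutive repeated pages: stop paging (was 5, and only past tmpMax)
		if isRunRepeatList && repeatPageTimes >= 10 {
			break
		}
		tabLen, repeatListNum := fetchPage(start)
		if isRunRepeatList { // consecutive-page dedup check
			if repeatListNum >= tabLen { // every row on this page was already crawled
				if repeatPageNum+1 == start || repeatPageNum == 0 {
					repeatPageTimes++ // extend the streak
				} else {
					repeatPageTimes = 0 // the streak was broken
				}
				repeatPageNum = start
			} else {
				repeatPageTimes = 0 // a page with new rows resets the streak
			}
		}
		fmt.Println("page", start, "consecutive repeats:", repeatPageTimes)
	}
}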

+ 1 - 1
src/spider/store.go

@@ -81,7 +81,7 @@ func Store(mode, event int, c, coverAttr string, data map[string]interface{}, fl
 			db := HexToBigIntMod(href) //select the Redis db from the href hash
 			hashHref := HexText(href)
 			//incremental
-			lu.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
+			lu.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
 			//full set
 			isExist, _ := lu.ExistRedis("title_repeat_fulljudgement", db, hashHref)
 			if !isExist {