3 жил өмнө · 33d8e6143a
--- a/src/config.json
+++ b/src/config.json
@@ -6,10 +6,10 @@
 
															     "editor_dbsize": 2,
														
 
															     "editoraddr": "http://127.0.0.1:6011/spider/infos",
														
 
															     "msgname": "爬虫监控中心_zjk",
														
 
															-    "msgserveraddr": "spdata.jianyu360.com:801",
														
 
															+    "msgserveraddr": "spdata.jianyu360.com:803",
														
 
															     "msgserveraddrfile": "spdata.jianyu360.com:802",
														
 
															 	"isdelay":false,
														
 
															-    "working": 1,
														
 
															+    "working": 0,
														
 
															     "chansize": 4,
														
 
															     "detailchansize": 20,
														
 
															     "uploadevent": 7100,
														
@@ -19,7 +19,7 @@
 
															     "ishistoryevent": false,
														
 
															     "tesseractadd": "http://test.qmx.top:1688",
														
 
															     "testdir": "res/test/spider_test.lua",
														
 
															-    "redisservers": "title_repeat_judgement=192.168.3.207:1679,title_repeat_fulljudgement=192.168.3.207:1679,title_repeat_listpagehref=192.168.3.207:1679",
														
 
															+    "redisservers": "title_repeat_judgement=192.168.3.207:2679,title_repeat_fulljudgement=192.168.3.207:2679,title_repeat_listpagehref=192.168.3.207:1679",
														
 
															      "word":{
														
 
															     	"keyword":"(抽签|中标|招标|成交|合同|中标候选人|资格预审|拟建|邀请|询价|比选|议价|竞价|磋商|采购|招投标|答疑|变更公告|更正公告|竞争性谈判|竞谈|意见征询|澄清|单一来源|流标|废标|验收公告|中止|终止|违规|处罚|征集公告|开标结果|评审结果|监理|招租|租赁|评判结果|项目|遴选|补遗|竞标|征求意见)",
														
 
															     	"notkeyword":"(招聘|拍卖|出租|出让|使用权|资产)"
														
--- a/src/spider/handler.go
+++ b/src/spider/handler.go
@@ -1232,10 +1232,12 @@ func SpiderInfoSend() {
 
															 //保存心跳信息
														
 
															 func SaveHeartInfo() {
														
 
															 	time.Sleep(30 * time.Second)
														
 
															+	num := 0
														
 
															 	SpiderHeart.Range(func(key, value interface{}) bool {
														
 
															 		code := key.(string)
														
 
															 		heart, ok := value.(*Heart)
														
 
															 		if ok {
														
 
															+			num++
														
 
															 			update := []map[string]interface{}{}
														
 
															 			update = append(update, map[string]interface{}{"code": code})
														
 
															 			update = append(update, map[string]interface{}{"$set": map[string]interface{}{
														
@@ -1254,6 +1256,7 @@ func SaveHeartInfo() {
 
															 		}
														
 
															 		return true
														
 
															 	})
														
 
															+	logger.Info("更新心跳个数：", num)
														
 
															 	time.AfterFunc(20*time.Minute, SaveHeartInfo)
														
 
															 }
														
--- a/src/spider/msgservice.go
+++ b/src/spider/msgservice.go
@@ -292,7 +292,7 @@ func SaveObj(event int, checkAtrr string, data map[string]interface{}, saveredis
 
															 			}
														
 
															 			//保存服务未接收成功的数据会存入data_bak中，确保数据不丢失依赖补发程序
														
 
															 			if id != "" {
														
 
															-				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
														
 
															+				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
														
 
															 				if !flag { //保存服务发送成功
														
 
															 					//全量(判断是否已存在防止覆盖id)
														
 
															 					isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref)
														
--- a/src/spider/spider.go
+++ b/src/spider/spider.go
@@ -269,7 +269,10 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 
															 			UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list") //记录所有节点列表页心跳
														
 
															 		}
														
 
															 		//qu.Debug("重复页：", repeatPageNum, "	配置最大页：", tmpMax, "	最终最大页：", max, "	当前页：", start, "重复次数：", repeatPageTimes)
														
 
															-		if start > tmpMax && isRunRepeatList && repeatPageTimes >= 5 { //重复次数超过5次，不再翻页
														
 
															+		//if start > tmpMax && isRunRepeatList && repeatPageTimes >= 5 { //重复次数超过5次，不再翻页
														
 
															+		//	break
														
 
															+		//}
														
 
															+		if isRunRepeatList && repeatPageTimes >= 10 { //重复次数超过10次，不再翻页
														
 
															 			break
														
 
															 		}
														
 
															 		if err := s.L.CallByParam(lua.P{
														
@@ -278,9 +281,24 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 
															 			Protect: true,
														
 
															 		}, lua.LNumber(start)); err != nil {
														
 
															 			//panic(s.Code + "," + err.Error())
														
 
															-			log.Println(s.Code + "," + err.Error())
														
 
															+			logger.Error("列表页采集报错", start, s.Code+","+err.Error())
														
 
															 			errs = err.Error()
														
 
															 			atomic.AddInt32(&s.Script.ErrorNum, 1)
														
 
															+			//列表页采集报错进行重试，超过重试次数视为该页已采
														
 
															+			if downtimes < 2 {
														
 
															+				downtimes++
														
 
															+				start--
														
 
															+				//} else if start > tmpMax && isRunRepeatList { //超过重试次数，视为本页重复
														
 
															+			} else if isRunRepeatList { //超过重试次数，视为本页重复
														
 
															+				if repeatPageNum+1 == start {
														
 
															+					repeatPageTimes++ //次数加1
														
 
															+				} else {
														
 
															+					repeatPageTimes = 0 //重复次数重置0
														
 
															+				}
														
 
															+				repeatPageNum = start //赋值页码
														
 
															+				downtimes = 0
														
 
															+			}
														
 
															+			continue
														
 
															 		}
														
 
															 		lv := s.L.Get(-1)
														
 
															 		s.L.Pop(1)
														
@@ -307,7 +325,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 
															 								db := HexToBigIntMod(href) //根据href的哈希值选择Redis的db
														
 
															 								hashHref := HexText(href)
														
 
															 								//增量(redis默认db0)
														
 
															-								util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
														
 
															+								util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
														
 
															 								//全量(判断是否已存在防止覆盖id)
														
 
															 								isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref)
														
 
															 								if !isExist {
														
@@ -320,11 +338,12 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 
															 						s.HistoricalMendDownloadDetailItem(tmp) //历史补漏下载三级页
														
 
															 					}
														
 
															 				}
														
 
															-				if start <= tmpMax { //数量赋值
														
 
															-					repeatAllNum += repeatListNum
														
 
															-					downloadAllNum += tabLen
														
 
															-				}
														
 
															-				if start > tmpMax && isRunRepeatList { //执行连续页码判重
														
 
															+				//if start <= tmpMax { //数量赋值
														
 
															+				repeatAllNum += repeatListNum
														
 
															+				downloadAllNum += tabLen
														
 
															+				//}
														
 
															+				//if start > tmpMax && isRunRepeatList { //执行连续页码判重
														
 
															+				if isRunRepeatList { //执行连续页码判重
														
 
															 					if repeatListNum >= tabLen { //当前start列表页全部数据都已采集
														
 
															 						//qu.Debug("重复页：", repeatPageNum, "当前页：", start)
														
 
															 						if repeatPageNum+1 == start || repeatPageNum == 0 {
														
@@ -348,7 +367,8 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 
															 					downtimes++
														
 
															 					start--
														
 
															 					continue
														
 
															-				} else if start > tmpMax && isRunRepeatList { //超过重试次数，视为本页重复
														
 
															+					//} else if start > tmpMax && isRunRepeatList { //超过重试次数，视为本页重复
														
 
															+				} else if isRunRepeatList { //超过重试次数，视为本页重复
														
 
															 					if repeatPageNum+1 == start {
														
 
															 						repeatPageTimes++ //次数加1
														
 
															 					} else {
														
@@ -362,7 +382,8 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 
															 				downtimes++
														
 
															 				start--
														
 
															 				continue
														
 
															-			} else if start > tmpMax && isRunRepeatList { //超过重试次数，视为本页重复
														
 
															+				//} else if start > tmpMax && isRunRepeatList { //超过重试次数，视为本页重复
														
 
															+			} else if isRunRepeatList { //超过重试次数，视为本页重复
														
 
															 				if repeatPageNum+1 == start {
														
 
															 					repeatPageTimes++ //次数加1
														
 
															 				} else {
														
@@ -554,7 +575,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 
															 	if util.Config.Modal == 1 { //除7000、7500、7700节点外所有节点只采集列表页信息
														
 
															 		isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
														
 
															 		if isExist { //更新redis生命周期
														
 
															-			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
														
 
															+			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
														
 
															 			*num++ //已采集
														
 
															 			return
														
 
															 		}
														
@@ -566,7 +587,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 
															 		}
														
 
															 		isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
														
 
															 		if isExist { //更新redis生命周期
														
 
															-			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
														
 
															+			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
														
 
															 			*num++ //已采集
														
 
															 			return
														
 
															 		}
														
@@ -582,7 +603,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 
															 		}
														
 
															 		SaveListPageData(paramdata, &id, isEsRepeat) //保存7000、7410、7500、7700节点列表页采集的信息
														
 
															 		if isEsRepeat {                              //类竞品数据title判重数据加入redis
														
 
															-			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
														
 
															+			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
														
 
															 			return
														
 
															 		}
														
 
															 	}
														
@@ -603,7 +624,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 
															 	} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
														
 
															 		log.Println("beforeHref:", href, "afterHref:", tmphref)
														
 
															 		//增量
														
 
															-		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
														
 
															+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
														
 
															 		//全量
														
 
															 		db := HexToBigIntMod(href)
														
 
															 		hashHref := HexText(href)
														
@@ -805,7 +826,7 @@ func (s *Spider) DownloadHighDetail() {
 
															 						if count > 0 { //es中含本title数据，不再采集，更新list表数据状态
														
 
															 							set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
														
 
															 							Mgo.Update("spider_highlistdata", query, set, false, false)
														
 
															-							util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
														
 
															+							util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
														
 
															 							continue
														
 
															 						}
														
 
															 					}
														
@@ -837,7 +858,7 @@ func (s *Spider) DownloadHighDetail() {
 
															 					} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
														
 
															 						log.Println("beforeHref:", href, "afterHref:", tmphref)
														
 
															 						//增量
														
 
															-						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
														
 
															+						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
														
 
															 						//全量
														
 
															 						db := HexToBigIntMod(href)
														
 
															 						hashHref := HexText(href)
														
@@ -940,7 +961,7 @@ func (s *Spider) DownloadListDetail() {
 
															 				if Es.Count(EsIndex, EsType, esQuery) > 0 { //es中含本title数据，不再采集，更新list表数据状态
														
 
															 					set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
														
 
															 					Mgo.Update("spider_highlistdata", query, set, false, false)
														
 
															-					util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
														
 
															+					util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
														
 
															 					continue
														
 
															 				}
														
 
															 			}
														
@@ -972,7 +993,7 @@ func (s *Spider) DownloadListDetail() {
 
															 			} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
														
 
															 				log.Println("beforeHref:", href, "afterHref:", tmphref)
														
 
															 				//增量
														
 
															-				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
														
 
															+				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
														
 
															 				//全量
														
 
															 				db := HexToBigIntMod(href)
														
 
															 				hashHref := HexText(href)
														
--- a/src/spider/store.go
+++ b/src/spider/store.go
@@ -81,7 +81,7 @@ func Store(mode, event int, c, coverAttr string, data map[string]interface{}, fl
 
															 			db := HexToBigIntMod(href) //根据href的哈希值选择Redis的db
														
 
															 			hashHref := HexText(href)
														
 
															 			//增量
														
 
															-			lu.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
														
 
															+			lu.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
														
 
															 			//全量
														
 
															 			isExist, _ := lu.ExistRedis("title_repeat_fulljudgement", db, hashHref)
														
 
															 			if !isExist {