
Adjust delayed collection

maxiaoshan, 2 years ago
commit 5dbf2ca411
3 files changed, with 17 additions and 9 deletions
  1. src/main.go (+1 -1)
  2. src/spider/spider.go (+10 -6)
  3. src/spider/util.go (+6 -2)

src/main.go (+1 -1)

@@ -1,7 +1,7 @@
 package main
 
 import (
-	codegrpc "analysiscode"
+	codegrpc "analysiscode/client"
 	_ "filter"
 	"fmt"
 	"io/ioutil"

src/spider/spider.go (+10 -6)

@@ -103,10 +103,14 @@ var TimeChan = make(chan bool, 1)
 var Reg = regexp.MustCompile(`(http|https)://([\w]+\.)+[\w]+(/?)`)
 var DomainNameReg = regexp.MustCompile(`(?://).+?(?:)[::/]`)
 var RepDomainNameReg = regexp.MustCompile(`[::/]+`)
-var DelaySites map[string]int //set of sites with delayed collection
 var Today string
 var SpiderFlowMap = sync.Map{} //code:{"2022-05-16":SpiderFlow}
 var AllThreadNum int64
+var DelaySiteMap map[string]*DelaySite //set of sites with delayed collection
+type DelaySite struct {
+	DelayTime int
+	Compete   bool
+}
 
 //heartbeat
 func UpdateHeart(site, channel, code, user, t string) {
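
Note: switching from map[string]int to map[string]*DelaySite means an unconfigured site now looks up as nil rather than a zero delay, so "configured at all" and the competitor flag can be tested separately. A minimal standalone sketch of that lookup pattern (illustrative only, not the project's code):

package main

import "fmt"

// DelaySite mirrors the struct added in spider.go.
type DelaySite struct {
	DelayTime int  // delay in hours
	Compete   bool // competitor-like site: dedup by title on top of delaying
}

func main() {
	delaySiteMap := map[string]*DelaySite{
		"siteA": {DelayTime: 24, Compete: true},
	}
	for _, name := range []string{"siteA", "siteB"} {
		if ds := delaySiteMap[name]; ds != nil {
			fmt.Printf("%s: delay %dh, compete=%v\n", name, ds.DelayTime, ds.Compete)
		} else {
			fmt.Printf("%s: not a delayed-collection site\n", name)
		}
	}
}
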
@@ -596,7 +600,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 			return
 		}
 		isEsRepeat := false
-		if delayDay := DelaySites[s.Name]; delayDay > 0 { //for competitor-like sites, dedup the title against ES over the last 7 days (sequential collection cannot be delayed, so dedup is the only option)
+		if delaySite := DelaySiteMap[s.Name]; delaySite != nil && delaySite.Compete {
 			title := qu.ObjToString(paramdata["title"])
 			eTime := time.Now().Unix()
 			sTime := eTime - int64(7*86400)
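
Note: in DownloadDetailItem the ES title dedup is now gated on the Compete flag rather than on a positive delay; the check looks back 7 days in Unix seconds. A small sketch of the window arithmetic (illustrative only; the actual ES query is not shown in this hunk):

package main

import (
	"fmt"
	"time"
)

// dedupWindow returns the [sTime, eTime] Unix-second range used for the
// 7-day title dedup check.
func dedupWindow(now time.Time) (sTime, eTime int64) {
	eTime = now.Unix()
	sTime = eTime - int64(7*86400) // 7 days back
	return sTime, eTime
}

func main() {
	s, e := dedupWindow(time.Now())
	fmt.Printf("dedup window: %d .. %d\n", s, e)
}
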
@@ -809,11 +813,11 @@ func (s *Spider) DownloadDetail(reload bool, isHistory bool) {
 	o := map[string]interface{}{"_id": -1}
 	if !isHistory { //not a historical download: add a comeintime time filter
 		comeintimeQuery := map[string]interface{}{"$gte": GetTime(-util.Config.DayNum)} //collect data from the last week, so items that never get downloaded do not pile up
-		if delayTime := DelaySites[s.Name]; delayTime > 0 {
-			isEsRepeat = true
-			if delayTime <= util.Config.DayNum*24 { //check whether this spider belongs to a delayed-collection site; its data is collected delayDay hours late (7410, 7500 and 7700 collect sequentially and cannot be delayed)
+		if delaySite := DelaySiteMap[s.Name]; delaySite != nil {
+			isEsRepeat = delaySite.Compete
+			if delaySite.DelayTime <= util.Config.DayNum*24 { //check whether this spider belongs to a delayed-collection site; its data is collected DelayTime hours late (7410, 7500 and 7700 collect sequentially and cannot be delayed)
 				//comeintimeQuery["$lte"] = GetTime(-delayDay + 1)
-				comeintimeQuery["$lte"] = time.Now().Unix() - int64(3600*delayTime)
+				comeintimeQuery["$lte"] = time.Now().Unix() - int64(3600*delaySite.DelayTime)
 			}
 		}
 		q["comeintime"] = comeintimeQuery

src/spider/util.go (+6 -2)

@@ -12,11 +12,15 @@ var ErrFid = "a6879f0a8570256aa21fb978e6dabb50429a30dfacff697cf0b898abbc5c262e"
 //initialize the set of sites with delayed collection
 func InitOther() {
 	defer qu.Catch()
-	DelaySites = map[string]int{}
+	DelaySiteMap = map[string]*DelaySite{}
 	list, _ := MgoS.Find("spider_compete", nil, nil, nil, false, -1, -1)
 	for _, l := range *list {
 		site := qu.ObjToString(l["site"])
 		delayTime := qu.IntAll(l["delaytime"])
-		DelaySites[site] = delayTime
+		compete, _ := l["compete"].(bool)
+		DelaySiteMap[site] = &DelaySite{
+			DelayTime: delayTime,
+			Compete:   compete,
+		}
 	}
 }
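
Note: InitOther reads site, delaytime and compete from each spider_compete document. A sketch of that mapping on a hand-built document (the real collection may carry more fields, and MgoS is the project's Mongo wrapper, not shown here):

package main

import "fmt"

type DelaySite struct {
	DelayTime int
	Compete   bool
}

func main() {
	// Assumed minimal shape of one spider_compete document.
	doc := map[string]interface{}{
		"site":      "example-site",
		"delaytime": 24,   // hours
		"compete":   true, // competitor-like site -> ES title dedup
	}

	delaySiteMap := map[string]*DelaySite{}
	compete, _ := doc["compete"].(bool)
	delaySiteMap[doc["site"].(string)] = &DelaySite{
		DelayTime: doc["delaytime"].(int),
		Compete:   compete,
	}
	fmt.Printf("%+v\n", *delaySiteMap["example-site"])
}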