|
@@ -103,10 +103,14 @@ var TimeChan = make(chan bool, 1)
|
|
|
var Reg = regexp.MustCompile(`(http|https)://([\w]+\.)+[\w]+(/?)`)
|
|
|
var DomainNameReg = regexp.MustCompile(`(?://).+?(?:)[::/]`)
|
|
|
var RepDomainNameReg = regexp.MustCompile(`[::/]+`)
|
|
|
-var DelaySites map[string]int //延迟采集站点集合
|
|
|
var Today string
|
|
|
var SpiderFlowMap = sync.Map{} //code:{"2022-05-16":SpiderFlow}
|
|
|
var AllThreadNum int64
|
|
|
+var DelaySiteMap map[string]*DelaySite // Delayed-collection sites, looked up by Spider.Name; a missing entry yields nil.
|
|
|
+type DelaySite struct { // DelaySite holds the per-site delayed-collection settings stored in DelaySiteMap.
|
|
|
+ DelayTime int // Delay in hours: DownloadDetail subtracts 3600*DelayTime seconds from now for the comeintime "$lte" bound when DelayTime <= util.Config.DayNum*24.
|
|
|
+ Compete bool // Competitor-like site flag: gates the title check window in DownloadDetailItem and is copied into isEsRepeat by DownloadDetail (presumably the ES title duplicate check; confirm).
|
|
|
+}
|
|
|
|
|
|
//心跳
|
|
|
func UpdateHeart(site, channel, code, user, t string) {
|
|
@@ -596,7 +600,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
|
return
|
|
|
}
|
|
|
isEsRepeat := false
|
|
|
- if delayDay := DelaySites[s.Name]; delayDay > 0 { //类竞品站点爬虫title做es7天内判重检验(顺序采集无法延迟,只能判重)
|
|
|
+ if delaySite := DelaySiteMap[s.Name]; delaySite != nil && delaySite.Compete {
|
|
|
title := qu.ObjToString(paramdata["title"])
|
|
|
eTime := time.Now().Unix()
|
|
|
sTime := eTime - int64(7*86400)
|
|
@@ -809,11 +813,11 @@ func (s *Spider) DownloadDetail(reload bool, isHistory bool) {
|
|
|
o := map[string]interface{}{"_id": -1}
|
|
|
if !isHistory { //非历史数据下载,补充comeintime时间检索条件
|
|
|
comeintimeQuery := map[string]interface{}{"$gte": GetTime(-util.Config.DayNum)} //采集一周内的数据,防止有数据一直采不下来,造成积累
|
|
|
- if delayTime := DelaySites[s.Name]; delayTime > 0 {
|
|
|
- isEsRepeat = true
|
|
|
- if delayTime <= util.Config.DayNum*24 { //判断该爬虫是否属于要延迟采集的站点,数据延迟delayDay小时采集(由于7410、7500、7700为顺序采集,无法延时)
|
|
|
+ if delaySite := DelaySiteMap[s.Name]; delaySite != nil {
|
|
|
+ isEsRepeat = delaySite.Compete
|
|
|
+ if delaySite.DelayTime <= util.Config.DayNum*24 { //判断该爬虫是否属于要延迟采集的站点,数据延迟delayDay小时采集(由于7410、7500、7700为顺序采集,无法延时)
|
|
|
//comeintimeQuery["$lte"] = GetTime(-delayDay + 1)
|
|
|
- comeintimeQuery["$lte"] = time.Now().Unix() - int64(3600*delayTime)
|
|
|
+ comeintimeQuery["$lte"] = time.Now().Unix() - int64(3600*delaySite.DelayTime)
|
|
|
}
|
|
|
}
|
|
|
q["comeintime"] = comeintimeQuery
|