@@ -81,17 +81,18 @@ type Spider struct {
 var Es *es.Elastic
 var EsIndex string
 var EsType string
+var Mgo *mgo.MongodbSim
 var UpdataMgoCache = make(chan []map[string]interface{}, 1000)   //update the status of data queued for re-download
 var UpdataHeartCache = make(chan []map[string]interface{}, 1000) //update spider heartbeat info
 var SaveMgoCache = make(chan map[string]interface{}, 1000)       //save data the spider collected from other sites
 var SP = make(chan bool, 5)
 var SPH = make(chan bool, 5)
 var SPS = make(chan bool, 5)
-var Mgo *mgo.MongodbSim
 var TimeChan = make(chan bool, 1)
 var Reg = regexp.MustCompile(`(http|https)://([\w]+\.)+[\w]+(/?)`)
 var DomainNameReg = regexp.MustCompile(`(?://).+?(?:)[::/]`)
 var RepDomainNameReg = regexp.MustCompile(`[::/]+`)
+var DelaySites map[string]int //set of sites with delayed collection

 //heartbeat
 func UpdateHeart(site, channel, code, user, t string) {
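Note on the first hunk: moving `var Mgo *mgo.MongodbSim` up is cosmetic, but the new package-level `DelaySites` map replaces the `util.Config.DelaySites` lookups in the hunks below, so it has to be filled from the config once at startup. A minimal sketch of that init step, assuming `util.Config.DelaySites` is a `map[string]int` (the field name is taken from the removed expressions; the helper itself is hypothetical):

// InitDelaySites copies the delay table out of the loaded config into the
// package-level DelaySites map, so the download paths no longer reach into
// util.Config on every item. Sketch only; not part of the diff.
func InitDelaySites() {
	DelaySites = make(map[string]int, len(util.Config.DelaySites))
	for site, delay := range util.Config.DelaySites {
		DelaySites[site] = delay
	}
}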
@@ -570,7 +571,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 		return
 	}
 	isEsRepeat := false
-	if delayDay := util.Config.DelaySites[s.Name]; delayDay > 0 { //for competitor-like sites, dedupe titles against ES over the last 7 days (sequential collection cannot be delayed, so dedupe is the only option)
+	if delayDay := DelaySites[s.Name]; delayDay > 0 { //for competitor-like sites, dedupe titles against ES over the last 7 days (sequential collection cannot be delayed, so dedupe is the only option)
 		title := qu.ObjToString(paramdata["title"])
 		eTime := time.Now().Unix()
 		sTime := eTime - int64(7*86400)
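The dedupe window in this hunk is plain Unix arithmetic: `eTime` is now, `sTime` is seven days (7*86400 seconds) earlier, and the title is then checked against ES within [sTime, eTime]. A standalone sketch of the same window, with a hypothetical `countByTitle` standing in for the real query made through `Es`:

// isDuplicateTitle reports whether a title already appears in ES within the
// last seven days. countByTitle is a hypothetical stand-in for the actual
// Elastic query; the window arithmetic mirrors the hunk above.
func isDuplicateTitle(title string, countByTitle func(title string, sTime, eTime int64) int64) bool {
	eTime := time.Now().Unix()
	sTime := eTime - int64(7*86400)
	return countByTitle(title, sTime, eTime) > 0
}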
@@ -768,7 +769,7 @@ func (s *Spider) DownloadHighDetail() {
 	if !s.Stop { //the spider is running
 		comeintimeQuery := map[string]interface{}{"$gte": GetTime(-util.Config.DayNum)} //collect data from the past week, so items that repeatedly fail to download do not pile up
 		isEsRepeat := false //whether to dedupe via ES
-		if delayDay := util.Config.DelaySites[s.Name]; delayDay > 0 {
+		if delayDay := DelaySites[s.Name]; delayDay > 0 {
 			isEsRepeat = true
 			if delayDay <= util.Config.DayNum*24 { //check whether this spider belongs to a delayed-collection site; its data is collected delayDay hours late (7410, 7500 and 7700 collect sequentially and cannot be delayed)
 				//comeintimeQuery["$lte"] = GetTime(-delayDay + 1)
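Note the unit difference between this hunk and the next one: `DownloadHighDetail` compares `delayDay` against `util.Config.DayNum*24`, treating the delay as hours, while `DownloadListDetail` compares against `util.Config.DayNum` directly, treating it as days. A hedged sketch of that bounds check (the helper name is hypothetical):

// withinDelayWindow reports whether a configured delay still fits inside the
// DayNum-day collection window. The high-detail path interprets the delay in
// hours, the list-detail path in days, matching the two hunks.
func withinDelayWindow(delay, dayNum int, delayInHours bool) bool {
	if delayInHours {
		return delay <= dayNum*24 // DownloadHighDetail: delayDay in hours
	}
	return delay <= dayNum // DownloadListDetail: delayDay in days
}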
@@ -904,7 +905,7 @@ func (s *Spider) DownloadListDetail() {
 	}()
 	comeintimeQuery := map[string]interface{}{"$gte": GetTime(-util.Config.DayNum)} //collect data from the past week, so items that repeatedly fail to download do not pile up
 	isEsRepeat := false //whether to dedupe via ES
-	if delayDay := util.Config.DelaySites[s.Name]; delayDay > 0 {
+	if delayDay := DelaySites[s.Name]; delayDay > 0 {
 		isEsRepeat = true
 		if delayDay <= util.Config.DayNum { //check whether this spider belongs to a delayed-collection site; its data is collected delayDay days late (7410, 7500 and 7700 collect sequentially and cannot be delayed)
 			//comeintimeQuery["$lte"] = GetTime(-delayDay + 1)
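For reference, the `comeintimeQuery` built in the last two hunks is just the `$gte` side of a Mongo range filter; the commented-out `$lte` line shows where a delayed site's upper bound would go if it were re-enabled. A minimal sketch, assuming `GetTime(n)` returns the Unix timestamp n days from now (negative n in the past), as the surrounding comments imply:

// buildComeintimeQuery limits collection to documents inserted in the last
// dayNum days. If delayDay > 0, it also caps the upper bound, mirroring the
// "$lte" expression the diff leaves commented out.
func buildComeintimeQuery(dayNum, delayDay int) map[string]interface{} {
	q := map[string]interface{}{"$gte": GetTime(-dayNum)}
	if delayDay > 0 {
		q["$lte"] = GetTime(-delayDay + 1)
	}
	return q
}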