@@ -64,6 +64,7 @@ var UpdataMgoCache = make(chan []map[string]interface{}, 1000) //update cache of data to be re-downloaded
var SP = make(chan bool, 5)
var TimeChan = make(chan bool, 1)
var Reg = regexp.MustCompile(`(http|https)://([\w]+\.)+[\w]+(/?)`)
+var DelaySites map[string]int //set of sites whose collection is delayed

//High-performance mode: collect detail (third-level) page info on a schedule
func DetailData() {
@@ -89,6 +90,7 @@ func DownloadHighDetail(code string) {
for {
//logger.Info("spider code:", s.Code, "offline:", s.Stop)
//if !s.Stop { //the spider is in running state
+ //TODO delayed collection not yet implemented
/*
1. At the start of each round, first query the data downloaded that day
2. If this query returns no data, step back one day at a time and query again (tentatively 50 records)
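// A minimal sketch (separate from the diff) of the two-step query strategy described in the
// comment above: query the current day first, then step back one day at a time, fetching at
// most 50 records per round. It is written against gopkg.in/mgo.v2 instead of the project's
// MgoS wrapper, whose API is not shown; the database name, the "spidercode"/"state"/"comeintime"
// field names, and the 7-day lookback are assumptions.
package sketch

import (
	"time"

	mgo "gopkg.in/mgo.v2"
	"gopkg.in/mgo.v2/bson"
)

func queryRoundData(sess *mgo.Session, code string) []map[string]interface{} {
	coll := sess.DB("spider").C("spider_highlistdata")
	for back := 0; back < 7; back++ { // step back one day at a time
		day := time.Now().AddDate(0, 0, -back).Truncate(24 * time.Hour)
		q := bson.M{
			"spidercode": code,
			"state":      0, // not yet downloaded
			"comeintime": bson.M{"$gte": day.Unix(), "$lt": day.Add(24 * time.Hour).Unix()},
		}
		var rows []map[string]interface{}
		// tentatively 50 records per round, as in the original comment
		if err := coll.Find(q).Limit(50).All(&rows); err == nil && len(rows) > 0 {
			return rows
		}
	}
	return nil
}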
@@ -136,6 +138,28 @@ func DownloadHighDetail(code string) {
}()
_id := tmp["_id"]
query := map[string]interface{}{"_id": _id}
+ href := qu.ObjToString(tmp["href"])
+ //Because list-page redis dedup currently keys on href+code, the same href may exist as multiple records collected under different codes
+ //To avoid duplicate downloads, do an incremental redis dedup here
+ isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
+ if isExist {
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //already exists: set state to 1
+ MgoS.Update("spider_highlistdata", query, set, false, false)
+ return
+ }
+ // if isEsRepeat { //competitor data: es title dedup
+ // title := qu.ObjToString(tmp["title"])
+ // eTime := time.Now().Unix()
+ // sTime := eTime - int64(7*86400)
+ // esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
+ // count := Es.Count(EsIndex, EsType, esQuery)
+ // if count > 0 { //es already contains this title: skip collection and update the list-table record's state
+ // set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //already exists: set state to 1
+ // Mgo.Update("spider_highlistdata", query, set, false, false)
+ // util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+ // continue
+ // }
+ // }
competehref := qu.ObjToString(tmp["competehref"])
if competehref != "" { //check whether Jianyu has already collected this third-party site's data
title := qu.ObjToString(tmp["title"])
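// A minimal sketch (separate from the diff) of the incremental href dedup added above,
// using github.com/go-redis/redis/v8 in place of the project's util.ExistRedis /
// util.PutRedis helpers, whose wiring is not shown; the key prefix and the one-year TTL
// mirror the diff, while the client setup is an assumption.
package sketch

import (
	"context"
	"time"

	"github.com/go-redis/redis/v8"
)

// hrefSeen reports whether this href has already been collected.
func hrefSeen(ctx context.Context, rdb *redis.Client, href string) (bool, error) {
	n, err := rdb.Exists(ctx, "url_repeat_"+href).Result()
	return n > 0, err
}

// markHref records the href after a successful download, with the 365-day TTL used in the diff.
func markHref(ctx context.Context, rdb *redis.Client, href string) error {
	return rdb.Set(ctx, "url_repeat_"+href, href, 365*24*time.Hour).Err()
}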
@@ -150,7 +174,6 @@ func DownloadHighDetail(code string) {
success := true //flag for whether the data was downloaded successfully
delete(tmp, "_id")
delete(tmp, "times")
- href := qu.ObjToString(tmp["href"])
data := map[string]interface{}{}
var err interface{}
for k, v := range tmp {
@@ -177,7 +200,7 @@ func DownloadHighDetail(code string) {
} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //the detail page replaced the href, so the before/after hrefs differ
log.Println("beforeHref:", href, "afterHref:", tmphref)
//incremental
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
//full
db := HexToBigIntMod(tmphref)
hashHref := HexText(href)
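// The "full" dedup path above routes each href to a shard via HexToBigIntMod and stores a
// hashed form produced by HexText; neither helper appears in this diff, so the following is
// only a plausible sketch of that idea under stated assumptions: md5 the href, read the hex
// digest as a big integer, and take it modulo the shard count.
package sketch

import (
	"crypto/md5"
	"encoding/hex"
	"math/big"
)

// shardForHref returns the shard index and the hex digest for a given href.
func shardForHref(href string, shards int64) (int64, string) {
	sum := md5.Sum([]byte(href))
	digest := hex.EncodeToString(sum[:])
	n := new(big.Int)
	n.SetString(digest, 16)
	return new(big.Int).Mod(n, big.NewInt(shards)).Int64(), digest
}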