@@ -13,6 +13,7 @@ import (
 	"math/rand"
 	mu "mfw/util"
 	qu "qfw/util"
+	es "qfw/util/elastic"
 	"regexp"
 	util "spiderutil"
 	"strings"
@@ -60,6 +61,9 @@ type Spider struct {
 	IsCompete bool // distinguishes new-style vs. old-style spiders
 }
 
+var Es *es.Elastic
+var EsIndex string
+var EsType string
 var UpdataMgoCache = make(chan []map[string]interface{}, 1000) // state updates for data that must be re-downloaded
 var SP = make(chan bool, 5)
 var TimeChan = make(chan bool, 1)
@@ -107,9 +111,21 @@ func DownloadHighDetail(code string) {
 	}
 	list := &[]map[string]interface{}{} // result set of the query
 	for day := 0; day <= util.Config.DayNum; day++ {
-		comeintime := map[string]interface{}{"$gte": GetTime(-day)} // time window of the query
-		if day != 0 { // not the current day: bound the range
+		startTime := GetTime(-day)
+		comeintime := map[string]interface{}{"$gte": startTime} // time window of the query
+		if day != 0 { // not the current day: bound the range
 			comeintime["$lt"] = GetTime(-day + 1)
+		} else if code == "a_gcy_mcgg" { // current day for this competitor source: only rows at least 12h old
+			endTime := time.Now().Unix() - 12*3600
+			if endTime > startTime {
+				comeintime = map[string]interface{}{
+					"$gte": startTime,
+					"$lt":  endTime,
+				}
+			} else {
+				continue
+			}
+
 		}
 		q["comeintime"] = comeintime
 		list, _ = MgoS.Find("spider_highlistdata", q, o, f, false, 0, 100)
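Side note on the day-0 branch in the hunk above: for the a_gcy_mcgg source, the current day's window is capped at now minus 12 hours, so freshly listed rows get a 12-hour grace period before detail download, and the iteration is skipped entirely in the first 12 hours of the day. A minimal standalone sketch of that logic, assuming GetTime(0) returns local midnight (not confirmed by this diff):

package main

import (
	"fmt"
	"time"
)

// window reproduces the day-0 branch: rows from the current day are only
// eligible once they are at least 12 hours old; dayStart stands in for
// GetTime(0), assumed here to be local midnight.
func window(dayStart int64, now time.Time) (start, end int64, ok bool) {
	end = now.Unix() - 12*3600
	if end <= dayStart {
		return 0, 0, false // less than 12h into the day: skip, like the diff's continue
	}
	return dayStart, end, true
}

func main() {
	now := time.Now()
	midnight := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, now.Location())
	if s, e, ok := window(midnight.Unix(), now); ok {
		fmt.Printf("query comeintime in [%d, %d)\n", s, e)
	} else {
		fmt.Println("within the 12h grace window, nothing to fetch")
	}
}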
@@ -147,19 +163,19 @@ func DownloadHighDetail(code string) {
 			MgoS.Update("spider_highlistdata", query, set, false, false)
 			return
 		}
-		// if isEsRepeat { // competitor data: dedup by title in ES
-		// 	title := qu.ObjToString(tmp["title"])
-		// 	eTime := time.Now().Unix()
-		// 	sTime := eTime - int64(7*86400)
-		// 	esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
-		// 	count := Es.Count(EsIndex, EsType, esQuery)
-		// 	if count > 0 { // title already in ES: stop collecting, update the list-table state
-		// 		set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} // already exists: state set to 1
-		// 		Mgo.Update("spider_highlistdata", query, set, false, false)
-		// 		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
-		// 		continue
-		// 	}
-		// }
+		if code == "a_gcy_mcgg" { // competitor data: dedup by title in ES
+			title := qu.ObjToString(tmp["title"])
+			eTime := time.Now().Unix()
+			sTime := eTime - int64(7*86400)
+			esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
+			count := Es.Count(EsIndex, EsType, esQuery)
+			if count > 0 { // title already in ES: stop collecting, update the list-table state
+				set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} // already exists: state set to 1
+				MgoS.Update("spider_highlistdata", query, set, false, false)
+				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+				return
+			}
+		}
 		competehref := qu.ObjToString(tmp["competehref"])
 		if competehref != "" { // check whether Jianyu has already collected this third-party site's data
 			title := qu.ObjToString(tmp["title"])
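A note on the esQuery restored in this hunk: the "filtered" wrapper is legacy Elasticsearch syntax (deprecated in 2.0, removed in 5.0), so this code assumes an ES 1.x/2.x cluster. A hedged sketch of the equivalent query in modern bool-query form, in case the cluster is ever upgraded; buildDedupQuery is a hypothetical helper, not part of the diff:

package main

import (
	"fmt"
	"time"
)

// buildDedupQuery expresses the same 7-day phrase-match title dedup as an
// ES 5+ bool query. Like the original, it interpolates title without JSON
// escaping, so a title containing quotes would break the query body.
func buildDedupQuery(title string, sTime, eTime int64) string {
	return `{"query":{"bool":{"filter":[{"range":{"comeintime":{"gte":` +
		fmt.Sprint(sTime) + `,"lte":` + fmt.Sprint(eTime) + `}}}],` +
		`"must":[{"match_phrase":{"title":"` + title + `"}}]}}}`
}

func main() {
	eTime := time.Now().Unix()
	fmt.Println(buildDedupQuery("某某公告", eTime-7*86400, eTime))
}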
@@ -217,6 +233,11 @@ func DownloadHighDetail(code string) {
 			set := map[string]interface{}{"$set": ss}
 			MgoS.Update("spider_highlistdata", query, set, false, false)
 			return
+		} else { // detail-page (third-level) filtering
+			deleteData := FilterByDetail(href, query, data) // screen out data the list page cannot filter and that must be filtered on the detail page
+			if deleteData {
+				return
+			}
 		}
 		t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
 		if t1 > time.Now().Unix() { // guard against publish times in the future
@@ -251,6 +272,21 @@ func DownloadHighDetail(code string) {
 			}
 		}
 	}
 
+func FilterByDetail(href string, query, data map[string]interface{}) bool {
+	if data["delete"] != nil {
+		// incremental dedup: remember this href for a year
+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+		// full-coverage dedup: shard by href hash, store the digest with no expiry
+		db := HexToBigIntMod(href)
+		hashHref := HexText(href)
+		util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
+		// update mongo: set state=1 in spider_highlistdata so the to-be-deleted row is not downloaded again (redis updated above)
+		set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true}}
+		MgoS.Update("spider_highlistdata", query, set, false, false)
+		return true
+	}
+	return false
+}
 // download and parse the detail page
 func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {
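For context on FilterByDetail's two redis writes above: the incremental cache keys on the raw href with a one-year TTL, while the full-coverage cache shards across redis DBs via HexToBigIntMod and stores a HexText digest with no expiry. Neither helper appears in this diff; the sketch below shows one plausible reading, with md5 and a 16-DB modulus as explicit assumptions:

package main

import (
	"crypto/md5"
	"encoding/hex"
	"fmt"
	"math/big"
)

// hexToBigIntMod picks a redis DB index by hashing the href and taking a
// modulus; md5 and 16 DBs are assumptions, the repo's HexToBigIntMod may
// use a different digest or modulus.
func hexToBigIntMod(href string) int {
	sum := md5.Sum([]byte(href))
	n := new(big.Int).SetBytes(sum[:])
	return int(new(big.Int).Mod(n, big.NewInt(16)).Int64())
}

// hexText stores a fixed-length digest instead of the raw URL to bound
// key size; the repo's HexText may differ.
func hexText(href string) string {
	sum := md5.Sum([]byte(href))
	return hex.EncodeToString(sum[:])
}

func main() {
	href := "https://example.com/notice/123"
	fmt.Println(hexToBigIntMod(href), hexText(href))
}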