|
@@ -159,7 +159,7 @@ func DownloadHighDetail(code string) {
|
|
//为了避免重复下载,进行增量redis判重
|
|
//为了避免重复下载,进行增量redis判重
|
|
isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
|
|
isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
|
|
if isExist {
|
|
if isExist {
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
|
|
|
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
return
|
|
return
|
|
}
|
|
}
|
|
@@ -170,7 +170,7 @@ func DownloadHighDetail(code string) {
|
|
esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
|
|
esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
|
|
count := Es.Count(EsIndex, EsType, esQuery)
|
|
count := Es.Count(EsIndex, EsType, esQuery)
|
|
if count > 0 { //es中含本title数据,不再采集,更新list表数据状态
|
|
if count > 0 { //es中含本title数据,不再采集,更新list表数据状态
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
|
|
|
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
return
|
|
return
|
|
@@ -181,7 +181,7 @@ func DownloadHighDetail(code string) {
|
|
title := qu.ObjToString(tmp["title"])
|
|
title := qu.ObjToString(tmp["title"])
|
|
one, _ := MgoS.FindOne("data_bak", map[string]interface{}{"title": title})
|
|
one, _ := MgoS.FindOne("data_bak", map[string]interface{}{"title": title})
|
|
if one != nil && len(*one) > 0 { //剑鱼已采集,舍弃此条信息
|
|
if one != nil && len(*one) > 0 { //剑鱼已采集,舍弃此条信息
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
|
|
|
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
return
|
|
return
|
|
}
|
|
}
|
|
@@ -260,7 +260,7 @@ func DownloadHighDetail(code string) {
|
|
data["dataging"] = 0
|
|
data["dataging"] = 0
|
|
data["iscompete"] = sp.IsCompete //2021-11-01以后新增的爬虫不在展示原文链接(保存服务判断)
|
|
data["iscompete"] = sp.IsCompete //2021-11-01以后新增的爬虫不在展示原文链接(保存服务判断)
|
|
Store(sp.StoreMode, sp.StoreToMsgEvent, sp.Collection, sp.CoverAttr, data, true)
|
|
Store(sp.StoreMode, sp.StoreToMsgEvent, sp.Collection, sp.CoverAttr, data, true)
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1}} //下载成功state置为1
|
|
|
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}} //下载成功state置为1
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
}(l, spTmp)
|
|
}(l, spTmp)
|
|
}
|
|
}
|
|
@@ -281,7 +281,7 @@ func FilterByDetail(href string, query, data map[string]interface{}) bool {
|
|
hashHref := HexText(href)
|
|
hashHref := HexText(href)
|
|
util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
|
|
util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
|
|
//更新mgo 要删除的数据更新spider_highlistdata state=1不再下载,更新redis
|
|
//更新mgo 要删除的数据更新spider_highlistdata state=1不再下载,更新redis
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true}}
|
|
|
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true, "updatetime": time.Now().Unix()}}
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
return true
|
|
return true
|
|
}
|
|
}
|