|
@@ -114,18 +114,19 @@ func DownloadHighDetail(code string) {
|
|
|
comeintime := map[string]interface{}{"$gte": startTime} //指定查询数据的时间
|
|
|
if day != 0 { //不是当天,指定数据范围
|
|
|
comeintime["$lt"] = GetTime(-day + 1)
|
|
|
- } else if code == "a_gcy_mcgg" { //
|
|
|
- endTime := time.Now().Unix() - 12*3600
|
|
|
- if endTime > startTime {
|
|
|
- comeintime = map[string]interface{}{
|
|
|
- "$gte": startTime,
|
|
|
- "$lt": endTime,
|
|
|
- }
|
|
|
- } else {
|
|
|
- continue
|
|
|
- }
|
|
|
-
|
|
|
}
|
|
|
+ //} else if code == "a_gcy_mcgg" { //延迟采集站点(延迟采集站点不加入多线程采集luaspecialcode库中)
|
|
|
+ // endTime := time.Now().Unix() - 12*3600
|
|
|
+ // if endTime > startTime {
|
|
|
+ // comeintime = map[string]interface{}{
|
|
|
+ // "$gte": startTime,
|
|
|
+ // "$lt": endTime,
|
|
|
+ // }
|
|
|
+ // } else {
|
|
|
+ // continue
|
|
|
+ // }
|
|
|
+ //
|
|
|
+ //}
|
|
|
q["comeintime"] = comeintime
|
|
|
list, _ = MgoS.Find("spider_highlistdata", q, o, f, false, 0, 100)
|
|
|
//logger.Debug("code:", code, "query:", q, "当前查询数据量:", len(*list))
|
|
@@ -154,37 +155,38 @@ func DownloadHighDetail(code string) {
|
|
|
_id := tmp["_id"]
|
|
|
query := map[string]interface{}{"_id": _id}
|
|
|
href := qu.ObjToString(tmp["href"])
|
|
|
+ hashHref := HexText(href)
|
|
|
//由于目前列表页redis判重是href+code可能导致同一条href有多条不同code采集的数据存在
|
|
|
- //为了避免重复下载,进行增量redis判重
|
|
|
- isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
|
|
|
+ //为了避免重复下载,进行全量redis判重
|
|
|
+ isExist := util.RedisClusterExists(hashHref)
|
|
|
if isExist {
|
|
|
set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
return
|
|
|
}
|
|
|
- if code == "a_gcy_mcgg" { //竞品数据es title判重
|
|
|
- title := qu.ObjToString(tmp["title"])
|
|
|
- eTime := time.Now().Unix()
|
|
|
- sTime := eTime - int64(7*86400)
|
|
|
- esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
|
|
|
- count := Es.Count(EsIndex, EsType, esQuery)
|
|
|
- if count > 0 { //es中含本title数据,不再采集,更新list表数据状态
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
- MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
- return
|
|
|
- }
|
|
|
- }
|
|
|
- competehref := qu.ObjToString(tmp["competehref"])
|
|
|
- if competehref != "" { //验证三方网站数据剑鱼是否已采集
|
|
|
- title := qu.ObjToString(tmp["title"])
|
|
|
- one, _ := MgoS.FindOne("data_bak", map[string]interface{}{"title": title})
|
|
|
- if one != nil && len(*one) > 0 { //剑鱼已采集,舍弃此条信息
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
- MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
- return
|
|
|
- }
|
|
|
- }
|
|
|
+ //if code == "a_gcy_mcgg" { //竞品数据es title判重
|
|
|
+ // title := qu.ObjToString(tmp["title"])
|
|
|
+ // eTime := time.Now().Unix()
|
|
|
+ // sTime := eTime - int64(7*86400)
|
|
|
+ // esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
|
|
|
+ // count := Es.Count(EsIndex, EsType, esQuery)
|
|
|
+ // if count > 0 { //es中含本title数据,不再采集,更新list表数据状态
|
|
|
+ // set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
+ // MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
+ // util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
+ // return
|
|
|
+ // }
|
|
|
+ //}
|
|
|
+ //competehref := qu.ObjToString(tmp["competehref"])
|
|
|
+ //if competehref != "" { //验证三方网站数据剑鱼是否已采集
|
|
|
+ // title := qu.ObjToString(tmp["title"])
|
|
|
+ // one, _ := MgoS.FindOne("data_bak", map[string]interface{}{"title": title})
|
|
|
+ // if one != nil && len(*one) > 0 { //剑鱼已采集,舍弃此条信息
|
|
|
+ // set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
+ // MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
+ // return
|
|
|
+ // }
|
|
|
+ //}
|
|
|
times := qu.IntAll(tmp["times"])
|
|
|
success := true //数据是否下载成功的标志
|
|
|
delete(tmp, "_id")
|
|
@@ -204,25 +206,12 @@ func DownloadHighDetail(code string) {
|
|
|
if len(tmp) > 0 {
|
|
|
SaveErrorData(sp.MUserName, tmp, err) //保存错误信息
|
|
|
}
|
|
|
- if errstr, ok := err.(*lua.ApiError); ok {
|
|
|
- errText := errstr.Object.String()
|
|
|
- logger.Info(errText, errText == "d.nx != 0")
|
|
|
- }
|
|
|
|
|
|
} /*else if data == nil && times >= 3 { //下载问题,建editor任务
|
|
|
DownloadErrorData(s.Code, tmp)
|
|
|
}*/
|
|
|
} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
|
|
|
- log.Println("beforeHref:", href, "afterHref:", href)
|
|
|
- //增量
|
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
- //全量
|
|
|
- db := HexToBigIntMod(tmphref)
|
|
|
- hashHref := HexText(href)
|
|
|
- isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref)
|
|
|
- if !isExist {
|
|
|
- util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
|
|
|
- }
|
|
|
+ util.RedisClusterSet(hashHref, "", -1)
|
|
|
}
|
|
|
if !success { //下载失败更新次数和状态
|
|
|
ss := map[string]interface{}{"times": times, "updatetime": time.Now().Unix()}
|
|
@@ -232,11 +221,12 @@ func DownloadHighDetail(code string) {
|
|
|
set := map[string]interface{}{"$set": ss}
|
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
return
|
|
|
- } else { //三级页过滤
|
|
|
- deleteData := FilterByDetail(href, query, data) //针对列表页无法过滤需要在详情页过滤的数据,进行过滤处理
|
|
|
- if deleteData {
|
|
|
- return
|
|
|
- }
|
|
|
+ } else if data["delete"] != nil { //三级页过滤
|
|
|
+ util.RedisClusterSet(hashHref, "", -1) //过滤掉的数据存值全量redis
|
|
|
+ //更新mgo 要删除的数据更新spider_highlistdata state=1不再下载,更新redis
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true, "updatetime": time.Now().Unix()}}
|
|
|
+ MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
+ return
|
|
|
}
|
|
|
//正文、附件分析,下载异常数据重新下载
|
|
|
if AnalysisProjectInfo(data) {
|
|
@@ -257,16 +247,6 @@ func DownloadHighDetail(code string) {
|
|
|
delete(data, "exit")
|
|
|
delete(data, "checkpublishtime")
|
|
|
data["comeintime"] = time.Now().Unix()
|
|
|
- //计数
|
|
|
- //tmpsp1, b := Allspiders.Load(sp.Code)
|
|
|
- //if b {
|
|
|
- // sp1, ok := tmpsp1.(*Spider)
|
|
|
- // if ok {
|
|
|
- // atomic.AddInt32(&sp1.LastDowncount, 1)
|
|
|
- // atomic.AddInt32(&sp1.TodayDowncount, 1)
|
|
|
- // atomic.AddInt32(&sp1.TotalDowncount, 1)
|
|
|
- // }
|
|
|
- //}
|
|
|
data["spidercode"] = sp.Code
|
|
|
data["dataging"] = 0
|
|
|
data["iscompete"] = sp.IsCompete //2021-11-01以后新增的爬虫不在展示原文链接(保存服务判断)
|
|
@@ -312,22 +292,6 @@ func AnalysisProjectInfo(data map[string]interface{}) bool {
|
|
|
return false
|
|
|
}
|
|
|
|
|
|
-func FilterByDetail(href string, query, data map[string]interface{}) bool {
|
|
|
- if data["delete"] != nil {
|
|
|
- //增量
|
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
- //全量
|
|
|
- db := HexToBigIntMod(href)
|
|
|
- hashHref := HexText(href)
|
|
|
- util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
|
|
|
- //更新mgo 要删除的数据更新spider_highlistdata state=1不再下载,更新redis
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true, "updatetime": time.Now().Unix()}}
|
|
|
- MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
- return true
|
|
|
- }
|
|
|
- return false
|
|
|
-}
|
|
|
-
|
|
|
//下载解析内容页
|
|
|
func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {
|
|
|
defer mu.Catch()
|