@@ -850,8 +850,7 @@ func (s *Spider) DownloadHighDetail() {
 		if !s.Stop { //spider was taken offline while downloading detail pages; no longer store heartbeat info
 			UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail") //record heartbeat for modal=1 third-level (detail) page collection
 		}
-		list, _ := Mgo.Find("spider_highlistdata_test", q, o, f, false, 0, 100)
-		qu.Debug("----", len(*list))
+		list, _ := Mgo.Find("spider_highlistdata", q, o, f, false, 0, 100)
 		if list != nil && len(*list) > 0 {
 			for _, tmp := range *list {
 				_id := tmp["_id"]
@@ -862,7 +861,7 @@ func (s *Spider) DownloadHighDetail() {
 				isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
 				if isExist {
 					set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //already exists, set state to 1
-					Mgo.Update("spider_highlistdata_test", query, set, false, false)
+					Mgo.Update("spider_highlistdata", query, set, false, false)
 					continue
 				}
 				if isEsRepeat { //deduplicate by title against ES data
@@ -873,7 +872,7 @@ func (s *Spider) DownloadHighDetail() {
 					count := Es.Count(EsIndex, EsType, esQuery)
 					if count > 0 { //ES already contains this title; do not collect again, update the list record's state
 						set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //already exists, set state to 1
-						Mgo.Update("spider_highlistdata_test", query, set, false, false)
+						Mgo.Update("spider_highlistdata", query, set, false, false)
 						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
 						continue
 					}
@@ -922,7 +921,7 @@ func (s *Spider) DownloadHighDetail() {
 						ss["state"] = -1
 					}
 					set := map[string]interface{}{"$set": ss}
-					Mgo.Update("spider_highlistdata_test", query, set, false, false)
+					Mgo.Update("spider_highlistdata", query, set, false, false)
 					continue
 				} else {
 					deleteData := FilterByDetail(href, query, data) //filter data that the list page cannot filter and must be filtered on the detail page
@@ -952,7 +951,7 @@ func (s *Spider) DownloadHighDetail() {
 				data["iscompete"] = s.IsCompete //spiders added after 2021-11-01 no longer display the original link (checked by the save service)
 				Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
 				set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}} //download succeeded, set state to 1
-				Mgo.Update("spider_highlistdata_test", query, set, false, false)
+				Mgo.Update("spider_highlistdata", query, set, false, false)
 			}
 			//reload the spider
 			s.LoadScript(&s.Name, &s.Channel, &s.MUserName, s.Code, s.ScriptFile, true)