|
@@ -503,10 +503,24 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
|
|
|
}
|
|
|
return
|
|
|
}
|
|
|
+ //详情页过滤数据
|
|
|
+ set := map[string]interface{}{"state": 1}
|
|
|
+ if data["delete"] != nil {
|
|
|
+ //增量
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
+ //全量
|
|
|
+ db := HexToBigIntMod(href)
|
|
|
+ hashHref := HexText(href)
|
|
|
+ util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
|
|
|
+ //更新mgo 要删除的数据更新spider_listdata state=1不再下载,更新redis
|
|
|
+ set["delete"] = true
|
|
|
+ Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": set})
|
|
|
+ return
|
|
|
+ }
|
|
|
//更新spider_listdata中数据下载成功标记
|
|
|
if id != "" {
|
|
|
//Mgo.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "byid": id}}, false, true)
|
|
|
- Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1}})
|
|
|
+ Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": set})
|
|
|
}
|
|
|
flag := true
|
|
|
t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"])) //publishtime
|
|
@@ -604,7 +618,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
|
return
|
|
|
}
|
|
|
}
|
|
|
- //下载、解析、入库
|
|
|
+ //下载详情页
|
|
|
data, err = s.DownloadDetailPage(paramdata, data)
|
|
|
if err != nil || data == nil {
|
|
|
if err != nil {
|
|
@@ -630,9 +644,27 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
|
util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
|
|
|
}
|
|
|
}
|
|
|
+ //详情页下载数据成功心跳
|
|
|
+ if !s.Stop {
|
|
|
+ UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute") //记录modal=0老模式采集到数据心跳
|
|
|
+ }
|
|
|
+ set := map[string]interface{}{"state": 1, "byid": id}
|
|
|
+ //详情页过滤数据
|
|
|
+ if data["delete"] != nil {
|
|
|
+ //增量
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
+ //全量
|
|
|
+ db := HexToBigIntMod(href)
|
|
|
+ hashHref := HexText(href)
|
|
|
+ util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
|
|
|
+ //更新mgo 要删除的数据更新spider_listdata state=1不再下载,更新redis
|
|
|
+ set["delete"] = true
|
|
|
+ Mgo.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)
|
|
|
+ return
|
|
|
+ }
|
|
|
//更新spider_listdata中数据下载成功标记
|
|
|
if id != "" {
|
|
|
- Mgo.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "byid": id}}, false, true)
|
|
|
+ Mgo.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)
|
|
|
//Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1}})
|
|
|
}
|
|
|
|
|
@@ -640,9 +672,6 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
|
if t1 > time.Now().Unix() { //防止发布时间超前
|
|
|
data["publishtime"] = time.Now().Unix()
|
|
|
}
|
|
|
- if !s.Stop {
|
|
|
- UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute") //记录modal=0老模式采集到数据心跳
|
|
|
- }
|
|
|
delete(data, "state")
|
|
|
delete(data, "exit")
|
|
|
delete(data, "checkpublishtime")
|
|
@@ -705,7 +734,7 @@ func (s *Spider) DownloadDetailByNames(p interface{}) {
|
|
|
Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
|
|
|
}
|
|
|
|
|
|
-//下载解析内容页
|
|
|
+//下载解析详情页
|
|
|
func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {
|
|
|
defer mu.Catch()
|
|
|
s.LastHeartbeat = time.Now().Unix()
|
|
@@ -872,6 +901,7 @@ func (s *Spider) DownloadHighDetail() {
|
|
|
util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
if !success { //下载失败更新次数和状态
|
|
|
ss := map[string]interface{}{"times": times}
|
|
|
if times >= 3 { //3次下载失败今天不再下载,state置为1
|
|
@@ -880,6 +910,12 @@ func (s *Spider) DownloadHighDetail() {
|
|
|
set := map[string]interface{}{"$set": ss}
|
|
|
Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
|
continue
|
|
|
+ } else {
|
|
|
+ qu.Debug(data)
|
|
|
+ deleteData := FilterByDetail(href, query, data) //针对列表页无法过滤需要在详情页过滤的数据,进行过滤处理
|
|
|
+ if deleteData {
|
|
|
+ continue
|
|
|
+ }
|
|
|
}
|
|
|
t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
|
|
|
if t1 > time.Now().Unix() { //防止发布时间超前
|
|
@@ -1023,6 +1059,11 @@ func (s *Spider) DownloadListDetail() {
|
|
|
set := map[string]interface{}{"$set": ss}
|
|
|
Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
|
continue
|
|
|
+ } else {
|
|
|
+ deleteData := FilterByDetail(href, query, data) //针对列表页无法过滤需要在详情页过滤的数据,进行过滤处理
|
|
|
+ if deleteData {
|
|
|
+ continue
|
|
|
+ }
|
|
|
}
|
|
|
t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
|
|
|
if t1 > time.Now().Unix() { //防止发布时间超前
|
|
@@ -1051,6 +1092,22 @@ func (s *Spider) DownloadListDetail() {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+func FilterByDetail(href string, query, data map[string]interface{}) bool {
|
|
|
+ if data["delete"] != nil {
|
|
|
+ //增量
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
+ //全量
|
|
|
+ db := HexToBigIntMod(href)
|
|
|
+ hashHref := HexText(href)
|
|
|
+ util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
|
|
|
+ //更新mgo 要删除的数据更新spider_highlistdata state=1不再下载,更新redis
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true}}
|
|
|
+ Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ return false
|
|
|
+}
|
|
|
+
|
|
|
//获取随机数
|
|
|
func GetRandMath(num int) int {
|
|
|
r := rand.New(rand.NewSource(time.Now().UnixNano()))
|