maxiaoshan 3 жил өмнө
parent
commit
ad55187b9d

+ 64 - 7
src/spider/spider.go

@@ -503,10 +503,24 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
 		}
 		}
 		return
 		return
 	}
 	}
+	//详情页过滤数据
+	set := map[string]interface{}{"state": 1}
+	if data["delete"] != nil {
+		//增量
+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+		//全量
+		db := HexToBigIntMod(href)
+		hashHref := HexText(href)
+		util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
+		// Mark the filtered record in spider_listdata (state=1, delete=true) so it is not downloaded again; the Redis de-dup keys were written above
+		set["delete"] = true
+		Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": set})
+		return
+	}
 	//更新spider_listdata中数据下载成功标记
 	//更新spider_listdata中数据下载成功标记
 	if id != "" {
 	if id != "" {
 		//Mgo.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "byid": id}}, false, true)
 		//Mgo.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "byid": id}}, false, true)
-		Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1}})
+		Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": set})
 	}
 	}
 	flag := true
 	flag := true
 	t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"])) //publishtime
 	t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"])) //publishtime
@@ -604,7 +618,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 			return
 			return
 		}
 		}
 	}
 	}
-	//下载、解析、入库
+	//下载详情页
 	data, err = s.DownloadDetailPage(paramdata, data)
 	data, err = s.DownloadDetailPage(paramdata, data)
 	if err != nil || data == nil {
 	if err != nil || data == nil {
 		if err != nil {
 		if err != nil {
@@ -630,9 +644,27 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 			util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
 			util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
 		}
 		}
 	}
 	}
+	//详情页下载数据成功心跳
+	if !s.Stop {
+		UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute") //记录modal=0老模式采集到数据心跳
+	}
+	set := map[string]interface{}{"state": 1, "byid": id}
+	//详情页过滤数据
+	if data["delete"] != nil {
+		//增量
+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+		//全量
+		db := HexToBigIntMod(href)
+		hashHref := HexText(href)
+		util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
+		// Mark the filtered record in spider_listdata (state=1, delete=true) so it is not downloaded again; the Redis de-dup keys were written above
+		set["delete"] = true
+		Mgo.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)
+		return
+	}
 	//更新spider_listdata中数据下载成功标记
 	//更新spider_listdata中数据下载成功标记
 	if id != "" {
 	if id != "" {
-		Mgo.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "byid": id}}, false, true)
+		Mgo.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)
 		//Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1}})
 		//Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1}})
 	}
 	}
 
 
@@ -640,9 +672,6 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 	if t1 > time.Now().Unix() { //防止发布时间超前
 	if t1 > time.Now().Unix() { //防止发布时间超前
 		data["publishtime"] = time.Now().Unix()
 		data["publishtime"] = time.Now().Unix()
 	}
 	}
-	if !s.Stop {
-		UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute") //记录modal=0老模式采集到数据心跳
-	}
 	delete(data, "state")
 	delete(data, "state")
 	delete(data, "exit")
 	delete(data, "exit")
 	delete(data, "checkpublishtime")
 	delete(data, "checkpublishtime")
@@ -705,7 +734,7 @@ func (s *Spider) DownloadDetailByNames(p interface{}) {
 	Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
 	Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
 }
 }
 
 
-//下载解析内容
+//下载解析详情
 func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {
 func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {
 	defer mu.Catch()
 	defer mu.Catch()
 	s.LastHeartbeat = time.Now().Unix()
 	s.LastHeartbeat = time.Now().Unix()
@@ -872,6 +901,7 @@ func (s *Spider) DownloadHighDetail() {
 							util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
 							util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
 						}
 						}
 					}
 					}
+
 					if !success { //下载失败更新次数和状态
 					if !success { //下载失败更新次数和状态
 						ss := map[string]interface{}{"times": times}
 						ss := map[string]interface{}{"times": times}
 						if times >= 3 { //3次下载失败今天不再下载,state置为1
 						if times >= 3 { //3次下载失败今天不再下载,state置为1
@@ -880,6 +910,12 @@ func (s *Spider) DownloadHighDetail() {
 						set := map[string]interface{}{"$set": ss}
 						set := map[string]interface{}{"$set": ss}
 						Mgo.Update("spider_highlistdata", query, set, false, false)
 						Mgo.Update("spider_highlistdata", query, set, false, false)
 						continue
 						continue
+					} else {
+						qu.Debug(data)
+						deleteData := FilterByDetail(href, query, data) //针对列表页无法过滤需要在详情页过滤的数据,进行过滤处理
+						if deleteData {
+							continue
+						}
 					}
 					}
 					t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
 					t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
 					if t1 > time.Now().Unix() { //防止发布时间超前
 					if t1 > time.Now().Unix() { //防止发布时间超前
@@ -1023,6 +1059,11 @@ func (s *Spider) DownloadListDetail() {
 				set := map[string]interface{}{"$set": ss}
 				set := map[string]interface{}{"$set": ss}
 				Mgo.Update("spider_highlistdata", query, set, false, false)
 				Mgo.Update("spider_highlistdata", query, set, false, false)
 				continue
 				continue
+			} else {
+				deleteData := FilterByDetail(href, query, data) //针对列表页无法过滤需要在详情页过滤的数据,进行过滤处理
+				if deleteData {
+					continue
+				}
 			}
 			}
 			t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
 			t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
 			if t1 > time.Now().Unix() { //防止发布时间超前
 			if t1 > time.Now().Unix() { //防止发布时间超前
@@ -1051,6 +1092,22 @@ func (s *Spider) DownloadListDetail() {
 	}
 	}
 }
 }
 
 
+// FilterByDetail handles records that the list page could not filter and
+// that must instead be filtered after the detail page has been parsed.
+//
+// When data carries a non-nil "delete" entry, href is written to both the
+// incremental and the full de-duplication Redis stores, the
+// spider_highlistdata document(s) matched by query are updated with state=1
+// and delete=true so they are not downloaded again, and true is returned.
+// Otherwise nothing is written and false is returned.
+func FilterByDetail(href string, query, data map[string]interface{}) bool {
+	// Guard clause: nothing to filter unless the detail page flagged the record.
+	if data["delete"] == nil {
+		return false
+	}
+
+	// Incremental de-duplication store.
+	// NOTE(review): the last argument is presumably a one-year TTL in seconds — confirm against util.PutRedis.
+	util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+
+	// Full de-duplication store: db index and key are both derived from href.
+	util.PutRedis("title_repeat_fulljudgement", HexToBigIntMod(href), HexText(href), "", -1)
+
+	// Flag the record in spider_highlistdata as handled (state=1) and deleted.
+	update := map[string]interface{}{
+		"$set": map[string]interface{}{"state": 1, "delete": true},
+	}
+	Mgo.Update("spider_highlistdata", query, update, false, false)
+	return true
+}
+
 //获取随机数
 //获取随机数
 func GetRandMath(num int) int {
 func GetRandMath(num int) int {
 	r := rand.New(rand.NewSource(time.Now().UnixNano()))
 	r := rand.New(rand.NewSource(time.Now().UnixNano()))