maxiaoshan 2 лет назад
Родитель
Коммит
c7f1547844
2 изменённых файла: 6 добавлений и 1 удаление
  1. 5 1
      src/spider/msgservice.go
  2. 1 0
      src/spider/spider.go

+ 5 - 1
src/spider/msgservice.go

@@ -249,7 +249,11 @@ func SaveObj(event int, checkAtrr string, data map[string]interface{}, saveredis
 	bs, _ := json.Marshal(data)
 	size := len(bs) / (1024 * 1024)
 	if size > 10 {
-		log.Println(event, checkAtrr, data["href"], data["title"], len(bs))
+		href := fmt.Sprint(data["href"])
+		hashHref := util.HexText(href)
+		util.RedisClusterSet(hashHref, "", -1)
+		MgoS.Save("spider_filterdata", data)
+		//log.Println(event, checkAtrr, data["href"], data["title"], len(bs))
 		return
 	}
 	defer mu.Catch()

+ 1 - 0
src/spider/spider.go

@@ -635,6 +635,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 	//下载详情页
 	data, err = s.DownloadDetailPage(paramdata, data)
 	if err != nil || data == nil {
+		*num++ //顺序采集模式,在记录重复数据个数时,采集失败记为重复(避免下载失败数据每轮次采集都不会被判重,造成全采次数+1)
 		if err != nil {
 			logger.Error(s.Code, err, paramdata)
 			//if len(paramdata) > 0 {