Parcourir la source

爬虫补采模块日志存储修改

mxs il y a 1 an
Parent
commit
f85dbf464e
1 fichier modifié avec 41 ajouts et 37 suppressions
  1. 41 37
      src/spider/spider.go

+ 41 - 37
src/spider/spider.go

@@ -940,16 +940,16 @@ func (s *Spider) DownListPageItemByThreadsBack() (errs interface{}) {
 func (s *Spider) SupplementDownListPageItem() (errs interface{}) {
 	defer qu.Catch()
 	var (
-		errtimes       int    //采集异常次数(暂定10次)
-		errPageNum     int    //当前采集异常页码
-		downtimes      int    //记录某页重试次数(暂定3次)
-		downloadAllNum int    //记录本次采集,信息采集总量
-		saveAllNum     int    //记录本次采集,信息补采总量
-		repeatAllNum   int    //记录本次采集,信息重复总量
-		pageTitleHash  string //记录当前页所有title文本
-		finishText     = "正常退出"
+		errtimes       int      //采集异常次数(暂定10次)
+		errPageNum     int      //当前采集异常页码
+		downtimes      int      //记录某页重试次数(暂定3次)
+		downloadAllNum int      //记录本次采集,信息采集总量
+		saveAllNum     int      //记录本次采集,信息补采总量
+		repeatAllNum   int      //记录本次采集,信息重复总量
+		pageTitleHash  string   //记录当前页所有title文本
+		finishText     = "正常退出" //
+		start          = 1      //起始页
 	)
-	start := 1 //起始页
 	for {
 		if errtimes >= Supplement_MaxErrorTimes { //连续异常次数超过10次,爬虫不再翻页
 			finishText = "异常退出"
@@ -978,14 +978,18 @@ func (s *Spider) SupplementDownListPageItem() (errs interface{}) {
 		s.L.Pop(1)
 		if tbl, ok := lv.(*lua.LTable); ok {
 			if tabLen := tbl.Len(); tabLen > 0 { //列表页有数据,根据列表页信息下载三级页
-				repeatListNum := 0 // 当前列表页连接重复个数
-				isBreak := false
-				var publishtimeErrTimes int
-				var text string
-				num := 1
+				var (
+					publishtimeErrTimes int
+					text                string
+					repeatListNum       int // 当前列表页连接重复个数
+					num                 = 1
+					isBreak             = false
+				)
 				for ; num <= tabLen; num++ {
 					v := tbl.RawGetInt(num).(*lua.LTable)
 					tmp := util.TableToMap(v)
+					tmp["dataging"] = 0 //数据中打标记dataging=0
+					s.DownloadDetailItem(tmp, &repeatListNum)
 					pTmp := qu.ObjToString(tmp["publishtime"])
 					title := qu.ObjToString(tmp["title"])
 					text += title
@@ -993,18 +997,17 @@ func (s *Spider) SupplementDownListPageItem() (errs interface{}) {
 					publishtime := pTime.Unix()
 					if publishtime > 1000000000 && publishtime < Supplement_Publishtime { //正常退出
 						isBreak = true
-						break
+						//break
 					} else if publishtime <= 1000000000 { //异常发布时间
 						publishtimeErrTimes++
 					}
-					tmp["dataging"] = 0 //数据中打标记dataging=0
-					s.DownloadDetailItem(tmp, &repeatListNum)
 				}
-				downloadAllNum += tabLen
-				repeatAllNum += repeatListNum
-				saveAllNum += num - 1 - repeatListNum
-				tmpPageTitleHash := pageTitleHash
-				pageTitleHash = util.HexText(text)
+				logger.Info(s.Code, start, tabLen, repeatListNum)
+				downloadAllNum += tabLen                                                //采集总量累计
+				repeatAllNum += repeatListNum                                           //重复总量累计
+				saveAllNum += num - 1 - repeatListNum                                   //保存总量累计
+				tmpPageTitleHash := pageTitleHash                                       //
+				pageTitleHash = util.HexText(text)                                      //
 				if tabLen == publishtimeErrTimes || tmpPageTitleHash == pageTitleHash { //当前页数据发布时间均异常;当前页与上页采集内容一致
 					//if errtimes == 0 || start == errPageNum+1  {
 					errtimes++
@@ -1044,20 +1047,20 @@ func (s *Spider) SupplementDownListPageItem() (errs interface{}) {
 		errPageNum = 0
 		util.TimeSleepFunc(100*time.Millisecond, TimeSleepChan)
 	}
-	logger.Info(s.Code, "本轮列表页采集详情:", downloadAllNum, repeatAllNum, start, finishText)
-	if !util.Config.IsHistoryEvent && !s.Stop { //非历史节点统计下载率
-		save := map[string]interface{}{
-			"site":       s.Name,
-			"channel":    s.Channel,
-			"spidercode": s.Code,
-			"comeintime": time.Now().Unix(),
-			"modifyuser": s.MUserName,
-			"endpage":    start,
-			"finish":     finishText,
-			"num":        saveAllNum,
-		}
-		MgoS.Save("spider_supplement", save)
-	}
+	logger.Info(s.Code, "本轮列表页采集详情:", downloadAllNum, repeatAllNum, saveAllNum, finishText)
+	save := map[string]interface{}{
+		"site":       s.Name,
+		"channel":    s.Channel,
+		"spidercode": s.Code,
+		"comeintime": time.Now().Unix(),
+		"modifyuser": s.MUserName,
+		"endpage":    start,
+		"finish":     finishText,
+		"savenum":    saveAllNum,
+		"count":      downloadAllNum,
+		"repeat":     repeatAllNum,
+	}
+	MgoS.Save("spider_supplement", save)
 	return errs
 }
 
@@ -1267,7 +1270,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 	hashHref := util.HexText(href)
 	//列表页redis判重
 	isExist := util.RedisExist("list", "list_"+hashHref)
-	if Supplement { //补采,再进行全量redis判重
+	if Supplement && !isExist { //补采,再进行全量redis判重
 		isExist, _ = util.ExistsBloomRedis("href", href)
 	}
 	if isExist {
@@ -1331,6 +1334,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 			return
 		}
 	}
+
 	//详情页下载数据成功心跳
 	if !s.Stop {
 		UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute", false) //记录modal=0老模式采集到数据心跳