
Modify list page information collection

maxiaoshan, 1 year ago
commit a1d14e72e1
3 changed files with 186 additions and 65 deletions
  1. src/config.json (+4 -7)
  2. src/main.go (+2 -2)
  3. src/spider/spider.go (+180 -56)

src/config.json (+4 -7)

@@ -11,7 +11,7 @@
     },
     "editoraddr": "http://127.0.0.1:6011/spider/infos",
     "msgname": "爬虫采集平台7100",
-    "msgserveraddr": "spdata.jianyu360.com:803",
+    "msgserveraddr": "spdata.jianyu360.com:801",
     "msgserveraddrfile": "spdata.jianyu360.com:802",
     "msgserveraddrchromedp": "spdata.jianyu360.com:807",
 	"isdelay":false,
@@ -42,12 +42,9 @@
         "ossBucketName":"jy-editor"
     },
     "pageturninfo": {
-        "repeatpagetimeslimit_w0": 3,
-        "repeatpagetimeslimit_w1": 3,
-        "turnpagemaxlimit_w0": 5,
-        "turnpagemaxlimit_w1": 50,
-        "nextpagemaxlimit_w0": 5,
-        "nextpagemaxlimit_w1": 50,
+        "repeatpagetimeslimit": 3,
+        "turnpagemaxlimit": 100,
+        "nextpagemaxlimit": 100,
         "listparalleltasklimit": 3,
         "listthreadsnum": 2
     },
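
The pageturninfo change above collapses the per-mode keys (..._w0 for high-performance mode, ..._w1 for queue mode) into a single value each. As a rough sketch, the matching config struct in Go then carries one field per limit; the struct and field names below are assumptions for illustration, inferred from the util.Config.PageTurnInfo fields referenced in spider.go further down:

package config

// Hypothetical shape of the simplified "pageturninfo" section.
// The JSON keys come from the diff above; the Go names are assumptions.
type PageTurnInfo struct {
	RepeatPageTimesLimit  int `json:"repeatpagetimeslimit"`  // stop after this many consecutive all-duplicate pages
	TurnPageMaxLimit      int `json:"turnpagemaxlimit"`      // max pages for the main list task
	NextPageMaxLimit      int `json:"nextpagemaxlimit"`      // extra pages granted to a child task
	ListParallelTaskLimit int `json:"listparalleltasklimit"` // concurrent list tasks
	ListThreadsNum        int `json:"listthreadsnum"`        // threads per list task
}

With a single value per limit, the util.Config.Working == 1 branches that used to pick the _w1 variants become unnecessary, which is what the spider.go hunks below remove.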

src/main.go (+2 -2)

@@ -125,9 +125,9 @@ func main() {
 	//query list pages and collect detail (third-level) pages
 	go spider.DetailData()
 	//process heartbeat info
-	//go spider.SaveHeartInfo()
+	go spider.SaveHeartInfo()
 	//batch-save heartbeat info
-	//go spider.UpdateHeartInfo()
+	go spider.UpdateHeartInfo()
 	//history node 7000: download detail pages
 	go spider.HistoryEventDownloadDetail()
 

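Re-enabling SaveHeartInfo and UpdateHeartInfo turns heartbeat persistence back on, alongside the UpdateHeart calls visible in spider.go below. Their bodies are not part of this commit; the following is only a hypothetical sketch of the collect-then-flush pattern such a pair of goroutines typically implements, with invented names (recordHeart, heartBuf, flushHearts) and an invented flush interval:

// Hypothetical collect-then-flush heartbeat writer; none of these internals
// appear in this commit, only the two goroutines that were re-enabled.
package heartbeat

import (
	"sync"
	"time"
)

var (
	heartMu  sync.Mutex
	heartBuf = map[string]map[string]interface{}{} // keyed by spider code
)

// recordHeart buffers the latest heartbeat for one spider (roughly what a call
// like UpdateHeart(site, channel, code, user, "list", firstPage) could reduce to).
func recordHeart(site, channel, code, user, stage string, firstPage bool) {
	heartMu.Lock()
	defer heartMu.Unlock()
	heartBuf[code] = map[string]interface{}{
		"site": site, "channel": channel, "spidercode": code,
		"modifyuser": user, "stage": stage, "firstpage": firstPage,
		"updatetime": time.Now().Unix(),
	}
}

// flushHearts drains the buffer on an interval and hands the batch to a saver
// (e.g. a bulk upsert), so list-page loops never block on heartbeat storage.
func flushHearts(interval time.Duration, save func([]map[string]interface{})) {
	for range time.Tick(interval) {
		heartMu.Lock()
		batch := make([]map[string]interface{}, 0, len(heartBuf))
		for _, h := range heartBuf {
			batch = append(batch, h)
		}
		heartBuf = map[string]map[string]interface{}{}
		heartMu.Unlock()
		if len(batch) > 0 {
			save(batch)
		}
	}
}
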
src/spider/spider.go (+180 -56)

@@ -229,8 +229,169 @@ func (s *Spider) GetLastPublishTime() (errs interface{}) {
 	return nil
 }
 
-//Download list pages
+//Download list pages (compared with DownListPageItemBack, removes the no-data retry and the repeat-page record)
 func (s *Spider) DownListPageItem() (errs interface{}) {
+	defer qu.Catch()
+	start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") //start page, max page
+	s.MaxPage = max                                                            //
+	repeatAllNum := 0                                                          //total duplicates across tmpMax pages this round
+	downloadAllNum := 0                                                        //total items collected across tmpMax pages this round
+	if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" {     //node 7000, spider runs history
+		max = s.GetIntVar("spiderHistoryMaxPage")
+	}
+	downtimes := 0                                                                     //retry count for a page (tentatively 3)
+	repeatPageTimes := 0                                                               //count of consecutive all-duplicate pages (tentatively stop paging at 5)
+	isRunRepeatList := false                                                           //whether to run consecutive duplicate checking on list pages
+	if !util.Config.IsHistoryEvent && util.Config.Modal == 1 && max > 1 && max < 101 { //all nodes except sequential mode and history nodes check consecutive duplicate pages when collecting lists
+		isRunRepeatList = true
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit //max page 100 in high-performance mode, 50 in queue mode
+	}
+	//child-task check
+	if s.ContinueDownListChildTask {
+		start = util.Config.PageTurnInfo.TurnPageMaxLimit + 1                                       //child-task start page
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit + util.Config.PageTurnInfo.NextPageMaxLimit //child-task max page
+	}
+	for ; start <= max && !s.Stop; start++ {
+		if !s.Stop && !s.ContinueDownListChildTask { //if the spider was taken offline while downloading detail pages, stop saving heartbeat info
+			UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list", start == 1) //record list-page heartbeat for all nodes
+		}
+		//qu.Debug("爬虫:", s.Code, "	配置最大页:", s.MaxPage, "	最终最大页:", max, "	当前页:", start, "重复次数:", repeatPageTimes)
+		if isRunRepeatList && repeatPageTimes >= util.Config.PageTurnInfo.RepeatPageTimesLimit { //consecutive-duplicate limit exceeded, stop paging
+			break
+		}
+		if err := s.L.CallByParam(lua.P{
+			Fn:      s.L.GetGlobal("downloadAndParseListPage"),
+			NRet:    1,
+			Protect: true,
+		}, lua.LNumber(start)); err != nil {
+			//panic(s.Code + "," + err.Error())
+			logger.Error("列表页采集报错", start, s.Code+","+err.Error())
+			errs = err.Error()
+			atomic.AddInt32(&s.Script.ErrorNum, 1)
+			//retry when list-page collection errors; past the retry limit, treat the page as collected (script errors exit directly, no internal retry)
+			if downtimes < 3 {
+				downtimes++
+				start--
+			} else if isRunRepeatList { //retries exhausted, treat this page as duplicate
+				repeatPageTimes++ //increment the counter
+				downtimes = 0
+			}
+			continue
+		}
+		lv := s.L.Get(-1)
+		s.L.Pop(1)
+		if tbl, ok := lv.(*lua.LTable); ok {
+			//list := []map[string]interface{}{}
+			//qu.Debug("当前页数据量:", tbl.Len())
+			if tabLen := tbl.Len(); tabLen > 0 { //list page has data; download detail pages based on the list info
+				repeatListNum := 0 // number of duplicate links on the current list page
+				for i := 1; i <= tabLen; i++ {
+					v := tbl.RawGetInt(i).(*lua.LTable)
+					tmp := util.TableToMap(v)
+					if !s.IsHistoricalMend { //not historical backfill
+						tmp["dataging"] = 0 //tag the data with dataging=0
+						if s.DownDetail {
+							s.DownloadDetailItem(tmp, &repeatListNum)
+						}
+					} else { //historical backfill
+						s.HistoricalMendDownloadDetailItem(tmp) //historical backfill: download detail pages
+					}
+				}
+				repeatAllNum += repeatListNum
+				downloadAllNum += tabLen
+				if isRunRepeatList { //run consecutive-page duplicate checking
+					if repeatListNum >= tabLen { //every item on the current page was already collected
+						repeatPageTimes++ //increment the counter
+					} else { //the current page has new data; reset the counter
+						repeatPageTimes = 0
+					}
+				}
+			} else if isRunRepeatList {
+				repeatPageTimes++ //increment the counter
+			}
+		} else if isRunRepeatList {
+			repeatPageTimes++ //increment the counter
+		}
+		downtimes = 0 //page downloaded cleanly; reset the retry counter
+		util.TimeSleepFunc(100*time.Millisecond, TimeSleepChan)
+	}
+	logger.Info(s.Code, "本轮列表页采集详情:", s.ContinueDownListChildTask, downloadAllNum, repeatAllNum, start, s.Stop)
+	if !util.Config.IsHistoryEvent && !s.Stop { //non-history nodes record the download rate
+		nowTime := time.Now()
+		sDate := qu.FormatDate(&nowTime, qu.Date_Short_Layout)
+		set := map[string]interface{}{
+			"site":       s.Name,
+			"channel":    s.Channel,
+			"spidercode": s.Code,
+			"updatetime": nowTime.Unix(),
+			"event":      util.Config.Uploadevent,
+			"modifyuser": s.MUserName,
+			"maxpage":    s.MaxPage,
+			"runrate":    s.SpiderRunRate,
+			"endpage":    start,
+			"date":       sDate,
+		}
+		inc := map[string]interface{}{
+			"alltimes": 1,
+		}
+		//record whether page turning succeeded
+		if s.MaxPage > 1 {
+			if s.PageOneTextHash != "" {
+				if s.PageTwoTextHash != "" {
+					if s.PageOneTextHash != s.PageTwoTextHash {
+						inc["page_success"] = 1
+					} else {
+						inc["page_fail"] = 1
+					}
+				} else {
+					inc["page_fail"] = 1
+				}
+			} else if s.PageTwoTextHash != "" {
+				inc["page_onefail"] = 1
+			}
+		}
+		if downloadAllNum > 0 {
+			rate := float64(downloadAllNum-repeatAllNum) / float64(downloadAllNum)
+			rate, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", rate), 64)
+			if rate == 1.0 {
+				if downloadAllNum == 1 { //filtering left only one new item on the list pages
+					inc["oh_percent_onenum"] = 1
+				} else {
+					inc["oh_percent"] = 1
+				}
+				//} else if rate >= 0.9 {
+				//	inc["nt_percent"] = 1
+				//} else if rate >= 0.8 {
+				//	inc["et_percent"] = 1
+				//} else {
+				//	inc["other_percent"] = 1
+			}
+			if isRunRepeatList && start > max { //consecutive paging passed the limit
+				if !s.ContinueDownListChildTask {
+					go ContinueDownListPageItem(s) //start a child task to keep collecting
+				} else {
+					inc["uplimit"] = 1
+				}
+			}
+		} else {
+			inc["zero"] = 1
+		}
+		query := map[string]interface{}{
+			"date":       sDate,
+			"spidercode": s.Code,
+		}
+		MgoS.Update("spider_downloadrate", query, map[string]interface{}{
+			"$set": set,
+			"$inc": inc,
+		}, true, false)
+	}
+	//reset state
+	s.PageOneTextHash = ""
+	s.PageTwoTextHash = ""
+	return errs
+}
+
+func (s *Spider) DownListPageItemBack() (errs interface{}) {
 	defer qu.Catch()
 	start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") //start page, max page
 	s.MaxPage = max                                                            //
@@ -243,24 +404,15 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 	downtimes := 0                                                                     //retry count for a page (tentatively 3)
 	repeatPageNum := 0                                                                 //page number on which every list link was a duplicate
 	repeatPageTimes := 0                                                               //count of consecutive all-duplicate pages (tentatively stop paging at 5)
-	repeatPageTimesLimit := util.Config.PageTurnInfo.RepeatPageTimesLimitW0            //upper limit on consecutive duplicate pages (high-performance mode 10, queue mode 5)
 	isRunRepeatList := false                                                           //whether to run consecutive duplicate checking on list pages
 	if !util.Config.IsHistoryEvent && util.Config.Modal == 1 && max > 1 && max < 101 { //all nodes except sequential mode and history nodes check consecutive duplicate pages when collecting lists
 		isRunRepeatList = true
-		max = util.Config.PageTurnInfo.TurnPageMaxLimitW0 //high-performance mode max page 100
-		if util.Config.Working == 1 {                     //queue mode
-			repeatPageTimesLimit = util.Config.PageTurnInfo.RepeatPageTimesLimitW1 //consecutive duplicate page limit 3
-			max = util.Config.PageTurnInfo.TurnPageMaxLimitW1                      //queue mode max page 50
-		}
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit //max page 100 in high-performance mode, 50 in queue mode
 	}
 	//child-task check
 	if s.ContinueDownListChildTask {
-		start = util.Config.PageTurnInfo.TurnPageMaxLimitW0 + 1
-		max = util.Config.PageTurnInfo.TurnPageMaxLimitW0 + util.Config.PageTurnInfo.NextPageMaxLimitW0
-		if util.Config.Working == 1 { //queue mode
-			start = util.Config.PageTurnInfo.TurnPageMaxLimitW1 + 1
-			max = util.Config.PageTurnInfo.TurnPageMaxLimitW1 + util.Config.PageTurnInfo.NextPageMaxLimitW1
-		}
+		start = util.Config.PageTurnInfo.TurnPageMaxLimit + 1
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit + util.Config.PageTurnInfo.NextPageMaxLimit
 	}
 	for ; start <= max && !s.Stop; start++ {
 		if !s.Stop && !s.ContinueDownListChildTask { //if the spider was taken offline while downloading detail pages, stop saving heartbeat info
@@ -270,7 +422,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 		//if start > tmpMax && isRunRepeatList && repeatPageTimes >= 5 { //more than 5 duplicate pages, stop paging
 		//	break
 		//}
-		if isRunRepeatList && repeatPageTimes >= repeatPageTimesLimit { //more than 10 duplicate pages, stop paging
+		if isRunRepeatList && repeatPageTimes >= util.Config.PageTurnInfo.RepeatPageTimesLimit { //consecutive-duplicate limit exceeded, stop paging
 			break
 		}
 		if err := s.L.CallByParam(lua.P{
@@ -444,11 +596,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 			"date":       sDate,
 			"spidercode": s.Code,
 		}
-		coll := "spider_downloadrate"
-		if s.ContinueDownListChildTask {
-			coll = "spider_downloadrate_child"
-		}
-		MgoS.Update(coll, query, map[string]interface{}{
+		MgoS.Update("spider_downloadrate", query, map[string]interface{}{
 			"$set": set,
 			"$inc": inc,
 		}, true, false)
@@ -469,26 +617,17 @@ func (s *Spider) DownListPageItemByThreads() (errs interface{}) {
 	if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" {     //node 7000: choose the max page by spider type
 		max = s.GetIntVar("spiderHistoryMaxPage") //history spiders take the spiderHistoryMaxPage setting
 	}
-	repeatPageTimesLimit := 0 //upper limit on consecutive duplicate pages
-	isRunRepeatList := false  //whether to run consecutive duplicate checking on list pages
+	isRunRepeatList := false //whether to run consecutive duplicate checking on list pages
 	//decide whether to check consecutive paging and adjust the max page
 	if !util.Config.IsHistoryEvent && util.Config.Modal == 1 && max > 1 && max < 101 { //all nodes except sequential mode and history nodes check consecutive duplicate pages when collecting lists
 		isRunRepeatList = true
-		repeatPageTimesLimit = util.Config.PageTurnInfo.RepeatPageTimesLimitW0
-		max = util.Config.PageTurnInfo.TurnPageMaxLimitW0 //high-performance mode max page 100
-		if util.Config.Working == 1 {                     //queue mode
-			repeatPageTimesLimit = util.Config.PageTurnInfo.RepeatPageTimesLimitW1 //consecutive duplicate page limit 3
-			max = util.Config.PageTurnInfo.TurnPageMaxLimitW1                      //queue mode max page 50
-		}
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit //max page 100 in high-performance mode, 50 in queue mode
+
 	}
 	//child-task check
 	//if s.ContinueDownListChildTask {
-	//	start = util.Config.PageTurnInfo.TurnPageMaxLimitW0 + 1
-	//	max = util.Config.PageTurnInfo.TurnPageMaxLimitW0 + util.Config.PageTurnInfo.NextPageMaxLimitW0
-	//	if util.Config.Working == 1 { //queue mode
-	//		start = util.Config.PageTurnInfo.TurnPageMaxLimitW1 + 1
-	//		max = util.Config.PageTurnInfo.TurnPageMaxLimitW1 + util.Config.PageTurnInfo.NextPageMaxLimitW1
-	//	}
+	//	start = util.Config.PageTurnInfo.TurnPageMaxLimit + 1
+	//	max = util.Config.PageTurnInfo.TurnPageMaxLimit + util.Config.PageTurnInfo.NextPageMaxLimit
 	//}
 	//create concurrent Spider objects
 	spChan := make(chan *Spider, 1)
@@ -527,6 +666,7 @@ func (s *Spider) DownListPageItemByThreads() (errs interface{}) {
 				}()
 				//download one page of data
 				downnum, repeatnum := sp.DownListOnePage(pagenum)
+				//logger.Info(sp.Code, "pagenum", pagenum, "repeat", downnum == repeatnum, downnum, repeatnum, &sp)
 				//aggregate the download counts
 				atomic.AddInt64(&downloadAllNum, int64(downnum))
 				atomic.AddInt64(&repeatAllNum, int64(repeatnum))
@@ -554,7 +694,7 @@ func (s *Spider) DownListPageItemByThreads() (errs interface{}) {
 					repeatTimes = 0
 				}
 			}
-			if repeatTimes >= repeatPageTimesLimit { //consecutive duplicate pages exceeded, stop collecting
+			if repeatTimes >= util.Config.PageTurnInfo.RepeatPageTimesLimit { //consecutive-duplicate limit exceeded, stop paging
 				break
 			}
 		}
@@ -625,11 +765,7 @@ func (s *Spider) DownListPageItemByThreads() (errs interface{}) {
 			"date":       sDate,
 			"spidercode": s.Code,
 		}
-		coll := "spider_downloadrate"
-		if s.ContinueDownListChildTask {
-			coll = "spider_downloadrate_child"
-		}
-		MgoS.Update(coll, query, map[string]interface{}{
+		MgoS.Update("spider_downloadrate", query, map[string]interface{}{
 			"$set": set,
 			"$inc": inc,
 		}, true, false)
@@ -655,21 +791,13 @@ func (s *Spider) DownListPageItemByThreadsBack() (errs interface{}) {
 	//decide whether to check consecutive paging and adjust the max page
 	if !util.Config.IsHistoryEvent && util.Config.Modal == 1 && max > 1 && max < 101 { //all nodes except sequential mode and history nodes check consecutive duplicate pages when collecting lists
 		isRunRepeatList = true
-		repeatPageTimesLimit = util.Config.PageTurnInfo.RepeatPageTimesLimitW0
-		max = util.Config.PageTurnInfo.TurnPageMaxLimitW0 //high-performance mode max page 100
-		if util.Config.Working == 1 {                     //queue mode
-			repeatPageTimesLimit = util.Config.PageTurnInfo.RepeatPageTimesLimitW1 //consecutive duplicate page limit 3
-			max = util.Config.PageTurnInfo.TurnPageMaxLimitW1                      //queue mode max page 50
-		}
+		repeatPageTimesLimit = util.Config.PageTurnInfo.RepeatPageTimesLimit
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit //max page 100 in high-performance mode, 50 in queue mode
 	}
 	//child-task check
 	if s.ContinueDownListChildTask {
-		start = util.Config.PageTurnInfo.TurnPageMaxLimitW0 + 1
-		max = util.Config.PageTurnInfo.TurnPageMaxLimitW0 + util.Config.PageTurnInfo.NextPageMaxLimitW0
-		if util.Config.Working == 1 { //queue mode
-			start = util.Config.PageTurnInfo.TurnPageMaxLimitW1 + 1
-			max = util.Config.PageTurnInfo.TurnPageMaxLimitW1 + util.Config.PageTurnInfo.NextPageMaxLimitW1
-		}
+		start = util.Config.PageTurnInfo.TurnPageMaxLimit + 1
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit + util.Config.PageTurnInfo.NextPageMaxLimit
 	}
 	//create concurrent Spider objects
 	spChan := make(chan *Spider, 1)
@@ -793,11 +921,7 @@ func (s *Spider) DownListPageItemByThreadsBack() (errs interface{}) {
 			"date":       sDate,
 			"spidercode": s.Code,
 		}
-		coll := "spider_downloadrate"
-		if s.ContinueDownListChildTask {
-			coll = "spider_downloadrate_child"
-		}
-		MgoS.Update(coll, query, map[string]interface{}{
+		MgoS.Update("spider_downloadrate", query, map[string]interface{}{
 			"$set": set,
 			"$inc": inc,
 		}, true, false)
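
The recurring logic across DownListPageItem, DownListPageItemByThreads, and their *Back variants is the consecutive-duplicate-page stop condition driven by the single RepeatPageTimesLimit/TurnPageMaxLimit values. Below is a minimal standalone sketch of that loop, reduced to the counting logic; downloadList and fetchPage are stand-ins for the Lua downloadAndParseListPage call and the per-item dedup above, and the names are assumptions, not project code:

package main

import "fmt"

// downloadList walks list pages and stops early once repeatLimit consecutive
// pages contain nothing new, mirroring the repeatPageTimes handling above.
// fetchPage returns (total items, duplicate items) for one page.
func downloadList(start, max, repeatLimit int, fetchPage func(page int) (int, int)) (downloaded, repeated int) {
	repeatPageTimes := 0
	for page := start; page <= max; page++ {
		if repeatPageTimes >= repeatLimit { // consecutive-duplicate limit reached, stop paging
			break
		}
		total, dup := fetchPage(page)
		downloaded += total
		repeated += dup
		if total == 0 || dup >= total { // page is empty or fully duplicate
			repeatPageTimes++
		} else { // page produced new data, reset the counter
			repeatPageTimes = 0
		}
	}
	return
}

func main() {
	// Toy pages: the first three still contain new links, everything after is fully duplicate.
	pages := [][2]int{{10, 2}, {10, 4}, {10, 9}, {10, 10}, {10, 10}, {10, 10}, {10, 10}}
	down, rep := downloadList(1, 100, 3, func(p int) (int, int) {
		if p-1 < len(pages) {
			return pages[p-1][0], pages[p-1][1]
		}
		return 10, 10
	})
	fmt.Println("downloaded:", down, "repeated:", rep) // stops after 3 consecutive all-duplicate pages
}

A page counts toward the limit when it is empty or every item on it is already known, which is how repeatPageTimes is incremented and reset in the functions above; the child-task path simply restarts the same loop from TurnPageMaxLimit+1 up to TurnPageMaxLimit+NextPageMaxLimit.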