Bläddra i källkod

监控列表页采集量、重复量

maxiaoshan 3 år sedan
förälder
incheckning
b1227ac228
1 ändrad fil med 24 tillägg och 22 borttagningar
  1. src/spider/spider.go (+24 −22)

+ 24 - 22
src/spider/spider.go

@@ -244,15 +244,15 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 	}
 	downtimes := 0                                                                  //记录某页重试次数(暂定3次)
 	repeatPageNum := 0                                                              //记录列表页所有连接重复的页码
-	repeatPageTimes := 0                                                            //记录页码连续判重的次数(暂定连续判重页码数为10次时,不再翻页)
+	repeatPageTimes := 0                                                            //记录页码连续判重的次数(暂定连续判重页码数为5次时,不再翻页)
 	isRunRepeatList := false                                                        //是否执行列表页连续判重
-	if util.Config.Modal == 1 && util.Config.Working == 0 && max > 1 && max < 101 { //7100 7400最大页小于101且大于1,对此部分爬虫采集列表页时进行连续10页判重
+	if util.Config.Modal == 1 && util.Config.Working == 0 && max > 1 && max < 101 { //7100 7400最大页小于101且大于1,对此部分爬虫采集列表页时进行连续5页判重
 		isRunRepeatList = true
 		max = 100 //设置最大页为100
 	}
 	for ; start <= max && !s.Stop; start++ {
-		//qu.Debug("重复页:", repeatPageNum, "	最大页:", max, "	当前页:", start, "重复次数:", repeatPageTimes)
-		if isRunRepeatList && repeatPageTimes >= 10 { //重复次数超过10次,不再翻页
+		//qu.Debug("重复页:", repeatPageNum, "	配置最大页:", tmpMax, "	最终最大页:", max, "	当前页:", start, "重复次数:", repeatPageTimes)
+		if start > tmpMax && isRunRepeatList && repeatPageTimes >= 5 { //重复次数超过5次,不再翻页
 			break
 		}
 		if err := s.L.CallByParam(lua.P{
@@ -306,7 +306,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 					repeatAllNum += repeatListNum
 					downloadAllNum += tabLen
 				}
-				if isRunRepeatList { //执行连续页码判重
+				if start > tmpMax && isRunRepeatList { //执行连续页码判重
 					if repeatListNum >= tabLen { //当前start列表页全部数据都已采集
 						//qu.Debug("重复页:", repeatPageNum, "当前页:", start)
 						if repeatPageNum+1 == start || repeatPageNum == 0 {
@@ -330,7 +330,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 					downtimes++
 					start--
 					continue
-				} else if isRunRepeatList { //超过重试次数,视为本页重复
+				} else if start > tmpMax && isRunRepeatList { //超过重试次数,视为本页重复
 					if repeatPageNum+1 == start {
 						repeatPageTimes++ //次数加1
 					} else {
@@ -344,7 +344,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 				downtimes++
 				start--
 				continue
-			} else if isRunRepeatList { //超过重试次数,视为本页重复
+			} else if start > tmpMax && isRunRepeatList { //超过重试次数,视为本页重复
 				if repeatPageNum+1 == start {
 					repeatPageTimes++ //次数加1
 				} else {
@@ -360,20 +360,20 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 		//			map[string]interface{}{"$inc": map[string]interface{}{"param_common.4": 1}},
 		//			true, false)
 	}
+	set := map[string]interface{}{
+		"spidercode": s.Code,
+		"updatetime": time.Now().Unix(),
+		"event":      util.Config.Uploadevent,
+		"modifyuser": s.MUserName,
+		"maxpage":    tmpMax,
+		"runrate":    s.SpiderRunRate,
+		"endpage":    start,
+	}
+	inc := map[string]interface{}{}
 	if downloadAllNum > 0 {
+		inc["alltimes"] = 1
 		rate := float64(downloadAllNum-repeatAllNum) / float64(downloadAllNum)
 		rate, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", rate), 64)
-		set := map[string]interface{}{
-			"spidercode": s.Code,
-			"updatetime": time.Now().Unix(),
-			"event":      util.Config.Uploadevent,
-			"modifyuser": s.MUserName,
-			"maxpage":    tmpMax,
-			"runrate":    s.SpiderRunRate,
-		}
-		inc := map[string]interface{}{
-			"alltimes": 1,
-		}
 		if rate == 1.0 {
 			inc["oh_percent"] = 1
 		} else if rate >= 0.9 {
@@ -383,11 +383,13 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 		} else {
 			inc["other_percent"] = 1
 		}
-		Mgo.Update("spider_downloadrate", map[string]interface{}{"spidercode": s.Code}, map[string]interface{}{
-			"$set": set,
-			"$inc": inc,
-		}, true, false)
+	} else {
+		inc["zero"] = 1
 	}
+	Mgo.Update("spider_downloadrate", map[string]interface{}{"spidercode": s.Code}, map[string]interface{}{
+		"$set": set,
+		"$inc": inc,
+	}, true, false)
 	return errs
 }