maxiaoshan 3 years ago
parent
commit
0a7a6700c4
2 changed files with 39 additions and 3 deletions
  1. 1 1
      src/config.json
  2. 38 2
      src/spider/spider.go

+ 1 - 1
src/config.json

@@ -19,7 +19,7 @@
     "ishistoryevent": false,
     "tesseractadd": "http://test.qmx.top:1688",
     "testdir": "res/test/spider_test.lua",
-    "redisservers": "title_repeat_judgement=192.168.3.207:1679,title_repeat_fulljudgement=192.168.3.207:1679,title_repeat_listpagehref=192.168.3.207:1679",
+    "redisservers": "title_repeat_judgement=192.168.3.207:1679,title_repeat_fulljudgement=192.168.3.207:1679,title_repeat_listpagehref=192.168.3.207:2679",
      "word":{
     	"keyword":"(抽签|中标|招标|成交|合同|中标候选人|资格预审|拟建|邀请|询价|比选|议价|竞价|磋商|采购|招投标|答疑|变更公告|更正公告|竞争性谈判|竞谈|意见征询|澄清|单一来源|流标|废标|验收公告|中止|终止|违规|处罚|征集公告|开标结果|评审结果|监理|招租|租赁|评判结果|项目|遴选|补遗|竞标|征求意见)",
     	"notkeyword":"(招聘|拍卖|出租|出让|使用权|资产)"

+ 38 - 2
src/spider/spider.go

@@ -15,6 +15,7 @@ import (
 	mgo "mongodb"
 	qu "qfw/util"
 	mgu "qfw/util/mongodbutil"
+	"strconv"
 
 	//mgu "qfw/util/mongodbutil"
 	//"qfw/util/redis"
@@ -234,8 +235,11 @@ func (s *Spider) GetLastPublishTime() (errs interface{}) {
 //下载列表
 func (s *Spider) DownListPageItem() (errs interface{}) {
 	defer mu.Catch()
-	start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage")
-	if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" { //7000节点,爬虫跑历史
+	start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") //起始页、最大页
+	tmpMax := max                                                              //临时记录最大页
+	repeatAllNum := 0                                                          //本轮采集tmpMax页总的重复个数
+	downloadAllNum := 0                                                        //本轮采集tmpMax页总个数
+	if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" {     //7000节点,爬虫跑历史
 		max = s.GetIntVar("spiderHistoryMaxPage")
 	}
 	downtimes := 0                                                                  //记录某页重试次数(暂定3次)
@@ -298,6 +302,10 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 						s.HistoricalMendDownloadDetailItem(tmp) //历史补漏下载三级页
 					}
 				}
+				if start <= tmpMax { //数量赋值
+					repeatAllNum += repeatListNum
+					downloadAllNum += tabLen
+				}
 				if isRunRepeatList { //执行连续页码判重
 					if repeatListNum >= tabLen { //当前start列表页全部数据都已采集
 						//qu.Debug("重复页:", repeatPageNum, "当前页:", start)
@@ -352,6 +360,34 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 		//			map[string]interface{}{"$inc": map[string]interface{}{"param_common.4": 1}},
 		//			true, false)
 	}
+	if downloadAllNum > 0 {
+		rate := float64(downloadAllNum-repeatAllNum) / float64(downloadAllNum)
+		rate, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", rate), 64)
+		set := map[string]interface{}{
+			"spidercode": s.Code,
+			"updatetime": time.Now().Unix(),
+			"event":      util.Config.Uploadevent,
+			"modifyuser": s.MUserName,
+			"maxpage":    tmpMax,
+			"runrate":    s.SpiderRunRate,
+		}
+		inc := map[string]interface{}{
+			"alltimes": 1,
+		}
+		if rate == 1.0 {
+			inc["oh_percent"] = 1
+		} else if rate >= 0.9 {
+			inc["nt_percent"] = 1
+		} else if rate >= 0.8 {
+			inc["et_percent"] = 1
+		} else {
+			inc["other_percent"] = 1
+		}
+		Mgo.Update("spider_downloadrate", map[string]interface{}{"spidercode": s.Code}, map[string]interface{}{
+			"$set": set,
+			"$inc": inc,
+		}, true, false)
+	}
 	return errs
 }