|
@@ -15,6 +15,7 @@ import (
|
|
|
mgo "mongodb"
|
|
|
qu "qfw/util"
|
|
|
mgu "qfw/util/mongodbutil"
|
|
|
+ "strconv"
|
|
|
|
|
|
//mgu "qfw/util/mongodbutil"
|
|
|
//"qfw/util/redis"
|
|
@@ -234,8 +235,11 @@ func (s *Spider) GetLastPublishTime() (errs interface{}) {
|
|
|
//下载列表
|
|
|
func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
defer mu.Catch()
|
|
|
- start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage")
|
|
|
- if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" { //7000节点,爬虫跑历史
|
|
|
+ start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") //起始页、最大页
|
|
|
+ tmpMax := max //临时记录最大页
|
|
|
+ repeatAllNum := 0 //本轮采集tmpMax页总的重复个数
|
|
|
+ downloadAllNum := 0 //本轮采集tmpMax页总个数
|
|
|
+ if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" { //7000节点,爬虫跑历史
|
|
|
max = s.GetIntVar("spiderHistoryMaxPage")
|
|
|
}
|
|
|
downtimes := 0 //记录某页重试次数(暂定3次)
|
|
@@ -298,6 +302,10 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
s.HistoricalMendDownloadDetailItem(tmp) //历史补漏下载三级页
|
|
|
}
|
|
|
}
|
|
|
+ if start <= tmpMax { //数量赋值
|
|
|
+ repeatAllNum += repeatListNum
|
|
|
+ downloadAllNum += tabLen
|
|
|
+ }
|
|
|
if isRunRepeatList { //执行连续页码判重
|
|
|
if repeatListNum >= tabLen { //当前start列表页全部数据都已采集
|
|
|
//qu.Debug("重复页:", repeatPageNum, "当前页:", start)
|
|
@@ -352,6 +360,34 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
// map[string]interface{}{"$inc": map[string]interface{}{"param_common.4": 1}},
|
|
|
// true, false)
|
|
|
}
|
|
|
+ if downloadAllNum > 0 {
|
|
|
+ rate := float64(downloadAllNum-repeatAllNum) / float64(downloadAllNum)
|
|
|
+ rate, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", rate), 64)
|
|
|
+ set := map[string]interface{}{
|
|
|
+ "spidercode": s.Code,
|
|
|
+ "updatetime": time.Now().Unix(),
|
|
|
+ "event": util.Config.Uploadevent,
|
|
|
+ "modifyuser": s.MUserName,
|
|
|
+ "maxpage": tmpMax,
|
|
|
+ "runrate": s.SpiderRunRate,
|
|
|
+ }
|
|
|
+ inc := map[string]interface{}{
|
|
|
+ "alltimes": 1,
|
|
|
+ }
|
|
|
+ if rate == 1.0 {
|
|
|
+ inc["oh_percent"] = 1
|
|
|
+ } else if rate >= 0.9 {
|
|
|
+ inc["nt_percent"] = 1
|
|
|
+ } else if rate >= 0.8 {
|
|
|
+ inc["et_percent"] = 1
|
|
|
+ } else {
|
|
|
+ inc["other_percent"] = 1
|
|
|
+ }
|
|
|
+ Mgo.Update("spider_downloadrate", map[string]interface{}{"spidercode": s.Code}, map[string]interface{}{
|
|
|
+ "$set": set,
|
|
|
+ "$inc": inc,
|
|
|
+ }, true, false)
|
|
|
+ }
|
|
|
return errs
|
|
|
}
|
|
|
|