|
@@ -289,8 +289,11 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
max = 50 //队列模式最大页50
|
|
|
}
|
|
|
}
|
|
|
+ inc := map[string]interface{}{}
|
|
|
+ pageOneDataHash := ""
|
|
|
for ; start <= max && !s.Stop; start++ {
|
|
|
- if !s.Stop { //在下载详情页时爬虫下架,此时不再存心跳信息
|
|
|
+ var HrefArr []string //记录href
|
|
|
+ if !s.Stop { //在下载详情页时爬虫下架,此时不再存心跳信息
|
|
|
UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list") //记录所有节点列表页心跳
|
|
|
}
|
|
|
//logger.Info("爬虫:", s.Code, "重复页:", repeatPageNum, " 配置最大页:", tmpMax, " 最终最大页:", max, " 当前页:", start, "重复次数:", repeatPageTimes)
|
|
@@ -335,6 +338,9 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
for i := 1; i <= tabLen; i++ {
|
|
|
v := tbl.RawGetInt(i).(*lua.LTable)
|
|
|
tmp := util.TableToMap(v)
|
|
|
+ if tmpMax > 1 { //配置页码大于1时记录
|
|
|
+ HrefArr = append(HrefArr, qu.ObjToString(tmp["href"]))
|
|
|
+ }
|
|
|
//s.ThisSiteData(tmp) //统计当前下载数据是否是本站点数据
|
|
|
if !s.IsHistoricalMend { //不是历史补漏
|
|
|
tmp["dataging"] = 0 //数据中打标记dataging=0
|
|
@@ -411,8 +417,28 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
}
|
|
|
}
|
|
|
downtimes = 0 //当前页下载无误,重置下载重试次数
|
|
|
+ //记录翻页异常爬虫(目前只统计页面至少能翻到第二页,且两页都能采到数据的情况。)
|
|
|
+ if tmpMax > 1 && !util.Config.IsHistoryEvent {
|
|
|
+ if start == 1 && len(HrefArr) > 0 { //记录第一页数据的hash
|
|
|
+ pageOneDataHash = util.HexText(strings.Join(HrefArr, ","))
|
|
|
+ } else if start == 2 && pageOneDataHash != "" { //对比第一页的hash
|
|
|
+ //此处hrefArr可能为空,1、可能全部过滤;2、采集失败无数据
|
|
|
+ if len(HrefArr) > 0 {
|
|
|
+ pageTwoDataHash := util.HexText(strings.Join(HrefArr, ","))
|
|
|
+ if pageTwoDataHash == pageOneDataHash {
|
|
|
+ inc["page_fail"] = 1 //两页数据相同,翻页失败
|
|
|
+ } else { //两页数据不同,翻页成功
|
|
|
+ inc["page_success"] = 1
|
|
|
+ }
|
|
|
+ } else { //第二页无数据,无法翻页
|
|
|
+ inc["page_fail"] = 1
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ HrefArr = []string{}
|
|
|
util.TimeSleepFunc(100*time.Millisecond, TimeSleepChan)
|
|
|
}
|
|
|
+ pageOneDataHash = ""
|
|
|
logger.Info(s.Code, "本轮列表页采集详情:", downloadAllNum, repeatAllNum, start, s.Stop)
|
|
|
if !util.Config.IsHistoryEvent && !s.Stop { //非历史节点统计下载率
|
|
|
nowTime := time.Now()
|
|
@@ -429,9 +455,10 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
"endpage": start,
|
|
|
"date": sDate,
|
|
|
}
|
|
|
- inc := map[string]interface{}{
|
|
|
- "alltimes": 1,
|
|
|
- }
|
|
|
+ //inc = map[string]interface{}{
|
|
|
+ // "alltimes": 1,
|
|
|
+ //}
|
|
|
+ inc["alltimes"] = 1
|
|
|
if downloadAllNum > 0 {
|
|
|
rate := float64(downloadAllNum-repeatAllNum) / float64(downloadAllNum)
|
|
|
rate, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", rate), 64)
|