@@ -270,10 +270,11 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 	defer mu.Catch()
 	s.AlreadyGetPageHeart = map[int]bool{} //重置记录
 	start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") //起始页、最大页
-	tmpMax := max //临时记录最大页
-	repeatAllNum := 0 //本轮采集tmpMax页总的重复个数
-	downloadAllNum := 0 //本轮采集tmpMax页总个数
-	if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" { //7000节点,爬虫跑历史
+	s.MaxPage = max //
+	//tmpMax := max //临时记录最大页
+	repeatAllNum := 0 //本轮采集tmpMax页总的重复个数
+	downloadAllNum := 0 //本轮采集tmpMax页总个数
+	if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" { //7000节点,爬虫跑历史
 		max = s.GetIntVar("spiderHistoryMaxPage")
 	}
 	downtimes := 0 //记录某页重试次数(暂定3次)
@@ -289,11 +290,8 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 			max = 50 //队列模式最大页50
 		}
 	}
-	inc := map[string]interface{}{}
-	pageOneDataHash := ""
 	for ; start <= max && !s.Stop; start++ {
-		var HrefArr []string //记录href
-		if !s.Stop { //在下载详情页时爬虫下架,此时不再存心跳信息
+		if !s.Stop { //在下载详情页时爬虫下架,此时不再存心跳信息
 			UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list") //记录所有节点列表页心跳
 		}
 		//logger.Info("爬虫:", s.Code, "重复页:", repeatPageNum, " 配置最大页:", tmpMax, " 最终最大页:", max, " 当前页:", start, "重复次数:", repeatPageTimes)
@@ -338,9 +336,6 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 			for i := 1; i <= tabLen; i++ {
 				v := tbl.RawGetInt(i).(*lua.LTable)
 				tmp := util.TableToMap(v)
-				if tmpMax > 1 { //配置页码大于1时记录
-					HrefArr = append(HrefArr, qu.ObjToString(tmp["href"]))
-				}
 				//s.ThisSiteData(tmp) //统计当前下载数据是否是本站点数据
 				if !s.IsHistoricalMend { //不是历史补漏
 					tmp["dataging"] = 0 //数据中打标记dataging=0
@@ -417,28 +412,8 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 			}
 		}
 		downtimes = 0 //当前页下载无误,重置下载重试次数
-		//记录翻页异常爬虫(目前只统计页面至少能翻到第二页,且两页都能采到数据的情况。)
-		if tmpMax > 1 && !util.Config.IsHistoryEvent {
-			if start == 1 && len(HrefArr) > 0 { //记录第一页数据的hash
-				pageOneDataHash = util.HexText(strings.Join(HrefArr, ","))
-			} else if start == 2 && pageOneDataHash != "" { //对比第一页的hash
-				//此处hrefArr可能为空,1、可能全部过滤;2、采集失败无数据
-				if len(HrefArr) > 0 {
-					pageTwoDataHash := util.HexText(strings.Join(HrefArr, ","))
-					if pageTwoDataHash == pageOneDataHash {
-						inc["page_fail"] = 1 //两页数据相同,翻页失败
-					} else { //两页数据不同,翻页成功
-						inc["page_success"] = 1
-					}
-				} else { //第二页无数据,无法翻页
-					inc["page_fail"] = 1
-				}
-			}
-		}
-		HrefArr = []string{}
 		util.TimeSleepFunc(100*time.Millisecond, TimeSleepChan)
 	}
-	pageOneDataHash = ""
 	logger.Info(s.Code, "本轮列表页采集详情:", downloadAllNum, repeatAllNum, start, s.Stop)
 	if !util.Config.IsHistoryEvent && !s.Stop { //非历史节点统计下载率
 		nowTime := time.Now()
@@ -450,15 +425,26 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 			"updatetime": nowTime.Unix(),
 			"event": util.Config.Uploadevent,
 			"modifyuser": s.MUserName,
-			"maxpage": tmpMax,
+			"maxpage": s.MaxPage,
 			"runrate": s.SpiderRunRate,
 			"endpage": start,
 			"date": sDate,
 		}
-		//inc = map[string]interface{}{
-		//	"alltimes": 1,
-		//}
-		inc["alltimes"] = 1
+		inc := map[string]interface{}{
+			"alltimes": 1,
+		}
+		//记录翻页是否成功
+		if s.PageOneTextHash != "" {
+			if s.PageTwoTextHash != "" {
+				if s.PageOneTextHash != s.PageTwoTextHash {
+					inc["page_success"] = 1
+				} else {
+					inc["page_fail"] = 1
+				}
+			} else {
+				inc["page_fail"] = 1
+			}
+		}
 		if downloadAllNum > 0 {
 			rate := float64(downloadAllNum-repeatAllNum) / float64(downloadAllNum)
 			rate, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", rate), 64)
@@ -483,6 +469,10 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 			"$inc": inc,
 		}, true, false)
 	}
+	//信息重置
+	s.RecordedHeartInfo = false
+	s.PageOneTextHash = ""
+	s.PageTwoTextHash = ""
 	return errs
 }