|
@@ -244,15 +244,15 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
}
|
|
|
downtimes := 0 //记录某页重试次数(暂定3次)
|
|
|
repeatPageNum := 0 //记录列表页所有连接重复的页码
|
|
|
- repeatPageTimes := 0 //记录页码连续判重的次数(暂定连续判重页码数为10次时,不再翻页)
|
|
|
+ repeatPageTimes := 0 //记录页码连续判重的次数(暂定连续判重页码数为5次时,不再翻页)
|
|
|
isRunRepeatList := false //是否执行列表页连续判重
|
|
|
- if util.Config.Modal == 1 && util.Config.Working == 0 && max > 1 && max < 101 { //7100 7400最大页小于101且大于1,对此部分爬虫采集列表页时进行连续10页判重
|
|
|
+ if util.Config.Modal == 1 && util.Config.Working == 0 && max > 1 && max < 101 { //7100 7400最大页小于101且大于1,对此部分爬虫采集列表页时进行连续5页判重
|
|
|
isRunRepeatList = true
|
|
|
max = 100 //设置最大页为100
|
|
|
}
|
|
|
for ; start <= max && !s.Stop; start++ {
|
|
|
- //qu.Debug("重复页:", repeatPageNum, " 最大页:", max, " 当前页:", start, "重复次数:", repeatPageTimes)
|
|
|
- if isRunRepeatList && repeatPageTimes >= 10 { //重复次数超过10次,不再翻页
|
|
|
+ //qu.Debug("重复页:", repeatPageNum, " 配置最大页:", tmpMax, " 最终最大页:", max, " 当前页:", start, "重复次数:", repeatPageTimes)
|
|
|
+ if start > tmpMax && isRunRepeatList && repeatPageTimes >= 5 { // stop paging once start is past tmpMax and repeatPageTimes has reached 5
|
|
|
break
|
|
|
}
|
|
|
if err := s.L.CallByParam(lua.P{
|
|
@@ -306,7 +306,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
repeatAllNum += repeatListNum
|
|
|
downloadAllNum += tabLen
|
|
|
}
|
|
|
- if isRunRepeatList { //执行连续页码判重
|
|
|
+ if start > tmpMax && isRunRepeatList { //执行连续页码判重
|
|
|
if repeatListNum >= tabLen { //当前start列表页全部数据都已采集
|
|
|
//qu.Debug("重复页:", repeatPageNum, "当前页:", start)
|
|
|
if repeatPageNum+1 == start || repeatPageNum == 0 {
|
|
@@ -330,7 +330,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
downtimes++
|
|
|
start--
|
|
|
continue
|
|
|
- } else if isRunRepeatList { //超过重试次数,视为本页重复
|
|
|
+ } else if start > tmpMax && isRunRepeatList { //超过重试次数,视为本页重复
|
|
|
if repeatPageNum+1 == start {
|
|
|
repeatPageTimes++ //次数加1
|
|
|
} else {
|
|
@@ -344,7 +344,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
downtimes++
|
|
|
start--
|
|
|
continue
|
|
|
- } else if isRunRepeatList { //超过重试次数,视为本页重复
|
|
|
+ } else if start > tmpMax && isRunRepeatList { //超过重试次数,视为本页重复
|
|
|
if repeatPageNum+1 == start {
|
|
|
repeatPageTimes++ //次数加1
|
|
|
} else {
|
|
@@ -360,20 +360,20 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
// map[string]interface{}{"$inc": map[string]interface{}{"param_common.4": 1}},
|
|
|
// true, false)
|
|
|
}
|
|
|
+ set := map[string]interface{}{
|
|
|
+ "spidercode": s.Code,
|
|
|
+ "updatetime": time.Now().Unix(),
|
|
|
+ "event": util.Config.Uploadevent,
|
|
|
+ "modifyuser": s.MUserName,
|
|
|
+ "maxpage": tmpMax,
|
|
|
+ "runrate": s.SpiderRunRate,
|
|
|
+ "endpage": start,
|
|
|
+ }
|
|
|
+ inc := map[string]interface{}{}
|
|
|
if downloadAllNum > 0 {
|
|
|
+ inc["alltimes"] = 1
|
|
|
rate := float64(downloadAllNum-repeatAllNum) / float64(downloadAllNum)
|
|
|
rate, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", rate), 64)
|
|
|
- set := map[string]interface{}{
|
|
|
- "spidercode": s.Code,
|
|
|
- "updatetime": time.Now().Unix(),
|
|
|
- "event": util.Config.Uploadevent,
|
|
|
- "modifyuser": s.MUserName,
|
|
|
- "maxpage": tmpMax,
|
|
|
- "runrate": s.SpiderRunRate,
|
|
|
- }
|
|
|
- inc := map[string]interface{}{
|
|
|
- "alltimes": 1,
|
|
|
- }
|
|
|
if rate == 1.0 {
|
|
|
inc["oh_percent"] = 1
|
|
|
} else if rate >= 0.9 {
|
|
@@ -383,11 +383,13 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
} else {
|
|
|
inc["other_percent"] = 1
|
|
|
}
|
|
|
- Mgo.Update("spider_downloadrate", map[string]interface{}{"spidercode": s.Code}, map[string]interface{}{
|
|
|
- "$set": set,
|
|
|
- "$inc": inc,
|
|
|
- }, true, false)
|
|
|
+ } else {
|
|
|
+ inc["zero"] = 1
|
|
|
}
|
|
|
+ Mgo.Update("spider_downloadrate", map[string]interface{}{"spidercode": s.Code}, map[string]interface{}{
|
|
|
+ "$set": set,
|
|
|
+ "$inc": inc,
|
|
|
+ }, true, false)
|
|
|
return errs
|
|
|
}
|
|
|
|