@@ -229,8 +229,169 @@ func (s *Spider) GetLastPublishTime() (errs interface{}) {
 	return nil
 }
 
-//Download the list pages
+//Download the list pages (unlike DownListPageItemBack, drops the retry on empty pages and the duplicate-page-number tracking)
 func (s *Spider) DownListPageItem() (errs interface{}) {
+	defer qu.Catch()
+	start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") //start page and max page
+	s.MaxPage = max //
+	repeatAllNum := 0   //total number of duplicate links collected in this round
+	downloadAllNum := 0 //total number of links collected in this round
+	if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" { //event-7000 node: the spider crawls history
+		max = s.GetIntVar("spiderHistoryMaxPage")
+	}
+	downtimes := 0           //retry counter for a single page (tentatively 3 retries)
+	repeatPageTimes := 0     //number of consecutive pages judged fully duplicate (tentatively stop paging after 5 such pages)
+	isRunRepeatList := false //whether to run consecutive duplicate checking on list pages
+	if !util.Config.IsHistoryEvent && util.Config.Modal == 1 && max > 1 && max < 101 { //all nodes except sequential-collection mode and history nodes check consecutive list pages for duplicates
+		isRunRepeatList = true
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit //max page: 100 in high-performance mode, 50 in queue mode
+	}
+	//child-task check
+	if s.ContinueDownListChildTask {
+		start = util.Config.PageTurnInfo.TurnPageMaxLimit + 1                                       //child-task start page
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit + util.Config.PageTurnInfo.NextPageMaxLimit //child-task max page
+	}
+	for ; start <= max && !s.Stop; start++ {
+		if !s.Stop && !s.ContinueDownListChildTask { //if the spider is taken offline while detail pages are downloading, stop storing heartbeat info
+			UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list", start == 1) //record the list-page heartbeat for all nodes
+		}
+		//qu.Debug("spider:", s.Code, " configured max page:", s.MaxPage, " final max page:", max, " current page:", start, " repeat times:", repeatPageTimes)
+		if isRunRepeatList && repeatPageTimes >= util.Config.PageTurnInfo.RepeatPageTimesLimit { //consecutive duplicate-page limit exceeded; stop paging
+			break
+		}
+		if err := s.L.CallByParam(lua.P{
+			Fn:      s.L.GetGlobal("downloadAndParseListPage"),
+			NRet:    1,
+			Protect: true,
+		}, lua.LNumber(start)); err != nil {
+			//panic(s.Code + "," + err.Error())
+			logger.Error("列表页采集报错", start, s.Code+","+err.Error())
+			errs = err.Error()
+			atomic.AddInt32(&s.Script.ErrorNum, 1)
+			//retry when list-page collection errors out; once retries are exhausted, treat the page as collected (a script error exits directly without internal retries)
+			if downtimes < 3 {
+				downtimes++
+				start--
+			} else if isRunRepeatList { //retries exhausted; treat this page as duplicate
+				repeatPageTimes++ //increment the counter
+				downtimes = 0
+			}
+			continue
+		}
+		lv := s.L.Get(-1)
+		s.L.Pop(1)
+		if tbl, ok := lv.(*lua.LTable); ok {
+			//list := []map[string]interface{}{}
+			//qu.Debug("items on the current page:", tbl.Len())
+			if tabLen := tbl.Len(); tabLen > 0 { //the list page has data; download detail (third-level) pages from the list info
+				repeatListNum := 0 //number of duplicate links on the current list page
+				for i := 1; i <= tabLen; i++ {
+					v := tbl.RawGetInt(i).(*lua.LTable)
+					tmp := util.TableToMap(v)
+					if !s.IsHistoricalMend { //not historical backfill
+						tmp["dataging"] = 0 //mark the data with dataging=0
+						if s.DownDetail {
+							s.DownloadDetailItem(tmp, &repeatListNum)
+						}
+					} else { //historical backfill
+						s.HistoricalMendDownloadDetailItem(tmp) //historical backfill: download the detail page
+					}
+				}
+				repeatAllNum += repeatListNum
+				downloadAllNum += tabLen
+				if isRunRepeatList { //run consecutive duplicate checking
+					if repeatListNum >= tabLen { //every item on the current page has already been collected
+						repeatPageTimes++ //increment the counter
+					} else { //the current page has new data; reset the counter
+						repeatPageTimes = 0
+					}
+				}
+			} else if isRunRepeatList {
+				repeatPageTimes++ //increment the counter
+			}
+		} else if isRunRepeatList {
+			repeatPageTimes++ //increment the counter
+		}
+		downtimes = 0 //the current page downloaded without error; reset the retry counter
+		util.TimeSleepFunc(100*time.Millisecond, TimeSleepChan)
+	}
+	logger.Info(s.Code, "本轮列表页采集详情:", s.ContinueDownListChildTask, downloadAllNum, repeatAllNum, start, s.Stop)
+	if !util.Config.IsHistoryEvent && !s.Stop { //non-history nodes record the download rate
+		nowTime := time.Now()
+		sDate := qu.FormatDate(&nowTime, qu.Date_Short_Layout)
+		set := map[string]interface{}{
+			"site":       s.Name,
+			"channel":    s.Channel,
+			"spidercode": s.Code,
+			"updatetime": nowTime.Unix(),
+			"event":      util.Config.Uploadevent,
+			"modifyuser": s.MUserName,
+			"maxpage":    s.MaxPage,
+			"runrate":    s.SpiderRunRate,
+			"endpage":    start,
+			"date":       sDate,
+		}
+		inc := map[string]interface{}{
+			"alltimes": 1,
+		}
+		//record whether page turning succeeded
+		if s.MaxPage > 1 {
+			if s.PageOneTextHash != "" {
+				if s.PageTwoTextHash != "" {
+					if s.PageOneTextHash != s.PageTwoTextHash {
+						inc["page_success"] = 1
+					} else {
+						inc["page_fail"] = 1
+					}
+				} else {
+					inc["page_fail"] = 1
+				}
+			} else if s.PageTwoTextHash != "" {
+				inc["page_onefail"] = 1
+			}
+		}
+		if downloadAllNum > 0 {
+			rate := float64(downloadAllNum-repeatAllNum) / float64(downloadAllNum)
+			rate, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", rate), 64)
+			if rate == 1.0 {
+				if downloadAllNum == 1 { //list-page filtering left only one new item
+					inc["oh_percent_onenum"] = 1
+				} else {
+					inc["oh_percent"] = 1
+				}
+			//} else if rate >= 0.9 {
+			//	inc["nt_percent"] = 1
+			//} else if rate >= 0.8 {
+			//	inc["et_percent"] = 1
+			//} else {
+			//	inc["other_percent"] = 1
+			}
+			if isRunRepeatList && start > max { //consecutive paging exceeded the upper limit
+				if !s.ContinueDownListChildTask {
+					go ContinueDownListPageItem(s) //start a child task to continue collecting
+				} else {
+					inc["uplimit"] = 1
+				}
+			}
+		} else {
+			inc["zero"] = 1
+		}
+		query := map[string]interface{}{
+			"date":       sDate,
+			"spidercode": s.Code,
+		}
+		MgoS.Update("spider_downloadrate", query, map[string]interface{}{
+			"$set": set,
+			"$inc": inc,
+		}, true, false)
+	}
+	//reset state
+	s.PageOneTextHash = ""
+	s.PageTwoTextHash = ""
+	return errs
+}
+
+func (s *Spider) DownListPageItemBack() (errs interface{}) {
 	defer qu.Catch()
 	start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") //start page and max page
 	s.MaxPage = max //
@@ -243,24 +404,15 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 	downtimes := 0       //retry counter for a single page (tentatively 3 retries)
 	repeatPageNum := 0   //page number on which every list-page link was a duplicate
 	repeatPageTimes := 0 //number of consecutive pages judged fully duplicate (tentatively stop paging after 5 such pages)
-	repeatPageTimesLimit := util.Config.PageTurnInfo.RepeatPageTimesLimitW0 //upper limit of consecutive duplicate pages (10 in high-performance mode, 5 in queue mode)
 	isRunRepeatList := false //whether to run consecutive duplicate checking on list pages
 	if !util.Config.IsHistoryEvent && util.Config.Modal == 1 && max > 1 && max < 101 { //all nodes except sequential-collection mode and history nodes check consecutive list pages for duplicates
 		isRunRepeatList = true
-		max = util.Config.PageTurnInfo.TurnPageMaxLimitW0 //high-performance mode: max page 100
-		if util.Config.Working == 1 { //queue mode
-			repeatPageTimesLimit = util.Config.PageTurnInfo.RepeatPageTimesLimitW1 //consecutive duplicate-page limit 3
-			max = util.Config.PageTurnInfo.TurnPageMaxLimitW1                      //queue mode: max page 50
-		}
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit //max page: 100 in high-performance mode, 50 in queue mode
 	}
 	//child-task check
 	if s.ContinueDownListChildTask {
-		start = util.Config.PageTurnInfo.TurnPageMaxLimitW0 + 1
-		max = util.Config.PageTurnInfo.TurnPageMaxLimitW0 + util.Config.PageTurnInfo.NextPageMaxLimitW0
-		if util.Config.Working == 1 { //queue mode
-			start = util.Config.PageTurnInfo.TurnPageMaxLimitW1 + 1
-			max = util.Config.PageTurnInfo.TurnPageMaxLimitW1 + util.Config.PageTurnInfo.NextPageMaxLimitW1
-		}
+		start = util.Config.PageTurnInfo.TurnPageMaxLimit + 1
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit + util.Config.PageTurnInfo.NextPageMaxLimit
 	}
 	for ; start <= max && !s.Stop; start++ {
 		if !s.Stop && !s.ContinueDownListChildTask { //if the spider is taken offline while detail pages are downloading, stop storing heartbeat info
@@ -270,7 +422,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 		//if start > tmpMax && isRunRepeatList && repeatPageTimes >= 5 { //more than 5 consecutive duplicate pages; stop paging
 		//	break
 		//}
-		if isRunRepeatList && repeatPageTimes >= repeatPageTimesLimit { //more than 10 consecutive duplicate pages; stop paging
+		if isRunRepeatList && repeatPageTimes >= util.Config.PageTurnInfo.RepeatPageTimesLimit { //consecutive duplicate-page limit exceeded; stop paging
 			break
 		}
 		if err := s.L.CallByParam(lua.P{
@@ -444,11 +596,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 			"date":       sDate,
 			"spidercode": s.Code,
 		}
-		coll := "spider_downloadrate"
-		if s.ContinueDownListChildTask {
-			coll = "spider_downloadrate_child"
-		}
-		MgoS.Update(coll, query, map[string]interface{}{
+		MgoS.Update("spider_downloadrate", query, map[string]interface{}{
 			"$set": set,
 			"$inc": inc,
 		}, true, false)
@@ -469,26 +617,17 @@ func (s *Spider) DownListPageItemByThreads() (errs interface{}) {
 	if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" { //event-7000 node: pick the max collection page by spider type
 		max = s.GetIntVar("spiderHistoryMaxPage") //history-collecting spiders use the spiderHistoryMaxPage setting
 	}
-	repeatPageTimesLimit := 0 //upper limit of consecutive duplicate pages
-	isRunRepeatList := false  //whether to run the consecutive duplicate-check logic on list pages
+	isRunRepeatList := false //whether to run the consecutive duplicate-check logic on list pages
 	//decide whether to check consecutive page turning and adjust the max page
 	if !util.Config.IsHistoryEvent && util.Config.Modal == 1 && max > 1 && max < 101 { //all nodes except sequential-collection mode and history nodes check consecutive list pages for duplicates
 		isRunRepeatList = true
-		repeatPageTimesLimit = util.Config.PageTurnInfo.RepeatPageTimesLimitW0
-		max = util.Config.PageTurnInfo.TurnPageMaxLimitW0 //high-performance mode: max page 100
-		if util.Config.Working == 1 { //queue mode
-			repeatPageTimesLimit = util.Config.PageTurnInfo.RepeatPageTimesLimitW1 //consecutive duplicate-page limit 3
-			max = util.Config.PageTurnInfo.TurnPageMaxLimitW1                      //queue mode: max page 50
-		}
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit //max page: 100 in high-performance mode, 50 in queue mode
+
 	}
 	//child-task check
 	//if s.ContinueDownListChildTask {
-	//	start = util.Config.PageTurnInfo.TurnPageMaxLimitW0 + 1
-	//	max = util.Config.PageTurnInfo.TurnPageMaxLimitW0 + util.Config.PageTurnInfo.NextPageMaxLimitW0
-	//	if util.Config.Working == 1 { //queue mode
-	//		start = util.Config.PageTurnInfo.TurnPageMaxLimitW1 + 1
-	//		max = util.Config.PageTurnInfo.TurnPageMaxLimitW1 + util.Config.PageTurnInfo.NextPageMaxLimitW1
-	//	}
+	//	start = util.Config.PageTurnInfo.TurnPageMaxLimit + 1
+	//	max = util.Config.PageTurnInfo.TurnPageMaxLimit + util.Config.PageTurnInfo.NextPageMaxLimit
 	//}
 	//create the concurrent Spider objects
 	spChan := make(chan *Spider, 1)
@@ -527,6 +666,7 @@ func (s *Spider) DownListPageItemByThreads() (errs interface{}) {
 			}()
 			//download one page of data
 			downnum, repeatnum := sp.DownListOnePage(pagenum)
+			//logger.Info(sp.Code, "pagenum", pagenum, "repeat", downnum == repeatnum, downnum, repeatnum, &sp)
 			//aggregate the download counts
 			atomic.AddInt64(&downloadAllNum, int64(downnum))
 			atomic.AddInt64(&repeatAllNum, int64(repeatnum))
@@ -554,7 +694,7 @@ func (s *Spider) DownListPageItemByThreads() (errs interface{}) {
 				repeatTimes = 0
 			}
 		}
-		if repeatTimes >= repeatPageTimesLimit { //consecutive duplicate-page limit exceeded; stop collecting
+		if repeatTimes >= util.Config.PageTurnInfo.RepeatPageTimesLimit { //consecutive duplicate-page limit exceeded; stop paging
 			break
 		}
 	}
@@ -625,11 +765,7 @@ func (s *Spider) DownListPageItemByThreads() (errs interface{}) {
 		"date":       sDate,
 		"spidercode": s.Code,
 	}
-	coll := "spider_downloadrate"
-	if s.ContinueDownListChildTask {
-		coll = "spider_downloadrate_child"
-	}
-	MgoS.Update(coll, query, map[string]interface{}{
+	MgoS.Update("spider_downloadrate", query, map[string]interface{}{
 		"$set": set,
 		"$inc": inc,
 	}, true, false)
@@ -655,21 +791,13 @@ func (s *Spider) DownListPageItemByThreadsBack() (errs interface{}) {
 	//decide whether to check consecutive page turning and adjust the max page
 	if !util.Config.IsHistoryEvent && util.Config.Modal == 1 && max > 1 && max < 101 { //all nodes except sequential-collection mode and history nodes check consecutive list pages for duplicates
 		isRunRepeatList = true
-		repeatPageTimesLimit = util.Config.PageTurnInfo.RepeatPageTimesLimitW0
-		max = util.Config.PageTurnInfo.TurnPageMaxLimitW0 //high-performance mode: max page 100
-		if util.Config.Working == 1 { //queue mode
-			repeatPageTimesLimit = util.Config.PageTurnInfo.RepeatPageTimesLimitW1 //consecutive duplicate-page limit 3
-			max = util.Config.PageTurnInfo.TurnPageMaxLimitW1                      //queue mode: max page 50
-		}
+		repeatPageTimesLimit = util.Config.PageTurnInfo.RepeatPageTimesLimit
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit //max page: 100 in high-performance mode, 50 in queue mode
 	}
 	//child-task check
 	if s.ContinueDownListChildTask {
-		start = util.Config.PageTurnInfo.TurnPageMaxLimitW0 + 1
-		max = util.Config.PageTurnInfo.TurnPageMaxLimitW0 + util.Config.PageTurnInfo.NextPageMaxLimitW0
-		if util.Config.Working == 1 { //queue mode
-			start = util.Config.PageTurnInfo.TurnPageMaxLimitW1 + 1
-			max = util.Config.PageTurnInfo.TurnPageMaxLimitW1 + util.Config.PageTurnInfo.NextPageMaxLimitW1
-		}
+		start = util.Config.PageTurnInfo.TurnPageMaxLimit + 1
+		max = util.Config.PageTurnInfo.TurnPageMaxLimit + util.Config.PageTurnInfo.NextPageMaxLimit
 	}
 	//create the concurrent Spider objects
 	spChan := make(chan *Spider, 1)
@@ -793,11 +921,7 @@ func (s *Spider) DownListPageItemByThreadsBack() (errs interface{}) {
 		"date":       sDate,
 		"spidercode": s.Code,
 	}
-	coll := "spider_downloadrate"
-	if s.ContinueDownListChildTask {
-		coll = "spider_downloadrate_child"
-	}
-	MgoS.Update(coll, query, map[string]interface{}{
+	MgoS.Update("spider_downloadrate", query, map[string]interface{}{
 		"$set": set,
 		"$inc": inc,
 	}, true, false)
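
The page-turn rule this patch consolidates (one TurnPageMaxLimit / RepeatPageTimesLimit / NextPageMaxLimit regardless of working mode) can be read as a small standalone loop. The sketch below is illustrative only, not code from this repository: pageTurnInfo, fetchPage, and the literal limit values are assumptions standing in for util.Config.PageTurnInfo and the Lua downloadAndParseListPage call. It shows the behavior the diff applies everywhere: stop turning pages after RepeatPageTimesLimit consecutive fully-duplicate (or empty) pages, and hand anything past TurnPageMaxLimit to a child task covering the next NextPageMaxLimit pages.

// Minimal sketch (not project code) of the consecutive-duplicate early stop.
package main

import "fmt"

// pageTurnInfo mirrors the consolidated config fields referenced in the diff;
// the values below are assumed for illustration.
type pageTurnInfo struct {
	TurnPageMaxLimit     int
	RepeatPageTimesLimit int
	NextPageMaxLimit     int
}

// fetchPage stands in for downloadAndParseListPage: it returns how many links
// the list page held and how many of them were already seen.
func fetchPage(page int) (total, repeated int) {
	if page > 3 { // pretend everything after page 3 is old data
		return 10, 10
	}
	return 10, 2
}

func main() {
	cfg := pageTurnInfo{TurnPageMaxLimit: 100, RepeatPageTimesLimit: 5, NextPageMaxLimit: 50}
	repeatPageTimes := 0 // consecutive fully-duplicate pages
	lastPage := 0
	for page := 1; page <= cfg.TurnPageMaxLimit; page++ {
		lastPage = page
		if repeatPageTimes >= cfg.RepeatPageTimesLimit {
			break // consecutive duplicate-page limit exceeded; stop paging
		}
		total, repeated := fetchPage(page)
		if total == 0 || repeated >= total {
			repeatPageTimes++ // page judged duplicate (or empty)
		} else {
			repeatPageTimes = 0 // new data resets the counter
		}
	}
	// If the loop exhausts TurnPageMaxLimit while fresh data is still arriving,
	// the patched code launches a child task that continues from
	// TurnPageMaxLimit+1 up to TurnPageMaxLimit+NextPageMaxLimit.
	fmt.Println("stopped at page", lastPage)
}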