|
@@ -940,16 +940,16 @@ func (s *Spider) DownListPageItemByThreadsBack() (errs interface{}) {
|
|
func (s *Spider) SupplementDownListPageItem() (errs interface{}) {
|
|
func (s *Spider) SupplementDownListPageItem() (errs interface{}) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
var (
|
|
var (
|
|
- errtimes int //采集异常次数(暂定10次)
|
|
|
|
- errPageNum int //当前采集异常页码
|
|
|
|
- downtimes int //记录某页重试次数(暂定3次)
|
|
|
|
- downloadAllNum int //记录本次采集,信息采集总量
|
|
|
|
- saveAllNum int //记录本次采集,信息补采总量
|
|
|
|
- repeatAllNum int //记录本次采集,信息重复总量
|
|
|
|
- pageTitleHash string //记录当前页所有title文本
|
|
|
|
- finishText = "正常退出"
|
|
|
|
|
|
+ errtimes int //采集异常次数(暂定10次)
|
|
|
|
+ errPageNum int //当前采集异常页码
|
|
|
|
+ downtimes int //记录某页重试次数(暂定3次)
|
|
|
|
+ downloadAllNum int //记录本次采集,信息采集总量
|
|
|
|
+ saveAllNum int //记录本次采集,信息补采总量
|
|
|
|
+ repeatAllNum int //记录本次采集,信息重复总量
|
|
|
|
+ pageTitleHash string //记录当前页所有title文本
|
|
|
|
+ finishText = "正常退出" //
|
|
|
|
+ start = 1 //起始页
|
|
)
|
|
)
|
|
- start := 1 //起始页
|
|
|
|
for {
|
|
for {
|
|
if errtimes >= Supplement_MaxErrorTimes { //连续异常次数超过10次,爬虫不再翻页
|
|
if errtimes >= Supplement_MaxErrorTimes { //连续异常次数超过10次,爬虫不再翻页
|
|
finishText = "异常退出"
|
|
finishText = "异常退出"
|
|
@@ -978,14 +978,18 @@ func (s *Spider) SupplementDownListPageItem() (errs interface{}) {
|
|
s.L.Pop(1)
|
|
s.L.Pop(1)
|
|
if tbl, ok := lv.(*lua.LTable); ok {
|
|
if tbl, ok := lv.(*lua.LTable); ok {
|
|
if tabLen := tbl.Len(); tabLen > 0 { //列表页有数据,根据列表页信息下载三级页
|
|
if tabLen := tbl.Len(); tabLen > 0 { //列表页有数据,根据列表页信息下载三级页
|
|
- repeatListNum := 0 // 当前列表页连接重复个数
|
|
|
|
- isBreak := false
|
|
|
|
- var publishtimeErrTimes int
|
|
|
|
- var text string
|
|
|
|
- num := 1
|
|
|
|
|
|
+ var (
|
|
|
|
+ publishtimeErrTimes int
|
|
|
|
+ text string
|
|
|
|
+ repeatListNum int // 当前列表页连接重复个数
|
|
|
|
+ num = 1
|
|
|
|
+ isBreak = false
|
|
|
|
+ )
|
|
for ; num <= tabLen; num++ {
|
|
for ; num <= tabLen; num++ {
|
|
v := tbl.RawGetInt(num).(*lua.LTable)
|
|
v := tbl.RawGetInt(num).(*lua.LTable)
|
|
tmp := util.TableToMap(v)
|
|
tmp := util.TableToMap(v)
|
|
|
|
+ tmp["dataging"] = 0 //数据中打标记dataging=0
|
|
|
|
+ s.DownloadDetailItem(tmp, &repeatListNum)
|
|
pTmp := qu.ObjToString(tmp["publishtime"])
|
|
pTmp := qu.ObjToString(tmp["publishtime"])
|
|
title := qu.ObjToString(tmp["title"])
|
|
title := qu.ObjToString(tmp["title"])
|
|
text += title
|
|
text += title
|
|
@@ -993,18 +997,17 @@ func (s *Spider) SupplementDownListPageItem() (errs interface{}) {
|
|
publishtime := pTime.Unix()
|
|
publishtime := pTime.Unix()
|
|
if publishtime > 1000000000 && publishtime < Supplement_Publishtime { //正常退出
|
|
if publishtime > 1000000000 && publishtime < Supplement_Publishtime { //正常退出
|
|
isBreak = true
|
|
isBreak = true
|
|
- break
|
|
|
|
|
|
+ //break
|
|
} else if publishtime <= 1000000000 { //异常发布时间
|
|
} else if publishtime <= 1000000000 { //异常发布时间
|
|
publishtimeErrTimes++
|
|
publishtimeErrTimes++
|
|
}
|
|
}
|
|
- tmp["dataging"] = 0 //数据中打标记dataging=0
|
|
|
|
- s.DownloadDetailItem(tmp, &repeatListNum)
|
|
|
|
}
|
|
}
|
|
- downloadAllNum += tabLen
|
|
|
|
- repeatAllNum += repeatListNum
|
|
|
|
- saveAllNum += num - 1 - repeatListNum
|
|
|
|
- tmpPageTitleHash := pageTitleHash
|
|
|
|
- pageTitleHash = util.HexText(text)
|
|
|
|
|
|
+ logger.Info(s.Code, start, tabLen, repeatListNum)
|
|
|
|
+ downloadAllNum += tabLen //采集总量累计
|
|
|
|
+ repeatAllNum += repeatListNum //重复总量累计
|
|
|
|
+ saveAllNum += num - 1 - repeatListNum //保存总量累计
|
|
|
|
+ tmpPageTitleHash := pageTitleHash //
|
|
|
|
+ pageTitleHash = util.HexText(text) //
|
|
if tabLen == publishtimeErrTimes || tmpPageTitleHash == pageTitleHash { //当前页数据发布时间均异常;当前页与上页采集内容一致
|
|
if tabLen == publishtimeErrTimes || tmpPageTitleHash == pageTitleHash { //当前页数据发布时间均异常;当前页与上页采集内容一致
|
|
//if errtimes == 0 || start == errPageNum+1 {
|
|
//if errtimes == 0 || start == errPageNum+1 {
|
|
errtimes++
|
|
errtimes++
|
|
@@ -1044,20 +1047,20 @@ func (s *Spider) SupplementDownListPageItem() (errs interface{}) {
|
|
errPageNum = 0
|
|
errPageNum = 0
|
|
util.TimeSleepFunc(100*time.Millisecond, TimeSleepChan)
|
|
util.TimeSleepFunc(100*time.Millisecond, TimeSleepChan)
|
|
}
|
|
}
|
|
- logger.Info(s.Code, "本轮列表页采集详情:", downloadAllNum, repeatAllNum, start, finishText)
|
|
|
|
- if !util.Config.IsHistoryEvent && !s.Stop { //非历史节点统计下载率
|
|
|
|
- save := map[string]interface{}{
|
|
|
|
- "site": s.Name,
|
|
|
|
- "channel": s.Channel,
|
|
|
|
- "spidercode": s.Code,
|
|
|
|
- "comeintime": time.Now().Unix(),
|
|
|
|
- "modifyuser": s.MUserName,
|
|
|
|
- "endpage": start,
|
|
|
|
- "finish": finishText,
|
|
|
|
- "num": saveAllNum,
|
|
|
|
- }
|
|
|
|
- MgoS.Save("spider_supplement", save)
|
|
|
|
- }
|
|
|
|
|
|
+ logger.Info(s.Code, "本轮列表页采集详情:", downloadAllNum, repeatAllNum, saveAllNum, finishText)
|
|
|
|
+ save := map[string]interface{}{
|
|
|
|
+ "site": s.Name,
|
|
|
|
+ "channel": s.Channel,
|
|
|
|
+ "spidercode": s.Code,
|
|
|
|
+ "comeintime": time.Now().Unix(),
|
|
|
|
+ "modifyuser": s.MUserName,
|
|
|
|
+ "endpage": start,
|
|
|
|
+ "finish": finishText,
|
|
|
|
+ "savenum": saveAllNum,
|
|
|
|
+ "count": downloadAllNum,
|
|
|
|
+ "repeat": repeatAllNum,
|
|
|
|
+ }
|
|
|
|
+ MgoS.Save("spider_supplement", save)
|
|
return errs
|
|
return errs
|
|
}
|
|
}
|
|
|
|
|
|
@@ -1267,7 +1270,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
hashHref := util.HexText(href)
|
|
hashHref := util.HexText(href)
|
|
//列表页redis判重
|
|
//列表页redis判重
|
|
isExist := util.RedisExist("list", "list_"+hashHref)
|
|
isExist := util.RedisExist("list", "list_"+hashHref)
|
|
- if Supplement { //补采,再进行全量redis判重
|
|
|
|
|
|
+ if Supplement && !isExist { //补采,再进行全量redis判重
|
|
isExist, _ = util.ExistsBloomRedis("href", href)
|
|
isExist, _ = util.ExistsBloomRedis("href", href)
|
|
}
|
|
}
|
|
if isExist {
|
|
if isExist {
|
|
@@ -1331,6 +1334,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
return
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+
|
|
//详情页下载数据成功心跳
|
|
//详情页下载数据成功心跳
|
|
if !s.Stop {
|
|
if !s.Stop {
|
|
UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute", false) //记录modal=0老模式采集到数据心跳
|
|
UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute", false) //记录modal=0老模式采集到数据心跳
|