|
@@ -269,7 +269,10 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list") //记录所有节点列表页心跳
|
|
UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list") //记录所有节点列表页心跳
|
|
}
|
|
}
|
|
//qu.Debug("重复页:", repeatPageNum, " 配置最大页:", tmpMax, " 最终最大页:", max, " 当前页:", start, "重复次数:", repeatPageTimes)
|
|
//qu.Debug("重复页:", repeatPageNum, " 配置最大页:", tmpMax, " 最终最大页:", max, " 当前页:", start, "重复次数:", repeatPageTimes)
|
|
- if start > tmpMax && isRunRepeatList && repeatPageTimes >= 5 { //重复次数超过5次,不再翻页
|
|
|
|
|
|
+ //if start > tmpMax && isRunRepeatList && repeatPageTimes >= 5 { //重复次数超过5次,不再翻页
|
|
|
|
+ // break
|
|
|
|
+ //}
|
|
|
|
+ if isRunRepeatList && repeatPageTimes >= 10 { //重复次数超过10次,不再翻页
|
|
break
|
|
break
|
|
}
|
|
}
|
|
if err := s.L.CallByParam(lua.P{
|
|
if err := s.L.CallByParam(lua.P{
|
|
@@ -278,9 +281,24 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
Protect: true,
|
|
Protect: true,
|
|
}, lua.LNumber(start)); err != nil {
|
|
}, lua.LNumber(start)); err != nil {
|
|
//panic(s.Code + "," + err.Error())
|
|
//panic(s.Code + "," + err.Error())
|
|
- log.Println(s.Code + "," + err.Error())
|
|
|
|
|
|
+ logger.Error("列表页采集报错", start, s.Code+","+err.Error())
|
|
errs = err.Error()
|
|
errs = err.Error()
|
|
atomic.AddInt32(&s.Script.ErrorNum, 1)
|
|
atomic.AddInt32(&s.Script.ErrorNum, 1)
|
|
|
|
+ //列表页采集报错进行重试,超过重试次数视为该页已采
|
|
|
|
+ if downtimes < 2 {
|
|
|
|
+ downtimes++
|
|
|
|
+ start--
|
|
|
|
+ //} else if start > tmpMax && isRunRepeatList { //超过重试次数,视为本页重复
|
|
|
|
+ } else if isRunRepeatList { //超过重试次数,视为本页重复
|
|
|
|
+ if repeatPageNum+1 == start {
|
|
|
|
+ repeatPageTimes++ //次数加1
|
|
|
|
+ } else {
|
|
|
|
+ repeatPageTimes = 0 //重复次数重置0
|
|
|
|
+ }
|
|
|
|
+ repeatPageNum = start //赋值页码
|
|
|
|
+ downtimes = 0
|
|
|
|
+ }
|
|
|
|
+ continue
|
|
}
|
|
}
|
|
lv := s.L.Get(-1)
|
|
lv := s.L.Get(-1)
|
|
s.L.Pop(1)
|
|
s.L.Pop(1)
|
|
@@ -307,7 +325,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
db := HexToBigIntMod(href) //根据href的哈希值选择Redis的db
|
|
db := HexToBigIntMod(href) //根据href的哈希值选择Redis的db
|
|
hashHref := HexText(href)
|
|
hashHref := HexText(href)
|
|
//增量(redis默认db0)
|
|
//增量(redis默认db0)
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
|
|
|
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
//全量(判断是否已存在防止覆盖id)
|
|
//全量(判断是否已存在防止覆盖id)
|
|
isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref)
|
|
isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref)
|
|
if !isExist {
|
|
if !isExist {
|
|
@@ -320,11 +338,12 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
s.HistoricalMendDownloadDetailItem(tmp) //历史补漏下载三级页
|
|
s.HistoricalMendDownloadDetailItem(tmp) //历史补漏下载三级页
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- if start <= tmpMax { //数量赋值
|
|
|
|
- repeatAllNum += repeatListNum
|
|
|
|
- downloadAllNum += tabLen
|
|
|
|
- }
|
|
|
|
- if start > tmpMax && isRunRepeatList { //执行连续页码判重
|
|
|
|
|
|
+ //if start <= tmpMax { //数量赋值
|
|
|
|
+ repeatAllNum += repeatListNum
|
|
|
|
+ downloadAllNum += tabLen
|
|
|
|
+ //}
|
|
|
|
+ //if start > tmpMax && isRunRepeatList { //执行连续页码判重
|
|
|
|
+ if isRunRepeatList { //执行连续页码判重
|
|
if repeatListNum >= tabLen { //当前start列表页全部数据都已采集
|
|
if repeatListNum >= tabLen { //当前start列表页全部数据都已采集
|
|
//qu.Debug("重复页:", repeatPageNum, "当前页:", start)
|
|
//qu.Debug("重复页:", repeatPageNum, "当前页:", start)
|
|
if repeatPageNum+1 == start || repeatPageNum == 0 {
|
|
if repeatPageNum+1 == start || repeatPageNum == 0 {
|
|
@@ -348,7 +367,8 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
downtimes++
|
|
downtimes++
|
|
start--
|
|
start--
|
|
continue
|
|
continue
|
|
- } else if start > tmpMax && isRunRepeatList { //超过重试次数,视为本页重复
|
|
|
|
|
|
+ //} else if start > tmpMax && isRunRepeatList { //超过重试次数,视为本页重复
|
|
|
|
+ } else if isRunRepeatList { //超过重试次数,视为本页重复
|
|
if repeatPageNum+1 == start {
|
|
if repeatPageNum+1 == start {
|
|
repeatPageTimes++ //次数加1
|
|
repeatPageTimes++ //次数加1
|
|
} else {
|
|
} else {
|
|
@@ -362,7 +382,8 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
downtimes++
|
|
downtimes++
|
|
start--
|
|
start--
|
|
continue
|
|
continue
|
|
- } else if start > tmpMax && isRunRepeatList { //超过重试次数,视为本页重复
|
|
|
|
|
|
+ //} else if start > tmpMax && isRunRepeatList { //超过重试次数,视为本页重复
|
|
|
|
+ } else if isRunRepeatList { //超过重试次数,视为本页重复
|
|
if repeatPageNum+1 == start {
|
|
if repeatPageNum+1 == start {
|
|
repeatPageTimes++ //次数加1
|
|
repeatPageTimes++ //次数加1
|
|
} else {
|
|
} else {
|
|
@@ -554,7 +575,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
if util.Config.Modal == 1 { //除7000、7500、7700节点外所有节点只采集列表页信息
|
|
if util.Config.Modal == 1 { //除7000、7500、7700节点外所有节点只采集列表页信息
|
|
isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
|
|
isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
|
|
if isExist { //更新redis生命周期
|
|
if isExist { //更新redis生命周期
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
|
|
|
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
*num++ //已采集
|
|
*num++ //已采集
|
|
return
|
|
return
|
|
}
|
|
}
|
|
@@ -566,7 +587,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
}
|
|
}
|
|
isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
|
|
isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
|
|
if isExist { //更新redis生命周期
|
|
if isExist { //更新redis生命周期
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
|
|
|
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
*num++ //已采集
|
|
*num++ //已采集
|
|
return
|
|
return
|
|
}
|
|
}
|
|
@@ -582,7 +603,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
}
|
|
}
|
|
SaveListPageData(paramdata, &id, isEsRepeat) //保存7000、7410、7500、7700节点列表页采集的信息
|
|
SaveListPageData(paramdata, &id, isEsRepeat) //保存7000、7410、7500、7700节点列表页采集的信息
|
|
if isEsRepeat { //类竞品数据title判重数据加入redis
|
|
if isEsRepeat { //类竞品数据title判重数据加入redis
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
|
|
|
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
return
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -603,7 +624,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
|
|
} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
|
|
log.Println("beforeHref:", href, "afterHref:", tmphref)
|
|
log.Println("beforeHref:", href, "afterHref:", tmphref)
|
|
//增量
|
|
//增量
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
|
|
|
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
//全量
|
|
//全量
|
|
db := HexToBigIntMod(href)
|
|
db := HexToBigIntMod(href)
|
|
hashHref := HexText(href)
|
|
hashHref := HexText(href)
|
|
@@ -805,7 +826,7 @@ func (s *Spider) DownloadHighDetail() {
|
|
if count > 0 { //es中含本title数据,不再采集,更新list表数据状态
|
|
if count > 0 { //es中含本title数据,不再采集,更新list表数据状态
|
|
set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
|
|
set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
|
|
Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
|
|
|
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -837,7 +858,7 @@ func (s *Spider) DownloadHighDetail() {
|
|
} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
|
|
} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
|
|
log.Println("beforeHref:", href, "afterHref:", tmphref)
|
|
log.Println("beforeHref:", href, "afterHref:", tmphref)
|
|
//增量
|
|
//增量
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
|
|
|
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
//全量
|
|
//全量
|
|
db := HexToBigIntMod(href)
|
|
db := HexToBigIntMod(href)
|
|
hashHref := HexText(href)
|
|
hashHref := HexText(href)
|
|
@@ -940,7 +961,7 @@ func (s *Spider) DownloadListDetail() {
|
|
if Es.Count(EsIndex, EsType, esQuery) > 0 { //es中含本title数据,不再采集,更新list表数据状态
|
|
if Es.Count(EsIndex, EsType, esQuery) > 0 { //es中含本title数据,不再采集,更新list表数据状态
|
|
set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
|
|
set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
|
|
Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
|
|
|
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -972,7 +993,7 @@ func (s *Spider) DownloadListDetail() {
|
|
} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
|
|
} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
|
|
log.Println("beforeHref:", href, "afterHref:", tmphref)
|
|
log.Println("beforeHref:", href, "afterHref:", tmphref)
|
|
//增量
|
|
//增量
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
|
|
|
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
//全量
|
|
//全量
|
|
db := HexToBigIntMod(href)
|
|
db := HexToBigIntMod(href)
|
|
hashHref := HexText(href)
|
|
hashHref := HexText(href)
|