|
@@ -31,6 +31,7 @@ type Heart struct {
|
|
DetailExecuteHeart int64 //三级页采集到数据心跳
|
|
DetailExecuteHeart int64 //三级页采集到数据心跳
|
|
FindListHeart int64 //findListHtml执行心跳
|
|
FindListHeart int64 //findListHtml执行心跳
|
|
ListHeart int64 //爬虫列表页执行心跳
|
|
ListHeart int64 //爬虫列表页执行心跳
|
|
|
|
+ FirstPageHeart int64 //采集第一页的心跳
|
|
ModifyUser string //爬虫维护人
|
|
ModifyUser string //爬虫维护人
|
|
Site string //站点
|
|
Site string //站点
|
|
Channel string //栏目
|
|
Channel string //栏目
|
|
@@ -118,13 +119,16 @@ type DelaySite struct {
|
|
}
|
|
}
|
|
|
|
|
|
//心跳
|
|
//心跳
|
|
-func UpdateHeart(site, channel, code, user, t string) {
|
|
|
|
|
|
+func UpdateHeart(site, channel, code, user, t string, firstpage bool) {
|
|
//sp, spiderOk := LoopListPath.Load(code)
|
|
//sp, spiderOk := LoopListPath.Load(code)
|
|
//if spiderOk && sp != nil {
|
|
//if spiderOk && sp != nil {
|
|
if htmp, ok := SpiderHeart.Load(code); ok {
|
|
if htmp, ok := SpiderHeart.Load(code); ok {
|
|
if heart, ok := htmp.(*Heart); ok {
|
|
if heart, ok := htmp.(*Heart); ok {
|
|
if t == "list" {
|
|
if t == "list" {
|
|
heart.ListHeart = time.Now().Unix()
|
|
heart.ListHeart = time.Now().Unix()
|
|
|
|
+ if firstpage {
|
|
|
|
+ heart.FirstPageHeart = time.Now().Unix()
|
|
|
|
+ }
|
|
} else if t == "findlist" {
|
|
} else if t == "findlist" {
|
|
heart.FindListHeart = time.Now().Unix()
|
|
heart.FindListHeart = time.Now().Unix()
|
|
} else if t == "detail" {
|
|
} else if t == "detail" {
|
|
@@ -299,7 +303,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
}
|
|
}
|
|
for ; start <= max && !s.Stop; start++ {
|
|
for ; start <= max && !s.Stop; start++ {
|
|
if !s.Stop { //在下载详情页时爬虫下架,此时不再存心跳信息
|
|
if !s.Stop { //在下载详情页时爬虫下架,此时不再存心跳信息
|
|
- UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list") //记录所有节点列表页心跳
|
|
|
|
|
|
+ UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list", start == 1) //记录所有节点列表页心跳
|
|
}
|
|
}
|
|
//logger.Info("爬虫:", s.Code, "重复页:", repeatPageNum, " 配置最大页:", tmpMax, " 最终最大页:", max, " 当前页:", start, "重复次数:", repeatPageTimes)
|
|
//logger.Info("爬虫:", s.Code, "重复页:", repeatPageNum, " 配置最大页:", tmpMax, " 最终最大页:", max, " 当前页:", start, "重复次数:", repeatPageTimes)
|
|
//if start > tmpMax && isRunRepeatList && repeatPageTimes >= 5 { //重复次数超过5次,不再翻页
|
|
//if start > tmpMax && isRunRepeatList && repeatPageTimes >= 5 { //重复次数超过5次,不再翻页
|
|
@@ -655,7 +659,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
return
|
|
return
|
|
} else {
|
|
} else {
|
|
if !s.Stop {
|
|
if !s.Stop {
|
|
- UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail") //记录modal=0老模式采集三级页心跳
|
|
|
|
|
|
+ UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail", false) //记录modal=0老模式采集三级页心跳
|
|
}
|
|
}
|
|
isEsRepeat := false
|
|
isEsRepeat := false
|
|
if delaySite := DelaySiteMap[s.Name]; delaySite != nil && delaySite.Compete {
|
|
if delaySite := DelaySiteMap[s.Name]; delaySite != nil && delaySite.Compete {
|
|
@@ -706,7 +710,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
}
|
|
}
|
|
//详情页下载数据成功心跳
|
|
//详情页下载数据成功心跳
|
|
if !s.Stop {
|
|
if !s.Stop {
|
|
- UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute") //记录modal=0老模式采集到数据心跳
|
|
|
|
|
|
+ UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute", false) //记录modal=0老模式采集到数据心跳
|
|
}
|
|
}
|
|
set := map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}
|
|
set := map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}
|
|
//详情页过滤数据
|
|
//详情页过滤数据
|
|
@@ -902,7 +906,7 @@ func (s *Spider) DownloadDetail(reload bool, isHistory bool) {
|
|
"event": 0,
|
|
"event": 0,
|
|
}
|
|
}
|
|
if !isHistory && !s.Stop { //在下载详情页时爬虫下架,此时不再存心跳信息
|
|
if !isHistory && !s.Stop { //在下载详情页时爬虫下架,此时不再存心跳信息
|
|
- UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail") //记录modal=1采集三级页心跳
|
|
|
|
|
|
+ UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail", false) //记录modal=1采集三级页心跳
|
|
}
|
|
}
|
|
countNum := MgoS.Count(coll, q) //统计util.Config.DayNum天内未下载爬虫个数
|
|
countNum := MgoS.Count(coll, q) //统计util.Config.DayNum天内未下载爬虫个数
|
|
if isHistory && countNum == 0 { //下载历史数据量为0,手动stop
|
|
if isHistory && countNum == 0 { //下载历史数据量为0,手动stop
|
|
@@ -995,7 +999,7 @@ func (s *Spider) DownloadDetail(reload bool, isHistory bool) {
|
|
//下载、解析、入库
|
|
//下载、解析、入库
|
|
data, err = sp.DownloadDetailPage(tmp, data)
|
|
data, err = sp.DownloadDetailPage(tmp, data)
|
|
if !isHistory && !sp.Stop && sp.IsMainThread { //在下载详情页时爬虫下架,此时不再存心跳信息
|
|
if !isHistory && !sp.Stop && sp.IsMainThread { //在下载详情页时爬虫下架,此时不再存心跳信息
|
|
- UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute") //记录modal=1下载数据心跳
|
|
|
|
|
|
+ UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute", false) //记录modal=1下载数据心跳
|
|
}
|
|
}
|
|
if err != nil || data == nil {
|
|
if err != nil || data == nil {
|
|
success = false
|
|
success = false
|