|
@@ -16,6 +16,7 @@ import (
|
|
|
qu "qfw/util"
|
|
|
mgu "qfw/util/mongodbutil"
|
|
|
"strconv"
|
|
|
+ "sync"
|
|
|
|
|
|
//mgu "qfw/util/mongodbutil"
|
|
|
//"qfw/util/redis"
|
|
@@ -31,6 +32,7 @@ import (
|
|
|
"github.com/yuin/gopher-lua"
|
|
|
)
|
|
|
|
|
|
+//心跳
|
|
|
type Heart struct {
|
|
|
DetailHeart int64 //爬虫三级页执行心跳
|
|
|
DetailExecuteHeart int64 //三级页采集到数据心跳
|
|
@@ -41,6 +43,15 @@ type Heart struct {
|
|
|
Channel string //栏目
|
|
|
}
|
|
|
|
|
|
+// SpiderFlow records traffic statistics for a single spider script.
|
|
|
+type SpiderFlow struct {
|
|
|
+	Flow int64 // traffic count
|
|
|
+	ModifyUser string // spider maintainer
|
|
|
+	Site string // site name
|
|
|
+	Channel string // channel / column name
|
|
|
+	//Code string // NOTE(review): reserved field, currently unused
|
|
|
+}
|
|
|
+
|
|
|
//爬虫()
|
|
|
type Spider struct {
|
|
|
Script
|
|
@@ -93,6 +104,8 @@ var Reg = regexp.MustCompile(`(http|https)://([\w]+\.)+[\w]+(/?)`)
|
|
|
var DomainNameReg = regexp.MustCompile(`(?://).+?(?:)[::/]`)
|
|
|
var RepDomainNameReg = regexp.MustCompile(`[::/]+`)
|
|
|
var DelaySites map[string]int //延迟采集站点集合
|
|
|
+var Today string
|
|
|
+var SpiderFlowMap = sync.Map{} //code:{"2022-05-16":SpiderFlow}
|
|
|
|
|
|
//心跳
|
|
|
func UpdateHeart(site, channel, code, user, t string) {
|
|
@@ -483,7 +496,7 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
|
|
|
//更新spider_listdata中数据下载成功标记
|
|
|
if id != "" {
|
|
|
//Mgo.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "byid": id}}, false, true)
|
|
|
- Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1}})
|
|
|
+ Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}})
|
|
|
}
|
|
|
return
|
|
|
}
|
|
@@ -504,7 +517,7 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
|
|
|
return
|
|
|
}
|
|
|
//详情页过滤数据
|
|
|
- set := map[string]interface{}{"state": 1}
|
|
|
+ set := map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}
|
|
|
if data["delete"] != nil {
|
|
|
//增量
|
|
|
util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
@@ -629,7 +642,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
|
}
|
|
|
//更新spider_listdata中数据下载失败标记
|
|
|
if id != "" {
|
|
|
- Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": -1}})
|
|
|
+ Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": -1, "updatetime": time.Now().Unix()}})
|
|
|
}
|
|
|
return
|
|
|
} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
|
|
@@ -648,7 +661,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
|
if !s.Stop {
|
|
|
UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute") //记录modal=0老模式采集到数据心跳
|
|
|
}
|
|
|
- set := map[string]interface{}{"state": 1, "byid": id}
|
|
|
+ set := map[string]interface{}{"state": 1, "updatetime": time.Now().Unix(), "byid": id}
|
|
|
//详情页过滤数据
|
|
|
if data["delete"] != nil {
|
|
|
//增量
|
|
@@ -837,7 +850,8 @@ func (s *Spider) DownloadHighDetail() {
|
|
|
if !s.Stop { //在下载详情页时爬虫下架,此时不再存心跳信息
|
|
|
UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail") //记录modal=1采集三级页心跳
|
|
|
}
|
|
|
- list, _ := Mgo.Find("spider_highlistdata", q, o, f, false, 0, 100)
|
|
|
+ list, _ := Mgo.Find("spider_highlistdata_test", q, o, f, false, 0, 100)
|
|
|
+ qu.Debug("----", len(*list))
|
|
|
if list != nil && len(*list) > 0 {
|
|
|
for _, tmp := range *list {
|
|
|
_id := tmp["_id"]
|
|
@@ -847,8 +861,8 @@ func (s *Spider) DownloadHighDetail() {
|
|
|
//为了避免重复下载,进行增量redis判重
|
|
|
isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
|
|
|
if isExist {
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
|
|
|
- Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
+ Mgo.Update("spider_highlistdata_test", query, set, false, false)
|
|
|
continue
|
|
|
}
|
|
|
if isEsRepeat { //es数据title判重
|
|
@@ -858,8 +872,8 @@ func (s *Spider) DownloadHighDetail() {
|
|
|
esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
|
|
|
count := Es.Count(EsIndex, EsType, esQuery)
|
|
|
if count > 0 { //es中含本title数据,不再采集,更新list表数据状态
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
|
|
|
- Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
+ Mgo.Update("spider_highlistdata_test", query, set, false, false)
|
|
|
util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
continue
|
|
|
}
|
|
@@ -908,7 +922,7 @@ func (s *Spider) DownloadHighDetail() {
|
|
|
ss["state"] = -1
|
|
|
}
|
|
|
set := map[string]interface{}{"$set": ss}
|
|
|
- Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
|
+ Mgo.Update("spider_highlistdata_test", query, set, false, false)
|
|
|
continue
|
|
|
} else {
|
|
|
deleteData := FilterByDetail(href, query, data) //针对列表页无法过滤需要在详情页过滤的数据,进行过滤处理
|
|
@@ -937,8 +951,8 @@ func (s *Spider) DownloadHighDetail() {
|
|
|
data["dataging"] = 0
|
|
|
data["iscompete"] = s.IsCompete //2021-11-01以后新增的爬虫不在展示原文链接(保存服务判断)
|
|
|
Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1}} //下载成功state置为1
|
|
|
- Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}} //下载成功state置为1
|
|
|
+ Mgo.Update("spider_highlistdata_test", query, set, false, false)
|
|
|
}
|
|
|
//重载spider
|
|
|
s.LoadScript(s.Name, s.Channel, s.MUserName, s.Code, s.ScriptFile, true)
|
|
@@ -997,7 +1011,7 @@ func (s *Spider) DownloadListDetail() {
|
|
|
//为了避免重复下载,进行增量redis判重
|
|
|
isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
|
|
|
if isExist {
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
|
continue
|
|
|
}
|
|
@@ -1007,7 +1021,7 @@ func (s *Spider) DownloadListDetail() {
|
|
|
sTime := eTime - int64(7*86400)
|
|
|
esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
|
|
|
if Es.Count(EsIndex, EsType, esQuery) > 0 { //es中含本title数据,不再采集,更新list表数据状态
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
|
util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
continue
|
|
@@ -1085,7 +1099,7 @@ func (s *Spider) DownloadListDetail() {
|
|
|
data["dataging"] = 0
|
|
|
data["iscompete"] = s.IsCompete //2021-11-01以后新增的爬虫不在展示原文链接(保存服务判断)
|
|
|
Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1}} //下载成功state置为1
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}} //下载成功state置为1
|
|
|
Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
|
}
|
|
|
}
|
|
@@ -1100,7 +1114,7 @@ func FilterByDetail(href string, query, data map[string]interface{}) bool {
|
|
|
hashHref := HexText(href)
|
|
|
util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
|
|
|
//更新mgo 要删除的数据更新spider_highlistdata state=1不再下载,更新redis
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true}}
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true, "updatetime": time.Now().Unix()}}
|
|
|
Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
|
return true
|
|
|
}
|