@@ -342,7 +342,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
tmp["dataging"] = 0 //tag the data with dataging=0
if s.DownDetail {
s.DownloadDetailItem(tmp, &repeatListNum)
- } else {
+ } /*else { //no spiders of this kind at present
tmp["comeintime"] = time.Now().Unix()
//atomic.AddInt32(&s.LastDowncount, 1)
//atomic.AddInt32(&s.TodayDowncount, 1)
@@ -353,7 +353,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
util.RedisClusterSet(hashHref, "", -1) //full-data redis
list = append(list, tmp)
}
- }
+ }*/
} else { //historical backfill
s.HistoricalMendDownloadDetailItem(tmp) //historical backfill: download the detail (third-level) page
}
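The hunks above and below retire the old non-expiring RedisClusterSet entries in favor of a namespaced list-page key with a two-year TTL (`util.RedisSet("list", "list_"+hashHref, "", 86400*365*2)`). A minimal sketch of that key scheme against a plain go-redis client; `util.RedisSet`/`util.RedisExist` are presumed to wrap something similar, and the md5 digest standing in for `util.HexText` is an assumption:

```go
package main

import (
	"context"
	"crypto/md5"
	"encoding/hex"
	"fmt"
	"time"

	"github.com/redis/go-redis/v9"
)

// hexText stands in for util.HexText: a stable hex digest of the href
// used as the dedup key (md5 is an assumption here).
func hexText(s string) string {
	sum := md5.Sum([]byte(s))
	return hex.EncodeToString(sum[:])
}

func main() {
	rdb := redis.NewClient(&redis.Options{Addr: "localhost:6379"})
	ctx := context.Background()

	href := "https://example.com/notice/123.html"
	key := "list_" + hexText(href)

	// 86400*365*2 seconds in the diff, i.e. roughly two years.
	const ttl = 2 * 365 * 24 * time.Hour

	n, err := rdb.Exists(ctx, key).Result()
	if err != nil {
		panic(err)
	}
	if n == 0 {
		rdb.Set(ctx, key, "", ttl) // first sighting: mark it, then crawl
		fmt.Println("new href, crawl it")
	} else {
		fmt.Println("already collected, skip")
	}
}
```

The TTL bounds memory growth: an href not seen for two years falls out of Redis and is recrawled once, instead of occupying the cluster forever.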
@@ -512,52 +512,81 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
return
}
hashHref := util.HexText(href)
- isExist := util.RedisClusterExists(hashHref) //dedup against the full-data redis
+ isExist := util.RedisExist("list", "list_"+hashHref)
//logger.Debug("full href:", href, " isExist:", isExist)
if !s.IsMustDownload { //not a forced download
if isExist { //data already exists, return immediately
return
} else if util.Config.IsHistoryEvent { //1. historical backfill on node 7000 (the history node): data goes into spider_historydata
num := 0
- SaveHighListPageData(paramdata, s.SCode, hashHref, &num)
+ SaveHighListPageData(paramdata, hashHref, &num)
return
}
+ } else { //forced download is currently not supported
+ return
}
- //2. historical backfill on non-7000 (history) nodes: collect details right after the list, then take the spider offline
+ //2. historical backfill on non-7000 (history) nodes: collect details right after the list, then take the spider offline (no such spider at present)
id := ""
- SaveListPageData(paramdata, &id, false) //store the collection record
+ isEsRepeat := false
+ if delaySite := DelaySiteMap[s.Name]; delaySite != nil && delaySite.Compete {
+ title := qu.ObjToString(paramdata["title"])
+ eTime := time.Now().Unix()
+ sTime := eTime - int64(7*86400)
+ esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
+ if Es.Count(EsIndex, EsType, esQuery) > 0 { //es already has data with this title: skip collecting and update the list record's state
+ isEsRepeat = true
+ }
+ }
+ SaveListPageData(paramdata, &id, isEsRepeat) //store the collection record
+ if isEsRepeat { //competitor-like data deduplicated by title is added to redis
+ util.RedisSet("list", "list_"+hashHref, "", 86400*365*2)
+ util.AddBloomRedis("href", href)
+ return
+ }
//qu.Debug("----------------download, parse, store--------------------")
//download the detail page
data, err = s.DownloadDetailPage(paramdata, data)
if err != nil || data == nil { //download failed, stop
if err != nil {
logger.Error(s.Code, err, paramdata)
- // if len(paramdata) > 0 {
- // SaveErrorData(paramdata) //save the error record
- // }
}
//mark the record in spider_listdata as failed to download
MgoS.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": -1}})
return
- } else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //the detail page replaced the href, so it differs from the list-page one
- util.RedisClusterSet(hashHref, "", -1) //store the list-page href in the full-data redis
+ }
+
+ util.RedisSet("list", "list_"+hashHref, "", 86400*365*2) //collection succeeded: add to the list-page redis
+ //dedup check based on publish time
+ tmphref := qu.ObjToString(data["href"]) //take tmphref; the detail page may replace the href so it differs from the list-page one
+ publishtime := qu.Int64All(data["l_np_publishtime"])
+ if publishtime < time.Now().AddDate(-1, 0, 0).Unix() { //data older than one year is deduplicated by href against the full bloom redis
+ isExist, _ = util.ExistsBloomRedis("href", tmphref)
+ if isExist {
+ MgoS.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "bloom_href", "updatetime": time.Now().Unix()}})
+ return
+ }
}
//data filtered out by the detail page
set := map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}
if data["delete"] != nil {
- util.RedisClusterSet(hashHref, "", -1) //store filtered-out data in the full-data redis
- //update mongo: data marked for deletion gets state=1 in spider_highlistdata so it is not downloaded again; update redis
- set["delete"] = true
- MgoS.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)
+ //util.AddBloomRedis("href", tmphref) //"delete" may cover data from redirected sites; adding it to the full redis could make those sites uncollectable
+ set["exist"] = "delete"
+ //MgoS.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)
+ MgoS.UpdateById("spider_listdata", id, map[string]interface{}{"$set": set})
return
}
//mark the record in spider_listdata as downloaded (state is updated by href; it may also be updated by a later successful download)
MgoS.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)

+ //the detail page may replace the href; after a successful collection, add the original href to the full redis
+ //if tmphref := qu.ObjToString(data["href"]); tmphref != href {
+ // util.AddBloomRedis("href", href)
+ //}
+
flag := true
- t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"])) //publishtime
- if s.IsMustDownload { //forced download
- if isExist && t1 < time.Now().AddDate(0, 0, -5).Unix() {
+ //publishtime := util.ParseDate2Int64(qu.ObjToString(data["publishtime"])) //publishtime
+ if s.IsMustDownload { //forced download
+ if isExist && publishtime < time.Now().AddDate(0, 0, -5).Unix() {
//qu.Debug("forced download, already in redis")
data["dataging"] = 1 //dataging=1 makes the save service look up the id for this href in redis and update it (redis no longer holds ids, so this has no effect)
flag = false
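One caveat with the `esQuery` built above: the title is spliced into the JSON by string concatenation, so a quote inside a title yields invalid JSON. A hedged sketch of producing the same 7-day range plus phrase multi_match query via `encoding/json`; `buildTitleDedupQuery` is hypothetical, and the result would be handed to the project's `Es.Count(EsIndex, EsType, query)` as before:

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// buildTitleDedupQuery reproduces the 7-day comeintime range plus phrase
// multi_match on title, but lets encoding/json handle the escaping.
func buildTitleDedupQuery(title string, now time.Time) (string, error) {
	eTime := now.Unix()
	sTime := eTime - 7*86400
	q := map[string]interface{}{
		"query": map[string]interface{}{
			"filtered": map[string]interface{}{
				"filter": map[string]interface{}{
					"bool": map[string]interface{}{"must": []interface{}{
						map[string]interface{}{"range": map[string]interface{}{
							"comeintime": map[string]string{"gte": fmt.Sprint(sTime), "lte": fmt.Sprint(eTime)},
						}},
					}},
				},
				"query": map[string]interface{}{
					"bool": map[string]interface{}{"must": []interface{}{
						map[string]interface{}{"multi_match": map[string]interface{}{
							"query": title, "type": "phrase", "fields": []string{"title"},
						}},
					}},
				},
			},
		},
	}
	b, err := json.Marshal(q)
	return string(b), err
}

func main() {
	s, _ := buildTitleDedupQuery(`a title with "quotes"`, time.Now())
	fmt.Println(s) // safe to pass on to Es.Count(EsIndex, EsType, s)
}
```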
@@ -571,16 +600,13 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
data["dataging"] = 0
}
}
- if t1 > time.Now().Unix() { //guard against publish times in the future
- data["publishtime"] = time.Now().Unix()
- }
+ //if publishtime > time.Now().Unix() { //guard against publish times in the future
+ // data["publishtime"] = time.Now().Unix()
+ //}
delete(data, "state")
delete(data, "exit")
delete(data, "checkpublishtime")
data["comeintime"] = time.Now().Unix()
- //atomic.AddInt32(&s.LastDowncount, 1)
- //atomic.AddInt32(&s.TodayDowncount, 1)
- //atomic.AddInt32(&s.TotalDowncount, 1)
data["spidercode"] = s.Code
//qu.Debug("--------------start saving---------------")
data["iscompete"] = s.IsCompete //spiders added after 2021-11-01 no longer show the original link (decided by the save service)
@@ -603,24 +629,20 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
return
}
hashHref := util.HexText(href)
+ //list-page redis dedup
+ isExist := util.RedisExist("list", "list_"+hashHref)
+ if isExist {
+ *num++ //already collected
+ return
+ }
id := "" //record the id saved in spider_listdata so its state can be updated after a successful download
- if util.Config.Modal == 1 || (util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history") { //all nodes except 7410, 7500, 7510 and 7700 collect list-page info only
- isExist := util.RedisClusterExists(hashHref) //already collected in the full-data set
- if isExist {
- *num++ //already collected
- return
- }
- SaveHighListPageData(paramdata, s.SCode, hashHref, num)
+ if util.Config.Modal == 1 || (util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history") { //data collected by nodes in split list/detail collection mode, and by new spiders on node 7000
+ SaveHighListPageData(paramdata, hashHref, num) //save to the table
return
} else {
if !s.Stop {
UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail") //heartbeat for detail-page collection in the legacy modal=0 mode
}
- isExist := util.RedisClusterExists(hashHref) //already collected in the full-data set
- if isExist {
- *num++ //already collected
- return
- }
isEsRepeat := false
if delaySite := DelaySiteMap[s.Name]; delaySite != nil && delaySite.Compete {
title := qu.ObjToString(paramdata["title"])
@@ -633,7 +655,8 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
}
SaveListPageData(paramdata, &id, isEsRepeat) //save list-page info collected by nodes 7000, 7410, 7500, 7510, 7520 and 7700
if isEsRepeat { //competitor-like data deduplicated by title is added to redis
- util.RedisClusterSet(hashHref, "", -1) //store in the full-data redis
+ util.RedisSet("list", "list_"+hashHref, "", 86400*365*2)
+ util.AddBloomRedis("href", href)
return
}
}
@@ -650,36 +673,47 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
//mark the record in spider_listdata as failed to download
MgoS.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": -1, "updatetime": time.Now().Unix()}})
return
- } else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //the detail page replaced the href, so it differs from the list-page one
+ } /*else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //the detail page replaced the href, so it differs from the list-page one
util.RedisClusterSet(hashHref, "", -1) //store the list-page href in the full-data redis
+ }*/
+
+ util.RedisSet("list", "list_"+hashHref, "", 86400*365*2) //add to the list-page redis
+ //dedup check based on publish time
+ tmphref := qu.ObjToString(data["href"])
+ publishtime := qu.Int64All(data["l_np_publishtime"])
+ //node 7410 (the changing-link node) and data older than one year are deduplicated by href against the full bloom redis
+ if util.Config.Uploadevent == 7410 || publishtime < time.Now().AddDate(-1, 0, 0).Unix() {
+ isExist, _ = util.ExistsBloomRedis("href", tmphref)
+ if isExist {
+ MgoS.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "bloom_href", "updatetime": time.Now().Unix()}})
+ return
+ }
}
//heartbeat for a successful detail-page download
if !s.Stop {
UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute") //heartbeat for data collected in the legacy modal=0 mode
}
- set := map[string]interface{}{"state": 1, "updatetime": time.Now().Unix(), "byid": id}
+ set := map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}
//data filtered out by the detail page
if data["delete"] != nil {
- util.RedisClusterSet(hashHref, "", -1) //store filtered-out data in the full-data redis
- //update mongo: data marked for deletion gets state=1 in spider_highlistdata so it is not downloaded again; update redis
- set["delete"] = true
- MgoS.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)
+ //util.AddBloomRedis("href", tmphref) //"delete" may cover data from redirected sites; adding it to the full redis could make those sites uncollectable
+ set["exist"] = "delete"
+ //MgoS.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)
+ MgoS.UpdateById("spider_listdata", id, map[string]interface{}{"$set": set})
return
}
+ set["byid"] = id
//mark the record in spider_listdata as downloaded (state is updated by href; it may also be updated by a later successful download)
MgoS.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)

- t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
- if t1 > time.Now().Unix() { //guard against publish times in the future
- data["publishtime"] = time.Now().Unix()
- }
+ //the detail page may replace the href; after a successful collection, add the original href to the full redis
+ //if tmphref := qu.ObjToString(data["href"]); tmphref != href {
+ // util.AddBloomRedis("href", href)
+ //}
delete(data, "state")
delete(data, "exit")
delete(data, "checkpublishtime")
data["comeintime"] = time.Now().Unix()
- //atomic.AddInt32(&s.LastDowncount, 1)
- //atomic.AddInt32(&s.TodayDowncount, 1)
- //atomic.AddInt32(&s.TotalDowncount, 1)
data["spidercode"] = s.Code
data["iscompete"] = s.IsCompete //spiders added after 2021-11-01 no longer show the original link (decided by the save service)
data["infoformat"] = s.Infoformat //spider type
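Taken together, the `DownloadDetailItem` hunks converge on a two-tier dedup: the cheap list-page key always runs first, while the expensive full bloom-filter href check is reserved for node 7410 (the changing-link node) and for data published more than a year ago. A small sketch of just that routing rule; `shouldBloomCheck` is a hypothetical helper, not a function in this codebase:

```go
package main

import (
	"fmt"
	"time"
)

// shouldBloomCheck mirrors the condition in the hunk above:
// util.Config.Uploadevent == 7410 || publishtime older than one year.
func shouldBloomCheck(uploadEvent int, publishtime int64, now time.Time) bool {
	oneYearAgo := now.AddDate(-1, 0, 0).Unix()
	return uploadEvent == 7410 || publishtime < oneYearAgo
}

func main() {
	now := time.Now()
	fresh := now.AddDate(0, -1, 0).Unix() // published last month
	stale := now.AddDate(-2, 0, 0).Unix() // published two years ago

	fmt.Println(shouldBloomCheck(7500, fresh, now)) // false: trust the list key alone
	fmt.Println(shouldBloomCheck(7410, fresh, now)) // true: changing-link node
	fmt.Println(shouldBloomCheck(7500, stale, now)) // true: old data
}
```

The split keeps the hot path cheap: fresh data on stable-link nodes never touches the bloom filter, whose false positives are tolerable only where re-collection is unlikely anyway.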
@@ -764,7 +798,7 @@ func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[strin
if value, ok := v.(lua.LString); ok {
data[key] = string(value)
} else if value, ok := v.(lua.LNumber); ok {
- data[key] = value
+ data[key] = int64(value)
} else if value, ok := v.(*lua.LTable); ok {
tmp := util.TableToMap(value)
data[key] = tmp
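The `int64(value)` cast above matters because gopher-lua's `lua.LNumber` is defined as a float64; stored raw in the result map, numeric fields such as timestamps would later serialize as doubles. A minimal demonstration:

```go
package main

import (
	"fmt"

	lua "github.com/yuin/gopher-lua"
)

func main() {
	v := lua.LNumber(1700000123) // a number returned by the Lua script

	m := map[string]interface{}{}
	m["raw"] = v         // dynamic type lua.LNumber (float64 underneath)
	m["cast"] = int64(v) // plain int64, serializes as an integer

	fmt.Printf("%T\n", m["raw"])  // lua.LNumber
	fmt.Printf("%T\n", m["cast"]) // int64
}
```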
@@ -914,20 +948,8 @@ func (s *Spider) DownloadDetail(reload bool, isHistory bool) {
_id := tmp["_id"]
query := map[string]interface{}{"_id": _id}
href := qu.ObjToString(tmp["href"])
- hashHref := util.HexText(href)
+ //hashHref := util.HexText(href)
update := []map[string]interface{}{}
- //because the list-page redis dedup key is currently href+code, the same href may have records collected under different codes
- //to avoid duplicate downloads, dedup against the full-data redis
- isExist := util.RedisClusterExists(hashHref)
- if isExist {
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "href", "updatetime": time.Now().Unix()}} //already exists: set state to 1
- update = append(update, query)
- update = append(update, set)
- spLock.Lock()
- updateArr = append(updateArr, update)
- spLock.Unlock()
- return
- }
if isEsRepeat { //es title dedup
title := qu.ObjToString(tmp["title"])
eTime := time.Now().Unix()
@@ -935,8 +957,8 @@
esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
count := Es.Count(EsIndex, EsType, esQuery)
if count > 0 { //es already has data with this title: skip collecting and update the list record's state
- util.RedisClusterSet(hashHref, "", -1)
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "title", "updatetime": time.Now().Unix()}} //already exists: set state to 1
+ util.AddBloomRedis("href", href)
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "es", "updatetime": time.Now().Unix()}} //already exists: set state to 1
update = append(update, query)
update = append(update, set)
spLock.Lock()
@@ -970,9 +992,9 @@
} /*else if data == nil && times >= 3 { //download problem: create an editor task
DownloadErrorData(s.Code, tmp)
}*/
- } else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //the detail page replaced the href, so it differs from the list-page one
+ } /*else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //the detail page replaced the href, so it differs from the list-page one
util.RedisClusterSet(hashHref, "", -1)
- }
+ }*/

if !success { //download failed: update the retry count and state
ss := map[string]interface{}{"times": times, "updatetime": time.Now().Unix()}
@@ -987,9 +1009,9 @@
spLock.Unlock()
return
} else if data["delete"] != nil { //filtered by the detail page
- util.RedisClusterSet(hashHref, "", -1) //store filtered-out data in the full-data redis
+ //util.AddBloomRedis("href", tmphref) //"delete" may cover data from redirected sites; adding it to the full redis could make those sites uncollectable
//update mongo: data marked for deletion gets state=1 in spider_highlistdata so it is not downloaded again; update redis
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true, "updatetime": time.Now().Unix()}}
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "delete", "updatetime": time.Now().Unix()}}
update = append(update, query)
update = append(update, set)
spLock.Lock()
@@ -1013,22 +1035,28 @@
spLock.Unlock()
return
}
- t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
- if t1 > time.Now().Unix() { //guard against publish times in the future
- data["publishtime"] = time.Now().Unix()
+ //data collected successfully
+ //dedup check based on publish time
+ tmphref := qu.ObjToString(data["href"])
+ publishtime := qu.Int64All(data["l_np_publishtime"])
+ if publishtime < time.Now().AddDate(-1, 0, 0).Unix() {
+ isExist, _ := util.ExistsBloomRedis("href", tmphref)
+ if isExist {
+ set := map[string]interface{}{"$set": map[string]interface{}{
+ "state": 1,
+ "updatetime": time.Now().Unix(),
+ "exist": "bloom_href",
+ }}
+ update = append(update, query)
+ update = append(update, set)
+ spLock.Lock()
+ updateArr = append(updateArr, update)
+ spLock.Unlock()
+ return
+ }
}
delete(data, "exit")
delete(data, "checkpublishtime")
- //counters
- //tmpsp1, b := Allspiders.Load(s.Code)
- //if b {
- // sp1, ok := tmpsp1.(*Spider)
- // if ok {
- // atomic.AddInt32(&sp1.LastDowncount, 1)
- // atomic.AddInt32(&sp1.TodayDowncount, 1)
- // atomic.AddInt32(&sp1.TotalDowncount, 1)
- // }
- //}
data["comeintime"] = time.Now().Unix()
data["spidercode"] = s.Code
data["dataging"] = 0
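All of the `DownloadDetail` hunks route their state changes through the same pattern: build a `(query, set)` pair, append it to the shared `updateArr` under `spLock`, and let a bulk writer flush it later. A minimal sketch of that pattern; `flushUpdates` and `writeBulk` are hypothetical stand-ins for the project's bulk-update call:

```go
package main

import (
	"sync"
)

var (
	spLock    sync.Mutex
	updateArr [][]map[string]interface{}
)

// queueUpdate records one pending update as the (query, set) pair the
// bulk writer expects, exactly as the hunks above do inline.
func queueUpdate(query, set map[string]interface{}) {
	update := []map[string]interface{}{query, set}
	spLock.Lock()
	updateArr = append(updateArr, update)
	spLock.Unlock()
}

// flushUpdates drains the queue under the lock and hands the batch to
// writeBulk, which stands in for the project's MgoS bulk update.
func flushUpdates(writeBulk func([][]map[string]interface{})) {
	spLock.Lock()
	pending := updateArr
	updateArr = nil
	spLock.Unlock()
	if len(pending) > 0 {
		writeBulk(pending)
	}
}

func main() {
	queueUpdate(
		map[string]interface{}{"_id": "abc"},
		map[string]interface{}{"$set": map[string]interface{}{"state": 1}},
	)
	flushUpdates(func(batch [][]map[string]interface{}) { /* e.g. a MgoS bulk call */ })
}
```

Batching keeps many concurrent detail downloads from issuing one Mongo round-trip each; the mutex only guards the in-memory append, so contention stays low.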