@@ -19,19 +19,21 @@ import (
     //mgu "qfw/util/mongodbutil"
     //"qfw/util/redis"
+
+    "github.com/donnie4w/go-logger/logger"
+    "github.com/yuin/gopher-lua"
+    es "qfw/util/elastic"
     "regexp"
     util "spiderutil"
     "strings"
     "sync/atomic"
     "time"
-
-    "github.com/donnie4w/go-logger/logger"
-    "github.com/yuin/gopher-lua"
 )

 type Heart struct {
     DetailHeart        int64  //heartbeat: detail (level-3) page script executed
     DetailExecuteHeart int64  //heartbeat: detail page actually yielded data
+    FindListHeart      int64  //heartbeat: findListHtml executed
     ListHeart          int64  //heartbeat: list page script executed
     ModifyUser         string //spider maintainer
     Site               string //site
@@ -75,6 +77,9 @@ type Spider struct {
     IsCompete bool //distinguishes new spiders from old ones
 }

+var Es *es.Elastic
+var EsIndex string
+var EsType string
 var UpdataMgoCache = make(chan []map[string]interface{}, 1000)   //update the state of data queued for re-download
 var UpdataHeartCache = make(chan []map[string]interface{}, 1000) //update spider heartbeat info
 var SP = make(chan bool, 5)
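The three new package-level handles (Es, EsIndex, EsType) must be wired up at startup before any of the dedup paths below run; the patch itself never initializes them. A minimal sketch of that wiring, assuming a constructor in qfw/util/elastic and config fields that are not shown in this diff (every name below is an assumption):

    // Hypothetical startup wiring; only Es.Count is exercised by this patch.
    Es = es.NewElastic(util.Config.EsUrl) // assumed constructor name
    EsIndex = util.Config.EsIndex         // target index for the title dedup
    EsType = util.Config.EsType           // mapping type (pre-7.x Elasticsearch)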
@@ -89,6 +94,8 @@ func UpdateHeart(site, channel, code, user, t string) {
     if heart, ok := htmp.(*Heart); ok {
         if t == "list" {
             heart.ListHeart = time.Now().Unix()
+        } else if t == "findlist" {
+            heart.FindListHeart = time.Now().Unix()
         } else if t == "detail" {
             heart.DetailHeart = time.Now().Unix()
         } else if t == "detailexcute" {
@@ -103,6 +110,8 @@ func UpdateHeart(site, channel, code, user, t string) {
         }
         if t == "list" {
             heart.ListHeart = time.Now().Unix()
+        } else if t == "findlist" {
+            heart.FindListHeart = time.Now().Unix()
         } else if t == "detail" {
             heart.DetailHeart = time.Now().Unix()
         } else if t == "detailexcute" {
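The new "findlist" branch mirrors the existing list/detail branches, so the findListHtml path can now report a heartbeat of its own. A sketch of the presumed call site, which is not part of this diff (the field names on s are assumptions):

    // Presumed caller inside the findListHtml execution path:
    UpdateHeart(s.Site, s.Channel, s.Code, s.ModifyUser, "findlist")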
@@ -423,7 +432,7 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
     db := HexToBigIntMod(href)
     hashHref := HexText(href)
     id := ""
-    SaveListPageData(paramdata, &id) //store the collection record
+    SaveListPageData(paramdata, &id, false) //store the collection record
     isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref) //check the full-volume redis
     //log.Println("full href:", href, " isExist:", isExist)
     logger.Debug("full href:", href, " isExist:", isExist)
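SaveListPageData gains a third boolean: the historical-mend path above always passes false, while DownloadDetailItem below passes the result of the es title check, so the helper can presumably persist a duplicate already flagged as existing (matching the state: 1, exist: true updates made elsewhere in this patch). The assumed shape, with its body outside this diff:

    // Assumed signature; the implementation is not part of this patch.
    func SaveListPageData(paramdata map[string]interface{}, id *string, isEsRepeat bool) {
        // ...presumably stores the list-page record and, when isEsRepeat is
        // true, marks it as a known duplicate (e.g. state 1 / exist true)...
    }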
@@ -497,7 +506,6 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
 func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
     defer mu.Catch()
     var err interface{}
-    //TODO download the level-3 page and run the LUA analysis; if this spider is configured not to download level-3 pages, stop here and store directly
     data := map[string]interface{}{}
     paramdata := p.(map[string]interface{})
     for k, v := range paramdata {
@@ -538,7 +546,21 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
             *num++ //already collected
             return
         }
-        SaveListPageData(paramdata, &id) //save the list-page data collected by nodes 7000, 7410, 7500, 7700
+        isEsRepeat := false
+        if delayDay := util.Config.DelaySites[s.Name]; delayDay > 0 { //for competitor-like sites, dedupe the title against es over the last 7 days (sequential collection cannot be delayed, so dedup is the only check)
+            title := qu.ObjToString(paramdata["title"])
+            eTime := fmt.Sprint(GetTime(0))
+            sTime := fmt.Sprint(GetTime(-7))
+            esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + sTime + `","lte": "` + eTime + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
+            if Es.Count(EsIndex, EsType, esQuery) > 0 { //es already holds this title: do not collect again, just update the list-table state
+                isEsRepeat = true
+            }
+        }
+        SaveListPageData(paramdata, &id, isEsRepeat) //save the list-page data collected by nodes 7000, 7410, 7500, 7700
+        if isEsRepeat { //record the title-deduped competitor-like item in redis
+            util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
+            return
+        }
     }
     //download, parse, store
     data, err = s.DownloadDetailPage(paramdata, data)
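For reference, with sample timestamps and a sample title the concatenated esQuery expands to the JSON below. This is the legacy filtered-query form (Elasticsearch 1.x; removed in 5.x): it matches documents whose comeintime falls within the last 7 days and whose title phrase-matches the list-page title. Note the raw string concatenation would produce invalid JSON for titles containing double quotes.

    {
      "query": {
        "filtered": {
          "filter": {
            "bool": {
              "must": [
                {"range": {"comeintime": {"gte": "<sTime>", "lte": "<eTime>"}}}
              ]
            }
          },
          "query": {
            "bool": {
              "must": [
                {"multi_match": {"query": "<title>", "type": "phrase", "fields": ["title"]}}
              ]
            }
          }
        }
      }
    }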
@@ -721,9 +743,14 @@ func (s *Spider) DownloadHighDetail() {
     for {
         logger.Info("Running Code:", s.Code, "Stop:", s.Stop)
         if !s.Stop { //the spider is running
-            comeintimeQuery := map[string]interface{}{"$gte": GetTime(-util.Config.DayNum)} //only fetch data from the past week, so items that never come down don't pile up
-            if delayDay := util.Config.DelaySites[s.Name]; delayDay > 0 && delayDay <= util.Config.DayNum { //check whether this spider belongs to a delayed-collection site
-                comeintimeQuery["$lte"] = GetTime(-delayDay + 1)
+            comeintimeQuery := map[string]interface{}{"$gte": GetTime(-util.Config.DayNum)} //only fetch data from the past week, so items that never come down don't pile up
+            isEsRepeat := false //whether to dedupe against es
+            if delayDay := util.Config.DelaySites[s.Name]; delayDay > 0 {
+                isEsRepeat = true
+                if delayDay <= util.Config.DayNum*24 { //delayed-collection site: collect its data delayDay hours late (7410, 7500, 7700 collect sequentially and cannot be delayed)
+                    //comeintimeQuery["$lte"] = GetTime(-delayDay + 1)
+                    comeintimeQuery["$lte"] = time.Now().Unix() - int64(3600*delayDay)
+                }
             }
             q := map[string]interface{}{
                 "spidercode": s.Code,
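Note the units: this path treats DelaySites values as hours (bound DayNum*24, offset 3600*delayDay), whereas DownloadListDetail below treats the same values as days (bound DayNum, offset 86400*delayDay). A worked example of the resulting window under the hours reading, assuming GetTime(-n) returns the unix timestamp n days back, as its uses here suggest:

    // With delayDay = 48 and util.Config.DayNum = 7:
    gte := GetTime(-util.Config.DayNum)       // no older than 7 days
    lte := time.Now().Unix() - int64(3600*48) // at least 48 hours old
    // => only documents with gte <= comeintime <= lte are fetched.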
@@ -744,13 +771,17 @@ func (s *Spider) DownloadHighDetail() {
         for _, tmp := range *list {
             _id := tmp["_id"]
             query := map[string]interface{}{"_id": _id}
-            competehref := qu.ObjToString(tmp["competehref"])
-            if competehref != "" { //check whether Jianyu has already collected this third-party-site item
+            if isEsRepeat { //dedupe the title against es
                 title := qu.ObjToString(tmp["title"])
-                one, _ := Mgo.FindOne("data_bak", map[string]interface{}{"title": title})
-                if one != nil && len(*one) > 0 { //already collected by Jianyu, discard this record
+                href := qu.ObjToString(tmp["href"])
+                eTime := fmt.Sprint(GetTime(0))
+                sTime := fmt.Sprint(GetTime(-7))
+                esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + sTime + `","lte": "` + eTime + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
+                count := Es.Count(EsIndex, EsType, esQuery)
+                if count > 0 { //es already holds this title: do not collect again, just update the list-table state
                     set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //already exists: set state to 1
                     Mgo.Update("spider_highlistdata", query, set, false, false)
+                    util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
                     continue
                 }
             }
@@ -849,12 +880,19 @@ func (s *Spider) DownloadListDetail() {
         s.L.Close()
         CC2 <- s.L
     }()
+    comeintimeQuery := map[string]interface{}{"$gte": GetTime(-util.Config.DayNum)} //only fetch data from the past week, so items that never come down don't pile up
+    isEsRepeat := false //whether to dedupe against es
+    if delayDay := util.Config.DelaySites[s.Name]; delayDay > 0 {
+        isEsRepeat = true
+        if delayDay <= util.Config.DayNum { //delayed-collection site: collect its data delayDay days late (7410, 7500, 7700 collect sequentially and cannot be delayed)
+            //comeintimeQuery["$lte"] = GetTime(-delayDay + 1)
+            comeintimeQuery["$lte"] = time.Now().Unix() - int64(86400*delayDay)
+        }
+    }
     q := map[string]interface{}{
         "spidercode": s.Code,
         "state":      0, //0: stored, awaiting collection; -1: collection failed; 1: success
-        "comeintime": map[string]interface{}{ //only fetch data from the past week, so items that never come down don't pile up
-            "$gte": GetTime(-util.Config.DayNum),
-        },
+        "comeintime": comeintimeQuery,
     }
     o := map[string]interface{}{"_id": -1}
     f := map[string]interface{}{
@@ -870,13 +908,16 @@ func (s *Spider) DownloadListDetail() {
         for _, tmp := range *list {
             _id := tmp["_id"]
             query := map[string]interface{}{"_id": _id}
-            competehref := qu.ObjToString(tmp["competehref"])
-            if competehref != "" { //check whether Jianyu has already collected this third-party-site item
+            if isEsRepeat { //dedupe the title against es
                 title := qu.ObjToString(tmp["title"])
-                one, _ := Mgo.FindOne("data_bak", map[string]interface{}{"title": title})
-                if one != nil && len(*one) > 0 { //already collected by Jianyu, discard this record
+                href := qu.ObjToString(tmp["href"])
+                eTime := fmt.Sprint(GetTime(0))
+                sTime := fmt.Sprint(GetTime(-7))
+                esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + sTime + `","lte": "` + eTime + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
+                if Es.Count(EsIndex, EsType, esQuery) > 0 { //es already holds this title: do not collect again, just update the list-table state
                     set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //already exists: set state to 1
                     Mgo.Update("spider_highlistdata", query, set, false, false)
+                    util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*30)
                     continue
                 }
             }
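The 7-day title lookup is now duplicated verbatim in DownloadDetailItem, DownloadHighDetail, and DownloadListDetail. A possible follow-up, sketched here only (not part of the patch), would centralize it using the same globals and query the patch introduces:

    // isTitleRepeated reports whether es already holds this title within the
    // last 7 days. Refactoring sketch; behavior copied from the three call sites.
    func isTitleRepeated(title string) bool {
        eTime := fmt.Sprint(GetTime(0))
        sTime := fmt.Sprint(GetTime(-7))
        esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + sTime + `","lte": "` + eTime + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
        return Es.Count(EsIndex, EsType, esQuery) > 0
    }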