|
@@ -340,7 +340,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
|
db := HexToBigIntMod(href) //根据href的哈希值选择Redis的db
|
|
|
hashHref := HexText(href)
|
|
|
//增量(redis默认db0)
|
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
//全量(判断是否已存在防止覆盖id)
|
|
|
isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref)
|
|
|
if !isExist {
|
|
@@ -522,7 +522,7 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
|
|
|
set := map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}
|
|
|
if data["delete"] != nil {
|
|
|
//增量
|
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
//全量
|
|
|
db := HexToBigIntMod(href)
|
|
|
hashHref := HexText(href)
|
|
@@ -601,7 +601,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
|
if util.Config.Modal == 1 { //除7000、7500、7700节点外所有节点只采集列表页信息
|
|
|
isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
|
|
|
if isExist { //更新redis生命周期
|
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
*num++ //已采集
|
|
|
return
|
|
|
}
|
|
@@ -613,7 +613,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
|
}
|
|
|
isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
|
|
|
if isExist { //更新redis生命周期
|
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
*num++ //已采集
|
|
|
return
|
|
|
}
|
|
@@ -629,7 +629,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
|
}
|
|
|
SaveListPageData(paramdata, &id, isEsRepeat) //保存7000、7410、7500、7700节点列表页采集的信息
|
|
|
if isEsRepeat { //类竞品数据title判重数据加入redis
|
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
return
|
|
|
}
|
|
|
}
|
|
@@ -650,7 +650,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
|
} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
|
|
|
log.Println("beforeHref:", href, "afterHref:", tmphref)
|
|
|
//增量
|
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
//全量
|
|
|
db := HexToBigIntMod(href)
|
|
|
hashHref := HexText(href)
|
|
@@ -667,7 +667,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
|
//详情页过滤数据
|
|
|
if data["delete"] != nil {
|
|
|
//增量
|
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
//全量
|
|
|
db := HexToBigIntMod(href)
|
|
|
hashHref := HexText(href)
|
|
@@ -939,7 +939,7 @@ func (s *Spider) DownloadDetail(stype string) {
|
|
|
esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
|
|
|
count := Es.Count(EsIndex, EsType, esQuery)
|
|
|
if count > 0 { //es中含本title数据,不再采集,更新list表数据状态
|
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
update = append(update, query)
|
|
|
update = append(update, set)
|
|
@@ -977,7 +977,7 @@ func (s *Spider) DownloadDetail(stype string) {
|
|
|
} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
|
|
|
log.Println("beforeHref:", href, "afterHref:", tmphref)
|
|
|
//增量
|
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
//全量
|
|
|
db := HexToBigIntMod(href)
|
|
|
hashHref := HexText(href)
|
|
@@ -1001,7 +1001,7 @@ func (s *Spider) DownloadDetail(stype string) {
|
|
|
return
|
|
|
} else if data["delete"] != nil { //三级页过滤
|
|
|
//增量
|
|
|
- util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
//全量
|
|
|
db := HexToBigIntMod(href)
|
|
|
hashHref := HexText(href)
|
|
@@ -1103,7 +1103,7 @@ func NewSpiderByScript(num int, code string, info map[string]string, spChan chan
|
|
|
func AnalysisProjectInfo(data map[string]interface{}) bool {
|
|
|
defer qu.Catch()
|
|
|
detail := qu.ObjToString(data["detail"])
|
|
|
- if detail == "详情请访问原网页" || detail == "<br/>详情请访问原网页!" { //不判断包含关系因为有些数据为json拼接,字段不全,会加“详情请访问原网页”
|
|
|
+ if detail == "详情请访问原网页!" || detail == "<br/>详情请访问原网页!" { //不判断包含关系因为有些数据为json拼接,字段不全,会加“详情请访问原网页”
|
|
|
if projectinfo, ok := data["projectinfo"].(map[string]interface{}); ok && len(projectinfo) > 0 {
|
|
|
if attachments, ok := projectinfo["attachments"].(map[string]interface{}); ok && len(attachments) > 0 {
|
|
|
fileOk := false
|