maxiaoshan hace 3 años
padre
commit
80a10eaba8
Se han modificado 3 ficheros con 66 adiciones y 15 borrados
  1. 6 0
      src/config.json
  2. 9 0
      src/main.go
  3. 51 15
      src/spider/spider.go

+ 6 - 0
src/config.json

@@ -35,6 +35,12 @@
     "redishosts": [],
     "fileServer": "http://test.qmx.top:9333",
     "jsvmurl": "http://127.0.0.1:8080/jsvm",
+    "es": {
+        "addr": "http://192.168.3.206:9800",
+        "pool": 15,
+        "index": "bidding",
+        "type": "bidding"
+    },
     "luadisablelib": {
         "baselib": {
             "print": true

+ 9 - 0
src/main.go

@@ -7,6 +7,7 @@ import (
 	mgo "mongodb"
 	"os"
 	qu "qfw/util"
+	es "qfw/util/elastic"
 	"regexp"
 	"spider"
 
@@ -38,6 +39,14 @@ func init() {
 	spider.MgoS.InitPool()
 	//初始化Redis
 	InitRedis(Config.Redisservers)
+	//初始化es
+	spider.EsIndex = qu.ObjToString(Config.Es["index"])
+	spider.EsType = qu.ObjToString(Config.Es["type"])
+	spider.Es = &es.Elastic{
+		S_esurl: qu.ObjToString(Config.Es["addr"]),
+		I_size:  qu.IntAll(Config.Es["pool"]),
+	}
+	spider.Es.InitElasticSize()
 	//启动消息服务
 	spider.InitMsgClient(Config.Msgserveraddr, Config.Msgname)
 	spider.InitMsgClientFile(Config.MsgserveraddrFile, Config.Msgname+"file")

+ 51 - 15
src/spider/spider.go

@@ -13,6 +13,7 @@ import (
 	"math/rand"
 	mu "mfw/util"
 	qu "qfw/util"
+	es "qfw/util/elastic"
 	"regexp"
 	util "spiderutil"
 	"strings"
@@ -60,6 +61,9 @@ type Spider struct {
 	IsCompete        bool //区分新老爬虫
 }
 
+var Es *es.Elastic
+var EsIndex string
+var EsType string
 var UpdataMgoCache = make(chan []map[string]interface{}, 1000) //更新要重下数据的状态
 var SP = make(chan bool, 5)
 var TimeChan = make(chan bool, 1)
@@ -107,9 +111,21 @@ func DownloadHighDetail(code string) {
 		}
 		list := &[]map[string]interface{}{} //查询数据的集合
 		for day := 0; day <= util.Config.DayNum; day++ {
-			comeintime := map[string]interface{}{"$gte": GetTime(-day)} //指定查询数据的时间
-			if day != 0 {                                               //不是当天,指定数据范围
+			startTime := GetTime(-day)
+			comeintime := map[string]interface{}{"$gte": startTime} //指定查询数据的时间
+			if day != 0 {                                           //不是当天,指定数据范围
 				comeintime["$lt"] = GetTime(-day + 1)
+			} else if code == "a_gcy_mcgg" { //
+				endTime := time.Now().Unix() - 12*3600
+				if endTime > startTime {
+					comeintime = map[string]interface{}{
+						"$gte": startTime,
+						"$lt":  endTime,
+					}
+				} else {
+					continue
+				}
+
 			}
 			q["comeintime"] = comeintime
 			list, _ = MgoS.Find("spider_highlistdata", q, o, f, false, 0, 100)
@@ -147,19 +163,19 @@ func DownloadHighDetail(code string) {
 						MgoS.Update("spider_highlistdata", query, set, false, false)
 						return
 					}
-					// if isEsRepeat { //竞品数据es title判重
-					// 	title := qu.ObjToString(tmp["title"])
-					// 	eTime := time.Now().Unix()
-					// 	sTime := eTime - int64(7*86400)
-					// 	esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
-					// 	count := Es.Count(EsIndex, EsType, esQuery)
-					// 	if count > 0 { //es中含本title数据,不再采集,更新list表数据状态
-					// 		set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
-					// 		Mgo.Update("spider_highlistdata", query, set, false, false)
-					// 		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
-					// 		continue
-					// 	}
-					// }
+					if code == "a_gcy_mcgg" { //竞品数据es title判重
+						title := qu.ObjToString(tmp["title"])
+						eTime := time.Now().Unix()
+						sTime := eTime - int64(7*86400)
+						esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
+						count := Es.Count(EsIndex, EsType, esQuery)
+						if count > 0 { //es中含本title数据,不再采集,更新list表数据状态
+							set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
+							MgoS.Update("spider_highlistdata", query, set, false, false)
+							util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+							return
+						}
+					}
 					competehref := qu.ObjToString(tmp["competehref"])
 					if competehref != "" { //验证三方网站数据剑鱼是否已采集
 						title := qu.ObjToString(tmp["title"])
@@ -217,6 +233,11 @@ func DownloadHighDetail(code string) {
 						set := map[string]interface{}{"$set": ss}
 						MgoS.Update("spider_highlistdata", query, set, false, false)
 						return
+					} else { //三级页过滤
+						deleteData := FilterByDetail(href, query, data) //针对列表页无法过滤需要在详情页过滤的数据,进行过滤处理
+						if deleteData {
+							return
+						}
 					}
 					t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
 					if t1 > time.Now().Unix() { //防止发布时间超前
@@ -251,6 +272,21 @@ func DownloadHighDetail(code string) {
 		}
 	}
 }
+func FilterByDetail(href string, query, data map[string]interface{}) bool { // FilterByDetail reports whether data has a non-nil "delete" entry; when it does, href is recorded in Redis and the record matched by query is marked state=1, delete=true.
+	if data["delete"] != nil {
+		// Incremental de-duplication entry, keyed by "url_repeat_"+href (last argument is 3600*24*365; presumably a TTL in seconds — confirm against util.PutRedis).
+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+		// Full de-duplication entry: db index and key are both derived from href (the -1 presumably means no expiry — confirm against util.PutRedis).
+		db := HexToBigIntMod(href)
+		hashHref := HexText(href)
+		util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
+		// Update mgo: data flagged for deletion gets state=1 in spider_highlistdata so it is not downloaded again (Redis was updated above).
+		set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true}}
+		MgoS.Update("spider_highlistdata", query, set, false, false)
+		return true
+	}
+	return false
+}
 
 //下载解析内容页
 func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {