Просмотр исходного кода

spider_warn异常数据推送调整

maxiaoshan 2 года назад
Родитель
Коммит
e9e1534ecb
2 изменённых файла: 21 добавление и 10 удалений
  1. 6 6
      src/config.json
  2. 15 4
      src/timetask/random.go

+ 6 - 6
src/config.json

@@ -1,35 +1,35 @@
 {
 	"spider":{
-		"addr": "192.168.3.207:27092",
+		"addr": "192.168.3.207:29099",
 		"db": "spider",
 		"size": 15
     },
     "editor": {
-		"addr": "192.168.3.207:27092",
+		"addr": "192.168.3.207:29099",
 		"db": "editor",
 		"size": 15
     },
 	"bideditor": {
-		"addr": "192.168.3.207:27092",
+		"addr": "192.168.3.207:29099",
 		"db": "editor",
 		"size": 2,
 		"username": "",
 		"password": ""
 	},
 	"pyspider":{
-		"addr": "192.168.3.207:27092",
+		"addr": "192.168.3.207:29099",
 		"db": "py_spider",
 		"size": 5
 	},
 	"bidding": {
-		"addr": "192.168.3.207:27092",
+		"addr": "192.168.3.207:29099",
 		"db": "qfw",
 		"size": 2,
 		"username": "",
 		"password": ""
 	},
 	"es": {
-		"addr": "http://172.17.4.184:19800",
+		"addr": "http://192.168.3.128:9800",
 		"pool": 2,
 		"index": "bidding",
 		"type": "bidding"

+ 15 - 4
src/timetask/random.go

@@ -3,6 +3,7 @@ package timetask
 import (
 	"encoding/json"
 	qu "qfw/util"
+	"regexp"
 	"strings"
 	"sync"
 	"time"
@@ -28,6 +29,7 @@ var StypeArr = []string{
 	"Field Value Not Contains Chinese",
 	"Detail File Err",
 }
+var httpReg = regexp.MustCompile(`^(http|https).*`)
 
 func PushSpiderWarnErrData() {
 	GetSpiderWarnData()
@@ -151,9 +153,9 @@ func GetSpiderWarnData() {
 	defer util.MgoS.DestoryMongoConn(sess)
 	stime := util.GetTime(-1)
 	etime := util.GetTime(0)
-	if time.Now().Weekday().String() == "Monday" {
-		stime = util.GetTime(-3)
-	}
+	//if time.Now().Weekday().String() == "Monday" {
+	//	stime = util.GetTime(-3)
+	//}
 	query := map[string]interface{}{
 		"comeintime": map[string]interface{}{
 			"$gte": stime,
@@ -189,10 +191,19 @@ func GetSpiderWarnData() {
 					}
 				}
 			}
+			info := qu.ObjToString(tmp["info"])
+			if info == "Detail File Err" { //正文是链接的,进行链接判重
+				hrefDetail := httpReg.FindString(qu.ObjToString(data["detail"]))
+				if hrefDetail != "" {
+					esQuery := `{"query": {"bool": {"must": [{"term": {"href": "` + hrefDetail + `"}}]}}}`
+					if util.Es.Count(util.EsIndex, util.EsType, esQuery) >= 1 {
+						return
+					}
+				}
+			}
 			href := qu.ObjToString(tmp["href"])
 			level := qu.IntAll(tmp["level"])
 			field := qu.ObjToString(tmp["field"])
-			info := qu.ObjToString(tmp["info"])
 			title := qu.ObjToString(tmp["title"])
 			//数据验证,是否有title一致,相似publishtime的数据,视为一样的数据,不需要再修复
 			repeat := RepeatData(title, publishtime)