|
@@ -3,6 +3,7 @@ package timetask
|
|
import (
|
|
import (
|
|
"encoding/json"
|
|
"encoding/json"
|
|
qu "qfw/util"
|
|
qu "qfw/util"
|
|
|
|
+ "regexp"
|
|
"strings"
|
|
"strings"
|
|
"sync"
|
|
"sync"
|
|
"time"
|
|
"time"
|
|
@@ -28,6 +29,7 @@ var StypeArr = []string{
|
|
"Field Value Not Contains Chinese",
|
|
"Field Value Not Contains Chinese",
|
|
"Detail File Err",
|
|
"Detail File Err",
|
|
}
|
|
}
|
|
|
|
+var httpReg = regexp.MustCompile(`^(http|https).*`) // matches text that begins with "http"/"https", up to the first newline; used below to pull a link out of the "detail" field
|
|
|
|
|
|
func PushSpiderWarnErrData() {
|
|
func PushSpiderWarnErrData() {
|
|
GetSpiderWarnData()
|
|
GetSpiderWarnData()
|
|
@@ -151,9 +153,9 @@ func GetSpiderWarnData() {
|
|
defer util.MgoS.DestoryMongoConn(sess)
|
|
defer util.MgoS.DestoryMongoConn(sess)
|
|
stime := util.GetTime(-1)
|
|
stime := util.GetTime(-1)
|
|
etime := util.GetTime(0)
|
|
etime := util.GetTime(0)
|
|
- if time.Now().Weekday().String() == "Monday" {
|
|
|
|
- stime = util.GetTime(-3)
|
|
|
|
- }
|
|
|
|
|
|
+ //if time.Now().Weekday().String() == "Monday" {
|
|
|
|
+ // stime = util.GetTime(-3)
|
|
|
|
+ //}
|
|
query := map[string]interface{}{
|
|
query := map[string]interface{}{
|
|
"comeintime": map[string]interface{}{
|
|
"comeintime": map[string]interface{}{
|
|
"$gte": stime,
|
|
"$gte": stime,
|
|
@@ -189,10 +191,19 @@ func GetSpiderWarnData() {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+ info := qu.ObjToString(tmp["info"])
|
|
|
|
+ if info == "Detail File Err" { //正文是链接的,进行链接判重
|
|
|
|
+ hrefDetail := httpReg.FindString(qu.ObjToString(data["detail"]))
|
|
|
|
+ if hrefDetail != "" {
|
|
|
|
+ esQuery := `{"query": {"bool": {"must": [{"term": {"href": "` + hrefDetail + `"}}]}}}`
|
|
|
|
+ if util.Es.Count(util.EsIndex, util.EsType, esQuery) >= 1 {
|
|
|
|
+ return
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
href := qu.ObjToString(tmp["href"])
|
|
href := qu.ObjToString(tmp["href"])
|
|
level := qu.IntAll(tmp["level"])
|
|
level := qu.IntAll(tmp["level"])
|
|
field := qu.ObjToString(tmp["field"])
|
|
field := qu.ObjToString(tmp["field"])
|
|
- info := qu.ObjToString(tmp["info"])
|
|
|
|
title := qu.ObjToString(tmp["title"])
|
|
title := qu.ObjToString(tmp["title"])
|
|
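// Data validation: if a record with the same title and a similar publishtime already exists, treat it as the same data; it does not need to be repaired again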
|
|
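// Data validation: if a record with the same title and a similar publishtime already exists, treat it as the same data; it does not need to be repaired again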
|
|
repeat := RepeatData(title, publishtime)
|
|
repeat := RepeatData(title, publishtime)
|