@@ -23,11 +23,14 @@ type WarnInfo struct {
 	Repeat   bool
 }
 
-var StypeArr = []string{
-	"Field Value Is Null",
-	"Field Value Contains Random Code",
-	"Field Value Not Contains Chinese",
-	"Detail File Err",
+var ErrorStypeMap = map[string]int{
+	"Field Value Is Null":              2,
+	"Field Value Contains Random Code": 2,
+	"Field Value Not Contains Chinese": 2,
+	"Detail File Err":                  2,
+}
+var WarnStypeMap = map[string]int{
+	"Attachment Upload Failed": 1,
 }
 
 var httpReg = regexp.MustCompile(`^(http|https).*`)
@@ -153,21 +156,18 @@ func GetSpiderWarnData() {
 	defer util.MgoS.DestoryMongoConn(sess)
 	stime := util.GetTime(-1)
 	etime := util.GetTime(0)
-	//if time.Now().Weekday().String() == "Monday" {
-	//	stime = util.GetTime(-3)
-	//}
 	query := map[string]interface{}{
 		"comeintime": map[string]interface{}{
 			"$gte": stime,
 			"$lt":  etime,
 		},
-		"info": map[string]interface{}{ //this condition can be dropped once the save service is updated 2022-11-28
-			"$in": StypeArr,
-		},
-		"level": 2,
+		//"info": map[string]interface{}{ //this condition can be dropped once the save service is updated 2022-11-28
+		//	"$in": StypeArr,
+		//},
+		//"level": 2,
 	}
 	invalidDate := time.Now().AddDate(-2, 0, 0).Unix()
-	ch := make(chan bool, 2)
+	ch := make(chan bool, 3)
 	wg := &sync.WaitGroup{}
 	lock := &sync.Mutex{}
 	result := map[string]*WarnInfo{}
@@ -181,55 +181,58 @@ func GetSpiderWarnData() {
 				<-ch
 				wg.Done()
 			}()
-			publishtime := int64(0)
-			data, ok := tmp["data"].(map[string]interface{})
-			if ok {
-				if ptime := data["publishtime"]; ptime != nil {
-					publishtime = qu.Int64All(ptime)
-					if publishtime > 0 && publishtime < invalidDate { //historical data from more than two years ago is no longer pushed for repair
-						return
-					}
-				}
-			}
 			info := qu.ObjToString(tmp["info"])
-			if info == "Detail File Err" { //when the detail content is a link, deduplicate by the link
-				hrefDetail := httpReg.FindString(qu.ObjToString(data["detail"]))
-				if hrefDetail != "" {
-					esQuery := `{"query": {"bool": {"must": [{"term": {"href": "` + hrefDetail + `"}}]}}}`
-					if util.Es.Count(util.EsIndex, util.EsType, esQuery) >= 1 {
-						return
+			level := qu.IntAll(tmp["level"])
+			//only data whose error type matches the expected level is pushed to spider_warn_err
+			if ErrorStypeMap[info] == level || WarnStypeMap[info] == level {
+				publishtime := int64(0)
+				data, ok := tmp["data"].(map[string]interface{})
+				if ok {
+					if ptime := data["publishtime"]; ptime != nil {
+						publishtime = qu.Int64All(ptime)
+						if publishtime > 0 && publishtime < invalidDate { //historical data from more than two years ago is no longer pushed for repair
+							return
+						}
 					}
 				}
-			}
-			href := qu.ObjToString(tmp["href"])
-			level := qu.IntAll(tmp["level"])
-			field := qu.ObjToString(tmp["field"])
-			title := qu.ObjToString(tmp["title"])
-			//data check: if a record with the same title and a similar publishtime exists, treat it as the same data and skip repair
-			repeat := RepeatData(title, publishtime)
-			lock.Lock()
-			if warnInfo := result[href]; warnInfo == nil {
-				warnInfo = &WarnInfo{
-					Fields:   map[string]bool{field: true},
-					MaxLevel: level,
-					Data:     data,
-					Site:     tmp["site"],
-					Channel:  tmp["channel"],
-					Title:    title,
-					Infos:    map[string]bool{info: true},
-					Code:     tmp["code"],
-					Href:     href,
-					Repeat:   repeat,
+				if info == "Detail File Err" { //when the detail content is a link, deduplicate by the link
+					hrefDetail := httpReg.FindString(qu.ObjToString(data["detail"]))
+					if hrefDetail != "" {
+						esQuery := `{"query": {"bool": {"must": [{"term": {"href": "` + hrefDetail + `"}}]}}}`
+						if util.Es.Count(util.EsIndex, util.EsType, esQuery) >= 1 {
+							return
+						}
+					}
 				}
-				result[href] = warnInfo
-			} else {
-				warnInfo.Fields[field] = true
-				warnInfo.Infos[info] = true
-				if warnInfo.MaxLevel < level {
-					warnInfo.MaxLevel = level
+				href := qu.ObjToString(tmp["href"])
+				field := qu.ObjToString(tmp["field"])
+				title := qu.ObjToString(tmp["title"])
+				//data check: if a record with the same title and a similar publishtime exists, treat it as the same data and skip repair
+				repeat := RepeatData(title, publishtime)
+				lock.Lock()
+				if warnInfo := result[href]; warnInfo == nil {
+					warnInfo = &WarnInfo{
+						Fields:   map[string]bool{field: true},
+						MaxLevel: level,
+						Data:     data,
+						Site:     tmp["site"],
+						Channel:  tmp["channel"],
+						Title:    title,
+						Infos:    map[string]bool{info: true},
+						Code:     tmp["code"],
+						Href:     href,
+						Repeat:   repeat,
+					}
+					result[href] = warnInfo
+				} else {
+					warnInfo.Fields[field] = true
+					warnInfo.Infos[info] = true
+					if warnInfo.MaxLevel < level {
+						warnInfo.MaxLevel = level
+					}
 				}
+				lock.Unlock()
 			}
-			lock.Unlock()
 		}(tmp)
 		if n%1000 == 0 {
 			qu.Debug("current:", n)
@@ -285,14 +288,14 @@ func GetSpiderWarnData() {
 }
 
 func RepeatData(title string, publishtime int64) bool {
-	return util.MgoB.Count("bidding",
-		map[string]interface{}{
-			"title": title,
-			"publishtime": map[string]interface{}{
-				"$gte": publishtime + 86400*3,
-				"$lte": publishtime - 86400*3,
-			},
-		}) > 0
+	q := map[string]interface{}{
+		"title": title,
+		"publishtime": map[string]interface{}{
+			"$lte": publishtime + 86400*3,
+			"$gte": publishtime - 86400*3,
+		},
+	}
+	return util.MgoB.Count("bidding", q) > 0
 }
 
 /*