|
@@ -1,6 +1,7 @@
|
|
|
package timetask
|
|
|
|
|
|
import (
|
|
|
+ "encoding/json"
|
|
|
qu "qfw/util"
|
|
|
"strings"
|
|
|
"sync"
|
|
@@ -18,6 +19,127 @@ type WarnInfo struct {
|
|
|
Infos map[string]bool
|
|
|
Code interface{}
|
|
|
Href interface{}
|
|
|
+ Repeat bool
|
|
|
+}
|
|
|
+
|
|
|
// StypeArr lists the warn-info categories that are pushed as spider
// error data; GetSpiderWarnData uses it as the $in filter on the
// "info" field of spider_warn records.
var StypeArr = []string{
	"Field Value Is Null",
	"Field Value Contains Random Code",
	"Field Value Not Contains Chinese",
	"Detail File Err",
}
|
|
|
+
|
|
|
// PushSpiderWarnErrData collects abnormal spider data and pushes it
// into the spider_warn_err collection: first the field-level warn
// records, then list rows whose detail file failed to download.
func PushSpiderWarnErrData() {
	GetSpiderWarnData()
	GetHighlistDetailFilErrData()
}
|
|
|
+
|
|
|
+func GetHighlistDetailFilErrData() {
|
|
|
+ defer qu.Catch()
|
|
|
+ sess := util.MgoS.GetMgoConn()
|
|
|
+ defer util.MgoS.DestoryMongoConn(sess)
|
|
|
+ stime := util.GetTime(-7)
|
|
|
+ etime := util.GetTime(0)
|
|
|
+ query := map[string]interface{}{
|
|
|
+ "comeintime": map[string]interface{}{
|
|
|
+ "$gte": stime,
|
|
|
+ "$lt": etime,
|
|
|
+ },
|
|
|
+ "detailfilerr": true,
|
|
|
+ "state": -1,
|
|
|
+ }
|
|
|
+ fields := map[string]interface{}{
|
|
|
+ "site": 1,
|
|
|
+ "channel": 1,
|
|
|
+ "spidercode": 1,
|
|
|
+ "area": 1,
|
|
|
+ "city": 1,
|
|
|
+ "district": 1,
|
|
|
+ "jsondata": 1,
|
|
|
+ "publishtime": 1,
|
|
|
+ "comeintime": 1,
|
|
|
+ "href": 1,
|
|
|
+ "title": 1,
|
|
|
+ "dataging": 1,
|
|
|
+ "_id": 0,
|
|
|
+ }
|
|
|
+ ch := make(chan bool, 2)
|
|
|
+ wg := &sync.WaitGroup{}
|
|
|
+ lock := &sync.Mutex{}
|
|
|
+ arr := []map[string]interface{}{}
|
|
|
+ it := sess.DB(util.MgoS.DbName).C("spider_highlistdata").Find(&query).Select(&fields).Iter()
|
|
|
+ n := 0
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
|
|
|
+ ch <- true
|
|
|
+ wg.Add(1)
|
|
|
+ go func(tmp map[string]interface{}) {
|
|
|
+ defer func() {
|
|
|
+ <-ch
|
|
|
+ wg.Done()
|
|
|
+ }()
|
|
|
+ result := map[string]interface{}{}
|
|
|
+ result["from"] = "list"
|
|
|
+ result["level"] = 2
|
|
|
+ result["info"] = "Detail File Err"
|
|
|
+ result["ok"] = false
|
|
|
+ result["field"] = "detail"
|
|
|
+ result["site"] = tmp["site"]
|
|
|
+ result["channel"] = tmp["channel"]
|
|
|
+ result["title"] = tmp["title"]
|
|
|
+ result["href"] = tmp["href"]
|
|
|
+ result["spidercode"] = tmp["spidercode"]
|
|
|
+ result["comeintime"] = time.Now().Unix()
|
|
|
+ //publishtime
|
|
|
+ publishtime_str := qu.ObjToString(tmp["publishtime"])
|
|
|
+ publishtime_int := int64(0)
|
|
|
+ if publishtime_str != "0" {
|
|
|
+ if t, err := time.ParseInLocation(qu.Date_Full_Layout, publishtime_str, time.Local); err == nil {
|
|
|
+ publishtime_int = t.Unix()
|
|
|
+ }
|
|
|
+ }
|
|
|
+ result["repeat"] = RepeatData(qu.ObjToString(tmp["title"]), publishtime_int)
|
|
|
+ //jsondata
|
|
|
+ if jsondata := qu.ObjToString(tmp["jsondata"]); jsondata != "" {
|
|
|
+ jsondataMap := map[string]interface{}{}
|
|
|
+ if json.Unmarshal([]byte(jsondata), &jsondataMap) == nil {
|
|
|
+ tmp["jsondata"] = jsondataMap
|
|
|
+ } else {
|
|
|
+ delete(tmp, "jsondata")
|
|
|
+ }
|
|
|
+ }
|
|
|
+ iscompete := false
|
|
|
+ coll := "bidding"
|
|
|
+ lua, _ := util.MgoEB.FindOne("luaconfig", map[string]interface{}{"code": tmp["spidercode"]})
|
|
|
+ if len(*lua) > 0 {
|
|
|
+ iscompete, _ = (*lua)["spidercompete"].(bool)
|
|
|
+ param_common := (*lua)["param_common"].([]interface{})
|
|
|
+ if len(param_common) >= 8 {
|
|
|
+ coll = qu.ObjToString(param_common[7])
|
|
|
+ }
|
|
|
+ }
|
|
|
+ tmp["iscompete"] = iscompete
|
|
|
+ tmp["publishtime"] = publishtime_int
|
|
|
+ tmp["_d"] = "comeintime"
|
|
|
+ tmp["T"] = coll
|
|
|
+ result["data"] = tmp
|
|
|
+ lock.Lock()
|
|
|
+ arr = append(arr, result)
|
|
|
+ if len(arr) > 500 {
|
|
|
+ util.MgoS.SaveBulk("spider_warn_err", arr...)
|
|
|
+ arr = []map[string]interface{}{}
|
|
|
+ }
|
|
|
+ lock.Unlock()
|
|
|
+ }(tmp)
|
|
|
+ if n%100 == 0 {
|
|
|
+ qu.Debug("current:", n)
|
|
|
+ }
|
|
|
+ tmp = map[string]interface{}{}
|
|
|
+ }
|
|
|
+ wg.Wait()
|
|
|
+ if len(arr) > 0 {
|
|
|
+ util.MgoS.SaveBulk("spider_warn_err", arr...)
|
|
|
+ arr = []map[string]interface{}{}
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
func GetSpiderWarnData() {
|
|
@@ -35,9 +157,10 @@ func GetSpiderWarnData() {
|
|
|
"$gte": stime,
|
|
|
"$lt": etime,
|
|
|
},
|
|
|
- "info": map[string]interface{}{
|
|
|
- "$in": []string{"Html Contains Temp Language", "Field Value Contains Random Code", "Publishtime Is Too Early", "Publishtime Is Too Late", "Field Value Not Contains Chinese"},
|
|
|
+ "info": map[string]interface{}{ //保存服务更新后这个条件可去掉2022-11-28
|
|
|
+ "$in": StypeArr,
|
|
|
},
|
|
|
+ "level": 2,
|
|
|
}
|
|
|
ch := make(chan bool, 2)
|
|
|
wg := &sync.WaitGroup{}
|
|
@@ -58,24 +181,28 @@ func GetSpiderWarnData() {
|
|
|
field := qu.ObjToString(tmp["field"])
|
|
|
info := qu.ObjToString(tmp["info"])
|
|
|
title := qu.ObjToString(tmp["title"])
|
|
|
- if info == "Field Value Not Contains Chinese" && RepeatData(title) > 0 { //数据验证,数据库已有title一致的数据不再推送
|
|
|
- return
|
|
|
- }
|
|
|
- if field == "publishtime" { //特殊处理publishtime字段的level(保存服务中publishtime异常数据入bidding库,level不能为2)
|
|
|
- level = 1
|
|
|
+ publishtime := int64(0)
|
|
|
+ data, ok := tmp["data"].(map[string]interface{})
|
|
|
+ if ok {
|
|
|
+ if ptime := data["publishtime"]; ptime != nil {
|
|
|
+ publishtime = qu.Int64All(ptime)
|
|
|
+ }
|
|
|
}
|
|
|
+ //数据验证,是否有title一致,相似publishtime的数据,视为一样的数据,不需要再修复
|
|
|
+ repeat := RepeatData(title, publishtime)
|
|
|
lock.Lock()
|
|
|
if warnInfo := result[href]; warnInfo == nil {
|
|
|
warnInfo = &WarnInfo{
|
|
|
Fields: map[string]bool{field: true},
|
|
|
MaxLevel: level,
|
|
|
- Data: tmp["data"],
|
|
|
+ Data: data,
|
|
|
Site: tmp["site"],
|
|
|
Channel: tmp["channel"],
|
|
|
Title: title,
|
|
|
Infos: map[string]bool{info: true},
|
|
|
Code: tmp["code"],
|
|
|
Href: href,
|
|
|
+ Repeat: repeat,
|
|
|
}
|
|
|
result[href] = warnInfo
|
|
|
} else {
|
|
@@ -117,12 +244,14 @@ func GetSpiderWarnData() {
|
|
|
"site": w.Site,
|
|
|
"channel": w.Channel,
|
|
|
"title": w.Title,
|
|
|
+ "repeat": w.Repeat,
|
|
|
"comeintime": time.Now().Unix(),
|
|
|
"info": strings.Join(infos, ","),
|
|
|
- "code": w.Code,
|
|
|
+ "spidercode": w.Code,
|
|
|
"href": w.Href,
|
|
|
"data": w.Data,
|
|
|
"ok": false,
|
|
|
+ "from": "warn",
|
|
|
})
|
|
|
if len(saveArr) > 500 {
|
|
|
util.MgoS.SaveBulk("spider_warn_err", saveArr...)
|
|
@@ -138,15 +267,15 @@ func GetSpiderWarnData() {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-func RepeatData(title string) int {
|
|
|
+func RepeatData(title string, publishtime int64) bool {
|
|
|
return util.MgoB.Count("bidding",
|
|
|
map[string]interface{}{
|
|
|
"title": title,
|
|
|
- "comeintime": map[string]interface{}{
|
|
|
- "$gte": util.GetTime(-3),
|
|
|
- "$lte": time.Now().Unix(),
|
|
|
+ "publishtime": map[string]interface{}{
|
|
|
+ "$gte": publishtime + 86400*3,
|
|
|
+ "$lte": publishtime - 86400*3,
|
|
|
},
|
|
|
- })
|
|
|
+ }) > 0
|
|
|
}
|
|
|
|
|
|
/*
|