|
@@ -26,12 +26,12 @@ import (
|
|
var (
|
|
var (
|
|
lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
|
|
lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
|
|
|
|
|
|
- cut = ju.NewCut() //获取正文并清理
|
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
|
- TaskList map[string]*ExtractTask //任务列表
|
|
|
|
- ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
|
- saveLimit = 100 //抽取日志批量保存
|
|
|
|
- PageSize = 5000 //查询分页
|
|
|
|
|
|
+ cut = ju.NewCut() //获取正文并清理
|
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
|
+ TaskList map[string]*ExtractTask //任务列表
|
|
|
|
+ ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
|
+ saveLimit = 100 //抽取日志批量保存
|
|
|
|
+ PageSize = 5000 //查询分页
|
|
Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1}`
|
|
Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1}`
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
)
|
|
)
|
|
@@ -301,6 +301,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
CategorySecond: subtype,
|
|
CategorySecond: subtype,
|
|
Content: qu.ObjToString(doc["detail"]),
|
|
Content: qu.ObjToString(doc["detail"]),
|
|
SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
|
|
+ Site: qu.ObjToString(doc["site"]),
|
|
//Domain: qu.ObjToString(doc["domain"]),
|
|
//Domain: qu.ObjToString(doc["domain"]),
|
|
//Href: qu.ObjToString(doc["href"]),
|
|
//Href: qu.ObjToString(doc["href"]),
|
|
Title: qu.ObjToString(doc["title"]),
|
|
Title: qu.ObjToString(doc["title"]),
|
|
@@ -318,6 +319,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
Category: toptype,
|
|
Category: toptype,
|
|
Content: qu.ObjToString(doc["detailfile"]),
|
|
Content: qu.ObjToString(doc["detailfile"]),
|
|
SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
|
|
+ Site: qu.ObjToString(doc["site"]),
|
|
Title: qu.ObjToString(doc["title"]),
|
|
Title: qu.ObjToString(doc["title"]),
|
|
Data: &doc,
|
|
Data: &doc,
|
|
City: qu.ObjToString(doc["city"]),
|
|
City: qu.ObjToString(doc["city"]),
|
|
@@ -1523,6 +1525,8 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
auxinfo := auxInfo(j)
|
|
auxinfo := auxInfo(j)
|
|
//从排序结果中取值
|
|
//从排序结果中取值
|
|
tmp := map[string]interface{}{} //抽取值
|
|
tmp := map[string]interface{}{} //抽取值
|
|
|
|
+ tmp["spidercode"] = j.SpiderCode
|
|
|
|
+ tmp["site"] = j.Site
|
|
tmp["jsondata"] = j.Jsondata
|
|
tmp["jsondata"] = j.Jsondata
|
|
tmp["fieldall"] = auxinfo
|
|
tmp["fieldall"] = auxinfo
|
|
for _, val := range result {
|
|
for _, val := range result {
|
|
@@ -1871,7 +1875,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
|
|
func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
|
|
func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
i := redis.GetInt(field, field+"_"+fv) //查找redis
|
|
i := redis.GetInt(field, field+"_"+fv) //查找redis
|
|
- if i == 0 { //reids未找到,执行规则匹配
|
|
|
|
|
|
+ if i == 0 { //redis未找到,执行规则匹配
|
|
val[field+"_isredis"] = false
|
|
val[field+"_isredis"] = false
|
|
e.RuleMatch(field, fv, val) //规则匹配
|
|
e.RuleMatch(field, fv, val) //规则匹配
|
|
} else { //redis找到,打标识存库
|
|
} else { //redis找到,打标识存库
|