|
@@ -26,12 +26,12 @@ import (
|
|
|
var (
|
|
|
lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
|
|
|
|
|
|
- cut = ju.NewCut() //获取正文并清理
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
- TaskList map[string]*ExtractTask //任务列表
|
|
|
- ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
- saveLimit = 100 //抽取日志批量保存
|
|
|
- PageSize = 5000 //查询分页
|
|
|
+ cut = ju.NewCut() //获取正文并清理
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
+ TaskList map[string]*ExtractTask //任务列表
|
|
|
+ ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
+ saveLimit = 100 //抽取日志批量保存
|
|
|
+ PageSize = 5000 //查询分页
|
|
|
Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1}`
|
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
)
|
|
@@ -584,11 +584,11 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
|
}
|
|
|
data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
|
|
|
if key == "budget" || key == "bidamount" {
|
|
|
- if istrue, ok := data[len(data)-1].(bool); istrue && ok {
|
|
|
- j.Result[key][i].IsTrue = true
|
|
|
- } else {
|
|
|
- continue
|
|
|
- }
|
|
|
+ if istrue, ok := data[len(data)-1].(bool); istrue && ok {
|
|
|
+ j.Result[key][i].IsTrue = true
|
|
|
+ } else {
|
|
|
+ continue
|
|
|
+ }
|
|
|
}
|
|
|
before, _ := v.Value.(string)
|
|
|
v.Value = data[0]
|
|
@@ -654,6 +654,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
|
|
|
if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
|
ExtRuleCore(tmp, e, vc, j, isSite)
|
|
|
}
|
|
|
+
|
|
|
// log.Debug("抽取-规则", tmp)
|
|
|
|
|
|
//抽取-后置规则
|
|
@@ -757,9 +758,6 @@ func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju
|
|
|
for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
|
|
|
if k == 0 {
|
|
|
tp = "colon"
|
|
|
- // for _, vv := range v.Kvs {
|
|
|
- // qu.Debug(vv.Key, vv.Value)
|
|
|
- // }
|
|
|
} else if k == 1 {
|
|
|
tp = "space"
|
|
|
} else if k == 2 {
|
|
@@ -1120,6 +1118,14 @@ func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kv
|
|
|
// }
|
|
|
} else if k == 1 {
|
|
|
tp = "space"
|
|
|
+ // for _, vv := range v.Kvs {
|
|
|
+ // qu.Debug("space-kvs:", vv.Key, vv.Value)
|
|
|
+ // }
|
|
|
+ // for kkk, vv := range v.KvTags {
|
|
|
+ // for _, vvv := range vv {
|
|
|
+ // qu.Debug("space-tags", kkk, vvv.Key, vvv.Value)
|
|
|
+ // }
|
|
|
+ // }
|
|
|
} else if k == 2 {
|
|
|
tp = "table"
|
|
|
// for _, vv := range v.Kvs {
|
|
@@ -1582,9 +1588,9 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
} else if v.Field == "projectname" {
|
|
|
tmp[v.Field] = v.Value
|
|
|
break
|
|
|
- } else if v.Field == "bidamount"||v.Field =="budget"{
|
|
|
- if v.IsTrue{
|
|
|
- tmp[v.Field] =v.Value
|
|
|
+ } else if v.Field == "bidamount" || v.Field == "budget" {
|
|
|
+ if v.IsTrue {
|
|
|
+ tmp[v.Field] = v.Value
|
|
|
break
|
|
|
}
|
|
|
}
|
|
@@ -1699,6 +1705,10 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+ // fmt.Println("=============抽取结果================")
|
|
|
+ // for k, v := range tmp {
|
|
|
+ // qu.Debug(k, "---", v)
|
|
|
+ // }
|
|
|
//tmp["extract_content"] = j.Content
|
|
|
if e.TaskInfo.TestColl == "" {
|
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
@@ -1743,13 +1753,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
e.RWMutex.Unlock()
|
|
|
}
|
|
|
} else { //测试结果
|
|
|
- // fmt.Println("=============抽取结果================")
|
|
|
- // for k, v := range tmp {
|
|
|
- // qu.Debug(k, "---", v)
|
|
|
- // }
|
|
|
- // for field, _ := range e.Fields {
|
|
|
- // qu.Debug(field, "---", tmp[field])
|
|
|
- // }
|
|
|
delete(tmp, "_id")
|
|
|
if len(j.BlockPackage) > 0 { //分包详情
|
|
|
bs, _ := json.Marshal(j.BlockPackage)
|
|
@@ -1888,8 +1891,8 @@ func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
|
|
|
standardized = true
|
|
|
}
|
|
|
}
|
|
|
- if field == "budget"||field == "bidamount"{
|
|
|
- if !v.IsTrue{
|
|
|
+ if field == "budget" || field == "bidamount" {
|
|
|
+ if !v.IsTrue {
|
|
|
continue
|
|
|
}
|
|
|
}
|
|
@@ -1945,7 +1948,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
|
|
|
func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
|
|
|
defer qu.Catch()
|
|
|
i := redis.GetInt(field, field+"_"+fv) //查找redis
|
|
|
- if i == 0 { //reids未找到,执行规则匹配
|
|
|
+ if i == 0 { //redis未找到,执行规则匹配
|
|
|
val[field+"_isredis"] = false
|
|
|
e.RuleMatch(field, fv, val) //规则匹配
|
|
|
} else { //redis找到,打标识存库
|