|
@@ -27,12 +27,12 @@ import (
|
|
|
var (
|
|
|
lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
|
|
|
|
|
|
- cut = ju.NewCut() //获取正文并清理
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
- TaskList map[string]*ExtractTask //任务列表
|
|
|
- ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
- saveLimit = 100 //抽取日志批量保存
|
|
|
- PageSize = 5000 //查询分页
|
|
|
+ cut = ju.NewCut() // extracts the main body text and cleans it
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs, keyed by task info
|
|
|
+ TaskList map[string]*ExtractTask // extraction task list
|
|
|
+ ClearTaskList map[string]*ClearTask // cleanup task list
|
|
|
+ saveLimit = 100 // batch saving of extraction logs (presumably the batch size; confirm at the use site)
|
|
|
+ PageSize = 5000 // query pagination
|
|
|
Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1}`
|
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
)
|
|
@@ -529,7 +529,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
|
// log.Debug("抽取-规则", tmp)
|
|
|
//抽取-后置规则
|
|
|
for _, v := range vc.RuleBacks {
|
|
|
- ExtRegBack(j, v, e.TaskInfo)
|
|
|
+ ExtRegBack(j, v, e.TaskInfo, vc)
|
|
|
}
|
|
|
//kv规则
|
|
|
for _, v := range vc.KVRuleCores {
|
|
@@ -557,7 +557,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
|
}
|
|
|
for i := 0; i < 3; i++ {
|
|
|
for _, v := range vc.RuleBacks {
|
|
|
- ExtRegBack(j, v, e.TaskInfo)
|
|
|
+ ExtRegBack(j, v, e.TaskInfo, vc)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -566,11 +566,11 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
|
//全局后置规则
|
|
|
if isSite {
|
|
|
for _, v := range e.SiteRuleBacks {
|
|
|
- ExtRegBack(j, v, e.TaskInfo)
|
|
|
+ ExtRegBack(j, v, e.TaskInfo, nil)
|
|
|
}
|
|
|
} else {
|
|
|
for _, v := range e.RuleBacks {
|
|
|
- ExtRegBack(j, v, e.TaskInfo)
|
|
|
+ ExtRegBack(j, v, e.TaskInfo, nil)
|
|
|
}
|
|
|
}
|
|
|
//函数清理
|
|
@@ -671,7 +671,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
|
|
|
//抽取-后置规则
|
|
|
for _, v := range vc.RuleBacks {
|
|
|
if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
|
- ExtRegBack(j, v, e.TaskInfo)
|
|
|
+ ExtRegBack(j, v, e.TaskInfo, vc)
|
|
|
}
|
|
|
}
|
|
|
// log.Debug("抽取-后置规则", tmp)
|
|
@@ -681,7 +681,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
|
|
|
//全局后置规则
|
|
|
for _, v := range e.RuleBacks {
|
|
|
if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
|
- ExtRegBack(j, v, e.TaskInfo)
|
|
|
+ ExtRegBack(j, v, e.TaskInfo, nil)
|
|
|
}
|
|
|
}
|
|
|
//函数清理
|
|
@@ -1396,7 +1396,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job,
|
|
|
}
|
|
|
|
|
|
//后置过滤
|
|
|
-func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
+func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) {
|
|
|
defer qu.Catch()
|
|
|
if in.IsLua {
|
|
|
result := GetResultMapForLua(j)
|
|
@@ -1426,10 +1426,17 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
} else {
|
|
|
extinfo := map[string]interface{}{}
|
|
|
if in.Field != "" {
|
|
|
+ clearByTitle := false
|
|
|
+ if vc != nil && vc.ExtFrom == "title" && in.Field == "buyer" { // buyer handled by a title rule core: only buyer values extracted from the title go through this cleanup
|
|
|
+ clearByTitle = true
|
|
|
+ }
|
|
|
if j.Result[in.Field] != nil {
|
|
|
tmp := j.Result[in.Field]
|
|
|
exts := []interface{}{}
|
|
|
for k, v := range tmp {
|
|
|
+ if clearByTitle && v.ExtFrom != "title" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
//table抽取到的数据不清理
|
|
|
// if v.Type == "table" && v.Field != "projectname" {
|
|
|
// continue
|
|
@@ -2090,9 +2097,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
|
|
|
if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
|
|
|
marshalbt, _ := json.Marshal(j.Jsondata)
|
|
|
tmpjddata := make(map[string]interface{})
|
|
|
- json.Unmarshal(marshalbt,&tmpjddata)
|
|
|
- //jsondata清理
|
|
|
- clearJd(j.Jsondata)
|
|
|
+ json.Unmarshal(marshalbt, &tmpjddata)
|
|
|
for _, jdkey := range ju.JsonData {
|
|
|
if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 {
|
|
|
for tmpk, tmpv := range j.Result[jdkey][:5] {
|
|
@@ -2111,8 +2116,8 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
|
|
|
delete((*j.Jsondata), jdkey)
|
|
|
break
|
|
|
}
|
|
|
- }else {
|
|
|
- if (*j.Jsondata)[jdkey] == tmpv.Value{
|
|
|
+ } else {
|
|
|
+ if (*j.Jsondata)[jdkey] == tmpv.Value {
|
|
|
extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: tmpv.Value, Score: 100}
|
|
|
j.Result[jdkey] = append(j.Result[jdkey], extField)
|
|
|
ju.Sort(j.Result[jdkey])
|
|
@@ -2123,7 +2128,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- if len(*j.Jsondata)>0{
|
|
|
+ if len(*j.Jsondata) > 0 {
|
|
|
j.Result = JsonDataMergeProcessing(j, e)
|
|
|
}
|
|
|
j.Jsondata = &tmpjddata
|