|
@@ -23,13 +23,13 @@ import (
|
|
|
)
|
|
|
|
|
|
var (
|
|
|
- lock sync.RWMutex // read-write mutex; ExtractDetail acquires it via lock.Lock()/lock.Unlock()
|
|
|
- cut = ju.NewCut() // extracts the body text and cleans it
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs, keyed by task info
|
|
|
- TaskList map[string]*ExtractTask // extraction task list
|
|
|
- ClearTaskList map[string]*ClearTask // cleanup task list
|
|
|
- saveLimit = 200 // extraction logs are saved in batches (presumably the batch size; TODO confirm)
|
|
|
- PageSize = 5000 // query pagination (page size)
|
|
|
+ lock sync.RWMutex // read-write mutex; ExtractDetail acquires it via lock.Lock()/lock.Unlock()
|
|
|
+ cut = ju.NewCut() // extracts the body text and cleans it
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs, keyed by task info
|
|
|
+ TaskList map[string]*ExtractTask // extraction task list
|
|
|
+ ClearTaskList map[string]*ClearTask // cleanup task list
|
|
|
+ saveLimit = 200 // extraction logs are saved in batches (presumably the batch size; TODO confirm)
|
|
|
+ PageSize = 5000 // query pagination (page size)
|
|
|
Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}` // JSON object mapping field names to 1 — presumably a query projection; TODO confirm against callers
|
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}` // smaller JSON field set of the same shape — presumably a second projection; TODO confirm
|
|
|
)
|
|
@@ -345,7 +345,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
//抽取规则
|
|
|
tmprules := map[string][]*RuleCore{}
|
|
|
lock.Lock()
|
|
|
- if j.Category == "*"{
|
|
|
+ if e.RuleCores[j.Category] == nil {
|
|
|
j.Category = "*_其他"
|
|
|
}
|
|
|
for k, vc1 := range e.RuleCores[j.Category] {
|
|
@@ -393,7 +393,13 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
}
|
|
|
}
|
|
|
} else {
|
|
|
- for _, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
|
|
|
+ var cores map[string][]*RuleCore
|
|
|
+ if e.RuleCores[j.Category+"_"+j.CategorySecond] == nil {
|
|
|
+ cores = e.RuleCores["*_其他"]
|
|
|
+ } else {
|
|
|
+ cores = e.RuleCores[j.Category+"_"+j.CategorySecond]
|
|
|
+ }
|
|
|
+ for _, vc1 := range cores {
|
|
|
for _, vc := range vc1 {
|
|
|
tmp := ju.DeepCopy(doc).(map[string]interface{})
|
|
|
//是否进入逻辑
|
|
@@ -477,7 +483,9 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
clear.MesField[key] != nil {
|
|
|
text := qu.ObjToString(v.Value)
|
|
|
text = clear.OtherClean(key, text)
|
|
|
- v.Value = text
|
|
|
+ if text != "" {
|
|
|
+ v.Value = text
|
|
|
+ }
|
|
|
}
|
|
|
lock.Unlock()
|
|
|
}
|
|
@@ -675,15 +683,15 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
|
|
|
}
|
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
|
for _, tmp := range tmps {
|
|
|
- field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"], }
|
|
|
- if extfrom == "title"{
|
|
|
+ field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"],}
|
|
|
+ if extfrom == "title" {
|
|
|
field.Score = 4
|
|
|
}
|
|
|
if tmp["blocktag"] != nil {
|
|
|
field.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
}
|
|
|
item := &ju.ScoreItem{Des: "初始化", Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"]}
|
|
|
- if extfrom =="title"{
|
|
|
+ if extfrom == "title" {
|
|
|
item.Score = 4
|
|
|
}
|
|
|
if tmp["scoreitem"] == nil {
|
|
@@ -916,6 +924,12 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
|
|
|
continue
|
|
|
}
|
|
|
val := text[pos[p]:pos[p+1]]
|
|
|
+ if val == "招标公告" {
|
|
|
+ return extinfo
|
|
|
+ }
|
|
|
+ if utf8.RuneCountInString(val) < 2 && extfrom == "title" {
|
|
|
+ val = text
|
|
|
+ }
|
|
|
tmps := []map[string]interface{}{}
|
|
|
tmp := map[string]interface{}{
|
|
|
"field": v.Field,
|
|
@@ -931,14 +945,14 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
|
|
|
extinfo[k] = tmps
|
|
|
if strings.TrimSpace(val) != "" {
|
|
|
exfield := ju.ExtField{Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
|
|
|
- if extfrom == "title"{
|
|
|
+ if extfrom == "title" {
|
|
|
exfield.Score = 4
|
|
|
}
|
|
|
if tmp["blocktag"] != nil {
|
|
|
exfield.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
}
|
|
|
item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
|
|
|
- if extfrom =="title"{
|
|
|
+ if extfrom == "title" {
|
|
|
item.Score = 4
|
|
|
}
|
|
|
if tmp["scoreitem"] == nil {
|
|
@@ -983,14 +997,14 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
|
|
|
j.Result[v.Field] = [](*ju.ExtField){}
|
|
|
}
|
|
|
field := &ju.ExtField{Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
|
|
|
- if extfrom == "title"{
|
|
|
+ if extfrom == "title" {
|
|
|
field.Score = 4
|
|
|
}
|
|
|
if tmp["blocktag"] != nil {
|
|
|
field.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
}
|
|
|
item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
|
|
|
- if extfrom =="title"{
|
|
|
+ if extfrom == "title" {
|
|
|
item.Score = 4
|
|
|
}
|
|
|
if tmp["scoreitem"] == nil {
|
|
@@ -1052,7 +1066,7 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
// continue
|
|
|
// }
|
|
|
text := qu.ObjToString(v.Value)
|
|
|
- if text != "" {
|
|
|
+ if text != "" && v.ExtFrom != "title" {
|
|
|
text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
|
|
|
}
|
|
|
j.Result[in.Field][k].Value = text
|
|
@@ -1370,17 +1384,28 @@ func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField
|
|
|
values := map[string][]*ju.SortObject{}
|
|
|
for key, val := range result {
|
|
|
fieldValue := map[string][]interface{}{}
|
|
|
+ cfscore := make(map[string]float64) //重复匹配加分
|
|
|
if iscore { //走打分
|
|
|
for _, v := range val {
|
|
|
if len(fmt.Sprint(v.Value)) < 1 {
|
|
|
continue //去除空串
|
|
|
}
|
|
|
- if fieldValue[fmt.Sprint(v.Value)+v.Type] == nil{
|
|
|
+ if v.Score >0 {
|
|
|
+ cfscore[fmt.Sprint(v.Value)] += 1
|
|
|
+ }
|
|
|
+ if fieldValue[fmt.Sprint(v.Value)+v.Type] == nil {
|
|
|
fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value}
|
|
|
- }else if fieldValue[fmt.Sprint(v.Value)+v.Type][0].(float64) < v.Score {
|
|
|
+ } else if fieldValue[fmt.Sprint(v.Value)+v.Type][0].(float64) < v.Score {
|
|
|
fieldValue[fmt.Sprint(v.Value)+v.Type][0] = v.Score
|
|
|
}
|
|
|
}
|
|
|
+ for key := range fieldValue {
|
|
|
+ for cfkey, cfv := range cfscore {
|
|
|
+ if strings.Contains(key, cfkey) {
|
|
|
+ fieldValue[key][0] = fieldValue[key][0].(float64) + cfv
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
} else { //不走打分,按出现频次
|
|
|
for _, v := range val {
|
|
|
if len(fmt.Sprint(v.Value)) < 1 {
|
|
@@ -1450,7 +1475,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
|
|
|
func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
|
|
|
defer qu.Catch()
|
|
|
i := redis.GetInt(field, field+"_"+fv) //查找redis
|
|
|
- if i == 0 { //reids未找到,执行规则匹配
|
|
|
+ if i == 0 { //reids未找到,执行规则匹配
|
|
|
val[field+"_isredis"] = false
|
|
|
e.RuleMatch(field, fv, val) //规则匹配
|
|
|
} else { //redis找到,打标识存库
|