|
@@ -382,41 +382,37 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
// log.Debug("抽取-前置规则", tmp)
|
|
|
|
|
|
//抽取-规则
|
|
|
- for _, v := range vc.RuleCores {
|
|
|
- ExtRegCore(vc.ExtFrom, tmp, j, v, e)
|
|
|
- }
|
|
|
+ ExtRuleCore(tmp, e, vc, j)
|
|
|
// log.Debug("抽取-规则", tmp)
|
|
|
|
|
|
+ //抽取-后置规则
|
|
|
+ for _, v := range vc.RuleBacks {
|
|
|
+ ExtRegBack(j, v, e.TaskInfo)
|
|
|
+ }
|
|
|
+ // log.Debug("抽取-后置规则", tmp)
|
|
|
+
|
|
|
//项目名称未能抽取到,标题来凑
|
|
|
if vc.Field == "projectname" && vc.ExtFrom == "title" {
|
|
|
- //if len(j.Result[vc.Field]) < 1 {//如果抽取有结果,不走标题。待验证,暂时标题加入选举逻辑
|
|
|
- field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
|
|
|
- if tmp["blocktag"] != nil {
|
|
|
- btag := make(map[string]string)
|
|
|
- for k := range tmp["blocktag"].(map[string]bool) {
|
|
|
- blocktag.Lock()
|
|
|
- btag[k] = TagConfigDesc[k]
|
|
|
- blocktag.Unlock()
|
|
|
+ isextitle := true
|
|
|
+ for _, v := range j.Result[vc.Field] {
|
|
|
+ if len([]rune(qu.ObjToString(v.Value))) > 5 {
|
|
|
+ isextitle = false
|
|
|
+ break
|
|
|
}
|
|
|
- field.BlockTag = btag
|
|
|
}
|
|
|
- j.Result[vc.Field] = append(j.Result[vc.Field], field)
|
|
|
- //}
|
|
|
- }
|
|
|
-
|
|
|
- //抽取-后置规则
|
|
|
- for i := 0; i < 3; i++ {
|
|
|
- for _, v := range vc.RuleBacks {
|
|
|
- ExtRegBack(j, v, e.TaskInfo)
|
|
|
+ if isextitle { //标题加入选举
|
|
|
+ field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
|
|
|
+ j.Result[vc.Field] = append(j.Result[vc.Field], field)
|
|
|
+ }
|
|
|
+ for i := 0; i < 3; i++ {
|
|
|
+ for _, v := range vc.RuleBacks {
|
|
|
+ ExtRegBack(j, v, e.TaskInfo)
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
- // log.Debug("抽取-后置规则", tmp)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- //for _, vvc := range j.Result["budget"] {
|
|
|
- //log.Debug("-----", fmt.Sprintf("%+v", vvc))
|
|
|
- //}
|
|
|
//全局后置规则
|
|
|
for _, v := range e.RuleBacks {
|
|
|
ExtRegBack(j, v, e.TaskInfo)
|
|
@@ -516,10 +512,8 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
|
// log.Debug("抽取-前置规则", tmp)
|
|
|
|
|
|
//抽取-规则
|
|
|
- for _, v := range vc.RuleCores {
|
|
|
- if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
|
- ExtRegCore(vc.ExtFrom, tmp, j, v, e)
|
|
|
- }
|
|
|
+ if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
|
+ ExtRuleCore(tmp, e, vc, j)
|
|
|
}
|
|
|
// log.Debug("抽取-规则", tmp)
|
|
|
|
|
@@ -622,32 +616,24 @@ func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInf
|
|
|
}
|
|
|
|
|
|
//抽取-规则
|
|
|
-func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
|
|
|
- defer qu.Catch()
|
|
|
- //根据field配置项目,是否抽取。例如:废标、流标等跳过,
|
|
|
- b := IsExtract(in.Field, j.Title, j.Content)
|
|
|
- if !b {
|
|
|
- return
|
|
|
+func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job) {
|
|
|
+ var kvMap map[string][]map[string]interface{}
|
|
|
+ if vc.ExtFrom != "title" {
|
|
|
+ kvMap = getKvByLuaFields(vc, j, e)
|
|
|
}
|
|
|
- kvMap := map[string][]map[string]interface{}{}
|
|
|
- if extfrom != "title" {
|
|
|
- kvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
|
|
|
- if in.IsLua {
|
|
|
- lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
|
|
|
- lua.KvMap = kvMap
|
|
|
- lua.Block = j.Block
|
|
|
- extinfo := lua.RunScript("core")
|
|
|
- if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
|
|
|
- kvMap[in.Field] = tmps
|
|
|
- }
|
|
|
+ for _, v := range vc.RuleCores {
|
|
|
+ if v.IsLua {
|
|
|
+ ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, kvMap)
|
|
|
+ } else {
|
|
|
+ ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e)
|
|
|
}
|
|
|
}
|
|
|
- if len(kvMap) > 0 {
|
|
|
- if j.Result[in.Field] == nil {
|
|
|
- j.Result[in.Field] = [](*ju.ExtField){}
|
|
|
+ for k, v := range kvMap {
|
|
|
+ if j.Result[k] == nil {
|
|
|
+ j.Result[k] = [](*ju.ExtField){}
|
|
|
}
|
|
|
- for _, tmp := range kvMap[in.Field] {
|
|
|
- field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: in.Field, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
|
|
|
+ for _, tmp := range v {
|
|
|
+ field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
|
|
|
if tmp["blocktag"] != nil {
|
|
|
btag := make(map[string]string)
|
|
|
for k := range tmp["blocktag"].(map[string]bool) {
|
|
@@ -659,45 +645,71 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
|
|
|
}
|
|
|
field.BlockTag = btag
|
|
|
}
|
|
|
- j.Result[in.Field] = append(j.Result[in.Field], field)
|
|
|
+ j.Result[k] = append(j.Result[k], field)
|
|
|
}
|
|
|
- AddExtLog("extract", j.SourceMid, nil, kvMap, in, et.TaskInfo) //抽取日志
|
|
|
- } else if !in.IsLua {
|
|
|
- //全文正则
|
|
|
- //text := qu.ObjToString(doc[extfrom])
|
|
|
- //if in.Field != "" {
|
|
|
- // extinfo := extRegCoreToResult(extfrom, text, j, in)
|
|
|
- // if len(extinfo) > 0 {
|
|
|
- // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
- // }
|
|
|
- //}
|
|
|
- //块抽取
|
|
|
- if in.Field != "" {
|
|
|
- if extfrom == "title" {
|
|
|
- extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+// ExtRuleCoreByKv runs a single Lua core rule over the key/value candidates in
+// kvMap and stores the script's output for in.Field back into kvMap.
+//
+// The call is a no-op when extfrom is "title" or when in is not a Lua rule.
+// Otherwise a ju.LuaScript is built from the rule (code, name, rule text) with
+// doc as its document, kvMap as its KvMap and j.Block as its Block, and is run
+// with RunScript("core"). Only a result of type []map[string]interface{} found
+// under in.Field is accepted; it replaces kvMap[in.Field]. Any other result
+// leaves kvMap untouched.
+//
+// NOTE(review): qu.Catch is deferred, presumably to recover from panics raised
+// by the script or by the map writes below; confirm against the qu package.
+func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap map[string][]map[string]interface{}) {
+	defer qu.Catch()
+	if extfrom == "title" || !in.IsLua {
+		return
+	}
+	lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
+	lua.KvMap = kvMap
+	lua.Block = j.Block
+	extinfo := lua.RunScript("core")
+	if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
+		for _, v := range tmps {
+			// Tag every entry with the rule that produced it.
+			v["core"] = in.Code
+			// ExtRuleCore fills ExtField.Code from qu.ObjToString(tmp["code"]),
+			// so also provide the rule code under "code" when the script left
+			// it empty; a value set by the script is never overwritten.
+			if qu.ObjToString(v["code"]) == "" {
+				v["code"] = in.Code
+			}
+		}
+		kvMap[in.Field] = tmps
+	}
+}
|
|
|
+
|
|
|
+//抽取-规则-正则
|
|
|
+func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
|
|
|
+ defer qu.Catch()
|
|
|
+ //根据field配置项目,是否抽取。例如:废标、流标等跳过,
|
|
|
+ b := IsExtract(in.Field, j.Title, j.Content)
|
|
|
+ if !b {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ //全文正则
|
|
|
+ //text := qu.ObjToString(doc[extfrom])
|
|
|
+ //if in.Field != "" {
|
|
|
+ // extinfo := extRegCoreToResult(extfrom, text, j, in)
|
|
|
+ // if len(extinfo) > 0 {
|
|
|
+ // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
+ // }
|
|
|
+ //}
|
|
|
+ //块抽取
|
|
|
+ if in.Field != "" {
|
|
|
+ if extfrom == "title" {
|
|
|
+ extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in)
|
|
|
+ if len(extinfo) > 0 {
|
|
|
+ AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ for _, v := range j.Block {
|
|
|
+ btag := make(map[string]string)
|
|
|
+ for k := range v.Classify {
|
|
|
+ blocktag.Lock()
|
|
|
+ btag[k] = TagConfigDesc[k]
|
|
|
+ blocktag.Unlock()
|
|
|
+ }
|
|
|
+ extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in)
|
|
|
if len(extinfo) > 0 {
|
|
|
AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
}
|
|
|
- } else {
|
|
|
- for _, v := range j.Block {
|
|
|
- btag := make(map[string]string)
|
|
|
- for k := range v.Classify {
|
|
|
- blocktag.Lock()
|
|
|
- btag[k] = TagConfigDesc[k]
|
|
|
- blocktag.Unlock()
|
|
|
- }
|
|
|
- extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in)
|
|
|
- if len(extinfo) > 0 {
|
|
|
- AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
- }
|
|
|
- }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
//lua脚本根据属性设置提取kv值
|
|
|
-func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
|
|
|
+func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) map[string][]map[string]interface{} {
|
|
|
kvmap := map[string][]map[string]interface{}{}
|
|
|
blocks := []*ju.Block{}
|
|
|
for _, bl := range j.Block {
|
|
@@ -707,8 +719,8 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
blocks = append(blocks, bl)
|
|
|
}
|
|
|
}
|
|
|
- for fieldname, field := range in.LFields {
|
|
|
- if field != in.Field {
|
|
|
+ for fieldname, field := range vc.LFields {
|
|
|
+ if field != vc.Field {
|
|
|
continue
|
|
|
}
|
|
|
for _, bl := range blocks {
|
|
@@ -729,9 +741,8 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
if text != "" {
|
|
|
kvmap[field] = append(kvmap[field], map[string]interface{}{
|
|
|
"field": field,
|
|
|
- "code": in.Code,
|
|
|
"ruletext": vv.Key,
|
|
|
- "extfrom": extfrom,
|
|
|
+ "extfrom": vc.ExtFrom,
|
|
|
"sourcevalue": text,
|
|
|
"value": text,
|
|
|
"type": tp,
|
|
@@ -744,6 +755,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+ AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
|
|
|
return kvmap
|
|
|
}
|
|
|
|
|
@@ -773,7 +785,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job,
|
|
|
"field": v.Field,
|
|
|
"code": v.Code,
|
|
|
"ruletext": v.RuleText,
|
|
|
- "extfrom": extfrom,
|
|
|
+ "extfrom": text,
|
|
|
"value": val,
|
|
|
"type": "regexp",
|
|
|
"matchtype": "regcontent",
|
|
@@ -810,7 +822,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job,
|
|
|
"field": v.Field,
|
|
|
"code": v.Code,
|
|
|
"ruletext": regArr[0],
|
|
|
- "extfrom": extfrom,
|
|
|
+ "extfrom": text,
|
|
|
"value": value,
|
|
|
"type": "regexp",
|
|
|
"matchtype": "regcontent",
|
|
@@ -846,7 +858,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job,
|
|
|
"field": v.Field,
|
|
|
"code": v.Code,
|
|
|
"ruletext": v.RuleText,
|
|
|
- "extfrom": extfrom,
|
|
|
+ "extfrom": text,
|
|
|
"value": val,
|
|
|
"type": "regexp",
|
|
|
"matchtype": "regcontent",
|