|
@@ -185,6 +185,11 @@ func RunExtractTask(taskId string) {
|
|
|
if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
|
|
|
continue
|
|
|
}
|
|
|
+ //根据标题判断是否抽取
|
|
|
+ b := IsExtract("title", qu.ObjToString(v["title"]), "")
|
|
|
+ if !b {
|
|
|
+ continue
|
|
|
+ }
|
|
|
_id := qu.BsonIdToSId(v["_id"])
|
|
|
//log.Debug(_id)
|
|
|
if !ext.IsRun {
|
|
@@ -289,7 +294,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
|
|
|
pretreated.AnalyStart(jf)
|
|
|
}
|
|
|
}, func(err interface{}) {
|
|
|
- log.Debug("pretreated.AnalyStart", err)
|
|
|
+ log.Debug("pretreated.AnalyStart", err, j.SourceMid)
|
|
|
})
|
|
|
return j, jf
|
|
|
}
|
|
@@ -438,6 +443,9 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
}
|
|
|
//函数清理
|
|
|
for key, val := range j.Result {
|
|
|
+ tmpExtFields := make([]*ju.ExtField, 0)
|
|
|
+ tmpWeight := -999 //记录最大权重
|
|
|
+ tmpIndex := -999 //记录最大权重下标
|
|
|
for _, v := range val {
|
|
|
lockclear.Lock()
|
|
|
cfn := e.ClearFn[key]
|
|
@@ -461,6 +469,22 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
//AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
|
|
|
lockclear.Unlock()
|
|
|
}
|
|
|
+ //项目编号,采购单位权重清理
|
|
|
+ if (key == "projectcode" || key == "buyer") && len(val) > 1 {
|
|
|
+ for i, v := range val {
|
|
|
+ if v.Weight == 0 {
|
|
|
+ tmpExtFields = append(tmpExtFields, v)
|
|
|
+ continue
|
|
|
+ } else if v.Weight > tmpWeight {
|
|
|
+ tmpWeight = v.Weight
|
|
|
+ tmpIndex = i
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if tmpIndex != -999 {
|
|
|
+ tmpExtFields = append(tmpExtFields, val[tmpIndex])
|
|
|
+ j.Result[key] = tmpExtFields
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
PackageDetail(j, e) //处理分包信息
|
|
|
// bs, _ := json.Marshal(j.Result)
|
|
@@ -615,7 +639,7 @@ func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInf
|
|
|
//抽取-规则
|
|
|
func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
|
|
|
defer qu.Catch()
|
|
|
- //废标、流标、ppp等跳过
|
|
|
+ //根据field配置项目,是否抽取。例如:废标、流标等跳过,
|
|
|
b := IsExtract(in.Field, j.Title, j.Content)
|
|
|
if !b {
|
|
|
return
|
|
@@ -632,11 +656,13 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
|
|
|
}
|
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
|
for _, tmp := range tmps {
|
|
|
- field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
|
|
|
+ field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
|
|
|
if tmp["blocktag"] != nil {
|
|
|
btag := make(map[string]string)
|
|
|
for k := range tmp["blocktag"].(map[string]bool) {
|
|
|
- btag[k] = TagConfigDesc[k]
|
|
|
+ if TagConfigDesc[k] != "" {
|
|
|
+ btag[k] = TagConfigDesc[k]
|
|
|
+ }
|
|
|
}
|
|
|
field.BlockTag = btag
|
|
|
}
|
|
@@ -683,8 +709,19 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
|
|
|
//lua脚本根据属性设置提取kv值
|
|
|
func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
|
|
|
kvmap := map[string][]map[string]interface{}{}
|
|
|
+ blocks := []*ju.Block{}
|
|
|
+ for _, bl := range j.Block {
|
|
|
+ if len(bl.Block) > 0 {
|
|
|
+ blocks = append(blocks, bl.Block...)
|
|
|
+ } else {
|
|
|
+ blocks = append(blocks, bl)
|
|
|
+ }
|
|
|
+ }
|
|
|
for fieldname, field := range in.LFields {
|
|
|
- for _, bl := range j.Block {
|
|
|
+ if field != in.Field {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ for _, bl := range blocks {
|
|
|
tp := ""
|
|
|
for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
|
|
|
if k == 0 {
|
|
@@ -709,7 +746,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
"value": text,
|
|
|
"type": tp,
|
|
|
"matchtype": "tag_string",
|
|
|
- "blocktag": bl.Tag,
|
|
|
+ "blocktag": bl.Classify,
|
|
|
"weight": vv.Weight,
|
|
|
})
|
|
|
}
|
|
@@ -882,6 +919,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
|
|
|
}
|
|
|
j.Result[in.Field][k].Value = text
|
|
|
+ if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
|
|
|
+ continue
|
|
|
+ }
|
|
|
exts = append(exts, map[string]interface{}{
|
|
|
"field": v.Field,
|
|
|
"code": v.Code,
|
|
@@ -909,6 +949,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
|
|
|
}
|
|
|
j.Result[key][k].Value = text
|
|
|
+ if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
|
|
|
+ continue
|
|
|
+ }
|
|
|
exts = append(exts, map[string]interface{}{
|
|
|
"field": v.Field,
|
|
|
"code": v.Code,
|