Browse Source

Merge branch 'dev3.2' of ssh://192.168.3.207:10022/qmx/jy-data-extract into dev3.2

wcj 6 years ago
parent
commit
c861fb22b2

+ 19 - 33
src/jy/extract/extract.go

@@ -388,7 +388,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				// log.Debug("抽取-规则", tmp)
 
 				//项目名称未能抽取到,标题来凑
-				if vc.Field == "projectname" {
+				if vc.Field == "projectname" && vc.ExtFrom == "title" {
 					//if len(j.Result[vc.Field]) < 1 {//如果抽取有结果,不走标题。待验证,暂时标题加入选举逻辑
 					field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
 					if tmp["blocktag"] != nil {
@@ -405,8 +405,10 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				}
 
 				//抽取-后置规则
-				for _, v := range vc.RuleBacks {
-					ExtRegBack(j, v, e.TaskInfo)
+				for i := 0; i < 3; i++ {
+					for _, v := range vc.RuleBacks {
+						ExtRegBack(j, v, e.TaskInfo)
+					}
 				}
 				// log.Debug("抽取-后置规则", tmp)
 			}
@@ -452,7 +454,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
 				before, _ := v.Value.(string)
 				v.Value = data[0]
-				BeforeAddClearFnLog("clearcfn", "函数清理", j.SourceMid, before, "clear_cfn", v, e)
+				BeforeAddClearFnLog(v.Type, "函数清理", j.SourceMid, before, v.MatchType, v, e)
 				//添加行数清理的日志
 				//清理特殊符号
 				lockclear.Lock()
@@ -463,30 +465,11 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					if text != "" {
 						v.Value = text
 					}
-					BeforeAddClearFnLog("clearsymbol", "特殊符号清理", j.SourceMid, before, "clear_symbol", v, e)
+					BeforeAddClearFnLog(v.Type, "特殊符号清理", j.SourceMid, before, v.MatchType, v, e)
 				}
 				//AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
 				lockclear.Unlock()
 			}
-			//项目编号,采购单位权重清理
-			//          tmpExtFields := make([]*ju.ExtField, 0)
-			//			tmpWeight := -999 //记录最大权重
-			//			tmpIndex := -999  //记录最大权重下标
-			//			if (key == "projectcode" || key == "buyer") && len(val) > 1 {
-			//				for i, v := range val {
-			//					if v.Weight == 0 {
-			//						tmpExtFields = append(tmpExtFields, v)
-			//						continue
-			//					} else if v.Weight > tmpWeight {
-			//						tmpWeight = v.Weight
-			//						tmpIndex = i
-			//					}
-			//				}
-			//				if tmpIndex != -999 {
-			//					tmpExtFields = append(tmpExtFields, val[tmpIndex])
-			//					j.Result[key] = tmpExtFields
-			//				}
-			//			}
 		}
 		PackageDetail(j, e) //处理分包信息
 		//		bs, _ := json.Marshal(j.Result)
@@ -646,14 +629,17 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 	if !b {
 		return
 	}
-	kvMap := getKvByLuaFields(extfrom, j, in, et.Tag)
-	if in.IsLua {
-		lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
-		lua.KvMap = kvMap
-		lua.Block = j.Block
-		extinfo := lua.RunScript("core")
-		if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
-			kvMap[in.Field] = tmps
+	kvMap := map[string][]map[string]interface{}{}
+	if extfrom != "title" {
+		kvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
+		if in.IsLua {
+			lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
+			lua.KvMap = kvMap
+			lua.Block = j.Block
+			extinfo := lua.RunScript("core")
+			if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
+				kvMap[in.Field] = tmps
+			}
 		}
 	}
 	if len(kvMap) > 0 {
@@ -661,7 +647,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 			j.Result[in.Field] = [](*ju.ExtField){}
 		}
 		for _, tmp := range kvMap[in.Field] {
-			field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: in.Field, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
+			field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: in.Field, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
 			if tmp["blocktag"] != nil {
 				btag := make(map[string]string)
 				for k := range tmp["blocktag"].(map[string]bool) {

+ 0 - 186
src/jy/extract/extractInit.go

@@ -471,192 +471,6 @@ func (e *ExtractTask) InitRuleCore() {
 	}
 }
 
-//加载抽取规则
-func (e *ExtractTask) InitRuleCore2() {
-	defer qu.Catch()
-	e.Fields = map[string]int{}
-	infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
-	e.RuleCores = make(map[string]map[string][]*RuleCore)
-	for _, v := range *infolist {
-		topclass := qu.ObjToString(v["topclass"])
-		if v["subclass"] == nil {
-			e.RuleCores[topclass] = make(map[string][]*RuleCore)
-			for attr, _ := range v["fields"].(map[string]interface{}) {
-				vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+attr+`"}`, `{}`)
-				e.RuleCores[topclass][attr] = append(e.RuleCores[topclass][attr], e.InfoRole(*vinfo)...)
-			}
-		} else {
-			for ca, fs := range v["subclass"].(map[string]interface{}) {
-				e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
-				for field, _ := range fs.(map[string]interface{}) {
-					vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+field+`"}`, `{}`)
-					e.RuleCores[topclass+"_"+ca][field] = append(e.RuleCores[topclass+"_"+ca][field], e.InfoRole(*vinfo)...)
-				}
-			}
-		}
-	}
-}
-func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
-	maps := []*RuleCore{}
-	if b, _ := vinfo["isuse"].(bool); !b {
-		return nil
-	}
-	s_field := qu.ObjToString(vinfo["s_field"])
-	pid := qu.BsonIdToSId(vinfo["_id"])
-	list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
-	for _, vv := range *list {
-		if b, _ := vv["isuse"].(bool); !b {
-			continue
-		}
-		rcore := &RuleCore{}
-		rcore.Field = s_field
-		rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
-		rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
-		//前置规则
-		rulePres := []*RegLuaInfo{}
-		plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
-		for _, v := range *plist {
-			rinfo := &RegLuaInfo{
-				Field: qu.ObjToString(v["s_field"]),
-				Code:  v["s_code"].(string),
-				Name:  v["s_name"].(string),
-				IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
-			}
-			if rinfo.IsLua {
-				rinfo.RuleText = v["s_luascript"].(string)
-				rulePres = append(rulePres, rinfo)
-			} else {
-				qu.Try(func() {
-					rinfo.RuleText = v["s_rule"].(string)
-					tmp := strings.Split(rinfo.RuleText, "__")
-					var pattern string
-					if strings.Contains(tmp[0], "\\u") {
-						tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
-						tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
-						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
-					} else {
-						pattern = tmp[0]
-					}
-					if len(tmp) == 2 {
-						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
-					} else {
-						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
-					}
-					rulePres = append(rulePres, rinfo)
-				}, func(err interface{}) {
-					log.Debug(rinfo.Code, rinfo.Field, err)
-				})
-			}
-		}
-		rcore.RulePres = rulePres
-
-		//后置规则
-		ruleBacks := []*RegLuaInfo{}
-		blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
-		for _, v := range *blist {
-			rinfo := &RegLuaInfo{
-				Field: qu.ObjToString(v["s_field"]),
-				Code:  v["s_code"].(string),
-				Name:  v["s_name"].(string),
-				IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
-			}
-			if rinfo.IsLua {
-				rinfo.RuleText = v["s_luascript"].(string)
-				ruleBacks = append(ruleBacks, rinfo)
-			} else {
-				qu.Try(func() {
-					rinfo.RuleText = v["s_rule"].(string)
-					tmp := strings.Split(rinfo.RuleText, "__")
-					var pattern string
-					if strings.Contains(tmp[0], "\\u") {
-						tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
-						tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
-						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
-					} else {
-						pattern = tmp[0]
-					}
-					if len(tmp) == 2 {
-						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
-					} else {
-						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
-					}
-					ruleBacks = append(ruleBacks, rinfo)
-				}, func(err interface{}) {
-					log.Debug(rinfo.Code, rinfo.Field, err)
-				})
-			}
-		}
-		rcore.RuleBacks = ruleBacks
-
-		//抽取规则
-		ruleCores := []*RegLuaInfo{}
-		clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
-		for _, v := range *clist {
-			if b, _ := v["isuse"].(bool); !b {
-				continue
-			}
-			field := qu.ObjToString(v["s_field"])
-			e.Fields[field] = 1 //加入抽取属性组备用
-			rinfo := &RegLuaInfo{
-				Field: field,
-				Code:  v["s_code"].(string),
-				Name:  v["s_name"].(string),
-				IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
-			}
-			if rinfo.IsLua {
-				rinfo.RuleText = v["s_luascript"].(string)
-				//提取全部属性
-				rinfo.LFields = getALLFields()
-				ruleCores = append(ruleCores, rinfo)
-			} else {
-				qu.Try(func() {
-					rinfo.RuleText = v["s_rule"].(string)
-					ptmp := strings.Split(rinfo.RuleText, "#")
-					sign := 0
-					if len(ptmp) == 2 {
-						if ptmp[1] == "正" {
-							sign = 1
-						} else if ptmp[1] == "负" {
-							sign = -1
-						}
-					}
-					tmp := strings.Split(ptmp[0], "__")
-					var pattern string
-					if strings.Contains(tmp[0], "\\u") {
-						tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
-						tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
-						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
-					} else {
-						pattern = tmp[0]
-					}
-					if len(tmp) == 2 {
-						epos := strings.Split(tmp[1], ",")
-						posm := map[string]int{}
-						for _, v := range epos {
-							ks := strings.Split(v, ":")
-							if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
-								posm[ks[1]] = qu.IntAll(ks[0])
-							} else { //(.*)招标公告__2
-								posm[rinfo.Field] = qu.IntAll(ks[0])
-							}
-						}
-						rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm, NumSign: sign}
-					} else {
-						rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
-					}
-					ruleCores = append(ruleCores, rinfo)
-				}, func(err interface{}) {
-					log.Debug(rinfo.Code, rinfo.Field, err)
-				})
-			}
-		}
-		rcore.RuleCores = ruleCores
-		//
-		maps = append(maps, rcore)
-	}
-	return maps
-}
-
 //加载分包抽取规则
 func (e *ExtractTask) InitPkgCore() {
 	defer qu.Catch()

+ 4 - 3
src/jy/extract/score.go

@@ -174,9 +174,10 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 				typescore = CommonScore[tmpsvalue.Type]
 			}
 			lockscore.Unlock()
-
-			tmps[tmpsindex].Score += titlescore
-			tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: titlescore})
+			if titlescore > 0 {
+				tmps[tmpsindex].Score += titlescore
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: titlescore})
+			}
 			tmps[tmpsindex].Score += typescore
 			tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: typescore})
 

+ 4 - 1
src/jy/pretreated/analykv.go

@@ -196,7 +196,7 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 					v = str1
 					if k0 < len(strs)-1 {
 						s2 := u.TrimLRSpace(strings.Join(strs[k0+1], ""), "")
-						if len([]rune(s2)) < 10 && !Key.MatchString(s2) {
+						if len([]rune(s2)) < 10 && !regexp.MustCompile("^[0-9]+[、]+$").MatchString(s2) && !Key.MatchString(s2) {
 							v += s2
 							k0++
 						}
@@ -334,6 +334,9 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 }
 
 func keydetail(k, v string, m *SortMap, tag string, pos int, strs [][]string, matchMap map[string]map[string]bool, from int) {
+	if regexp.MustCompile("^[0-9]+[、]+$").MatchString(v){
+		return
+	}
 	k = space.ReplaceAllString(k, "")
 	if len([]rune(k)) > 1 {
 		if len([]rune(k)) < 5 && strings.HasPrefix(k, "联系") || ContactInfoMustReg.MatchString(k) {

+ 2 - 0
src/jy/pretreated/analystep.go

@@ -6,6 +6,7 @@ package pretreated
 import (
 	"encoding/json"
 	"jy/util"
+	//"log"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -40,6 +41,7 @@ func AnalyStart(job *util.Job) {
 					processTableInBlock(bl_bl, job)
 				}
 			}
+			FindProjectCode(bl.Text, job) //匹配项目编号
 			processTableInBlock(bl, job)
 			//新加 未分块table中未能解析到中标候选人,从正文中解析
 			if job.Winnerorder == nil || len(job.Winnerorder) == 0 {

+ 8 - 3
src/jy/pretreated/analytable.go

@@ -107,11 +107,12 @@ var (
 	underline                   = regexp.MustCompile("_+$")
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
 	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
-	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(编号|项目编号|标段编号){1}(:|:)(.){4,30}()|\)|\])`)
-	projectcodeReg2             = regexp.MustCompile(`((?:^|\n)编号|项目编号|标段编号){1}(:|:)(.){4,30}[0-9a-zA-Z]`)
+	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(编号|项目编号|标段编号|招标编号){1}(:|:)(.){4,30}()|\)|\])`)
+	projectcodeReg2             = regexp.MustCompile(`((?:^|\n)编号|项目编号|标段编号){1}(:|:)(.){4,30}[0-9a-zA-Z]`)
 	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")
 	jsonReg                     = regexp.MustCompile(`\{.+:[^}]*\} `) //  \{".*\":\".+\"}
 	regHz                       = regexp.MustCompile("[\u4e00-\u9fa5]")
+	winnerOrerReg               = regexp.MustCompile("(中标)?候选人")
 )
 
 //在解析时,判断表格元素是否隐藏
@@ -160,7 +161,11 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (kvTags map[s
 			if MhSpilt.MatchString(v1) {
 				t1.Weight -= 50
 			}
-			kvTags[t1.Value] = append(kvTags[t1.Value], &u.Tag{Key: k, Value: v1, Weight: t1.Weight})
+			if winnerOrerReg.MatchString(tabletag) && t1.Value == "采购单位联系人" { //处理table中项目负责人
+				kvTags[k] = append(kvTags[k], &u.Tag{Key: k, Value: v1, IsInvalid: true})
+			} else {
+				kvTags[t1.Value] = append(kvTags[t1.Value], &u.Tag{Key: k1, Value: v1, Weight: t1.Weight})
+			}
 		}
 		//k1 = res[0].Value
 	} else {

+ 30 - 4
src/res/fieldscore.json

@@ -328,8 +328,34 @@
     },
     "projectcode": {
         "type": "string",
-        "positivewords": [],
+        "positivewords": [
+            {
+                "describe": "有关键字加分",
+                "regstr": "(财采|招字|财购|赣购){1}",
+                "score": 2
+            },
+            {
+                "describe": "号结尾加分",
+                "regstr": ".{4,35}(号)$",
+                "score": 2
+            }
+        ],
         "negativewords": [
+            {
+                "describe": "长度年月日纯数字减分",
+                "regstr": "^\\d{8}$",
+                "score": -1
+            },
+            {
+                "describe": "纯大写英文字母",
+                "regstr": "^[A-Z]{5,}$",
+                "score": -0.2
+            },
+            {
+                "describe": "中文汉字大于5个",
+                "regstr": "[\\u4e00-\\u9fa5]{5,}",
+                "score": -2
+            },
             {
                 "describe": "全为中文汉字或符号",
                 "regstr": "^[\\u4e00-\\u9fa5()()【】\\[\\],,。、::《》]+$",
@@ -337,7 +363,7 @@
             },
             {
                 "describe": "以一个汉字以上结束",
-                "regstr": "[\\u4e00-\\u9fa5]{1,}$",
+                "regstr": "[\\u4e00-\\u9fa5/]{1,}$",
                 "score": -1
             },
             {
@@ -364,14 +390,14 @@
                 "describe": "[gt,lte,score]",
                 "range": [
                     4,
-                    35,
+                    36,
                     3
                 ]
             },
             {
                 "describe": "[gt,∞,score]",
                 "range": [
-                    35,
+                    36,
                     -1,
                     -1
                 ]