瀏覽代碼

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

fengweiqiang 6 年之前
父節點
當前提交
bf93a7ce9d
共有 4 個文件被更改,包括 31 次插入和 226 次删除
  1. 19 33
      src/jy/extract/extract.go
  2. 0 186
      src/jy/extract/extractInit.go
  3. 8 3
      src/jy/extract/score.go
  4. 4 4
      src/jy/pretreated/analytable.go

+ 19 - 33
src/jy/extract/extract.go

@@ -388,7 +388,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				// log.Debug("抽取-规则", tmp)
 
 				//项目名称未能抽取到,标题来凑
-				if vc.Field == "projectname" {
+				if vc.Field == "projectname" && vc.ExtFrom == "title" {
 					//if len(j.Result[vc.Field]) < 1 {//如果抽取有结果,不走标题。待验证,暂时标题加入选举逻辑
 					field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
 					if tmp["blocktag"] != nil {
@@ -405,8 +405,10 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				}
 
 				//抽取-后置规则
-				for _, v := range vc.RuleBacks {
-					ExtRegBack(j, v, e.TaskInfo)
+				for i := 0; i < 3; i++ {
+					for _, v := range vc.RuleBacks {
+						ExtRegBack(j, v, e.TaskInfo)
+					}
 				}
 				// log.Debug("抽取-后置规则", tmp)
 			}
@@ -452,7 +454,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
 				before, _ := v.Value.(string)
 				v.Value = data[0]
-				BeforeAddClearFnLog("clearcfn", "函数清理", j.SourceMid, before, "clear_cfn", v, e)
+				BeforeAddClearFnLog(v.Type, "函数清理", j.SourceMid, before, v.MatchType, v, e)
 				//添加行数清理的日志
 				//清理特殊符号
 				lockclear.Lock()
@@ -463,30 +465,11 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					if text != "" {
 						v.Value = text
 					}
-					BeforeAddClearFnLog("clearsymbol", "特殊符号清理", j.SourceMid, before, "clear_symbol", v, e)
+					BeforeAddClearFnLog(v.Type, "特殊符号清理", j.SourceMid, before, v.MatchType, v, e)
 				}
 				//AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
 				lockclear.Unlock()
 			}
-			//项目编号,采购单位权重清理
-			//          tmpExtFields := make([]*ju.ExtField, 0)
-			//			tmpWeight := -999 //记录最大权重
-			//			tmpIndex := -999  //记录最大权重下标
-			//			if (key == "projectcode" || key == "buyer") && len(val) > 1 {
-			//				for i, v := range val {
-			//					if v.Weight == 0 {
-			//						tmpExtFields = append(tmpExtFields, v)
-			//						continue
-			//					} else if v.Weight > tmpWeight {
-			//						tmpWeight = v.Weight
-			//						tmpIndex = i
-			//					}
-			//				}
-			//				if tmpIndex != -999 {
-			//					tmpExtFields = append(tmpExtFields, val[tmpIndex])
-			//					j.Result[key] = tmpExtFields
-			//				}
-			//			}
 		}
 		PackageDetail(j, e) //处理分包信息
 		//		bs, _ := json.Marshal(j.Result)
@@ -646,14 +629,17 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 	if !b {
 		return
 	}
-	kvMap := getKvByLuaFields(extfrom, j, in, et.Tag)
-	if in.IsLua {
-		lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
-		lua.KvMap = kvMap
-		lua.Block = j.Block
-		extinfo := lua.RunScript("core")
-		if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
-			kvMap[in.Field] = tmps
+	kvMap := map[string][]map[string]interface{}{}
+	if extfrom != "title" {
+		kvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
+		if in.IsLua {
+			lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
+			lua.KvMap = kvMap
+			lua.Block = j.Block
+			extinfo := lua.RunScript("core")
+			if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
+				kvMap[in.Field] = tmps
+			}
 		}
 	}
 	if len(kvMap) > 0 {
@@ -661,7 +647,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 			j.Result[in.Field] = [](*ju.ExtField){}
 		}
 		for _, tmp := range kvMap[in.Field] {
-			field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: in.Field, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
+			field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: in.Field, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
 			if tmp["blocktag"] != nil {
 				btag := make(map[string]string)
 				for k := range tmp["blocktag"].(map[string]bool) {

+ 0 - 186
src/jy/extract/extractInit.go

@@ -471,192 +471,6 @@ func (e *ExtractTask) InitRuleCore() {
 	}
 }
 
-//加载抽取规则
-func (e *ExtractTask) InitRuleCore2() {
-	defer qu.Catch()
-	e.Fields = map[string]int{}
-	infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
-	e.RuleCores = make(map[string]map[string][]*RuleCore)
-	for _, v := range *infolist {
-		topclass := qu.ObjToString(v["topclass"])
-		if v["subclass"] == nil {
-			e.RuleCores[topclass] = make(map[string][]*RuleCore)
-			for attr, _ := range v["fields"].(map[string]interface{}) {
-				vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+attr+`"}`, `{}`)
-				e.RuleCores[topclass][attr] = append(e.RuleCores[topclass][attr], e.InfoRole(*vinfo)...)
-			}
-		} else {
-			for ca, fs := range v["subclass"].(map[string]interface{}) {
-				e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
-				for field, _ := range fs.(map[string]interface{}) {
-					vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+field+`"}`, `{}`)
-					e.RuleCores[topclass+"_"+ca][field] = append(e.RuleCores[topclass+"_"+ca][field], e.InfoRole(*vinfo)...)
-				}
-			}
-		}
-	}
-}
-func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
-	maps := []*RuleCore{}
-	if b, _ := vinfo["isuse"].(bool); !b {
-		return nil
-	}
-	s_field := qu.ObjToString(vinfo["s_field"])
-	pid := qu.BsonIdToSId(vinfo["_id"])
-	list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
-	for _, vv := range *list {
-		if b, _ := vv["isuse"].(bool); !b {
-			continue
-		}
-		rcore := &RuleCore{}
-		rcore.Field = s_field
-		rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
-		rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
-		//前置规则
-		rulePres := []*RegLuaInfo{}
-		plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
-		for _, v := range *plist {
-			rinfo := &RegLuaInfo{
-				Field: qu.ObjToString(v["s_field"]),
-				Code:  v["s_code"].(string),
-				Name:  v["s_name"].(string),
-				IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
-			}
-			if rinfo.IsLua {
-				rinfo.RuleText = v["s_luascript"].(string)
-				rulePres = append(rulePres, rinfo)
-			} else {
-				qu.Try(func() {
-					rinfo.RuleText = v["s_rule"].(string)
-					tmp := strings.Split(rinfo.RuleText, "__")
-					var pattern string
-					if strings.Contains(tmp[0], "\\u") {
-						tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
-						tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
-						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
-					} else {
-						pattern = tmp[0]
-					}
-					if len(tmp) == 2 {
-						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
-					} else {
-						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
-					}
-					rulePres = append(rulePres, rinfo)
-				}, func(err interface{}) {
-					log.Debug(rinfo.Code, rinfo.Field, err)
-				})
-			}
-		}
-		rcore.RulePres = rulePres
-
-		//后置规则
-		ruleBacks := []*RegLuaInfo{}
-		blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
-		for _, v := range *blist {
-			rinfo := &RegLuaInfo{
-				Field: qu.ObjToString(v["s_field"]),
-				Code:  v["s_code"].(string),
-				Name:  v["s_name"].(string),
-				IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
-			}
-			if rinfo.IsLua {
-				rinfo.RuleText = v["s_luascript"].(string)
-				ruleBacks = append(ruleBacks, rinfo)
-			} else {
-				qu.Try(func() {
-					rinfo.RuleText = v["s_rule"].(string)
-					tmp := strings.Split(rinfo.RuleText, "__")
-					var pattern string
-					if strings.Contains(tmp[0], "\\u") {
-						tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
-						tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
-						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
-					} else {
-						pattern = tmp[0]
-					}
-					if len(tmp) == 2 {
-						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
-					} else {
-						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
-					}
-					ruleBacks = append(ruleBacks, rinfo)
-				}, func(err interface{}) {
-					log.Debug(rinfo.Code, rinfo.Field, err)
-				})
-			}
-		}
-		rcore.RuleBacks = ruleBacks
-
-		//抽取规则
-		ruleCores := []*RegLuaInfo{}
-		clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
-		for _, v := range *clist {
-			if b, _ := v["isuse"].(bool); !b {
-				continue
-			}
-			field := qu.ObjToString(v["s_field"])
-			e.Fields[field] = 1 //加入抽取属性组备用
-			rinfo := &RegLuaInfo{
-				Field: field,
-				Code:  v["s_code"].(string),
-				Name:  v["s_name"].(string),
-				IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
-			}
-			if rinfo.IsLua {
-				rinfo.RuleText = v["s_luascript"].(string)
-				//提取全部属性
-				rinfo.LFields = getALLFields()
-				ruleCores = append(ruleCores, rinfo)
-			} else {
-				qu.Try(func() {
-					rinfo.RuleText = v["s_rule"].(string)
-					ptmp := strings.Split(rinfo.RuleText, "#")
-					sign := 0
-					if len(ptmp) == 2 {
-						if ptmp[1] == "正" {
-							sign = 1
-						} else if ptmp[1] == "负" {
-							sign = -1
-						}
-					}
-					tmp := strings.Split(ptmp[0], "__")
-					var pattern string
-					if strings.Contains(tmp[0], "\\u") {
-						tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
-						tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
-						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
-					} else {
-						pattern = tmp[0]
-					}
-					if len(tmp) == 2 {
-						epos := strings.Split(tmp[1], ",")
-						posm := map[string]int{}
-						for _, v := range epos {
-							ks := strings.Split(v, ":")
-							if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
-								posm[ks[1]] = qu.IntAll(ks[0])
-							} else { //(.*)招标公告__2
-								posm[rinfo.Field] = qu.IntAll(ks[0])
-							}
-						}
-						rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm, NumSign: sign}
-					} else {
-						rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
-					}
-					ruleCores = append(ruleCores, rinfo)
-				}, func(err interface{}) {
-					log.Debug(rinfo.Code, rinfo.Field, err)
-				})
-			}
-		}
-		rcore.RuleCores = ruleCores
-		//
-		maps = append(maps, rcore)
-	}
-	return maps
-}
-
 //加载分包抽取规则
 func (e *ExtractTask) InitPkgCore() {
 	defer qu.Catch()

+ 8 - 3
src/jy/extract/score.go

@@ -130,6 +130,10 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 		taglength := len(ftag[field])
 		locktag.Unlock()
 		for tmpsindex, tmpsvalue := range tmps {
+			//没有抽取到值,不打分
+			if string_value := fmt.Sprint(tmpsvalue.Value); string_value == "" || string_value == "0" || string_value == "<nil>" {
+				continue
+			}
 			lockscore.Lock()
 			describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
 			lockscore.Unlock()
@@ -170,9 +174,10 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 				typescore = CommonScore[tmpsvalue.Type]
 			}
 			lockscore.Unlock()
-
-			tmps[tmpsindex].Score += titlescore
-			tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: titlescore})
+			if titlescore > 0 {
+				tmps[tmpsindex].Score += titlescore
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: titlescore})
+			}
 			tmps[tmpsindex].Score += typescore
 			tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: typescore})
 

+ 4 - 4
src/jy/pretreated/analytable.go

@@ -70,7 +70,7 @@ var (
 	//简单判断金额
 	filter_zbje_jd = regexp.MustCompile("^[^售]{0,4}(价|额).{0,4}$")
 	//且排队以下字眼的key
-	filter_zbje_kn = regexp.MustCompile("得分|打分|时间|业绩|须知|分$")
+	filter_zbje_kn = regexp.MustCompile("得分|打分|时间|业绩|须知|分|要求$")
 	//且值包含以下字眼
 	filter_zbje_v = regexp.MustCompile("[¥$$0-9一二三四五六七八九十,,〇零点..壹贰叁肆伍陆柒捌玖拾百佰千仟万亿億元圆角分整正()::大小写]{2,16}")
 
@@ -175,10 +175,10 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (kvTags map[s
 			if tabletag == "" {
 				returntag = "中标情况"
 			}
-			kvTags["中标金额"] = append(kvTags["中标金额"], &u.Tag{Key: "中标金额", Value: v1, Weight: -100})
+			kvTags["中标金额"] = append(kvTags["中标金额"], &u.Tag{Key: k, Value: v1, Weight: -100})
 		} else if filter_zbdw_ky.MatchString(k) && !filter_zbdw_kn.MatchString(k) &&
 			filter_zbdw_v.MatchString(v1) {
-			kvTags["中标单位"] = append(kvTags["中标单位"], &u.Tag{Key: "中标单位", Value: v1, Weight: -100})
+			kvTags["中标单位"] = append(kvTags["中标单位"], &u.Tag{Key: k, Value: v1, Weight: -100})
 			if tabletag == "" {
 				returntag = "中标情况"
 			}
@@ -190,7 +190,7 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (kvTags map[s
 			if filter_tag_zb.MatchString(tabletag) || filter_tag_zb.MatchString(tabledesc) {
 				//u.Debug(v1, k, "-----", filter_zbdw_jd.MatchString(k), filter_zbdw_v.MatchString(v1))
 				if filter_zbje_jd.MatchString(k) && !filter_zbje_kn.MatchString(k) && filter_zbje_v.MatchString(v1) {
-					kvTags["中标金额"] = append(kvTags["中标金额"], &u.Tag{Key: "中标金额", Value: v1, Weight: -100})
+					kvTags["中标金额"] = append(kvTags["中标金额"], &u.Tag{Key: k, Value: v1, Weight: -100})
 
 				} /*else if filter_zbdw_jd.MatchString(k) && filter_zbdw_v.MatchString(v1) {
 					k1 = append(k1, "中标单位")