Эх сурвалжийг харах

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

maxiaoshan 6 жил өмнө
parent
commit
5b19187cf6

+ 3 - 3
src/config.json

@@ -3,14 +3,14 @@
     "mgodb": "192.168.3.207:27082",
     "dbsize": 2,
     "dbname": "extract_kf",
-    "redis": "buyer=192.168.3.207:3379,winner=192.168.3.207:3379,agency=192.168.3.207:3379",
+    "redis": "buyer=192.168.3.207:1377,winner=192.168.3.207:1378,agency=192.168.3.207:1379",
     "elasticsearch": "http://192.168.3.11:9800",
     "elasticPoolSize": 30,
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
-    "saveresult": true,
+    "saveresult": false,
     "qualityaudit": false,
-    "saveblock": true,
+    "saveblock": false,
     "filelength": 100000,
     "iscltlog": false,
     "brandgoods": false,

+ 60 - 42
src/jy/extract/extract.go

@@ -419,27 +419,34 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 		}
 		//候选人加入
 		if len(j.Winnerorder) > 0 {
-			winner := &ju.ExtField{
-				Field:     "winner",
-				Code:      "",
-				RuleText:  "",
-				Type:      "winnerorder",
-				MatchType: "winnerorder",
-				ExtFrom:   "",
-				Value:     j.Winnerorder[0]["entname"],
-				Score:     0,
-			}
-			if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
-				winner.Score = -5
+			//候选人中标金额
+			if price := j.Winnerorder[0]["price"]; price != nil {
+				bidamount := &ju.ExtField{
+					Field:     "bidamount",
+					Code:      "",
+					RuleText:  "",
+					Type:      "winnerorder",
+					MatchType: "winnerorder",
+					ExtFrom:   "",
+					Value:     price,
+					Score:     0,
+				}
+				j.Result["bidamount"] = []*ju.ExtField{bidamount}
 			}
-			winners := j.Result["winner"]
-			if winners != nil {
-				winners = append(winners, winner)
-			} else {
-				winners = []*ju.ExtField{}
-				winners = append(winners, winner)
+			//候选人中标单位
+			if entname := j.Winnerorder[0]["entname"]; entname != nil {
+				winner := &ju.ExtField{
+					Field:     "winner",
+					Code:      "",
+					RuleText:  "",
+					Type:      "winnerorder",
+					MatchType: "winnerorder",
+					ExtFrom:   "",
+					Value:     entname,
+					Score:     0,
+				}
+				j.Result["winner"] = []*ju.ExtField{winner}
 			}
-			j.Result["winner"] = winners
 		}
 		//函数清理
 		for key, val := range j.Result {
@@ -532,27 +539,34 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
 		}
 		//候选人加入
 		if len(j.Winnerorder) > 0 {
-			winner := &ju.ExtField{
-				Field:     "winner",
-				Code:      "",
-				RuleText:  "",
-				Type:      "winnerorder",
-				MatchType: "winnerorder",
-				ExtFrom:   "",
-				Value:     j.Winnerorder[0]["entname"],
-				Score:     0,
-			}
-			if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
-				winner.Score = -5
+			//候选人中标金额
+			if price := j.Winnerorder[0]["price"]; price != nil {
+				bidamount := &ju.ExtField{
+					Field:     "bidamount",
+					Code:      "",
+					RuleText:  "",
+					Type:      "winnerorder",
+					MatchType: "winnerorder",
+					ExtFrom:   "",
+					Value:     price,
+					Score:     0,
+				}
+				j.Result["bidamount"] = []*ju.ExtField{bidamount}
 			}
-			winners := j.Result["winner"]
-			if winners != nil {
-				winners = append(winners, winner)
-			} else {
-				winners = []*ju.ExtField{}
-				winners = append(winners, winner)
+			//候选人中标单位
+			if entname := j.Winnerorder[0]["entname"]; entname != nil {
+				winner := &ju.ExtField{
+					Field:     "winner",
+					Code:      "",
+					RuleText:  "",
+					Type:      "winnerorder",
+					MatchType: "winnerorder",
+					ExtFrom:   "",
+					Value:     entname,
+					Score:     0,
+				}
+				j.Result["winner"] = []*ju.ExtField{winner}
 			}
-			j.Result["winner"] = winners
 		}
 		//函数清理
 		for key, val := range j.Result {
@@ -737,6 +751,7 @@ func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) map[string][]map
 					text := ju.TrimLRSpace(vv.Value, "")
 					if text != "" {
 						kvmap[field] = append(kvmap[field], map[string]interface{}{
+							"code":        "CL_" + vv.Key,
 							"field":       field,
 							"ruletext":    vv.Key,
 							"extfrom":     vc.ExtFrom,
@@ -917,10 +932,10 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 					if text != "" {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
 					}
-					j.Result[in.Field][k].Value = text
 					if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
 						continue
 					}
+					j.Result[in.Field][k].Value = text
 					exts = append(exts, map[string]interface{}{
 						"field":     v.Field,
 						"code":      v.Code,
@@ -947,10 +962,10 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 					if text != "" {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
 					}
-					j.Result[key][k].Value = text
 					if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
 						continue
 					}
+					j.Result[key][k].Value = text
 					exts = append(exts, map[string]interface{}{
 						"field":     v.Field,
 						"code":      v.Code,
@@ -1001,7 +1016,7 @@ func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *Re
 		return
 	}
 	logdata := map[string]interface{}{
-		"code":       v.Code,
+		"code":       qu.If(v.Code == "", "kv", v.Code),
 		"name":       v.Name,
 		"type":       ftype,
 		"ruletext":   v.RuleText,
@@ -1101,10 +1116,13 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		tmp := map[string]interface{}{} //抽取值
 		tmp["fieldall"] = auxinfo
 		for _, val := range result {
-			for _, v := range val { //取第一个非负数
+			for _, v := range val { //取第一个非负数,项目名称除外
 				if v.Score > -1 {
 					tmp[v.Field] = v.Value
 					break
+				} else if v.Field == "projectname" {
+					tmp[v.Field] = v.Value
+					break
 				}
 			}
 		}

+ 1 - 38
src/jy/extract/score.go

@@ -98,25 +98,6 @@ func init() {
 					}
 				}
 			}
-			if winnerorders, ok := tmp["winnerorder"].([]interface{}); ok {
-				for _, winnerorder := range winnerorders {
-					if p, ok := winnerorder.(map[string]interface{}); ok {
-						qu.Try(func() {
-							strReq, _ := p["regstr"].(string)
-							if strings.Contains(strReq, "\\u") {
-								strReq = strings.Replace(strReq, "\\", "\\\\", -1)
-								strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
-								strReq, _ = strconv.Unquote(`"` + strReq + `"`)
-								p["regexp"] = regexp.MustCompile(strReq)
-							} else {
-								p["regexp"] = regexp.MustCompile(strReq)
-							}
-						}, func(err interface{}) {
-							log.Println(err)
-						})
-					}
-				}
-			}
 		}
 	}
 }
@@ -271,26 +252,8 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 						}
 					}
 				}
-				//4.中标候选人打分
-				if winnerorders, ok := scoreRule["winnerorder"].([]interface{}); ok {
-					for _, winnerorder := range winnerorders {
-						if p, ok := winnerorder.(map[string]interface{}); ok {
-							qu.Try(func() {
-								if p["regexp"] != nil {
-									reg := p["regexp"].(*regexp.Regexp)
-									if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
-										tmps[tmpsindex].Score += qu.Float64All(p["score"])
-										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "位置打分winnerorder" + fmt.Sprint(p["describe"]), Code: "winnerorder", RuleText: reg.String(), ScoreFrom: "fieldscore.json.winnerorder", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
-									}
-								}
-							}, func(err interface{}) {
-								log.Println(err)
-							})
-						}
-					}
-				}
 			}
-			//5.数据范围打分
+			//4.数据范围打分
 			if scoreRule["type"] == "float" {
 				min := qu.IntAll(scoreRule["min"])
 				max := qu.IntAll(scoreRule["max"])

+ 15 - 3
src/jy/pretreated/analystep.go

@@ -46,6 +46,7 @@ func AnalyStart(job *util.Job) {
 			//新加 未分块table中未能解析到中标候选人,从正文中解析
 			if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
 				bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
+				job.Winnerorder = bl.Winnerorder
 			}
 			job.Block = append(job.Block, bl)
 		}
@@ -77,6 +78,7 @@ func AnalyStart(job *util.Job) {
 		//新加 未分块table中未能解析到中标候选人,从正文中解析
 		if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
 			bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
+			job.Winnerorder = bl.Winnerorder
 		}
 		//log.Println(bl.Text)
 		job.Block = append(job.Block, bl)
@@ -105,10 +107,20 @@ func FindProjectCode(newCon string, job *util.Job) {
 	}
 	var proCode string
 	blCode := &util.Block{}
-	if projectcodeRegAll.MatchString(newCon){//项目名称项目编号一起的5d424bdfa5cb26b9b7ac7a85
+	if newCon = projectcodeRegAll.FindString(newCon); newCon != "" { //项目名称项目编号一起的
+		//5d424bdfa5cb26b9b7ac7a85
+		//5d425a48a5cb26b9b7df5fec
+		//5d425506a5cb26b9b7cd2c3c
 		splitStr := strings.Split(newCon, " ")
-		if len(splitStr) >=2{
-			newCon = "项目编号:"+splitStr[len(splitStr)-1]
+		if len(splitStr) >= 2 {
+			newCon = "项目编号:" + splitStr[len(splitStr)-1]
+		} else if len(splitStr) == 1 {
+			if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
+				newCon = "项目编号:" + tmpstr
+			}else if strings.Contains(newCon,"、"){
+				tmpstrs :=strings.Split(newCon,"、")
+				newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
+			}
 		}
 	}
 	proCode = projectcodeReg.FindString(newCon)

+ 5 - 4
src/jy/pretreated/analytable.go

@@ -107,9 +107,10 @@ var (
 	underline                   = regexp.MustCompile("_+$")
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
 	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
-	projectcodeRegAll           = regexp.MustCompile(`采购项目名称及项目编号[:|:]?`)
-	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(编号|项目编号|标段编号|招标编号){1}(:|:)(.){4,30}()|\)|\])`)
-	projectcodeReg2             = regexp.MustCompile(`((?:^|\n)编号|项目编号|标段编号){1}(:|:)(.){4,30}[0-9a-zA-Z)号]`)
+	projectcodeRegAll           = regexp.MustCompile(`(采购项目|项目)名称及[项目]?编号[:|:]?.*[\n]?`)
+	projectcodeRegAll2          = regexp.MustCompile("[((].{4,30}[))]")
+	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(^([\s]?编号)|项目编号|标段编号|招标编号){1}(:|:)(.){4,30}()|\)|\])`)
+	projectcodeReg2             = regexp.MustCompile(`(^([\s]?编号)|项目编号){1}(:|:)(.{4,39})[0-9a-zA-Z)号]`)
 	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")
 	jsonReg                     = regexp.MustCompile(`\{.+:[^}]*\} `) //  \{".*\":\".+\"}
 	regHz                       = regexp.MustCompile("[\u4e00-\u9fa5]")
@@ -3150,7 +3151,7 @@ func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMa
 	for _, key := range table.SortKV.Keys { //遍历table.SortKV.Keys而不是直接遍历table.SortKV.Map是为了得到table头的顺序
 		val := table.SortKV.Map[key]
 		key = regReplAllSpace.ReplaceAllString(key, "")
-		key = strings.Replace(key, "", "", -1)    //处理一个特殊的采购量 经上层处理空格后未处理掉
+		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
 		if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]}
 			/*
 				{

+ 12 - 7
src/jy/pretreated/winnerorder.go

@@ -1,6 +1,7 @@
 package pretreated
 
 import (
+	"log"
 	//"jy/clear"
 	"jy/util"
 	qutil "qfw/util"
@@ -94,6 +95,17 @@ func (wo *WinnerOrderEntity) Find(text string, flag bool, from int) []map[string
 			}
 		}
 	}
+	//候选人有一半以上是错误的话,那么就认为全部抽错了
+	invalidCount := 0
+	for _, v := range winners {
+		if !findCandidate.MatchString(qutil.ObjToString(v["entname"])) {
+			invalidCount++
+		}
+	}
+	log.Println(invalidCount)
+	if invalidCount > len(winners)/2 {
+		return []map[string]interface{}{}
+	}
 	return winners
 }
 
@@ -208,13 +220,6 @@ func (wo *WinnerOrderEntity) findByReg(content string, blocks []string, reg_2 *r
 				winners = append(winners, object)
 				object = map[string]interface{}{}
 			}
-			//新加 从正文抽取时对v校验
-			if from == 1 || from == 3 {
-				v = findCandidate.FindString(v)
-				if v == "" {
-					continue
-				}
-			}
 			val := wo.clear("中标单位", v)
 			if val != nil {
 				count++

+ 29 - 16
src/res/fieldscore.json

@@ -131,6 +131,11 @@
                 "describe": "包含负分",
                 "regstr": "(附件|委托|代理|咨询|管理有限公司|管理顾问|招标失败|交易中心|不足|公告|变更|招标|废标|废止|流标|中标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
                 "score": -5
+            },
+            {
+                "describe": "包含负分不再展示",
+                "regstr": "(详见|提出|面向|施工)",
+                "score": -50
             }
         ],
         "length": [
@@ -174,6 +179,11 @@
                 "describe": "包含负分",
                 "regstr": "(附件|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
                 "score": -10
+            },
+			{
+                "describe": "非结尾",
+                "regstr": ".*[^集团|公司|学校|中心|家具城|门诊|\\[大中小\\]学|部|院|局|厂|店|所|队|社|室|厅|段|会|场|行]$",
+                "score": -5
             }
         ],
         "length": [
@@ -201,13 +211,6 @@
                     -1
                 ]
             }
-        ],
-        "winnerorder": [
-            {
-                "describe": "非结尾",
-                "regstr": ".*[^集团|公司|学校|中心|家具城|门诊|\\[大中小\\]学|部|院|局|厂|店|所|队|社|室|厅|段|会|场|行]$",
-                "score": -5
-            }
         ]
     },
     "agency": {
@@ -326,9 +329,14 @@
         "positivewords": [
             {
                 "describe": "有关键字加分",
-                "regstr": "(财采|招字|财购|赣购){1}",
+                "regstr": "(财采|招字|财购|赣购|豫财|管字|豫政){1}",
                 "score": 2
             },
+            {
+                "describe": "有关键字加分",
+                "regstr":"(【[0-9]{4}】.{2,5}号){1}",
+                "score": 0.5
+            },
             {
                 "describe": "号结尾加分",
                 "regstr": ".{4,35}(号)$",
@@ -336,6 +344,11 @@
             }
         ],
         "negativewords": [
+            {
+                "describe": "以什么开始的减分",
+                "regstr": "^【",
+                "score": -1
+            },
             {
                 "describe": "长度年月日纯数字减分",
                 "regstr": "^\\d{8}$",
@@ -347,14 +360,14 @@
                 "score": -0.2
             },
             {
-                "describe": "中文汉字大于5个",
-                "regstr": "[\\u4e00-\\u9fa5]{5,}",
-                "score": -2
+                "describe": "中文汉字大于6个",
+                "regstr": "[\\u4e00-\\u9fa5]{6,}",
+                "score": -1.3
             },
             {
                 "describe": "全为中文汉字或符号",
                 "regstr": "^[\\u4e00-\\u9fa5()()【】\\[\\],,。、::《》]+$",
-                "score": -2
+                "score": -20
             },
             {
                 "describe": "以一个汉字以上结束",
@@ -363,13 +376,13 @@
             },
             {
                 "describe": "包含负分",
-                "regstr": "(null|勘察|设计|设备|项目|标段|工程|监理|范围|分包|月|日|天|[,,。、::“”‘’_\"])",
+                "regstr": "(null|勘察|设计|设备|项目|标段|工程|监理|范围|分包|月|日|天|\\([0-9]{1}\\)|[,,。、::“”‘’_\"])",
                 "score": -1
             },
             {
-                "describe": "标段编号匹配-2",
-                "regstr": "/.{2}",
-                "score": -2
+                "describe": "标段编号匹配-0.3",
+                "regstr": "(\\([0-9]{1}\\)|-)",
+                "score": -0.3
             },
             {
                 "describe": "-结束没有抽取完",

BIN
src/src