Selaa lähdekoodia

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

zhangjinkun 6 vuotta sitten
vanhempi
commit
0361d25105

+ 96 - 68
src/jy/extract/extract.go

@@ -366,7 +366,6 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 			}
 		}
 		lockrule.Unlock()
-
 		//抽取规则
 		for _, vc1 := range tmprules {
 			for _, vc := range vc1 {
@@ -414,42 +413,10 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				}
 			}
 		}
-
 		//全局后置规则
 		for _, v := range e.RuleBacks {
 			ExtRegBack(j, v, e.TaskInfo)
 		}
-		//候选人加入
-		if len(j.Winnerorder) > 0 {
-			//候选人中标金额
-			if price := j.Winnerorder[0]["price"]; price != nil {
-				bidamount := &ju.ExtField{
-					Field:     "bidamount",
-					Code:      "",
-					RuleText:  "",
-					Type:      "winnerorder",
-					MatchType: "winnerorder",
-					ExtFrom:   "",
-					Value:     price,
-					Score:     0,
-				}
-				j.Result["bidamount"] = []*ju.ExtField{bidamount}
-			}
-			//候选人中标单位
-			if entname := j.Winnerorder[0]["entname"]; entname != nil {
-				winner := &ju.ExtField{
-					Field:     "winner",
-					Code:      "",
-					RuleText:  "",
-					Type:      "winnerorder",
-					MatchType: "winnerorder",
-					ExtFrom:   "",
-					Value:     entname,
-					Score:     0,
-				}
-				j.Result["winner"] = []*ju.ExtField{winner}
-			}
-		}
 		//函数清理
 		for key, val := range j.Result {
 			for _, v := range val {
@@ -539,37 +506,6 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
 				ExtRegBack(j, v, e.TaskInfo)
 			}
 		}
-		//候选人加入
-		if len(j.Winnerorder) > 0 {
-			//候选人中标金额
-			if price := j.Winnerorder[0]["price"]; price != nil {
-				bidamount := &ju.ExtField{
-					Field:     "bidamount",
-					Code:      "",
-					RuleText:  "",
-					Type:      "winnerorder",
-					MatchType: "winnerorder",
-					ExtFrom:   "",
-					Value:     price,
-					Score:     0,
-				}
-				j.Result["bidamount"] = []*ju.ExtField{bidamount}
-			}
-			//候选人中标单位
-			if entname := j.Winnerorder[0]["entname"]; entname != nil {
-				winner := &ju.ExtField{
-					Field:     "winner",
-					Code:      "",
-					RuleText:  "",
-					Type:      "winnerorder",
-					MatchType: "winnerorder",
-					ExtFrom:   "",
-					Value:     entname,
-					Score:     0,
-				}
-				j.Result["winner"] = []*ju.ExtField{winner}
-			}
-		}
 		//函数清理
 		for key, val := range j.Result {
 			for _, v := range val {
@@ -630,14 +566,16 @@ func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInf
 
 //抽取-规则
 func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job) {
+	//候选人加入
 	var kvMap map[string][]map[string]interface{}
+	extByReg := true
 	if vc.ExtFrom != "title" {
-		kvMap = getKvByLuaFields(vc, j, e)
+		kvMap, extByReg = getKvByLuaFields(vc, j, e)
 	}
 	for _, v := range vc.RuleCores {
 		if v.IsLua {
 			ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, kvMap)
-		} else {
+		} else if extByReg {
 			ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e)
 		}
 	}
@@ -722,8 +660,65 @@ func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in
 }
 
 //lua脚本根据属性设置提取kv值
-func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) map[string][]map[string]interface{} {
+func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
 	kvmap := map[string][]map[string]interface{}{}
+	if len(j.Winnerorder) > 0 {
+		if vc.Field == "bidamount" {
+			for _, v := range j.Winnerorder {
+				kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
+					"code":        "winnerorder",
+					"field":       vc.Field,
+					"ruletext":    "中标候选人",
+					"extfrom":     vc.ExtFrom,
+					"sourcevalue": "中标候选人",
+					"value":       v["price"],
+					"type":        "winnerorder",
+					"matchtype":   "winnerorder",
+				})
+			}
+			//候选人中标金额
+			if price := j.Winnerorder[0]["price"]; price != nil {
+				kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
+					"code":        "CL_中标候选人",
+					"field":       vc.Field,
+					"ruletext":    "中标候选人",
+					"extfrom":     vc.ExtFrom,
+					"sourcevalue": "中标候选人",
+					"value":       price,
+					"type":        "winnerorder",
+					"matchtype":   "winnerorder",
+				})
+				return kvmap, false
+			}
+		} else if vc.Field == "winner" {
+			for _, v := range j.Winnerorder {
+				kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
+					"code":        "winnerorder",
+					"field":       vc.Field,
+					"ruletext":    "中标候选人",
+					"extfrom":     vc.ExtFrom,
+					"sourcevalue": "中标候选人",
+					"value":       v["entname"],
+					"type":        "winnerorder",
+					"matchtype":   "winnerorder",
+				})
+			}
+			//候选人中标单位
+			if entname := j.Winnerorder[0]["entname"]; entname != nil {
+				kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
+					"code":        "CL_中标候选人",
+					"field":       vc.Field,
+					"ruletext":    "中标候选人",
+					"extfrom":     vc.ExtFrom,
+					"sourcevalue": "中标候选人",
+					"value":       entname,
+					"type":        "winnerorder",
+					"matchtype":   "winnerorder",
+				})
+				return kvmap, false
+			}
+		}
+	}
 	for fieldname, field := range vc.LFields {
 		if field != vc.Field {
 			continue
@@ -731,7 +726,7 @@ func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) map[string][]map
 		extractFromKv(field, fieldname, j.Block, vc, kvmap)
 	}
 	AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
-	return kvmap
+	return kvmap, true
 }
 
 func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}) {
@@ -1108,6 +1103,8 @@ type FieldValue struct {
 //分析抽取结果并保存
 func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 	qu.Try(func() {
+		//重新取出清理过后的中标候选人
+		resetWinnerorder(j)
 		doc, result, _id := funcAnalysis(j, e.Tag)
 		if isSaveTag, _ := ju.Config["isSaveTag"].(bool); isSaveTag {
 			go otherNeedSave(j, result, e)
@@ -1519,3 +1516,34 @@ func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
 	}
 	return SMap
 }
+
+// resetWinnerorder copies the values of the j.Result["winner"] and j.Result["bidamount"] entries whose Code is "winnerorder" back into j.Winnerorder (keys "entname" and "price") and removes those entries from j.Result. Per the original comment it is meant to run after the bid-winner candidates have been cleaned, so the candidates pick up the cleaned values. It does nothing when j.Winnerorder is empty.
+func resetWinnerorder(j *ju.Job) {
+	if len(j.Winnerorder) == 0 {
+		return
+	}
+	// Winning bidder: the n-th "winnerorder" entry of j.Result["winner"] becomes the entname of j.Winnerorder[n].
+	i := 0 // NOTE(review): i is never checked against len(j.Winnerorder); confirm the result can never hold more "winnerorder" entries than there are candidates, otherwise the index below panics.
+	winners := []*ju.ExtField{} // entries with any other Code, kept in their original order
+	for _, v := range j.Result["winner"] {
+		if v.Code == "winnerorder" {
+			j.Winnerorder[i]["entname"] = v.Value
+			i++
+		} else {
+			winners = append(winners, v)
+		}
+	}
+	j.Result["winner"] = winners // always reassigned, so the key ends up present even when no entry is left
+	// Winning bid amount: same procedure, writing the "price" key of each candidate.
+	i = 0 // restart the positional pairing for the second field
+	bidamounts := []*ju.ExtField{} // non-"winnerorder" entries that stay in the result
+	for _, v := range j.Result["bidamount"] {
+		if v.Code == "winnerorder" {
+			j.Winnerorder[i]["price"] = v.Value
+			i++
+		} else {
+			bidamounts = append(bidamounts, v)
+		}
+	}
+	j.Result["bidamount"] = bidamounts
+}

+ 38 - 38
src/jy/pretreated/analystep.go

@@ -7,7 +7,6 @@ import (
 	"encoding/json"
 	"jy/util"
 	//"log"
-	"unicode/utf8"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -52,6 +51,7 @@ func AnalyStart(job *util.Job) {
 			job.Block = append(job.Block, bl)
 		}
 	} else { //未分块,创建分块
+		//log.Println(con)
 		bl := &util.Block{}
 		newCon := con
 		if len(tabs) > 0 { //解析表格逻辑
@@ -102,50 +102,50 @@ func processTableInBlock(bl *util.Block, job *util.Job) {
 
 //匹配项目编号
 func FindProjectCode(newCon string, job *util.Job) {
-	newCon = TextAfterRemoveTable(newCon)
+	newCon = HtmlToText(newCon)
 	if strings.TrimSpace(newCon) == "" {
 		return
 	}
 	var proCode string
 	blCode := &util.Block{}
-	if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
-		//5d424bdfa5cb26b9b7ac7a85
-		//5d425a48a5cb26b9b7df5fec
-		//5d425506a5cb26b9b7cd2c3c
-		splitStr := strings.Split(newConTMP, " ")
-		if len(splitStr) >= 2 {
-			if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
-				newCon = "项目编号:" + splitStr[len(splitStr)-1]
-			} else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
-				//5d4253f3a5cb26b9b7ca2662
-				newCon = "项目编号:" + tmpstr
-			}
-		} else if len(splitStr) == 1 {
-			if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
-				newCon = "项目编号:" + tmpstr
-			} else if strings.Contains(newConTMP, "、") {
-				tmpstrs := strings.Split(newCon, "、")
-				newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
+	/*		if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
+			//5d424bdfa5cb26b9b7ac7a85
+			//5d425a48a5cb26b9b7df5fec
+			//5d425506a5cb26b9b7cd2c3c
+			splitStr := strings.Split(newConTMP, " ")
+			if len(splitStr) >= 2 {
+				if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
+					newCon = "项目编号:" + splitStr[len(splitStr)-1]
+				} else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
+					//5d4253f3a5cb26b9b7ca2662
+					newCon = "项目编号:" + tmpstr
+				}
+			} else if len(splitStr) == 1 {
+				if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
+					newCon = "项目编号:" + tmpstr
+				} else if strings.Contains(newConTMP, "、") {
+					tmpstrs := strings.Split(newCon, "、")
+					newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
+				}
 			}
 		}
-	}
-	proCode = projectcodeReg.FindString(newCon)
-	if proCode != "" {
-		ckv := GetKVAll(proCode, job.Title, nil, 1)
-		blCode.ColonKV = ckv
-		blCode.Text = proCode
-		job.Block = append(job.Block, blCode)
-	} else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
-		ckv := GetKVAll(proCode, job.Title, nil, 1)
-		blCode.ColonKV = ckv
-		blCode.Text = proCode
-		job.Block = append(job.Block, blCode)
-	} else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
-		ckv := GetKVAll(proCode, job.Title, nil, 1)
-		blCode.Text = proCode
-		blCode.ColonKV = ckv
-		job.Block = append(job.Block, blCode)
-	}
+		proCode = projectcodeReg.FindString(newCon)
+		if proCode != "" {
+			ckv := GetKVAll(proCode, job.Title, nil, 1)
+			blCode.ColonKV = ckv
+			blCode.Text = proCode
+			job.Block = append(job.Block, blCode)
+		} else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
+			ckv := GetKVAll(proCode, job.Title, nil, 1)
+			blCode.ColonKV = ckv
+			blCode.Text = proCode
+			job.Block = append(job.Block, blCode)
+		} else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
+			ckv := GetKVAll(proCode, job.Title, nil, 1)
+			blCode.Text = proCode
+			blCode.ColonKV = ckv
+			job.Block = append(job.Block, blCode)
+		}*/
 	if proCode = jsonReg.FindString(newCon); proCode != "" {
 		jsonMap := make(map[string]string)
 		json.Unmarshal([]byte(proCode), &jsonMap)

+ 7 - 17
src/jy/pretreated/analytable.go

@@ -103,15 +103,9 @@ var (
 	MultipleValueSplitReg       = regexp.MustCompile("[,,、\\s\u3000\u2003\u00a0]")
 	BuyerContacts               = []string{"采购单位联系人", "采购单位联系电话", "采购单位联系地址"}
 	FilterSerial                = regexp.MustCompile(".+[、..::,]")
-	filterTableWror             = regexp.MustCompile("班子成员")
 	underline                   = regexp.MustCompile("_+$")
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
-	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
-	projectcodeRegAll           = regexp.MustCompile(`(采购)?项目名称及(项目)?编号[:|:]?.*[\n]?`)
-	projectcodeRegAll2          = regexp.MustCompile("[((].{4,30}[))]?")
-	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(^([\s]?编号)|项目编号|标段编号|招标编号){1}(:|:)(.){4,30}()|\)|\])`)
-	projectcodeReg2             = regexp.MustCompile(`(^([\s]?编号)|项目编号){1}(:|:)(.{4,39})[0-9a-zA-Z)号]`)
-	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")
+	nswinnertabletag            = regexp.MustCompile("[评得分估]+|标的|班子成员")
 	jsonReg                     = regexp.MustCompile(`\{.+:[^}]*\} `) //  \{".*\":\".+\"}
 	regHz                       = regexp.MustCompile("[\u4e00-\u9fa5]")
 	winnerOrderAndBidResult     = regexp.MustCompile("((中标)?候选人|(中标|评标)结果)")
@@ -213,10 +207,6 @@ func (table *Table) KVFilter() {
 	//4.对KV的处理
 	//判断表格是否有用,调用abandontable正则数组进行判断
 	//遍历每一行
-	winnertag := iswinnertabletag.MatchString(table.Tag) && !nswinnertabletag.MatchString(table.Tag) //table标签
-	if !winnertag {
-		winnertag = iswinnertabletag.MatchString(table.Tag) && !nswinnertabletag.MatchString(table.TableResult.BlockTag) //块标签
-	}
 	table.analyTdKV() //1.遍历每行每列td的sortkv添加到table.SorkVK中;2.td有子表格的处理
 	as := NewSortMap()
 
@@ -244,11 +234,7 @@ func (table *Table) KVFilter() {
 		}
 	}
 	//处理值是数组的kv放入标准化kv中//处理table.SortKV.value为数组的情况
-	table.sortKVArr(as, winnertag)
-	//
-	if filterTableWror.MatchString(table.Tag) {
-		table.WinnerOrder = nil
-	}
+	table.sortKVArr(as)
 	//
 	if len(table.WinnerOrder) > 0 || !table.BPackage {
 		winnerOrder := []map[string]interface{}{}
@@ -317,7 +303,11 @@ func (table *Table) KVFilter() {
 }
 
 //处理table.SortKV.value为数组的情况
-func (table *Table) sortKVArr(as *SortMap, winnertag bool) {
+func (table *Table) sortKVArr(as *SortMap) {
+	winnertag := iswinnertabletag.MatchString(table.Tag) && !nswinnertabletag.MatchString(table.Tag) //table标签
+	if !winnertag {
+		winnertag = iswinnertabletag.MatchString(table.TableResult.BlockTag) && !nswinnertabletag.MatchString(table.TableResult.BlockTag) //块标签
+	}
 	checkKey := map[int]bool{}
 	for kn, k := range as.Keys { //遍历table.SortKV.value为数组的key
 		v := as.Map[k]

+ 1 - 0
src/jy/pretreated/colonkv.go

@@ -371,6 +371,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 				if buyer == "" {
 					continue
 				}
+
 				prevLine := kv.PrevLine
 				prevLine = strings.TrimSpace(prevLine)
 				prevLine = strings.Split(prevLine, " ")[0]

+ 18 - 18
src/jy/pretreated/tablev2.go

@@ -150,24 +150,24 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 		//}
 		//}
 	}
-	//抽取不到走正则抽
-	proCode := projectcodeReg.FindString(text)
-	if proCode != "" {
-		ckv := GetKVAll(proCode, "", nil, 1)
-		for _, v := range ckv.KvTags {
-			for _, vv := range v {
-				td.SortKV.AddKey(vv.Key, vv.Value)
-			}
-		}
-	} else if proCode = projectcodeReg2.FindString(text); proCode != "" {
-		ckv := GetKVAll(proCode, "", nil, 1)
-		for _, v := range ckv.KvTags {
-			for _, vv := range v {
-				td.SortKV.AddKey(vv.Key, vv.Value)
-			}
-		}
-	}
-	if proCode = jsonReg.FindString(text); proCode != "" {
+	////抽取不到走正则抽
+	//proCode := projectcodeReg.FindString(text)
+	//if proCode != "" {
+	//	ckv := GetKVAll(proCode, "", nil, 1)
+	//	for _, v := range ckv.KvTags {
+	//		for _, vv := range v {
+	//			td.SortKV.AddKey(vv.Key, vv.Value)
+	//		}
+	//	}
+	//} else if proCode = projectcodeReg2.FindString(text); proCode != "" {
+	//	ckv := GetKVAll(proCode, "", nil, 1)
+	//	for _, v := range ckv.KvTags {
+	//		for _, vv := range v {
+	//			td.SortKV.AddKey(vv.Key, vv.Value)
+	//		}
+	//	}
+	//}
+	if proCode := jsonReg.FindString(text); proCode != "" {
 		jsonMap := make(map[string]string)
 		json.Unmarshal([]byte(proCode), &jsonMap)
 		for k, v := range jsonMap {

+ 7 - 2
src/res/fieldscore.json

@@ -356,7 +356,7 @@
         "negativewords": [
             {
                 "describe": "以什么开始的减分",
-                "regstr": "^【",
+                "regstr": "^[|-]",
                 "score": -1
             },
             {
@@ -371,9 +371,14 @@
             },
             {
                 "describe": "中文汉字大于6个",
-                "regstr": "[\\u4e00-\\u9fa5]{6,}",
+                "regstr": "[\\u4e00-\\u9fa5]{6,9}",
                 "score": -1.3
             },
+            {
+                "describe": "中文汉字大于10个",
+                "regstr": "[\\u4e00-\\u9fa5]{10,}",
+                "score": -3
+            },
             {
                 "describe": "全为中文汉字或符号",
                 "regstr": "^[\\u4e00-\\u9fa5()()【】\\[\\],,。、::《》]+$",

+ 0 - 5
src/res/formattext.json

@@ -50,11 +50,6 @@
             "separator": "([\u4e00-\u9fa5]+?)[\u3000\u2003\u00a0\\s]+__$1",
             "desc": "例如:把采 购 人替换成采购人"
         },
-        {
-            "reg": "([\u4e00-\u9fa5][^((,,。、.;;\r\n]{1,30}?[::][^\\s\u3000\u2003\u00a0,、。;;\r\n]+)([((])([\u4e00-\u9fa5][^,,。、.;;\r\n))包段]{1,30}?[::].+?)([))])",
-            "separator": "$1\n$2\n$3\n$4\n",
-            "desc": "例如:采购项目名称:脱贫攻坚大数据平台建设项目(项目编号:YLLBC20164002-HS)"
-        },
         {
             "reg_c": "([\u4e00-\u9fa5][^((,,。、.;;\r\n]{1,30}?[::][^\\s\u3000\u2003\u00a0,、。;;\r\n]+)([((])(.+?[::].+?)([))])",
 			"reg": "[((]([^::))\\r\\n]{2,10}[::][^::))\\r\\n]+)+[))]",