Browse Source

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

maxiaoshan 6 năm trước
mục cha
commit
e054bd9072

+ 109 - 78
src/jy/extract/extract.go

@@ -236,6 +236,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
 	} else {
 		detail = d2
 	}
+	detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
 	d3, _ := doc["summary"].(string)
 	detail = ju.CutLableStr(d3 + "\n" + detail)
 	detail = cut.ClearHtml(d3 + "\n" + detail)
@@ -366,7 +367,6 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 			}
 		}
 		lockrule.Unlock()
-
 		//抽取规则
 		for _, vc1 := range tmprules {
 			for _, vc := range vc1 {
@@ -392,17 +392,19 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				// log.Debug("抽取-后置规则", tmp)
 
 				//项目名称未能抽取到,标题来凑
-				if vc.Field == "projectname" && vc.ExtFrom == "title" {
-					isextitle := true
-					for _, v := range j.Result[vc.Field] {
-						if len([]rune(qu.ObjToString(v.Value))) > 5 {
-							isextitle = false
-							break
+				if vc.Field == "projectname" {
+					if vc.ExtFrom == "title" {
+						isextitle := true
+						for _, v := range j.Result[vc.Field] {
+							if len([]rune(qu.ObjToString(v.Value))) > 5 {
+								isextitle = false
+								break
+							}
+						}
+						if isextitle { //标题加入选举
+							field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
+							j.Result[vc.Field] = append(j.Result[vc.Field], field)
 						}
-					}
-					if isextitle { //标题加入选举
-						field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
-						j.Result[vc.Field] = append(j.Result[vc.Field], field)
 					}
 					for i := 0; i < 3; i++ {
 						for _, v := range vc.RuleBacks {
@@ -412,42 +414,10 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				}
 			}
 		}
-
 		//全局后置规则
 		for _, v := range e.RuleBacks {
 			ExtRegBack(j, v, e.TaskInfo)
 		}
-		//候选人加入
-		if len(j.Winnerorder) > 0 {
-			//候选人中标金额
-			if price := j.Winnerorder[0]["price"]; price != nil {
-				bidamount := &ju.ExtField{
-					Field:     "bidamount",
-					Code:      "",
-					RuleText:  "",
-					Type:      "winnerorder",
-					MatchType: "winnerorder",
-					ExtFrom:   "",
-					Value:     price,
-					Score:     0,
-				}
-				j.Result["bidamount"] = []*ju.ExtField{bidamount}
-			}
-			//候选人中标单位
-			if entname := j.Winnerorder[0]["entname"]; entname != nil {
-				winner := &ju.ExtField{
-					Field:     "winner",
-					Code:      "",
-					RuleText:  "",
-					Type:      "winnerorder",
-					MatchType: "winnerorder",
-					ExtFrom:   "",
-					Value:     entname,
-					Score:     0,
-				}
-				j.Result["winner"] = []*ju.ExtField{winner}
-			}
-		}
 		//函数清理
 		for key, val := range j.Result {
 			for _, v := range val {
@@ -537,37 +507,6 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
 				ExtRegBack(j, v, e.TaskInfo)
 			}
 		}
-		//候选人加入
-		if len(j.Winnerorder) > 0 {
-			//候选人中标金额
-			if price := j.Winnerorder[0]["price"]; price != nil {
-				bidamount := &ju.ExtField{
-					Field:     "bidamount",
-					Code:      "",
-					RuleText:  "",
-					Type:      "winnerorder",
-					MatchType: "winnerorder",
-					ExtFrom:   "",
-					Value:     price,
-					Score:     0,
-				}
-				j.Result["bidamount"] = []*ju.ExtField{bidamount}
-			}
-			//候选人中标单位
-			if entname := j.Winnerorder[0]["entname"]; entname != nil {
-				winner := &ju.ExtField{
-					Field:     "winner",
-					Code:      "",
-					RuleText:  "",
-					Type:      "winnerorder",
-					MatchType: "winnerorder",
-					ExtFrom:   "",
-					Value:     entname,
-					Score:     0,
-				}
-				j.Result["winner"] = []*ju.ExtField{winner}
-			}
-		}
 		//函数清理
 		for key, val := range j.Result {
 			for _, v := range val {
@@ -628,14 +567,16 @@ func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInf
 
 //抽取-规则
 func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job) {
+	//候选人加入
 	var kvMap map[string][]map[string]interface{}
+	extByReg := true
 	if vc.ExtFrom != "title" {
-		kvMap = getKvByLuaFields(vc, j, e)
+		kvMap, extByReg = getKvByLuaFields(vc, j, e)
 	}
 	for _, v := range vc.RuleCores {
 		if v.IsLua {
 			ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, kvMap)
-		} else {
+		} else if extByReg {
 			ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e)
 		}
 	}
@@ -720,8 +661,65 @@ func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in
 }
 
 //lua脚本根据属性设置提取kv值
-func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) map[string][]map[string]interface{} {
+func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
 	kvmap := map[string][]map[string]interface{}{}
+	if len(j.Winnerorder) > 0 {
+		if vc.Field == "bidamount" {
+			for _, v := range j.Winnerorder {
+				kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
+					"code":        "winnerorder",
+					"field":       vc.Field,
+					"ruletext":    "中标候选人",
+					"extfrom":     vc.ExtFrom,
+					"sourcevalue": "中标候选人",
+					"value":       v["price"],
+					"type":        "winnerorder",
+					"matchtype":   "winnerorder",
+				})
+			}
+			//候选人中标金额
+			if price := j.Winnerorder[0]["price"]; price != nil {
+				kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
+					"code":        "CL_中标候选人",
+					"field":       vc.Field,
+					"ruletext":    "中标候选人",
+					"extfrom":     vc.ExtFrom,
+					"sourcevalue": "中标候选人",
+					"value":       price,
+					"type":        "winnerorder",
+					"matchtype":   "winnerorder",
+				})
+				return kvmap, false
+			}
+		} else if vc.Field == "winner" {
+			for _, v := range j.Winnerorder {
+				kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
+					"code":        "winnerorder",
+					"field":       vc.Field,
+					"ruletext":    "中标候选人",
+					"extfrom":     vc.ExtFrom,
+					"sourcevalue": "中标候选人",
+					"value":       v["entname"],
+					"type":        "winnerorder",
+					"matchtype":   "winnerorder",
+				})
+			}
+			//候选人中标单位
+			if entname := j.Winnerorder[0]["entname"]; entname != nil {
+				kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
+					"code":        "CL_中标候选人",
+					"field":       vc.Field,
+					"ruletext":    "中标候选人",
+					"extfrom":     vc.ExtFrom,
+					"sourcevalue": "中标候选人",
+					"value":       entname,
+					"type":        "winnerorder",
+					"matchtype":   "winnerorder",
+				})
+				return kvmap, false
+			}
+		}
+	}
 	for fieldname, field := range vc.LFields {
 		if field != vc.Field {
 			continue
@@ -729,7 +727,7 @@ func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) map[string][]map
 		extractFromKv(field, fieldname, j.Block, vc, kvmap)
 	}
 	AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
-	return kvmap
+	return kvmap, true
 }
 
 func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}) {
@@ -1106,6 +1104,8 @@ type FieldValue struct {
 //分析抽取结果并保存
 func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 	qu.Try(func() {
+		//重新取出清理过后的中标候选人
+		resetWinnerorder(j)
 		doc, result, _id := funcAnalysis(j, e.Tag)
 		if isSaveTag, _ := ju.Config["isSaveTag"].(bool); isSaveTag {
 			go otherNeedSave(j, result, e)
@@ -1517,3 +1517,34 @@ func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
 	}
 	return SMap
 }
+
+// resetWinnerorder copies the Value of every Code=="winnerorder" entry in j.Result["winner"] / j.Result["bidamount"] back into j.Winnerorder (keys "entname" / "price", by position) and removes those entries from j.Result; it is a no-op when j.Winnerorder is empty.
+func resetWinnerorder(j *ju.Job) {
+	if len(j.Winnerorder) == 0 {
+		return
+	}
+	// Winning bidder names: keep ordinary results, write candidate entries back in order.
+	i := 0
+	winners := []*ju.ExtField{}
+	for _, v := range j.Result["winner"] {
+		if v.Code != "winnerorder" {
+			winners = append(winners, v)
+		} else if i < len(j.Winnerorder) { // guard: never index past the end of j.Winnerorder; surplus candidate entries are dropped
+			j.Winnerorder[i]["entname"] = v.Value
+			i++
+		}
+	}
+	j.Result["winner"] = winners
+	// Bid amounts: same procedure, writing to the "price" key.
+	i = 0
+	bidamounts := []*ju.ExtField{}
+	for _, v := range j.Result["bidamount"] {
+		if v.Code != "winnerorder" {
+			bidamounts = append(bidamounts, v)
+		} else if i < len(j.Winnerorder) { // guard: never index past the end of j.Winnerorder; surplus candidate entries are dropped
+			j.Winnerorder[i]["price"] = v.Value
+			i++
+		}
+	}
+	j.Result["bidamount"] = bidamounts
+}

+ 1 - 1
src/jy/pretreated/analykv.go

@@ -13,7 +13,7 @@ var Key = regexp.MustCompile("[:::]")
 var Time = regexp.MustCompile("[\\d]")
 var dh = regexp.MustCompile("[,,.]")
 var space = regexp.MustCompile("[\\s\\n \u3000\u2003\u00a0]+")
-var val = regexp.MustCompile("[^\\s\\n \u3000\u2003\u00a0,,。!;;]")
+var val = regexp.MustCompile("[^\\s\\n \u3000\u2003\u00a0,,。!;;\\-]")
 var matchkh = map[string]string{
 	"(": ")",
 	"(": ")",

+ 40 - 40
src/jy/pretreated/analystep.go

@@ -7,7 +7,6 @@ import (
 	"encoding/json"
 	"jy/util"
 	//"log"
-	"unicode/utf8"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -31,7 +30,7 @@ func AnalyStart(job *util.Job) {
 		}
 	}
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
-	if len(blockArrays) > 0 { //有分块
+	if len(blockArrays) > 0 {                                                //有分块
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
@@ -52,6 +51,7 @@ func AnalyStart(job *util.Job) {
 			job.Block = append(job.Block, bl)
 		}
 	} else { //未分块,创建分块
+		//log.Println(con)
 		bl := &util.Block{}
 		newCon := con
 		if len(tabs) > 0 { //解析表格逻辑
@@ -102,50 +102,50 @@ func processTableInBlock(bl *util.Block, job *util.Job) {
 
 //匹配项目编号
 func FindProjectCode(newCon string, job *util.Job) {
-	newCon = TextAfterRemoveTable(newCon)
+	newCon = HtmlToText(newCon)
 	if strings.TrimSpace(newCon) == "" {
 		return
 	}
 	var proCode string
 	blCode := &util.Block{}
-	if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
-		//5d424bdfa5cb26b9b7ac7a85
-		//5d425a48a5cb26b9b7df5fec
-		//5d425506a5cb26b9b7cd2c3c
-		splitStr := strings.Split(newConTMP, " ")
-		if len(splitStr) >= 2 {
-			if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
-				newCon = "项目编号:" + splitStr[len(splitStr)-1]
-			} else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
-				//5d4253f3a5cb26b9b7ca2662
-				newCon = "项目编号:" + tmpstr
-			}
-		} else if len(splitStr) == 1 {
-			if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
-				newCon = "项目编号:" + tmpstr
-			} else if strings.Contains(newConTMP, "、") {
-				tmpstrs := strings.Split(newCon, "、")
-				newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
+	/*		if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
+				//5d424bdfa5cb26b9b7ac7a85
+				//5d425a48a5cb26b9b7df5fec
+				//5d425506a5cb26b9b7cd2c3c
+				splitStr := strings.Split(newConTMP, " ")
+				if len(splitStr) >= 2 {
+					if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
+						newCon = "项目编号:" + splitStr[len(splitStr)-1]
+					} else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
+						//5d4253f3a5cb26b9b7ca2662
+						newCon = "项目编号:" + tmpstr
+					}
+				} else if len(splitStr) == 1 {
+					if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
+						newCon = "项目编号:" + tmpstr
+					} else if strings.Contains(newConTMP, "、") {
+						tmpstrs := strings.Split(newCon, "、")
+						newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
+					}
+				}
 			}
-		}
-	}
-	proCode = projectcodeReg.FindString(newCon)
-	if proCode != "" {
-		ckv := GetKVAll(proCode, job.Title, nil, 1)
-		blCode.ColonKV = ckv
-		blCode.Text = proCode
-		job.Block = append(job.Block, blCode)
-	} else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
-		ckv := GetKVAll(proCode, job.Title, nil, 1)
-		blCode.ColonKV = ckv
-		blCode.Text = proCode
-		job.Block = append(job.Block, blCode)
-	} else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
-		ckv := GetKVAll(proCode, job.Title, nil, 1)
-		blCode.Text = proCode
-		blCode.ColonKV = ckv
-		job.Block = append(job.Block, blCode)
-	}
+			proCode = projectcodeReg.FindString(newCon)
+			if proCode != "" {
+				ckv := GetKVAll(proCode, job.Title, nil, 1)
+				blCode.ColonKV = ckv
+				blCode.Text = proCode
+				job.Block = append(job.Block, blCode)
+			} else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
+				ckv := GetKVAll(proCode, job.Title, nil, 1)
+				blCode.ColonKV = ckv
+				blCode.Text = proCode
+				job.Block = append(job.Block, blCode)
+			} else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
+				ckv := GetKVAll(proCode, job.Title, nil, 1)
+				blCode.Text = proCode
+				blCode.ColonKV = ckv
+				job.Block = append(job.Block, blCode)
+			}*/
 	if proCode = jsonReg.FindString(newCon); proCode != "" {
 		jsonMap := make(map[string]string)
 		json.Unmarshal([]byte(proCode), &jsonMap)

+ 7 - 17
src/jy/pretreated/analytable.go

@@ -103,15 +103,9 @@ var (
 	MultipleValueSplitReg       = regexp.MustCompile("[,,、\\s\u3000\u2003\u00a0]")
 	BuyerContacts               = []string{"采购单位联系人", "采购单位联系电话", "采购单位联系地址"}
 	FilterSerial                = regexp.MustCompile(".+[、..::,]")
-	filterTableWror             = regexp.MustCompile("班子成员")
 	underline                   = regexp.MustCompile("_+$")
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
-	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
-	projectcodeRegAll           = regexp.MustCompile(`(采购)?项目名称及(项目)?编号[:|:]?.*[\n]?`)
-	projectcodeRegAll2          = regexp.MustCompile("[((].{4,30}[))]?")
-	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(^([\s]?编号)|项目编号|标段编号|招标编号){1}(:|:)(.){4,30}()|\)|\])`)
-	projectcodeReg2             = regexp.MustCompile(`(^([\s]?编号)|项目编号){1}(:|:)(.{4,39})[0-9a-zA-Z)号]`)
-	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")
+	nswinnertabletag            = regexp.MustCompile("[评得分估]+|标的|班子成员")
 	jsonReg                     = regexp.MustCompile(`\{.+:[^}]*\} `) //  \{".*\":\".+\"}
 	regHz                       = regexp.MustCompile("[\u4e00-\u9fa5]")
 	winnerOrderAndBidResult     = regexp.MustCompile("((中标)?候选人|(中标|评标)结果)")
@@ -213,10 +207,6 @@ func (table *Table) KVFilter() {
 	//4.对KV的处理
 	//判断表格是否有用,调用abandontable正则数组进行判断
 	//遍历每一行
-	winnertag := iswinnertabletag.MatchString(table.Tag) && !nswinnertabletag.MatchString(table.Tag) //table标签
-	if !winnertag {
-		winnertag = iswinnertabletag.MatchString(table.Tag) && !nswinnertabletag.MatchString(table.TableResult.BlockTag) //块标签
-	}
 	table.analyTdKV() //1.遍历每行每列td的sortkv添加到table.SorkVK中;2.td有子表格的处理
 	as := NewSortMap()
 
@@ -244,11 +234,7 @@ func (table *Table) KVFilter() {
 		}
 	}
 	//处理值是数组的kv放入标准化kv中//处理table.SortKV.value为数组的情况
-	table.sortKVArr(as, winnertag)
-	//
-	if filterTableWror.MatchString(table.Tag) {
-		table.WinnerOrder = nil
-	}
+	table.sortKVArr(as)
 	//
 	if len(table.WinnerOrder) > 0 || !table.BPackage {
 		winnerOrder := []map[string]interface{}{}
@@ -317,7 +303,11 @@ func (table *Table) KVFilter() {
 }
 
 //处理table.SortKV.value为数组的情况
-func (table *Table) sortKVArr(as *SortMap, winnertag bool) {
+func (table *Table) sortKVArr(as *SortMap) {
+	winnertag := iswinnertabletag.MatchString(table.Tag) && !nswinnertabletag.MatchString(table.Tag) //table标签
+	if !winnertag {
+		winnertag = iswinnertabletag.MatchString(table.TableResult.BlockTag) && !nswinnertabletag.MatchString(table.TableResult.BlockTag) //块标签
+	}
 	checkKey := map[int]bool{}
 	for kn, k := range as.Keys { //遍历table.SortKV.value为数组的key
 		v := as.Map[k]

+ 10 - 8
src/jy/pretreated/division.go

@@ -158,6 +158,7 @@ func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock) ([]*ut
 		if k+1 != index {
 			if k == 0 {
 				returnValue = 3
+				break
 			} else {
 				if currentIndex+1 != index {
 					//如果序号不是连续的,不往下走
@@ -177,7 +178,6 @@ func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock) ([]*ut
 			}
 			currentIndex = index
 		}
-
 		//
 		title := serialTitles[2]                         //标题
 		title = regTrimSpace.ReplaceAllString(title, "") //清除前后空格
@@ -428,6 +428,7 @@ func appendWarpStop(text string) string {
 	}
 	return text
 }
+
 //分段
 func DivideSegmentHtml(txt string) []*util.Segment {
 	//先分段
@@ -448,6 +449,7 @@ func DivideSegmentHtml(txt string) []*util.Segment {
 	}
 	return segs
 }
+
 //分段
 func DivideSegment(txt string) []*util.Segment {
 	//先分段
@@ -463,7 +465,7 @@ func DivideSegment(txt string) []*util.Segment {
 				tmpstr += fmt.Sprint(r)
 				return false
 			} else if tmpstr == fmt.Sprint(r) {
-				if r == 46 || r == 12289{
+				if r == 46 || r == 12289 {
 					tmpstr = ""
 				}
 				return false
@@ -473,7 +475,7 @@ func DivideSegment(txt string) []*util.Segment {
 				return true
 			}
 		}
-		tmpstr= ""
+		tmpstr = ""
 		return r == 10 || r == 13
 	})
 	//再去除空行
@@ -588,7 +590,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			return false, ""
 		}
 		//
-		is := regexp.MustCompile(v[0] + "[::]*").FindAllStringIndex(con, -1)
+		is := regexp.MustCompile(v[0]+"[::]*").FindAllStringIndex(con, -1)
 		for _, sv := range is {
 			appendWarpIndex = append(appendWarpIndex, sv[0])
 		}
@@ -628,13 +630,13 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			indexPkgMap[sv[0]] = v[0]
 		}
 		//key在包前面,并且在一行的开头
-		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)" + pgflag).FindAllStringSubmatchIndex(con, -1)
+		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
 		if len(keys) == 0 {
 			//key在包前面,并且key以冒号结尾
-			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
+			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
 		}
 		if len(keys) == 0 {
-			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
+			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
 		}
 		for _, key := range keys {
 			startEndMap[key[5]] = key[4]
@@ -688,7 +690,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			}
 			index := util.PackageNumberConvert(bk)
 			//去掉前缀,空格必须要加,分kv的时候要用
-			text = regexp.MustCompile(bv[0] + "[::]*").ReplaceAllString(text, "")
+			text = regexp.MustCompile(bv[0]+"[::]*").ReplaceAllString(text, "")
 			headKey := ""
 			if indexKeyStringMap[iv] != "" {
 				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {

+ 18 - 18
src/jy/pretreated/tablev2.go

@@ -150,24 +150,24 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 		//}
 		//}
 	}
-	//抽取不到走正则抽
-	proCode := projectcodeReg.FindString(text)
-	if proCode != "" {
-		ckv := GetKVAll(proCode, "", nil, 1)
-		for _, v := range ckv.KvTags {
-			for _, vv := range v {
-				td.SortKV.AddKey(vv.Key, vv.Value)
-			}
-		}
-	} else if proCode = projectcodeReg2.FindString(text); proCode != "" {
-		ckv := GetKVAll(proCode, "", nil, 1)
-		for _, v := range ckv.KvTags {
-			for _, vv := range v {
-				td.SortKV.AddKey(vv.Key, vv.Value)
-			}
-		}
-	}
-	if proCode = jsonReg.FindString(text); proCode != "" {
+	////抽取不到走正则抽
+	//proCode := projectcodeReg.FindString(text)
+	//if proCode != "" {
+	//	ckv := GetKVAll(proCode, "", nil, 1)
+	//	for _, v := range ckv.KvTags {
+	//		for _, vv := range v {
+	//			td.SortKV.AddKey(vv.Key, vv.Value)
+	//		}
+	//	}
+	//} else if proCode = projectcodeReg2.FindString(text); proCode != "" {
+	//	ckv := GetKVAll(proCode, "", nil, 1)
+	//	for _, v := range ckv.KvTags {
+	//		for _, vv := range v {
+	//			td.SortKV.AddKey(vv.Key, vv.Value)
+	//		}
+	//	}
+	//}
+	if proCode := jsonReg.FindString(text); proCode != "" {
 		jsonMap := make(map[string]string)
 		json.Unmarshal([]byte(proCode), &jsonMap)
 		for k, v := range jsonMap {

+ 14 - 4
src/res/fieldscore.json

@@ -64,6 +64,11 @@
                 "describe": "以*结尾",
                 "regstr": ".{2,100}(的|招标|公示|公告|谈判|公开|通知|采购文件|交易中心)$",
                 "score": -5
+            },
+            {
+                "describe": "包含词",
+                "regstr": "(万元|本项目)",
+                "score": -10
             }
         ],
         "length": [
@@ -103,14 +108,14 @@
                 "describe": "[gt,lte,score]",
                 "range": [
                     35,
-                    45,
+                    60,
                     1
                 ]
             },
             {
                 "describe": "[gt,∞,score]",
                 "range": [
-                    45,
+                    60,
                     -1,
                     -2
                 ]
@@ -351,7 +356,7 @@
         "negativewords": [
             {
                 "describe": "以什么开始的减分",
-                "regstr": "^【",
+                "regstr": "^[|-]",
                 "score": -1
             },
             {
@@ -366,9 +371,14 @@
             },
             {
                 "describe": "中文汉字大于6个",
-                "regstr": "[\\u4e00-\\u9fa5]{6,}",
+                "regstr": "[\\u4e00-\\u9fa5]{6,9}",
                 "score": -1.3
             },
+            {
+                "describe": "中文汉字大于10个",
+                "regstr": "[\\u4e00-\\u9fa5]{10,}",
+                "score": -3
+            },
             {
                 "describe": "全为中文汉字或符号",
                 "regstr": "^[\\u4e00-\\u9fa5()()【】\\[\\],,。、::《》]+$",

+ 0 - 5
src/res/formattext.json

@@ -50,11 +50,6 @@
             "separator": "([\u4e00-\u9fa5]+?)[\u3000\u2003\u00a0\\s]+__$1",
             "desc": "例如:把采 购 人替换成采购人"
         },
-        {
-            "reg": "([\u4e00-\u9fa5][^((,,。、.;;\r\n]{1,30}?[::][^\\s\u3000\u2003\u00a0,、。;;\r\n]+)([((])([\u4e00-\u9fa5][^,,。、.;;\r\n))包段]{1,30}?[::].+?)([))])",
-            "separator": "$1\n$2\n$3\n$4\n",
-            "desc": "例如:采购项目名称:脱贫攻坚大数据平台建设项目(项目编号:YLLBC20164002-HS)"
-        },
         {
             "reg_c": "([\u4e00-\u9fa5][^((,,。、.;;\r\n]{1,30}?[::][^\\s\u3000\u2003\u00a0,、。;;\r\n]+)([((])(.+?[::].+?)([))])",
 			"reg": "[((]([^::))\\r\\n]{2,10}[::][^::))\\r\\n]+)+[))]",