wcj 6 năm trước
mục cha
commit
ba27b4877c

+ 42 - 9
src/jy/clear/specialsymbols.go

@@ -13,9 +13,13 @@ var SymField map[string]interface{} //对称符号过滤字段
 var AsyField map[string]interface{} //特殊符号过滤字段
 var MesField map[string]interface{} //乱码过滤字段
 var SymInterCon []string            //对称符号中间内容处理
+var SymmetricDelete map[string]bool //对称符号之间的内容是否删除
 
 func init() {
 	qu.ReadConfig("./specialsymbols.json", &SpecialSymbols)
+	if SymmetricDelete == nil {
+		SymmetricDelete = map[string]bool{}
+	}
 	//对称符号
 	tmp1 := SpecialSymbols["symmetric"].(map[string]interface{})
 	SymField = tmp1["field"].(map[string]interface{})
@@ -23,9 +27,13 @@ func init() {
 	SymmetricMap = make(map[string]string)
 	allSymbol := ""
 	for _, val := range symmetricArr { //SymmetricMap
-		tmpArr := qu.ObjArrToStringArr(val.([]interface{}))
+		symap := val.(map[string]interface{})
+		tmpArr := qu.ObjArrToStringArr(symap["text"].([]interface{}))
+		isdelete, _ := symap["symdelete"].(bool) //对称符号之间的内容是否删除
 		s1 := strings.Replace(tmpArr[0], "\\", "", -1)
 		s2 := strings.Replace(tmpArr[1], "\\", "", -1)
+		SymmetricDelete[s1] = isdelete
+		SymmetricDelete[s2] = isdelete
 		allSymbol = tmpArr[0] + tmpArr[1] + allSymbol
 		SymmetricMap[s2] = s1
 	}
@@ -57,7 +65,7 @@ func init() {
 	MesReg = regexp.MustCompile(messycodeStr)
 	SymInterCon = qu.ObjArrToStringArr(SpecialSymbols["symintercon"].([]interface{}))
 
-	//	text := []rune("2019年大兴新城地区公共厕所及附属设施项目(改造-施工)")
+	//	text := []rune("中煤张家口煤矿机械有限责任公司铸造槽帮(可含热处理工序、机加工工序或全工序)外协合格")
 	//	for i := 1; i <= 2; i++ {
 	//		text = AnotherRemoveStart(text)
 	//		qu.Debug(string(text))
@@ -240,6 +248,7 @@ func OtherClean(field, text string) string {
 func AnotherRemoveStart(text []rune) []rune {
 	defer qu.Catch()
 	if len(text) > 0 {
+		delstrarr := []string{}
 		pairedIndex := make(map[int]int)      //对称符号索引位置
 		symbolIndex := make(map[string][]int) //记录符号和当前索引位置
 		surplusMax := -1                      //记录多余的反符号最大值
@@ -298,10 +307,15 @@ func AnotherRemoveStart(text []rune) []rune {
 			}
 		}
 		//处理文本中有对称符号的情况
-		tmptext, ismatch := DelContext(pairedIndex, text)
+		tmptext, ismatch, delindex := DelContext(pairedIndex, text)
 		if ismatch {
 			return tmptext
 		}
+		if len(delindex) > 0 {
+			for s, e := range delindex {
+				delstrarr = append(delstrarr, string(text[s:e+1]))
+			}
+		}
 		//例:“教育部高等教育教学评估中心数据中心升级改造”项目 -> 教育部高等教育教学评估中心数据中心升级改造项目
 		if surplusMax == -1 && positiveMax == -1 {
 			i := pairedIndex[0]
@@ -330,14 +344,15 @@ func AnotherRemoveStart(text []rune) []rune {
 				}
 			}
 		}
-		firstOpposite := pairedIndex[0]
-		if firstOpposite != 0 { //第一个正符号对应反符号的位置
+		firstOpposite := pairedIndex[0]                             //第一个正符号对应反符号的位置
+		if firstOpposite != 0 && SymmetricDelete[string(text[0])] { //删除开头由对称符号包括的内容:(2019年大兴)新城地(区公共厕)所及附属(改造发斯蒂芬)-->新城地(区公共厕)所及附属(改造发斯蒂芬)
 			text = text[firstOpposite+1:]
 			removeLength = firstOpposite + 1
 			nb = nb + removeLength
 		}
 		lastOpposite := pairedIndex[length-1] //最后一个符号
-		if lastOpposite > 0 {                 //有对称的正向符号,删除其中间内容
+		lenew := len(text)
+		if lastOpposite > 0 && lenew > 0 && SymmetricDelete[string(text[lenew-1])] { //删除结尾由对称符号包括的内容:新城地(区公共厕)所及附属(改造发斯蒂芬)-->新城地(区公共厕)所及附属
 			//na = length - lastOpposite
 			text = text[:lastOpposite-removeLength]
 		} else if surplusMax == length-1 { //没有对称,只删除最后一个反符号
@@ -357,21 +372,39 @@ func AnotherRemoveStart(text []rune) []rune {
 				text = text[:positiveMax-nb]
 			}
 		}
+		if len(delstrarr) > 0 {
+			for _, rep := range delstrarr {
+				lenew := len(text)
+				if lenew > 0 && strings.HasPrefix(rep, string(text[0])) { //要清理的内容是开头和结尾部分,清理
+					text = []rune(strings.Replace(string(text), rep, "", -1))
+				}
+				lenew = len(text)
+				if lenew > 0 && strings.HasSuffix(rep, string(text[lenew-1])) {
+					text = []rune(strings.Replace(string(text), rep, "", -1))
+				}
+			}
+		}
 	}
 	return text
 }
 
-func DelContext(pairedIndex map[int]int, text []rune) ([]rune, bool) {
+func DelContext(pairedIndex map[int]int, text []rune) ([]rune, bool, map[int]int) {
 	length := 0
 	var result []rune
 	ismatch := false
+	delindex := map[int]int{}
 	for s, e := range pairedIndex {
 		if s < e {
+			nowsym := string(text[s])
+			if SymmetricDelete[nowsym] { //删除该对称中的内容
+				delindex[s] = e
+				continue
+			}
 			var tmp []rune
 			tmp = text[s+1 : e]
 			if len(tmp) > 2 { //排除对称符号中只有["工程","项目","采购","服务","监理","施工","设计"]
 				for _, r := range SymInterCon {
-					if strings.HasSuffix(string(tmp), r) && len(tmp) > length && len([]rune(strings.Replace(string(tmp), r, "", -1))) > 4 {
+					if strings.HasSuffix(string(tmp), r) && len(tmp) > length && len([]rune(strings.Replace(string(tmp), r, "", -1))) > 6 {
 						ismatch = true
 						result = tmp
 						length = len(tmp)
@@ -381,7 +414,7 @@ func DelContext(pairedIndex map[int]int, text []rune) ([]rune, bool) {
 		}
 	}
 
-	return result, ismatch
+	return result, ismatch, delindex
 }
 
 func DealSinAndDouQuotes(text []rune) []rune {

+ 6 - 0
src/jy/extract/extract.go

@@ -917,6 +917,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
 					}
 					j.Result[in.Field][k].Value = text
+					if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
+						continue
+					}
 					exts = append(exts, map[string]interface{}{
 						"field":     v.Field,
 						"code":      v.Code,
@@ -944,6 +947,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
 					}
 					j.Result[key][k].Value = text
+					if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
+						continue
+					}
 					exts = append(exts, map[string]interface{}{
 						"field":     v.Field,
 						"code":      v.Code,

+ 8 - 4
src/jy/extract/score.go

@@ -126,10 +126,6 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 	for field, tmps := range result {
 		for tmpsindex, tmpsvalue := range tmps {
 			describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
-			if tmpsvalue.ExtFrom == "title" { //标题打分初始化
-				tmps[tmpsindex].Score += CommonScore["title"]
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
-			}
 			//是否有段标签
 			if len(tmpsvalue.BlockTag) > 0 {
 				//有标签段
@@ -151,6 +147,10 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 			//抽取类型打分
 			if FieldsScore[field] != nil { //指定抽取属性打分配置
 				fieldscore := FieldsScore[field]
+				if tmpsvalue.ExtFrom == "title" { //标题打分初始化
+					tmps[tmpsindex].Score += fieldscore["title"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["title"]})
+				}
 				if strings.Contains(tmpsvalue.Type, "colon") {
 					tmps[tmpsindex].Score += fieldscore["colon"]
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["colon"]})
@@ -165,6 +165,10 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["regexp"]})
 				}
 			} else { //通用抽取属性打分配置
+				if tmpsvalue.ExtFrom == "title" { //标题打分初始化
+					tmps[tmpsindex].Score += CommonScore["title"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
+				}
 				if strings.Contains(tmpsvalue.Type, "colon") {
 					tmps[tmpsindex].Score += CommonScore["colon"]
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["colon"]})

+ 3 - 3
src/jy/pretreated/analystep.go

@@ -60,7 +60,7 @@ func AnalyStart(job *util.Job) {
 				processTableResult(tabres, bl, job)
 			}
 			//			for k, v := range bl.TableKV.Kv {
-			//				log.Println("bl.TableKV.Kv", k, v)
+			//				//log.Println("bl.TableKV.Kv", k, v)
 			//			}
 		} else {
 			//从正文里面找分包
@@ -69,8 +69,8 @@ func AnalyStart(job *util.Job) {
 		FindProjectCode(newCon, job) //匹配项目编号
 		bl.Text = HtmlToText(con)
 		//调用kv解析
-		bl.ColonKV = GetKVAll(newCon, "", nil, 1)
-		bl.SpaceKV = SspacekvEntity.Entrance(newCon, "", nil)
+		bl.ColonKV = GetKVAll(bl.Text, "", nil, 1)
+		bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil)
 		//新加 未分块table中未能解析到中标候选人,从正文中解析
 		if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
 			bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)

+ 3 - 0
src/jy/pretreated/colonkv.go

@@ -724,6 +724,9 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string) map[string][]*Tag {
 							if strings.TrimSpace(nextval) == "" {
 								continue
 							}
+							if GetAppointTags(nextval, tagdbs).Len() > 0 || GetAppointTags(k, tagdbs).Len() > 0{
+								continue
+							}
 						}
 					}
 					kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: nextval, Weight: tk.Weight})

+ 6 - 6
src/res/fieldscore.json

@@ -14,7 +14,7 @@
                 "table": 3,
                 "colon": 3,
                 "space": 3,
-                "regexp": 2
+                "regexp": 1
             },
             "winner": {
                 "table": 3,
@@ -41,7 +41,7 @@
             {
                 "describe": "以*结尾",
                 "regstr": ".{2,100}(项目|工程|施工|服务|设备|采购|设计|系统)$",
-                "score": 3
+                "score": 2
             }
         ],
         "negativewords": [
@@ -346,22 +346,22 @@
                 "describe": "[gt,lte,score]",
                 "range": [
                     0,
-                    3,
+                    4,
                     -5
                 ]
             },
             {
                 "describe": "[gt,lte,score]",
                 "range": [
-                    3,
-                    30,
+                    4,
+                    35,
                     3
                 ]
             },
             {
                 "describe": "[gt,∞,score]",
                 "range": [
-                    30,
+                    35,
                     -1,
                     -1
                 ]

+ 63 - 36
src/specialsymbols.json

@@ -7,42 +7,69 @@
             "agency": true
         },
         "symbol": [
-            [
-                "(",
-                ")"
-            ],
-            [
-                "\\[",
-                "\\]"
-            ],
-            [
-                "{",
-                "}"
-            ],
-            [
-                "{",
-                "}"
-            ],
-            [
-                "‘",
-                "’"
-            ],
-            [
-                "“",
-                "”"
-            ],
-            [
-                "【",
-                "】"
-            ],
-            [
-                "(",
-                ")"
-            ],
-            [
-                "<",
-                ">"
-            ]
+			{
+				"symdelete":false,
+				"text":[
+                		"(",
+                		")"
+           		 	]				
+			},
+			{
+				"symdelete":false,
+				"text":[
+	                "(",
+	                ")"
+	            ]
+			},
+			{
+				"symdelete":true,
+				"text":[
+		                "\\[",
+		                "\\]"
+		            ]			
+			},
+			{
+				"symdelete":true,
+				"text":[
+		                "{",
+		                "}"
+		            ]			
+			},
+			{
+				"symdelete":true,
+				"text":[
+		                "{",
+		                "}"
+		            ]			
+			},
+            {
+				"symdelete":true,
+				"text":[
+		                "‘",
+		                "’"
+		            ]		
+			},
+			{
+				"symdelete":true,
+				"text":[
+		                "“",
+		                "”"
+		            ]	
+			},
+			{
+				"symdelete":true,
+				"text":[
+	                "【",
+	                "】"
+	            ]		
+			},
+			{
+				"symdelete":true,
+				"text":[
+	                "<",
+	                ">"
+	            ]
+			}
         ]
     },
     "asymmetric": {