Browse Source

采购人正文抽取优化;项目名称对称符号清理

maxiaoshan 6 years ago
parent
commit
78e9ff52c2

+ 0 - 8
src/extractcity.json

@@ -1,8 +0,0 @@
-[
-	"city",
-	"buyer",
-	"buyeraddr",
-	"projectname",
-	"title"
-]
-	

+ 14 - 2
src/jy/clear/specialsymbols.go

@@ -9,6 +9,7 @@ import (
 var SpecialSymbols map[string]interface{}
 var SpecialSymbols map[string]interface{}
 var SymmetricMap map[string]string
 var SymmetricMap map[string]string
 var AsyReg, AllReg, MesReg *regexp.Regexp
 var AsyReg, AllReg, MesReg *regexp.Regexp
+var CommonWords *regexp.Regexp      //通用词正则
 var SymField map[string]interface{} //对称符号过滤字段
 var SymField map[string]interface{} //对称符号过滤字段
 var AsyField map[string]interface{} //特殊符号过滤字段
 var AsyField map[string]interface{} //特殊符号过滤字段
 var MesField map[string]interface{} //乱码过滤字段
 var MesField map[string]interface{} //乱码过滤字段
@@ -16,10 +17,13 @@ var SymInterCon []string            //对称符号中间内容处理
 var SymmetricDelete map[string]bool //对称符号之间的内容是否删除
 var SymmetricDelete map[string]bool //对称符号之间的内容是否删除
 
 
 func init() {
 func init() {
-	qu.ReadConfig("./specialsymbols.json", &SpecialSymbols)
+	qu.ReadConfig("./res/specialsymbols.json", &SpecialSymbols)
 	if SymmetricDelete == nil {
 	if SymmetricDelete == nil {
 		SymmetricDelete = map[string]bool{}
 		SymmetricDelete = map[string]bool{}
 	}
 	}
+	//通用词
+	cw, _ := SpecialSymbols["commonwords"].(string)
+	CommonWords = regexp.MustCompile(cw)
 	//对称符号
 	//对称符号
 	tmp1 := SpecialSymbols["symmetric"].(map[string]interface{})
 	tmp1 := SpecialSymbols["symmetric"].(map[string]interface{})
 	SymField = tmp1["field"].(map[string]interface{})
 	SymField = tmp1["field"].(map[string]interface{})
@@ -65,7 +69,7 @@ func init() {
 	MesReg = regexp.MustCompile(messycodeStr)
 	MesReg = regexp.MustCompile(messycodeStr)
 	SymInterCon = qu.ObjArrToStringArr(SpecialSymbols["symintercon"].([]interface{}))
 	SymInterCon = qu.ObjArrToStringArr(SpecialSymbols["symintercon"].([]interface{}))
 
 
-	//	text := []rune("中煤张家口煤矿机械有限责任公司铸造槽帮(可含热处理工序、机加工工序或全工序)外协合格")
+	//	text := []rune("江西省建筑工程建设监理有限公司关于《江西泰和乌鸡林下生态养殖系统》申报第五批中国重要农业文化遗产与《泰和乌鸡产业(2020-2030)发展规划》编制)竞争性磋商招标公告(第二次)")
 	//	for i := 1; i <= 2; i++ {
 	//	for i := 1; i <= 2; i++ {
 	//		text = AnotherRemoveStart(text)
 	//		text = AnotherRemoveStart(text)
 	//		qu.Debug(string(text))
 	//		qu.Debug(string(text))
@@ -361,10 +365,18 @@ func AnotherRemoveStart(text []rune) []rune {
 		//有多余反向符号,删除之前部分 surplusMax所有多余反向符号的最大索引
 		//有多余反向符号,删除之前部分 surplusMax所有多余反向符号的最大索引
 		if surplusMax != -1 && surplusMax > firstOpposite && surplusMax < length-1 {
 		if surplusMax != -1 && surplusMax > firstOpposite && surplusMax < length-1 {
 			if (lastOpposite > 0 && surplusMax < lastOpposite) || (lastOpposite == 0) { //发发发发发发}发(发发发发发发)
 			if (lastOpposite > 0 && surplusMax < lastOpposite) || (lastOpposite == 0) { //发发发发发发}发(发发发发发发)
+				beforedel := text
+				deleteindex := surplusMax - nb
 				text = text[surplusMax-nb+1:]
 				text = text[surplusMax-nb+1:]
 				nb = surplusMax + 1
 				nb = surplusMax + 1
+				//非对称符号清理后判断剩余部分长度小于6,取删除部分
+				if len(CommonWords.ReplaceAllString(string(text), "")) < 6 { //{"_id":ObjectId("5d42425ba5cb26b9b7850640")}
+					result := beforedel[:deleteindex]
+					return result
+				}
 			}
 			}
 		}
 		}
+
 		//多余正符号删除之后部分(优先删除反符号之前部分)//(发发{发发)发发发发发发发发发发发发发发发(发{发)
 		//多余正符号删除之后部分(优先删除反符号之前部分)//(发发{发发)发发发发发发发发发发发发发发发(发{发)
 		if positiveMax != -1 && positiveMax != 0 && positiveMax > surplusMax && positiveMax > firstOpposite { ////发发发发发发]发发{
 		if positiveMax != -1 && positiveMax != 0 && positiveMax > surplusMax && positiveMax > firstOpposite { ////发发发发发发]发发{
 			if (lastOpposite > 0 && positiveMax < lastOpposite) || (lastOpposite == 0) { //发发发发发发发发{发发发发发(发发)
 			if (lastOpposite > 0 && positiveMax < lastOpposite) || (lastOpposite == 0) { //发发发发发发发发{发发发发发(发发)

+ 0 - 6
src/jy/extract/extractcity.go

@@ -49,12 +49,6 @@ type AreaCode struct {
 	C    []string
 	C    []string
 }
 }
 
 
-var SortField []string
-
-func init() {
-	qu.ReadConfig("./extractcity.json", &SortField)
-}
-
 //抽取city
 //抽取city
 func (e *ExtractTask) ExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
 func (e *ExtractTask) ExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
 	/*
 	/*

+ 5 - 2
src/jy/pretreated/analytable.go

@@ -2732,10 +2732,13 @@ func (tn *Table) asdFind(td_k string, matchCount int, weightMap map[string]map[s
 	}
 	}
 	if len(indexMap) == 0 && td_kv.Title != "" {
 	if len(indexMap) == 0 && td_kv.Title != "" {
 		//td_kv.Title
 		//td_kv.Title
-		if titleMatchType := ContactTypeTitleMatch(td_kv.Title); titleMatchType != "" {
+		if titleMatchType := ContactTypeTitleMatch(td_kv.Title); len(titleMatchType) != 0 {
 			thidTdIndex = 0
 			thidTdIndex = 0
 			matchMap = map[string]map[string]bool{}
 			matchMap = map[string]map[string]bool{}
-			indexMap = map[int]string{1: titleMatchType}
+			//indexMap = map[int]string{1: titleMatchType}
+			for i, t := range titleMatchType {
+				indexMap[i+1] = t
+			}
 		}
 		}
 	}
 	}
 	return matchCount, weightMap, matchMap, thisTrHasMatch, indexMap, iscontinue, reCreate, thidTdIndex
 	return matchCount, weightMap, matchMap, thisTrHasMatch, indexMap, iscontinue, reCreate, thidTdIndex

+ 69 - 24
src/jy/pretreated/colonkv.go

@@ -260,6 +260,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	//处理联系人信息
 	//处理联系人信息
 	var indexMap map[int]string
 	var indexMap map[int]string
 	var matchMap map[string]map[string]bool
 	var matchMap map[string]map[string]bool
+	hasMatch := make(map[string]bool)
 	if contactFormat == nil || title != "" {
 	if contactFormat == nil || title != "" {
 		indexMap = map[int]string{}
 		indexMap = map[int]string{}
 		matchMap = map[string]map[string]bool{}
 		matchMap = map[string]map[string]bool{}
@@ -288,6 +289,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			}
 			}
 			isCanAddToIndexMap = true
 			isCanAddToIndexMap = true
 		}
 		}
+		n := 1
 		for _, ct_k := range HasOrderContactType(k) {
 		for _, ct_k := range HasOrderContactType(k) {
 			if !ContactType[ct_k].MatchString(k) {
 			if !ContactType[ct_k].MatchString(k) {
 				continue
 				continue
@@ -297,18 +299,23 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			if isContinue || !ascFindFlag {
 			if isContinue || !ascFindFlag {
 				continue
 				continue
 			}
 			}
-			if isCanAddToIndexMap && len(indexMap) == 0 {
-				indexMap[1] = ct_k
+			//			if isCanAddToIndexMap && len(indexMap) == 0 {
+			if isCanAddToIndexMap {
+				indexMap[n] = ct_k
+				n++
 				ascFind = false
 				ascFind = false
 			}
 			}
 		}
 		}
 	}
 	}
 	mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上
 	mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上
 	titleMatch := false
 	titleMatch := false
-	if titleMatchType := ContactTypeTitleMatch(title); titleMatchType != "" {
+	if titleMatchType := ContactTypeTitleMatch(title); len(titleMatchType) != 0 {
 		titleMatch = true
 		titleMatch = true
 		mustMatchFirst = false
 		mustMatchFirst = false
-		indexMap = map[int]string{1: titleMatchType}
+		for i, t := range titleMatchType {
+			indexMap[i+1] = t
+		}
+		//indexMap = map[int]string{1: titleMatchType}
 	}
 	}
 	//	if buyers == nil {
 	//	if buyers == nil {
 	//		Debug("title-------", mustMatchFirst, title, indexMap, matchMap, totalIndexMap, ascFind)
 	//		Debug("title-------", mustMatchFirst, title, indexMap, matchMap, totalIndexMap, ascFind)
@@ -326,6 +333,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置
 	mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置
 	kvsTemp := make([]*Kv, len(*kvs))
 	kvsTemp := make([]*Kv, len(*kvs))
 	copy(kvsTemp, *kvs)
 	copy(kvsTemp, *kvs)
+	//again := 0
 	for kv_index, kv := range *kvs {
 	for kv_index, kv := range *kvs {
 		isBreak := true
 		isBreak := true
 		v := strings.TrimSpace(kv.Value)
 		v := strings.TrimSpace(kv.Value)
@@ -381,6 +389,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			}
 			}
 		} else if ascFind {
 		} else if ascFind {
 			for _, ct_k := range HasOrderContactType(k) {
 			for _, ct_k := range HasOrderContactType(k) {
+				//again++
 				if k_length < 3 || k_length > 15 {
 				if k_length < 3 || k_length > 15 {
 					isBreak = false
 					isBreak = false
 					continue
 					continue
@@ -451,10 +460,10 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 					continue
 					continue
 				}
 				}
 				isBreak = false
 				isBreak = false
-				if index != 0 || notmatchCount != 0 {
-					startIndex = 0
-					indexMap = map[int]string{}
-				}
+				//				if index != 0 || notmatchCount != 0 {
+				//					startIndex = 0
+				//					indexMap = map[int]string{}
+				//				}
 				if startIndex == 0 {
 				if startIndex == 0 {
 					indexMap = map[int]string{}
 					indexMap = map[int]string{}
 				}
 				}
@@ -502,12 +511,28 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			continue
 			continue
 		}
 		}
 		isBreak = false
 		isBreak = false
-		if prevKey != k {
+
+		//		if again > 0 {
+		//			if again%2 == 1 {
+		//				index = 1
+		//			} else {
+		//				index = 2
+		//			}
+		//		} else {
+		//			if prevKey != k {
+		//				prevKey = k
+		//				index = 1
+		//			} else if prevKey == k {
+		//				index++
+		//			}
+		//		}
+		if prevKey != k && !hasMatch[k] {
 			prevKey = k
 			prevKey = k
 			index = 1
 			index = 1
-		} else if prevKey == k {
+		} else if index < 2 {
 			index++
 			index++
 		}
 		}
+		hasMatch[k] = true
 		//过滤值
 		//过滤值
 		if filterValue.MatchString(v) {
 		if filterValue.MatchString(v) {
 			continue
 			continue
@@ -577,28 +602,48 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 		(*contactFormat).MatchMap = matchMap
 		(*contactFormat).MatchMap = matchMap
 	}
 	}
 	//	if buyers == nil {
 	//	if buyers == nil {
-	//		for _, kv := range *kvs {
-	//			Debug("bbbbbbbbbb", kv.Key, kv.Value)
-	//		}
+	//	for _, kv := range *kvs {
+	//		Debug("bbbbbbbbbb", kv.Key, kv.Value)
+	//	}
 	//	}
 	//	}
 	//Debug("totalIndexMap", len(totalIndexMap))
 	//Debug("totalIndexMap", len(totalIndexMap))
 }
 }
-func ContactTypeTitleMatch(title string) string {
-	matchType := ""
+func ContactTypeTitleMatch(title string) []string {
+	matchType := []string{}
+	matchTypeMap := map[string]bool{}
 	if title != "" && len([]rune(title)) < 15 {
 	if title != "" && len([]rune(title)) < 15 {
 		if ContactBuyerTitleReg.MatchString(title) {
 		if ContactBuyerTitleReg.MatchString(title) {
-			matchType = "采购单位"
-		} else if ContactAgencyTitleReg.MatchString(title) {
-			matchType = "代理机构"
-		} else {
-			for _, ct_k := range HasOrderContactType(title) {
-				if ContactType[ct_k].MatchString(title) {
-					matchType = ct_k
-					break
-				}
+			matchType = append(matchType, "采购单位")
+			matchTypeMap["采购单位"] = true
+		}
+		if ContactAgencyTitleReg.MatchString(title) {
+			matchType = append(matchType, "代理机构")
+			matchTypeMap["代理机构"] = true
+		}
+		if len(matchType) == 2 {
+			return matchType
+		}
+		for _, ct_k := range HasOrderContactType(title) {
+			if ContactType[ct_k].MatchString(title) && !matchTypeMap[ct_k] {
+				matchType = append(matchType, ct_k)
 			}
 			}
 		}
 		}
 	}
 	}
+	//	matchType := ""
+	//	if title != "" && len([]rune(title)) < 15 {
+	//		if ContactBuyerTitleReg.MatchString(title) {
+	//			matchType = "采购单位"
+	//		} else if ContactAgencyTitleReg.MatchString(title) {
+	//			matchType = "代理机构"
+	//		} else {
+	//			for _, ct_k := range HasOrderContactType(title) {
+	//				if ContactType[ct_k].MatchString(title) {
+	//					matchType = ct_k
+	//					break
+	//				}
+	//			}
+	//		}
+	//	}
 	return matchType
 	return matchType
 }
 }
 
 

+ 2 - 1
src/specialsymbols.json → src/res/specialsymbols.json

@@ -108,5 +108,6 @@
         },
         },
         "symbol": []
         "symbol": []
     },
     },
-	"symintercon":["工程","项目","采购","服务","监理","施工","设计"]
+	"symintercon":["工程","项目","采购","服务","监理","施工","设计"],
+	"commonwords":"([招中流废邀]标|询价|网上|终止|谈判|变更|竞争性).{0,}"
 }
 }