瀏覽代碼

Merge branch 'dev3.2' of ssh://192.168.3.207:10022/qmx/jy-data-extract into dev3.2

wcj 6 年之前
父節點
當前提交
18f8331383

+ 0 - 8
src/extractcity.json

@@ -1,8 +0,0 @@
-[
-	"city",
-	"buyer",
-	"buyeraddr",
-	"projectname",
-	"title"
-]
-	

+ 14 - 2
src/jy/clear/specialsymbols.go

@@ -9,6 +9,7 @@ import (
 var SpecialSymbols map[string]interface{}
 var SymmetricMap map[string]string
 var AsyReg, AllReg, MesReg *regexp.Regexp
+var CommonWords *regexp.Regexp      //通用词正则
 var SymField map[string]interface{} //对称符号过滤字段
 var AsyField map[string]interface{} //特殊符号过滤字段
 var MesField map[string]interface{} //乱码过滤字段
@@ -16,10 +17,13 @@ var SymInterCon []string            //对称符号中间内容处理
 var SymmetricDelete map[string]bool //对称符号之间的内容是否删除
 
 func init() {
-	qu.ReadConfig("./specialsymbols.json", &SpecialSymbols)
+	qu.ReadConfig("./res/specialsymbols.json", &SpecialSymbols)
 	if SymmetricDelete == nil {
 		SymmetricDelete = map[string]bool{}
 	}
+	//通用词
+	cw, _ := SpecialSymbols["commonwords"].(string)
+	CommonWords = regexp.MustCompile(cw)
 	//对称符号
 	tmp1 := SpecialSymbols["symmetric"].(map[string]interface{})
 	SymField = tmp1["field"].(map[string]interface{})
@@ -65,7 +69,7 @@ func init() {
 	MesReg = regexp.MustCompile(messycodeStr)
 	SymInterCon = qu.ObjArrToStringArr(SpecialSymbols["symintercon"].([]interface{}))
 
-	//	text := []rune("中煤张家口煤矿机械有限责任公司铸造槽帮(可含热处理工序、机加工工序或全工序)外协合格")
+	//	text := []rune("江西省建筑工程建设监理有限公司关于《江西泰和乌鸡林下生态养殖系统》申报第五批中国重要农业文化遗产与《泰和乌鸡产业(2020-2030)发展规划》编制)竞争性磋商招标公告(第二次)")
 	//	for i := 1; i <= 2; i++ {
 	//		text = AnotherRemoveStart(text)
 	//		qu.Debug(string(text))
@@ -362,10 +366,18 @@ func AnotherRemoveStart(text []rune) []rune {
 		//有多余反向符号,删除之前部分 surplusMax所有多余反向符号的最大索引
 		if surplusMax != -1 && surplusMax > firstOpposite && surplusMax < length-1 {
 			if (lastOpposite > 0 && surplusMax < lastOpposite) || (lastOpposite == 0) { //发发发发发发}发(发发发发发发)
+				beforedel := text
+				deleteindex := surplusMax - nb
 				text = text[surplusMax-nb+1:]
 				nb = surplusMax + 1
+				//非对称符号清理后判断剩余部分长度小于6,取删除部分
+				if len(CommonWords.ReplaceAllString(string(text), "")) < 6 { //{"_id":ObjectId("5d42425ba5cb26b9b7850640")}
+					result := beforedel[:deleteindex]
+					return result
+				}
 			}
 		}
+
 		//多余正符号删除之后部分(优先删除反符号之前部分)//(发发{发发)发发发发发发发发发发发发发发发(发{发)
 		if positiveMax != -1 && positiveMax != 0 && positiveMax > surplusMax && positiveMax > firstOpposite { ////发发发发发发]发发{
 			if (lastOpposite > 0 && positiveMax < lastOpposite) || (lastOpposite == 0) { //发发发发发发发发{发发发发发(发发)

+ 11 - 5
src/jy/extract/extract.go

@@ -22,7 +22,7 @@ import (
 )
 
 var (
-	lock, lockrule, lockclear sync.RWMutex
+	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
 	cut           = ju.NewCut()                          //获取正文并清理
 	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
@@ -394,7 +394,9 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					if tmp["blocktag"] != nil {
 						btag := make(map[string]string)
 						for k := range tmp["blocktag"].(map[string]bool) {
+							blocktag.Lock()
 							btag[k] = TagConfigDesc[k]
+							blocktag.Unlock()
 						}
 						field.BlockTag = btag
 					}
@@ -663,9 +665,11 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 			if tmp["blocktag"] != nil {
 				btag := make(map[string]string)
 				for k := range tmp["blocktag"].(map[string]bool) {
+					blocktag.Lock()
 					if TagConfigDesc[k] != "" {
 						btag[k] = TagConfigDesc[k]
 					}
+					blocktag.Unlock()
 				}
 				field.BlockTag = btag
 			}
@@ -692,7 +696,9 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 				for _, v := range j.Block {
 					btag := make(map[string]string)
 					for k := range v.Classify {
+						blocktag.Lock()
 						btag[k] = TagConfigDesc[k]
+						blocktag.Unlock()
 					}
 					extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in)
 					if len(extinfo) > 0 {
@@ -1091,7 +1097,7 @@ type FieldValue struct {
 //分析抽取结果并保存
 func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 	qu.Try(func() {
-		doc, result, _id := funcAnalysis(j)
+		doc, result, _id := funcAnalysis(j, e.Tag)
 		if isSaveTag, _ := ju.Config["isSaveTag"].(bool); isSaveTag {
 			go otherNeedSave(j, result, e)
 		}
@@ -1116,7 +1122,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		//处理附件
 		var resultf map[string][]*ju.ExtField
 		if jf != nil {
-			_, resultf, _ = funcAnalysis(jf)
+			_, resultf, _ = funcAnalysis(jf, e.Tag)
 			auxinfof := auxInfo(jf)
 			tmp["fieldallf"] = auxinfof
 			ffield := map[string]interface{}{}
@@ -1352,12 +1358,12 @@ func delFiled(k string) bool {
 	return k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
 }
 
-func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
+func funcAnalysis(j *ju.Job, ftag map[string][]*Tag) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
 	defer qu.Catch()
 	doc := j.Data
 	result := j.Result
 	_id := qu.BsonIdToSId((*doc)["_id"])
-	result = ScoreFields(j)
+	result = ScoreFields(j, ftag)
 
 	//结果排序
 	for _, val := range result {

+ 0 - 6
src/jy/extract/extractcity.go

@@ -49,12 +49,6 @@ type AreaCode struct {
 	C    []string
 }
 
-var SortField []string
-
-func init() {
-	qu.ReadConfig("./extractcity.json", &SortField)
-}
-
 //抽取city
 func (e *ExtractTask) ExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
 	/*

+ 43 - 16
src/jy/extract/score.go

@@ -9,10 +9,12 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"sync"
 	"unicode/utf8"
 )
 
 var (
+	lockscore     sync.RWMutex
 	SoreConfig    map[string]map[string]interface{}
 	TagConfig     map[string]map[string]float64
 	TagConfigDesc map[string]string
@@ -120,21 +122,28 @@ func init() {
 }
 
 //结果打分
-func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
+func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 	qu.Catch()
 	result := j.Result
 	for field, tmps := range result {
+		locktag.Lock()
+		taglength := len(ftag[field])
+		locktag.Unlock()
 		for tmpsindex, tmpsvalue := range tmps {
+			lockscore.Lock()
 			describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
+			lockscore.Unlock()
 			//是否有段标签
 			if len(tmpsvalue.BlockTag) > 0 {
 				//有标签段
 				var qz float64 = 0.0 //取权重最高的
 				for key := range tmpsvalue.BlockTag {
 					//key = "其他"//TODO 测试用
+					lockscore.Lock()
 					if TagConfig[key][field] > qz {
 						qz = TagConfig[key][field]
 					}
+					lockscore.Unlock()
 				}
 				tmps[tmpsindex].Score += ju.FloatFormat(BlockScore*qz, 4) //乘以权重系数
 				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", ScoreFrom: "tagscore.json", Value: tmpsvalue.Value, Score: BlockScore * qz})
@@ -145,31 +154,49 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 			}
 
 			//抽取类型打分
-			if FieldsScore[field] != nil { //指定抽取属性打分配置
-				fieldscore := FieldsScore[field]
+			lockscore.Lock()
+			fieldscore := FieldsScore[field]
+			typescore := float64(0)
+			titlescore := float64(0)
+			if fieldscore != nil { //指定抽取属性打分配置
 				if tmpsvalue.ExtFrom == "title" { //标题打分初始化
-					tmps[tmpsindex].Score += fieldscore["title"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["title"]})
+					titlescore = fieldscore["title"]
 				}
-				tmps[tmpsindex].Score += fieldscore[tmpsvalue.Type]
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore[tmpsvalue.Type]})
+				typescore = fieldscore[tmpsvalue.Type]
 			} else { //通用抽取属性打分配置
 				if tmpsvalue.ExtFrom == "title" { //标题打分初始化
-					tmps[tmpsindex].Score += CommonScore["title"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
+					titlescore = CommonScore["title"]
 				}
-				tmps[tmpsindex].Score += CommonScore[tmpsvalue.Type]
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore[tmpsvalue.Type]})
+				typescore = CommonScore[tmpsvalue.Type]
 			}
+			lockscore.Unlock()
+
+			tmps[tmpsindex].Score += titlescore
+			tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: titlescore})
+			tmps[tmpsindex].Score += typescore
+			tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: typescore})
+
 			//kv权重打分
-			if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
-				weightscore := ju.FloatFormat(1+float64(tmps[tmpsindex].Weight)/1000, 4)
-				tmps[tmpsindex].Score += weightscore
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})
+			if fieldscore != nil { //指定抽取属性打分配置
+				if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
+					weightscore := ju.FloatFormat(float64(qu.Float64All(fieldscore["kvweight"]))+float64(tmps[tmpsindex].Weight)/float64(taglength), 4)
+					tmps[tmpsindex].Score += weightscore
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})
+				} else {
+					//正则权重,暂不考虑
+				}
 			} else {
-				//正则权重,暂不考虑
+				if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
+					weightscore := ju.FloatFormat(float64(qu.Float64All(CommonScore["kvweight"]))+float64(tmps[tmpsindex].Weight)/float64(taglength), 4)
+					tmps[tmpsindex].Score += weightscore
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})
+				} else {
+					//正则权重,暂不考虑
+				}
 			}
+			lockscore.Lock()
 			scoreRule := SoreConfig[field]
+			lockscore.Unlock()
 			if scoreRule == nil {
 				continue
 			}

+ 5 - 2
src/jy/pretreated/analytable.go

@@ -2732,10 +2732,13 @@ func (tn *Table) asdFind(td_k string, matchCount int, weightMap map[string]map[s
 	}
 	if len(indexMap) == 0 && td_kv.Title != "" {
 		//td_kv.Title
-		if titleMatchType := ContactTypeTitleMatch(td_kv.Title); titleMatchType != "" {
+		if titleMatchType := ContactTypeTitleMatch(td_kv.Title); len(titleMatchType) != 0 {
 			thidTdIndex = 0
 			matchMap = map[string]map[string]bool{}
-			indexMap = map[int]string{1: titleMatchType}
+			//indexMap = map[int]string{1: titleMatchType}
+			for i, t := range titleMatchType {
+				indexMap[i+1] = t
+			}
 		}
 	}
 	return matchCount, weightMap, matchMap, thisTrHasMatch, indexMap, iscontinue, reCreate, thidTdIndex

+ 69 - 24
src/jy/pretreated/colonkv.go

@@ -260,6 +260,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	//处理联系人信息
 	var indexMap map[int]string
 	var matchMap map[string]map[string]bool
+	hasMatch := make(map[string]bool)
 	if contactFormat == nil || title != "" {
 		indexMap = map[int]string{}
 		matchMap = map[string]map[string]bool{}
@@ -288,6 +289,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			}
 			isCanAddToIndexMap = true
 		}
+		n := 1
 		for _, ct_k := range HasOrderContactType(k) {
 			if !ContactType[ct_k].MatchString(k) {
 				continue
@@ -297,18 +299,23 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			if isContinue || !ascFindFlag {
 				continue
 			}
-			if isCanAddToIndexMap && len(indexMap) == 0 {
-				indexMap[1] = ct_k
+			//			if isCanAddToIndexMap && len(indexMap) == 0 {
+			if isCanAddToIndexMap {
+				indexMap[n] = ct_k
+				n++
 				ascFind = false
 			}
 		}
 	}
 	mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上
 	titleMatch := false
-	if titleMatchType := ContactTypeTitleMatch(title); titleMatchType != "" {
+	if titleMatchType := ContactTypeTitleMatch(title); len(titleMatchType) != 0 {
 		titleMatch = true
 		mustMatchFirst = false
-		indexMap = map[int]string{1: titleMatchType}
+		for i, t := range titleMatchType {
+			indexMap[i+1] = t
+		}
+		//indexMap = map[int]string{1: titleMatchType}
 	}
 	//	if buyers == nil {
 	//		Debug("title-------", mustMatchFirst, title, indexMap, matchMap, totalIndexMap, ascFind)
@@ -326,6 +333,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置
 	kvsTemp := make([]*Kv, len(*kvs))
 	copy(kvsTemp, *kvs)
+	//again := 0
 	for kv_index, kv := range *kvs {
 		isBreak := true
 		v := strings.TrimSpace(kv.Value)
@@ -381,6 +389,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			}
 		} else if ascFind {
 			for _, ct_k := range HasOrderContactType(k) {
+				//again++
 				if k_length < 3 || k_length > 15 {
 					isBreak = false
 					continue
@@ -451,10 +460,10 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 					continue
 				}
 				isBreak = false
-				if index != 0 || notmatchCount != 0 {
-					startIndex = 0
-					indexMap = map[int]string{}
-				}
+				//				if index != 0 || notmatchCount != 0 {
+				//					startIndex = 0
+				//					indexMap = map[int]string{}
+				//				}
 				if startIndex == 0 {
 					indexMap = map[int]string{}
 				}
@@ -502,12 +511,28 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			continue
 		}
 		isBreak = false
-		if prevKey != k {
+
+		//		if again > 0 {
+		//			if again%2 == 1 {
+		//				index = 1
+		//			} else {
+		//				index = 2
+		//			}
+		//		} else {
+		//			if prevKey != k {
+		//				prevKey = k
+		//				index = 1
+		//			} else if prevKey == k {
+		//				index++
+		//			}
+		//		}
+		if prevKey != k && !hasMatch[k] {
 			prevKey = k
 			index = 1
-		} else if prevKey == k {
+		} else if index < 2 {
 			index++
 		}
+		hasMatch[k] = true
 		//过滤值
 		if filterValue.MatchString(v) {
 			continue
@@ -577,28 +602,48 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 		(*contactFormat).MatchMap = matchMap
 	}
 	//	if buyers == nil {
-	//		for _, kv := range *kvs {
-	//			Debug("bbbbbbbbbb", kv.Key, kv.Value)
-	//		}
+	//	for _, kv := range *kvs {
+	//		Debug("bbbbbbbbbb", kv.Key, kv.Value)
+	//	}
 	//	}
 	//Debug("totalIndexMap", len(totalIndexMap))
 }
-func ContactTypeTitleMatch(title string) string {
-	matchType := ""
+func ContactTypeTitleMatch(title string) []string {
+	matchType := []string{}
+	matchTypeMap := map[string]bool{}
 	if title != "" && len([]rune(title)) < 15 {
 		if ContactBuyerTitleReg.MatchString(title) {
-			matchType = "采购单位"
-		} else if ContactAgencyTitleReg.MatchString(title) {
-			matchType = "代理机构"
-		} else {
-			for _, ct_k := range HasOrderContactType(title) {
-				if ContactType[ct_k].MatchString(title) {
-					matchType = ct_k
-					break
-				}
+			matchType = append(matchType, "采购单位")
+			matchTypeMap["采购单位"] = true
+		}
+		if ContactAgencyTitleReg.MatchString(title) {
+			matchType = append(matchType, "代理机构")
+			matchTypeMap["代理机构"] = true
+		}
+		if len(matchType) == 2 {
+			return matchType
+		}
+		for _, ct_k := range HasOrderContactType(title) {
+			if ContactType[ct_k].MatchString(title) && !matchTypeMap[ct_k] {
+				matchType = append(matchType, ct_k)
 			}
 		}
 	}
+	//	matchType := ""
+	//	if title != "" && len([]rune(title)) < 15 {
+	//		if ContactBuyerTitleReg.MatchString(title) {
+	//			matchType = "采购单位"
+	//		} else if ContactAgencyTitleReg.MatchString(title) {
+	//			matchType = "代理机构"
+	//		} else {
+	//			for _, ct_k := range HasOrderContactType(title) {
+	//				if ContactType[ct_k].MatchString(title) {
+	//					matchType = ct_k
+	//					break
+	//				}
+	//			}
+	//		}
+	//	}
 	return matchType
 }
 

+ 13 - 3
src/res/fieldscore.json

@@ -6,7 +6,8 @@
             "table": 3,
             "colon": 3,
             "space": 3,
-            "regexp": 2
+            "regexp": 2,
+            "kvweight": 1
         },
         "fields": {
             "projectname": {
@@ -14,14 +15,23 @@
                 "table": 3,
                 "colon": 3,
                 "space": 3,
-                "regexp": 1
+                "regexp": 1,
+                "kvweight": 1
             },
             "winner": {
                 "table": 3,
                 "colon": 3,
                 "space": 3,
                 "regexp": 2,
-                "winnerorder": 3
+                "winnerorder": 3,
+                "kvweight": 1
+            },
+            "buyertel": {
+                "table": 3,
+                "colon": 3,
+                "space": 3,
+                "regexp": 2,
+                "kvweight": 5
             }
         }
     },

+ 2 - 1
src/specialsymbols.json → src/res/specialsymbols.json

@@ -108,5 +108,6 @@
         },
         "symbol": []
     },
-	"symintercon":["工程","项目","采购","服务","监理","施工","设计"]
+	"symintercon":["工程","项目","采购","服务","监理","施工","设计"],
+	"commonwords":"([招中流废邀]标|询价|网上|终止|谈判|变更|竞争性).{0,}"
 }