Browse Source

采购人tablekv中抽取优化

maxiaoshan 6 years ago
parent
commit
3996be3b38

+ 11 - 5
src/jy/extract/extract.go

@@ -22,7 +22,7 @@ import (
 )
 
 var (
-	lock, lockrule, lockclear sync.RWMutex
+	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
 	cut           = ju.NewCut()                          //获取正文并清理
 	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
@@ -394,7 +394,9 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					if tmp["blocktag"] != nil {
 						btag := make(map[string]string)
 						for k := range tmp["blocktag"].(map[string]bool) {
+							blocktag.Lock()
 							btag[k] = TagConfigDesc[k]
+							blocktag.Unlock()
 						}
 						field.BlockTag = btag
 					}
@@ -663,9 +665,11 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 			if tmp["blocktag"] != nil {
 				btag := make(map[string]string)
 				for k := range tmp["blocktag"].(map[string]bool) {
+					blocktag.Lock()
 					if TagConfigDesc[k] != "" {
 						btag[k] = TagConfigDesc[k]
 					}
+					blocktag.Unlock()
 				}
 				field.BlockTag = btag
 			}
@@ -692,7 +696,9 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 				for _, v := range j.Block {
 					btag := make(map[string]string)
 					for k := range v.Classify {
+						blocktag.Lock()
 						btag[k] = TagConfigDesc[k]
+						blocktag.Unlock()
 					}
 					extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in)
 					if len(extinfo) > 0 {
@@ -1091,7 +1097,7 @@ type FieldValue struct {
 //分析抽取结果并保存
 func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 	qu.Try(func() {
-		doc, result, _id := funcAnalysis(j)
+		doc, result, _id := funcAnalysis(j, e.Tag)
 		if isSaveTag, _ := ju.Config["isSaveTag"].(bool); isSaveTag {
 			go otherNeedSave(j, result, e)
 		}
@@ -1116,7 +1122,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		//处理附件
 		var resultf map[string][]*ju.ExtField
 		if jf != nil {
-			_, resultf, _ = funcAnalysis(jf)
+			_, resultf, _ = funcAnalysis(jf, e.Tag)
 			auxinfof := auxInfo(jf)
 			tmp["fieldallf"] = auxinfof
 			ffield := map[string]interface{}{}
@@ -1352,12 +1358,12 @@ func delFiled(k string) bool {
 	return k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
 }
 
-func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
+func funcAnalysis(j *ju.Job, ftag map[string][]*Tag) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
 	defer qu.Catch()
 	doc := j.Data
 	result := j.Result
 	_id := qu.BsonIdToSId((*doc)["_id"])
-	result = ScoreFields(j)
+	result = ScoreFields(j, ftag)
 
 	//结果排序
 	for _, val := range result {

+ 43 - 16
src/jy/extract/score.go

@@ -9,10 +9,12 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"sync"
 	"unicode/utf8"
 )
 
 var (
+	lockscore     sync.RWMutex
 	SoreConfig    map[string]map[string]interface{}
 	TagConfig     map[string]map[string]float64
 	TagConfigDesc map[string]string
@@ -120,21 +122,28 @@ func init() {
 }
 
 //结果打分
-func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
+func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 	qu.Catch()
 	result := j.Result
 	for field, tmps := range result {
+		locktag.Lock()
+		taglength := len(ftag[field])
+		locktag.Unlock()
 		for tmpsindex, tmpsvalue := range tmps {
+			lockscore.Lock()
 			describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
+			lockscore.Unlock()
 			//是否有段标签
 			if len(tmpsvalue.BlockTag) > 0 {
 				//有标签段
 				var qz float64 = 0.0 //取权重最高的
 				for key := range tmpsvalue.BlockTag {
 					//key = "其他"//TODO 测试用
+					lockscore.Lock()
 					if TagConfig[key][field] > qz {
 						qz = TagConfig[key][field]
 					}
+					lockscore.Unlock()
 				}
 				tmps[tmpsindex].Score += ju.FloatFormat(BlockScore*qz, 4) //乘以权重系数
 				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", ScoreFrom: "tagscore.json", Value: tmpsvalue.Value, Score: BlockScore * qz})
@@ -145,31 +154,49 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 			}
 
 			//抽取类型打分
-			if FieldsScore[field] != nil { //指定抽取属性打分配置
-				fieldscore := FieldsScore[field]
+			lockscore.Lock()
+			fieldscore := FieldsScore[field]
+			typescore := float64(0)
+			titlescore := float64(0)
+			if fieldscore != nil { //指定抽取属性打分配置
 				if tmpsvalue.ExtFrom == "title" { //标题打分初始化
-					tmps[tmpsindex].Score += fieldscore["title"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["title"]})
+					titlescore = fieldscore["title"]
 				}
-				tmps[tmpsindex].Score += fieldscore[tmpsvalue.Type]
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore[tmpsvalue.Type]})
+				typescore = fieldscore[tmpsvalue.Type]
 			} else { //通用抽取属性打分配置
 				if tmpsvalue.ExtFrom == "title" { //标题打分初始化
-					tmps[tmpsindex].Score += CommonScore["title"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
+					titlescore = CommonScore["title"]
 				}
-				tmps[tmpsindex].Score += CommonScore[tmpsvalue.Type]
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore[tmpsvalue.Type]})
+				typescore = CommonScore[tmpsvalue.Type]
 			}
+			lockscore.Unlock()
+
+			tmps[tmpsindex].Score += titlescore
+			tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: titlescore})
+			tmps[tmpsindex].Score += typescore
+			tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: typescore})
+
 			//kv权重打分
-			if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
-				weightscore := ju.FloatFormat(1+float64(tmps[tmpsindex].Weight)/1000, 4)
-				tmps[tmpsindex].Score += weightscore
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})
+			if fieldscore != nil { //指定抽取属性打分配置
+				if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
+					weightscore := ju.FloatFormat(float64(qu.Float64All(fieldscore["kvweight"]))+float64(tmps[tmpsindex].Weight)/float64(taglength), 4)
+					tmps[tmpsindex].Score += weightscore
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})
+				} else {
+					//正则权重,暂不考虑
+				}
 			} else {
-				//正则权重,暂不考虑
+				if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
+					weightscore := ju.FloatFormat(float64(qu.Float64All(CommonScore["kvweight"]))+float64(tmps[tmpsindex].Weight)/float64(taglength), 4)
+					tmps[tmpsindex].Score += weightscore
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})
+				} else {
+					//正则权重,暂不考虑
+				}
 			}
+			lockscore.Lock()
 			scoreRule := SoreConfig[field]
+			lockscore.Unlock()
 			if scoreRule == nil {
 				continue
 			}

+ 4 - 4
src/jy/pretreated/analykv.go

@@ -73,7 +73,7 @@ func GetLines(con string) (res []*Line) {
 				l1.Str = strings.Join(l1.Strs, "")
 				if !regexp.MustCompile("^[,,.。\\s \u3000\u2003\u00a0]$").MatchString(l1.Str) {
 					l1.Str = u.TrimLRSpace(l1.Str, "")
-					l1.Str = TimeHM.ReplaceAllString(l1.Str, "D$1H$2M")
+					l1.Str = TimeHM.ReplaceAllString(l1.Str, ReplTimeHM)
 					l1.Strs = strings.Split(l1.Str, "")
 					res = append(res, l1)
 				}
@@ -130,6 +130,7 @@ func FindKv_v2(con, tag string) (m *SortMap) {
 }
 
 var TimeHM = regexp.MustCompile("[\\s \u3000\u2003\u00a0]*([01]{0,1}[0123456789]|2[0123])[::]([012345][0-9])[::]{0,1}")
+var ReplTimeHM = "D${1}H${2}M"
 
 //from 1--全文 2--table td
 func FindKv(con, tag string, from int) (m *SortMap) {
@@ -151,7 +152,7 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 			if len(s1) > 0 {
 				str := strings.Join(s1, "")
 				str = u.TrimLRSpace(str, "")
-				str = TimeHM.ReplaceAllString(str, "D${1}H${2}M")
+				str = TimeHM.ReplaceAllString(str, ReplTimeHM)
 				s1 = strings.Split(str, "")
 				if len(s1) > 0 {
 					strs = append(strs, s1)
@@ -168,8 +169,7 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 	if len(s1) > 0 {
 		str := strings.Join(s1, "")
 		str = u.TrimLRSpace(str, "")
-		//u.Debug(str, TimeHM.ReplaceAllString(str, "D${1}H${2}M"))
-		str = TimeHM.ReplaceAllString(str, "D${1}H${2}M")
+		str = TimeHM.ReplaceAllString(str, ReplTimeHM)
 		s1 = strings.Split(str, "")
 		if len(s1) > 0 {
 			strs = append(strs, s1)

+ 8 - 5
src/jy/pretreated/colonkv.go

@@ -20,7 +20,7 @@ var (
 	regReplKV2    = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0标段包]+?[))]?)([一二三四五六七八九十]+[、..][^一二三四五六七八九十]+?)")
 	regKV         = regexp.MustCompile("([\\p{Han}][^,,。、.;;\r\n]{1,30}?)[::](.*)")
 	filterK       = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+")
-	filterValue   = regexp.MustCompile("(^(无)$|.+%.*|[\r\n\\s\u3000\u2003\u00a0]+|^<.*>)")
+	filterValue   = regexp.MustCompile("(^(无)$|.+%.*|^[\r\n\\s\u3000\u2003\u00a0]+$|^<.*>)")
 	regReplKey    = regexp.MustCompile("^(包(.+[A-Za-z\\d])?|本项目|推荐|的|本次)|([约为元万亿]+|[大小]写|人民币|[全名]称|姓名)$")
 	BlockTagMap   = map[string]bool{
 		"招标范围": true,
@@ -693,12 +693,14 @@ func GetKVAll(content, title string, contactFormat *ContactFormat, from int) *Jo
 func MergeKvTags(kvTags_1, kvTags_2 map[string][]*Tag) {
 	for k, v := range kvTags_2 {
 		for _, vv := range v {
-			if strings.TrimSpace(vv.Value) == "" {
+			value_vv := strings.TrimSpace(vv.Value)
+			if value_vv == "" {
 				continue
 			}
 			isExists := false
 			for _, vvv := range kvTags_1[k] {
-				if vvv.Value == vv.Value && vvv.Weight == vv.Weight {
+				value_vvv := strings.TrimSpace(vvv.Value)
+				if (value_vvv == value_vv || TimeHM.ReplaceAllString(value_vvv, ReplTimeHM) == value_vv || value_vvv == TimeHM.ReplaceAllString(value_vv, ReplTimeHM)) && vvv.Weight == vv.Weight {
 					isExists = true
 					break
 				}
@@ -726,9 +728,10 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string) map[string][]*Tag {
 		kvTags[title] = append(kvTags[title], &Tag{title, title, 0, nil, false})
 	}
 	for _, findkv := range findkvs {
-		k, val, nextval := findkv.Key, findkv.Value, strings.TrimSpace(findkv.NextLine)
+		k, val, nextval := findkv.Key, strings.TrimSpace(findkv.Value), strings.TrimSpace(findkv.NextLine)
 		//val是空的话,不打标签
 		if filterValue.MatchString(val) {
+			log.Println(k, val)
 			continue
 		}
 		key := k
@@ -769,7 +772,7 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string) map[string][]*Tag {
 							if strings.TrimSpace(nextval) == "" {
 								continue
 							}
-							if GetAppointTags(nextval, tagdbs).Len() > 0 || GetAppointTags(k, tagdbs).Len() > 0{
+							if GetAppointTags(nextval, tagdbs).Len() > 0 || GetAppointTags(k, tagdbs).Len() > 0 {
 								continue
 							}
 						}

+ 14 - 4
src/res/fieldscore.json

@@ -6,7 +6,8 @@
             "table": 3,
             "colon": 3,
             "space": 3,
-            "regexp": 2
+            "regexp": 2,
+            "kvweight": 1
         },
         "fields": {
             "projectname": {
@@ -14,14 +15,23 @@
                 "table": 3,
                 "colon": 3,
                 "space": 3,
-                "regexp": 1
+                "regexp": 1,
+                "kvweight": 1
             },
             "winner": {
                 "table": 3,
                 "colon": 3,
                 "space": 3,
                 "regexp": 2,
-                "winnerorder": 3
+                "winnerorder": 3,
+                "kvweight": 1
+            },
+            "buyertel": {
+                "table": 3,
+                "colon": 3,
+                "space": 3,
+                "regexp": 2,
+                "kvweight": 5
             }
         }
     },
@@ -196,7 +206,7 @@
             {
                 "describe": "非结尾",
                 "regstr": ".*[^集团|公司|学校|中心|家具城|门诊|\\[大中小\\]学|部|院|局|厂|店|所|队|社|室|厅|段|会|场|行]$",
-                "score": -10
+                "score": -5
             }
         ]
     },