Browse Source

kv权重打分调整

zhangjinkun 6 years ago
parent
commit
557fbc1bb6
3 changed files with 67 additions and 24 deletions
  1. + 11 - 5
      src/jy/extract/extract.go
  2. + 43 - 16
      src/jy/extract/score.go
  3. + 13 - 3
      src/res/fieldscore.json

+ 11 - 5
src/jy/extract/extract.go

@@ -22,7 +22,7 @@ import (
 )
 
 var (
-	lock, lockrule, lockclear sync.RWMutex
+	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
 	cut           = ju.NewCut()                          //获取正文并清理
 	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
@@ -394,7 +394,9 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					if tmp["blocktag"] != nil {
 						btag := make(map[string]string)
 						for k := range tmp["blocktag"].(map[string]bool) {
+							blocktag.Lock()
 							btag[k] = TagConfigDesc[k]
+							blocktag.Unlock()
 						}
 						field.BlockTag = btag
 					}
@@ -660,9 +662,11 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 						if tmp["blocktag"] != nil {
 							btag := make(map[string]string)
 							for k := range tmp["blocktag"].(map[string]bool) {
+								blocktag.Lock()
 								if TagConfigDesc[k] != "" {
 									btag[k] = TagConfigDesc[k]
 								}
+								blocktag.Unlock()
 							}
 							field.BlockTag = btag
 						}
@@ -694,7 +698,9 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 				for _, v := range j.Block {
 					btag := make(map[string]string)
 					for k := range v.Classify {
+						blocktag.Lock()
 						btag[k] = TagConfigDesc[k]
+						blocktag.Unlock()
 					}
 					extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in)
 					if len(extinfo) > 0 {
@@ -1093,7 +1099,7 @@ type FieldValue struct {
 //分析抽取结果并保存
 func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 	qu.Try(func() {
-		doc, result, _id := funcAnalysis(j)
+		doc, result, _id := funcAnalysis(j, e.Tag)
 		if isSaveTag, _ := ju.Config["isSaveTag"].(bool); isSaveTag {
 			go otherNeedSave(j, result, e)
 		}
@@ -1118,7 +1124,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		//处理附件
 		var resultf map[string][]*ju.ExtField
 		if jf != nil {
-			_, resultf, _ = funcAnalysis(jf)
+			_, resultf, _ = funcAnalysis(jf, e.Tag)
 			auxinfof := auxInfo(jf)
 			tmp["fieldallf"] = auxinfof
 			ffield := map[string]interface{}{}
@@ -1354,12 +1360,12 @@ func delFiled(k string) bool {
 	return k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
 }
 
-func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
+func funcAnalysis(j *ju.Job, ftag map[string][]*Tag) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
 	defer qu.Catch()
 	doc := j.Data
 	result := j.Result
 	_id := qu.BsonIdToSId((*doc)["_id"])
-	result = ScoreFields(j)
+	result = ScoreFields(j, ftag)
 
 	//结果排序
 	for _, val := range result {

+ 43 - 16
src/jy/extract/score.go

@@ -9,10 +9,12 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"sync"
 	"unicode/utf8"
 )
 
 var (
+	lockscore     sync.RWMutex
 	SoreConfig    map[string]map[string]interface{}
 	TagConfig     map[string]map[string]float64
 	TagConfigDesc map[string]string
@@ -120,21 +122,28 @@ func init() {
 }
 
 //结果打分
-func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
+func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 	qu.Catch()
 	result := j.Result
 	for field, tmps := range result {
+		locktag.Lock()
+		taglength := len(ftag[field])
+		locktag.Unlock()
 		for tmpsindex, tmpsvalue := range tmps {
+			lockscore.Lock()
 			describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
+			lockscore.Unlock()
 			//是否有段标签
 			if len(tmpsvalue.BlockTag) > 0 {
 				//有标签段
 				var qz float64 = 0.0 //取权重最高的
 				for key := range tmpsvalue.BlockTag {
 					//key = "其他"//TODO 测试用
+					lockscore.Lock()
 					if TagConfig[key][field] > qz {
 						qz = TagConfig[key][field]
 					}
+					lockscore.Unlock()
 				}
 				tmps[tmpsindex].Score += ju.FloatFormat(BlockScore*qz, 4) //乘以权重系数
 				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", ScoreFrom: "tagscore.json", Value: tmpsvalue.Value, Score: BlockScore * qz})
@@ -145,31 +154,49 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 			}
 
 			//抽取类型打分
-			if FieldsScore[field] != nil { //指定抽取属性打分配置
-				fieldscore := FieldsScore[field]
+			lockscore.Lock()
+			fieldscore := FieldsScore[field]
+			typescore := float64(0)
+			titlescore := float64(0)
+			if fieldscore != nil { //指定抽取属性打分配置
 				if tmpsvalue.ExtFrom == "title" { //标题打分初始化
-					tmps[tmpsindex].Score += fieldscore["title"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["title"]})
+					titlescore = fieldscore["title"]
 				}
-				tmps[tmpsindex].Score += fieldscore[tmpsvalue.Type]
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore[tmpsvalue.Type]})
+				typescore = fieldscore[tmpsvalue.Type]
 			} else { //通用抽取属性打分配置
 				if tmpsvalue.ExtFrom == "title" { //标题打分初始化
-					tmps[tmpsindex].Score += CommonScore["title"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
+					titlescore = CommonScore["title"]
 				}
-				tmps[tmpsindex].Score += CommonScore[tmpsvalue.Type]
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore[tmpsvalue.Type]})
+				typescore = CommonScore[tmpsvalue.Type]
 			}
+			lockscore.Unlock()
+
+			tmps[tmpsindex].Score += titlescore
+			tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: titlescore})
+			tmps[tmpsindex].Score += typescore
+			tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: typescore})
+
 			//kv权重打分
-			if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
-				weightscore := ju.FloatFormat(1+float64(tmps[tmpsindex].Weight)/1000, 4)
-				tmps[tmpsindex].Score += weightscore
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})
+			if fieldscore != nil { //指定抽取属性打分配置
+				if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
+					weightscore := ju.FloatFormat(float64(qu.Float64All(fieldscore["kvweight"]))+float64(tmps[tmpsindex].Weight)/float64(taglength), 4)
+					tmps[tmpsindex].Score += weightscore
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})
+				} else {
+					//正则权重,暂不考虑
+				}
 			} else {
-				//正则权重,暂不考虑
+				if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
+					weightscore := ju.FloatFormat(float64(qu.Float64All(CommonScore["kvweight"]))+float64(tmps[tmpsindex].Weight)/float64(taglength), 4)
+					tmps[tmpsindex].Score += weightscore
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})
+				} else {
+					//正则权重,暂不考虑
+				}
 			}
+			lockscore.Lock()
 			scoreRule := SoreConfig[field]
+			lockscore.Unlock()
 			if scoreRule == nil {
 				continue
 			}

+ 13 - 3
src/res/fieldscore.json

@@ -6,7 +6,8 @@
             "table": 3,
             "colon": 3,
             "space": 3,
-            "regexp": 2
+            "regexp": 2,
+            "kvweight": 1
         },
         "fields": {
             "projectname": {
@@ -14,14 +15,23 @@
                 "table": 3,
                 "colon": 3,
                 "space": 3,
-                "regexp": 1
+                "regexp": 1,
+                "kvweight": 1
             },
             "winner": {
                 "table": 3,
                 "colon": 3,
                 "space": 3,
                 "regexp": 2,
-                "winnerorder": 3
+                "winnerorder": 3,
+                "kvweight": 1
+            },
+            "buyertel": {
+                "table": 3,
+                "colon": 3,
+                "space": 3,
+                "regexp": 2,
+                "kvweight": 5
             }
         }
     },