Browse Source

打分细化

zhangjinkun 6 years ago
parent
commit
44f7e49e68
2 changed files with 84 additions and 24 deletions
  1. +60 -17
      src/jy/extract/score.go
  2. +24 -7
      src/res/fieldscore.json

+ 60 - 17
src/jy/extract/score.go

@@ -17,20 +17,43 @@ var (
 	TagConfig     map[string]map[string]float64
 	TagConfigDesc map[string]string
 
-	TitleScore, RepeatScore, BlockScore float64
+	RepeatScore, BlockScore float64
+	CommonScore             map[string]float64
+	FieldsScore             map[string]map[string]float64
 )
 
 func init() {
 	qu.ReadConfig("./res/tagscoredesc.json", &TagConfigDesc)
 	qu.ReadConfig("./res/tagscore.json", &TagConfig)
 	qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
-	TitleScore = qu.Float64All(SoreConfig["extractype"]["title"])
 	if repeat, ok := SoreConfig["other"]["repeat"].(map[string]interface{}); ok {
 		RepeatScore = qu.Float64All(repeat["score"])
 	}
 	if block, ok := SoreConfig["other"]["block"].(map[string]interface{}); ok {
 		BlockScore = qu.Float64All(block["score"])
 	}
+	//通用抽取属性打分配置
+	if tmp, ok := SoreConfig["extractype"]["common"].(map[string]interface{}); ok {
+		CommonScore = map[string]float64{}
+		for k, v := range tmp {
+			CommonScore[k] = qu.Float64All(v)
+		}
+	}
+	log.Println(CommonScore)
+	//指定抽取属性打分配置
+	if tmp, ok := SoreConfig["extractype"]["fields"].(map[string]interface{}); ok {
+		FieldsScore = map[string]map[string]float64{}
+		for key, fieldmap := range tmp {
+			fieldscore := map[string]float64{}
+			if field, ok := fieldmap.(map[string]interface{}); ok {
+				for k, score := range field {
+					fieldscore[k] = qu.Float64All(score)
+				}
+			}
+			FieldsScore[key] = fieldscore
+		}
+	}
+	log.Println(FieldsScore)
 	//实例化正则
 	for _, tmp := range SoreConfig {
 		//log.Println(tmp)
@@ -102,9 +125,10 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 	result := j.Result
 	for field, tmps := range result {
 		for tmpsindex, tmpsvalue := range tmps {
+			describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
 			if tmpsvalue.ExtFrom == "title" { //标题打分初始化
-				tmps[tmpsindex].Score += TitleScore
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: TitleScore})
+				tmps[tmpsindex].Score += CommonScore["title"]
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
 			}
 			//是否有段标签
 			if len(tmpsvalue.BlockTag) > 0 {
@@ -125,23 +149,42 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 			}
 
 			//抽取类型打分
-			if strings.Contains(tmpsvalue.Type, "colon") {
-				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"])
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])})
-			} else if strings.Contains(tmpsvalue.Type, "space") {
-				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"])
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])})
-			} else if strings.Contains(tmpsvalue.Type, "table") {
-				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"])
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])})
-			} else if strings.Contains(tmpsvalue.Type, "regexp") {
-				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])})
+			if FieldsScore[field] != nil { //指定抽取属性打分配置
+				fieldscore := FieldsScore[field]
+				if strings.Contains(tmpsvalue.Type, "colon") {
+					tmps[tmpsindex].Score += fieldscore["colon"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["colon"]})
+				} else if strings.Contains(tmpsvalue.Type, "space") {
+					tmps[tmpsindex].Score += fieldscore["space"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["space"]})
+				} else if strings.Contains(tmpsvalue.Type, "table") {
+					tmps[tmpsindex].Score += fieldscore["table"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["table"]})
+				} else if strings.Contains(tmpsvalue.Type, "regexp") {
+					tmps[tmpsindex].Score += fieldscore["regexp"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["regexp"]})
+				}
+			} else { //通用抽取属性打分配置
+				if strings.Contains(tmpsvalue.Type, "colon") {
+					tmps[tmpsindex].Score += CommonScore["colon"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["colon"]})
+				} else if strings.Contains(tmpsvalue.Type, "space") {
+					tmps[tmpsindex].Score += CommonScore["space"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["space"]})
+				} else if strings.Contains(tmpsvalue.Type, "table") {
+					tmps[tmpsindex].Score += CommonScore["table"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["table"]})
+				} else if strings.Contains(tmpsvalue.Type, "regexp") {
+					tmps[tmpsindex].Score += CommonScore["regexp"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["regexp"]})
+				}
 			}
+
 			scoreRule := SoreConfig[field]
 			if scoreRule == nil {
 				continue
 			}
+			//配置打分
 			if scoreRule["type"] == "string" {
 				//1.长度打分
 				valueLen := utf8.RuneCountInString(fmt.Sprint(tmpsvalue.Value))
@@ -207,7 +250,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 						}
 					}
 				}
-				//4.位置打分
+				//4.中标候选人打分
 				if winnerorders, ok := scoreRule["winnerorder"].([]interface{}); ok {
 					for _, winnerorder := range winnerorders {
 						if p, ok := winnerorder.(map[string]interface{}); ok {

+ 24 - 7
src/res/fieldscore.json

@@ -1,12 +1,29 @@
 {
     "extractype": {
         "describe": "抽取类型打分",
-        "title": 2,
-        "table": 3,
-        "colon": 3,
-        "space": 3,
-        "regexp": 2,
-        "winnerorder": 3
+        "common": {
+            "title": 2,
+            "table": 3,
+            "colon": 3,
+            "space": 3,
+            "regexp": 2
+        },
+        "fields": {
+            "projectname": {
+                "title": 1,
+                "table": 3,
+                "colon": 3,
+                "space": 3,
+                "regexp": 2
+            },
+            "winner": {
+                "table": 3,
+                "colon": 3,
+                "space": 3,
+                "regexp": 2,
+                "winnerorder": 3
+            }
+        }
     },
     "other": {
         "block": {
@@ -69,7 +86,7 @@
                 "range": [
                     10,
                     35,
-                    3
+                    2
                 ]
             },
             {