瀏覽代碼

项目名称优化、打分逻辑扩展

zhangjinkun 6 年之前
父節點
當前提交
3ec72cfa62
共有 5 個文件被更改,包括 261 次插入 和 94 次删除
  1. 10 10
      src/jy/extract/extract.go
  2. 2 1
      src/jy/extract/extractInit.go
  3. 41 22
      src/jy/extract/score.go
  4. 11 10
      src/jy/util/article.go
  5. 197 51
      src/res/fieldscore.json

+ 10 - 10
src/jy/extract/extract.go

@@ -30,7 +30,7 @@ var (
 	ClearTaskList map[string]*ClearTask                  //清理任务列表
 	saveLimit     = 200                                  //抽取日志批量保存
 	PageSize      = 5000                                 //查询分页
-	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
+	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
 
@@ -231,10 +231,10 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
 	} else {
 		detail = d2
 	}
-	detail = ju.CutLableStr(detail)
-	detail = cut.ClearHtml(detail)
+	d3, _ := doc["summary"].(string)
+	detail = ju.CutLableStr(d3 + "\n" + detail)
+	detail = cut.ClearHtml(d3 + "\n" + detail)
 	doc["detail"] = detail
-
 	if isextFile {
 		file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
 	}
@@ -385,10 +385,10 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				//项目名称未能抽取到,标题来凑
 				if vc.Field == "projectname" {
 					//if len(j.Result[vc.Field]) < 1 {//如果抽取有结果,不走标题。待验证,暂时标题加入选举逻辑
-					field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
+					field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
 					if tmp["blocktag"] != nil {
-						btag:= make(map[string]string)
-						for k := range tmp["blocktag"].(map[string]bool){
+						btag := make(map[string]string)
+						for k := range tmp["blocktag"].(map[string]bool) {
 							btag[k] = TagConfigDesc[k]
 						}
 						field.BlockTag = btag
@@ -626,7 +626,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 						field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
 						if tmp["blocktag"] != nil {
 							btag := make(map[string]string)
-							for k := range tmp["blocktag"].(map[string]bool){
+							for k := range tmp["blocktag"].(map[string]bool) {
 								btag[k] = TagConfigDesc[k]
 							}
 							field.BlockTag = btag
@@ -658,7 +658,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 			} else {
 				for _, v := range j.Block {
 					btag := make(map[string]string)
-					for k:=range v.Classify{
+					for k := range v.Classify {
 						btag[k] = TagConfigDesc[k]
 					}
 					extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in)
@@ -1409,7 +1409,7 @@ func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
 
 //去重冗余字段
 func delFiled(k string) bool {
-	return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
+	return k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
 }
 
 func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) {

+ 2 - 1
src/jy/extract/extractInit.go

@@ -31,6 +31,7 @@ type ExtReg struct {
 	NumSign    int //正负修正标记,例如浮动率(上浮正1、下浮负-1)
 }
 type RuleCore struct {
+	Id        string        //id
 	Field     string        //逻辑字段
 	LuaLogic  string        //进入逻辑
 	ExtFrom   string        //从哪个字段抽取
@@ -308,7 +309,7 @@ func (e *ExtractTask) InitRuleCore() {
 			if b, _ := vv["isuse"].(bool); !b {
 				continue
 			}
-			rcore := &RuleCore{}
+			rcore := &RuleCore{Id: qu.BsonIdToSId(vv["_id"])}
 			rcore.Field = s_field
 			rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
 			rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)

+ 41 - 22
src/jy/extract/score.go

@@ -12,16 +12,25 @@ import (
 	"unicode/utf8"
 )
 
-var SoreConfig map[string]map[string]interface{}
-var TagConfig map[string]map[string]float64
-var TagConfigDesc map[string]string
-var TitleScore float64
+var (
+	SoreConfig    map[string]map[string]interface{}
+	TagConfig     map[string]map[string]float64
+	TagConfigDesc map[string]string
+
+	TitleScore, RepeatScore, BlockScore float64
+)
 
 func init() {
 	qu.ReadConfig("./res/tagscoredesc.json", &TagConfigDesc)
 	qu.ReadConfig("./res/tagscore.json", &TagConfig)
 	qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
 	TitleScore = qu.Float64All(SoreConfig["extractype"]["title"])
+	if repeat, ok := SoreConfig["other"]["repeat"].(map[string]interface{}); ok {
+		RepeatScore = qu.Float64All(repeat["score"])
+	}
+	if block, ok := SoreConfig["other"]["block"].(map[string]interface{}); ok {
+		BlockScore = qu.Float64All(block["score"])
+	}
 	//实例化正则
 	for _, tmp := range SoreConfig {
 		//log.Println(tmp)
@@ -89,8 +98,8 @@ func init() {
 
 //结果打分
 func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
-	result := j.Result
 	qu.Catch()
+	result := j.Result
 	for field, tmps := range result {
 		for tmpsindex, tmpsvalue := range tmps {
 			if tmpsvalue.ExtFrom == "title" { //标题打分初始化
@@ -107,8 +116,8 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 						qz = TagConfig[key][field]
 					}
 				}
-				tmps[tmpsindex].Score += 2 * qz //乘以权重系数
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", ScoreFrom: "tagscore.json", Value: tmpsvalue.Value, Score: 2 * qz})
+				tmps[tmpsindex].Score += BlockScore * qz //乘以权重系数
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", ScoreFrom: "tagscore.json", Value: tmpsvalue.Value, Score: BlockScore * qz})
 			} else {
 				//没有段标签,走其他
 				//qz := TagConfig["其他"][field]
@@ -146,21 +155,18 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				if lengths, ok := scoreRule["length"].([]interface{}); ok {
 					for _, tmp := range lengths {
 						if length, ok := tmp.(map[string]interface{}); ok {
-							min := qu.IntAll(length["min"])
-							max := qu.IntAll(length["max"])
-							scores, _ := length["score"].([]interface{})
-							if len(scores) < 3 {
-								continue
-							}
-							if valueLen < min {
-								tmps[tmpsindex].Score += qu.Float64All(scores[0])
-								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, "<", min), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
-							} else if valueLen > max {
-								tmps[tmpsindex].Score += qu.Float64All(scores[2])
-								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, ">", max), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
-							} else {
-								tmps[tmpsindex].Score += qu.Float64All(scores[1])
-								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, ">", min, "&&", valueLen, "<", max), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
+							if ranges, ok := length["range"].([]interface{}); ok {
+								gt := qu.IntAll(ranges[0])
+								lte := qu.IntAll(ranges[1])
+								if lte < 0 { //∞
+									lte = 999999
+								}
+								score := qu.Float64All(ranges[2])
+								if valueLen > gt && valueLen <= lte {
+									tmps[tmpsindex].Score += score
+									tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(gt, "<", valueLen, "<=", lte), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: score})
+									break
+								}
 							}
 						}
 					}
@@ -259,6 +265,19 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				}
 			}
 		}
+		//计算重复值,并加分=重复数量*乘系数
+		valrepeat := map[string]int{}
+		for _, v := range tmps {
+			valrepeat[fmt.Sprint(v.Value)] += 1
+		}
+		for index, v := range tmps {
+			v.ValRepeat = valrepeat[fmt.Sprint(v.Value)] - 1
+			if v.ValRepeat > 0 {
+				score := RepeatScore * float64(v.ValRepeat)
+				v.Score += score
+				tmps[index].ScoreItem = append(tmps[index].ScoreItem, &ju.ScoreItem{Des: "重复次数打分repeat", Code: field + ".repeat", RuleText: "repeat:" + fmt.Sprint(v.ValRepeat), ScoreFrom: "fieldscore.json." + field, Value: v.Value, Score: score})
+			}
+		}
 	}
 	return result
 }

+ 11 - 10
src/jy/util/article.go

@@ -39,16 +39,17 @@ type Job struct {
 
 type ExtField struct {
 	BlockTag    map[string]string //块标签
-	Field       string          //属性
-	Code        string          //匹配标签(字符串、正则)、正则或lua代码
-	RuleText    string          //内容
-	Type        string          //kv(细类:colon1,colon2,space,table)、正则(regexp)
-	MatchType   string          //匹配类型:1:标签库类型(tag_string,tag_regexp),2:全文正则regcontent
-	ExtFrom     string          //抽取来源(title,detail)
-	SourceValue interface{}     //抽取结果--未清理
-	Value       interface{}     //抽取结果
-	Score       float64         //得分
-	ScoreItem   []*ScoreItem    //打分项
+	Field       string            //属性
+	Code        string            //匹配标签(字符串、正则)、正则或lua代码
+	RuleText    string            //内容
+	Type        string            //kv(细类:colon1,colon2,space,table)、正则(regexp)
+	MatchType   string            //匹配类型:1:标签库类型(tag_string,tag_regexp),2:全文正则regcontent
+	ExtFrom     string            //抽取来源(title,detail)
+	SourceValue interface{}       //抽取结果--未清理
+	Value       interface{}       //抽取结果
+	Score       float64           //得分
+	ScoreItem   []*ScoreItem      //打分项
+	ValRepeat   int               //结果值重复次数,打分参考
 }
 
 //打分项

+ 197 - 51
src/res/fieldscore.json

@@ -8,6 +8,16 @@
         "regexp": 2,
         "winnerorder": 3
     },
+    "other": {
+        "block": {
+            "describe": "块属性基础分值",
+            "score": 0.5
+        },
+        "repeat": {
+            "describe": "重复项:分值*重复次数",
+            "score": 0.1
+        }
+    },
     "projectname": {
         "type": "string",
         "positivewords": [
@@ -20,24 +30,62 @@
         "negativewords": [
             {
                 "describe": "以*开头",
-                "regstr": "^(关于|\\[|【)",
-                "score": -10
+                "regstr": "^(\\[|【)",
+                "score": -2
             },
             {
                 "describe": "以*结尾",
-                "regstr": ".{2,100}(的|招标|公示|公告|谈判|公开|通知|采购文件)$",
-                "score": -2
+                "regstr": ".{2,100}(的|招标|公示|公告|谈判|公开|通知|采购文件|交易中心)$",
+                "score": -5
             }
         ],
         "length": [
             {
-                "describe": "长度打分min>val:-6,min<=val<=max:1,max<val:-1",
-                "min": 4,
-                "max": 40,
-                "score": [
-                    -10,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
                     3,
-                    -1
+                    -2
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    3,
+                    5,
+                    0
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    5,
+                    10,
+                    1
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    10,
+                    35,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    35,
+                    45,
+                    1
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    45,
+                    -1,
+                    -2
                 ]
             }
         ]
@@ -60,12 +108,26 @@
         ],
         "length": [
             {
-                "describe": "长度打分min>val:-6,min<=val<=max:1,max<val:-1",
-                "min": 4,
-                "max": 25,
-                "score": [
-                    -10,
-                    3,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    4,
+                    -10
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    4,
+                    25,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    25,
+                    -1,
                     -1
                 ]
             }
@@ -89,12 +151,26 @@
         ],
         "length": [
             {
-                "describe": "长度打分min>val:-6,min<=val<=max:1,max<val:-1",
-                "min": 4,
-                "max": 20,
-                "score": [
-                    -10,
-                    3,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    4,
+                    -10
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    4,
+                    35,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    35,
+                    -1,
                     -1
                 ]
             }
@@ -124,12 +200,26 @@
         "negativewords": [],
         "length": [
             {
-                "describe": "长度打分min>val:-6,min<=val<=max:1,max<val:-1",
-                "min": 4,
-                "max": 30,
-                "score": [
-                    -10,
-                    3,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    4,
+                    -10
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    4,
+                    30,
+                    3
+                ]
+            },
+            {
+                "describe": "长度[gt,∞,score]",
+                "range": [
+                    30,
+                    -1,
                     -1
                 ]
             }
@@ -147,12 +237,26 @@
         "negativewords": [],
         "length": [
             {
-                "describe": "长度打分min>val:0,min<=val<=max:1,max<val:-1",
-                "min": 2,
-                "max": 6,
-                "score": [
-                    -5,
-                    3,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    1,
+                    -5
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    1,
+                    7,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    7,
+                    -1,
                     -1
                 ]
             }
@@ -170,12 +274,26 @@
         ],
         "length": [
             {
-                "describe": "长度打分min>val:-1,min<=val<=max:1,max<val:-1",
-                "min": 7,
-                "max": 14,
-                "score": [
-                    -5,
-                    3,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    6,
+                    -5
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    6,
+                    14,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    14,
+                    -1,
                     -1
                 ]
             }
@@ -208,12 +326,26 @@
         ],
         "length": [
             {
-                "describe": "长度打分min>val:-1,min<=val<=max:1,max<val:-1",
-                "min": 4,
-                "max": 30,
-                "score": [
-                    -5,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
                     3,
+                    -5
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    3,
+                    30,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    30,
+                    -1,
                     -1
                 ]
             }
@@ -256,12 +388,26 @@
         "type": "string",
         "length": [
             {
-                "describe": "项目范围字数",
-                "min": 2,
-                "max": 500,
-                "score": [
-                    -10,
-                    3,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    2,
+                    -10
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    2,
+                    500,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    500,
+                    -1,
                     -1
                 ]
             }