Browse Source

1.全文抽取改成块抽取;2.记录打分流程

fengweiqiang 6 years ago
parent
commit
f7d00ef7af
4 changed files with 137 additions and 174 deletions
  1. 77 33
      src/jy/extract/extract.go
  2. 23 132
      src/jy/extract/score.go
  3. 22 9
      src/jy/util/article.go
  4. 15 0
      src/res/fieldscore.json

+ 77 - 33
src/jy/extract/extract.go

@@ -13,6 +13,7 @@ import (
 	"reflect"
 	"regexp"
 	"strconv"
+	"strings"
 	"sync"
 	"time"
 	"unicode/utf8"
@@ -22,13 +23,13 @@ import (
 )
 
 var (
-	lock          sync.RWMutex
-	cut           = ju.NewCut()                          // extracts the body text and cleans it
-	ExtLogs       map[*TaskInfo][]map[string]interface{} // extraction logs
-	TaskList      map[string]*ExtractTask                // task list
-	ClearTaskList map[string]*ClearTask                  // clear-task list
-	saveLimit     = 200                                  // batch size for saving extraction logs
-	PageSize      = 5000                                 // query page size
+	lock    sync.RWMutex
+	cut     = ju.NewCut()                          // extracts the body text and cleans it
+	ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs
+	TaskList      map[string]*ExtractTask          // task list
+	ClearTaskList map[string]*ClearTask            // clear-task list
+	saveLimit     = 200                            // batch size for saving extraction logs
+	PageSize      = 5000                           // query page size
 	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -370,13 +371,13 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					//项目名称未能抽取到,标题来凑
 					if vc.Field == "projectname" {
 						if len(j.Result[vc.Field]) < 1 {
+							items := make([]*ju.ScoreItem, 1)
+							items[0] = &ju.ScoreItem{Des:"项目名称未能抽取到,标题来凑初始化",ExtFrom: "title", Value: j.Title, Score: 4}
+							field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
 							if tmp["blocktag"] != nil {
-								j.Result[vc.Field] = append(j.Result[vc.Field],
-									&ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
-							} else {
-								j.Result[vc.Field] = append(j.Result[vc.Field],
-									&ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
+								field.BlockTag = tmp["blocktag"].(map[string]bool)
 							}
+							j.Result[vc.Field] = append(j.Result[vc.Field],field)
 							//j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
 						}
 					}
@@ -410,12 +411,14 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 
 					//项目名称未能抽取到,标题来凑
 					if vc.Field == "projectname" {
+						items := make([]*ju.ScoreItem, 1)
+						items[0] = &ju.ScoreItem{Des:"项目名称未能抽取到,标题来凑初始化",ExtFrom: "title", Value: j.Title, Score: 4}
+						field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
 						if len(j.Result[vc.Field]) < 1 {
 							if tmp["blocktag"] != nil {
-								j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
-							} else {
-								j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
+								field.BlockTag = tmp["blocktag"].(map[string]bool)
 							}
+							j.Result[vc.Field] = append(j.Result[vc.Field],field)
 							//j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
 						}
 					}
@@ -669,13 +672,19 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 				}
 				if tmps, ok := v.([]map[string]interface{}); ok {
 					for _, tmp := range tmps {
+						field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"], Score: 0}
 						if tmp["blocktag"] != nil {
-							j.Result[k] = append(j.Result[k],
-								&ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"], 0})
-						} else {
-							j.Result[k] = append(j.Result[k],
-								&ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"], Score: 0})
+							field.BlockTag = tmp["blocktag"].(map[string]bool)
 						}
+						item := &ju.ScoreItem{Des:"初始化",Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"], Score: 0}
+						if tmp["scoreitem"] == nil{
+							scoreItems := make([]*ju.ScoreItem, 0)
+							scoreItems = append(scoreItems, item)
+							field.ScoreItem = scoreItems
+						}else {
+							field.ScoreItem = append(field.ScoreItem, item)
+						}
+						j.Result[k] = append(j.Result[k],field)
 					}
 				}
 			}
@@ -685,11 +694,20 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 		}
 	} else {
 		//全文正则
-		text := qu.ObjToString(doc[extfrom])
+		//text := qu.ObjToString(doc[extfrom])
+		//if in.Field != "" {
+		//	extinfo := extRegCoreToResult(extfrom, text, j, in)
+		//	if len(extinfo) > 0 {
+		//		AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
+		//	}
+		//}
+		//块抽取
 		if in.Field != "" {
-			extinfo := extRegCoreToResult(extfrom, text, j, in)
-			if len(extinfo) > 0 {
-				AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
+			for _, v := range j.Block {
+				extinfo := extRegCoreToResult(extfrom, v.Text, &v.Tag, j, in)
+				if len(extinfo) > 0 {
+					AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
+				}
 			}
 		}
 	}
@@ -869,7 +887,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 }
 
 //正则提取结果
-func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
+func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
 	defer qu.Catch()
 	extinfo := map[string][]map[string]interface{}{}
 	if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
@@ -891,15 +909,24 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
 						"value":     val,
 						"type":      "regexp",
 						"matchtype": "regcontent",
+						"blocktag":  *tag,
 					}
 					tmps = append(tmps, tmp)
 					extinfo[k] = tmps
-					if val != "" {
+					if strings.TrimSpace(val) != "" {
+						exfield := ju.ExtField{Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val, Score: 0}
 						if tmp["blocktag"] != nil {
-							j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
+							exfield.BlockTag = tmp["blocktag"].(map[string]bool)
+						}
+						item := ju.ScoreItem{Des:"初始化",Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val, Score: 0}
+						if tmp["scoreitem"] == nil {
+							sitems := make([]*ju.ScoreItem, 0)
+							sitems = append(sitems, &item)
+							exfield.ScoreItem = sitems
 						} else {
-							j.Result[k] = append(j.Result[k], &ju.ExtField{nil, k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
+							exfield.ScoreItem = append(exfield.ScoreItem , &item)
 						}
+						j.Result[k] = append(j.Result[k], &exfield)
 						//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
 					}
 				}
@@ -926,17 +953,26 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
 				"value":     val,
 				"type":      "regexp",
 				"matchtype": "regcontent",
+				"blocktag":  *tag,
 			}
 			tmps = append(tmps, tmp)
 			extinfo[v.Field] = tmps
 			if j.Result[v.Field] == nil {
 				j.Result[v.Field] = [](*ju.ExtField){}
 			}
+			field := &ju.ExtField{Field: v.Field, Code:v.Code, RuleText:v.RuleText,Type: "regexp",MatchType: "regcontent", ExtFrom:extfrom,Value: val,Score: 0}
 			if tmp["blocktag"] != nil {
-				j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
+				field.BlockTag = tmp["blocktag"].(map[string]bool)
+			}
+			item := ju.ScoreItem{Des:"初始化",Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val, Score: 0}
+			if tmp["scoreitem"] == nil {
+				sitems := make([]*ju.ScoreItem, 0)
+				sitems = append(sitems, &item)
+				field.ScoreItem = sitems
 			} else {
-				j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{nil, v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
+				field.ScoreItem = append(field.ScoreItem , &item)
 			}
+			j.Result[v.Field] = append(j.Result[v.Field],field )
 		}
 	}
 	return extinfo
@@ -956,11 +992,19 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 			if tmps, ok := v.([]map[string]interface{}); ok {
 				j.Result[k] = [](*ju.ExtField){}
 				for _, tmp := range tmps {
+					field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
 					if tmp["blocktag"] != nil {
-						j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
+						field.BlockTag = tmp["blocktag"].(map[string]bool)
+					}
+					item := ju.ScoreItem{Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
+					if tmp["scoreitem"] == nil {
+						scoreItems := make([]*ju.ScoreItem, 0)
+						scoreItems = append(scoreItems, &item)
+						field.ScoreItem = scoreItems
 					} else {
-						j.Result[k] = append(j.Result[k], &ju.ExtField{nil, k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
+						field.ScoreItem = append(field.ScoreItem, &item)
 					}
+					j.Result[k] = append(j.Result[k], field)
 					//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
 				}
 			}
@@ -1374,7 +1418,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 {                            //reids未找到,执行规则匹配
+	if i == 0 { //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 23 - 132
src/jy/extract/score.go

@@ -100,6 +100,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 					}
 				}
 				tmps[tmpsindex].Score += 2 * qz //乘以权重系数
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: 2 * qz})
 			} else {
 				//没有段标签,走其他
 				//qz := TagConfig["其他"][field]
@@ -109,15 +110,19 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 			//是否有kv值
 			if strings.Contains(tmpsvalue.Type, "colon") {
 				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])})
 			} else if strings.Contains(tmpsvalue.Type, "space") {
 				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])})
 			} else if strings.Contains(tmpsvalue.Type, "table") {
 				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])})
 			}
 
 			//正则
 			if strings.Contains(tmpsvalue.Type, "regexp") {
 				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])})
 			}
 
 			scoreRule := SoreConfig[field]
@@ -127,6 +132,13 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 			if scoreRule["type"] == "string" {
 				//1.长度打分
 				valueLen := utf8.RuneCountInString(fmt.Sprint(tmpsvalue.Value))
+				if valueLen < 1 {
+					continue
+				}
+				if valueLen > 100 && field != "projectscope" {
+					tmps[tmpsindex].Score = -99
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen > 100 && field != "projectscope"直接-99分`, Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: -99})
+				}
 				if lengths, ok := scoreRule["length"].([]interface{}); ok {
 					for _, tmp := range lengths {
 						if length, ok := tmp.(map[string]interface{}); ok {
@@ -138,10 +150,13 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 							}
 							if valueLen < min {
 								tmps[tmpsindex].Score += qu.Float64All(scores[0])
+								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
 							} else if valueLen > max {
 								tmps[tmpsindex].Score += qu.Float64All(scores[2])
+								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
 							} else {
 								tmps[tmpsindex].Score += qu.Float64All(scores[1])
+								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score:qu.Float64All(scores[1])})
 							}
 						}
 					}
@@ -155,6 +170,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 									reg := p["regexp"].(*regexp.Regexp)
 									if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
 										tmps[tmpsindex].Score += qu.Float64All(p["score"])
+										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "负面词打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
 									}
 								}
 							}, func(err interface{}) {
@@ -172,6 +188,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 									reg := p["regexp"].(*regexp.Regexp)
 									if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
 										tmps[tmpsindex].Score += qu.Float64All(p["score"])
+										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "正面词打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
 									}
 								}
 							}, func(err interface{}) {
@@ -189,6 +206,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 									reg := p["regexp"].(*regexp.Regexp)
 									if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
 										tmps[tmpsindex].Score += qu.Float64All(p["score"])
+										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "winnerorder", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
 									}
 								}
 							}, func(err interface{}) {
@@ -209,10 +227,13 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				}
 				if val < min && 0 < val {
 					tmps[tmpsindex].Score += qu.Float64All(scores[0])
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
 				} else if val > max {
 					tmps[tmpsindex].Score += qu.Float64All(scores[2])
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
 				} else if val <= max && val >= min {
 					tmps[tmpsindex].Score += qu.Float64All(scores[1])
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
 				}
 			}
 			//其他打分配置
@@ -227,143 +248,13 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				}
 				if val > max {
 					tmps[tmpsindex].Score += qu.Float64All(scores[2])
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
 				} else if val <= max && val > min {
 					tmps[tmpsindex].Score += qu.Float64All(scores[1])
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
 				}
 			}
 		}
 	}
-
 	return result
-	//qu.Try(func() {
-	//	//打分
-	//	for field, tmps := range result {
-	//		scoreRule := SoreConfig[field]
-	//		if scoreRule == nil {
-	//			continue
-	//		}
-	//extractype := SoreConfig["extractype"]
-	//fieldtype := scoreRule["type"]
-	//		for _, v := range tmps {
-	//			fmt.Println(v)
-	//	if len(fmt.Sprint(v.Value)) < 1 {
-	//		continue //空串跳过
-	//	}
-	//	//长度超过100个字,直接负分
-	//	vlen := len([]rune(qu.ObjToString(v.Value)))
-	//	if vlen > 100 && field != "projectscope" {
-	//		v.Score = -1
-	//	} else {
-	//		//类型打分
-	//		if v.ExtFrom == "title" {
-	//			v.Score += qu.IntAll(extractype["title"])
-	//		} else {
-	//			if strings.Contains(v.Type, "table") {
-	//				v.Score += qu.IntAll(extractype["table"])
-	//			} else if strings.Contains(v.Type, "colon") {
-	//				v.Score += qu.IntAll(extractype["colon"])
-	//			} else if strings.Contains(v.Type, "space") {
-	//				v.Score += qu.IntAll(extractype["space"])
-	//			} else if strings.Contains(v.Type, "regexp") {
-	//				v.Score += qu.IntAll(extractype["regexp"])
-	//			} else if strings.Contains(v.Type, "winnerorder") {
-	//				v.Score += qu.IntAll(extractype["winnerorder"])
-	//			}
-	//		}
-	//		//字符型打分
-	//		if fieldtype == "string" {
-	//			//位置打分
-	//			if positions, ok := scoreRule["position"].([]interface{}); ok {
-	//				for _, position := range positions {
-	//					if p, ok := position.(map[string]interface{}); ok {
-	//						qu.Try(func() {
-	//							if p["regexp"] != nil {
-	//								reg := p["regexp"].(*regexp.Regexp)
-	//								if reg.MatchString(qu.ObjToString(v.Value)) {
-	//									v.Score += qu.IntAll(p["score"])
-	//								}
-	//							}
-	//						}, func(err interface{}) {
-	//							log.Println(err)
-	//						})
-	//					}
-	//				}
-	//			}
-	//			//长度打分
-	//			if lengths, ok := scoreRule["length"].([]interface{}); ok {
-	//				for _, tmp := range lengths {
-	//					if length, ok := tmp.(map[string]interface{}); ok {
-	//						min := qu.IntAll(length["min"])
-	//						max := qu.IntAll(length["max"])
-	//						scores, _ := length["score"].([]interface{})
-	//						if len(scores) < 3 {
-	//							continue
-	//						}
-	//						if vlen < min {
-	//							v.Score += qu.IntAll(scores[0])
-	//						} else if vlen > max {
-	//							v.Score += qu.IntAll(scores[2])
-	//						} else {
-	//							v.Score += qu.IntAll(scores[1])
-	//						}
-	//					}
-	//				}
-	//			}
-	//			//
-	//			if winnerorders, ok := scoreRule["winnerorder"].([]interface{}); ok {
-	//				for _, winnerorder := range winnerorders {
-	//					if p, ok := winnerorder.(map[string]interface{}); ok {
-	//						qu.Try(func() {
-	//							if p["regexp"] != nil {
-	//								reg := p["regexp"].(*regexp.Regexp)
-	//								if reg.MatchString(qu.ObjToString(v.Value)) {
-	//									v.Score += qu.IntAll(p["score"])
-	//								}
-	//							}
-	//						}, func(err interface{}) {
-	//							log.Println(err)
-	//						})
-	//					}
-	//				}
-	//			}
-	//		}
-	//		//float类型打分
-	//		if fieldtype == "float" {
-	//			min := qu.IntAll(scoreRule["min"])
-	//			max := qu.IntAll(scoreRule["max"])
-	//			val := qu.IntAll(v.Value)
-	//			scores, _ := scoreRule["score"].([]interface{})
-	//			if len(scores) < 3 {
-	//				continue
-	//			}
-	//			if val < min && 0 < val {
-	//				v.Score += qu.IntAll(scores[0])
-	//			} else if val > max {
-	//				v.Score += qu.IntAll(scores[2])
-	//			} else if val <= max && val >= min {
-	//				v.Score += qu.IntAll(scores[1])
-	//			}
-	//		}
-	//		//decimal
-	//		if fieldtype == "decimal" {
-	//			min := qu.IntAll(scoreRule["min"])
-	//			max := qu.IntAll(scoreRule["max"])
-	//			val := qu.IntAll(v.Value)
-	//			scores, _ := scoreRule["score"].([]interface{})
-	//			if len(scores) < 3 {
-	//				continue
-	//			}
-	//			if val > max {
-	//				v.Score += qu.IntAll(scores[2])
-	//			} else if val <= max && val > min {
-	//				v.Score += qu.IntAll(scores[1])
-	//			}
-	//		}
-	//	}
-	//}
-	//}
-	//}, func(err interface{}) {
-	//	log.Println("ScoreFields err", err)
-	//})
-	//return result
 }

+ 22 - 9
src/jy/util/article.go

@@ -37,15 +37,28 @@ type Job struct {
 }
 
 type ExtField struct {
-	BlockTag  map[string]bool // block tags
-	Field     string          // field (attribute) name
-	Code      string          // matching tag (string or regexp), a regexp, or Lua code
-	RuleText  string          // rule content
-	Type      string          // kv (subtypes: colon1, colon2, space, table) or regexp
-	MatchType string          // match type: 1) tag-library types (tag_string, tag_regexp); 2) full-text regexp (regcontent)
-	ExtFrom   string          // extraction source (title, detail)
-	Value     interface{}     // extracted value
-	Score     float64         // score
+	BlockTag  map[string]bool         // block tags
+	Field     string                  // field (attribute) name
+	Code      string                  // matching tag (string or regexp), a regexp, or Lua code
+	RuleText  string                  // rule content
+	Type      string                  // kv (subtypes: colon1, colon2, space, table) or regexp
+	MatchType string                  // match type: 1) tag-library types (tag_string, tag_regexp); 2) full-text regexp (regcontent)
+	ExtFrom   string                  // extraction source (title, detail)
+	Value     interface{}             // extracted value
+	Score     float64                 // score
+	ScoreItem []*ScoreItem // score items recorded for this field
+}
+
+// ScoreItem is a single scoring entry attached to an ExtField via its ScoreItem slice.
+type ScoreItem struct {
+	Des       string      // description of this score entry
+	Code      string      // matching tag (string or regexp), a regexp, or Lua code
+	RuleText  string      // rule content
+	Type      string      // kv (subtypes: colon1, colon2, space, table) or regexp
+	MatchType string      // match type: 1) tag-library types (tag_string, tag_regexp); 2) full-text regexp (regcontent)
+	ExtFrom   string      // extraction source (title, detail)
+	Value     interface{} // extracted value
+	Score     float64     // resulting score for this entry
 }
 
 //分块规则

+ 15 - 0
src/res/fieldscore.json

@@ -243,5 +243,20 @@
             2,
             -3
         ]
+    },
+    "projectscope": {
+        "type": "string",
+        "length": [
+            {
+                "describe": "项目范围字数",
+                "min": 2,
+                "max": 500,
+                "score": [
+                    -5,
+                    3,
+                    -1
+                ]
+            }
+        ]
     }
 }