Kaynağa Gözat

打分调整、对比查询增加属性

zhangjinkun 6 yıl önce
ebeveyn
işleme
bd81346886

+ 0 - 1
src/config.json

@@ -9,7 +9,6 @@
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
     "saveresult": true,
-    "fieldscore": true,
     "qualityaudit": false,
     "saveblock": true,
     "filelength": 100000,

+ 10 - 9
src/jy/extract/extract.go

@@ -24,12 +24,12 @@ import (
 var (
 	lock, lockrule, lockclear sync.RWMutex
 
-	cut     = ju.NewCut()                          //获取正文并清理
-	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask          //任务列表
-	ClearTaskList map[string]*ClearTask            //清理任务列表
-	saveLimit     = 200                            //抽取日志批量保存
-	PageSize      = 5000                           //查询分页
+	cut           = ju.NewCut()                          //获取正文并清理
+	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask                //任务列表
+	ClearTaskList map[string]*ClearTask                  //清理任务列表
+	saveLimit     = 200                                  //抽取日志批量保存
+	PageSize      = 5000                                 //查询分页
 	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -869,6 +869,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						continue
 					}
 					val := text[pos[p]:pos[p+1]]
+					sourcevalue := val
 					if val == "招标公告" {
 						return extinfo
 					}
@@ -892,7 +893,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						if v.RegCore.NumSign == -1 { //正负值修正
 							val = "-" + val
 						}
-						exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
+						exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: sourcevalue, Value: val}
 						if extfrom == "title" {
 							exfield.Score = 4
 						}
@@ -1336,7 +1337,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if len(blocks) > 0 {
 			tmp["blocks"] = blocks
 		}
-		tmp["extract_content"] = j.Content
+		//tmp["extract_content"] = j.Content
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
 				for field, _ := range e.Fields {
@@ -1554,7 +1555,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 { //reids未找到,执行规则匹配
+	if i == 0 {                            //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 3 - 2
src/jy/extract/extractInit.go

@@ -369,11 +369,12 @@ func (e *ExtractTask) InitRuleCore() {
 						tmp := strings.Split(rinfo.RuleText, "__")
 						var pattern string
 						if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
 							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 						} else {
 							pattern = tmp[0]
 						}
-						pattern, _ = strconv.Unquote(`"` + pattern + `"`)
 						if len(tmp) == 2 {
 							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 						} else {
@@ -413,12 +414,12 @@ func (e *ExtractTask) InitRuleCore() {
 						tmp := strings.Split(rinfo.RuleText, "__")
 						var pattern string
 						if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
 							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 						} else {
 							pattern = tmp[0]
 						}
-						pattern, _ = strconv.Unquote(`"` + pattern + `"`)
 						if len(tmp) == 2 {
 							epos := strings.Split(tmp[1], ",")
 							posm := map[string]int{}

+ 10 - 17
src/jy/extract/score.go

@@ -108,29 +108,22 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				//qz := TagConfig["其他"][field]
 				//tmps[tmpsindex].Score += 2 * qz //乘以权重系数
 			}
-			if tmpsvalue.ExtFrom != "title" { //非标题抽取
-				//是否有kv值
-				if strings.Contains(tmpsvalue.Type, "colon") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "colonkv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])})
-				} else if strings.Contains(tmpsvalue.Type, "space") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "spacekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])})
-				} else if strings.Contains(tmpsvalue.Type, "table") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "tablekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])})
-				}
+			//是否有kv值
+			if strings.Contains(tmpsvalue.Type, "colon") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "colonkv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])})
+			} else if strings.Contains(tmpsvalue.Type, "space") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "spacekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])})
+			} else if strings.Contains(tmpsvalue.Type, "table") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "tablekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])})
 			}
 			if tmpsvalue.ExtFrom != "title" { //非标题抽取
 				if strings.Contains(tmpsvalue.Type, "regexp") {
 					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "regexp", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])})
 				}
-			} else {
-				if strings.Contains(tmpsvalue.Type, "regexp") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"]) + 1
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "regexp", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"]) + 1})
-				}
 			}
 			scoreRule := SoreConfig[field]
 			if scoreRule == nil {

+ 3 - 3
src/res/fieldscore.json

@@ -3,8 +3,8 @@
         "describe": "抽取类型打分",
         "title": 4,
         "table": 3,
-        "colon": 2,
-        "space": 2,
+        "colon": 3,
+        "space": 3,
         "regexp": 2,
         "winnerorder": 3
     },
@@ -32,7 +32,7 @@
         "length": [
             {
                 "describe": "长度打分min>val:-6,min<=val<=max:1,max<val:-1",
-                "min": 4,
+                "min": 5,
                 "max": 35,
                 "score": [
                     -10,

+ 1 - 1
versioncomparison/config.json

@@ -17,7 +17,7 @@
         "buyertel",
         "buyeraddr",
         "agencyperson",
-        "agencytel",5d39d253a5cb26b9b7404ae1,5d3b23aaa5cb26b9b7c1ec59
+        "agencytel",
         "agencyaddr"
     ]
 }

+ 16 - 10
versioncomparison/main.go

@@ -15,13 +15,14 @@ import (
 )
 
 var (
-	SysConfig map[string]interface{}
-	Premgo    *mongodbutil.Pool //上个版本库
-	Newmgo    *mongodbutil.Pool //当前版本库
-	FieldData map[string]map[string]*Data
-	Compares  map[string]*Compare
-	Sid, Eid  string
-	Fields    []string
+	SysConfig   map[string]interface{}
+	Premgo      *mongodbutil.Pool //上个版本库
+	Newmgo      *mongodbutil.Pool //当前版本库
+	FieldData   map[string]map[string]*Data
+	Compares    map[string]*Compare
+	Sid, Eid    string
+	Fields      []string
+	FieldsQuery string
 )
 
 type Compare struct {
@@ -44,8 +45,13 @@ func init() {
 	Premgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["premgo"]), qu.ObjToString(SysConfig["predb"]))
 	Newmgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["newmgo"]), qu.ObjToString(SysConfig["newdb"]))
 	tmp, _ := SysConfig["fields"].([]interface{})
-	for _, v := range tmp {
+	for k, v := range tmp {
 		Fields = append(Fields, qu.ObjToString(v))
+		if k < (len(tmp) - 1) {
+			FieldsQuery += `"` + qu.ObjToString(v) + `":1,`
+		} else {
+			FieldsQuery += `"` + qu.ObjToString(v) + `":1`
+		}
 	}
 	FieldData = map[string]map[string]*Data{}
 	Compares = map[string]*Compare{}
@@ -110,7 +116,7 @@ func createXlsx() {
 func getVersionData() {
 	query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(Sid), "$lte": bson.ObjectIdHex(Eid)}}
 	log.Println(qu.ObjToString(SysConfig["prec"]), query)
-	list1, _ := Premgo.Find(qu.ObjToString(SysConfig["prec"]), query, nil, `{}`, false, -1, -1)
+	list1, _ := Premgo.Find(qu.ObjToString(SysConfig["prec"]), query, nil, `{`+FieldsQuery+`}`, false, -1, -1)
 	for _, v := range *list1 {
 		for _, key := range Fields {
 			rd := FieldData[key]
@@ -126,7 +132,7 @@ func getVersionData() {
 	}
 	log.Println("pre version 加载完成")
 
-	list2, _ := Newmgo.Find(qu.ObjToString(SysConfig["newc"]), query, nil, `{}`, false, -1, -1)
+	list2, _ := Newmgo.Find(qu.ObjToString(SysConfig["newc"]), query, nil, `{`+FieldsQuery+`}`, false, -1, -1)
 	for _, v := range *list2 {
 		for _, field := range Fields {
 			rd := FieldData[field]