Browse Source

增加辅助字段、排序调整

zhangjinkun 6 years ago
parent
commit
482aaffd36
3 changed files with 100 additions and 88 deletions
  1. +56 -78
      src/jy/extract/extract.go
  2. +11 -10
      src/jy/util/article.go
  3. +33 -0
      src/jy/util/sort.go

+ 56 - 78
src/jy/extract/extract.go

@@ -10,7 +10,6 @@ import (
 	ju "jy/util"
 	qu "qfw/util"
 	"qfw/util/redis"
-	"reflect"
 	"regexp"
 	"strconv"
 	"strings"
@@ -23,13 +22,13 @@ import (
 )
 
 var (
-	lock    sync.RWMutex
-	cut     = ju.NewCut()                          // gets the main text and cleans it
-	ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs
-	TaskList      map[string]*ExtractTask          // task list
-	ClearTaskList map[string]*ClearTask            // cleanup task list
-	saveLimit     = 200                            // batch size for saving extraction logs
-	PageSize      = 5000                           // query page size
+	lock          sync.RWMutex
+	cut           = ju.NewCut()                          // gets the main text and cleans it
+	ExtLogs       map[*TaskInfo][]map[string]interface{} // extraction logs
+	TaskList      map[string]*ExtractTask                // task list
+	ClearTaskList map[string]*ClearTask                  // cleanup task list
+	saveLimit     = 200                                  // batch size for saving extraction logs
+	PageSize      = 5000                                 // query page size
 	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -376,7 +375,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 						if len(j.Result[vc.Field]) < 1 {
 							items := make([]*ju.ScoreItem, 1)
 							items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
-							field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
+							field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "regexp", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title, Score: 4, ScoreItem: items}
 							if tmp["blocktag"] != nil {
 								field.BlockTag = tmp["blocktag"].(map[string]bool)
 							}
@@ -422,7 +421,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					if vc.Field == "projectname" {
 						items := make([]*ju.ScoreItem, 1)
 						items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
-						field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
+						field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "regexp", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title, Score: 4, ScoreItem: items}
 						if len(j.Result[vc.Field]) < 1 {
 							if tmp["blocktag"] != nil {
 								field.BlockTag = tmp["blocktag"].(map[string]bool)
@@ -683,7 +682,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 				}
 				if tmps, ok := v.([]map[string]interface{}); ok {
 					for _, tmp := range tmps {
-						field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"],}
+						field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"]}
 						if extfrom == "title" {
 							field.Score = 4
 						}
@@ -947,7 +946,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						if v.RegCore.NumSign == -1 { //正负值修正
 							val = "-" + val
 						}
-						exfield := ju.ExtField{BlockTag:*tag,Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
+						exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
 						if extfrom == "title" {
 							exfield.Score = 4
 						}
@@ -999,7 +998,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 			if j.Result[v.Field] == nil {
 				j.Result[v.Field] = [](*ju.ExtField){}
 			}
-			field := &ju.ExtField{BlockTag:*tag,Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
+			field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
 			if extfrom == "title" {
 				field.Score = 4
 			}
@@ -1203,13 +1202,15 @@ type FieldValue struct {
 //分析抽取结果并保存
 func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 	qu.Try(func() {
-		doc, result, _id, values := funcAnalysis(j)
+		doc, result, _id := funcAnalysis(j)
+		auxinfo := auxInfo(j)
 		//从排序结果中取值
 		tmp := map[string]interface{}{} //抽取值
-		for key, val := range values {
+		tmp["fieldall"] = auxinfo
+		for _, val := range result {
 			for _, v := range val { //取第一个非负数
-				if v.Key != "" && v.Value > -1 {
-					tmp[key] = v.Object
+				if v.Score > -1 {
+					tmp[v.Field] = v.Value
 					break
 				}
 			}
@@ -1222,14 +1223,15 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		}
 		//处理附件
 		var resultf map[string][]*ju.ExtField
-		var filevalues map[string][]*ju.SortObject
 		if jf != nil {
-			_, resultf, _, filevalues = funcAnalysis(jf)
+			_, resultf, _ = funcAnalysis(jf)
+			auxinfof := auxInfo(jf)
+			tmp["fieldallf"] = auxinfof
 			ffield := map[string]interface{}{}
-			for key, val := range filevalues {
+			for _, val := range resultf {
 				for _, v := range val { //取第一个非负数
-					if v.Key != "" && v.Value > -1 {
-						ffield[key] = v.Object
+					if v.Score > -1 {
+						ffield[v.Field] = v.Value
 						break
 					}
 				}
@@ -1375,72 +1377,48 @@ func delFiled(k string) bool {
 	return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
 }
 
-func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string, map[string][]*ju.SortObject) {
+func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) { // returns j.Data, the scored and sorted result map, and the document _id as a string
 	defer qu.Catch()
 	doc := j.Data
 	result := j.Result
 	_id := qu.BsonIdToSId((*doc)["_id"])
-	iscore, _ := ju.Config["fieldscore"].(bool)
-	if iscore { // scoring
-		result = ScoreFields(j)
-	}
+	result = ScoreFields(j) // scoring now always runs; the fieldscore config switch is gone
+
 	// sort the results
-	values := map[string][]*ju.SortObject{}
-	for key, val := range result {
-		fieldValue := map[string][]interface{}{}
-		//cfscore := make(map[string]float64) // bonus for repeated matches
-		if iscore { // scoring path
-			for _, v := range val {
-				if len(fmt.Sprint(v.Value)) < 1 {
-					continue // skip empty strings
-				}
-				//if v.Score >0 {
-				//	cfscore[fmt.Sprint(v.Value)] += 1
-				//}
-				if fieldValue[fmt.Sprint(v.Value)+v.Type] == nil {
-					fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value}
-				} else if fieldValue[fmt.Sprint(v.Value)+v.Type][0].(float64) < v.Score {
-					fieldValue[fmt.Sprint(v.Value)+v.Type][0] = v.Score
-				}
-			}
-			//for key := range fieldValue {
-			//	for cfkey, cfv := range cfscore {
-			//		if strings.Contains(key, cfkey) {
-			//			fieldValue[key][0] = fieldValue[key][0].(float64) + cfv
-			//		}
-			//	}
-			//}
-		} else { // no scoring: rank by occurrence count
-			for _, v := range val {
-				if len(fmt.Sprint(v.Value)) < 1 {
-					continue // skip empty strings
-				}
-				if fieldValue[fmt.Sprint(v.Value)] == nil {
-					fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
-				} else {
-					fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
+	for _, val := range result {
+		ju.Sort(val) // sorts each field's candidate slice in place
+	}
+	return doc, result, _id
+}
+
+// auxInfo builds, for every field in j.Result, a list of per-candidate detail maps (original note: sort first if not yet sorted; the ju.Sort call below is commented out).
+func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
+	fieldalls := map[string][]map[string]interface{}{}
+	for field, val := range j.Result {
+		//ju.Sort(val) (disabled: candidates keep whatever order j.Result already has)
+		sfields := []map[string]interface{}{}
+		for _, v := range val {
+			standardized := false
+			if field == "buyer" || field == "winner" || field == "agency" { // only these three fields get the redis lookup
+				i := redis.GetInt(field, field+"_"+qu.ObjToString(v.Value)) // key is <field>_<value>; NOTE(review): first argument presumably names the redis pool/db, confirm
+				if i > 0 {
+					standardized = true // redis returned a positive value for this name
 				}
 			}
-		}
-		objects := []*ju.SortObject{}
-		for k, v := range fieldValue {
-			ValueStr := "" // secondary sort key
-			if reflect.TypeOf(v[1]).String() == "string" {
-				ValueStr = qu.ObjToString(v[1])
-			}
-			tmp := &ju.SortObject{
-				Key:      k,
-				Value:    qu.IntAll(v[0]),
-				Object:   v[1],
-				ValueStr: ValueStr,
+			sfield := map[string]interface{}{
+				"val":          v.Value,
+				"type":         v.Type,
+				"score":        v.Score,
+				"blocktag":     v.BlockTag,
+				"sourceval":    v.SourceValue, // extraction result before cleaning
+				"standardized": standardized,
 			}
-			objects = append(objects, tmp)
+			sfields = append(sfields, sfield)
 		}
-		values[key] = ju.ExtSort(objects)
+		fieldalls[field] = sfields
 	}
-	return doc, result, _id, values
+	return fieldalls
 }
-
 func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 	defer qu.Catch()
 	//获取审核字段
@@ -1479,7 +1457,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 { //reids未找到,执行规则匹配
+	if i == 0 {                            //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 11 - 10
src/jy/util/article.go

@@ -38,16 +38,17 @@ type Job struct {
 }
 
 type ExtField struct { // ExtField is one extracted candidate value for a field, with its provenance and score
-	BlockTag  map[string]bool // block tags
-	Field     string          // attribute (field name)
-	Code      string          // match tag (string, regexp), regexp, or lua code
-	RuleText  string          // content
-	Type      string          // kv (subtypes: colon1, colon2, space, table) or regexp
-	MatchType string          // match type: 1) tag-library types (tag_string, tag_regexp); 2) full-text regexp (regcontent)
-	ExtFrom   string          // extraction source (title, detail)
-	Value     interface{}     // extraction result
-	Score     float64         // score
-	ScoreItem []*ScoreItem    // scoring items
+	BlockTag    map[string]bool // block tags
+	Field       string          // attribute (field name)
+	Code        string          // match tag (string, regexp), regexp, or lua code
+	RuleText    string          // content
+	Type        string          // kv (subtypes: colon1, colon2, space, table) or regexp
+	MatchType   string          // match type: 1) tag-library types (tag_string, tag_regexp); 2) full-text regexp (regcontent)
+	ExtFrom     string          // extraction source (title, detail)
+	SourceValue interface{}     // extraction result before cleaning
+	Value       interface{}     // extraction result
+	Score       float64         // score
+	ScoreItem   []*ScoreItem    // scoring items
 }
 
 //打分项

+ 33 - 0
src/jy/util/sort.go

@@ -2,9 +2,11 @@
 package util
 
 import (
+	"fmt"
 	"sort"
 )
 
+/*
 type SortObject struct {
 	Key      string
 	Value    int
@@ -39,3 +41,34 @@ func ExtSort(list []*SortObject) []*SortObject {
 	sort.Sort(ls)
 	return ls
 }
+
+*/
+
+// results implements sort.Interface for []*ExtField (ordering of ExtField candidates).
+type results []*ExtField
+
+func (list results) Len() int { // Len reports the number of candidates.
+	return len(list)
+}
+
+func (list results) Less(i, j int) bool { // Less reports whether element i must sort before element j.
+	if list[i].Score > list[j].Score { // higher Score sorts first
+		return true
+	} else if list[i].Score < list[j].Score {
+		return false
+	} else {
+		return fmt.Sprint(list[i].Value) > fmt.Sprint(list[j].Value) // equal scores: compare the fmt.Sprint form of Value, larger string first
+	}
+}
+
+func (list results) Swap(i, j int) { // Swap exchanges elements i and j.
+	var temp *ExtField = list[i]
+	list[i] = list[j]
+	list[j] = temp
+}
+
+func Sort(list []*ExtField) []*ExtField { // Sort orders list in place by descending Score (ties by Value string) and returns the same slice.
+	ls := results(list)
+	sort.Sort(ls) // sort.Sort is not guaranteed to be stable
+	return ls
+}