Browse Source

抽取权重清理

fengweiqiang 6 years ago
parent
commit
17f4539462

+ 27 - 8
src/jy/extract/extract.go

@@ -24,12 +24,12 @@ import (
 var (
 	lock, lockrule, lockclear sync.RWMutex
 
-	cut           = ju.NewCut()                          //获取正文并清理
-	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask                //任务列表
-	ClearTaskList map[string]*ClearTask                  //清理任务列表
-	saveLimit     = 200                                  //抽取日志批量保存
-	PageSize      = 5000                                 //查询分页
+	cut     = ju.NewCut()                          //获取正文并清理
+	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask          //任务列表
+	ClearTaskList map[string]*ClearTask            //清理任务列表
+	saveLimit     = 200                            //抽取日志批量保存
+	PageSize      = 5000                           //查询分页
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -438,6 +438,9 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 		}
 		//函数清理
 		for key, val := range j.Result {
+			tmpExtFields := make([]*ju.ExtField, 0)
+			tmpWeight := -999 //记录最大权重
+			tmpIndex := -999  //记录最大权重下标
 			for _, v := range val {
 				lockclear.Lock()
 				cfn := e.ClearFn[key]
@@ -456,6 +459,22 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				}
 				lockclear.Unlock()
 			}
+			//项目编号,采购单位权重清理
+			if (key == "projectcode" || key == "buyer") && len(val) > 1 {
+				for i, v := range val {
+					if v.Weight == 0 {
+						tmpExtFields = append(tmpExtFields, v)
+						continue
+					} else if v.Weight > tmpWeight {
+						tmpWeight = v.Weight
+						tmpIndex = i
+					}
+				}
+				if tmpIndex != -999 {
+					tmpExtFields = append(tmpExtFields, val[tmpIndex])
+					j.Result[key] = tmpExtFields
+				}
+			}
 		}
 		PackageDetail(j, e) //处理分包信息
 		//		bs, _ := json.Marshal(j.Result)
@@ -627,7 +646,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 				}
 				if tmps, ok := v.([]map[string]interface{}); ok {
 					for _, tmp := range tmps {
-						field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
+						field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
 						if tmp["blocktag"] != nil {
 							btag := make(map[string]string)
 							for k := range tmp["blocktag"].(map[string]bool) {
@@ -1348,7 +1367,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 {                            //reids未找到,执行规则匹配
+	if i == 0 { //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 11 - 3
src/jy/pretreated/analystep.go

@@ -6,6 +6,7 @@ package pretreated
 import (
 	"encoding/json"
 	"jy/util"
+	//"log"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -29,10 +30,11 @@ func AnalyStart(job *util.Job) {
 		}
 	}
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
-	if len(blockArrays) > 0 {                                                //有分块
+	if len(blockArrays) > 0 { //有分块
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
+			//log.Println(bl.Text)
 			if len([]rune(bl.Text)) > 80 {
 				bl.Block, _ = DivideBlock(job.CategorySecond, bl.Text, 1, job.RuleBlock)
 				for _, bl_bl := range bl.Block {
@@ -54,9 +56,12 @@ func AnalyStart(job *util.Job) {
 			newCon = TextAfterRemoveTable(con)
 			job.BlockPackage = FindPackageFromText(job.Title, newCon)
 			for i := 0; i < len(tabs); i++ {
+				tabtmp := &util.Block{}
+				tabtmp.Text = tabs[i].Text()
 				//添加标识:文本中有table
 				tabres := AnalyTableV2(tabs[i], job.Category, "", con, 1, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
-				processTableResult(tabres, bl, job)                                                     //分析table解析结果
+				processTableResult(tabres, tabtmp, job)                                                 //分析table解析结果
+				job.Block = append(job.Block, tabtmp)
 			}
 			//			for k, v := range bl.TableKV.Kv {
 			//				log.Println("bl.TableKV.Kv", k, v)
@@ -84,11 +89,14 @@ func processTableInBlock(bl *util.Block, job *util.Job) {
 	for _, tab := range tabs {
 		job.HasTable = 1
 		//添加标识:文本中有table
-		tabres := AnalyTableV2(tab, job.Category, bl.Title, bl.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
+		tmpblock := &util.Block{}
+		tmpblock.Text = tab.Text()
+		tabres := AnalyTableV2(tab, job.Category, bl.Title, tmpblock.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
 		processTableResult(tabres, bl, job)                                                           //分析table解析结果
 		if bl.Title == "" && tabres.BlockTag != "" {
 			bl.Title = tabres.BlockTag
 		}
+		job.Block = append(job.Block,tmpblock)
 	}
 }
 

+ 6 - 2
src/jy/pretreated/colonkv.go

@@ -713,10 +713,14 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string) map[string][]*Tag {
 				if moneyreg.MatchString(tk.Value) {
 					val += GetMoneyUnit(k, val)
 				}
-				kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: val, Weight: tk.Weight})
+				if val != "" {
+					kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: val, Weight: tk.Weight})
+				}
 			}
 		} else {
-			kvTags[key] = append(kvTags[key], &Tag{Key: k, Value: val, IsInvalid: true})
+			if val != "" {
+				kvTags[key] = append(kvTags[key], &Tag{Key: k, Value: val, IsInvalid: true})
+			}
 		}
 	}
 	return kvTags

+ 1 - 0
src/jy/util/article.go

@@ -49,6 +49,7 @@ type ExtField struct {
 	Value       interface{}       //抽取结果
 	Score       float64           //得分
 	ScoreItem   []*ScoreItem      //打分项
+	Weight      int               //权重值
 	ValRepeat   int               //结果值重复次数,打分参考
 }
 

+ 2 - 0
src/jy/util/script.go

@@ -146,6 +146,8 @@ func MapToLuaTable(l *lua.LState, obj map[string]interface{}) *lua.LTable {
 			tab.RawSet(lua.LString(k), lua.LNumber(val))
 		} else if val, ok := v.(int32); ok {
 			tab.RawSet(lua.LString(k), lua.LNumber(val))
+		} else if val, ok := v.(int); ok {
+			tab.RawSet(lua.LString(k), lua.LNumber(val))
 		} else if val, ok := v.(float64); ok {
 			tab.RawSet(lua.LString(k), lua.LNumber(val))
 		} else if val, ok := v.(float32); ok {

+ 1 - 1
src/res/fieldscore.json

@@ -129,7 +129,7 @@
                 "range": [
                     0,
                     4,
-                    -10
+                    -3
                 ]
             },
             {