Browse Source

优化抽取

fengweiqiang 6 years ago
parent
commit
07415dacbf
3 changed files with 59 additions and 25 deletions
  1. + 46 - 21
      src/jy/extract/extract.go
  2. + 2 - 2
      src/jy/extract/score.go
  3. + 11 - 2
      src/jy/pretreated/analystep.go

+ 46 - 21
src/jy/extract/extract.go

@@ -23,13 +23,13 @@ import (
 )
 
 var (
-	lock          sync.RWMutex
-	cut           = ju.NewCut()                          //获取正文并清理
-	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask                //任务列表
-	ClearTaskList map[string]*ClearTask                  //清理任务列表
-	saveLimit     = 200                                  //抽取日志批量保存
-	PageSize      = 5000                                 //查询分页
+	lock    sync.RWMutex
+	cut     = ju.NewCut()                          //获取正文并清理
+	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask          //任务列表
+	ClearTaskList map[string]*ClearTask            //清理任务列表
+	saveLimit     = 200                            //抽取日志批量保存
+	PageSize      = 5000                           //查询分页
 	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -345,7 +345,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 			//抽取规则
 			tmprules := map[string][]*RuleCore{}
 			lock.Lock()
-			if j.Category == "*"{
+			if e.RuleCores[j.Category] == nil {
 				j.Category = "*_其他"
 			}
 			for k, vc1 := range e.RuleCores[j.Category] {
@@ -393,7 +393,13 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				}
 			}
 		} else {
-			for _, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
+			var cores map[string][]*RuleCore
+			if e.RuleCores[j.Category+"_"+j.CategorySecond] == nil {
+				cores = e.RuleCores["*_其他"]
+			} else {
+				cores = e.RuleCores[j.Category+"_"+j.CategorySecond]
+			}
+			for _, vc1 := range cores {
 				for _, vc := range vc1 {
 					tmp := ju.DeepCopy(doc).(map[string]interface{})
 					//是否进入逻辑
@@ -477,7 +483,9 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					clear.MesField[key] != nil {
 					text := qu.ObjToString(v.Value)
 					text = clear.OtherClean(key, text)
-					v.Value = text
+					if text != "" {
+						v.Value = text
+					}
 				}
 				lock.Unlock()
 			}
@@ -675,15 +683,15 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 				}
 				if tmps, ok := v.([]map[string]interface{}); ok {
 					for _, tmp := range tmps {
-						field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"], }
-						if extfrom == "title"{
+						field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"],}
+						if extfrom == "title" {
 							field.Score = 4
 						}
 						if tmp["blocktag"] != nil {
 							field.BlockTag = tmp["blocktag"].(map[string]bool)
 						}
 						item := &ju.ScoreItem{Des: "初始化", Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"]}
-						if extfrom =="title"{
+						if extfrom == "title" {
 							item.Score = 4
 						}
 						if tmp["scoreitem"] == nil {
@@ -916,6 +924,12 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						continue
 					}
 					val := text[pos[p]:pos[p+1]]
+					if val == "招标公告" {
+						return extinfo
+					}
+					if utf8.RuneCountInString(val) < 2 && extfrom == "title" {
+						val = text
+					}
 					tmps := []map[string]interface{}{}
 					tmp := map[string]interface{}{
 						"field":     v.Field,
@@ -931,14 +945,14 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 					extinfo[k] = tmps
 					if strings.TrimSpace(val) != "" {
 						exfield := ju.ExtField{Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
-						if extfrom == "title"{
+						if extfrom == "title" {
 							exfield.Score = 4
 						}
 						if tmp["blocktag"] != nil {
 							exfield.BlockTag = tmp["blocktag"].(map[string]bool)
 						}
 						item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
-						if extfrom =="title"{
+						if extfrom == "title" {
 							item.Score = 4
 						}
 						if tmp["scoreitem"] == nil {
@@ -983,14 +997,14 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 				j.Result[v.Field] = [](*ju.ExtField){}
 			}
 			field := &ju.ExtField{Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
-			if extfrom == "title"{
+			if extfrom == "title" {
 				field.Score = 4
 			}
 			if tmp["blocktag"] != nil {
 				field.BlockTag = tmp["blocktag"].(map[string]bool)
 			}
 			item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
-			if extfrom =="title"{
+			if extfrom == "title" {
 				item.Score = 4
 			}
 			if tmp["scoreitem"] == nil {
@@ -1052,7 +1066,7 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 					//						continue
 					//					}
 					text := qu.ObjToString(v.Value)
-					if text != "" {
+					if text != "" && v.ExtFrom != "title" {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
 					}
 					j.Result[in.Field][k].Value = text
@@ -1370,17 +1384,28 @@ func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField
 	values := map[string][]*ju.SortObject{}
 	for key, val := range result {
 		fieldValue := map[string][]interface{}{}
+		cfscore := make(map[string]float64) //重复匹配加分
 		if iscore { //走打分
 			for _, v := range val {
 				if len(fmt.Sprint(v.Value)) < 1 {
 					continue //去除空串
 				}
-				if fieldValue[fmt.Sprint(v.Value)+v.Type] == nil{
+				if v.Score >0 {
+					cfscore[fmt.Sprint(v.Value)] += 1
+				}
+				if fieldValue[fmt.Sprint(v.Value)+v.Type] == nil {
 					fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value}
-				}else if fieldValue[fmt.Sprint(v.Value)+v.Type][0].(float64) < v.Score {
+				} else if fieldValue[fmt.Sprint(v.Value)+v.Type][0].(float64) < v.Score {
 					fieldValue[fmt.Sprint(v.Value)+v.Type][0] = v.Score
 				}
 			}
+			for key := range fieldValue {
+				for cfkey, cfv := range cfscore {
+					if strings.Contains(key, cfkey) {
+						fieldValue[key][0] = fieldValue[key][0].(float64) + cfv
+					}
+				}
+			}
 		} else { //不走打分,按出现频次
 			for _, v := range val {
 				if len(fmt.Sprint(v.Value)) < 1 {
@@ -1450,7 +1475,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 {                            //reids未找到,执行规则匹配
+	if i == 0 { //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 2 - 2
src/jy/extract/score.go

@@ -126,8 +126,8 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				}
 			} else {
 				if strings.Contains(tmpsvalue.Type, "regexp") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"]) + 1})
+					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])+ 1
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])+ 1 })
 				}
 			}
 			scoreRule := SoreConfig[field]

+ 11 - 2
src/jy/pretreated/analystep.go

@@ -29,7 +29,7 @@ func AnalyStart(job *util.Job) {
 		}
 	}
 	blockArrays, _ := DivideBlock(con, 1, job.RuleBlock) //分块
-	if len(blockArrays) > 0 {                            //有分块
+	if len(blockArrays) > 0 { //有分块
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
@@ -82,6 +82,15 @@ func AnalyStart(job *util.Job) {
 		bl.SpaceKV = SspacekvEntity.Entrance(newCon, "", nil)
 		job.Block = append(job.Block, bl)
 	}
+	for _, v := range job.BlockPackage {
+		block := &util.Block{}
+		block.ColonKV = v.ColonKV
+		block.TableKV = v.TableKV
+		block.SpaceKV = v.SpaceKV
+		block.Text = v.Text
+		block.Winnerorder = v.WinnerOrder
+		job.Block = append(job.Block, block)
+	}
 }
 
 //分析table解析结果
@@ -225,7 +234,7 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
 //ration==1 遍历所有tabs,ration!=1 tabs只有一个
 func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) string {
 	if len(tabs) != 1 {
-		return ""//5c2aca5ea5cb26b9b7a8229b
+		return "" //5c2aca5ea5cb26b9b7a8229b
 	}
 	for _, tab := range tabs {
 		content := ""