wcj 6 år sedan
förälder
incheckning
074de5f0cf

+ 78 - 11
src/jy/extract/extract.go

@@ -24,12 +24,12 @@ import (
 var (
 	lock, lockrule, lockclear sync.RWMutex
 
-	cut           = ju.NewCut()                          //获取正文并清理
-	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask                //任务列表
-	ClearTaskList map[string]*ClearTask                  //清理任务列表
-	saveLimit     = 200                                  //抽取日志批量保存
-	PageSize      = 5000                                 //查询分页
+	cut     = ju.NewCut()                          //获取正文并清理
+	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask          //任务列表
+	ClearTaskList map[string]*ClearTask            //清理任务列表
+	saveLimit     = 200                            //抽取日志批量保存
+	PageSize      = 5000                           //查询分页
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -185,6 +185,11 @@ func RunExtractTask(taskId string) {
 			if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 				continue
 			}
+			//根据标题判断是否抽取
+			b := IsExtract("title", qu.ObjToString(v["title"]), "")
+			if !b {
+				continue
+			}
 			_id := qu.BsonIdToSId(v["_id"])
 			//log.Debug(_id)
 			if !ext.IsRun {
@@ -438,24 +443,48 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 		}
 		//函数清理
 		for key, val := range j.Result {
+			tmpExtFields := make([]*ju.ExtField, 0)
+			tmpWeight := -999 //记录最大权重
+			tmpIndex := -999  //记录最大权重下标
 			for _, v := range val {
 				lockclear.Lock()
 				cfn := e.ClearFn[key]
 				lockclear.Unlock()
 				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
+				before, _ := v.Value.(string)
 				v.Value = data[0]
+				BeforeAddClearFnLog("clearcfn", "函数清理", j.SourceMid, before, "clear_cfn", v, e)
+				//添加函数清理的日志
 				//清理特殊符号
 				lockclear.Lock()
-				if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
-					clear.MesField[key] != nil {
+				if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
 					text := qu.ObjToString(v.Value)
+					before = text
 					text = clear.OtherClean(key, text)
 					if text != "" {
 						v.Value = text
 					}
+					BeforeAddClearFnLog("clearsymbol", "特殊符号清理", j.SourceMid, before, "clear_symbol", v, e)
 				}
+				//AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
 				lockclear.Unlock()
 			}
+			//项目编号,采购单位权重清理
+			if (key == "projectcode" || key == "buyer") && len(val) > 1 {
+				for i, v := range val {
+					if v.Weight == 0 {
+						tmpExtFields = append(tmpExtFields, v)
+						continue
+					} else if v.Weight > tmpWeight {
+						tmpWeight = v.Weight
+						tmpIndex = i
+					}
+				}
+				if tmpIndex != -999 {
+					tmpExtFields = append(tmpExtFields, val[tmpIndex])
+					j.Result[key] = tmpExtFields
+				}
+			}
 		}
 		PackageDetail(j, e) //处理分包信息
 		//		bs, _ := json.Marshal(j.Result)
@@ -610,7 +639,7 @@ func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInf
 //抽取-规则
 func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
 	defer qu.Catch()
-	//废标、流标、ppp等跳过
+	//根据field配置项目,是否抽取。例如:废标、流标等跳过,
 	b := IsExtract(in.Field, j.Title, j.Content)
 	if !b {
 		return
@@ -627,7 +656,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 				}
 				if tmps, ok := v.([]map[string]interface{}); ok {
 					for _, tmp := range tmps {
-						field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
+						field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
 						if tmp["blocktag"] != nil {
 							btag := make(map[string]string)
 							for k := range tmp["blocktag"].(map[string]bool) {
@@ -979,6 +1008,44 @@ func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *Re
 	ExtLogs[t] = append(ExtLogs[t], logdata)
 	lock.Unlock()
 }
+// BeforeAddClearFnLog builds the "extinfo" payload for one cleaning step and
+// forwards it to AddClearFnLog.
+//
+// ftype is stored as the log "type" (and as "type" inside extinfo), name as
+// the log "name", sid as "sid", before as the value prior to this cleaning
+// step, and matchtype inside extinfo. ext supplies Field, Code, ExtFrom and
+// Value as they stand at call time; e supplies the TaskInfo the entry is
+// filed under.
+func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
+	// AddClearFnLog discards the entry when the task has IsEtxLog switched
+	// off, so skip building the payload in that case. A nil TaskInfo still
+	// falls through, leaving that case to AddClearFnLog exactly as before.
+	if t := e.TaskInfo; t != nil && !t.IsEtxLog {
+		return
+	}
+	extinfo := map[string]interface{}{
+		ext.Field: []map[string]interface{}{{
+			"field":     ext.Field,
+			"code":      ext.Code,
+			"type":      ftype,
+			"matchtype": matchtype,
+			"extfrom":   ext.ExtFrom,
+			"value":     ext.Value,
+		}},
+	}
+	AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
+}
+// AddClearFnLog appends one cleaning-step log entry to ExtLogs under task t.
+//
+// Nothing is recorded when t.IsEtxLog is false. Otherwise the entry carries
+// code, name, ftype (as "type"), field, before, extinfo and sid as given,
+// t.Version and t.Name, an empty "ruletext", "islua" false, and the current
+// Unix time as "comeintime".
+func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
+	defer qu.Catch()
+	if !t.IsEtxLog {
+		return
+	}
+	logdata := map[string]interface{}{
+		"code":       code,
+		"name":       name,
+		"type":       ftype,
+		"ruletext":   "",
+		"islua":      false,
+		"field":      field,
+		"version":    t.Version,
+		"taskname":   t.Name,
+		"before":     before,
+		"extinfo":    extinfo,
+		"sid":        sid,
+		"comeintime": time.Now().Unix(),
+	}
+	lock.Lock()
+	// Deferred so the mutex is released even if the map write panics;
+	// otherwise a panic here would leave lock held for every later caller.
+	defer lock.Unlock()
+	ExtLogs[t] = append(ExtLogs[t], logdata)
+}
 
 //保存抽取日志
 func SaveExtLog() {
@@ -1356,7 +1423,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 {                            //reids未找到,执行规则匹配
+	if i == 0 { //redis未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 60 - 17
src/jy/extract/score.go

@@ -17,20 +17,43 @@ var (
 	TagConfig     map[string]map[string]float64
 	TagConfigDesc map[string]string
 
-	TitleScore, RepeatScore, BlockScore float64
+	RepeatScore, BlockScore float64
+	CommonScore             map[string]float64
+	FieldsScore             map[string]map[string]float64
 )
 
 func init() {
 	qu.ReadConfig("./res/tagscoredesc.json", &TagConfigDesc)
 	qu.ReadConfig("./res/tagscore.json", &TagConfig)
 	qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
-	TitleScore = qu.Float64All(SoreConfig["extractype"]["title"])
 	if repeat, ok := SoreConfig["other"]["repeat"].(map[string]interface{}); ok {
 		RepeatScore = qu.Float64All(repeat["score"])
 	}
 	if block, ok := SoreConfig["other"]["block"].(map[string]interface{}); ok {
 		BlockScore = qu.Float64All(block["score"])
 	}
+	//通用抽取属性打分配置
+	if tmp, ok := SoreConfig["extractype"]["common"].(map[string]interface{}); ok {
+		CommonScore = map[string]float64{}
+		for k, v := range tmp {
+			CommonScore[k] = qu.Float64All(v)
+		}
+	}
+	log.Println(CommonScore)
+	//指定抽取属性打分配置
+	if tmp, ok := SoreConfig["extractype"]["fields"].(map[string]interface{}); ok {
+		FieldsScore = map[string]map[string]float64{}
+		for key, fieldmap := range tmp {
+			fieldscore := map[string]float64{}
+			if field, ok := fieldmap.(map[string]interface{}); ok {
+				for k, score := range field {
+					fieldscore[k] = qu.Float64All(score)
+				}
+			}
+			FieldsScore[key] = fieldscore
+		}
+	}
+	log.Println(FieldsScore)
 	//实例化正则
 	for _, tmp := range SoreConfig {
 		//log.Println(tmp)
@@ -102,9 +125,10 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 	result := j.Result
 	for field, tmps := range result {
 		for tmpsindex, tmpsvalue := range tmps {
+			describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
 			if tmpsvalue.ExtFrom == "title" { //标题打分初始化
-				tmps[tmpsindex].Score += TitleScore
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: TitleScore})
+				tmps[tmpsindex].Score += CommonScore["title"]
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
 			}
 			//是否有段标签
 			if len(tmpsvalue.BlockTag) > 0 {
@@ -125,23 +149,42 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 			}
 
 			//抽取类型打分
-			if strings.Contains(tmpsvalue.Type, "colon") {
-				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"])
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])})
-			} else if strings.Contains(tmpsvalue.Type, "space") {
-				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"])
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])})
-			} else if strings.Contains(tmpsvalue.Type, "table") {
-				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"])
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])})
-			} else if strings.Contains(tmpsvalue.Type, "regexp") {
-				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])})
+			if FieldsScore[field] != nil { //指定抽取属性打分配置
+				fieldscore := FieldsScore[field]
+				if strings.Contains(tmpsvalue.Type, "colon") {
+					tmps[tmpsindex].Score += fieldscore["colon"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["colon"]})
+				} else if strings.Contains(tmpsvalue.Type, "space") {
+					tmps[tmpsindex].Score += fieldscore["space"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["space"]})
+				} else if strings.Contains(tmpsvalue.Type, "table") {
+					tmps[tmpsindex].Score += fieldscore["table"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["table"]})
+				} else if strings.Contains(tmpsvalue.Type, "regexp") {
+					tmps[tmpsindex].Score += fieldscore["regexp"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["regexp"]})
+				}
+			} else { //通用抽取属性打分配置
+				if strings.Contains(tmpsvalue.Type, "colon") {
+					tmps[tmpsindex].Score += CommonScore["colon"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["colon"]})
+				} else if strings.Contains(tmpsvalue.Type, "space") {
+					tmps[tmpsindex].Score += CommonScore["space"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["space"]})
+				} else if strings.Contains(tmpsvalue.Type, "table") {
+					tmps[tmpsindex].Score += CommonScore["table"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["table"]})
+				} else if strings.Contains(tmpsvalue.Type, "regexp") {
+					tmps[tmpsindex].Score += CommonScore["regexp"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["regexp"]})
+				}
 			}
+
 			scoreRule := SoreConfig[field]
 			if scoreRule == nil {
 				continue
 			}
+			//配置打分
 			if scoreRule["type"] == "string" {
 				//1.长度打分
 				valueLen := utf8.RuneCountInString(fmt.Sprint(tmpsvalue.Value))
@@ -207,7 +250,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 						}
 					}
 				}
-				//4.位置打分
+				//4.中标候选人打分
 				if winnerorders, ok := scoreRule["winnerorder"].([]interface{}); ok {
 					for _, winnerorder := range winnerorders {
 						if p, ok := winnerorder.(map[string]interface{}); ok {

+ 9 - 3
src/jy/pretreated/analystep.go

@@ -6,6 +6,7 @@ package pretreated
 import (
 	"encoding/json"
 	"jy/util"
+	//"log"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -33,6 +34,7 @@ func AnalyStart(job *util.Job) {
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
+			//log.Println(bl.Text)
 			if len([]rune(bl.Text)) > 80 {
 				bl.Block, _ = DivideBlock(job.CategorySecond, bl.Text, 1, job.RuleBlock)
 				for _, bl_bl := range bl.Block {
@@ -54,10 +56,11 @@ func AnalyStart(job *util.Job) {
 			newCon = TextAfterRemoveTable(con)
 			job.BlockPackage = FindPackageFromText(job.Title, newCon)
 			for i := 0; i < len(tabs); i++ {
+				tabtmp := &util.Block{}
+				tabtmp.Text = tabs[i].Text()
 				//添加标识:文本中有table
 				tabres := AnalyTableV2(tabs[i], job.Category, "", con, 1, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
 				processTableResult(tabres, bl, job)
-				//分析table解析结果
 			}
 			//			for k, v := range bl.TableKV.Kv {
 			//				log.Println("bl.TableKV.Kv", k, v)
@@ -85,11 +88,14 @@ func processTableInBlock(bl *util.Block, job *util.Job) {
 	for _, tab := range tabs {
 		job.HasTable = 1
 		//添加标识:文本中有table
-		tabres := AnalyTableV2(tab, job.Category, bl.Title, bl.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
-		processTableResult(tabres, bl, job)                                                           //分析table解析结果
+		tmpblock := &util.Block{}
+		tmpblock.Text = tab.Text()
+		tabres := AnalyTableV2(tab, job.Category, bl.Title, tmpblock.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
+		processTableResult(tabres, bl, job)                                                                 //分析table解析结果
 		if bl.Title == "" && tabres.BlockTag != "" {
 			bl.Title = tabres.BlockTag
 		}
+		job.Block = append(job.Block, tmpblock)
 	}
 }
 

+ 6 - 2
src/jy/pretreated/colonkv.go

@@ -713,10 +713,14 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string) map[string][]*Tag {
 				if moneyreg.MatchString(tk.Value) {
 					val += GetMoneyUnit(k, val)
 				}
-				kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: val, Weight: tk.Weight})
+				if val != "" {
+					kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: val, Weight: tk.Weight})
+				}
 			}
 		} else {
-			kvTags[key] = append(kvTags[key], &Tag{Key: k, Value: val, IsInvalid: true})
+			if val != "" {
+				kvTags[key] = append(kvTags[key], &Tag{Key: k, Value: val, IsInvalid: true})
+			}
 		}
 	}
 	return kvTags

+ 1 - 0
src/jy/util/article.go

@@ -49,6 +49,7 @@ type ExtField struct {
 	Value       interface{}       //抽取结果
 	Score       float64           //得分
 	ScoreItem   []*ScoreItem      //打分项
+	Weight      int               //权重值
 	ValRepeat   int               //结果值重复次数,打分参考
 }
 

+ 2 - 0
src/jy/util/script.go

@@ -146,6 +146,8 @@ func MapToLuaTable(l *lua.LState, obj map[string]interface{}) *lua.LTable {
 			tab.RawSet(lua.LString(k), lua.LNumber(val))
 		} else if val, ok := v.(int32); ok {
 			tab.RawSet(lua.LString(k), lua.LNumber(val))
+		} else if val, ok := v.(int); ok {
+			tab.RawSet(lua.LString(k), lua.LNumber(val))
 		} else if val, ok := v.(float64); ok {
 			tab.RawSet(lua.LString(k), lua.LNumber(val))
 		} else if val, ok := v.(float32); ok {

+ 25 - 8
src/res/fieldscore.json

@@ -1,12 +1,29 @@
 {
     "extractype": {
         "describe": "抽取类型打分",
-        "title": 2,
-        "table": 3,
-        "colon": 3,
-        "space": 3,
-        "regexp": 2,
-        "winnerorder": 3
+        "common": {
+            "title": 2,
+            "table": 3,
+            "colon": 3,
+            "space": 3,
+            "regexp": 2
+        },
+        "fields": {
+            "projectname": {
+                "title": 1,
+                "table": 3,
+                "colon": 3,
+                "space": 3,
+                "regexp": 2
+            },
+            "winner": {
+                "table": 3,
+                "colon": 3,
+                "space": 3,
+                "regexp": 2,
+                "winnerorder": 3
+            }
+        }
     },
     "other": {
         "block": {
@@ -69,7 +86,7 @@
                 "range": [
                     10,
                     35,
-                    3
+                    2
                 ]
             },
             {
@@ -112,7 +129,7 @@
                 "range": [
                     0,
                     4,
-                    -10
+                    -3
                 ]
             },
             {

+ 4 - 4
src/res/isextract.json

@@ -1,11 +1,11 @@
 {
     "bidamount": [
-        "(不足三家|废标流标|废标公告|流标公告|ppp项目|PPP项目)"
+        "(不足三家|废标流标|废标公告|流标公告)"
     ],
     "winner": [
-        "(不足三家|废标流标|废标公告|流标公告|ppp项目|PPP项目)"
+        "(不足三家|废标流标|废标公告|流标公告)"
     ],
-	"filter":[
-		"(?i)(PPP项目[::]否|非PPP项目)"
+	"title":[
+        "(?i)(ppp项目)"
 	]
 }