Browse Source

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

# Conflicts:
#	src/jy/pretreated/analystep.go
fengweiqiang 6 years ago
parent
commit
8bc721b60b
4 changed files with 27 additions and 15 deletions
  1. 17 8
      src/jy/extract/extract.go
  2. 8 4
      src/jy/extract/score.go
  3. 0 1
      src/jy/pretreated/analystep.go
  4. 2 2
      src/res/fieldscore.json

+ 17 - 8
src/jy/extract/extract.go

@@ -24,12 +24,12 @@ import (
 var (
 	lock, lockrule, lockclear sync.RWMutex
 
-	cut     = ju.NewCut()                          //获取正文并清理
-	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask          //任务列表
-	ClearTaskList map[string]*ClearTask            //清理任务列表
-	saveLimit     = 200                            //抽取日志批量保存
-	PageSize      = 5000                           //查询分页
+	cut           = ju.NewCut()                          //获取正文并清理
+	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask                //任务列表
+	ClearTaskList map[string]*ClearTask                  //清理任务列表
+	saveLimit     = 200                                  //抽取日志批量保存
+	PageSize      = 5000                                 //查询分页
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -660,7 +660,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 						if tmp["blocktag"] != nil {
 							btag := make(map[string]string)
 							for k := range tmp["blocktag"].(map[string]bool) {
-								if TagConfigDesc[k] != ""{
+								if TagConfigDesc[k] != "" {
 									btag[k] = TagConfigDesc[k]
 								}
 							}
@@ -718,6 +718,9 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 		}
 	}
 	for fieldname, field := range in.LFields {
+		if field != in.Field {
+			continue
+		}
 		for _, bl := range blocks {
 			tp := ""
 			for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
@@ -916,6 +919,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
 					}
 					j.Result[in.Field][k].Value = text
+					if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
+						continue
+					}
 					exts = append(exts, map[string]interface{}{
 						"field":     v.Field,
 						"code":      v.Code,
@@ -943,6 +949,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
 					}
 					j.Result[key][k].Value = text
+					if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
+						continue
+					}
 					exts = append(exts, map[string]interface{}{
 						"field":     v.Field,
 						"code":      v.Code,
@@ -1425,7 +1434,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 { //redis未找到,执行规则匹配
+	if i == 0 {                            //redis未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 8 - 4
src/jy/extract/score.go

@@ -126,10 +126,6 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 	for field, tmps := range result {
 		for tmpsindex, tmpsvalue := range tmps {
 			describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
-			if tmpsvalue.ExtFrom == "title" { //标题打分初始化
-				tmps[tmpsindex].Score += CommonScore["title"]
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
-			}
 			//是否有段标签
 			if len(tmpsvalue.BlockTag) > 0 {
 				//有标签段
@@ -151,6 +147,10 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 			//抽取类型打分
 			if FieldsScore[field] != nil { //指定抽取属性打分配置
 				fieldscore := FieldsScore[field]
+				if tmpsvalue.ExtFrom == "title" { //标题打分初始化
+					tmps[tmpsindex].Score += fieldscore["title"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["title"]})
+				}
 				if strings.Contains(tmpsvalue.Type, "colon") {
 					tmps[tmpsindex].Score += fieldscore["colon"]
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["colon"]})
@@ -165,6 +165,10 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["regexp"]})
 				}
 			} else { //通用抽取属性打分配置
+				if tmpsvalue.ExtFrom == "title" { //标题打分初始化
+					tmps[tmpsindex].Score += CommonScore["title"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
+				}
 				if strings.Contains(tmpsvalue.Type, "colon") {
 					tmps[tmpsindex].Score += CommonScore["colon"]
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["colon"]})

+ 0 - 1
src/jy/pretreated/analystep.go

@@ -177,7 +177,6 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
 	}
 	//处理中标人排序
 	wror := []map[string]interface{}{}
-	//log.Println(tabres.WinnerOrder)
 	for _, v := range tabres.WinnerOrder {
 		entName, _ := v["entname"].(string)
 		v["entname"] = winnerOrderEntity.clear("中标单位", entName)

+ 2 - 2
src/res/fieldscore.json

@@ -14,7 +14,7 @@
                 "table": 3,
                 "colon": 3,
                 "space": 3,
-                "regexp": 2
+                "regexp": 1
             },
             "winner": {
                 "table": 3,
@@ -41,7 +41,7 @@
             {
                 "describe": "以*结尾",
                 "regstr": ".{2,100}(项目|工程|施工|服务|设备|采购|设计|系统)$",
-                "score": 3
+                "score": 2
             }
         ],
         "negativewords": [