浏览代码

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

maxiaoshan 6 年之前
父节点
当前提交
44295cd2d5

+ 49 - 6
src/jy/extract/extract.go

@@ -185,6 +185,11 @@ func RunExtractTask(taskId string) {
 			if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 				continue
 			}
+			//根据标题判断是否抽取
+			b := IsExtract("title", qu.ObjToString(v["title"]), "")
+			if !b {
+				continue
+			}
 			_id := qu.BsonIdToSId(v["_id"])
 			//log.Debug(_id)
 			if !ext.IsRun {
@@ -289,7 +294,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
 			pretreated.AnalyStart(jf)
 		}
 	}, func(err interface{}) {
-		log.Debug("pretreated.AnalyStart", err)
+		log.Debug("pretreated.AnalyStart", err, j.SourceMid)
 	})
 	return j, jf
 }
@@ -438,6 +443,9 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 		}
 		//函数清理
 		for key, val := range j.Result {
+			tmpExtFields := make([]*ju.ExtField, 0)
+			tmpWeight := -999 //记录最大权重
+			tmpIndex := -999  //记录最大权重下标
 			for _, v := range val {
 				lockclear.Lock()
 				cfn := e.ClearFn[key]
@@ -461,6 +469,22 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				//AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
 				lockclear.Unlock()
 			}
+			//项目编号,采购单位权重清理
+			if (key == "projectcode" || key == "buyer") && len(val) > 1 {
+				for i, v := range val {
+					if v.Weight == 0 {
+						tmpExtFields = append(tmpExtFields, v)
+						continue
+					} else if v.Weight > tmpWeight {
+						tmpWeight = v.Weight
+						tmpIndex = i
+					}
+				}
+				if tmpIndex != -999 {
+					tmpExtFields = append(tmpExtFields, val[tmpIndex])
+					j.Result[key] = tmpExtFields
+				}
+			}
 		}
 		PackageDetail(j, e) //处理分包信息
 		//		bs, _ := json.Marshal(j.Result)
@@ -615,7 +639,7 @@ func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInf
 //抽取-规则
 func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
 	defer qu.Catch()
-	//废标、流标、ppp等跳过
+	//根据field配置项目,是否抽取。例如:废标、流标等跳过,
 	b := IsExtract(in.Field, j.Title, j.Content)
 	if !b {
 		return
@@ -632,11 +656,13 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 				}
 				if tmps, ok := v.([]map[string]interface{}); ok {
 					for _, tmp := range tmps {
-						field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
+						field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
 						if tmp["blocktag"] != nil {
 							btag := make(map[string]string)
 							for k := range tmp["blocktag"].(map[string]bool) {
-								btag[k] = TagConfigDesc[k]
+								if TagConfigDesc[k] != "" {
+									btag[k] = TagConfigDesc[k]
+								}
 							}
 							field.BlockTag = btag
 						}
@@ -683,8 +709,19 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 //lua脚本根据属性设置提取kv值
 func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
 	kvmap := map[string][]map[string]interface{}{}
+	blocks := []*ju.Block{}
+	for _, bl := range j.Block {
+		if len(bl.Block) > 0 {
+			blocks = append(blocks, bl.Block...)
+		} else {
+			blocks = append(blocks, bl)
+		}
+	}
 	for fieldname, field := range in.LFields {
-		for _, bl := range j.Block {
+		if field != in.Field {
+			continue
+		}
+		for _, bl := range blocks {
 			tp := ""
 			for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
 				if k == 0 {
@@ -709,7 +746,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 							"value":       text,
 							"type":        tp,
 							"matchtype":   "tag_string",
-							"blocktag":    bl.Tag,
+							"blocktag":    bl.Classify,
 							"weight":      vv.Weight,
 						})
 					}
@@ -882,6 +919,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
 					}
 					j.Result[in.Field][k].Value = text
+					if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
+						continue
+					}
 					exts = append(exts, map[string]interface{}{
 						"field":     v.Field,
 						"code":      v.Code,
@@ -909,6 +949,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
 					}
 					j.Result[key][k].Value = text
+					if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
+						continue
+					}
 					exts = append(exts, map[string]interface{}{
 						"field":     v.Field,
 						"code":      v.Code,

+ 8 - 4
src/jy/extract/score.go

@@ -126,10 +126,6 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 	for field, tmps := range result {
 		for tmpsindex, tmpsvalue := range tmps {
 			describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
-			if tmpsvalue.ExtFrom == "title" { //标题打分初始化
-				tmps[tmpsindex].Score += CommonScore["title"]
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
-			}
 			//是否有段标签
 			if len(tmpsvalue.BlockTag) > 0 {
 				//有标签段
@@ -151,6 +147,10 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 			//抽取类型打分
 			if FieldsScore[field] != nil { //指定抽取属性打分配置
 				fieldscore := FieldsScore[field]
+				if tmpsvalue.ExtFrom == "title" { //标题打分初始化
+					tmps[tmpsindex].Score += fieldscore["title"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["title"]})
+				}
 				if strings.Contains(tmpsvalue.Type, "colon") {
 					tmps[tmpsindex].Score += fieldscore["colon"]
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["colon"]})
@@ -165,6 +165,10 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["regexp"]})
 				}
 			} else { //通用抽取属性打分配置
+				if tmpsvalue.ExtFrom == "title" { //标题打分初始化
+					tmps[tmpsindex].Score += CommonScore["title"]
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
+				}
 				if strings.Contains(tmpsvalue.Type, "colon") {
 					tmps[tmpsindex].Score += CommonScore["colon"]
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["colon"]})

+ 17 - 10
src/jy/pretreated/analystep.go

@@ -33,6 +33,7 @@ func AnalyStart(job *util.Job) {
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
+			//log.Println(bl.Text)
 			if len([]rune(bl.Text)) > 80 {
 				bl.Block, _ = DivideBlock(job.CategorySecond, bl.Text, 1, job.RuleBlock)
 				for _, bl_bl := range bl.Block {
@@ -56,24 +57,25 @@ func AnalyStart(job *util.Job) {
 			for i := 0; i < len(tabs); i++ {
 				//添加标识:文本中有table
 				tabres := AnalyTableV2(tabs[i], job.Category, "", con, 1, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
-				processTableResult(tabres, bl, job)                                                     //分析table解析结果
+				processTableResult(tabres, bl, job)
 			}
 			//			for k, v := range bl.TableKV.Kv {
-			//				log.Println("bl.TableKV.Kv", k, v)
+			//				//log.Println("bl.TableKV.Kv", k, v)
 			//			}
 		} else {
 			//从正文里面找分包
 			job.BlockPackage = FindPackageFromText(job.Title, newCon)
 		}
 		FindProjectCode(newCon, job) //匹配项目编号
-		bl.Text = newCon
+		bl.Text = HtmlToText(con)
 		//调用kv解析
-		bl.ColonKV = GetKVAll(newCon, "", nil, 1)
-		bl.SpaceKV = SspacekvEntity.Entrance(newCon, "", nil)
+		bl.ColonKV = GetKVAll(bl.Text, "", nil, 1)
+		bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil)
 		//新加 未分块table中未能解析到中标候选人,从正文中解析
 		if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
 			bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
 		}
+		//log.Println(bl.Text)
 		job.Block = append(job.Block, bl)
 	}
 }
@@ -84,8 +86,8 @@ func processTableInBlock(bl *util.Block, job *util.Job) {
 	for _, tab := range tabs {
 		job.HasTable = 1
 		//添加标识:文本中有table
-		tabres := AnalyTableV2(tab, job.Category, bl.Title, bl.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
-		processTableResult(tabres, bl, job)                                                           //分析table解析结果
+		tabres := AnalyTableV2(tab, job.Category, bl.Title, tab.Text(), 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
+		processTableResult(tabres, bl, job)                                                              //分析table解析结果
 		if bl.Title == "" && tabres.BlockTag != "" {
 			bl.Title = tabres.BlockTag
 		}
@@ -139,7 +141,10 @@ func FindProjectCode(newCon string, job *util.Job) {
 //分析table解析结果
 func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
 	//解析结果中的kv
-	block.TableKV = &util.JobKv{KvTags: tabres.KvTags}
+	if block.TableKV == nil {
+		block.TableKV = util.NewJobKv()
+	}
+	MergeKvTags(block.TableKV.KvTags, tabres.KvTags)
 	//分包
 	tablePackage := map[string]*util.BlockPackage{}
 	if tabres.IsMultiPackage {
@@ -152,7 +157,7 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
 			//解析kv
 			//找到key是“包1中标单位”这种的key,过滤掉包1,再次到标签库中匹配
 			labelKVs := []*util.Kv{}
-			if blockPackage.TableKV != nil && len(blockPackage.TableKV.KvTags) > 0 {
+			if blockPackage.TableKV != nil {
 				for tk, tv := range blockPackage.TableKV.KvTags {
 					for _, tvv := range tv {
 						if regReplKey.MatchString(tk) || regSplit.MatchString(tk) {
@@ -163,8 +168,10 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
 						}
 					}
 				}
+			} else {
+				blockPackage.TableKV = util.NewJobKv()
 			}
-			blockPackage.TableKV.KvTags = GetKvTags(labelKVs, "", nil)
+			MergeKvTags(blockPackage.TableKV.KvTags, GetKvTags(labelKVs, "", nil))
 			tablePackage[k] = blockPackage
 		}
 	}

+ 9 - 5
src/jy/pretreated/analytable.go

@@ -236,7 +236,6 @@ func (table *Table) KVFilter() {
 			as.AddKey(k, v)
 		}
 	}
-
 	//处理值是数组的kv放入标准化kv中//处理table.SortKV.value为数组的情况
 	table.sortKVArr(as, winnertag)
 	//
@@ -314,9 +313,6 @@ func (table *Table) KVFilter() {
 func (table *Table) sortKVArr(as *SortMap, winnertag bool) {
 	checkKey := map[int]bool{}
 	for kn, k := range as.Keys { //遍历table.SortKV.value为数组的key
-		if len(table.StandKV[k]) == 0 || strings.TrimSpace(table.StandKV[k][0].Value) != "" {
-			continue
-		}
 		v := as.Map[k]
 		if vm, ok := v.([]map[string]interface{}); ok && k == NullTxtBid {
 			if table.WinnerOrder == nil {
@@ -708,6 +704,7 @@ func (ts *TableResult) Analy() {
 		//		for k, v := range table.TableResult.SortKV.Map {
 		//			qutil.Debug(k, "=====", v)
 		//		}
+		MergeKvTags(ts.KvTags, table.TableResult.KvTags)
 	}
 }
 
@@ -1487,7 +1484,11 @@ func (table *Table) FindKV() {
 			}
 			if len(kmap) > 0 {
 				for _, k := range kmapkeys {
-					table.SortKV.AddKey(k, kmap[k])
+					if len(kmap[k]) == 1 {
+						table.SortKV.AddKey(k, kmap[k][0])
+					} else if len(kmap[k]) > 1 {
+						table.SortKV.AddKey(k, kmap[k])
+					}
 				}
 			}
 		}
@@ -2082,6 +2083,9 @@ func (tn *Table) isGoonNext() {
 					if bp.TableKV == nil {
 						bp.TableKV = u.NewJobKv()
 					}
+					if bp.SpaceKV == nil {
+						bp.SpaceKV = u.NewJobKv()
+					}
 					for k2, v2 := range mv.ColonKV.KvTags {
 						for _, v2v := range v2 {
 							isExists := false

+ 19 - 3
src/jy/pretreated/colonkv.go

@@ -9,6 +9,7 @@ import (
 	"regexp"
 	"sort"
 	"strings"
+	"unicode/utf8"
 )
 
 type ColonkvEntity struct{}
@@ -680,7 +681,7 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string) map[string][]*Tag {
 		kvTags[title] = append(kvTags[title], &Tag{title, title, 0, nil, false})
 	}
 	for _, findkv := range findkvs {
-		k, val := findkv.Key, findkv.Value
+		k, val, nextval := findkv.Key, findkv.Value, strings.TrimSpace(findkv.NextLine)
 		//val是空的话,不打标签
 		if filterValue.MatchString(val) {
 			continue
@@ -713,10 +714,25 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string) map[string][]*Tag {
 				if moneyreg.MatchString(tk.Value) {
 					val += GetMoneyUnit(k, val)
 				}
-				kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: val, Weight: tk.Weight})
+				if val != "" {
+					kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: val, Weight: tk.Weight})
+				} else if nextval != "" && utf8.RuneCountInString(nextval) < 30 {
+					if strings.Contains(nextval, ":") || strings.Contains(nextval, ":") {
+						if len(strings.Split(nextval, ":")) > 1 || len(strings.Split(nextval, ":")) > 1 {
+							nextval = strings.Split(nextval, ":")[0]
+							nextval = strings.Split(nextval, ":")[0]
+							if strings.TrimSpace(nextval) == "" {
+								continue
+							}
+						}
+					}
+					kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: nextval, Weight: tk.Weight})
+				}
 			}
 		} else {
-			kvTags[key] = append(kvTags[key], &Tag{Key: k, Value: val, IsInvalid: true})
+			if val != "" {
+				kvTags[key] = append(kvTags[key], &Tag{Key: k, Value: val, IsInvalid: true})
+			}
 		}
 	}
 	return kvTags

+ 19 - 14
src/jy/pretreated/tablev2.go

@@ -63,7 +63,6 @@ type TD struct {
 	Val            string             //值
 	Text           string             //原始串
 	SortKV         *SortMap           //存放kv值
-	SortKVWeight   map[string]int     //存放kv值权重
 	Html           string             //html值
 	BH             bool               //是否是表头
 	MustBH         bool               //不能修改的表头
@@ -91,12 +90,11 @@ var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿
 func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	defer qutil.Catch()
 	td := &TD{
-		ArrVal:       []string{},
-		Goquery:      Goquery,
-		SonTds:       []*TD{},
-		TR:           tr,
-		SortKV:       NewSortMap(),
-		SortKVWeight: map[string]int{},
+		ArrVal:  []string{},
+		Goquery: Goquery,
+		SonTds:  []*TD{},
+		TR:      tr,
+		SortKV:  NewSortMap(),
 	}
 	colspan, rowspan := 0, 0
 	col, bcol := td.Goquery.Attr("colspan")
@@ -145,20 +143,28 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 				td.SortKV.AddKey(bl_sk, bl_sv)
 			}
 		}
+	} else {
+		//for _, v := range GetKVAll(txt, "", nil, 2).KvTags {
+		//for _, vv := range v {
+		//td.SortKV.AddKey(vv.Key, vv.Value)
+		//}
+		//}
 	}
 	//抽取不到走正则抽
 	proCode := projectcodeReg.FindString(text)
 	if proCode != "" {
 		ckv := GetKVAll(proCode, "", nil, 1)
-		for k, v := range ckv.KvTags {
-			td.SortKV.AddKey(k, v)
-			td.SortKVWeight[k] = -99
+		for _, v := range ckv.KvTags {
+			for _, vv := range v {
+				td.SortKV.AddKey(vv.Key, vv.Value)
+			}
 		}
 	} else if proCode = projectcodeReg2.FindString(text); proCode != "" {
 		ckv := GetKVAll(proCode, "", nil, 1)
-		for k, v := range ckv.KvTags {
-			td.SortKV.AddKey(k, v)
-			td.SortKVWeight[k] = -99
+		for _, v := range ckv.KvTags {
+			for _, vv := range v {
+				td.SortKV.AddKey(vv.Key, vv.Value)
+			}
 		}
 	}
 	if proCode = jsonReg.FindString(text); proCode != "" {
@@ -166,7 +172,6 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 		json.Unmarshal([]byte(proCode), &jsonMap)
 		for k, v := range jsonMap {
 			td.SortKV.AddKey(k, v)
-			td.SortKVWeight[k] = -99
 		}
 	}
 	//对td单元格值判断是否是表头和根据td内容长度进行分块处理

+ 1 - 0
src/jy/util/article.go

@@ -49,6 +49,7 @@ type ExtField struct {
 	Value       interface{}       //抽取结果
 	Score       float64           //得分
 	ScoreItem   []*ScoreItem      //打分项
+	Weight      int               //权重值
 	ValRepeat   int               //结果值重复次数,打分参考
 }
 

+ 2 - 0
src/jy/util/script.go

@@ -146,6 +146,8 @@ func MapToLuaTable(l *lua.LState, obj map[string]interface{}) *lua.LTable {
 			tab.RawSet(lua.LString(k), lua.LNumber(val))
 		} else if val, ok := v.(int32); ok {
 			tab.RawSet(lua.LString(k), lua.LNumber(val))
+		} else if val, ok := v.(int); ok {
+			tab.RawSet(lua.LString(k), lua.LNumber(val))
 		} else if val, ok := v.(float64); ok {
 			tab.RawSet(lua.LString(k), lua.LNumber(val))
 		} else if val, ok := v.(float32); ok {

+ 3 - 3
src/res/fieldscore.json

@@ -14,7 +14,7 @@
                 "table": 3,
                 "colon": 3,
                 "space": 3,
-                "regexp": 2
+                "regexp": 1
             },
             "winner": {
                 "table": 3,
@@ -41,7 +41,7 @@
             {
                 "describe": "以*结尾",
                 "regstr": ".{2,100}(项目|工程|施工|服务|设备|采购|设计|系统)$",
-                "score": 3
+                "score": 2
             }
         ],
         "negativewords": [
@@ -129,7 +129,7 @@
                 "range": [
                     0,
                     4,
-                    -10
+                    -3
                 ]
             },
             {

+ 4 - 4
src/res/isextract.json

@@ -1,11 +1,11 @@
 {
     "bidamount": [
-        "(不足三家|废标流标|废标公告|流标公告|ppp项目|PPP项目)"
+        "(不足三家|废标流标|废标公告|流标公告)"
     ],
     "winner": [
-        "(不足三家|废标流标|废标公告|流标公告|ppp项目|PPP项目)"
+        "(不足三家|废标流标|废标公告|流标公告)"
     ],
-	"filter":[
-		"(?i)(PPP项目[::]否|非PPP项目)"
+	"title":[
+        "(?i)(ppp项目)"
 	]
 }