فهرست منبع

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

maxiaoshan 6 سال پیش
والد
کامیت
e054076aa1

+ 0 - 1
src/config.json

@@ -9,7 +9,6 @@
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
     "saveresult": true,
-    "fieldscore": true,
     "qualityaudit": false,
     "saveblock": true,
     "filelength": 100000,

+ 10 - 9
src/jy/extract/extract.go

@@ -24,12 +24,12 @@ import (
 var (
 	lock, lockrule, lockclear sync.RWMutex
 
-	cut     = ju.NewCut()                          //获取正文并清理
-	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask          //任务列表
-	ClearTaskList map[string]*ClearTask            //清理任务列表
-	saveLimit     = 200                            //抽取日志批量保存
-	PageSize      = 5000                           //查询分页
+	cut           = ju.NewCut()                          //获取正文并清理
+	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask                //任务列表
+	ClearTaskList map[string]*ClearTask                  //清理任务列表
+	saveLimit     = 200                                  //抽取日志批量保存
+	PageSize      = 5000                                 //查询分页
 	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -869,6 +869,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						continue
 					}
 					val := text[pos[p]:pos[p+1]]
+					sourcevalue := val
 					if val == "招标公告" {
 						return extinfo
 					}
@@ -892,7 +893,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						if v.RegCore.NumSign == -1 { //正负值修正
 							val = "-" + val
 						}
-						exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
+						exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: sourcevalue, Value: val}
 						if extfrom == "title" {
 							exfield.Score = 4
 						}
@@ -1336,7 +1337,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if len(blocks) > 0 {
 			tmp["blocks"] = blocks
 		}
-		tmp["extract_content"] = j.Content
+		//tmp["extract_content"] = j.Content
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
 				for field, _ := range e.Fields {
@@ -1554,7 +1555,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 { //reids未找到,执行规则匹配
+	if i == 0 {                            //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 3 - 2
src/jy/extract/extractInit.go

@@ -369,11 +369,12 @@ func (e *ExtractTask) InitRuleCore() {
 						tmp := strings.Split(rinfo.RuleText, "__")
 						var pattern string
 						if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
 							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 						} else {
 							pattern = tmp[0]
 						}
-						pattern, _ = strconv.Unquote(`"` + pattern + `"`)
 						if len(tmp) == 2 {
 							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 						} else {
@@ -413,12 +414,12 @@ func (e *ExtractTask) InitRuleCore() {
 						tmp := strings.Split(rinfo.RuleText, "__")
 						var pattern string
 						if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
 							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 						} else {
 							pattern = tmp[0]
 						}
-						pattern, _ = strconv.Unquote(`"` + pattern + `"`)
 						if len(tmp) == 2 {
 							epos := strings.Split(tmp[1], ",")
 							posm := map[string]int{}

+ 10 - 17
src/jy/extract/score.go

@@ -108,29 +108,22 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				//qz := TagConfig["其他"][field]
 				//tmps[tmpsindex].Score += 2 * qz //乘以权重系数
 			}
-			if tmpsvalue.ExtFrom != "title" { //非标题抽取
-				//是否有kv值
-				if strings.Contains(tmpsvalue.Type, "colon") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "colonkv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])})
-				} else if strings.Contains(tmpsvalue.Type, "space") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "spacekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])})
-				} else if strings.Contains(tmpsvalue.Type, "table") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "tablekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])})
-				}
+			//是否有kv值
+			if strings.Contains(tmpsvalue.Type, "colon") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "colonkv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])})
+			} else if strings.Contains(tmpsvalue.Type, "space") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "spacekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])})
+			} else if strings.Contains(tmpsvalue.Type, "table") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "tablekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])})
 			}
 			if tmpsvalue.ExtFrom != "title" { //非标题抽取
 				if strings.Contains(tmpsvalue.Type, "regexp") {
 					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "regexp", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])})
 				}
-			} else {
-				if strings.Contains(tmpsvalue.Type, "regexp") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"]) + 1
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "regexp", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"]) + 1})
-				}
 			}
 			scoreRule := SoreConfig[field]
 			if scoreRule == nil {

+ 28 - 18
src/jy/pretreated/analytable.go

@@ -133,7 +133,9 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (k1, k2 []str
 	if sv, sok := v.(string); sok { //取KV
 		v1 = sv
 	} else if sv, sok := v.([]string); sok { //是数组先默认取第一个
-		v1 = sv[0]
+		if len(sv) >= 1 {
+			v1 = sv[0]
+		}
 	}
 	//对值单位的处理   (预算|费|价|额|规模|投资)
 	if moneyreg.MatchString(tk) {
@@ -228,6 +230,10 @@ func (table *Table) KVFilter() {
 			continue
 		}
 		v := table.SortKV.Map[k]
+		if table.SortKVWeight[k] == -99 { //td格式化kv降低权重
+			as.AddKey(k, v)
+			continue
+		}
 		if _, ok := v.(string); ok { //table.SortKV.Value为字符串,匹配抽取关键词table.SortKV.Key,匹配到添加k,v到table.StandKV,table.StandKVWeight
 			k = regSpliteSegment.ReplaceAllString(regReplAllSpace.ReplaceAllString(k, ""), "")
 			k1, n_k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v) //对key标准化处理,没有找到会走中标
@@ -257,7 +263,7 @@ func (table *Table) KVFilter() {
 			} else {
 				if table.StandKV[k] == "" && qutil.ObjToString(v) != "" {
 					table.StandKV[k] = qutil.ObjToString(v)
-					table.StandKVWeight[k] = 0
+					table.StandKVWeight[k] = -99
 				}
 			}
 			for _, n_k2 := range n_k1 {
@@ -347,6 +353,9 @@ func (table *Table) KVFilter() {
 func (table *Table) sortKVArr(as *SortMap, winnertag bool) {
 	checkKey := map[int]bool{}
 	for kn, k := range as.Keys { //遍历table.SortKV.value为数组的key
+		if strings.TrimSpace(table.StandKV[k]) != "" {
+			continue
+		}
 		v := as.Map[k]
 		if vm, ok := v.([]map[string]interface{}); ok && k == NullTxtBid {
 			if table.WinnerOrder == nil {
@@ -646,11 +655,11 @@ func AnalyTableV2(tabs *goquery.Selection, toptype, blockTag, con string, itype
 	tabres = NewTableResult(_id, toptype, blockTag, con, itype, ruleBlock)
 	//可以有多个table
 	//for _, table := range tabs {
-		//隐藏表格跳过
-		if IsHide(tabs) {
-			return
-		}
-		tabres.GoqueryTabs = tabs
+	//隐藏表格跳过
+	if IsHide(tabs) {
+		return
+	}
+	tabres.GoqueryTabs = tabs
 	//}
 	//解析表格集
 	tabres.Analy()
@@ -665,16 +674,16 @@ func (ts *TableResult) Analy() {
 		MatchMap: map[string]map[string]bool{},
 	}
 	//for _, table := range ts.GoqueryTabs {
-		tn := NewTable(ts.Html, ts, ts.GoqueryTabs)
-		//核心模块
-		tsw := tn.Analy(contactFormat)
-		for _, tab := range tsw {
-			if len(tab.TRs) > 0 {
-				tabs = append(tabs, tab)
-			}
-			//fmt.Println("tab.SortKV.Map", tab.SortKV.Keys)
+	tn := NewTable(ts.Html, ts, ts.GoqueryTabs)
+	//核心模块
+	tsw := tn.Analy(contactFormat)
+	for _, tab := range tsw {
+		if len(tab.TRs) > 0 {
+			tabs = append(tabs, tab)
 		}
-		//tn.SonTables = append(tn.SonTables, tn)
+		//fmt.Println("tab.SortKV.Map", tab.SortKV.Keys)
+	}
+	//tn.SonTables = append(tn.SonTables, tn)
 	//}
 	//统一合并,考虑统一多表格是多包的情况---新增
 	if len(tabs) > 1 {
@@ -789,7 +798,7 @@ func (table *Table) createTabe(trs *goquery.Selection) {
 			td := NewTD(selm, TR, table) //初始化td,kv处理,td中有table处理,td的方向
 			//num++
 			TR.AddTD(td)
-			if td.Val == "" && td.SonTableResult == nil && len(td.SortKV.Map) == 0{ //删除一个tr,tr中所有td是空值的
+			if td.Val == "" && td.SonTableResult == nil && len(td.SortKV.Map) == 0 { //删除一个tr,tr中所有td是空值的
 				empty++
 				if tds.Size() == empty {
 					tdTextIsNull = true
@@ -1479,6 +1488,7 @@ func (table *Table) FindKV() {
 										continue
 									}
 									table.SortKV.AddKey(tdk, tdv)
+									table.SortKVWeight[tdk] = -99
 								}
 							}
 						}
@@ -3185,7 +3195,7 @@ func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMa
 	for _, key := range table.SortKV.Keys { //遍历table.SortKV.Keys而不是直接遍历table.SortKV.Map是为了得到table头的顺序
 		val := table.SortKV.Map[key]
 		key = regReplAllSpace.ReplaceAllString(key, "")
-		key = strings.Replace(key, "", "", -1)    //处理一个特殊的采购量 经上层处理空格后未处理掉
+		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
 		if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]}
 			/*
 				{

+ 13 - 11
src/jy/pretreated/tablev2.go

@@ -65,6 +65,7 @@ type TD struct {
 	Val            string             //值
 	Text           string             //原始串
 	SortKV         *SortMap           //存放kv值
+	SortKVWeight   map[string]int     //存放kv值权重
 	Html           string             //html值
 	BH             bool               //是否是表头
 	MustBH         bool               //不能修改的表头
@@ -92,11 +93,12 @@ var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿
 func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	defer qutil.Catch()
 	td := &TD{
-		ArrVal:  []string{},
-		Goquery: Goquery,
-		SonTds:  []*TD{},
-		TR:      tr,
-		SortKV:  NewSortMap(),
+		ArrVal:       []string{},
+		Goquery:      Goquery,
+		SonTds:       []*TD{},
+		TR:           tr,
+		SortKV:       NewSortMap(),
+		SortKVWeight: map[string]int{},
 	}
 	colspan, rowspan := 0, 0
 	col, bcol := td.Goquery.Attr("colspan")
@@ -135,21 +137,19 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock)
 	//看是否划块
 	if len(ub) > 0 {
-		colonKvWeight := map[string]int{}
-		spaceKvWeight := map[string]int{}
 		for _, bl := range ub {
 			//冒号kv
 			for bl_ck, bl_cv := range bl.ColonKV.Kv {
-				if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
-					colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
+				if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= td.SortKVWeight[bl_ck] {
 					td.SortKV.AddKey(bl_ck, bl_cv)
+					td.SortKVWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
 				}
 			}
 			//空格kv
 			for bl_sk, bl_sv := range bl.SpaceKV.Kv {
-				if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
-					spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
+				if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= td.SortKVWeight[bl_sk] {
 					td.SortKV.AddKey(bl_sk, bl_sv)
+					td.SortKVWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
 				}
 			}
 		}
@@ -574,6 +574,7 @@ type Table struct {
 	TDNum                  int                       //td个数
 	BPackage               bool                      //是否有包
 	SortKV                 *SortMap                  //带排序的KV值
+	SortKVWeight           map[string]int            //带排序的KV值
 	StandKV                map[string]string         //过滤后的标准化kv
 	StandKVWeight          map[string]int            //过滤后的标准化kv
 	StandRuleKV            map[string]string         //过滤后的规则kv
@@ -601,6 +602,7 @@ func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Ta
 	return &Table{
 		Html:                   Html,
 		SortKV:                 NewSortMap(),
+		SortKVWeight:           map[string]int{},
 		StandKV:                map[string]string{},
 		StandKVWeight:          map[string]int{},
 		kvscope:                map[int]map[int][]*TD{},

+ 3 - 3
src/res/fieldscore.json

@@ -3,8 +3,8 @@
         "describe": "抽取类型打分",
         "title": 4,
         "table": 3,
-        "colon": 2,
-        "space": 2,
+        "colon": 3,
+        "space": 3,
         "regexp": 2,
         "winnerorder": 3
     },
@@ -32,7 +32,7 @@
         "length": [
             {
                 "describe": "长度打分min>val:-6,min<=val<=max:1,max<val:-1",
-                "min": 4,
+                "min": 5,
                 "max": 35,
                 "score": [
                     -10,

+ 1 - 1
versioncomparison/config.json

@@ -17,7 +17,7 @@
         "buyertel",
         "buyeraddr",
         "agencyperson",
-        "agencytel",5d39d253a5cb26b9b7404ae1,5d3b23aaa5cb26b9b7c1ec59
+        "agencytel",
         "agencyaddr"
     ]
 }

+ 16 - 10
versioncomparison/main.go

@@ -15,13 +15,14 @@ import (
 )
 
 var (
-	SysConfig map[string]interface{}
-	Premgo    *mongodbutil.Pool //上个版本库
-	Newmgo    *mongodbutil.Pool //当前版本库
-	FieldData map[string]map[string]*Data
-	Compares  map[string]*Compare
-	Sid, Eid  string
-	Fields    []string
+	SysConfig   map[string]interface{}
+	Premgo      *mongodbutil.Pool //上个版本库
+	Newmgo      *mongodbutil.Pool //当前版本库
+	FieldData   map[string]map[string]*Data
+	Compares    map[string]*Compare
+	Sid, Eid    string
+	Fields      []string
+	FieldsQuery string
 )
 
 type Compare struct {
@@ -44,8 +45,13 @@ func init() {
 	Premgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["premgo"]), qu.ObjToString(SysConfig["predb"]))
 	Newmgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["newmgo"]), qu.ObjToString(SysConfig["newdb"]))
 	tmp, _ := SysConfig["fields"].([]interface{})
-	for _, v := range tmp {
+	for k, v := range tmp {
 		Fields = append(Fields, qu.ObjToString(v))
+		if k < (len(tmp) - 1) {
+			FieldsQuery += `"` + qu.ObjToString(v) + `":1,`
+		} else {
+			FieldsQuery += `"` + qu.ObjToString(v) + `":1`
+		}
 	}
 	FieldData = map[string]map[string]*Data{}
 	Compares = map[string]*Compare{}
@@ -110,7 +116,7 @@ func createXlsx() {
 func getVersionData() {
 	query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(Sid), "$lte": bson.ObjectIdHex(Eid)}}
 	log.Println(qu.ObjToString(SysConfig["prec"]), query)
-	list1, _ := Premgo.Find(qu.ObjToString(SysConfig["prec"]), query, nil, `{}`, false, -1, -1)
+	list1, _ := Premgo.Find(qu.ObjToString(SysConfig["prec"]), query, nil, `{`+FieldsQuery+`}`, false, -1, -1)
 	for _, v := range *list1 {
 		for _, key := range Fields {
 			rd := FieldData[key]
@@ -126,7 +132,7 @@ func getVersionData() {
 	}
 	log.Println("pre version 加载完成")
 
-	list2, _ := Newmgo.Find(qu.ObjToString(SysConfig["newc"]), query, nil, `{}`, false, -1, -1)
+	list2, _ := Newmgo.Find(qu.ObjToString(SysConfig["newc"]), query, nil, `{`+FieldsQuery+`}`, false, -1, -1)
 	for _, v := range *list2 {
 		for _, field := range Fields {
 			rd := FieldData[field]