Browse Source

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

maxiaoshan 6 years ago
parent
commit
88b69784d6

+ 6 - 5
src/config.json

@@ -3,15 +3,15 @@
     "mgodb": "192.168.3.207:27082",
     "dbsize": 2,
     "dbname": "extract_kf",
-    "redis": "buyer=192.168.3.207:1377,winner=192.168.3.207:1378,agency=192.168.3.207:1379",
-    "elasticsearch": "http://192.168.3.18:9800",
+    "redis": "buyer=192.168.3.207:3379,winner=192.168.3.207:3379,agency=192.168.3.207:3379",
+    "elasticsearch": "http://192.168.3.11:9800",
     "elasticPoolSize": 30,
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
-    "saveresult": false,
+    "saveresult": true,
     "fieldscore": true,
     "qualityaudit": false,
-    "saveblock": false,
+    "saveblock": true,
     "filelength": 100000,
     "iscltlog": false,
     "brandgoods": false,
@@ -56,5 +56,6 @@
                 "vswitchid": "vsw-2ze1n1k3mo3fv2irsfdps"
             }
         ]
-    }
+    },
+	"isSaveTag":false
 } 

+ 6 - 6
src/jy/clear/tonumber.go

@@ -208,11 +208,11 @@ func capitalMoney(data []interface{}) []interface{} {
 	if len(strmatch) > 0 {
 		str = strmatch[0][0]
 	}
-	//修正单位类似:捌万伍仟肆佰捌拾贰万元整
-	if strings.Contains(str, "万元") {
-		str = strings.Replace(str, "万元", "#B#", -1)
-		str = strings.Replace(str, "万", "亿", -1)
-		str = strings.Replace(str, "#B#", "万元", -1)
+	suffixUnit := float64(1)
+	if strings.HasSuffix(str, "万") || strings.HasSuffix(str, "万元") || strings.HasSuffix(str, "万元整") {
+		index := strings.LastIndex(str, "万")
+		str = str[0:index]
+		suffixUnit = float64(10000)
 	}
 	moneyRegChar.ReplaceAllStringFunc(str, func(key string) string {
 		if key == "元" || key == "圆" || key == "点" {
@@ -262,7 +262,7 @@ func capitalMoney(data []interface{}) []interface{} {
 	for _, v := range nodes {
 		ret += v
 	}
-	return []interface{}{ret + decimals, data[1]}
+	return []interface{}{(ret + decimals) * suffixUnit, data[1]}
 }
 
 //过滤符号

+ 133 - 28
src/jy/extract/extract.go

@@ -24,12 +24,12 @@ import (
 var (
 	lock, lockrule, lockclear sync.RWMutex
 
-	cut           = ju.NewCut()                          //获取正文并清理
-	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask                //任务列表
-	ClearTaskList map[string]*ClearTask                  //清理任务列表
-	saveLimit     = 200                                  //抽取日志批量保存
-	PageSize      = 5000                                 //查询分页
+	cut     = ju.NewCut()                          //获取正文并清理
+	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask          //任务列表
+	ClearTaskList map[string]*ClearTask            //清理任务列表
+	saveLimit     = 200                            //抽取日志批量保存
+	PageSize      = 5000                           //查询分页
 	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -892,7 +892,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						if v.RegCore.NumSign == -1 { //正负值修正
 							val = "-" + val
 						}
-						exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: val, Value: val}
+						exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
 						if extfrom == "title" {
 							exfield.Score = 4
 						}
@@ -919,6 +919,59 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 					}
 				}
 			}
+			if len(extinfo) == 0 {
+				regArr := strings.Split(v.RuleText, "__")
+				//fmt.Println(regArr[0])
+				if len(regArr) > 0 {
+					reg, err := regexp.Compile(regArr[0])
+					if err == nil {
+						datavals := reg.FindStringSubmatch(text)
+						tmps := []map[string]interface{}{}
+						for _, value := range datavals {
+							if value == "" {
+								continue
+							}
+							tmp := map[string]interface{}{
+								"field":     v.Field,
+								"code":      v.Code + "去除__*后",
+								"ruletext":  regArr[0],
+								"extfrom":   extfrom,
+								"value":     value,
+								"type":      "regexp",
+								"matchtype": "regcontent",
+								"blocktag":  *tag,
+							}
+							tmps = append(tmps, tmp)
+							extinfo[v.Field] = tmps
+
+							exfield := ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: value}
+							if extfrom == "title" {
+								exfield.Score = 4
+							}
+							if tmp["blocktag"] != nil {
+								exfield.BlockTag = tmp["blocktag"].(map[string]bool)
+							}
+							item := ju.ScoreItem{Des: "初始化抽取规则去除__*", Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: value}
+							if extfrom == "title" {
+								item.Score = 4
+							}
+							if strings.Contains(value, "\n") {
+								item.Score -= 1
+								exfield.Score -= 1
+							}
+							if tmp["scoreitem"] == nil {
+								sitems := make([]*ju.ScoreItem, 0)
+								sitems = append(sitems, &item)
+								exfield.ScoreItem = sitems
+							} else {
+								exfield.ScoreItem = append(exfield.ScoreItem, &item)
+							}
+							j.Result[v.Field] = append(j.Result[v.Field], &exfield)
+							//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
+						}
+					}
+				}
+			}
 		}
 	} else {
 		pos := v.RegCore.Reg.FindStringIndex(text)
@@ -948,7 +1001,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 			if j.Result[v.Field] == nil {
 				j.Result[v.Field] = [](*ju.ExtField){}
 			}
-			field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: val, Value: val}
+			field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
 			if extfrom == "title" {
 				field.Score = 4
 			}
@@ -1153,7 +1206,9 @@ type FieldValue struct {
 func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 	qu.Try(func() {
 		doc, result, _id := funcAnalysis(j)
-		go otherNeedSave(j, result, e)
+		if isSaveTag, _ := ju.Config["isSaveTag"].(bool); isSaveTag {
+			go otherNeedSave(j, result, e)
+		}
 		auxinfo := auxInfo(j)
 		//从排序结果中取值
 		tmp := map[string]interface{}{} //抽取值
@@ -1250,23 +1305,29 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
 			}
 			//把所有kv组装成一个字符串,存库
-			for ck, cv := range v.ColonKV.Kv {
-				kvtext.WriteString(ck)
-				kvtext.WriteString(":")
-				kvtext.WriteString(cv)
-				kvtext.WriteString(" ")
-			}
-			for sk, sv := range v.SpaceKV.Kv {
-				kvtext.WriteString(sk)
-				kvtext.WriteString(":")
-				kvtext.WriteString(sv)
-				kvtext.WriteString(" ")
-			}
-			for tk, tv := range v.TableKV.Kv {
-				kvtext.WriteString(tk)
-				kvtext.WriteString(":")
-				kvtext.WriteString(tv)
-				kvtext.WriteString(" ")
+			if v.ColonKV != nil {
+				for ck, cv := range v.ColonKV.Kv {
+					kvtext.WriteString(ck)
+					kvtext.WriteString(":")
+					kvtext.WriteString(cv)
+					kvtext.WriteString(" ")
+				}
+			}
+			if v.SpaceKV != nil {
+				for sk, sv := range v.SpaceKV.Kv {
+					kvtext.WriteString(sk)
+					kvtext.WriteString(":")
+					kvtext.WriteString(sv)
+					kvtext.WriteString(" ")
+				}
+			}
+			if v.TableKV != nil {
+				for tk, tv := range v.TableKV.Kv {
+					kvtext.WriteString(tk)
+					kvtext.WriteString(":")
+					kvtext.WriteString(tv)
+					kvtext.WriteString(" ")
+				}
 			}
 		}
 		if kvtext.Len() > 0 {
@@ -1328,13 +1389,57 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 // otherNeedSave stores tag candidates found on key/values, tables and blocks
 // so that new tags end up in the database.
 // Each saved record has the keys val, type, times, firstid and createtime
 // (original note: these are used to decide the field).
 //
 // The target collection is e.TaskInfo.TestColl + "_tag", or
 // "extract_tag_result" when TestColl is empty. Records are written with
 // db.Mgo.SaveBulk in batches of 200. The result parameter is not read here.
 func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
+	now := time.Now().Unix() // one timestamp shared by every record of this call
 	coll := e.TaskInfo.TestColl
 	if coll == "" {
 		coll = "extract_tag_result"
 	} else {
 		coll += "_tag"
 	}
-	//for _,v := range j.ColonKV
+	datas := []map[string]interface{}{} // pending batch of records
+	kv := map[string]int{}              // key -> number of KvTag entries whose Weight equals ju.RetainKvWeight
+	for _, v := range j.Block {
+		// Count retained keys across the block's colon, table and space key/value sets.
+		for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
+			if vv == nil || vv.KvTag == nil {
+				continue
+			}
+			for kkk, vvv := range vv.KvTag {
+				if vvv.Weight == ju.RetainKvWeight {
+					kv[kkk] = kv[kkk] + 1
+				}
+			}
+		}
+		for _, vv := range v.NotClassifyTitles { // one "block" record per NotClassifyTitles entry
+			datas = append(datas, map[string]interface{}{
+				"val":        vv,
+				"times":      0,
+				"type":       "block",
+				"firstid":    j.SourceMid,
+				"createtime": now,
+			})
+			if len(datas) == 200 { // flush a full batch and start a new one
+				db.Mgo.SaveBulk(coll, datas...)
+				datas = []map[string]interface{}{}
+			}
+		}
+	}
+	for k, v := range kv { // one "kv" record per counted key; times is the count
+		datas = append(datas, map[string]interface{}{
+			"val":        k,
+			"times":      v,
+			"type":       "kv",
+			"firstid":    j.SourceMid,
+			"createtime": now,
+		})
+		if len(datas) == 200 {
+			db.Mgo.SaveBulk(coll, datas...)
+			datas = []map[string]interface{}{}
+		}
+	}
+	if len(datas) > 0 { // write whatever is left in the last partial batch
+		db.Mgo.SaveBulk(coll, datas...)
+	}
 }
 
 func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
@@ -1449,7 +1554,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 {                            //reids未找到,执行规则匹配
+	if i == 0 { //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 2 - 3
src/jy/extract/extractInit.go

@@ -369,12 +369,11 @@ func (e *ExtractTask) InitRuleCore() {
 						tmp := strings.Split(rinfo.RuleText, "__")
 						var pattern string
 						if strings.Contains(tmp[0], "\\u") {
-							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
 							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
-							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 						} else {
 							pattern = tmp[0]
 						}
+						pattern, _ = strconv.Unquote(`"` + pattern + `"`)
 						if len(tmp) == 2 {
 							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 						} else {
@@ -414,12 +413,12 @@ func (e *ExtractTask) InitRuleCore() {
 						tmp := strings.Split(rinfo.RuleText, "__")
 						var pattern string
 						if strings.Contains(tmp[0], "\\u") {
-							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
 							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 						} else {
 							pattern = tmp[0]
 						}
+						pattern, _ = strconv.Unquote(`"` + pattern + `"`)
 						if len(tmp) == 2 {
 							epos := strings.Split(tmp[1], ",")
 							posm := map[string]int{}

+ 41 - 17
src/jy/pretreated/analystep.go

@@ -40,22 +40,29 @@ func AnalyStart(job *util.Job) {
 			//块中再查找表格(块,处理完把值赋到块)
 			t1, _ := ComputeConRatio(bl.Text, 2)
 			if len(t1) > 0 {
-				job.HasTable = 1                                                                             //添加标识:文本中有table
-				tabres := AnalyTableV2(t1, job.Category, bl.Title, bl.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
-				processTableResult(tabres, bl, job)                                                          //分析table解析结果
-				if bl.Title == "" && tabres.BlockTag != "" {
-					bl.Title = tabres.BlockTag
+				job.HasTable = 1
+				for i := 0; i < len(tabs); i++ {
+					bl := &util.Block{}
+					//添加标识:文本中有table
+					tabres := AnalyTableV2(t1[0], job.Category, bl.Title, bl.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
+					processTableResult(tabres, bl, job)                                                             //分析table解析结果
+					if bl.Title == "" && tabres.BlockTag != "" {
+						bl.Title = tabres.BlockTag
+					}
+					if len(bl.TableKV.Kv) > 0 {
+						bl.Text = tabs[i].Text()
+						job.Block = append(job.Block, bl)
+					}
 				}
 				//				for k, v := range bl.TableKV.Kv {
 				//					log.Println("bl.TableKV.Kv", k, v)
 				//				}
 			}
-			job.Block = append(job.Block, bl)
-
 			if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
 				//新加table未找到winnerorder, 从分块文本中找中标候选人
 				job.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
 			}
+			job.Block = append(job.Block, bl)
 		}
 	} else { //未分块,创建分块
 		bl := &util.Block{}
@@ -64,8 +71,19 @@ func AnalyStart(job *util.Job) {
 			job.HasTable = 1 //添加标识:文本中有table
 			newCon = TextAfterRemoveTable(con)
 			job.BlockPackage = FindPackageFromText(job.Title, newCon)
-			tabres := AnalyTableV2(tabs, job.Category, "", con, 1, job.SourceMid, job.RuleBlock)
-			processTableResult(tabres, bl, job)
+			for i := 0; i < len(tabs); i++ {
+				bl := &util.Block{}
+				//添加标识:文本中有table
+				tabres := AnalyTableV2(tabs[i], job.Category, "", con, 1, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
+				processTableResult(tabres, bl, job)                                                     //分析table解析结果
+				if bl.Title == "" && tabres.BlockTag != "" {
+					bl.Title = tabres.BlockTag
+				}
+				if len(bl.TableKV.Kv) > 0 {
+					bl.Text = tabs[i].Text()
+					job.Block = append(job.Block, bl)
+				}
+			}
 			//			for k, v := range bl.TableKV.Kv {
 			//				log.Println("bl.TableKV.Kv", k, v)
 			//			}
@@ -104,27 +122,29 @@ func FindProjectCode(newCon string, job *util.Job) {
 	var proCode string
 	proCode = projectcodeReg.FindString(newCon)
 	blCode := &util.Block{}
-	blCode.Text = proCode
 	if proCode != "" {
 		ckv := GetKVAll(proCode, job.Title, nil, 1)
 		blCode.ColonKV = ckv
+		blCode.Text = proCode
 		job.Block = append(job.Block, blCode)
-	}else if proCode = projectcodeReg2.FindString(newCon);proCode !=""{
+	} else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
 		ckv := GetKVAll(proCode, job.Title, nil, 1)
 		blCode.ColonKV = ckv
+		blCode.Text = proCode
 		job.Block = append(job.Block, blCode)
-	}else if proCode = projectcodeReg3.FindString(newCon) ;proCode !=""{
+	} else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
 		ckv := GetKVAll(proCode, job.Title, nil, 1)
+		blCode.Text = proCode
 		blCode.ColonKV = ckv
 		job.Block = append(job.Block, blCode)
 	}
-	if proCode = jsonReg.FindString(newCon);proCode != ""{
+	if proCode = jsonReg.FindString(newCon); proCode != "" {
 		jsonMap := make(map[string]string)
-		json.Unmarshal([]byte(proCode),&jsonMap)
+		json.Unmarshal([]byte(proCode), &jsonMap)
 		jobKv := util.NewJobKv()
-		for k,v := range jsonMap{
+		for k, v := range jsonMap {
 			tmpkv := new(util.Kv)
-			tmpkv.Line = k+v
+			tmpkv.Line = k + v
 			tmpkv.Key = k
 			tmpkv.Value = v
 			jobKv.Kvs = append(jobKv.Kvs, tmpkv)
@@ -146,7 +166,11 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
 	for k, v := range tabres.SortKVWeight {
 		kvIndex[k] = v
 	}
-	block.TableKV = &util.JobKv{Kv: kv, KvIndex: kvIndex}
+	KvTag := map[string]*util.Tag{}
+	for k, _ := range tabres.SortKV.NotTagKey {
+		KvTag[k] = &util.Tag{Weight: util.RetainKvWeight}
+	}
+	block.TableKV = &util.JobKv{Kv: kv, KvIndex: kvIndex, KvTag: KvTag}
 
 	//分包
 	tablePackage := map[string]*util.BlockPackage{}

+ 41 - 25
src/jy/pretreated/analytable.go

@@ -3,6 +3,7 @@ package pretreated
 import (
 	"fmt"
 	u "jy/util"
+	"log"
 	qutil "qfw/util"
 	"regexp"
 	"strings"
@@ -125,8 +126,8 @@ func IsHide(g *goquery.Selection) (b bool) {
 
 //对表格的key进行标准化处理,多个k相同时,出现覆盖问题
 //待扩展,暂不支持正则标签库
-func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (k1 []string, weight []int, v1, returntag string, b bool) {
-	k1 = []string{}
+func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (k1, k2 []string, weight []int, v1, returntag string, b bool) {
+	k1, k2 = []string{}, []string{}
 	weight = []int{}
 	tk := k
 	if sv, sok := v.(string); sok { //取KV
@@ -177,6 +178,8 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (k1 []string,
 				returntag = "中标情况"
 			}
 			b = true
+		} else {
+			k2 = append(k2, k)
 		}
 	}
 	//对上一步没有取到标准化key的进一步处理
@@ -227,7 +230,7 @@ func (table *Table) KVFilter() {
 		v := table.SortKV.Map[k]
 		if _, ok := v.(string); ok { //table.SortKV.Value为字符串,匹配抽取关键词table.SortKV.Key,匹配到添加k,v到table.StandKV,table.StandKVWeight
 			k = regSpliteSegment.ReplaceAllString(regReplAllSpace.ReplaceAllString(k, ""), "")
-			k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v) //对key标准化处理,没有找到会走中标
+			k1, n_k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v) //对key标准化处理,没有找到会走中标
 			//qutil.Debug(k, v, k1, w1, v1, tag, b)
 			if b {
 				//降低冒号值的权重
@@ -257,6 +260,9 @@ func (table *Table) KVFilter() {
 					table.StandKVWeight[k] = 0
 				}
 			}
+			for _, n_k2 := range n_k1 {
+				table.SortKV.NotTagKey[n_k2] = true
+			}
 		} else {
 			//u.Debug(k, v, "---------")
 			as.AddKey(k, v)
@@ -473,7 +479,7 @@ func (table *Table) sortKVArr(as *SortMap, winnertag bool) {
 					}
 				}
 			}
-			k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v)
+			k1, n_k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v)
 			if b {
 				if tag != "" && table.Tag == "" {
 					table.Tag = tag
@@ -489,6 +495,10 @@ func (table *Table) sortKVArr(as *SortMap, winnertag bool) {
 					//					}
 					//				}
 				}
+			} else {
+				for _, n_k2 := range n_k1 {
+					table.SortKV.NotTagKey[n_k2] = true
+				}
 			}
 		}
 	}
@@ -625,7 +635,7 @@ func (table *Table) MergerToTableresult() {
 解析表格入口
 返回:汇总表格对象
 **/
-func AnalyTableV2(tabs []*goquery.Selection, toptype, blockTag, con string, itype int, _id interface{}, ruleBlock *u.RuleBlock) (tabres *TableResult) {
+func AnalyTableV2(tabs *goquery.Selection, toptype, blockTag, con string, itype int, _id interface{}, ruleBlock *u.RuleBlock) (tabres *TableResult) {
 	defer qutil.Catch()
 	//u.Debug(con)
 	if itype == 1 {
@@ -635,13 +645,13 @@ func AnalyTableV2(tabs []*goquery.Selection, toptype, blockTag, con string, ityp
 	//生成tableresult对象
 	tabres = NewTableResult(_id, toptype, blockTag, con, itype, ruleBlock)
 	//可以有多个table
-	for _, table := range tabs {
+	//for _, table := range tabs {
 		//隐藏表格跳过
-		if IsHide(table) {
-			continue
+		if IsHide(tabs) {
+			return
 		}
-		tabres.GoqueryTabs = append(tabres.GoqueryTabs, table)
-	}
+		tabres.GoqueryTabs = tabs
+	//}
 	//解析表格集
 	tabres.Analy()
 	return
@@ -654,18 +664,18 @@ func (ts *TableResult) Analy() {
 		IndexMap: map[int]string{},
 		MatchMap: map[string]map[string]bool{},
 	}
-	for _, table := range ts.GoqueryTabs {
-		tn := NewTable(ts.Html, ts, table)
+	//for _, table := range ts.GoqueryTabs {
+		tn := NewTable(ts.Html, ts, ts.GoqueryTabs)
 		//核心模块
-		ts := tn.Analy(contactFormat)
-		for _, tab := range ts {
-			if len(tab.TRs) > 0{
+		tsw := tn.Analy(contactFormat)
+		for _, tab := range tsw {
+			if len(tab.TRs) > 0 {
 				tabs = append(tabs, tab)
 			}
 			//fmt.Println("tab.SortKV.Map", tab.SortKV.Keys)
 		}
 		//tn.SonTables = append(tn.SonTables, tn)
-	}
+	//}
 	//统一合并,考虑统一多表格是多包的情况---新增
 	if len(tabs) > 1 {
 		pns := map[string]string{}
@@ -779,7 +789,7 @@ func (table *Table) createTabe(trs *goquery.Selection) {
 			td := NewTD(selm, TR, table) //初始化td,kv处理,td中有table处理,td的方向
 			//num++
 			TR.AddTD(td)
-			if td.Val == "" && td.SonTableResult == nil { //删除一个tr,tr中所有td是空值的
+			if td.Val == "" && td.SonTableResult == nil && len(td.SortKV.Map) == 0{ //删除一个tr,tr中所有td是空值的
 				empty++
 				if tds.Size() == empty {
 					tdTextIsNull = true
@@ -840,11 +850,14 @@ func (tn *Table) AnalyTables(contactFormat *u.ContactFormat) []*Table {
 				table.KVFilter()
 			}
 			for k, v := range table.StandKV { //过滤后的标准化kv
-				if table.TableResult.SortKV.Map[k] == nil {
+				if table.TableResult.SortKV.Map[k] == nil || table.StandKVWeight[k] > table.TableResult.SortKVWeight[k] {
 					table.TableResult.SortKV.AddKey(k, v)
 					table.TableResult.SortKVWeight[k] = table.StandKVWeight[k]
 				}
 			}
+			for k, v := range table.SortKV.NotTagKey {
+				table.TableResult.SortKV.NotTagKey[k] = v
+			}
 			//u.Debug(str)
 		}
 	}
@@ -1309,7 +1322,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
 									if ((tdn != nil && v.Rationmap[v2] > fv) || tdn == nil) && td.Valtype != "BO" {
 										td.KeyDirect = 2
 										td.KVDirect = 1
-										//td.BH = true
+										td.BH = true
 									}
 								}
 							}
@@ -2079,7 +2092,7 @@ func (tn *Table) manyPackageProcessByIndex(index []string, standIndex_pos []int)
 			}
 		} else if val, bvs := v1.(string); bvs && len(index) == 1 {
 			//删除子包的kv
-			k1tags, _, _, _, _ := CommonDataAnaly(k1, "", "", val)
+			k1tags, _, _, _, _, _ := CommonDataAnaly(k1, "", "", val)
 			if len(k1tags) > 0 && regexp.MustCompile("^(项目|开标|采购单位|招标机构)").MatchString(k1tags[0]) { //(k1tags[0].Value == "采购单位" || k1tags[0].Value == "项目编号")) {
 				//log.Println("remove", k1, val)
 				tn.SortKV.RemoveKey(k1)
@@ -2301,7 +2314,7 @@ func (tn *Table) assemblePackage(k1, v1, key string) {
 		bp.TableKV = u.NewJobKv()
 	}
 	if v1 != "" {
-		k2, w1, v2, _, bf := CommonDataAnaly(k1, "中标情况", "", v1) //匹配抽取关键词
+		k2, _, w1, v2, _, bf := CommonDataAnaly(k1, "中标情况", "", v1) //匹配抽取关键词
 		if bf {
 			for pos, k3 := range k2 {
 				if bp.TableKV.Kv != nil && bp.TableKV.KvTag[k3] != nil && (bp.TableKV.Kv[k3] == "" || w1[pos] > bp.TableKV.KvTag[k3].Weight) {
@@ -2309,9 +2322,10 @@ func (tn *Table) assemblePackage(k1, v1, key string) {
 					bp.TableKV.KvTag[k3] = &u.Tag{Value: v2, Weight: w1[pos]}
 				} else {
 					bp.TableKV.Kv[k1] = qutil.ObjToString(v1)
-					if tn.SortKV.Map[k3] == nil {
-						tn.SortKV.AddKey(k3, v2) //添加匹配到抽取关键词的key,value
-					}
+					//if tn.SortKV.Map[k3] == nil {
+					//	tn.SortKV.AddKey(k3, v2) //添加匹配到抽取关键词的key,value
+					//	tn.StandKVWeight[k3]=w1[pos]
+					//}
 				}
 			}
 		} else {
@@ -2705,6 +2719,8 @@ func modle(thisTdKvs []*u.Kv, td *TD, myContactType, td_k, td_v string, contactT
 			}
 		}
 		td.SortKV.AddKey(myContactType+td_k, td_v)
+		log.Println(myContactType, td_k, td_v)
+		delete(td.SortKV.NotTagKey, td_k)
 	}
 }
 
@@ -3169,7 +3185,7 @@ func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMa
 	for _, key := range table.SortKV.Keys { //遍历table.SortKV.Keys而不是直接遍历table.SortKV.Map是为了得到table头的顺序
 		val := table.SortKV.Map[key]
 		key = regReplAllSpace.ReplaceAllString(key, "")
-		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
+		key = strings.Replace(key, "", "", -1)    //处理一个特殊的采购量 经上层处理空格后未处理掉
 		if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]}
 			/*
 				{

+ 115 - 109
src/jy/pretreated/tablev2.go

@@ -22,13 +22,13 @@ type TableResult struct {
 	Itype          int         //1全文 2是块
 	BlockTag       string      //块标签
 	Html           string
-	Tabs           []*Table             //子表集合,子表中包含标准化kv或原始kv
-	GoqueryTabs    []*goquery.Selection //goquery对象
-	TableSize      int                  //子表的个数0,1,n
-	IsMultiPackage bool                 //是否有子包
-	PackageMap     *SortMap             //子包对象的sortmap,含标准化过的
-	SortKV         *SortMap             //全局KVmap值,标准化处理过的
-	SortKVWeight   map[string]int       //全局KVmap值,标准化处理过的
+	Tabs           []*Table           //子表集合,子表中包含标准化kv或原始kv
+	GoqueryTabs    *goquery.Selection //goquery对象
+	TableSize      int                //子表的个数0,1,n
+	IsMultiPackage bool               //是否有子包
+	PackageMap     *SortMap           //子包对象的sortmap,含标准化过的
+	SortKV         *SortMap           //全局KVmap值,标准化处理过的
+	SortKVWeight   map[string]int     //全局KVmap值,标准化处理过的
 	WinnerOrder    []map[string]interface{}
 	BrandData      [][]map[string]string //品牌抽取结果
 	HasKey         int                   //有key
@@ -46,7 +46,7 @@ func NewTableResult(Id interface{}, Toptype, BlockTag, con string, Itype int, ru
 		Itype:        Itype,
 		BlockTag:     BlockTag,
 		Tabs:         []*Table{},
-		GoqueryTabs:  []*goquery.Selection{},
+		GoqueryTabs:  &goquery.Selection{},
 		PackageMap:   NewSortMap(),
 		SortKV:       NewSortMap(),
 		SortKVWeight: map[string]int{},
@@ -123,64 +123,55 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 		//qutil.Debug("有子表格")
 		//格式化正文
 		txt = TextAfterRemoveTable(td.Html)
-		td.tdHasTable(&bsontable, tr, table) //处理td中的table,块标签处理,子表解析集处理
-		//处理table外内容
-		var ub []*u.Block
-		ub, _ = DivideBlock("",txt, 2, table.TableResult.RuleBlock)
-		//看是否划块
-		if len(ub) > 0 {
-			colonKvWeight := map[string]int{}
-			spaceKvWeight := map[string]int{}
-			for _, bl := range ub {
-				//冒号kv
-				for bl_ck, bl_cv := range bl.ColonKV.Kv {
-					if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
-						colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
-						td.SortKV.AddKey(bl_ck, bl_cv)
-					}
-				}
-				//空格kv
-				for bl_sk, bl_sv := range bl.SpaceKV.Kv {
-					if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
-						spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
-						td.SortKV.AddKey(bl_sk, bl_sv)
-					}
-				}
-			}
-		}
+		td.tdHasTable(&bsontable, tr) //处理td中的table,块标签处理,子表解析集处理
 	} else {
 		txt = strings.TrimSpace(td.Goquery.Text())
 	}
 	text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
 	td.Val = text //值
 	td.Text = txt //原始串
-	//调用kv解析
-	cKV := GetKVAll(text, "", nil, 1)
-	for k,v :=range cKV.Kv{
-		td.SortKV.AddKey(k,v)
-	}
-	sKV := SspacekvEntity.Entrance(text, "", nil)
-	for k,v :=range sKV.Kv{
-		td.SortKV.AddKey(k,v)
+	//处理table外内容
+	var ub []*u.Block
+	ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock)
+	//看是否划块
+	if len(ub) > 0 {
+		colonKvWeight := map[string]int{}
+		spaceKvWeight := map[string]int{}
+		for _, bl := range ub {
+			//冒号kv
+			for bl_ck, bl_cv := range bl.ColonKV.Kv {
+				if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
+					colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
+					td.SortKV.AddKey(bl_ck, bl_cv)
+				}
+			}
+			//空格kv
+			for bl_sk, bl_sv := range bl.SpaceKV.Kv {
+				if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
+					spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
+					td.SortKV.AddKey(bl_sk, bl_sv)
+				}
+			}
+		}
 	}
 	//抽取不到走正则抽
 	proCode := projectcodeReg.FindString(text)
 	if proCode != "" {
 		ckv := GetKVAll(proCode, "", nil, 1)
-		for k,v :=range ckv.Kv{
-			td.SortKV.AddKey(k,v)
+		for k, v := range ckv.Kv {
+			td.SortKV.AddKey(k, v)
 		}
-	}else if proCode = projectcodeReg2.FindString(text);proCode !=""{
+	} else if proCode = projectcodeReg2.FindString(text); proCode != "" {
 		ckv := GetKVAll(proCode, "", nil, 1)
-		for k,v :=range ckv.Kv{
-			td.SortKV.AddKey(k,v)
+		for k, v := range ckv.Kv {
+			td.SortKV.AddKey(k, v)
 		}
 	}
-	if proCode = jsonReg.FindString(text);proCode != ""{
+	if proCode = jsonReg.FindString(text); proCode != "" {
 		jsonMap := make(map[string]string)
-		json.Unmarshal([]byte(proCode),&jsonMap)
-		for k,v := range jsonMap{
-			td.SortKV.AddKey(k,v)
+		json.Unmarshal([]byte(proCode), &jsonMap)
+		for k, v := range jsonMap {
+			td.SortKV.AddKey(k, v)
 		}
 	}
 	//对td单元格值判断是否是表头和根据td内容长度进行分块处理
@@ -201,7 +192,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 }
 
 //处理td中的table,块标签处理,子表解析集处理
-func (td *TD) tdHasTable(bsontable *bool, tr *TR, table *Table) {
+func (td *TD) tdHasTable(bsontable *bool, tr *TR) {
 	ts := td.TR.Table.TableResult
 	tabs, _ := ComputeConRatio(td.Html, 2) //计算表格占比
 	if len(tabs) > 0 {
@@ -227,64 +218,75 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR, table *Table) {
 				}
 				stag = str
 			}
-			sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
-			td.BH = false
-			for k,v := range sonts.SortKV.Map{
-				if td.TR.Table.TableResult == nil{
-					td.TR.Table.TableResult = NewTableResult(sonts.Id,sonts.Toptype,sonts.BlockTag,sonts.Html,sonts.Itype,sonts.RuleBlock)
+			for _, tv := range tabs {
+				if IsHide(tv) {
+					continue
 				}
-				td.TR.Table.TableResult.SortKV.AddKey(k,v)
-			}
-			//td.SonTableResult = sonts
-			//for _, k := range sonts.SortKV.Keys {
-			//u.Debug(k, sonts.SortKV.Map[k])
-			//				td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
-			//				td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
-			//}
-			//增加brand (子表)
-			//fmt.Println("sonsHasKey=============", sonts.HasKey)
-			//fmt.Println("sonsHasGoods========", sonts.HasGoods)
-			//fmt.Println("sonsHasBrand========", sonts.HasBrand)
-			if sonts.HasKey != 0 {
-				td.TR.Table.TableResult.HasKey = sonts.HasKey
-			}
-			if sonts.HasGoods != 0 {
-				td.TR.Table.TableResult.HasGoods = sonts.HasGoods
-			}
-			if sonts.HasBrand != 0 {
-				td.TR.Table.TableResult.HasBrand = sonts.HasBrand
-			}
-			if sonts.BrandData != nil && len(sonts.BrandData) > 0 { //子table
-				for _, v := range sonts.BrandData {
-					if len(v) > 0 {
-						td.TR.Table.TableResult.BrandData = append(td.TR.Table.TableResult.BrandData, v)
+				sonts := NewTableResult(ts.Id, ts.Toptype, stag, td.Html, 2, td.TR.Table.TableResult.RuleBlock)
+				sonts.GoqueryTabs = tv
+				sonts.Analy()
+
+				//sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
+				td.BH = false
+				for k, v := range sonts.SortKV.Map {
+					if td.TR.Table.TableResult == nil {
+						td.TR.Table.TableResult = NewTableResult(sonts.Id, sonts.Toptype, sonts.BlockTag, sonts.Html, sonts.Itype, sonts.RuleBlock)
 					}
+					td.TR.Table.TableResult.SortKV.AddKey(k, v)
+					td.TR.Table.TableResult.SortKVWeight[k] = sonts.SortKVWeight[k]
 				}
-			}
-			if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
-				td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
-			}
-			if sonts.IsMultiPackage {
-				td.TR.Table.BPackage = true
-				tb1 := td.TR.Table.BlockPackage
-				for k, v := range sonts.PackageMap.Map {
-					v1 := v.(*u.BlockPackage)
-					if tb1.Map[k] == nil {
-						tb1.AddKey(k, v)
-					} else {
-						bp := tb1.Map[k].(*u.BlockPackage)
-						if bp != nil && v1.TableKV != nil && v1.TableKV.Kv != nil {
-							for k2, v2 := range v1.TableKV.Kv {
-								if bp.TableKV.Kv != nil && bp.TableKV.Kv[k2] == "" {
-									bp.TableKV.Kv[k2] = v2
-									bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
+				td.SonTableResult = sonts
+				//for _, k := range sonts.SortKV.Keys {
+				//u.Debug(k, sonts.SortKV.Map[k])
+				//				td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
+				//				td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
+				//}
+				//增加brand (子表)
+				//fmt.Println("sonsHasKey=============", sonts.HasKey)
+				//fmt.Println("sonsHasGoods========", sonts.HasGoods)
+				//fmt.Println("sonsHasBrand========", sonts.HasBrand)
+				if sonts.HasKey != 0 {
+					td.TR.Table.TableResult.HasKey = sonts.HasKey
+				}
+				if sonts.HasGoods != 0 {
+					td.TR.Table.TableResult.HasGoods = sonts.HasGoods
+				}
+				if sonts.HasBrand != 0 {
+					td.TR.Table.TableResult.HasBrand = sonts.HasBrand
+				}
+				if sonts.BrandData != nil && len(sonts.BrandData) > 0 { //子table
+					for _, v := range sonts.BrandData {
+						if len(v) > 0 {
+							td.TR.Table.TableResult.BrandData = append(td.TR.Table.TableResult.BrandData, v)
+						}
+					}
+				}
+				if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
+					td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
+				}
+				if sonts.IsMultiPackage {
+					td.TR.Table.BPackage = true
+					tb1 := td.TR.Table.BlockPackage
+					for k, v := range sonts.PackageMap.Map {
+						v1 := v.(*u.BlockPackage)
+						if tb1.Map[k] == nil {
+							tb1.AddKey(k, v)
+						} else {
+							bp := tb1.Map[k].(*u.BlockPackage)
+							if bp != nil && v1.TableKV != nil && v1.TableKV.Kv != nil {
+								for k2, v2 := range v1.TableKV.Kv {
+									if bp.TableKV.Kv != nil && bp.TableKV.Kv[k2] == "" {
+										bp.TableKV.Kv[k2] = v2
+										bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
+									}
 								}
 							}
 						}
 					}
+					//u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
 				}
-				//u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
 			}
+
 		}
 	}
 }
@@ -436,8 +438,8 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable bool) {
 		*/
 
 		fSortKV := FindKv(td.Val, "", 2)
-		for k,v := range fSortKV.Map{
-			td.SortKV.AddKey(k,v)
+		for k, v := range fSortKV.Map {
+			td.SortKV.AddKey(k, v)
 		}
 		//		td.LeftNode.Val
 		//		for _, vvv := range *td.TR {
@@ -639,18 +641,20 @@ func (t *Table) InsertTR(tr *TR) {
 
 // SortMap is a map that supports ordering.
 // NOTE(review): Index presumably maps a key to its position in Keys —
 // confirm against AddKey/RemoveKey, which are not visible in this diff.
 type SortMap struct {
-	Index map[string]int
-	Keys  []string
-	Map   map[string]interface{}
-	Lock  sync.Mutex
+	Index     map[string]int
+	Keys      []string
+	Map       map[string]interface{}
+	Lock      sync.Mutex
+	NotTagKey map[string]bool // keys recorded by KVFilter/sortKVArr when CommonDataAnaly reported them as not matching a standard tag
 }
 
 // NewSortMap is a convenience constructor for SortMap. It returns a pointer
 // to a SortMap whose Index, Keys, Map and NotTagKey are initialised to empty,
 // non-nil values; Lock is left at its zero value.
 func NewSortMap() *SortMap {
 	return &SortMap{
-		Index: map[string]int{},
-		Keys:  []string{},
-		Map:   map[string]interface{}{},
+		Index:     map[string]int{},
+		Keys:      []string{},
+		Map:       map[string]interface{}{},
+		NotTagKey: map[string]bool{},
 	}
 }
 
@@ -881,11 +885,13 @@ func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio
 	**/
 	return
 }
+
 // HtmlToText returns the plain text of con: it parses con as HTML with
 // goquery and returns the document's Text(). The parse error is discarded.
 func HtmlToText(con string) string {
 	doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
 	return doc2.Text()
 }
+
 //取出排除表格之外的文本
 func TextAfterRemoveTable(con string) string {
 	doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))

+ 8 - 7
src/main.go

@@ -9,16 +9,17 @@ import (
 	_ "jy/front"
 	. "jy/router"
 	"jy/util"
-	"log"
 	qu "qfw/util"
-	"qfw/util/elastic"
+	//"qfw/util/elastic"
 	redis "qfw/util/redis"
+
+	log "github.com/donnie4w/go-logger/logger"
 )
 
 func init() {
-	util.SetConsole(false)
-	util.SetLevel(util.DEBUG)
-	util.SetRollingDaily("./", "out.log")
+	log.SetConsole(false)
+	log.SetLevel(log.DEBUG)
+	log.SetRollingDaily("./", "out.log")
 	qu.ReadConfig(&util.Config)
 	qu.ReadConfig("./res/brandrule.json", &util.BrandRules)
 	qu.ReadConfig("./res/goods.json", &util.GoodsConfig)
@@ -32,7 +33,7 @@ func init() {
 	redis.InitRedisBySize(qu.ObjToString(util.Config["redis"]), 50, 30, 240)
 	//初始化elastic连接
 	//"winner=172.17.145.179:2710,buyer=172.17.145.179:2711"
-	elastic.InitElasticSize(qu.ObjToString(util.Config["elasticsearch"]), qu.IntAllDef(util.Config["elasticPoolSize"], 30))
+	//elastic.InitElasticSize(qu.ObjToString(util.Config["elasticsearch"]), qu.IntAllDef(util.Config["elasticPoolSize"], 30))
 }
 
 func main() {
@@ -40,7 +41,7 @@ func main() {
 	extract.ClearUdp()   //udp通知清理
 	go extract.Export()
 	go Router.Run(":" + qu.ObjToString(util.Config["port"]))
-	go log.Println("启动..", qu.ObjToString(util.Config["port"]))
+	go log.Debug("启动..", qu.ObjToString(util.Config["port"]))
 	lock := make(chan bool)
 	<-lock
 }

+ 8 - 1
versioncomparison/config.json

@@ -11,6 +11,13 @@
         "buyer",
         "bidamount",
         "budget",
-        "winner"
+        "winner",
+        "agency",
+        "buyerperson",
+        "buyertel",
+        "buyeraddr",
+        "agencyperson",
+        "agencytel",
+        "agencyaddr"
     ]
 }

BIN
versioncomparison/template.xlsx