浏览代码

先抽取table kv,没有的话抽取td kv

fengweiqiang 6 年之前
父节点
当前提交
b3c2df418b
共有 2 个文件被更改,包括 41 次插入和 29 次删除
  1. + 28 - 18
      src/jy/pretreated/analytable.go
  2. + 13 - 11
      src/jy/pretreated/tablev2.go

+ 28 - 18
src/jy/pretreated/analytable.go

@@ -133,7 +133,9 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (k1, k2 []str
 	if sv, sok := v.(string); sok { //取KV
 		v1 = sv
 	} else if sv, sok := v.([]string); sok { //是数组先默认取第一个
-		v1 = sv[0]
+		if len(sv) >= 1 {
+			v1 = sv[0]
+		}
 	}
 	//对值单位的处理   (预算|费|价|额|规模|投资)
 	if moneyreg.MatchString(tk) {
@@ -228,6 +230,10 @@ func (table *Table) KVFilter() {
 			continue
 		}
 		v := table.SortKV.Map[k]
+		if table.SortKVWeight[k] == -99 { //td格式化kv降低权重
+			as.AddKey(k, v)
+			continue
+		}
 		if _, ok := v.(string); ok { //table.SortKV.Value为字符串,匹配抽取关键词table.SortKV.Key,匹配到添加k,v到table.StandKV,table.StandKVWeight
 			k = regSpliteSegment.ReplaceAllString(regReplAllSpace.ReplaceAllString(k, ""), "")
 			k1, n_k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v) //对key标准化处理,没有找到会走中标
@@ -257,7 +263,7 @@ func (table *Table) KVFilter() {
 			} else {
 				if table.StandKV[k] == "" && qutil.ObjToString(v) != "" {
 					table.StandKV[k] = qutil.ObjToString(v)
-					table.StandKVWeight[k] = 0
+					table.StandKVWeight[k] = -99
 				}
 			}
 			for _, n_k2 := range n_k1 {
@@ -347,6 +353,9 @@ func (table *Table) KVFilter() {
 func (table *Table) sortKVArr(as *SortMap, winnertag bool) {
 	checkKey := map[int]bool{}
 	for kn, k := range as.Keys { //遍历table.SortKV.value为数组的key
+		if strings.TrimSpace(table.StandKV[k]) != "" {
+			continue
+		}
 		v := as.Map[k]
 		if vm, ok := v.([]map[string]interface{}); ok && k == NullTxtBid {
 			if table.WinnerOrder == nil {
@@ -646,11 +655,11 @@ func AnalyTableV2(tabs *goquery.Selection, toptype, blockTag, con string, itype
 	tabres = NewTableResult(_id, toptype, blockTag, con, itype, ruleBlock)
 	//可以有多个table
 	//for _, table := range tabs {
-		//隐藏表格跳过
-		if IsHide(tabs) {
-			return
-		}
-		tabres.GoqueryTabs = tabs
+	//隐藏表格跳过
+	if IsHide(tabs) {
+		return
+	}
+	tabres.GoqueryTabs = tabs
 	//}
 	//解析表格集
 	tabres.Analy()
@@ -665,16 +674,16 @@ func (ts *TableResult) Analy() {
 		MatchMap: map[string]map[string]bool{},
 	}
 	//for _, table := range ts.GoqueryTabs {
-		tn := NewTable(ts.Html, ts, ts.GoqueryTabs)
-		//核心模块
-		tsw := tn.Analy(contactFormat)
-		for _, tab := range tsw {
-			if len(tab.TRs) > 0 {
-				tabs = append(tabs, tab)
-			}
-			//fmt.Println("tab.SortKV.Map", tab.SortKV.Keys)
+	tn := NewTable(ts.Html, ts, ts.GoqueryTabs)
+	//核心模块
+	tsw := tn.Analy(contactFormat)
+	for _, tab := range tsw {
+		if len(tab.TRs) > 0 {
+			tabs = append(tabs, tab)
 		}
-		//tn.SonTables = append(tn.SonTables, tn)
+		//fmt.Println("tab.SortKV.Map", tab.SortKV.Keys)
+	}
+	//tn.SonTables = append(tn.SonTables, tn)
 	//}
 	//统一合并,考虑统一多表格是多包的情况---新增
 	if len(tabs) > 1 {
@@ -789,7 +798,7 @@ func (table *Table) createTabe(trs *goquery.Selection) {
 			td := NewTD(selm, TR, table) //初始化td,kv处理,td中有table处理,td的方向
 			//num++
 			TR.AddTD(td)
-			if td.Val == "" && td.SonTableResult == nil && len(td.SortKV.Map) == 0{ //删除一个tr,tr中所有td是空值的
+			if td.Val == "" && td.SonTableResult == nil && len(td.SortKV.Map) == 0 { //删除一个tr,tr中所有td是空值的
 				empty++
 				if tds.Size() == empty {
 					tdTextIsNull = true
@@ -1479,6 +1488,7 @@ func (table *Table) FindKV() {
 										continue
 									}
 									table.SortKV.AddKey(tdk, tdv)
+									table.SortKVWeight[tdk] = -99
 								}
 							}
 						}
@@ -3185,7 +3195,7 @@ func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMa
 	for _, key := range table.SortKV.Keys { //遍历table.SortKV.Keys而不是直接遍历table.SortKV.Map是为了得到table头的顺序
 		val := table.SortKV.Map[key]
 		key = regReplAllSpace.ReplaceAllString(key, "")
-		key = strings.Replace(key, "", "", -1)    //处理一个特殊的采购量 经上层处理空格后未处理掉
+		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
 		if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]}
 			/*
 				{

+ 13 - 11
src/jy/pretreated/tablev2.go

@@ -65,6 +65,7 @@ type TD struct {
 	Val            string             //值
 	Text           string             //原始串
 	SortKV         *SortMap           //存放kv值
+	SortKVWeight   map[string]int     //存放kv值权重
 	Html           string             //html值
 	BH             bool               //是否是表头
 	MustBH         bool               //不能修改的表头
@@ -92,11 +93,12 @@ var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿
 func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	defer qutil.Catch()
 	td := &TD{
-		ArrVal:  []string{},
-		Goquery: Goquery,
-		SonTds:  []*TD{},
-		TR:      tr,
-		SortKV:  NewSortMap(),
+		ArrVal:       []string{},
+		Goquery:      Goquery,
+		SonTds:       []*TD{},
+		TR:           tr,
+		SortKV:       NewSortMap(),
+		SortKVWeight: map[string]int{},
 	}
 	colspan, rowspan := 0, 0
 	col, bcol := td.Goquery.Attr("colspan")
@@ -135,21 +137,19 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock)
 	//看是否划块
 	if len(ub) > 0 {
-		colonKvWeight := map[string]int{}
-		spaceKvWeight := map[string]int{}
 		for _, bl := range ub {
 			//冒号kv
 			for bl_ck, bl_cv := range bl.ColonKV.Kv {
-				if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
-					colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
+				if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= td.SortKVWeight[bl_ck] {
 					td.SortKV.AddKey(bl_ck, bl_cv)
+					td.SortKVWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
 				}
 			}
 			//空格kv
 			for bl_sk, bl_sv := range bl.SpaceKV.Kv {
-				if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
-					spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
+				if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= td.SortKVWeight[bl_sk] {
 					td.SortKV.AddKey(bl_sk, bl_sv)
+					td.SortKVWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
 				}
 			}
 		}
@@ -574,6 +574,7 @@ type Table struct {
 	TDNum                  int                       //td个数
 	BPackage               bool                      //是否有包
 	SortKV                 *SortMap                  //带排序的KV值
+	SortKVWeight           map[string]int            //带排序的KV值
 	StandKV                map[string]string         //过滤后的标准化kv
 	StandKVWeight          map[string]int            //过滤后的标准化kv
 	StandRuleKV            map[string]string         //过滤后的规则kv
@@ -601,6 +602,7 @@ func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Ta
 	return &Table{
 		Html:                   Html,
 		SortKV:                 NewSortMap(),
+		SortKVWeight:           map[string]int{},
 		StandKV:                map[string]string{},
 		StandKVWeight:          map[string]int{},
 		kvscope:                map[int]map[int][]*TD{},