Przeglądaj źródła

Merge branch 'dev3.2' of ssh://192.168.3.207:10022/qmx/jy-data-extract into dev3.2

wcj 6 lat temu
rodzic
commit
41454e5352

+ 6 - 6
src/jy/clear/tonumber.go

@@ -208,11 +208,11 @@ func capitalMoney(data []interface{}) []interface{} {
 	if len(strmatch) > 0 {
 		str = strmatch[0][0]
 	}
-	//修正单位类似:捌万伍仟肆佰捌拾贰万元整
-	if strings.Contains(str, "万元") {
-		str = strings.Replace(str, "万元", "#B#", -1)
-		str = strings.Replace(str, "万", "亿", -1)
-		str = strings.Replace(str, "#B#", "万元", -1)
+	suffixUnit := float64(1)
+	if strings.HasSuffix(str, "万") || strings.HasSuffix(str, "万元") || strings.HasSuffix(str, "万元整") {
+		index := strings.LastIndex(str, "万")
+		str = str[0:index]
+		suffixUnit = float64(10000)
 	}
 	moneyRegChar.ReplaceAllStringFunc(str, func(key string) string {
 		if key == "元" || key == "圆" || key == "点" {
@@ -262,7 +262,7 @@ func capitalMoney(data []interface{}) []interface{} {
 	for _, v := range nodes {
 		ret += v
 	}
-	return []interface{}{ret + decimals, data[1]}
+	return []interface{}{(ret + decimals) * suffixUnit, data[1]}
 }
 
 //过滤符号

+ 27 - 9
src/jy/pretreated/analystep.go

@@ -40,22 +40,29 @@ func AnalyStart(job *util.Job) {
 			//块中再查找表格(块,处理完把值赋到块)
 			t1, _ := ComputeConRatio(bl.Text, 2)
 			if len(t1) > 0 {
-				job.HasTable = 1                                                                             //添加标识:文本中有table
-				tabres := AnalyTableV2(t1, job.Category, bl.Title, bl.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
-				processTableResult(tabres, bl, job)                                                          //分析table解析结果
-				if bl.Title == "" && tabres.BlockTag != "" {
-					bl.Title = tabres.BlockTag
+				job.HasTable = 1
+				for i:=0;i<len(tabs);i++{
+					bl := &util.Block{}
+					//添加标识:文本中有table
+					tabres := AnalyTableV2(t1[0], job.Category, bl.Title, bl.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
+					processTableResult(tabres, bl, job)                                                          //分析table解析结果
+					if bl.Title == "" && tabres.BlockTag != "" {
+						bl.Title = tabres.BlockTag
+					}
+					if len(bl.TableKV.Kv)>0{
+						bl.Text = tabs[i].Text()
+						job.Block = append(job.Block, bl)
+					}
 				}
 				//				for k, v := range bl.TableKV.Kv {
 				//					log.Println("bl.TableKV.Kv", k, v)
 				//				}
 			}
-			job.Block = append(job.Block, bl)
-
 			if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
 				//新加table未找到winnerorder, 从分块文本中找中标候选人
 				job.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
 			}
+			job.Block = append(job.Block, bl)
 		}
 	} else { //未分块,创建分块
 		bl := &util.Block{}
@@ -64,8 +71,19 @@ func AnalyStart(job *util.Job) {
 			job.HasTable = 1 //添加标识:文本中有table
 			newCon = TextAfterRemoveTable(con)
 			job.BlockPackage = FindPackageFromText(job.Title, newCon)
-			tabres := AnalyTableV2(tabs, job.Category, "", con, 1, job.SourceMid, job.RuleBlock)
-			processTableResult(tabres, bl, job)
+			for i:=0;i<len(tabs);i++{
+				bl := &util.Block{}
+				//添加标识:文本中有table
+				tabres := AnalyTableV2(tabs[i], job.Category, "", con, 1, job.SourceMid, job.RuleBlock)//解析表格入口 返回:汇总表格对象
+				processTableResult(tabres, bl, job)                                                          //分析table解析结果
+				if bl.Title == "" && tabres.BlockTag != "" {
+					bl.Title = tabres.BlockTag
+				}
+				if len(bl.TableKV.Kv) >0 {
+					bl.Text = tabs[i].Text()
+					job.Block = append(job.Block, bl)
+				}
+			}
 			//			for k, v := range bl.TableKV.Kv {
 			//				log.Println("bl.TableKV.Kv", k, v)
 			//			}

+ 12 - 12
src/jy/pretreated/analytable.go

@@ -635,7 +635,7 @@ func (table *Table) MergerToTableresult() {
 解析表格入口
 返回:汇总表格对象
 **/
-func AnalyTableV2(tabs []*goquery.Selection, toptype, blockTag, con string, itype int, _id interface{}, ruleBlock *u.RuleBlock) (tabres *TableResult) {
+func AnalyTableV2(tabs *goquery.Selection, toptype, blockTag, con string, itype int, _id interface{}, ruleBlock *u.RuleBlock) (tabres *TableResult) {
 	defer qutil.Catch()
 	//u.Debug(con)
 	if itype == 1 {
@@ -645,13 +645,13 @@ func AnalyTableV2(tabs []*goquery.Selection, toptype, blockTag, con string, ityp
 	//生成tableresult对象
 	tabres = NewTableResult(_id, toptype, blockTag, con, itype, ruleBlock)
 	//可以有多个table
-	for _, table := range tabs {
+	//for _, table := range tabs {
 		//隐藏表格跳过
-		if IsHide(table) {
-			continue
+		if IsHide(tabs) {
+			return
 		}
-		tabres.GoqueryTabs = append(tabres.GoqueryTabs, table)
-	}
+		tabres.GoqueryTabs = tabs
+	//}
 	//解析表格集
 	tabres.Analy()
 	return
@@ -664,18 +664,18 @@ func (ts *TableResult) Analy() {
 		IndexMap: map[int]string{},
 		MatchMap: map[string]map[string]bool{},
 	}
-	for _, table := range ts.GoqueryTabs {
-		tn := NewTable(ts.Html, ts, table)
+	//for _, table := range ts.GoqueryTabs {
+		tn := NewTable(ts.Html, ts, ts.GoqueryTabs)
 		//核心模块
-		ts := tn.Analy(contactFormat)
-		for _, tab := range ts {
+		tsw := tn.Analy(contactFormat)
+		for _, tab := range tsw {
 			if len(tab.TRs) > 0 {
 				tabs = append(tabs, tab)
 			}
 			//fmt.Println("tab.SortKV.Map", tab.SortKV.Keys)
 		}
 		//tn.SonTables = append(tn.SonTables, tn)
-	}
+	//}
 	//统一合并,考虑统一多表格是多包的情况---新增
 	if len(tabs) > 1 {
 		pns := map[string]string{}
@@ -789,7 +789,7 @@ func (table *Table) createTabe(trs *goquery.Selection) {
 			td := NewTD(selm, TR, table) //初始化td,kv处理,td中有table处理,td的方向
 			//num++
 			TR.AddTD(td)
-			if td.Val == "" && td.SonTableResult == nil { //删除一个tr,tr中所有td是空值的
+			if td.Val == "" && td.SonTableResult == nil && len(td.SortKV.Map) == 0{ //删除一个tr,tr中所有td是空值的
 				empty++
 				if tds.Size() == empty {
 					tdTextIsNull = true

+ 94 - 90
src/jy/pretreated/tablev2.go

@@ -22,13 +22,13 @@ type TableResult struct {
 	Itype          int         //1全文 2是块
 	BlockTag       string      //块标签
 	Html           string
-	Tabs           []*Table             //子表集合,子表中包含标准化kv或原始kv
-	GoqueryTabs    []*goquery.Selection //goquery对象
-	TableSize      int                  //子表的个数0,1,n
-	IsMultiPackage bool                 //是否有子包
-	PackageMap     *SortMap             //子包对象的sortmap,含标准化过的
-	SortKV         *SortMap             //全局KVmap值,标准化处理过的
-	SortKVWeight   map[string]int       //全局KVmap值,标准化处理过的
+	Tabs           []*Table           //子表集合,子表中包含标准化kv或原始kv
+	GoqueryTabs    *goquery.Selection //goquery对象
+	TableSize      int                //子表的个数0,1,n
+	IsMultiPackage bool               //是否有子包
+	PackageMap     *SortMap           //子包对象的sortmap,含标准化过的
+	SortKV         *SortMap           //全局KVmap值,标准化处理过的
+	SortKVWeight   map[string]int     //全局KVmap值,标准化处理过的
 	WinnerOrder    []map[string]interface{}
 	BrandData      [][]map[string]string //品牌抽取结果
 	HasKey         int                   //有key
@@ -46,7 +46,7 @@ func NewTableResult(Id interface{}, Toptype, BlockTag, con string, Itype int, ru
 		Itype:        Itype,
 		BlockTag:     BlockTag,
 		Tabs:         []*Table{},
-		GoqueryTabs:  []*goquery.Selection{},
+		GoqueryTabs:  &goquery.Selection{},
 		PackageMap:   NewSortMap(),
 		SortKV:       NewSortMap(),
 		SortKVWeight: map[string]int{},
@@ -124,44 +124,35 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 		//格式化正文
 		txt = TextAfterRemoveTable(td.Html)
 		td.tdHasTable(&bsontable, tr, table) //处理td中的table,块标签处理,子表解析集处理
-		//处理table外内容
-		var ub []*u.Block
-		ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock)
-		//看是否划块
-		if len(ub) > 0 {
-			colonKvWeight := map[string]int{}
-			spaceKvWeight := map[string]int{}
-			for _, bl := range ub {
-				//冒号kv
-				for bl_ck, bl_cv := range bl.ColonKV.Kv {
-					if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
-						colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
-						td.SortKV.AddKey(bl_ck, bl_cv)
-					}
-				}
-				//空格kv
-				for bl_sk, bl_sv := range bl.SpaceKV.Kv {
-					if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
-						spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
-						td.SortKV.AddKey(bl_sk, bl_sv)
-					}
-				}
-			}
-		}
 	} else {
 		txt = strings.TrimSpace(td.Goquery.Text())
 	}
 	text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
 	td.Val = text //值
 	td.Text = txt //原始串
-	//调用kv解析
-	cKV := GetKVAll(text, "", nil, 1)
-	for k, v := range cKV.Kv {
-		td.SortKV.AddKey(k, v)
-	}
-	sKV := SspacekvEntity.Entrance(text, "", nil)
-	for k, v := range sKV.Kv {
-		td.SortKV.AddKey(k, v)
+	//处理table外内容
+	var ub []*u.Block
+	ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock)
+	//看是否划块
+	if len(ub) > 0 {
+		colonKvWeight := map[string]int{}
+		spaceKvWeight := map[string]int{}
+		for _, bl := range ub {
+			//冒号kv
+			for bl_ck, bl_cv := range bl.ColonKV.Kv {
+				if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
+					colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
+					td.SortKV.AddKey(bl_ck, bl_cv)
+				}
+			}
+			//空格kv
+			for bl_sk, bl_sv := range bl.SpaceKV.Kv {
+				if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
+					spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
+					td.SortKV.AddKey(bl_sk, bl_sv)
+				}
+			}
+		}
 	}
 	//抽取不到走正则抽
 	proCode := projectcodeReg.FindString(text)
@@ -227,65 +218,78 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR, table *Table) {
 				}
 				stag = str
 			}
-			sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
-			td.BH = false
-			for k, v := range sonts.SortKV.Map {
-				if td.TR.Table.TableResult == nil {
-					td.TR.Table.TableResult = NewTableResult(sonts.Id, sonts.Toptype, sonts.BlockTag, sonts.Html, sonts.Itype, sonts.RuleBlock)
+			for _, tv := range tabs {
+				if IsHide(tv) {
+					continue
 				}
-				td.TR.Table.TableResult.SortKV.AddKey(k, v)
-				td.TR.Table.TableResult.SortKVWeight[k] = sonts.SortKVWeight[k]
-			}
-			//td.SonTableResult = sonts
-			//for _, k := range sonts.SortKV.Keys {
-			//u.Debug(k, sonts.SortKV.Map[k])
-			//				td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
-			//				td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
-			//}
-			//增加brand (子表)
-			//fmt.Println("sonsHasKey=============", sonts.HasKey)
-			//fmt.Println("sonsHasGoods========", sonts.HasGoods)
-			//fmt.Println("sonsHasBrand========", sonts.HasBrand)
-			if sonts.HasKey != 0 {
-				td.TR.Table.TableResult.HasKey = sonts.HasKey
-			}
-			if sonts.HasGoods != 0 {
-				td.TR.Table.TableResult.HasGoods = sonts.HasGoods
-			}
-			if sonts.HasBrand != 0 {
-				td.TR.Table.TableResult.HasBrand = sonts.HasBrand
-			}
-			if sonts.BrandData != nil && len(sonts.BrandData) > 0 { //子table
-				for _, v := range sonts.BrandData {
-					if len(v) > 0 {
-						td.TR.Table.TableResult.BrandData = append(td.TR.Table.TableResult.BrandData, v)
+				sonts := NewTableResult(ts.Id, ts.Toptype, stag, td.Html, 2, table.TableResult.RuleBlock)
+				sonts.GoqueryTabs = tv
+				sonts.Analy()
+
+				//sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
+				td.BH = false
+				for k, v := range sonts.SortKV.Map {
+					if td.TR.Table.TableResult == nil {
+						td.TR.Table.TableResult = NewTableResult(sonts.Id, sonts.Toptype, sonts.BlockTag, sonts.Html, sonts.Itype, sonts.RuleBlock)
 					}
+					if td.Val == "" {
+						td.SortKV.AddKey(k, v)
+					}
+					td.TR.Table.TableResult.SortKV.AddKey(k, v)
+					td.TR.Table.TableResult.SortKVWeight[k] = sonts.SortKVWeight[k]
 				}
-			}
-			if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
-				td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
-			}
-			if sonts.IsMultiPackage {
-				td.TR.Table.BPackage = true
-				tb1 := td.TR.Table.BlockPackage
-				for k, v := range sonts.PackageMap.Map {
-					v1 := v.(*u.BlockPackage)
-					if tb1.Map[k] == nil {
-						tb1.AddKey(k, v)
-					} else {
-						bp := tb1.Map[k].(*u.BlockPackage)
-						if bp != nil && v1.TableKV != nil && v1.TableKV.Kv != nil {
-							for k2, v2 := range v1.TableKV.Kv {
-								if bp.TableKV.Kv != nil && bp.TableKV.Kv[k2] == "" {
-									bp.TableKV.Kv[k2] = v2
-									bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
+				td.SonTableResult = sonts
+				//for _, k := range sonts.SortKV.Keys {
+				//u.Debug(k, sonts.SortKV.Map[k])
+				//				td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
+				//				td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
+				//}
+				//增加brand (子表)
+				//fmt.Println("sonsHasKey=============", sonts.HasKey)
+				//fmt.Println("sonsHasGoods========", sonts.HasGoods)
+				//fmt.Println("sonsHasBrand========", sonts.HasBrand)
+				if sonts.HasKey != 0 {
+					td.TR.Table.TableResult.HasKey = sonts.HasKey
+				}
+				if sonts.HasGoods != 0 {
+					td.TR.Table.TableResult.HasGoods = sonts.HasGoods
+				}
+				if sonts.HasBrand != 0 {
+					td.TR.Table.TableResult.HasBrand = sonts.HasBrand
+				}
+				if sonts.BrandData != nil && len(sonts.BrandData) > 0 { //子table
+					for _, v := range sonts.BrandData {
+						if len(v) > 0 {
+							td.TR.Table.TableResult.BrandData = append(td.TR.Table.TableResult.BrandData, v)
+						}
+					}
+				}
+				if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
+					td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
+				}
+				if sonts.IsMultiPackage {
+					td.TR.Table.BPackage = true
+					tb1 := td.TR.Table.BlockPackage
+					for k, v := range sonts.PackageMap.Map {
+						v1 := v.(*u.BlockPackage)
+						if tb1.Map[k] == nil {
+							tb1.AddKey(k, v)
+						} else {
+							bp := tb1.Map[k].(*u.BlockPackage)
+							if bp != nil && v1.TableKV != nil && v1.TableKV.Kv != nil {
+								for k2, v2 := range v1.TableKV.Kv {
+									if bp.TableKV.Kv != nil && bp.TableKV.Kv[k2] == "" {
+										bp.TableKV.Kv[k2] = v2
+										bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
+									}
 								}
 							}
 						}
 					}
+					//u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
 				}
-				//u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
 			}
+
 		}
 	}
 }