|
@@ -22,13 +22,13 @@ type TableResult struct {
|
|
|
Itype int //1全文 2是块
|
|
|
BlockTag string //块标签
|
|
|
Html string
|
|
|
- Tabs []*Table //子表集合,子表中包含标准化kv或原始kv
|
|
|
- GoqueryTabs []*goquery.Selection //goquery对象
|
|
|
- TableSize int //子表的个数0,1,n
|
|
|
- IsMultiPackage bool //是否有子包
|
|
|
- PackageMap *SortMap //子包对象的sortmap,含标准化过的
|
|
|
- SortKV *SortMap //全局KVmap值,标准化处理过的
|
|
|
- SortKVWeight map[string]int //全局KVmap值,标准化处理过的
|
|
|
+ Tabs []*Table //子表集合,子表中包含标准化kv或原始kv
|
|
|
+ GoqueryTabs *goquery.Selection //goquery对象
|
|
|
+ TableSize int //子表的个数0,1,n
|
|
|
+ IsMultiPackage bool //是否有子包
|
|
|
+ PackageMap *SortMap //子包对象的sortmap,含标准化过的
|
|
|
+ SortKV *SortMap //全局KVmap值,标准化处理过的
|
|
|
+ SortKVWeight map[string]int //全局KVmap值,标准化处理过的
|
|
|
WinnerOrder []map[string]interface{}
|
|
|
BrandData [][]map[string]string //品牌抽取结果
|
|
|
HasKey int //有key
|
|
@@ -46,7 +46,7 @@ func NewTableResult(Id interface{}, Toptype, BlockTag, con string, Itype int, ru
|
|
|
Itype: Itype,
|
|
|
BlockTag: BlockTag,
|
|
|
Tabs: []*Table{},
|
|
|
- GoqueryTabs: []*goquery.Selection{},
|
|
|
+ GoqueryTabs: &goquery.Selection{},
|
|
|
PackageMap: NewSortMap(),
|
|
|
SortKV: NewSortMap(),
|
|
|
SortKVWeight: map[string]int{},
|
|
@@ -124,44 +124,35 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
|
|
|
//格式化正文
|
|
|
txt = TextAfterRemoveTable(td.Html)
|
|
|
td.tdHasTable(&bsontable, tr, table) //处理td中的table,块标签处理,子表解析集处理
|
|
|
- //处理table外内容
|
|
|
- var ub []*u.Block
|
|
|
- ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock)
|
|
|
- //看是否划块
|
|
|
- if len(ub) > 0 {
|
|
|
- colonKvWeight := map[string]int{}
|
|
|
- spaceKvWeight := map[string]int{}
|
|
|
- for _, bl := range ub {
|
|
|
- //冒号kv
|
|
|
- for bl_ck, bl_cv := range bl.ColonKV.Kv {
|
|
|
- if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
|
|
|
- colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
|
|
|
- td.SortKV.AddKey(bl_ck, bl_cv)
|
|
|
- }
|
|
|
- }
|
|
|
- //空格kv
|
|
|
- for bl_sk, bl_sv := range bl.SpaceKV.Kv {
|
|
|
- if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
|
|
|
- spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
|
|
|
- td.SortKV.AddKey(bl_sk, bl_sv)
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
} else {
|
|
|
txt = strings.TrimSpace(td.Goquery.Text())
|
|
|
}
|
|
|
text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
|
|
|
td.Val = text //值
|
|
|
td.Text = txt //原始串
|
|
|
- //调用kv解析
|
|
|
- cKV := GetKVAll(text, "", nil, 1)
|
|
|
- for k, v := range cKV.Kv {
|
|
|
- td.SortKV.AddKey(k, v)
|
|
|
- }
|
|
|
- sKV := SspacekvEntity.Entrance(text, "", nil)
|
|
|
- for k, v := range sKV.Kv {
|
|
|
- td.SortKV.AddKey(k, v)
|
|
|
+ //处理table外内容
|
|
|
+ var ub []*u.Block
|
|
|
+ ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock)
|
|
|
+ //看是否划块
|
|
|
+ if len(ub) > 0 {
|
|
|
+ colonKvWeight := map[string]int{}
|
|
|
+ spaceKvWeight := map[string]int{}
|
|
|
+ for _, bl := range ub {
|
|
|
+ //冒号kv
|
|
|
+ for bl_ck, bl_cv := range bl.ColonKV.Kv {
|
|
|
+ if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
|
|
|
+ colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
|
|
|
+ td.SortKV.AddKey(bl_ck, bl_cv)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //空格kv
|
|
|
+ for bl_sk, bl_sv := range bl.SpaceKV.Kv {
|
|
|
+ if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
|
|
|
+ spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
|
|
|
+ td.SortKV.AddKey(bl_sk, bl_sv)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
//抽取不到走正则抽
|
|
|
proCode := projectcodeReg.FindString(text)
|
|
@@ -227,65 +218,78 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR, table *Table) {
|
|
|
}
|
|
|
stag = str
|
|
|
}
|
|
|
- sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
|
|
|
- td.BH = false
|
|
|
- for k, v := range sonts.SortKV.Map {
|
|
|
- if td.TR.Table.TableResult == nil {
|
|
|
- td.TR.Table.TableResult = NewTableResult(sonts.Id, sonts.Toptype, sonts.BlockTag, sonts.Html, sonts.Itype, sonts.RuleBlock)
|
|
|
+ for _, tv := range tabs {
|
|
|
+ if IsHide(tv) {
|
|
|
+ continue
|
|
|
}
|
|
|
- td.TR.Table.TableResult.SortKV.AddKey(k, v)
|
|
|
- td.TR.Table.TableResult.SortKVWeight[k] = sonts.SortKVWeight[k]
|
|
|
- }
|
|
|
- //td.SonTableResult = sonts
|
|
|
- //for _, k := range sonts.SortKV.Keys {
|
|
|
- //u.Debug(k, sonts.SortKV.Map[k])
|
|
|
- // td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
|
|
|
- // td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
|
|
|
- //}
|
|
|
- //增加brand (子表)
|
|
|
- //fmt.Println("sonsHasKey=============", sonts.HasKey)
|
|
|
- //fmt.Println("sonsHasGoods========", sonts.HasGoods)
|
|
|
- //fmt.Println("sonsHasBrand========", sonts.HasBrand)
|
|
|
- if sonts.HasKey != 0 {
|
|
|
- td.TR.Table.TableResult.HasKey = sonts.HasKey
|
|
|
- }
|
|
|
- if sonts.HasGoods != 0 {
|
|
|
- td.TR.Table.TableResult.HasGoods = sonts.HasGoods
|
|
|
- }
|
|
|
- if sonts.HasBrand != 0 {
|
|
|
- td.TR.Table.TableResult.HasBrand = sonts.HasBrand
|
|
|
- }
|
|
|
- if sonts.BrandData != nil && len(sonts.BrandData) > 0 { //子table
|
|
|
- for _, v := range sonts.BrandData {
|
|
|
- if len(v) > 0 {
|
|
|
- td.TR.Table.TableResult.BrandData = append(td.TR.Table.TableResult.BrandData, v)
|
|
|
+ sonts := NewTableResult(ts.Id, ts.Toptype, stag, td.Html, 2, table.TableResult.RuleBlock)
|
|
|
+ sonts.GoqueryTabs = tv
|
|
|
+ sonts.Analy()
|
|
|
+
|
|
|
+ //sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
|
|
|
+ td.BH = false
|
|
|
+ for k, v := range sonts.SortKV.Map {
|
|
|
+ if td.TR.Table.TableResult == nil {
|
|
|
+ td.TR.Table.TableResult = NewTableResult(sonts.Id, sonts.Toptype, sonts.BlockTag, sonts.Html, sonts.Itype, sonts.RuleBlock)
|
|
|
}
|
|
|
+ if td.Val == "" {
|
|
|
+ td.SortKV.AddKey(k, v)
|
|
|
+ }
|
|
|
+ td.TR.Table.TableResult.SortKV.AddKey(k, v)
|
|
|
+ td.TR.Table.TableResult.SortKVWeight[k] = sonts.SortKVWeight[k]
|
|
|
}
|
|
|
- }
|
|
|
- if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
|
|
|
- td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
|
|
|
- }
|
|
|
- if sonts.IsMultiPackage {
|
|
|
- td.TR.Table.BPackage = true
|
|
|
- tb1 := td.TR.Table.BlockPackage
|
|
|
- for k, v := range sonts.PackageMap.Map {
|
|
|
- v1 := v.(*u.BlockPackage)
|
|
|
- if tb1.Map[k] == nil {
|
|
|
- tb1.AddKey(k, v)
|
|
|
- } else {
|
|
|
- bp := tb1.Map[k].(*u.BlockPackage)
|
|
|
- if bp != nil && v1.TableKV != nil && v1.TableKV.Kv != nil {
|
|
|
- for k2, v2 := range v1.TableKV.Kv {
|
|
|
- if bp.TableKV.Kv != nil && bp.TableKV.Kv[k2] == "" {
|
|
|
- bp.TableKV.Kv[k2] = v2
|
|
|
- bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
|
|
|
+ td.SonTableResult = sonts
|
|
|
+ //for _, k := range sonts.SortKV.Keys {
|
|
|
+ //u.Debug(k, sonts.SortKV.Map[k])
|
|
|
+ // td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
|
|
|
+ // td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
|
|
|
+ //}
|
|
|
+ //增加brand (子表)
|
|
|
+ //fmt.Println("sonsHasKey=============", sonts.HasKey)
|
|
|
+ //fmt.Println("sonsHasGoods========", sonts.HasGoods)
|
|
|
+ //fmt.Println("sonsHasBrand========", sonts.HasBrand)
|
|
|
+ if sonts.HasKey != 0 {
|
|
|
+ td.TR.Table.TableResult.HasKey = sonts.HasKey
|
|
|
+ }
|
|
|
+ if sonts.HasGoods != 0 {
|
|
|
+ td.TR.Table.TableResult.HasGoods = sonts.HasGoods
|
|
|
+ }
|
|
|
+ if sonts.HasBrand != 0 {
|
|
|
+ td.TR.Table.TableResult.HasBrand = sonts.HasBrand
|
|
|
+ }
|
|
|
+ if sonts.BrandData != nil && len(sonts.BrandData) > 0 { //子table
|
|
|
+ for _, v := range sonts.BrandData {
|
|
|
+ if len(v) > 0 {
|
|
|
+ td.TR.Table.TableResult.BrandData = append(td.TR.Table.TableResult.BrandData, v)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
|
|
|
+ td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
|
|
|
+ }
|
|
|
+ if sonts.IsMultiPackage {
|
|
|
+ td.TR.Table.BPackage = true
|
|
|
+ tb1 := td.TR.Table.BlockPackage
|
|
|
+ for k, v := range sonts.PackageMap.Map {
|
|
|
+ v1 := v.(*u.BlockPackage)
|
|
|
+ if tb1.Map[k] == nil {
|
|
|
+ tb1.AddKey(k, v)
|
|
|
+ } else {
|
|
|
+ bp := tb1.Map[k].(*u.BlockPackage)
|
|
|
+ if bp != nil && v1.TableKV != nil && v1.TableKV.Kv != nil {
|
|
|
+ for k2, v2 := range v1.TableKV.Kv {
|
|
|
+ if bp.TableKV.Kv != nil && bp.TableKV.Kv[k2] == "" {
|
|
|
+ bp.TableKV.Kv[k2] = v2
|
|
|
+ bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+ //u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
|
|
|
}
|
|
|
- //u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
|
|
|
}
|
|
|
+
|
|
|
}
|
|
|
}
|
|
|
}
|