|
@@ -22,13 +22,13 @@ type TableResult struct {
|
|
|
Itype int //1全文 2是块
|
|
|
BlockTag string //块标签
|
|
|
Html string
|
|
|
- Tabs []*Table //子表集合,子表中包含标准化kv或原始kv
|
|
|
- GoqueryTabs []*goquery.Selection //goquery对象
|
|
|
- TableSize int //子表的个数0,1,n
|
|
|
- IsMultiPackage bool //是否有子包
|
|
|
- PackageMap *SortMap //子包对象的sortmap,含标准化过的
|
|
|
- SortKV *SortMap //全局KVmap值,标准化处理过的
|
|
|
- SortKVWeight map[string]int //全局KVmap值,标准化处理过的
|
|
|
+ Tabs []*Table //子表集合,子表中包含标准化kv或原始kv
|
|
|
+ GoqueryTabs *goquery.Selection //goquery对象
|
|
|
+ TableSize int //子表的个数0,1,n
|
|
|
+ IsMultiPackage bool //是否有子包
|
|
|
+ PackageMap *SortMap //子包对象的sortmap,含标准化过的
|
|
|
+ SortKV *SortMap //全局KVmap值,标准化处理过的
|
|
|
+ SortKVWeight map[string]int //全局KVmap值,标准化处理过的
|
|
|
WinnerOrder []map[string]interface{}
|
|
|
BrandData [][]map[string]string //品牌抽取结果
|
|
|
HasKey int //有key
|
|
@@ -46,7 +46,7 @@ func NewTableResult(Id interface{}, Toptype, BlockTag, con string, Itype int, ru
|
|
|
Itype: Itype,
|
|
|
BlockTag: BlockTag,
|
|
|
Tabs: []*Table{},
|
|
|
- GoqueryTabs: []*goquery.Selection{},
|
|
|
+ GoqueryTabs: &goquery.Selection{},
|
|
|
PackageMap: NewSortMap(),
|
|
|
SortKV: NewSortMap(),
|
|
|
SortKVWeight: map[string]int{},
|
|
@@ -123,64 +123,55 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
|
|
|
//qutil.Debug("有子表格")
|
|
|
//格式化正文
|
|
|
txt = TextAfterRemoveTable(td.Html)
|
|
|
- td.tdHasTable(&bsontable, tr, table) //处理td中的table,块标签处理,子表解析集处理
|
|
|
- //处理table外内容
|
|
|
- var ub []*u.Block
|
|
|
- ub, _ = DivideBlock("",txt, 2, table.TableResult.RuleBlock)
|
|
|
- //看是否划块
|
|
|
- if len(ub) > 0 {
|
|
|
- colonKvWeight := map[string]int{}
|
|
|
- spaceKvWeight := map[string]int{}
|
|
|
- for _, bl := range ub {
|
|
|
- //冒号kv
|
|
|
- for bl_ck, bl_cv := range bl.ColonKV.Kv {
|
|
|
- if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
|
|
|
- colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
|
|
|
- td.SortKV.AddKey(bl_ck, bl_cv)
|
|
|
- }
|
|
|
- }
|
|
|
- //空格kv
|
|
|
- for bl_sk, bl_sv := range bl.SpaceKV.Kv {
|
|
|
- if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
|
|
|
- spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
|
|
|
- td.SortKV.AddKey(bl_sk, bl_sv)
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
+ td.tdHasTable(&bsontable, tr) //处理td中的table,块标签处理,子表解析集处理
|
|
|
} else {
|
|
|
txt = strings.TrimSpace(td.Goquery.Text())
|
|
|
}
|
|
|
text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
|
|
|
td.Val = text //值
|
|
|
td.Text = txt //原始串
|
|
|
- //调用kv解析
|
|
|
- cKV := GetKVAll(text, "", nil, 1)
|
|
|
- for k,v :=range cKV.Kv{
|
|
|
- td.SortKV.AddKey(k,v)
|
|
|
- }
|
|
|
- sKV := SspacekvEntity.Entrance(text, "", nil)
|
|
|
- for k,v :=range sKV.Kv{
|
|
|
- td.SortKV.AddKey(k,v)
|
|
|
+ //处理table外内容
|
|
|
+ var ub []*u.Block
|
|
|
+ ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock)
|
|
|
+ //看是否划块
|
|
|
+ if len(ub) > 0 {
|
|
|
+ colonKvWeight := map[string]int{}
|
|
|
+ spaceKvWeight := map[string]int{}
|
|
|
+ for _, bl := range ub {
|
|
|
+ //冒号kv
|
|
|
+ for bl_ck, bl_cv := range bl.ColonKV.Kv {
|
|
|
+ if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
|
|
|
+ colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
|
|
|
+ td.SortKV.AddKey(bl_ck, bl_cv)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //空格kv
|
|
|
+ for bl_sk, bl_sv := range bl.SpaceKV.Kv {
|
|
|
+ if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
|
|
|
+ spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
|
|
|
+ td.SortKV.AddKey(bl_sk, bl_sv)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
//抽取不到走正则抽
|
|
|
proCode := projectcodeReg.FindString(text)
|
|
|
if proCode != "" {
|
|
|
ckv := GetKVAll(proCode, "", nil, 1)
|
|
|
- for k,v :=range ckv.Kv{
|
|
|
- td.SortKV.AddKey(k,v)
|
|
|
+ for k, v := range ckv.Kv {
|
|
|
+ td.SortKV.AddKey(k, v)
|
|
|
}
|
|
|
- }else if proCode = projectcodeReg2.FindString(text);proCode !=""{
|
|
|
+ } else if proCode = projectcodeReg2.FindString(text); proCode != "" {
|
|
|
ckv := GetKVAll(proCode, "", nil, 1)
|
|
|
- for k,v :=range ckv.Kv{
|
|
|
- td.SortKV.AddKey(k,v)
|
|
|
+ for k, v := range ckv.Kv {
|
|
|
+ td.SortKV.AddKey(k, v)
|
|
|
}
|
|
|
}
|
|
|
- if proCode = jsonReg.FindString(text);proCode != ""{
|
|
|
+ if proCode = jsonReg.FindString(text); proCode != "" {
|
|
|
jsonMap := make(map[string]string)
|
|
|
- json.Unmarshal([]byte(proCode),&jsonMap)
|
|
|
- for k,v := range jsonMap{
|
|
|
- td.SortKV.AddKey(k,v)
|
|
|
+ json.Unmarshal([]byte(proCode), &jsonMap)
|
|
|
+ for k, v := range jsonMap {
|
|
|
+ td.SortKV.AddKey(k, v)
|
|
|
}
|
|
|
}
|
|
|
//对td单元格值判断是否是表头和根据td内容长度进行分块处理
|
|
@@ -201,7 +192,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
|
|
|
}
|
|
|
|
|
|
//处理td中的table,块标签处理,子表解析集处理
|
|
|
-func (td *TD) tdHasTable(bsontable *bool, tr *TR, table *Table) {
|
|
|
+func (td *TD) tdHasTable(bsontable *bool, tr *TR) {
|
|
|
ts := td.TR.Table.TableResult
|
|
|
tabs, _ := ComputeConRatio(td.Html, 2) //计算表格占比
|
|
|
if len(tabs) > 0 {
|
|
@@ -227,64 +218,75 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR, table *Table) {
|
|
|
}
|
|
|
stag = str
|
|
|
}
|
|
|
- sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
|
|
|
- td.BH = false
|
|
|
- for k,v := range sonts.SortKV.Map{
|
|
|
- if td.TR.Table.TableResult == nil{
|
|
|
- td.TR.Table.TableResult = NewTableResult(sonts.Id,sonts.Toptype,sonts.BlockTag,sonts.Html,sonts.Itype,sonts.RuleBlock)
|
|
|
+ for _, tv := range tabs {
|
|
|
+ if IsHide(tv) {
|
|
|
+ continue
|
|
|
}
|
|
|
- td.TR.Table.TableResult.SortKV.AddKey(k,v)
|
|
|
- }
|
|
|
- //td.SonTableResult = sonts
|
|
|
- //for _, k := range sonts.SortKV.Keys {
|
|
|
- //u.Debug(k, sonts.SortKV.Map[k])
|
|
|
- // td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
|
|
|
- // td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
|
|
|
- //}
|
|
|
- //增加brand (子表)
|
|
|
- //fmt.Println("sonsHasKey=============", sonts.HasKey)
|
|
|
- //fmt.Println("sonsHasGoods========", sonts.HasGoods)
|
|
|
- //fmt.Println("sonsHasBrand========", sonts.HasBrand)
|
|
|
- if sonts.HasKey != 0 {
|
|
|
- td.TR.Table.TableResult.HasKey = sonts.HasKey
|
|
|
- }
|
|
|
- if sonts.HasGoods != 0 {
|
|
|
- td.TR.Table.TableResult.HasGoods = sonts.HasGoods
|
|
|
- }
|
|
|
- if sonts.HasBrand != 0 {
|
|
|
- td.TR.Table.TableResult.HasBrand = sonts.HasBrand
|
|
|
- }
|
|
|
- if sonts.BrandData != nil && len(sonts.BrandData) > 0 { //子table
|
|
|
- for _, v := range sonts.BrandData {
|
|
|
- if len(v) > 0 {
|
|
|
- td.TR.Table.TableResult.BrandData = append(td.TR.Table.TableResult.BrandData, v)
|
|
|
+ sonts := NewTableResult(ts.Id, ts.Toptype, stag, td.Html, 2, td.TR.Table.TableResult.RuleBlock)
|
|
|
+ sonts.GoqueryTabs = tv
|
|
|
+ sonts.Analy()
|
|
|
+
|
|
|
+ //sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
|
|
|
+ td.BH = false
|
|
|
+ for k, v := range sonts.SortKV.Map {
|
|
|
+ if td.TR.Table.TableResult == nil {
|
|
|
+ td.TR.Table.TableResult = NewTableResult(sonts.Id, sonts.Toptype, sonts.BlockTag, sonts.Html, sonts.Itype, sonts.RuleBlock)
|
|
|
}
|
|
|
+ td.TR.Table.TableResult.SortKV.AddKey(k, v)
|
|
|
+ td.TR.Table.TableResult.SortKVWeight[k] = sonts.SortKVWeight[k]
|
|
|
}
|
|
|
- }
|
|
|
- if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
|
|
|
- td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
|
|
|
- }
|
|
|
- if sonts.IsMultiPackage {
|
|
|
- td.TR.Table.BPackage = true
|
|
|
- tb1 := td.TR.Table.BlockPackage
|
|
|
- for k, v := range sonts.PackageMap.Map {
|
|
|
- v1 := v.(*u.BlockPackage)
|
|
|
- if tb1.Map[k] == nil {
|
|
|
- tb1.AddKey(k, v)
|
|
|
- } else {
|
|
|
- bp := tb1.Map[k].(*u.BlockPackage)
|
|
|
- if bp != nil && v1.TableKV != nil && v1.TableKV.Kv != nil {
|
|
|
- for k2, v2 := range v1.TableKV.Kv {
|
|
|
- if bp.TableKV.Kv != nil && bp.TableKV.Kv[k2] == "" {
|
|
|
- bp.TableKV.Kv[k2] = v2
|
|
|
- bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
|
|
|
+ td.SonTableResult = sonts
|
|
|
+ //for _, k := range sonts.SortKV.Keys {
|
|
|
+ //u.Debug(k, sonts.SortKV.Map[k])
|
|
|
+ // td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
|
|
|
+ // td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
|
|
|
+ //}
|
|
|
+ //增加brand (子表)
|
|
|
+ //fmt.Println("sonsHasKey=============", sonts.HasKey)
|
|
|
+ //fmt.Println("sonsHasGoods========", sonts.HasGoods)
|
|
|
+ //fmt.Println("sonsHasBrand========", sonts.HasBrand)
|
|
|
+ if sonts.HasKey != 0 {
|
|
|
+ td.TR.Table.TableResult.HasKey = sonts.HasKey
|
|
|
+ }
|
|
|
+ if sonts.HasGoods != 0 {
|
|
|
+ td.TR.Table.TableResult.HasGoods = sonts.HasGoods
|
|
|
+ }
|
|
|
+ if sonts.HasBrand != 0 {
|
|
|
+ td.TR.Table.TableResult.HasBrand = sonts.HasBrand
|
|
|
+ }
|
|
|
+ if sonts.BrandData != nil && len(sonts.BrandData) > 0 { //子table
|
|
|
+ for _, v := range sonts.BrandData {
|
|
|
+ if len(v) > 0 {
|
|
|
+ td.TR.Table.TableResult.BrandData = append(td.TR.Table.TableResult.BrandData, v)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
|
|
|
+ td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
|
|
|
+ }
|
|
|
+ if sonts.IsMultiPackage {
|
|
|
+ td.TR.Table.BPackage = true
|
|
|
+ tb1 := td.TR.Table.BlockPackage
|
|
|
+ for k, v := range sonts.PackageMap.Map {
|
|
|
+ v1 := v.(*u.BlockPackage)
|
|
|
+ if tb1.Map[k] == nil {
|
|
|
+ tb1.AddKey(k, v)
|
|
|
+ } else {
|
|
|
+ bp := tb1.Map[k].(*u.BlockPackage)
|
|
|
+ if bp != nil && v1.TableKV != nil && v1.TableKV.Kv != nil {
|
|
|
+ for k2, v2 := range v1.TableKV.Kv {
|
|
|
+ if bp.TableKV.Kv != nil && bp.TableKV.Kv[k2] == "" {
|
|
|
+ bp.TableKV.Kv[k2] = v2
|
|
|
+ bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+ //u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
|
|
|
}
|
|
|
- //u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
|
|
|
}
|
|
|
+
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -436,8 +438,8 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable bool) {
|
|
|
*/
|
|
|
|
|
|
fSortKV := FindKv(td.Val, "", 2)
|
|
|
- for k,v := range fSortKV.Map{
|
|
|
- td.SortKV.AddKey(k,v)
|
|
|
+ for k, v := range fSortKV.Map {
|
|
|
+ td.SortKV.AddKey(k, v)
|
|
|
}
|
|
|
// td.LeftNode.Val
|
|
|
// for _, vvv := range *td.TR {
|
|
@@ -639,18 +641,20 @@ func (t *Table) InsertTR(tr *TR) {
|
|
|
|
|
|
// SortMap is a map that supports ordering: besides the Map payload it carries a Keys slice and an Index map (presumably key order and key -> position in Keys; confirm against AddKey, which is outside this hunk).
|
|
|
type SortMap struct {
|
|
|
- Index map[string]int
|
|
|
- Keys []string
|
|
|
- Map map[string]interface{}
|
|
|
- Lock sync.Mutex
|
|
|
+ Index map[string]int
|
|
|
+ Keys []string
|
|
|
+ Map map[string]interface{}
|
|
|
+ Lock sync.Mutex
|
|
|
+ NotTagKey map[string]bool // field added by this change; NOTE(review): its meaning is not visible in these hunks
|
|
|
}
|
|
|
|
|
|
// NewSortMap returns a new *SortMap whose Index, Keys, Map and (after this change) NotTagKey fields are initialized to empty, non-nil values; Lock is left at its zero value.
|
|
|
func NewSortMap() *SortMap {
|
|
|
return &SortMap{
|
|
|
- Index: map[string]int{},
|
|
|
- Keys: []string{},
|
|
|
- Map: map[string]interface{}{},
|
|
|
+ Index: map[string]int{},
|
|
|
+ Keys: []string{},
|
|
|
+ Map: map[string]interface{}{},
|
|
|
+ NotTagKey: map[string]bool{},
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -881,11 +885,13 @@ func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio
|
|
|
**/
|
|
|
return
|
|
|
}
|
|
|
+
|
|
|
// HtmlToText parses con as an HTML document with goquery and returns the document's text (plain text). NOTE(review): the parse error is discarded, so doc2.Text() relies on doc2 being non-nil — confirm that is acceptable for all callers.
|
|
|
func HtmlToText(con string) string {
|
|
|
doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
|
|
|
return doc2.Text()
|
|
|
}
|
|
|
+
|
|
|
//取出排除表格之外的文本
|
|
|
func TextAfterRemoveTable(con string) string {
|
|
|
doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
|