|
@@ -17,23 +17,24 @@ import (
|
|
|
|
|
|
//所有中标候选人只取第一个
|
|
//所有中标候选人只取第一个
|
|
type TableResult struct {
|
|
type TableResult struct {
|
|
- Id interface{} //信息id
|
|
|
|
- Toptype string //信息类型
|
|
|
|
- Itype int //1全文 2是块
|
|
|
|
- BlockTag string //块标签
|
|
|
|
- Html string
|
|
|
|
- Tabs []*Table //子表集合,子表中包含标准化kv或原始kv
|
|
|
|
- GoqueryTabs *goquery.Selection //goquery对象
|
|
|
|
- TableSize int //子表的个数0,1,n
|
|
|
|
- IsMultiPackage bool //是否有子包
|
|
|
|
- PackageMap *SortMap //子包对象的sortmap,含标准化过的
|
|
|
|
- KvTags map[string][]*u.Tag //全局KVmap值,标准化处理过的
|
|
|
|
- WinnerOrder []map[string]interface{}
|
|
|
|
- BrandData [][]map[string]string //品牌抽取结果
|
|
|
|
- HasKey int //有key
|
|
|
|
- HasBrand int //有品牌
|
|
|
|
- HasGoods int //有商品
|
|
|
|
- RuleBlock *u.RuleBlock
|
|
|
|
|
|
+ Id interface{} //信息id
|
|
|
|
+ Toptype string //信息类型
|
|
|
|
+ Itype int //1全文 2是块
|
|
|
|
+ BlockTag string //块标签
|
|
|
|
+ Html string
|
|
|
|
+ Tabs []*Table //子表集合,子表中包含标准化kv或原始kv
|
|
|
|
+ GoqueryTabs *goquery.Selection //goquery对象
|
|
|
|
+ TableSize int //子表的个数0,1,n
|
|
|
|
+ IsMultiPackage bool //是否有子包
|
|
|
|
+ PackageMap *SortMap //子包对象的sortmap,含标准化过的
|
|
|
|
+ KvTags map[string][]*u.Tag //全局KVmap值,标准化处理过的
|
|
|
|
+ WinnerOrder []map[string]interface{}
|
|
|
|
+ BrandData [][]map[string]string //品牌抽取结果
|
|
|
|
+ PriceNumberData [][]map[string]interface{} //单价个数抽取结果
|
|
|
|
+ HasKey int //有key
|
|
|
|
+ HasBrand int //有品牌
|
|
|
|
+ HasGoods int //有商品
|
|
|
|
+ RuleBlock *u.RuleBlock
|
|
}
|
|
}
|
|
|
|
|
|
//快速创建TableResult对象
|
|
//快速创建TableResult对象
|
|
@@ -87,7 +88,7 @@ var submatchreg = regexp.MustCompile(`((?:[一二三四五六七八九十0-10]+[
|
|
var BHKey = regexp.MustCompile(`^[^,,;:。、.]{2,8}.{0,3}[::].+$`)
|
|
var BHKey = regexp.MustCompile(`^[^,,;:。、.]{2,8}.{0,3}[::].+$`)
|
|
var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿元]+)")
|
|
var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿元]+)")
|
|
|
|
|
|
-func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite string) *TD {
|
|
|
|
|
|
+func NewTD(Goquery *goquery.Selection, tr *TR, table *Table, isSite bool, codeSite string) *TD {
|
|
defer qutil.Catch()
|
|
defer qutil.Catch()
|
|
td := &TD{
|
|
td := &TD{
|
|
ArrVal: []string{},
|
|
ArrVal: []string{},
|
|
@@ -121,7 +122,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite
|
|
//qutil.Debug("有子表格")
|
|
//qutil.Debug("有子表格")
|
|
//格式化正文
|
|
//格式化正文
|
|
txt = TextAfterRemoveTable(td.Html)
|
|
txt = TextAfterRemoveTable(td.Html)
|
|
- td.tdHasTable(&bsontable, tr,isSite,codeSite) //处理td中的table,块标签处理,子表解析集处理
|
|
|
|
|
|
+ td.tdHasTable(&bsontable, tr, isSite, codeSite) //处理td中的table,块标签处理,子表解析集处理
|
|
} else {
|
|
} else {
|
|
txt = strings.TrimSpace(td.Goquery.Text())
|
|
txt = strings.TrimSpace(td.Goquery.Text())
|
|
}
|
|
}
|
|
@@ -130,7 +131,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite
|
|
td.Text = txt //原始串
|
|
td.Text = txt //原始串
|
|
//处理table外内容
|
|
//处理table外内容
|
|
var ub []*u.Block
|
|
var ub []*u.Block
|
|
- ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock,isSite,codeSite)
|
|
|
|
|
|
+ ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock, isSite, codeSite)
|
|
//看是否划块
|
|
//看是否划块
|
|
if len(ub) > 0 {
|
|
if len(ub) > 0 {
|
|
for _, bl := range ub {
|
|
for _, bl := range ub {
|
|
@@ -175,7 +176,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite
|
|
}
|
|
}
|
|
}
|
|
}
|
|
//对td单元格值判断是否是表头和根据td内容长度进行分块处理
|
|
//对td单元格值判断是否是表头和根据td内容长度进行分块处理
|
|
- td.tdIsHb(tr, table, bsontable,isSite,codeSite)
|
|
|
|
|
|
+ td.tdIsHb(tr, table, bsontable, isSite, codeSite)
|
|
bhead := false
|
|
bhead := false
|
|
if td.TR.RowPos == 0 { //第一行
|
|
if td.TR.RowPos == 0 { //第一行
|
|
if td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头
|
|
if td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头
|
|
@@ -192,7 +193,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite
|
|
}
|
|
}
|
|
|
|
|
|
//处理td中的table,块标签处理,子表解析集处理
|
|
//处理td中的table,块标签处理,子表解析集处理
|
|
-func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
|
|
|
|
|
|
+func (td *TD) tdHasTable(bsontable *bool, tr *TR, isSite bool, codeSite string) {
|
|
ts := td.TR.Table.TableResult
|
|
ts := td.TR.Table.TableResult
|
|
tabs, _ := ComputeConRatio(td.Html, 2) //计算表格占比
|
|
tabs, _ := ComputeConRatio(td.Html, 2) //计算表格占比
|
|
if len(tabs) > 0 {
|
|
if len(tabs) > 0 {
|
|
@@ -219,7 +220,7 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
|
|
stag = str
|
|
stag = str
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- if strings.Contains(stag,"开标记录"){
|
|
|
|
|
|
+ if strings.Contains(stag, "开标记录") {
|
|
return
|
|
return
|
|
}
|
|
}
|
|
for _, tv := range tabs {
|
|
for _, tv := range tabs {
|
|
@@ -228,7 +229,7 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
|
|
}
|
|
}
|
|
sonts := NewTableResult(ts.Id, ts.Toptype, stag, td.Html, 2, td.TR.Table.TableResult.RuleBlock)
|
|
sonts := NewTableResult(ts.Id, ts.Toptype, stag, td.Html, 2, td.TR.Table.TableResult.RuleBlock)
|
|
sonts.GoqueryTabs = tv
|
|
sonts.GoqueryTabs = tv
|
|
- sonts.Analy(isSite,codeSite)
|
|
|
|
|
|
+ sonts.Analy(isSite, codeSite)
|
|
|
|
|
|
//sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
|
|
//sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
|
|
td.BH = false
|
|
td.BH = false
|
|
@@ -262,6 +263,13 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+ if sonts.PriceNumberData != nil && len(sonts.PriceNumberData) > 0 {
|
|
|
|
+ for _, v := range sonts.PriceNumberData {
|
|
|
|
+ if len(v) > 0 {
|
|
|
|
+ td.TR.Table.TableResult.PriceNumberData = append(td.TR.Table.TableResult.PriceNumberData, v)
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
|
|
if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
|
|
td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
|
|
td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
|
|
}
|
|
}
|
|
@@ -303,7 +311,7 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
|
|
}
|
|
}
|
|
|
|
|
|
//对td单元格值判断是否是表头和根据td内容长度进行分块处理
|
|
//对td单元格值判断是否是表头和根据td内容长度进行分块处理
|
|
-func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string) {
|
|
|
|
|
|
+func (td *TD) tdIsHb(tr *TR, table *Table, bsontable, isSite bool, codeSite string) {
|
|
lenval := len([]rune(td.Val)) //经过处理的td内容长度
|
|
lenval := len([]rune(td.Val)) //经过处理的td内容长度
|
|
//if lentxt > 9 {
|
|
//if lentxt > 9 {
|
|
//td.KV = GetKVAll(txt, "")
|
|
//td.KV = GetKVAll(txt, "")
|
|
@@ -311,7 +319,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
|
|
//经过处理的td内容长度大于50,划块,分包
|
|
//经过处理的td内容长度大于50,划块,分包
|
|
if lenval > 50 { //看是否划块
|
|
if lenval > 50 { //看是否划块
|
|
//u.Debug(txt)
|
|
//u.Debug(txt)
|
|
- ub, _ = DivideBlock("", td.Text, 2, table.TableResult.RuleBlock,isSite,codeSite) //对td的原始值
|
|
|
|
|
|
+ ub, _ = DivideBlock("", td.Text, 2, table.TableResult.RuleBlock, isSite, codeSite) //对td的原始值
|
|
//看是否划块
|
|
//看是否划块
|
|
if len(ub) > 0 {
|
|
if len(ub) > 0 {
|
|
for _, bl := range ub {
|
|
for _, bl := range ub {
|
|
@@ -344,10 +352,10 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
|
|
}
|
|
}
|
|
if isFindPkg {
|
|
if isFindPkg {
|
|
if len(ub) > 0 {
|
|
if len(ub) > 0 {
|
|
- blockPackage = FindPackageFromBlocks(&ub,isSite,codeSite) //从块里面找分包
|
|
|
|
|
|
+ blockPackage = FindPackageFromBlocks(&ub, isSite, codeSite) //从块里面找分包
|
|
} else {
|
|
} else {
|
|
- if !excludeKey2.MatchString(td.Val){
|
|
|
|
- blockPackage = FindPackageFromText("", td.Val,isSite,codeSite) //从正文里面找分包
|
|
|
|
|
|
+ if !excludeKey2.MatchString(td.Val) {
|
|
|
|
+ blockPackage = FindPackageFromText("", td.Val, isSite, codeSite) //从正文里面找分包
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -389,7 +397,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
|
|
td.SortKV.AddKey(strings.TrimSpace(td.Text[:tagindex]), strings.TrimSpace(td.Text[tagindex:])) //存放kv值
|
|
td.SortKV.AddKey(strings.TrimSpace(td.Text[:tagindex]), strings.TrimSpace(td.Text[tagindex:])) //存放kv值
|
|
td.BH = true
|
|
td.BH = true
|
|
}
|
|
}
|
|
- _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3,isSite,codeSite) //td冒号kv
|
|
|
|
|
|
+ _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3, isSite, codeSite) //td冒号kv
|
|
for k, v := range resm {
|
|
for k, v := range resm {
|
|
if k != "" && v != "" {
|
|
if k != "" && v != "" {
|
|
td.SortKV.AddKey(k, v) //存放kv值
|
|
td.SortKV.AddKey(k, v) //存放kv值
|
|
@@ -410,14 +418,14 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
|
|
if lenval > 15 && !strings.Contains(txt, "采购代理机构名称、地址和联系方式") {
|
|
if lenval > 15 && !strings.Contains(txt, "采购代理机构名称、地址和联系方式") {
|
|
btw = false
|
|
btw = false
|
|
}
|
|
}
|
|
- if strings.Contains(td.Val, "个项目") ||strings.Contains(td.Val, "奥图码"){
|
|
|
|
|
|
+ if strings.Contains(td.Val, "个项目") || strings.Contains(td.Val, "奥图码") {
|
|
must = false
|
|
must = false
|
|
btw = false
|
|
btw = false
|
|
}
|
|
}
|
|
td.Valtype = repl
|
|
td.Valtype = repl
|
|
td.MustBH = must
|
|
td.MustBH = must
|
|
td.BH = btw
|
|
td.BH = btw
|
|
- if strings.Contains(txt,"年估算额年(万元)"){
|
|
|
|
|
|
+ if strings.Contains(txt, "年估算额年(万元)") {
|
|
td.MustBH = true
|
|
td.MustBH = true
|
|
td.BH = true
|
|
td.BH = true
|
|
}
|
|
}
|
|
@@ -450,7 +458,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
|
|
if len(td.TR.TDs) > 0 {
|
|
if len(td.TR.TDs) > 0 {
|
|
kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
|
|
kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
|
|
}
|
|
}
|
|
- _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 2,isSite,codeSite) //获取冒号kv入口
|
|
|
|
|
|
+ _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 2, isSite, codeSite) //获取冒号kv入口
|
|
for k, v := range resm {
|
|
for k, v := range resm {
|
|
td.SortKV.AddKey(k, v)
|
|
td.SortKV.AddKey(k, v)
|
|
}
|
|
}
|
|
@@ -496,7 +504,7 @@ func (tr *TR) AddTD(td *TD) {
|
|
tr.TDs[len(tr.TDs)-1].RightNode = td
|
|
tr.TDs[len(tr.TDs)-1].RightNode = td
|
|
}
|
|
}
|
|
**/
|
|
**/
|
|
- if tr==nil|| tr.TDs == nil{
|
|
|
|
|
|
+ if tr == nil || tr.TDs == nil {
|
|
return
|
|
return
|
|
}
|
|
}
|
|
td.ColPos = len(tr.TDs)
|
|
td.ColPos = len(tr.TDs)
|
|
@@ -592,12 +600,13 @@ type Table struct {
|
|
StartAndEndRation map[string]*TDRationScope //同行或同列的概率,截断的单独起算
|
|
StartAndEndRation map[string]*TDRationScope //同行或同列的概率,截断的单独起算
|
|
StartAndEndRationKSort *SortMap
|
|
StartAndEndRationKSort *SortMap
|
|
WinnerOrder []map[string]interface{}
|
|
WinnerOrder []map[string]interface{}
|
|
- BSplit bool //是否是有一个表拆分成的多个表
|
|
|
|
- BHeader bool //拆分表是否有表头
|
|
|
|
- BrandData [][]map[string]string //品牌抽取结果
|
|
|
|
- HasKey int //有key
|
|
|
|
- HasBrand int //有品牌
|
|
|
|
- HasGoods int //有商品
|
|
|
|
|
|
+ BSplit bool //是否是有一个表拆分成的多个表
|
|
|
|
+ BHeader bool //拆分表是否有表头
|
|
|
|
+ BrandData [][]map[string]string //品牌抽取结果
|
|
|
|
+ HasKey int //有key
|
|
|
|
+ HasBrand int //有品牌
|
|
|
|
+ HasGoods int //有商品
|
|
|
|
+ PriceNumberData [][]map[string]interface{} //单价和个数抽取结果
|
|
}
|
|
}
|
|
|
|
|
|
func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Table {
|
|
func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Table {
|
|
@@ -851,7 +860,8 @@ func CheckHeader(txt string) (res, must bool, stype, reg, repl string) {
|
|
con 文本
|
|
con 文本
|
|
strtype 1全文 2块文本
|
|
strtype 1全文 2块文本
|
|
**/
|
|
**/
|
|
-var hisReg =regexp.MustCompile("类似业绩|历史业绩")
|
|
|
|
|
|
+var hisReg = regexp.MustCompile("类似业绩|历史业绩")
|
|
|
|
+
|
|
func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) {
|
|
func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) {
|
|
defer qutil.Catch()
|
|
defer qutil.Catch()
|
|
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
|
|
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
|
|
@@ -869,7 +879,7 @@ func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if !b {
|
|
if !b {
|
|
- if hisReg.MatchString(tmpt.First().Text()){
|
|
|
|
|
|
+ if hisReg.MatchString(tmpt.First().Text()) {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
tabs = append(tabs, tmpt)
|
|
tabs = append(tabs, tmpt)
|