package pretreated //定义表格对象 import ( "fmt" u "jy/util" "log" qutil "qfw/util" "regexp" "strings" "sync" "github.com/PuerkitoBio/goquery" ) //所有中标候选人只取第一个 type TableResult struct { Id interface{} //信息id Toptype string //信息类型 Itype int //1全文 2是块 BlockTag string //块标签 Html string Tabs []*Table //子表集合,子表中包含标准化kv或原始kv GoqueryTabs []*goquery.Selection //goquery对象 TableSize int //子表的个数0,1,n IsMultiPackage bool //是否有子包 PackageMap *SortMap //子包对象的sortmap,含标准化过的 SortKV *SortMap //全局KVmap值,标准化处理过的 SortKVWeight map[string]int //全局KVmap值,标准化处理过的 WinnerOrder []map[string]interface{} BrandData [][]map[string]string //品牌抽取结果 HasKey int //有key HasBrand int //有品牌 HasGoods int //有商品 RuleBlock *u.RuleBlock } //快速创建TableResult对象 func NewTableResult(Id interface{}, Toptype, BlockTag, con string, Itype int, ruleBlock *u.RuleBlock) *TableResult { return &TableResult{ Id: Id, Toptype: Toptype, Html: con, Itype: Itype, BlockTag: BlockTag, Tabs: []*Table{}, GoqueryTabs: []*goquery.Selection{}, PackageMap: NewSortMap(), SortKV: NewSortMap(), SortKVWeight: map[string]int{}, RuleBlock: ruleBlock, } } //td节点 type TD struct { Goquery *goquery.Selection //文本对象 TR *TR //所属TR对象 LeftNode *TD //左临节点 TopNode *TD //上临节点 RightNode *TD //右节点 BottomNode *TD //下节点 Val string //值 Text string //原始串 SortKV *SortMap //存放kv值 Html string //html值 BH bool //是否是表头 MustBH bool //不能修改的表头 StandardKey string //标准表头 Colspan int //合并列 Rowspan int //合并行 StartCol int //起始列 EndCol int //终止列 StartRow int //起始行 EndRow int //终止行 ColPos int //当前在TR中的位置 HeadTd *TD //(是val元素)k节点 KVDirect int //键-值方向,0未知,1横 2纵//指值和k的方向 KeyDirect int //k方向,k纵值横,k横值纵 1横 2纵 SonTds []*TD //(是key元素)值节点数组 SonTableResult *TableResult //子值表格集 ArrVal []string //数组值,当是左临元素是合并行的元素时! Valtype string //"BO=中标人顺序" } var submatchreg = regexp.MustCompile(`((?:[一二三四五六七八九十0-10]+[、])([\\S]{4,12})|([\\S]{2,12}))[::]([\\S]{5,60})([一二三四五六七八九]+[、])?`) var BHKey = regexp.MustCompile(`^[^,,;:。、.]{2,8}.{0,3}[::].+$`) var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿元]+)") func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD { defer qutil.Catch() td := &TD{ ArrVal: []string{}, Goquery: Goquery, SonTds: []*TD{}, TR: tr, SortKV: NewSortMap(), } colspan, rowspan := 0, 0 col, bcol := td.Goquery.Attr("colspan") if bcol { colspan = qutil.IntAllDef(col, 1) } if colspan == 0 { colspan = 1 } row, brow := td.Goquery.Attr("rowspan") if brow { rowspan = qutil.IntAllDef(row, 1) } if rowspan == 0 { rowspan = 1 } td.Colspan, td.Rowspan = colspan, rowspan //合并列,合并行 td.Html, _ = td.Goquery.Html() //html值 ht := td.Goquery.ChildrenFiltered("table") //获取td的table bsontable := false //默认td中没有table txt := "" //子table处理合并 if ht.Size() > 0 { //qutil.Debug("有子表格") txt = TextAfterRemoveTable(td.Html) td.tdHasTable(&bsontable, tr, table) //处理td中的table,块标签处理,子表解析集处理 } else { txt = strings.TrimSpace(td.Goquery.Text()) } text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1") td.Val = text //值 td.Text = txt //原始串 //对td单元格值判断是否是表头和根据td内容长度进行分块处理 td.tdIsHb(tr, table, bsontable) bhead := false if td.TR.RowPos == 0 { //第一行 if td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头 bhead = true } } if bhead && !bsontable { td.BH = true td.KeyDirect = 1 //k方向,k纵值横,k横值纵 1横 2纵 td.KVDirect = 2 //键-值方向,0未知,1横 2纵//指值和k的方向 } //u.Debug(td.BH, td.Val) return td } //处理td中的table,块标签处理,子表解析集处理 func (td *TD) tdHasTable(bsontable *bool, tr *TR, table *Table) { ts := td.TR.Table.TableResult tabs, _ := ComputeConRatio(td.Html, 2) //计算表格占比 if len(tabs) > 0 { (*bsontable) = true stag := ts.BlockTag //块标签 if stag == "" { var tdleft *TD if len(tr.TDs) > 0 { tdleft = tr.TDs[len(tr.TDs)-1] if tdleft.BH { //u.Debug(tdleft.Val),如果不存在就是上一行的 stag = tdleft.Val } } else if len(tr.Table.TRs) > 0 { lasttr := tr.Table.TRs[len(tr.Table.TRs)-1] str := "" for _, td3 := range lasttr.TDs { str += td3.Val if len([]rune(str)) > 14 { str = "" break } } stag = str } sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口 td.BH = false td.SonTableResult = sonts //for _, k := range sonts.SortKV.Keys { //u.Debug(k, sonts.SortKV.Map[k]) // td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string) // td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k] //} //增加brand (子表) //fmt.Println("sonsHasKey=============", sonts.HasKey) //fmt.Println("sonsHasGoods========", sonts.HasGoods) //fmt.Println("sonsHasBrand========", sonts.HasBrand) if sonts.HasKey != 0 { td.TR.Table.TableResult.HasKey = sonts.HasKey } if sonts.HasGoods != 0 { td.TR.Table.TableResult.HasGoods = sonts.HasGoods } if sonts.HasBrand != 0 { td.TR.Table.TableResult.HasBrand = sonts.HasBrand } if sonts.BrandData != nil && len(sonts.BrandData) > 0 { //子table for _, v := range sonts.BrandData { if len(v) > 0 { td.TR.Table.TableResult.BrandData = append(td.TR.Table.TableResult.BrandData, v) } } } if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 { td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder } if sonts.IsMultiPackage { td.TR.Table.BPackage = true tb1 := td.TR.Table.BlockPackage for k, v := range sonts.PackageMap.Map { v1 := v.(*u.BlockPackage) if tb1.Map[k] == nil { tb1.AddKey(k, v) } else { bp := tb1.Map[k].(*u.BlockPackage) if bp != nil && v1.TableKV != nil && v1.TableKV.Kv != nil { for k2, v2 := range v1.TableKV.Kv { if bp.TableKV.Kv != nil && bp.TableKV.Kv[k2] == "" { bp.TableKV.Kv[k2] = v2 bp.Text += fmt.Sprintf("%v:%v\n", k2, v2) } } } } } //u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"])) } } } } //对td单元格值判断是否是表头和根据td内容长度进行分块处理 func (td *TD) tdIsHb(tr *TR, table *Table, bsontable bool) { lenval := len([]rune(td.Val)) //经过处理的td内容长度 //if lentxt > 9 { //td.KV = GetKVAll(txt, "") ub := []*u.Block{} //经过处理的td内容长度大于50,划块,分包 if lenval > 50 { //看是否划块 //u.Debug(txt) ub, _ = DivideBlock(td.Text, 2, table.TableResult.RuleBlock) //对td的原始值 //看是否划块 if len(ub) > 0 { colonKvWeight := map[string]int{} spaceKvWeight := map[string]int{} for _, bl := range ub { //冒号kv for bl_ck, bl_cv := range bl.ColonKV.Kv { if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] { colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight td.SortKV.AddKey(bl_ck, bl_cv) } } //空格kv for bl_sk, bl_sv := range bl.SpaceKV.Kv { if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] { spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight td.SortKV.AddKey(bl_sk, bl_sv) } } } } // blockPackage := map[string]*u.BlockPackage{} isFindPkg := true /*if td.ColPos-1 >= 0 && excludeKey.MatchString(tr.TDs[td.ColPos-1].Text) { isFindPkg = false } else if len(tr.TDs) > 0 { tdleft = tr.TDs[len(tr.TDs)-1] if tdleft.BH && excludeKey.MatchString(tr.TDs[td.ColPos-1].Text) { isFindPkg = false } }*/ if len(tr.TDs) > 0 { tdleft := tr.TDs[len(tr.TDs)-1] if tdleft.BH && excludeKey.MatchString(tdleft.Text) { //(涉及包号|包件号?|项目标号|规格|型号|招标范围|业绩|废标)|(^编号$)|([^包段标]编号) isFindPkg = false } } if isFindPkg { if len(ub) > 0 { blockPackage = FindPackageFromBlocks(&ub, "") //从块里面找分包 } else { blockPackage = FindPackageFromText("", td.Val) //从正文里面找分包 } } if len(blockPackage) > 0 { table.BPackage = true for bp_k, bp_v := range blockPackage { var bp *u.BlockPackage if table.TableResult.PackageMap.Map[bp_k] == nil { bp = bp_v } else { bp = table.TableResult.PackageMap.Map[bp_k].(*u.BlockPackage) bp.Text += "\n" + bp_v.Text } if bp.TableKV == nil { bp.TableKV = u.NewJobKv() } for k2, v2 := range bp_v.ColonKV.Kv { if bp.TableKV.Kv[k2] == "" { bp.TableKV.Kv[k2] = v2 } } for k2, v2 := range bp_v.SpaceKV.Kv { if bp.TableKV.Kv[k2] == "" { bp.TableKV.Kv[k2] = v2 } } table.TableResult.PackageMap.Map[bp_k] = bp } } } //经过处理的td内容长度小于50,冒号kv,td表头 if lenval < 50 { // td.SortKV = FindKv(text, "") kvTitle := "" if len(td.TR.TDs) > 0 { kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val } /* 预算总价 (人民币:元) */ if td.Text != "" && strings.Contains(td.Text, "预算总价") && (strings.Contains(td.Text, "(") || strings.Contains(td.Text, "(")) { tagindex := 0 if tagindex = strings.Index(td.Text, "("); tagindex <= 0 { tagindex = strings.Index(td.Text, "(") } td.SortKV.AddKey(strings.TrimSpace(td.Text[:tagindex]), strings.TrimSpace(td.Text[tagindex:])) //存放kv值 td.BH = true } _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3) //td冒号kv for k, v := range resm { td.SortKV.AddKey(k, v) //存放kv值 } //u.Debug(td.SortKV.Keys, "-------2--------------------------------") // td.SortKV = FindKv(text, "") //GetKvFromtxt(text, "") //resm := GetKVAll(text, "") if len(td.SortKV.Keys) > 0 { //td.KVDirect = 3 //不当头也不当值,忽略 if len(td.SortKV.Keys) == 1 && BHKey.MatchString(td.Val) && !MultiReg.MatchString(td.Val) { td.Val = td.SortKV.Keys[0] td.BH = true } } else if !bsontable { txt := repSpace.ReplaceAllString(td.Val, "") btw, must, _, _, repl := CheckHeader(txt) if lenval > 15 { btw = false } if strings.Contains(td.Val, "个项目") { must = false btw = false } td.Valtype = repl td.MustBH = must td.BH = btw } } else if len(ub) == 0 { //之前这里没加判断,现在加上判断,造成分块之后的kv被覆盖掉 //u.Debug("----\n\n\n", txt, "\n\n\n----") //u.Debug(GetKVAll(txt, "")) /* subVal := submatchreg.FindAllStringSubmatch(txt, -1) if len(subVal) > 0 { for _, subv1 := range subVal { if len(subv1) == 6 { tr.Table.SortKV.AddKey(If(subv1[2] == "", subv1[3], subv1[2]).(string), subv1[4]) //tr.Table.SortKV.AddKey(subv1[1], subv1[2]) } } } */ td.SortKV = FindKv(td.Val, "", 2) // td.LeftNode.Val // for _, vvv := range *td.TR { // u.Debug(">>>>>") // } kvTitle := "" if len(td.TR.TDs) > 0 { kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val } _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 2) //获取冒号kv入口 for k, v := range resm { td.SortKV.AddKey(k, v) } } } func (t *Table) Print() { for row, trs := range t.TRs { for col, td := range trs.TDs { log.Println(row, col, td.Val, td.BH, td.SortKV.Map) } } } type TR struct { TDs []*TD TopTR *TR //上临行 BottomTR *TR //下临行 Table *Table //所属表格对象 RowPos int //当前在第几行 //-----计算 MaxRow int //最大跨行 Max(td.StartRow-td.EndRow) MinRow int //最小跨行 StartRow int //起始行 EndRow int //结束行 MaxCol int //最大列 MinCol int //最小列 StartCol int //起始列 EndCol int //结束列 BDiffSpanRow bool //起始行,行中有没有不同跨行 - - - = - BDiffSpanCol bool //起始列,列中有没有不同跨列 | } func NewTR(Table *Table) *TR { return &TR{ TDs: []*TD{}, Table: Table, } } func (tr *TR) AddTD(td *TD) { /**对跨行没有意义 if len(tr.TDs) > 0 { td.LeftNode = tr.TDs[len(tr.TDs)-1] tr.TDs[len(tr.TDs)-1].RightNode = td } **/ td.ColPos = len(tr.TDs) tr.TDs = append(tr.TDs, td) } /*-- START --- 处理表头概率开始 -------*/ type pos struct { Max int Min int } type TDRationScope struct { Rationmap map[*pos]float32 Tdmap map[*pos][]*TD Poss []*pos Parentkey string } func NewTDRationScope(key string) *TDRationScope { return &TDRationScope{map[*pos]float32{}, map[*pos][]*TD{}, []*pos{}, key} } func (tdr *TDRationScope) GetPos(td *TD) (poss *pos) { k1 := tdr.Parentkey[:1] m1, m2 := td.StartRow, td.EndRow if k1 == "r" { m1, m2 = td.StartCol, td.EndCol } for _, v := range tdr.Poss { if v.Max >= m2 && v.Min <= m1 { poss = v return } } return } func (tdr *TDRationScope) GetTDRation(td *TD) (ration float32, tds []*TD) { poss := tdr.GetPos(td) if poss != nil { ration = tdr.Rationmap[poss] tds = tdr.Tdmap[poss] } return } func (tdr *TDRationScope) Addtd(td *TD) { k1 := tdr.Parentkey[:1] m1, m2 := td.StartRow, td.EndRow if k1 == "r" { m1, m2 = td.StartCol, td.EndCol } bfind := false for _, v := range tdr.Poss { if m1 == v.Max+1 { //找到 bfind = true v.Max = m2 tdr.Tdmap[v] = append(tdr.Tdmap[v], td) break } } if !bfind { pos1 := &pos{m2, m1} tdr.Tdmap[pos1] = []*TD{td} tdr.Poss = append(tdr.Poss, pos1) } } /*-- END --- 处理表头概率 -------*/ //table表格 type Table struct { Brule bool //是否规则 TRs []*TR BFirstRow bool RowNum int //行数 ColNum int //列数 TDNum int //td个数 BPackage bool //是否有包 SortKV *SortMap //带排序的KV值 StandKV map[string]string //过滤后的标准化kv StandKVWeight map[string]int //过滤后的标准化kv StandRuleKV map[string]string //过滤后的规则kv kvscope map[int]map[int][]*TD //sortkey第几个元素的的第几个值的结束位置 kTD map[int]*TD //根据索引找到key的TD元素 SonTables []*Table //孩子表集合 Tag string //表格的标签 Desc string //表格描述内容 Goquery *goquery.Selection //表格的goquery对象 Html string //所属的文本内容 BlockPackage *SortMap //子包数组 TableResult *TableResult //父元素 StartAndEndRation map[string]*TDRationScope //同行或同列的概率,截断的单独起算 StartAndEndRationKSort *SortMap WinnerOrder []map[string]interface{} BSplit bool //是否是有一个表拆分成的多个表 BHeader bool //拆分表是否有表头 BrandData [][]map[string]string //品牌抽取结果 HasKey int //有key HasBrand int //有品牌 HasGoods int //有商品 } func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Table { return &Table{ Html: Html, SortKV: NewSortMap(), StandKV: map[string]string{}, StandKVWeight: map[string]int{}, kvscope: map[int]map[int][]*TD{}, kTD: map[int]*TD{}, SonTables: []*Table{}, Goquery: tab, TRs: []*TR{}, TableResult: TableResult, StartAndEndRation: map[string]*TDRationScope{}, StartAndEndRationKSort: NewSortMap(), BlockPackage: NewSortMap(), } } func (t *Table) AddTR(tr *TR) { if len(tr.TDs) > 0 { if len(t.TRs) > 0 { tr.TopTR = t.TRs[len(t.TRs)-1] t.TRs[len(t.TRs)-1].BottomTR = tr } tr.RowPos = len(t.TRs) t.TRs = append(t.TRs, tr) } } func (t *Table) InsertTR(tr *TR) { if len(tr.TDs) > 0 { if len(t.TRs) > 0 { t.TRs[0].TopTR = tr } tr.RowPos = 0 for _, _tr := range t.TRs { _tr.RowPos += 1 } t.TRs = append([]*TR{tr}, t.TRs...) } } //支持排序的map type SortMap struct { Index map[string]int Keys []string Map map[string]interface{} Lock sync.Mutex } //快速创建排序map func NewSortMap() *SortMap { return &SortMap{ Index: map[string]int{}, Keys: []string{}, Map: map[string]interface{}{}, } } //增加值 var NullVal = regexp.MustCompile("^[/无,.。;、附]+$|^详见.{2,8}$|(详?见)?附(件|图)") func (s *SortMap) AddKey(key string, val interface{}) { //判断val // if v, ok := val.(string); ok && NullVal.ReplaceAllString(u.TrimLRSpace(v, ""), "") == "" { // return // } s.Lock.Lock() defer s.Lock.Unlock() //重复 if s.Map[key] == nil { s.Index[key] = len(s.Keys) s.Keys = append(s.Keys, key) } s.Map[key] = val } //增加值 func (s *SortMap) ReplaceKey(key string, val interface{}, replacekey string) { s.Lock.Lock() defer s.Lock.Unlock() //重复 v := s.Index[replacekey] s.Index[key] = v delete(s.Index, replacekey) s.Keys = append(s.Keys[:v], append([]string{key}, s.Keys[v+1:]...)...) delete(s.Map, replacekey) s.Map[key] = val } //删除值 func (s *SortMap) RemoveKey(key string) { s.Lock.Lock() defer s.Lock.Unlock() delete(s.Map, key) pos := s.Index[key] delete(s.Index, key) if len(s.Keys) > 0 { s.Keys = func() []string { newkeys := []string{} if len(s.Keys) > 1 { if pos == 0 { newkeys = append(newkeys, s.Keys[1:]...) //每一个都减一 for k, v := range s.Index { s.Index[k] = v - 1 } } else if pos == len(s.Keys) { newkeys = append(newkeys, s.Keys[:pos]...) } else { tmp := s.Keys[pos+1:] newkeys = append(append(newkeys, s.Keys[:pos]...), tmp...) for _, v := range tmp { s.Index[v] -= 1 } } } return newkeys }() } } //判断表头是key的对象 type TableKeyV1 struct { TMap map[string]interface{} TReg []*regexp.Regexp TRegReplStr []string } //判断表头时用到的顺序 正文、结果表头、正常表头 var THeadStr = []string{ "con", "jghead", "normalhead", } //存放敏感词 var TKMaps = map[string]*TableKeyV1{} //过滤所有非汉字内容 var filterThText = regexp.MustCompile("([((【\\[].*[))】\\]])|([^0-9a-zA-Z\\p{Han}]+)") var tLock = sync.Mutex{} //matchStro为tablev1.json文件中的key,txt为表格的内容也可以是表格的标签 //主要实现表格是否是表头的判断,表格是否有用的判断(如人员情况等是无用的) func CheckCommon(txt string, matchStr ...string) (res, must bool, stype, reg, repl string) { txt = filterThText.ReplaceAllString(txt, "") stype = "con" if len([]rune(txt)) < 30 { tLock.Lock() defer tLock.Unlock() if len(TKMaps) == 0 { for k, v := range u.TableK1 { tk := &TableKeyV1{ map[string]interface{}{}, []*regexp.Regexp{}, []string{}, } thMap := map[string]interface{}{} for _, v1 := range v { v1s := strings.Split(v1, "__") if len(v1s) == 2 { tk.TReg = append(tk.TReg, regexp.MustCompile(v1s[0])) tk.TRegReplStr = append(tk.TRegReplStr, v1s[1]) } else { key := v1 nowMap := &thMap for i := 0; i < len(key); i++ { kc := key[i : i+1] if v, ok := (*nowMap)[kc]; ok { nowMap, _ = v.(*map[string]interface{}) } else { newMap := map[string]interface{}{} newMap["Y"] = "0" (*nowMap)[kc] = &newMap nowMap = &newMap } if i == len(key)-1 { (*nowMap)["Y"] = "1" (*nowMap)["K"] = key //(*nowMap)["V"] = v } } } } tk.TMap = thMap TKMaps[k] = tk } } //先正则、后子串查找 L1: for _, v := range matchStr { //u.Debug(v) for n, vreg := range TKMaps[v].TReg { if vreg.MatchString(txt) { //u.Debug(txt, v, vreg.String()) reg = vreg.String() repl = TKMaps[v].TRegReplStr[n] if v != "con" { res = true if "M" == repl { must = true } } stype = v break L1 } } //以下是敏感词子串查找匹配 pos := 0 thMap := TKMaps[v].TMap nowMap := &thMap for i := 0; i < len(txt); i++ { word := txt[i : i+1] nowMap, _ = (*nowMap)[word].(*map[string]interface{}) if nowMap != nil { // 存在,则判断是否为最后一个 if pos == 0 { pos = i } if "1" == qutil.ObjToString((*nowMap)["Y"]) { if v != "con" { res = true } stype = v pos = 0 break L1 } } else { nowMap = &thMap if pos > 0 { i = pos pos = 0 } } } } return } else { return } } //根据td中的内容验证表头,根据tablev1.json中配置的三种规则(含正则和子串查找算法) func CheckHeader(txt string) (res, must bool, stype, reg, repl string) { return CheckCommon(txt, THeadStr...) } /** 计算表格占比,返回表格数组、占比 con 文本 strtype 1全文 2块文本 **/ func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) { defer qutil.Catch() doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con)) cons := doc.Text() tables := doc.Find("table") doc = nil if tables.Size() > 0 { tabs = []*goquery.Selection{} for i := 0; i < tables.Size(); i++ { tmpt := tables.Eq(i) b := false for j := 0; j < len(tabs); j++ { if tabs[j].Contains(tmpt.Get(0)) { b = true } } if !b { tabs = append(tabs, tmpt) } } tlen := 0 for _, t := range tabs { tlen += len(t.Text()) } ratio = float32(tlen) / float32(len(cons)) } /** if ratio < float32(0.992) { //取出排除表格之外的文本 txt =getTextAfterRemoveTable(con) } **/ return } //取出排除表格之外的文本 func TextAfterRemoveTable(con string) string { doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con)) doc2.Find("table").Remove() return doc2.Text() } func HtmlAfterRemoveTable(con string) string { doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con)) doc2.Find("table").Remove() html, _ := doc2.Html() return html } func If(condition bool, trueVal, falseVal interface{}) interface{} { if condition { return trueVal } return falseVal }