// Package pretreated: this file defines the table objects (table result,
// table, row, cell) and an insertion-ordered map used by them.
package pretreated

import (
	"encoding/json"
	"fmt"
	u "jy/util"
	"log"
	qutil "qfw/util"
	"regexp"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
)

// TableResult is the outcome of analysing one piece of HTML (a full text or a
// block) that contains tables.
//
// Of all winning-bid candidates only the first one is taken.
type TableResult struct {
	Id             interface{}              // information id
	Toptype        string                   // information type
	Itype          int                      // 1 = full text, 2 = block
	BlockTag       string                   // block tag
	Html           string                   // HTML the result was built from
	Tabs           []*Table                 // sub-tables; each holds standardized or raw kv
	GoqueryTabs    *goquery.Selection       // goquery selection
	TableSize      int                      // number of sub-tables: 0, 1, n
	IsMultiPackage bool                     // whether sub-packages exist
	PackageMap     *SortMap                 // sub-package objects (standardized), in insertion order
	KvTags         map[string][]*u.Tag      // global kv map, standardized
	WinnerOrder    []map[string]interface{} // winner order
	BrandData      [][]map[string]string    // brand extraction result
	HasKey         int                      // has key
	HasBrand       int                      // has brand
	HasGoods       int                      // has goods
	RuleBlock      *u.RuleBlock
}

// NewTableResult creates a TableResult with its slices and maps initialised.
// con is stored as Html; the remaining arguments map to the fields of the
// same name.
func NewTableResult(Id interface{}, Toptype, BlockTag, con string, Itype int, ruleBlock *u.RuleBlock) *TableResult {
	return &TableResult{
		Id:          Id,
		Toptype:     Toptype,
		Html:        con,
		Itype:       Itype,
		BlockTag:    BlockTag,
		Tabs:        []*Table{},
		GoqueryTabs: &goquery.Selection{},
		PackageMap:  NewSortMap(),
		KvTags:      map[string][]*u.Tag{},
		RuleBlock:   ruleBlock,
	}
}

// TD is a table cell node.
type TD struct {
	Goquery        *goquery.Selection // goquery selection of the cell
	TR             *TR                // owning row
	LeftNode       *TD                // left neighbour
	TopNode        *TD                // upper neighbour
	RightNode      *TD                // right neighbour
	BottomNode     *TD                // lower neighbour
	Val            string             // value
	Text           string             // raw string
	SortKV         *SortMap           // kv values found in the cell
	Html           string             // inner HTML
	BH             bool               // whether the cell is a header
	MustBH         bool               // header that must not be changed
	StandardKey    string             // standardized header
	Colspan        int                // merged columns
	Rowspan        int                // merged rows
	StartCol       int                // first column
	EndCol         int                // last column
	StartRow       int                // first row
	EndRow         int                // last row
	ColPos         int                // position inside the owning TR
	HeadTd         *TD                // (for a value cell) its key cell
	KVDirect       int                // key->value direction: 0 unknown, 1 horizontal, 2 vertical
	KeyDirect      int                // key direction (vertical keys have horizontal values and vice versa): 1 horizontal, 2 vertical
	SonTds         []*TD              // (for a key cell) its value cells
	SonTableResult *TableResult       // result of a table nested in the cell
	ArrVal         []string           // array values, used when the left neighbour is a row-merged cell
	Valtype        string             // "BO" = winner order
}

// NOTE(review): inside a raw string `[\\S]` is the class {backslash, 'S'},
// not "non-space"; `[\S]` was presumably intended. The pattern is only
// referenced from commented-out code in this file, so it is left untouched —
// confirm against other users before changing it.
var submatchreg = regexp.MustCompile(`((?:[一二三四五六七八九十0-10]+[、])([\\S]{4,12})|([\\S]{2,12}))[::]([\\S]{5,60})([一二三四五六七八九]+[、])?`)
var BHKey = regexp.MustCompile(`^[^,,;:。、.]{2,8}.{0,3}[::].+$`)
var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿元]+)")

// NewTD builds the TD for one cell selection of row tr in table.
//
// It reads colspan/rowspan, extracts the cell text (analysing nested tables
// via tdHasTable), collects kv pairs into td.SortKV and decides whether the
// cell is a header. isSite and codeSite are passed through unchanged to the
// block/kv helpers. A panic is recovered by qutil.Catch, in which case the
// function returns nil.
func NewTD(Goquery *goquery.Selection, tr *TR, table *Table, isSite bool, codeSite string) *TD {
	defer qutil.Catch()
	td := &TD{
		ArrVal:  []string{},
		Goquery: Goquery,
		SonTds:  []*TD{},
		TR:      tr,
		SortKV:  NewSortMap(),
	}

	// Merged columns / rows. A missing, zero or negative span counts as 1
	// (the original only normalised exactly 0).
	colspan, rowspan := 0, 0
	if col, ok := td.Goquery.Attr("colspan"); ok {
		colspan = qutil.IntAllDef(col, 1)
	}
	if colspan < 1 {
		colspan = 1
	}
	if row, ok := td.Goquery.Attr("rowspan"); ok {
		rowspan = qutil.IntAllDef(row, 1)
	}
	if rowspan < 1 {
		rowspan = 1
	}
	td.Colspan, td.Rowspan = colspan, rowspan
	td.Html, _ = td.Goquery.Html() // inner HTML

	bsontable := false // by default the cell holds no table
	txt := ""
	if td.Goquery.ChildrenFiltered("table").Size() > 0 {
		// The cell has a direct child table: keep only the text outside the
		// table and analyse the nested table (block tag, sub-table results).
		txt = TextAfterRemoveTable(td.Html)
		td.tdHasTable(&bsontable, tr, isSite, codeSite)
	} else {
		txt = strings.TrimSpace(td.Goquery.Text())
	}
	text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
	td.Val = text // value
	td.Text = txt // raw string

	// Handle the content outside the table: see whether it divides into blocks.
	var ub []*u.Block
	ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock, isSite, codeSite)
	for _, bl := range ub {
		// colon kv
		for bl_ck, bl_cv := range bl.ColonKV.KvTags {
			td.SortKV.AddKey(bl_ck, bl_cv)
		}
		// space kv
		for bl_sk, bl_sv := range bl.SpaceKV.KvTags {
			td.SortKV.AddKey(bl_sk, bl_sv)
		}
	}
	// Disabled fallback for the case where no block was found:
	//for _, v := range GetKVAll(txt, "", nil, 2).KvTags {
	//	for _, vv := range v {
	//		td.SortKV.AddKey(vv.Key, vv.Value)
	//	}
	//}

	//// Disabled: fall back to regex extraction when nothing was extracted.
	//proCode := projectcodeReg.FindString(text)
	//if proCode != "" {
	//	ckv := GetKVAll(proCode, "", nil, 1)
	//	for _, v := range ckv.KvTags {
	//		for _, vv := range v {
	//			td.SortKV.AddKey(vv.Key, vv.Value)
	//		}
	//	}
	//} else if proCode = projectcodeReg2.FindString(text); proCode != "" {
	//	ckv := GetKVAll(proCode, "", nil, 1)
	//	for _, v := range ckv.KvTags {
	//		for _, vv := range v {
	//			td.SortKV.AddKey(vv.Key, vv.Value)
	//		}
	//	}
	//}

	if proCode := jsonReg.FindString(text); proCode != "" {
		jsonMap := make(map[string]string)
		// Best effort on purpose: Unmarshal still fills the string-valued
		// entries when other entries have a non-string type, and those
		// entries are wanted, so the error is not treated as fatal.
		_ = json.Unmarshal([]byte(proCode), &jsonMap)
		for k, v := range jsonMap {
			td.SortKV.AddKey(k, v)
		}
	}

	// Decide whether the cell is a header and split long content into blocks.
	td.tdIsHb(tr, table, bsontable, isSite, codeSite)

	// A cell of the first row that sits inside exactly one <thead> (and holds
	// no nested table) is a key header.
	if td.TR.RowPos == 0 && !bsontable && td.Goquery.Closest("thead").Size() == 1 {
		td.BH = true
		td.KeyDirect = 1 // key direction: 1 horizontal, 2 vertical
		td.KVDirect = 2  // key->value direction: 0 unknown, 1 horizontal, 2 vertical
	}
	//u.Debug(td.BH, td.Val)
	return td
}

// tdHasTable handles the tables nested in the cell: it picks a block tag,
// analyses every visible nested table into its own TableResult and merges
// kv tags, flags, brand data, winner order and packages into the owning
// table.
//
// *bsontable is set to true when at least one nested table was found.
func (td *TD) tdHasTable(bsontable *bool, tr *TR, isSite bool, codeSite string) {
	ts := td.TR.Table.TableResult
	tabs, _ := ComputeConRatio(td.Html, 2) // nested (outermost) tables of the cell
	if len(tabs) == 0 {
		return
	}
	*bsontable = true

	stag := ts.BlockTag // block tag
	if stag == "" {
		if len(tr.TDs) > 0 {
			// Use the left neighbour when it is a header.
			if tdleft := tr.TDs[len(tr.TDs)-1]; tdleft.BH {
				stag = tdleft.Val
			}
		} else if len(tr.Table.TRs) > 0 {
			// Otherwise use the concatenated values of the previous row,
			// unless they exceed 14 runes.
			lasttr := tr.Table.TRs[len(tr.Table.TRs)-1]
			str := ""
			for _, td3 := range lasttr.TDs {
				str += td3.Val
				if len([]rune(str)) > 14 {
					str = ""
					break
				}
			}
			stag = str
		}
	}

	for _, tv := range tabs {
		if IsHide(tv) {
			continue
		}
		sonts := NewTableResult(ts.Id, ts.Toptype, stag, td.Html, 2, td.TR.Table.TableResult.RuleBlock)
		sonts.GoqueryTabs = tv
		sonts.Analy(isSite, codeSite)
		//sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) // calls the table analysis entry point once more

		td.BH = false
		if td.TR.Table.TableResult == nil {
			td.TR.Table.TableResult = NewTableResult(sonts.Id, sonts.Toptype, sonts.BlockTag, sonts.Html, sonts.Itype, sonts.RuleBlock)
		}
		parent := td.TR.Table.TableResult
		MergeKvTags(parent.KvTags, sonts.KvTags)
		td.SonTableResult = sonts
		//for _, k := range sonts.SortKV.Keys {
		//	u.Debug(k, sonts.SortKV.Map[k])
		//	td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
		//	td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
		//}

		// Propagate brand information of the nested table.
		//fmt.Println("sonsHasKey=============", sonts.HasKey)
		//fmt.Println("sonsHasGoods========", sonts.HasGoods)
		//fmt.Println("sonsHasBrand========", sonts.HasBrand)
		if sonts.HasKey != 0 {
			parent.HasKey = sonts.HasKey
		}
		if sonts.HasGoods != 0 {
			parent.HasGoods = sonts.HasGoods
		}
		if sonts.HasBrand != 0 {
			parent.HasBrand = sonts.HasBrand
		}
		for _, v := range sonts.BrandData {
			if len(v) > 0 {
				parent.BrandData = append(parent.BrandData, v)
			}
		}
		if len(sonts.WinnerOrder) > 0 {
			parent.WinnerOrder = sonts.WinnerOrder
		}

		if !sonts.IsMultiPackage {
			continue
		}
		td.TR.Table.BPackage = true
		tb1 := td.TR.Table.BlockPackage
		for _, v := range sonts.PackageMap.Keys {
			v1 := sonts.PackageMap.Map[v].(*u.BlockPackage)
			if tb1.Map[v] == nil {
				tb1.AddKey(v, sonts.PackageMap.Map[v])
				continue
			}
			bp := tb1.Map[v].(*u.BlockPackage)
			if bp == nil || v1.TableKV == nil {
				continue
			}
			for k2, v2 := range v1.TableKV.KvTags {
				if bp.TableKV == nil {
					bp.TableKV = u.NewJobKv()
				}
				for _, v2v := range v2 {
					// The flag is per value. It used to be shared by all
					// values of the key, so one duplicate suppressed every
					// later value.
					isExists := false
					for _, v2vv := range bp.TableKV.KvTags[k2] {
						if v2v.Value == v2vv.Value {
							isExists = true
							break
						}
					}
					if !isExists {
						bp.TableKV.KvTags[k2] = append(bp.TableKV.KvTags[k2], v2v)
						// Record the value that was added. The original
						// formatted the whole []*u.Tag slice, which prints
						// pointer addresses.
						bp.Text += fmt.Sprintf("%v:%v\n", k2, v2v.Value)
					}
				}
			}
		}
		//u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
	}
}

// tdIsHb decides whether the cell is a header and, depending on the rune
// length of td.Val, divides the content into blocks / packages or extracts
// colon kv pairs.
//
//   - more than 50 runes: divide td.Text into blocks and look for packages;
//   - fewer than 50 runes: colon kv and header detection;
//   - otherwise (exactly 50 runes, or longer content that produced no
//     block): generic kv extraction on td.Val.
func (td *TD) tdIsHb(tr *TR, table *Table, bsontable, isSite bool, codeSite string) {
	lenval := len([]rune(td.Val)) // rune length of the processed cell content
	//if lentxt > 9 {
	//td.KV = GetKVAll(txt, "")
	ub := []*u.Block{}

	// Long content: divide into blocks and look for packages.
	if lenval > 50 {
		//u.Debug(txt)
		ub, _ = DivideBlock("", td.Text, 2, table.TableResult.RuleBlock, isSite, codeSite) // on the raw cell text
		for _, bl := range ub {
			// colon kv
			for bl_ck, bl_cv := range bl.ColonKV.KvTags {
				td.SortKV.AddKey(bl_ck, bl_cv)
			}
			// space kv
			for bl_sk, bl_sv := range bl.SpaceKV.KvTags {
				td.SortKV.AddKey(bl_sk, bl_sv)
			}
		}

		blockPackage := map[string]*u.BlockPackage{}
		isFindPkg := true
		/*if td.ColPos-1 >= 0 && excludeKey.MatchString(tr.TDs[td.ColPos-1].Text) {
			isFindPkg = false
		} else if len(tr.TDs) > 0 {
			tdleft = tr.TDs[len(tr.TDs)-1]
			if tdleft.BH && excludeKey.MatchString(tr.TDs[td.ColPos-1].Text) {
				isFindPkg = false
			}
		}*/
		if len(tr.TDs) > 0 {
			// Do not look for packages when the left header matches
			// excludeKey.
			// Original note listing the pattern:
			// (涉及包号|包件号?|项目标号|规格|型号|招标范围|业绩|废标)|(^编号$)|([^包段标]编号)
			tdleft := tr.TDs[len(tr.TDs)-1]
			if tdleft.BH && excludeKey.MatchString(tdleft.Text) {
				isFindPkg = false
			}
		}
		if isFindPkg {
			if len(ub) > 0 {
				blockPackage = FindPackageFromBlocks(&ub, isSite, codeSite) // look for packages in the blocks
			} else {
				blockPackage = FindPackageFromText("", td.Val, isSite, codeSite) // look for packages in the text
			}
		}
		if len(blockPackage) > 0 {
			table.BPackage = true
			for bp_k, bp_v := range blockPackage {
				var bp *u.BlockPackage
				if table.TableResult.PackageMap.Map[bp_k] == nil {
					bp = bp_v
				} else {
					bp = table.TableResult.PackageMap.Map[bp_k].(*u.BlockPackage)
					bp.Text += "\n" + bp_v.Text
				}
				if bp.TableKV == nil {
					bp.TableKV = u.NewJobKv()
				}
				MergeKvTags(bp.TableKV.KvTags, bp_v.ColonKV.KvTags)
				MergeKvTags(bp.TableKV.KvTags, bp_v.SpaceKV.KvTags)
				table.TableResult.PackageMap.AddKey(bp_k, bp)
			}
		}
	}

	// Short content: colon kv and header detection.
	if lenval < 50 {
		// td.SortKV = FindKv(text, "")
		kvTitle := ""
		if len(td.TR.TDs) > 0 {
			kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
		}
		/*
			预算总价
			(人民币:元)
		*/
		// NOTE(review): both parenthesis literals below are identical in this
		// copy of the file; one of each pair was presumably the full-width
		// form. Confirm against the repository before simplifying.
		if td.Text != "" && strings.Contains(td.Text, "预算总价") && (strings.Contains(td.Text, "(") || strings.Contains(td.Text, "(")) {
			tagindex := 0
			if tagindex = strings.Index(td.Text, "("); tagindex <= 0 {
				tagindex = strings.Index(td.Text, "(")
			}
			// Text before the parenthesis is the key, the rest the value.
			td.SortKV.AddKey(strings.TrimSpace(td.Text[:tagindex]), strings.TrimSpace(td.Text[tagindex:]))
			td.BH = true
		}
		_, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3, isSite, codeSite) // colon kv of the cell
		for k, v := range resm {
			if k != "" && v != "" {
				td.SortKV.AddKey(k, v)
			}
		}
		//u.Debug(td.SortKV.Keys, "-------2--------------------------------")
		// td.SortKV = FindKv(text, "") //GetKvFromtxt(text, "")
		//resm := GetKVAll(text, "")
		if len(td.SortKV.Keys) > 0 {
			//td.KVDirect = 3 // neither header nor value, ignore
			if len(td.SortKV.Keys) == 1 && BHKey.MatchString(td.Val) && !MultiReg.MatchString(td.Val) {
				td.Val, _ = td.SortKV.Map[td.SortKV.Keys[0]].(string)
				td.BH = true
			}
		} else if !bsontable {
			txt := repSpace.ReplaceAllString(td.Val, "")
			btw, must, _, _, repl := CheckHeader(txt)
			if lenval > 15 && !strings.Contains(txt, "采购代理机构名称、地址和联系方式") {
				btw = false
			}
			if strings.Contains(td.Val, "个项目") {
				must = false
				btw = false
			}
			td.Valtype = repl
			td.MustBH = must
			td.BH = btw
			if strings.Contains(txt, "年估算额年(万元)") {
				td.MustBH = true
				td.BH = true
			}
		}
	} else if len(ub) == 0 {
		// The len(ub) check keeps the kv found by the block division above
		// from being overwritten (original note).
		//u.Debug("----\n\n\n", txt, "\n\n\n----")
		//u.Debug(GetKVAll(txt, ""))
		/*
			subVal := submatchreg.FindAllStringSubmatch(txt, -1)
			if len(subVal) > 0 {
				for _, subv1 := range subVal {
					if len(subv1) == 6 {
						tr.Table.SortKV.AddKey(If(subv1[2] == "", subv1[3], subv1[2]).(string), subv1[4])
						//tr.Table.SortKV.AddKey(subv1[1], subv1[2])
					}
				}
			}
		*/
		fSortKV := FindKv(td.Val, "", 2)
		for _, v := range fSortKV.Keys {
			td.SortKV.AddKey(v, fSortKV.Map[v])
		}
		// td.LeftNode.Val
		// for _, vvv := range *td.TR {
		// 	u.Debug(">>>>>")
		// }
		kvTitle := ""
		if len(td.TR.TDs) > 0 {
			kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
		}
		_, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 2, isSite, codeSite) // colon kv entry point
		for k, v := range resm {
			td.SortKV.AddKey(k, v)
		}
	}
}

// Print logs row index, column index, value, header flag and kv map of every
// cell of the table.
func (t *Table) Print() {
	for row, trs := range t.TRs {
		for col, td := range trs.TDs {
			log.Println(row, col, td.Val, td.BH, td.SortKV.Map)
		}
	}
}

// TR is a table row.
type TR struct {
	TDs      []*TD
	TopTR    *TR    // row above
	BottomTR *TR    // row below
	Table    *Table // owning table
	RowPos   int    // index of the row in the table
	//----- computed
	MaxRow       int  // largest row span, Max(td.StartRow-td.EndRow)
	MinRow       int  // smallest row span
	StartRow     int  // first row
	EndRow       int  // last row
	MaxCol       int  // largest column
	MinCol       int  // smallest column
	StartCol     int  // first column
	EndCol       int  // last column
	BDiffSpanRow bool // start row: whether the row has differing row spans  - - - = -
	BDiffSpanCol bool // start column: whether the column has differing column spans |
}

// NewTR creates an empty row that belongs to Table.
func NewTR(Table *Table) *TR {
	return &TR{
		TDs:   []*TD{},
		Table: Table,
	}
}

// AddTD appends td to the row and records its position in td.ColPos.
func (tr *TR) AddTD(td *TD) {
	/** disabled: meaningless for row spans
	if len(tr.TDs) > 0 {
		td.LeftNode = tr.TDs[len(tr.TDs)-1]
		tr.TDs[len(tr.TDs)-1].RightNode = td
	}
	**/
	td.ColPos = len(tr.TDs)
	tr.TDs = append(tr.TDs, td)
}

/*-- START --- header ratio handling -------*/

// pos is an inclusive [Min, Max] index range.
type pos struct {
	Max int
	Min int
}

// TDRationScope groups cells into contiguous index ranges and stores a ratio
// per range. When Parentkey starts with "r" the ranges are column ranges,
// otherwise row ranges.
type TDRationScope struct {
	Rationmap map[*pos]float32
	Tdmap     map[*pos][]*TD
	Poss      []*pos
	Parentkey string
}

// NewTDRationScope creates an empty scope for key.
func NewTDRationScope(key string) *TDRationScope {
	return &TDRationScope{
		Rationmap: map[*pos]float32{},
		Tdmap:     map[*pos][]*TD{},
		Poss:      []*pos{},
		Parentkey: key,
	}
}

// span returns the start and end index of td along the axis of the scope:
// columns when Parentkey starts with "r", rows otherwise. HasPrefix is used
// instead of Parentkey[:1] so that an empty key does not panic.
func (tdr *TDRationScope) span(td *TD) (start, end int) {
	if strings.HasPrefix(tdr.Parentkey, "r") {
		return td.StartCol, td.EndCol
	}
	return td.StartRow, td.EndRow
}

// GetPos returns the first range that fully covers td, or nil when there is
// none.
func (tdr *TDRationScope) GetPos(td *TD) (poss *pos) {
	m1, m2 := tdr.span(td)
	for _, v := range tdr.Poss {
		if v.Max >= m2 && v.Min <= m1 {
			return v
		}
	}
	return nil
}

// GetTDRation returns the ratio and the cells of the range covering td; both
// are zero values when no range covers it.
func (tdr *TDRationScope) GetTDRation(td *TD) (ration float32, tds []*TD) {
	if poss := tdr.GetPos(td); poss != nil {
		ration = tdr.Rationmap[poss]
		tds = tdr.Tdmap[poss]
	}
	return
}

// Addtd adds td to the range it directly continues (its start is the range's
// Max+1), extending that range, or opens a new range for it.
func (tdr *TDRationScope) Addtd(td *TD) {
	m1, m2 := tdr.span(td)
	for _, v := range tdr.Poss {
		if m1 == v.Max+1 { // found an adjacent range
			v.Max = m2
			tdr.Tdmap[v] = append(tdr.Tdmap[v], td)
			return
		}
	}
	pos1 := &pos{Max: m2, Min: m1}
	tdr.Tdmap[pos1] = []*TD{td}
	tdr.Poss = append(tdr.Poss, pos1)
}

/*-- END --- header ratio handling -------*/

// Table is a parsed table.
type Table struct {
	Brule                  bool                      // whether the table is regular
	TRs                    []*TR                     // rows
	BFirstRow              bool
	RowNum                 int                       // number of rows
	ColNum                 int                       // number of columns
	TDNum                  int                       // number of cells
	BPackage               bool                      // whether packages exist
	SortKV                 *SortMap                  // ordered kv values
	StandKV                map[string][]*u.Tag       // standardized kv after filtering
	StandRuleKV            map[string]string         // rule kv after filtering
	kvscope                map[int]map[int][]*TD     // end position of the n-th value of the n-th sort key
	kTD                    map[int]*TD               // key cell by index
	SonTables              []*Table                  // child tables
	Tag                    string                    // tag of the table
	Desc                   string                    // description of the table
	Goquery                *goquery.Selection        // goquery selection of the table
	Html                   string                    // text content the table belongs to
	BlockPackage           *SortMap                  // sub-packages
	TableResult            *TableResult              // parent element
	StartAndEndRation      map[string]*TDRationScope // ratio per row or column; truncated spans are counted separately
	StartAndEndRationKSort *SortMap
	WinnerOrder            []map[string]interface{}
	BSplit                 bool                      // whether the table is one of several split from a single table
	BHeader                bool                      // whether the split table has a header
	BrandData              [][]map[string]string     // brand extraction result
	HasKey                 int                       // has key
	HasBrand               int                       // has brand
	HasGoods               int                       // has goods
}

// NewTable creates a table for the selection tab with all maps and slices
// initialised; TableResult is stored as the parent.
func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Table {
	return &Table{
		Html:                   Html,
		SortKV:                 NewSortMap(),
		StandKV:                map[string][]*u.Tag{},
		kvscope:                map[int]map[int][]*TD{},
		kTD:                    map[int]*TD{},
		SonTables:              []*Table{},
		Goquery:                tab,
		TRs:                    []*TR{},
		TableResult:            TableResult,
		StartAndEndRation:      map[string]*TDRationScope{},
		StartAndEndRationKSort: NewSortMap(),
		BlockPackage:           NewSortMap(),
	}
}

// AddTR appends tr to the table and links it with the previous last row.
// Rows without cells are ignored.
func (t *Table) AddTR(tr *TR) {
	if len(tr.TDs) == 0 {
		return
	}
	if n := len(t.TRs); n > 0 {
		tr.TopTR = t.TRs[n-1]
		t.TRs[n-1].BottomTR = tr
	}
	tr.RowPos = len(t.TRs)
	t.TRs = append(t.TRs, tr)
}

// InsertTR inserts tr as the first row and shifts RowPos of the existing
// rows by one. Rows without cells are ignored.
func (t *Table) InsertTR(tr *TR) {
	if len(tr.TDs) == 0 {
		return
	}
	if len(t.TRs) > 0 {
		// Link both directions, as AddTR does; the original left
		// tr.BottomTR unset.
		t.TRs[0].TopTR = tr
		tr.BottomTR = t.TRs[0]
	}
	tr.RowPos = 0
	for _, _tr := range t.TRs {
		_tr.RowPos += 1
	}
	t.TRs = append([]*TR{tr}, t.TRs...)
}

// SortMap is a map that remembers insertion order: Keys holds the keys in
// order, Index the position of each key in Keys and Map the values. AddKey,
// ReplaceKey and RemoveKey take Lock; direct field access does not.
type SortMap struct {
	Index map[string]int
	Keys  []string
	Map   map[string]interface{}
	Lock  sync.Mutex
}

// NewSortMap creates an empty SortMap.
func NewSortMap() *SortMap {
	return &SortMap{
		Index: map[string]int{},
		Keys:  []string{},
		Map:   map[string]interface{}{},
	}
}

// NullVal matches placeholder values.
var NullVal = regexp.MustCompile("^[/无,.。;、附]+$|^详见.{2,8}$|(详?见)?附(件|图)")

// AddKey stores val under key. A new key is appended to Keys; an existing
// key keeps its position and only its value changes.
func (s *SortMap) AddKey(key string, val interface{}) {
	// Disabled value check:
	// if v, ok := val.(string); ok && NullVal.ReplaceAllString(u.TrimLRSpace(v, ""), "") == "" {
	// 	return
	// }
	s.Lock.Lock()
	defer s.Lock.Unlock()
	// Index is consulted as well as Map: a key whose stored value is nil is
	// still a known key and must not be appended to Keys a second time.
	if _, known := s.Index[key]; !known && s.Map[key] == nil {
		s.Index[key] = len(s.Keys)
		s.Keys = append(s.Keys, key)
	}
	s.Map[key] = val
}

// ReplaceKey puts key/val at the position of replacekey and drops
// replacekey. When replacekey is not present the pair is added like AddKey
// does (the original overwrote the first key in that case). When key equals
// replacekey only the value is updated (the original deleted the key's own
// index entry).
//
// NOTE(review): when key already exists at another position it ends up in
// Keys twice, as before; callers are assumed not to do that.
func (s *SortMap) ReplaceKey(key string, val interface{}, replacekey string) {
	s.Lock.Lock()
	defer s.Lock.Unlock()
	v, found := s.Index[replacekey]
	if !found || v < 0 || v >= len(s.Keys) {
		if _, known := s.Index[key]; !known {
			s.Index[key] = len(s.Keys)
			s.Keys = append(s.Keys, key)
		}
		s.Map[key] = val
		return
	}
	if key != replacekey {
		delete(s.Index, replacekey)
		delete(s.Map, replacekey)
		s.Index[key] = v
		s.Keys[v] = key // in place, as the original append chain did
	}
	s.Map[key] = val
}

// RemoveKey deletes key and its value. Removing a key that is not present
// leaves the order untouched (the original dropped the first key instead).
func (s *SortMap) RemoveKey(key string) {
	s.Lock.Lock()
	defer s.Lock.Unlock()
	delete(s.Map, key)
	pos, known := s.Index[key]
	delete(s.Index, key)
	if !known {
		return
	}
	if pos < 0 || pos >= len(s.Keys) || s.Keys[pos] != key {
		// Index is stale; locate the key by scanning.
		pos = -1
		for i, k := range s.Keys {
			if k == key {
				pos = i
				break
			}
		}
		if pos < 0 {
			return
		}
	}
	// Build a fresh slice, as the original did, so that a caller ranging
	// over a previously read Keys slice is not disturbed.
	newkeys := make([]string, 0, len(s.Keys)-1)
	newkeys = append(newkeys, s.Keys[:pos]...)
	newkeys = append(newkeys, s.Keys[pos+1:]...)
	for i := pos; i < len(newkeys); i++ {
		s.Index[newkeys[i]] = i
	}
	s.Keys = newkeys
}

// TableKeyV1 holds the header detection rules of one rule group: a byte-wise
// keyword trie (TMap) and regular expressions with their replacement strings.
type TableKeyV1 struct {
	TMap        map[string]interface{}
	TReg        []*regexp.Regexp
	TRegReplStr []string
}

// THeadStr is the order used for header detection: content, result header,
// normal header.
var THeadStr = []string{
	"con",
	"jghead",
	"normalhead",
}

// TKMaps stores the keyword rules per group; filled lazily by CheckCommon.
var TKMaps = map[string]*TableKeyV1{}

// filterThText removes bracketed parts and every run of characters that are
// not digits, ASCII letters or Han characters.
var filterThText = regexp.MustCompile("([((【\\[].*[))】\\]])|([^0-9a-zA-Z\\p{Han}]+)")

// tLock guards TKMaps.
var tLock = sync.Mutex{}

// CheckCommon decides whether txt (cell content or a table tag) is a header
// and whether the table is useful (tables such as staff lists are not).
// matchStr are keys of the tablev1.json configuration (u.TableK1), tried in
// order; for each key the regular expressions are tried first, then the
// keyword trie.
//
// Results: res reports a match in a group other than "con"; must is set when
// the matching regexp's replacement string is "M"; stype is the matching
// group ("con" when nothing matched); reg and repl are the matching regexp
// and its replacement string. Text of 30 runes or more (after filtering) is
// never matched.
func CheckCommon(txt string, matchStr ...string) (res, must bool, stype, reg, repl string) {
	txt = filterThText.ReplaceAllString(txt, "")
	stype = "con"
	if len([]rune(txt)) >= 30 {
		return
	}
	tLock.Lock()
	defer tLock.Unlock()

	// Build the rules on first use.
	if len(TKMaps) == 0 {
		for k, v := range u.TableK1 {
			tk := &TableKeyV1{
				TMap:        map[string]interface{}{},
				TReg:        []*regexp.Regexp{},
				TRegReplStr: []string{},
			}
			thMap := map[string]interface{}{}
			for _, v1 := range v {
				v1s := strings.Split(v1, "__")
				if len(v1s) == 2 {
					// "pattern__replacement" entries are regexp rules.
					tk.TReg = append(tk.TReg, regexp.MustCompile(v1s[0]))
					tk.TRegReplStr = append(tk.TRegReplStr, v1s[1])
					continue
				}
				// Everything else is a keyword, stored byte by byte.
				key := v1
				nowMap := &thMap
				for i := 0; i < len(key); i++ {
					kc := key[i : i+1]
					next, isNode := (*nowMap)[kc].(*map[string]interface{})
					if !isNode {
						if _, taken := (*nowMap)[kc]; taken {
							// kc collides with the "Y"/"K" bookkeeping
							// entries kept in the same map. The original
							// dereferenced nil here; such a keyword cannot
							// be represented, so it is skipped.
							break
						}
						newMap := map[string]interface{}{}
						newMap["Y"] = "0"
						(*nowMap)[kc] = &newMap
						next = &newMap
					}
					nowMap = next
					if i == len(key)-1 {
						(*nowMap)["Y"] = "1" // end of a keyword
						(*nowMap)["K"] = key
						//(*nowMap)["V"] = v
					}
				}
			}
			tk.TMap = thMap
			TKMaps[k] = tk
		}
	}

	// Regular expressions first, then keyword substring search.
L1:
	for _, v := range matchStr {
		tk := TKMaps[v]
		if tk == nil {
			// No rules configured for this group; the original dereferenced
			// nil here.
			continue
		}
		//u.Debug(v)
		for n, vreg := range tk.TReg {
			if vreg.MatchString(txt) {
				//u.Debug(txt, v, vreg.String())
				reg = vreg.String()
				repl = tk.TRegReplStr[n]
				if v != "con" {
					res = true
					if "M" == repl {
						must = true
					}
				}
				stype = v
				break L1
			}
		}
		// Keyword substring search.
		pos := 0
		thMap := tk.TMap
		nowMap := &thMap
		for i := 0; i < len(txt); i++ {
			word := txt[i : i+1]
			nowMap, _ = (*nowMap)[word].(*map[string]interface{})
			if nowMap != nil {
				// The byte continues a keyword; check whether it ends one.
				if pos == 0 {
					pos = i
				}
				if "1" == qutil.ObjToString((*nowMap)["Y"]) {
					if v != "con" {
						res = true
					}
					stype = v
					pos = 0
					break L1
				}
			} else {
				// Mismatch: restart from the root, resuming after the
				// position where the partial match began.
				nowMap = &thMap
				if pos > 0 {
					i = pos
					pos = 0
				}
			}
		}
	}
	return
}

// CheckHeader validates a header from the cell content using the three rule
// groups of THeadStr configured in tablev1.json (regexp and substring
// search). See CheckCommon for the results.
func CheckHeader(txt string) (res, must bool, stype, reg, repl string) {
	return CheckCommon(txt, THeadStr...)
}

// ComputeConRatio returns the outermost tables found in con and the share of
// the document text (in bytes) that lies inside them.
//
// strtype (1 full text, 2 block text) is not used by this function. When con
// cannot be parsed or contains no table, tabs is nil and ratio is 0. The
// ratio is also 0 when the document has no text, instead of NaN. A panic is
// recovered by qutil.Catch.
func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) {
	defer qutil.Catch()
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(con))
	if err != nil || doc == nil {
		return
	}
	cons := doc.Text()
	tables := doc.Find("table")
	if tables.Size() == 0 {
		return
	}
	tabs = []*goquery.Selection{}
	for i := 0; i < tables.Size(); i++ {
		tmpt := tables.Eq(i)
		// Skip tables nested in a table that was already collected.
		nested := false
		for _, outer := range tabs {
			if outer.Contains(tmpt.Get(0)) {
				nested = true
				break
			}
		}
		if !nested {
			tabs = append(tabs, tmpt)
		}
	}
	tlen := 0
	for _, t := range tabs {
		tlen += len(t.Text())
	}
	if len(cons) > 0 {
		ratio = float32(tlen) / float32(len(cons))
	}
	/**
	if ratio < float32(0.992) {
		// take the text outside the tables
		txt = getTextAfterRemoveTable(con)
	}
	**/
	return
}

// HtmlToText returns the plain text of con with a line feed inserted after
// every <tr>. It returns "" when con cannot be parsed.
func HtmlToText(con string) string {
	doc2, err := goquery.NewDocumentFromReader(strings.NewReader(con))
	if err != nil || doc2 == nil {
		return ""
	}
	//log.Println(doc2.Html())
	doc2.Find("tr").Each(func(i int, selection *goquery.Selection) {
		selection.AfterHtml(string(rune(10)))
	})
	//log.Println(doc2.Html())
	return doc2.Text()
}

// TextAfterRemoveTable returns the text of con without the content of its
// tables. It returns "" when con cannot be parsed.
func TextAfterRemoveTable(con string) string {
	doc2, err := goquery.NewDocumentFromReader(strings.NewReader(con))
	if err != nil || doc2 == nil {
		return ""
	}
	doc2.Find("table").Remove()
	return doc2.Text()
}

// HtmlAfterRemoveTable returns the HTML of con with its tables removed. It
// returns "" when con cannot be parsed.
func HtmlAfterRemoveTable(con string) string {
	doc2, err := goquery.NewDocumentFromReader(strings.NewReader(con))
	if err != nil || doc2 == nil {
		return ""
	}
	doc2.Find("table").Remove()
	html, _ := doc2.Html()
	return html
}

// If returns trueVal when condition holds and falseVal otherwise. Both
// values are evaluated by the caller before the call.
func If(condition bool, trueVal, falseVal interface{}) interface{} {
	if condition {
		return trueVal
	}
	return falseVal
}