12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838 |
- package pretreated
- //定义表格对象
- import (
- "fmt"
- u "jy/util"
- "log"
- qutil "qfw/util"
- "regexp"
- "strings"
- "sync"
- "github.com/PuerkitoBio/goquery"
- )
// TableResult collects everything extracted from the tables of one document
// (or of one block of a document).
// Only the first of all winning-bid candidates is kept.
type TableResult struct {
	Id             interface{}            // document id
	Toptype        string                 // document type
	Itype          int                    // 1 = full text, 2 = block
	BlockTag       string                 // block tag
	Html           string                 // HTML the tables were parsed from
	Tabs           []*Table               // child tables; each holds standardized or raw kv
	GoqueryTabs    []*goquery.Selection   // goquery selections of the tables
	TableSize      int                    // number of child tables: 0, 1 or n
	IsMultiPackage bool                   // whether sub-packages were found
	PackageMap     *SortMap               // SortMap of sub-package objects, including standardized ones
	SortKV         *SortMap               // global kv map, standardized
	SortKVWeight   map[string]int         // weights for the global kv map — presumably keyed like SortKV; confirm
	WinnerOrder    []map[string]interface{} // winner ranking entries
	BrandData      [][]map[string]string  // brand extraction result
	HasKey         int                    // has key
	HasBrand       int                    // has brand
	HasGoods       int                    // has goods
	RuleBlock      *u.RuleBlock           // rule configuration handed on to block/table parsing
}
- //快速创建TableResult对象
- func NewTableResult(Id interface{}, Toptype, BlockTag, con string, Itype int, ruleBlock *u.RuleBlock) *TableResult {
- return &TableResult{
- Id: Id,
- Toptype: Toptype,
- Html: con,
- Itype: Itype,
- BlockTag: BlockTag,
- Tabs: []*Table{},
- GoqueryTabs: []*goquery.Selection{},
- PackageMap: NewSortMap(),
- SortKV: NewSortMap(),
- SortKVWeight: map[string]int{},
- RuleBlock: ruleBlock,
- }
- }
// TD is one table cell (a td node) together with its parsed content and its
// position inside the table grid.
type TD struct {
	Goquery        *goquery.Selection // goquery selection of the cell
	TR             *TR                // owning row
	LeftNode       *TD                // left neighbour
	TopNode        *TD                // upper neighbour
	RightNode      *TD                // right neighbour
	BottomNode     *TD                // lower neighbour
	Val            string             // value (cleaned text)
	Text           string             // raw text
	SortKV         *SortMap           // kv pairs found inside the cell
	Html           string             // inner HTML
	BH             bool               // whether the cell is a header
	MustBH         bool               // header that must not be changed
	StandardKey    string             // standardized header
	Colspan        int                // merged columns
	Rowspan        int                // merged rows
	StartCol       int                // first column
	EndCol         int                // last column
	StartRow       int                // first row
	EndRow         int                // last row
	ColPos         int                // position inside the owning TR
	HeadTd         *TD                // (for a value cell) its key cell
	KVDirect       int                // key-value direction: 0 unknown, 1 horizontal, 2 vertical (direction from key to value)
	KeyDirect      int                // key direction: 1 horizontal, 2 vertical (vertical keys have horizontal values and vice versa)
	SonTds         []*TD              // (for a key cell) its value cells
	SonTableResult *TableResult       // parse result of a table nested in this cell
	ArrVal         []string           // array value, used when the left neighbour is a row-merged cell
	Valtype        string             // e.g. "BO" = winner order
}
// submatchreg matches "<optional numbering><key>:<value>" fragments. It is only
// referenced from commented-out code in the visible part of this file.
// NOTE(review): this is a raw string, so `[\\S]` is a character class holding a
// backslash and the letter S rather than the \S shorthand — confirm the intent.
var submatchreg = regexp.MustCompile(`((?:[一二三四五六七八九十0-10]+[、])([\\S]{4,12})|([\\S]{2,12}))[::]([\\S]{5,60})([一二三四五六七八九]+[、])?`)

// BHKey matches a short punctuation-free label followed by a colon and a
// non-empty remainder; tdIsHb uses it to decide whether a cell with a single
// kv pair should be treated as a header.
var BHKey = regexp.MustCompile(`^[^,,;:。、.]{2,8}.{0,3}[::].+$`)

// dwReg matches a "单位" (unit) label followed by optional separators and a
// money unit; NewTD replaces the whole match with just the captured unit.
var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿元]+)")
- func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
- defer qutil.Catch()
- td := &TD{
- ArrVal: []string{},
- Goquery: Goquery,
- SonTds: []*TD{},
- TR: tr,
- SortKV: NewSortMap(),
- }
- colspan, rowspan := 0, 0
- col, bcol := td.Goquery.Attr("colspan")
- if bcol {
- colspan = qutil.IntAllDef(col, 1)
- }
- if colspan == 0 {
- colspan = 1
- }
- row, brow := td.Goquery.Attr("rowspan")
- if brow {
- rowspan = qutil.IntAllDef(row, 1)
- }
- if rowspan == 0 {
- rowspan = 1
- }
- td.Colspan, td.Rowspan = colspan, rowspan //合并列,合并行
- td.Html, _ = td.Goquery.Html() //html值
- ht := td.Goquery.ChildrenFiltered("table") //获取td的table
- bsontable := false //默认td中没有table
- txt := ""
- //子table处理合并
- if ht.Size() > 0 {
- //qutil.Debug("有子表格")
- txt = TextAfterRemoveTable(td.Html)
- td.tdHasTable(&bsontable, tr, table) //处理td中的table,块标签处理,子表解析集处理
- } else {
- txt = strings.TrimSpace(td.Goquery.Text())
- }
- text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
- td.Val = text //值
- td.Text = txt //原始串
- //对td单元格值判断是否是表头和根据td内容长度进行分块处理
- td.tdIsHb(tr, table, bsontable)
- bhead := false
- if td.TR.RowPos == 0 { //第一行
- if td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头
- bhead = true
- }
- }
- if bhead && !bsontable {
- td.BH = true
- td.KeyDirect = 1 //k方向,k纵值横,k横值纵 1横 2纵
- td.KVDirect = 2 //键-值方向,0未知,1横 2纵//指值和k的方向
- }
- //u.Debug(td.BH, td.Val)
- return td
- }
// tdHasTable handles a table nested inside the cell: it determines a block tag
// for the nested table, parses it through AnalyTableV2 and merges the child
// result (key/goods/brand flags, brand data, winner order, packages) into the
// TableResult / Table that owns this cell.
//
// bsontable is set to true as soon as ComputeConRatio finds at least one table
// in the cell HTML. tr is the row currently being built (the cell has not been
// appended to it yet); table supplies the RuleBlock for the recursive parse.
//
// NOTE(review): the child-table analysis sits inside the `stag == ""` branch,
// so it is skipped whenever the owning TableResult already has a BlockTag —
// confirm this is intended.
func (td *TD) tdHasTable(bsontable *bool, tr *TR, table *Table) {
	ts := td.TR.Table.TableResult
	tabs, _ := ComputeConRatio(td.Html, 2) // top-level tables inside the cell (ratio unused)
	if len(tabs) > 0 {
		(*bsontable) = true
		stag := ts.BlockTag // block tag
		if stag == "" {
			var tdleft *TD
			if len(tr.TDs) > 0 {
				// Use the value of the cell to the left when it is a header.
				tdleft = tr.TDs[len(tr.TDs)-1]
				if tdleft.BH {
					//u.Debug(tdleft.Val); otherwise fall back to the previous row
					stag = tdleft.Val
				}
			} else if len(tr.Table.TRs) > 0 {
				// First cell of the row: use the concatenated values of the
				// previous row, unless that exceeds 14 runes.
				lasttr := tr.Table.TRs[len(tr.Table.TRs)-1]
				str := ""
				for _, td3 := range lasttr.TDs {
					str += td3.Val
					if len([]rune(str)) > 14 {
						str = ""
						break
					}
				}
				stag = str
			}
			sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) // recursive entry into table parsing
			td.BH = false
			td.SonTableResult = sonts
			//for _, k := range sonts.SortKV.Keys {
			//u.Debug(k, sonts.SortKV.Map[k])
			//	td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
			//	td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
			//}
			// Propagate brand information from the child table.
			//fmt.Println("sonsHasKey=============", sonts.HasKey)
			//fmt.Println("sonsHasGoods========", sonts.HasGoods)
			//fmt.Println("sonsHasBrand========", sonts.HasBrand)
			if sonts.HasKey != 0 {
				td.TR.Table.TableResult.HasKey = sonts.HasKey
			}
			if sonts.HasGoods != 0 {
				td.TR.Table.TableResult.HasGoods = sonts.HasGoods
			}
			if sonts.HasBrand != 0 {
				td.TR.Table.TableResult.HasBrand = sonts.HasBrand
			}
			if sonts.BrandData != nil && len(sonts.BrandData) > 0 { // child table
				for _, v := range sonts.BrandData {
					if len(v) > 0 {
						td.TR.Table.TableResult.BrandData = append(td.TR.Table.TableResult.BrandData, v)
					}
				}
			}
			if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
				td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
			}
			if sonts.IsMultiPackage {
				td.TR.Table.BPackage = true
				tb1 := td.TR.Table.BlockPackage
				for k, v := range sonts.PackageMap.Map {
					v1 := v.(*u.BlockPackage)
					if tb1.Map[k] == nil {
						// Package not known to the parent table yet: adopt it.
						tb1.AddKey(k, v)
					} else {
						// Package already known: only fill in kv entries that are still empty.
						// NOTE(review): bp.TableKV itself is not nil-checked before bp.TableKV.Kv is read.
						bp := tb1.Map[k].(*u.BlockPackage)
						if bp != nil && v1.TableKV != nil && v1.TableKV.Kv != nil {
							for k2, v2 := range v1.TableKV.Kv {
								if bp.TableKV.Kv != nil && bp.TableKV.Kv[k2] == "" {
									bp.TableKV.Kv[k2] = v2
									bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
								}
							}
						}
					}
				}
				//u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
			}
		}
	}
}
// tdIsHb decides whether the cell is a header and, depending on the length of
// the cleaned cell value, extracts kv pairs / packages from it:
//
//   - more than 50 runes: the raw text is divided into blocks, colon/space kv
//     pairs of the blocks are stored in td.SortKV, and packages found in the
//     blocks (or in the value) are merged into table.TableResult.PackageMap;
//   - fewer than 50 runes: colon kv pairs are extracted and the cell may be
//     flagged as a header (td.BH / td.MustBH / td.Valtype);
//   - otherwise, when no blocks were produced: kv pairs are taken from FindKv
//     plus the colon-kv extractor.
//
// tr is the row being built (the cell is not yet part of tr.TDs, so the last
// element of tr.TDs is the left neighbour). bsontable tells whether the cell
// contains a nested table.
func (td *TD) tdIsHb(tr *TR, table *Table, bsontable bool) {
	lenval := len([]rune(td.Val)) // rune length of the cleaned cell value
	//if lentxt > 9 {
	//td.KV = GetKVAll(txt, "")
	ub := []*u.Block{}
	// Cleaned value longer than 50 runes: divide into blocks and look for packages.
	if lenval > 50 { // check whether the text can be divided into blocks
		//u.Debug(txt)
		ub, _ = DivideBlock(td.Text, 2, table.TableResult.RuleBlock) // operates on the raw cell text
		// Were any blocks produced?
		if len(ub) > 0 {
			colonKvWeight := map[string]int{}
			spaceKvWeight := map[string]int{}
			for _, bl := range ub {
				// Colon kv: keep a value when the key is new or its weight is at least the recorded one.
				for bl_ck, bl_cv := range bl.ColonKV.Kv {
					if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
						colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
						td.SortKV.AddKey(bl_ck, bl_cv)
					}
				}
				// Space kv: same rule, tracked with its own weight table.
				for bl_sk, bl_sv := range bl.SpaceKV.Kv {
					if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
						spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
						td.SortKV.AddKey(bl_sk, bl_sv)
					}
				}
			}
		}
		//
		blockPackage := map[string]*u.BlockPackage{}
		isFindPkg := true
		/*if td.ColPos-1 >= 0 && excludeKey.MatchString(tr.TDs[td.ColPos-1].Text) {
			isFindPkg = false
		} else if len(tr.TDs) > 0 {
			tdleft = tr.TDs[len(tr.TDs)-1]
			if tdleft.BH && excludeKey.MatchString(tr.TDs[td.ColPos-1].Text) {
				isFindPkg = false
			}
		}*/
		// Skip the package search when the left neighbour is a header matching excludeKey.
		if len(tr.TDs) > 0 {
			tdleft := tr.TDs[len(tr.TDs)-1]
			if tdleft.BH && excludeKey.MatchString(tdleft.Text) { // pattern quoted by the original author: (涉及包号|包件号?|项目标号|规格|型号|招标范围|业绩|废标)|(^编号$)|([^包段标]编号)
				isFindPkg = false
			}
		}
		if isFindPkg {
			if len(ub) > 0 {
				blockPackage = FindPackageFromBlocks(&ub, "") // look for packages in the blocks
			} else {
				blockPackage = FindPackageFromText("", td.Val) // look for packages in the plain value
			}
		}
		if len(blockPackage) > 0 {
			table.BPackage = true
			for bp_k, bp_v := range blockPackage {
				var bp *u.BlockPackage
				if table.TableResult.PackageMap.Map[bp_k] == nil {
					bp = bp_v
				} else {
					// Package already recorded: append the new text to it.
					bp = table.TableResult.PackageMap.Map[bp_k].(*u.BlockPackage)
					bp.Text += "\n" + bp_v.Text
				}
				if bp.TableKV == nil {
					bp.TableKV = u.NewJobKv()
				}
				// Existing TableKV entries win; only empty slots are filled.
				for k2, v2 := range bp_v.ColonKV.Kv {
					if bp.TableKV.Kv[k2] == "" {
						bp.TableKV.Kv[k2] = v2
					}
				}
				for k2, v2 := range bp_v.SpaceKV.Kv {
					if bp.TableKV.Kv[k2] == "" {
						bp.TableKV.Kv[k2] = v2
					}
				}
				// Written straight into Map, so PackageMap.Keys/Index are not updated here.
				table.TableResult.PackageMap.Map[bp_k] = bp
			}
		}
	}
	// Cleaned value shorter than 50 runes: colon kv and header detection.
	if lenval < 50 {
		// td.SortKV = FindKv(text, "")
		kvTitle := ""
		if len(td.TR.TDs) > 0 {
			kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
		}
		/*
			Example cell handled below:
			预算总价
			(人民币:元)
		*/
		if td.Text != "" && strings.Contains(td.Text, "预算总价") && (strings.Contains(td.Text, "(") || strings.Contains(td.Text, "(")) {
			tagindex := 0
			if tagindex = strings.Index(td.Text, "("); tagindex <= 0 {
				tagindex = strings.Index(td.Text, "(")
			}
			// Text before the bracket becomes the key, the bracketed remainder the value.
			td.SortKV.AddKey(strings.TrimSpace(td.Text[:tagindex]), strings.TrimSpace(td.Text[tagindex:])) // store the kv pair
			td.BH = true
		}
		_, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3) // colon kv of the cell
		for k, v := range resm {
			td.SortKV.AddKey(k, v) // store the kv pair
		}
		//u.Debug(td.SortKV.Keys, "-------2--------------------------------")
		// td.SortKV = FindKv(text, "") //GetKvFromtxt(text, "")
		//resm := GetKVAll(text, "")
		if len(td.SortKV.Keys) > 0 {
			//td.KVDirect = 3 // neither header nor value, ignore
			// A single "key:value" cell is turned into a header carrying just the key.
			if len(td.SortKV.Keys) == 1 && BHKey.MatchString(td.Val) && !MultiReg.MatchString(td.Val) {
				td.Val = td.SortKV.Keys[0]
				td.BH = true
			}
		} else if !bsontable {
			// No kv pairs and no nested table: check the text against the header rules.
			txt := repSpace.ReplaceAllString(td.Val, "")
			btw, must, _, _, repl := CheckHeader(txt)
			if lenval > 15 {
				btw = false
			}
			if strings.Contains(td.Val, "个项目") {
				must = false
				btw = false
			}
			td.Valtype = repl
			td.MustBH = must
			td.BH = btw
		}
	} else if len(ub) == 0 {
		// Original note: this condition was added later; it relates to the kv
		// produced by block division being overwritten.
		//u.Debug("----\n\n\n", txt, "\n\n\n----")
		//u.Debug(GetKVAll(txt, ""))
		/*
			subVal := submatchreg.FindAllStringSubmatch(txt, -1)
			if len(subVal) > 0 {
				for _, subv1 := range subVal {
					if len(subv1) == 6 {
						tr.Table.SortKV.AddKey(If(subv1[2] == "", subv1[3], subv1[2]).(string), subv1[4])
						//tr.Table.SortKV.AddKey(subv1[1], subv1[2])
					}
				}
			}
		*/
		td.SortKV = FindKv(td.Val, "", 2)
		// td.LeftNode.Val
		// for _, vvv := range *td.TR {
		// u.Debug(">>>>>")
		// }
		kvTitle := ""
		if len(td.TR.TDs) > 0 {
			kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
		}
		_, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 2) // colon kv entry point
		for k, v := range resm {
			td.SortKV.AddKey(k, v)
		}
	}
}
- func (t *Table) Print() {
- for row, trs := range t.TRs {
- for col, td := range trs.TDs {
- log.Println(row, col, td.Val, td.BH, td.SortKV.Map)
- }
- }
- }
// TR is one table row together with its position and span bookkeeping.
type TR struct {
	TDs      []*TD  // cells of the row
	TopTR    *TR    // row above
	BottomTR *TR    // row below
	Table    *Table // owning table
	RowPos   int    // index of this row inside the table
	//----- computed values
	MaxRow       int  // largest row span: Max(td.StartRow-td.EndRow)
	MinRow       int  // smallest row span
	StartRow     int  // first row
	EndRow       int  // last row
	MaxCol       int  // largest column
	MinCol       int  // smallest column
	StartCol     int  // first column
	EndCol       int  // last column
	BDiffSpanRow bool // start row: whether the row contains differing row spans  - - - = -
	BDiffSpanCol bool // start column: whether the column contains differing column spans |
}
- func NewTR(Table *Table) *TR {
- return &TR{
- TDs: []*TD{},
- Table: Table,
- }
- }
- func (tr *TR) AddTD(td *TD) {
- /**对跨行没有意义
- if len(tr.TDs) > 0 {
- td.LeftNode = tr.TDs[len(tr.TDs)-1]
- tr.TDs[len(tr.TDs)-1].RightNode = td
- }
- **/
- td.ColPos = len(tr.TDs)
- tr.TDs = append(tr.TDs, td)
- }
/*-- START --- header probability handling -------*/

// pos is an inclusive index range. TDRationScope.Addtd creates it from a cell's
// start (Min) and end (Max) position and extends Max as adjacent cells join.
type pos struct {
	Max int // upper bound of the range
	Min int // lower bound of the range
}
// TDRationScope groups cells into contiguous ranges and keeps one ratio per
// range. NewTDRationScope builds it positionally, so the field order matters.
type TDRationScope struct {
	Rationmap map[*pos]float32 // ratio recorded for each range
	Tdmap     map[*pos][]*TD   // cells that belong to each range
	Poss      []*pos           // all ranges, in creation order
	Parentkey string           // key of the scope; a leading "r" makes GetPos/Addtd work on column ranges instead of row ranges
}
- func NewTDRationScope(key string) *TDRationScope {
- return &TDRationScope{map[*pos]float32{}, map[*pos][]*TD{}, []*pos{}, key}
- }
- func (tdr *TDRationScope) GetPos(td *TD) (poss *pos) {
- k1 := tdr.Parentkey[:1]
- m1, m2 := td.StartRow, td.EndRow
- if k1 == "r" {
- m1, m2 = td.StartCol, td.EndCol
- }
- for _, v := range tdr.Poss {
- if v.Max >= m2 && v.Min <= m1 {
- poss = v
- return
- }
- }
- return
- }
- func (tdr *TDRationScope) GetTDRation(td *TD) (ration float32, tds []*TD) {
- poss := tdr.GetPos(td)
- if poss != nil {
- ration = tdr.Rationmap[poss]
- tds = tdr.Tdmap[poss]
- }
- return
- }
- func (tdr *TDRationScope) Addtd(td *TD) {
- k1 := tdr.Parentkey[:1]
- m1, m2 := td.StartRow, td.EndRow
- if k1 == "r" {
- m1, m2 = td.StartCol, td.EndCol
- }
- bfind := false
- for _, v := range tdr.Poss {
- if m1 == v.Max+1 { //找到
- bfind = true
- v.Max = m2
- tdr.Tdmap[v] = append(tdr.Tdmap[v], td)
- break
- }
- }
- if !bfind {
- pos1 := &pos{m2, m1}
- tdr.Tdmap[pos1] = []*TD{td}
- tdr.Poss = append(tdr.Poss, pos1)
- }
- }
- /*-- END --- 处理表头概率 -------*/
// Table is one parsed HTML table.
type Table struct {
	Brule                  bool                      // whether the table is regular
	TRs                    []*TR                     // rows
	BFirstRow              bool
	RowNum                 int                       // number of rows
	ColNum                 int                       // number of columns
	TDNum                  int                       // number of cells
	BPackage               bool                      // whether packages were found
	SortKV                 *SortMap                  // ordered kv values
	StandKV                map[string]string         // filtered, standardized kv
	StandKVWeight          map[string]int            // weights of the filtered, standardized kv — presumably keyed like StandKV; confirm
	StandRuleKV            map[string]string         // filtered rule kv
	kvscope                map[int]map[int][]*TD     // for the n-th sort key, the end position of its m-th value
	kTD                    map[int]*TD               // key cell looked up by index
	SonTables              []*Table                  // child tables
	Tag                    string                    // tag of the table
	Desc                   string                    // description of the table
	Goquery                *goquery.Selection        // goquery selection of the table
	Html                   string                    // text content the table belongs to
	BlockPackage           *SortMap                  // sub-packages
	TableResult            *TableResult              // parent result
	StartAndEndRation      map[string]*TDRationScope // header probability per row/column; truncated runs are counted separately
	StartAndEndRationKSort *SortMap
	WinnerOrder            []map[string]interface{}
	BSplit                 bool                      // whether this table was produced by splitting one table into several
	BHeader                bool                      // whether the split table has a header
	BrandData              [][]map[string]string     // brand extraction result
	HasKey                 int                       // has key
	HasBrand               int                       // has brand
	HasGoods               int                       // has goods
}
- func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Table {
- return &Table{
- Html: Html,
- SortKV: NewSortMap(),
- StandKV: map[string]string{},
- StandKVWeight: map[string]int{},
- kvscope: map[int]map[int][]*TD{},
- kTD: map[int]*TD{},
- SonTables: []*Table{},
- Goquery: tab,
- TRs: []*TR{},
- TableResult: TableResult,
- StartAndEndRation: map[string]*TDRationScope{},
- StartAndEndRationKSort: NewSortMap(),
- BlockPackage: NewSortMap(),
- }
- }
- func (t *Table) AddTR(tr *TR) {
- if len(tr.TDs) > 0 {
- if len(t.TRs) > 0 {
- tr.TopTR = t.TRs[len(t.TRs)-1]
- t.TRs[len(t.TRs)-1].BottomTR = tr
- }
- tr.RowPos = len(t.TRs)
- t.TRs = append(t.TRs, tr)
- }
- }
- func (t *Table) InsertTR(tr *TR) {
- if len(tr.TDs) > 0 {
- if len(t.TRs) > 0 {
- t.TRs[0].TopTR = tr
- }
- tr.RowPos = 0
- for _, _tr := range t.TRs {
- _tr.RowPos += 1
- }
- t.TRs = append([]*TR{tr}, t.TRs...)
- }
- }
// SortMap is a map that also remembers the order in which keys were added.
//
// Keys holds the keys in order, Index maps a key to its position in Keys and
// Map holds the values. The methods below keep the three in sync under Lock;
// code that writes to the fields directly is responsible for consistency.
type SortMap struct {
	Index map[string]int
	Keys  []string
	Map   map[string]interface{}
	Lock  sync.Mutex
}

// NewSortMap creates an empty, ready-to-use SortMap.
func NewSortMap() *SortMap {
	return &SortMap{
		Index: map[string]int{},
		Keys:  []string{},
		Map:   map[string]interface{}{},
	}
}

// NullVal matches values that carry no information (punctuation-only strings
// or references to an attachment). It is not applied by AddKey at the moment;
// the filter there is disabled.
var NullVal = regexp.MustCompile("^[/无,.。;、附]+$|^详见.{2,8}$|(详?见)?附(件|图)")

// appendKeyLocked registers key at the end of the order. Caller holds Lock.
func (s *SortMap) appendKeyLocked(key string) {
	s.Index[key] = len(s.Keys)
	s.Keys = append(s.Keys, key)
}

// dropKeyLocked removes key from Keys/Index (not from Map) and renumbers the
// keys behind it. It does nothing when key is not indexed. Caller holds Lock.
func (s *SortMap) dropKeyLocked(key string) {
	at, ok := s.Index[key]
	if !ok {
		return
	}
	delete(s.Index, key)
	if at < 0 || at >= len(s.Keys) {
		return
	}
	// Build a fresh slice so earlier copies of Keys are not modified.
	kept := make([]string, 0, len(s.Keys)-1)
	kept = append(kept, s.Keys[:at]...)
	for _, k := range s.Keys[at+1:] {
		s.Index[k] = len(kept)
		kept = append(kept, k)
	}
	s.Keys = kept
}

// AddKey stores val under key. A key that is new is appended to the order; an
// existing key keeps its position and only has its value replaced.
//
// A key counts as existing when it is already indexed (so re-adding a key
// whose value is nil does not duplicate it in Keys) or when Map already holds
// a non-nil value for it.
func (s *SortMap) AddKey(key string, val interface{}) {
	// Disabled filter for empty values:
	// if v, ok := val.(string); ok && NullVal.ReplaceAllString(u.TrimLRSpace(v, ""), "") == "" {
	// 	return
	// }
	s.Lock.Lock()
	defer s.Lock.Unlock()
	if _, indexed := s.Index[key]; !indexed && s.Map[key] == nil {
		s.appendKeyLocked(key)
	}
	s.Map[key] = val
}

// ReplaceKey renames replacekey to key, keeping its position in the order, and
// stores val under key.
//
//   - If replacekey is not indexed, key is simply added (appended when new)
//     instead of overwriting the first key or panicking on an empty map.
//   - If key == replacekey only the value is updated.
//   - If key is already present elsewhere, its old slot is removed so Keys
//     never contains the same key twice.
func (s *SortMap) ReplaceKey(key string, val interface{}, replacekey string) {
	s.Lock.Lock()
	defer s.Lock.Unlock()
	if at, found := s.Index[replacekey]; found && key != replacekey {
		if _, dup := s.Index[key]; dup {
			s.dropKeyLocked(key)
			at = s.Index[replacekey] // position may have shifted
		}
		s.Keys[at] = key
		s.Index[key] = at
		delete(s.Index, replacekey)
	} else if _, tracked := s.Index[key]; !tracked {
		s.appendKeyLocked(key)
	}
	if key != replacekey {
		delete(s.Map, replacekey)
	}
	s.Map[key] = val
}

// RemoveKey deletes key and its value. Removing a key that is not present
// leaves the order untouched.
func (s *SortMap) RemoveKey(key string) {
	s.Lock.Lock()
	defer s.Lock.Unlock()
	delete(s.Map, key)
	s.dropKeyLocked(key)
}
// TableKeyV1 holds the rules used to decide whether a piece of text is a table
// header. CheckCommon builds it positionally, so the field order matters.
type TableKeyV1 struct {
	TMap        map[string]interface{} // byte-wise trie of keyword rules
	TReg        []*regexp.Regexp       // regular-expression rules
	TRegReplStr []string               // tag paired with each entry of TReg (same index)
}
// THeadStr is the order in which the rule groups are tried when checking a
// header: body text, result header, normal header.
var THeadStr = []string{
	"con",
	"jghead",
	"normalhead",
}

// TKMaps stores the compiled keyword/regex rules per rule group. It is filled
// lazily by CheckCommon.
var TKMaps = map[string]*TableKeyV1{}

// filterThText removes bracketed segments and every run of characters that is
// not a digit, an ASCII letter or a Han character.
var filterThText = regexp.MustCompile("([((【\\[].*[))】\\]])|([^0-9a-zA-Z\\p{Han}]+)")

// tLock is taken by CheckCommon around the initialisation and use of TKMaps.
var tLock = sync.Mutex{}
// CheckCommon checks txt (table content or a table tag) against the rule
// groups named in matchStr; per the original note these are keys of
// tablev1.json (loaded as u.TableK1). It is used to decide whether a cell is a
// header and whether a table is useful at all (e.g. staff listings are not).
//
// txt is first stripped with filterThText. Only texts shorter than 30 runes
// are checked; longer ones return the zero results with stype "con".
//
// For each group, in order, the regex rules are tried first and then the
// keyword trie; the first hit ends the search.
//
// Returns:
//
//	res   - true when a group other than "con" matched
//	must  - true when the matching regex rule carries the tag "M"
//	stype - name of the matching group ("con" when nothing matched)
//	reg   - source of the matching regex (empty for keyword hits)
//	repl  - tag attached to the matching regex rule
//
// NOTE(review): TKMaps[v] is dereferenced without a nil check, so a name in
// matchStr that is missing from u.TableK1 would panic — confirm callers only
// pass known groups.
func CheckCommon(txt string, matchStr ...string) (res, must bool, stype, reg, repl string) {
	txt = filterThText.ReplaceAllString(txt, "")
	stype = "con"
	if len([]rune(txt)) < 30 {
		// The lock is held for the whole check, not only for initialisation.
		tLock.Lock()
		defer tLock.Unlock()
		// Lazily compile the rules on first use.
		if len(TKMaps) == 0 {
			for k, v := range u.TableK1 {
				tk := &TableKeyV1{
					map[string]interface{}{},
					[]*regexp.Regexp{},
					[]string{},
				}
				thMap := map[string]interface{}{}
				for _, v1 := range v {
					// "pattern__tag" entries are regex rules, everything else is a keyword.
					v1s := strings.Split(v1, "__")
					if len(v1s) == 2 {
						tk.TReg = append(tk.TReg, regexp.MustCompile(v1s[0]))
						tk.TRegReplStr = append(tk.TRegReplStr, v1s[1])
					} else {
						// Insert the keyword into the trie one byte at a time.
						key := v1
						nowMap := &thMap
						for i := 0; i < len(key); i++ {
							kc := key[i : i+1]
							if v, ok := (*nowMap)[kc]; ok {
								nowMap, _ = v.(*map[string]interface{})
							} else {
								newMap := map[string]interface{}{}
								newMap["Y"] = "0"
								(*nowMap)[kc] = &newMap
								nowMap = &newMap
							}
							// Mark the node of the last byte as the end of a keyword.
							if i == len(key)-1 {
								(*nowMap)["Y"] = "1"
								(*nowMap)["K"] = key
								//(*nowMap)["V"] = v
							}
						}
					}
				}
				tk.TMap = thMap
				TKMaps[k] = tk
			}
		}
		// Regex rules first, then keyword substring search.
	L1:
		for _, v := range matchStr {
			//u.Debug(v)
			for n, vreg := range TKMaps[v].TReg {
				if vreg.MatchString(txt) {
					//u.Debug(txt, v, vreg.String())
					reg = vreg.String()
					repl = TKMaps[v].TRegReplStr[n]
					if v != "con" {
						res = true
						if "M" == repl {
							must = true
						}
					}
					stype = v
					break L1
				}
			}
			// Keyword substring search through the trie, byte by byte.
			pos := 0
			thMap := TKMaps[v].TMap
			nowMap := &thMap
			for i := 0; i < len(txt); i++ {
				word := txt[i : i+1]
				nowMap, _ = (*nowMap)[word].(*map[string]interface{})
				if nowMap != nil { // byte found: check whether a keyword ends here
					if pos == 0 {
						pos = i
					}
					if "1" == qutil.ObjToString((*nowMap)["Y"]) {
						if v != "con" {
							res = true
						}
						stype = v
						pos = 0
						break L1
					}
				} else {
					// Mismatch: restart from the trie root and rewind to the
					// recorded start of the partial match.
					nowMap = &thMap
					if pos > 0 {
						i = pos
						pos = 0
					}
				}
			}
		}
		return
	} else {
		return
	}
}
- //根据td中的内容验证表头,根据tablev1.json中配置的三种规则(含正则和子串查找算法)
- func CheckHeader(txt string) (res, must bool, stype, reg, repl string) {
- return CheckCommon(txt, THeadStr...)
- }
- /**
- 计算表格占比,返回表格数组、占比
- con 文本
- strtype 1全文 2块文本
- **/
- func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) {
- defer qutil.Catch()
- doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
- cons := doc.Text()
- tables := doc.Find("table")
- doc = nil
- if tables.Size() > 0 {
- tabs = []*goquery.Selection{}
- for i := 0; i < tables.Size(); i++ {
- tmpt := tables.Eq(i)
- b := false
- for j := 0; j < len(tabs); j++ {
- if tabs[j].Contains(tmpt.Get(0)) {
- b = true
- }
- }
- if !b {
- tabs = append(tabs, tmpt)
- }
- }
- tlen := 0
- for _, t := range tabs {
- tlen += len(t.Text())
- }
- ratio = float32(tlen) / float32(len(cons))
- }
- /**
- if ratio < float32(0.992) {
- //取出排除表格之外的文本
- txt =getTextAfterRemoveTable(con)
- }
- **/
- return
- }
- //取出排除表格之外的文本
- func TextAfterRemoveTable(con string) string {
- doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
- doc2.Find("table").Remove()
- return doc2.Text()
- }
- func HtmlAfterRemoveTable(con string) string {
- doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
- doc2.Find("table").Remove()
- html, _ := doc2.Html()
- return html
- }
// If is a ternary helper: it yields trueVal when condition holds and falseVal
// otherwise. Both values are evaluated by the caller before the call.
func If(condition bool, trueVal, falseVal interface{}) interface{} {
	chosen := falseVal
	if condition {
		chosen = trueVal
	}
	return chosen
}
|