tablev2.go 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777
  1. package pretreated
  2. //定义表格对象
  3. import (
  4. "fmt"
  5. u "jy/util"
  6. qutil "qfw/util"
  7. "regexp"
  8. "strings"
  9. "sync"
  10. "github.com/PuerkitoBio/goquery"
  11. )
  12. //所有中标候选人只取第一个
  13. type TableResult struct {
  14. Id interface{} //信息id
  15. Toptype string //信息类型
  16. Itype int //1全文 2是块
  17. BlockTag string //块标签
  18. Html string
  19. Tabs []*Table //子表集合,子表中包含标准化kv或原始kv
  20. GoqueryTabs []*goquery.Selection //goquery对象
  21. TableSize int //子表的个数0,1,n
  22. IsMultiPackage bool //是否有子包
  23. PackageMap *SortMap //子包对象的sortmap,含标准化过的
  24. SortKV *SortMap //全局KVmap值,标准化处理过的
  25. SortKVWeight map[string]int //全局KVmap值,标准化处理过的
  26. WinnerOrder []map[string]interface{}
  27. }
  28. //快速创建TableResult对象
  29. func NewTableResult(Id interface{}, Toptype, BlockTag, con string, Itype int) *TableResult {
  30. return &TableResult{
  31. Id: Id,
  32. Toptype: Toptype,
  33. Html: con,
  34. Itype: Itype,
  35. BlockTag: BlockTag,
  36. Tabs: []*Table{},
  37. GoqueryTabs: []*goquery.Selection{},
  38. PackageMap: NewSortMap(),
  39. SortKV: NewSortMap(),
  40. SortKVWeight: map[string]int{},
  41. }
  42. }
  43. //td节点
  44. type TD struct {
  45. Goquery *goquery.Selection //文本对象
  46. TR *TR //所属TR对象
  47. LeftNode *TD //左临节点
  48. TopNode *TD //上临节点
  49. RightNode *TD //右节点
  50. BottomNode *TD //下节点
  51. Val string //值
  52. Text string //原始串
  53. SortKV *SortMap //存放kv值
  54. Html string
  55. BH bool //是否是表头
  56. MustBH bool //不能修改的表头
  57. StandardKey string //标准表头
  58. Colspan int //合并列
  59. Rowspan int //合并行
  60. StartCol int //起始列
  61. EndCol int //终止列
  62. StartRow int //起始行
  63. EndRow int //终止行
  64. ColPos int //当前在TR中的位置
  65. HeadTd *TD //(是val元素)k节点
  66. KVDirect int //键-值方向,0未知,1横 2纵//指值和k的方向
  67. KeyDirect int //k方向,k纵值横,k横值纵 1横 2纵
  68. SonTds []*TD //(是key元素)值节点数组
  69. SonTableResult *TableResult //子值表格集
  70. ArrVal []string //数组值,当是左临元素是合并行的元素时!
  71. Valtype string //"BO=中标人顺序"
  72. }
  73. var submatchreg = regexp.MustCompile(`((?:[一二三四五六七八九十0-10]+[、])([\\S]{4,12})|([\\S]{2,12}))[::]([\\S]{5,60})([一二三四五六七八九]+[、])?`)
  74. var BHKey = regexp.MustCompile(`^[^,,;:。、.]{2,8}.{0,3}[::].+$`)
  75. var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿元]+)")
  76. func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
  77. defer qutil.Catch()
  78. td := &TD{
  79. ArrVal: []string{},
  80. Goquery: Goquery,
  81. SonTds: []*TD{},
  82. TR: tr,
  83. SortKV: NewSortMap(),
  84. }
  85. colspan, rowspan := 0, 0
  86. col, bcol := td.Goquery.Attr("colspan")
  87. if bcol {
  88. colspan = qutil.IntAllDef(col, 1)
  89. }
  90. if colspan == 0 {
  91. colspan = 1
  92. }
  93. row, brow := td.Goquery.Attr("rowspan")
  94. if brow {
  95. rowspan = qutil.IntAllDef(row, 1)
  96. }
  97. if rowspan == 0 {
  98. rowspan = 1
  99. }
  100. td.Colspan, td.Rowspan = colspan, rowspan
  101. td.Html, _ = td.Goquery.Html()
  102. ht := td.Goquery.ChildrenFiltered("table")
  103. bsontable := false
  104. txt := ""
  105. if ht.Size() > 0 {
  106. txt = TextAfterRemoveTable(td.Html)
  107. ts := td.TR.Table.TableResult
  108. tabs, _ := ComputeConRatio(td.Html, 2)
  109. if len(tabs) > 0 {
  110. bsontable = true
  111. stag := ts.BlockTag
  112. if stag == "" {
  113. var tdleft *TD
  114. if len(tr.TDs) > 0 {
  115. tdleft = tr.TDs[len(tr.TDs)-1]
  116. if tdleft.BH {
  117. //u.Debug(tdleft.Val),如果不存在就是上一行的
  118. stag = tdleft.Val
  119. }
  120. } else if len(tr.Table.TRs) > 0 {
  121. lasttr := tr.Table.TRs[len(tr.Table.TRs)-1]
  122. str := ""
  123. for _, td3 := range lasttr.TDs {
  124. str += td3.Val
  125. if len([]rune(str)) > 14 {
  126. str = ""
  127. break
  128. }
  129. }
  130. stag = str
  131. }
  132. }
  133. sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id)
  134. td.BH = false
  135. td.SonTableResult = sonts
  136. //for _, k := range sonts.SortKV.Keys {
  137. //u.Debug(k, sonts.SortKV.Map[k])
  138. // td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
  139. // td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
  140. //}
  141. if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
  142. td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
  143. }
  144. if sonts.IsMultiPackage {
  145. td.TR.Table.BPackage = true
  146. tb1 := td.TR.Table.BlockPackage
  147. for k, v := range sonts.PackageMap.Map {
  148. v1 := v.(*u.BlockPackage)
  149. if tb1.Map[k] == nil {
  150. tb1.AddKey(k, v)
  151. } else {
  152. bp := tb1.Map[k].(*u.BlockPackage)
  153. if v1.TableKV != nil && v1.TableKV.Kv != nil {
  154. for k2, v2 := range v1.TableKV.Kv {
  155. if bp.TableKV.Kv[k2] == "" {
  156. bp.TableKV.Kv[k2] = v2
  157. bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
  158. }
  159. }
  160. }
  161. }
  162. }
  163. //u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
  164. }
  165. }
  166. } else {
  167. txt = td.Goquery.Text()
  168. }
  169. text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
  170. //u.Debug(txt, text)
  171. td.Val = text
  172. td.Text = txt
  173. //对td单元格值判断是否是key
  174. lentxt := len([]rune(text))
  175. //if lentxt > 9 {
  176. //td.KV = GetKVAll(txt, "")
  177. ub := []*u.Block{}
  178. if lentxt > 50 { //看是否划块
  179. //u.Debug(txt)
  180. ub, _ = DivideBlock(txt, 2)
  181. if len(ub) > 0 {
  182. colonKvWeight := map[string]int{}
  183. spaceKvWeight := map[string]int{}
  184. for _, bl := range ub {
  185. //冒号kv
  186. for bl_ck, bl_cv := range bl.ColonKV.Kv {
  187. //u.Debug(bl_ck, bl_cv)
  188. if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
  189. colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
  190. td.SortKV.AddKey(bl_ck, bl_cv)
  191. }
  192. }
  193. //空格kv
  194. for bl_sk, bl_sv := range bl.SpaceKV.Kv {
  195. if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
  196. spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
  197. td.SortKV.AddKey(bl_sk, bl_sv)
  198. }
  199. }
  200. }
  201. }
  202. //
  203. blockPackage := map[string]*u.BlockPackage{}
  204. isFindPkg := true
  205. /*if td.ColPos-1 >= 0 && excludeKey.MatchString(tr.TDs[td.ColPos-1].Text) {
  206. isFindPkg = false
  207. } else if len(tr.TDs) > 0 {
  208. tdleft = tr.TDs[len(tr.TDs)-1]
  209. if tdleft.BH && excludeKey.MatchString(tr.TDs[td.ColPos-1].Text) {
  210. isFindPkg = false
  211. }
  212. }*/
  213. if len(tr.TDs) > 0 {
  214. tdleft := tr.TDs[len(tr.TDs)-1]
  215. if tdleft.BH && excludeKey.MatchString(tdleft.Text) {
  216. isFindPkg = false
  217. }
  218. }
  219. if isFindPkg {
  220. if len(ub) > 0 {
  221. blockPackage = FindPackageFromBlocks(&ub, "")
  222. } else {
  223. blockPackage = FindPackageFromText("", text)
  224. }
  225. }
  226. if len(blockPackage) > 0 {
  227. table.BPackage = true
  228. for bp_k, bp_v := range blockPackage {
  229. var bp *u.BlockPackage
  230. if table.TableResult.PackageMap.Map[bp_k] == nil {
  231. bp = bp_v
  232. } else {
  233. bp = table.TableResult.PackageMap.Map[bp_k].(*u.BlockPackage)
  234. bp.Text += "\n" + bp_v.Text
  235. }
  236. if bp.TableKV == nil {
  237. bp.TableKV = u.NewJobKv()
  238. }
  239. for k2, v2 := range bp_v.ColonKV.Kv {
  240. if bp.TableKV.Kv[k2] == "" {
  241. bp.TableKV.Kv[k2] = v2
  242. }
  243. }
  244. for k2, v2 := range bp_v.SpaceKV.Kv {
  245. if bp.TableKV.Kv[k2] == "" {
  246. bp.TableKV.Kv[k2] = v2
  247. }
  248. }
  249. table.TableResult.PackageMap.Map[bp_k] = bp
  250. }
  251. }
  252. }
  253. //
  254. if lentxt < 50 {
  255. // td.SortKV = FindKv(text, "")
  256. kvTitle := ""
  257. if len(td.TR.TDs) > 0 {
  258. kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
  259. }
  260. _, resm := colonkvEntity.entrance(text, kvTitle, 2)
  261. for k, v := range resm {
  262. td.SortKV.AddKey(k, v)
  263. }
  264. //u.Debug(td.SortKV.Keys, "-------2--------------------------------")
  265. // td.SortKV = FindKv(text, "") //GetKvFromtxt(text, "")
  266. //resm := GetKVAll(text, "")
  267. if len(td.SortKV.Keys) > 0 {
  268. //td.KVDirect = 3 //不当头也不当值,忽略
  269. if len(td.SortKV.Keys) == 1 && BHKey.MatchString(td.Val) && !MultiReg.MatchString(td.Val) {
  270. td.Val = td.SortKV.Keys[0]
  271. td.BH = true
  272. }
  273. } else if !bsontable {
  274. txt := repSpace.ReplaceAllString(text, "")
  275. btw, must, _, _, repl := CheckHeader(txt)
  276. td.Valtype = repl
  277. td.MustBH = must
  278. td.BH = btw
  279. }
  280. } else if len(ub) == 0 { //之前这里没加判断,现在加上判断,造成分块之后的kv被覆盖掉
  281. //u.Debug("----\n\n\n", txt, "\n\n\n----")
  282. //u.Debug(GetKVAll(txt, ""))
  283. /*
  284. subVal := submatchreg.FindAllStringSubmatch(txt, -1)
  285. if len(subVal) > 0 {
  286. for _, subv1 := range subVal {
  287. if len(subv1) == 6 {
  288. tr.Table.SortKV.AddKey(If(subv1[2] == "", subv1[3], subv1[2]).(string), subv1[4])
  289. //tr.Table.SortKV.AddKey(subv1[1], subv1[2])
  290. }
  291. }
  292. }
  293. */
  294. td.SortKV = FindKv(text, "", 2)
  295. // td.LeftNode.Val
  296. // for _, vvv := range *td.TR {
  297. // u.Debug(">>>>>")
  298. // }
  299. kvTitle := ""
  300. if len(td.TR.TDs) > 0 {
  301. kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
  302. }
  303. _, resm := colonkvEntity.entrance(text, kvTitle, 2)
  304. for k, v := range resm {
  305. td.SortKV.AddKey(k, v)
  306. }
  307. }
  308. bhead := false
  309. if td.TR.RowPos == 0 { //第一行
  310. if td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头
  311. bhead = true
  312. }
  313. }
  314. if bhead && !bsontable {
  315. td.BH = true
  316. td.KeyDirect = 1
  317. td.KVDirect = 2
  318. }
  319. //u.Debug(td.BH, td.Val)
  320. return td
  321. }
  322. func (t *Table) Print() {
  323. for row, trs := range t.TRs {
  324. for col, td := range trs.TDs {
  325. u.Debug(row, col, td.Val, td.BH, td.SortKV.Map)
  326. }
  327. }
  328. }
  329. type TR struct {
  330. TDs []*TD
  331. TopTR *TR //上临行
  332. BottomTR *TR //下临行
  333. Table *Table //所属表格对象
  334. RowPos int //当前在第几行
  335. //-----计算
  336. MaxRow int //最大跨行 Max(td.StartRow-td.EndRow)
  337. MinRow int //最小跨行
  338. StartRow int //起始行
  339. EndRow int //结束行
  340. MaxCol int //最大列
  341. MinCol int //最小列
  342. StartCol int //起始列
  343. EndCol int //结束列
  344. BDiffSpanRow bool //起始行,行中有没有不同跨行 - - - = -
  345. BDiffSpanCol bool //起始列,列中有没有不同跨列 |
  346. }
  347. func NewTR(Table *Table) *TR {
  348. return &TR{
  349. TDs: []*TD{},
  350. Table: Table,
  351. }
  352. }
  353. func (tr *TR) AddTD(td *TD) {
  354. /**对跨行没有意义
  355. if len(tr.TDs) > 0 {
  356. td.LeftNode = tr.TDs[len(tr.TDs)-1]
  357. tr.TDs[len(tr.TDs)-1].RightNode = td
  358. }
  359. **/
  360. td.ColPos = len(tr.TDs)
  361. tr.TDs = append(tr.TDs, td)
  362. }
  363. /*-- START --- 处理表头概率开始 -------*/
  364. type pos struct {
  365. Max int
  366. Min int
  367. }
  368. type TDRationScope struct {
  369. Rationmap map[*pos]float32
  370. Tdmap map[*pos][]*TD
  371. Poss []*pos
  372. Parentkey string
  373. }
  374. func NewTDRationScope(key string) *TDRationScope {
  375. return &TDRationScope{map[*pos]float32{}, map[*pos][]*TD{}, []*pos{}, key}
  376. }
  377. func (tdr *TDRationScope) GetPos(td *TD) (poss *pos) {
  378. k1 := tdr.Parentkey[:1]
  379. m1, m2 := td.StartRow, td.EndRow
  380. if k1 == "r" {
  381. m1, m2 = td.StartCol, td.EndCol
  382. }
  383. for _, v := range tdr.Poss {
  384. if v.Max >= m2 && v.Min <= m1 {
  385. poss = v
  386. return
  387. }
  388. }
  389. return
  390. }
  391. func (tdr *TDRationScope) GetTDRation(td *TD) (ration float32, tds []*TD) {
  392. poss := tdr.GetPos(td)
  393. if poss != nil {
  394. ration = tdr.Rationmap[poss]
  395. tds = tdr.Tdmap[poss]
  396. }
  397. return
  398. }
  399. func (tdr *TDRationScope) Addtd(td *TD) {
  400. k1 := tdr.Parentkey[:1]
  401. m1, m2 := td.StartRow, td.EndRow
  402. if k1 == "r" {
  403. m1, m2 = td.StartCol, td.EndCol
  404. }
  405. bfind := false
  406. for _, v := range tdr.Poss {
  407. if m1 == v.Max+1 { //找到
  408. bfind = true
  409. v.Max = m2
  410. tdr.Tdmap[v] = append(tdr.Tdmap[v], td)
  411. break
  412. }
  413. }
  414. if !bfind {
  415. pos1 := &pos{m2, m1}
  416. tdr.Tdmap[pos1] = []*TD{td}
  417. tdr.Poss = append(tdr.Poss, pos1)
  418. }
  419. }
  420. /*-- END --- 处理表头概率 -------*/
  421. //table表格
  422. type Table struct {
  423. Brule bool //是否规则
  424. TRs []*TR
  425. BFirstRow bool
  426. RowNum int //数行
  427. ColNum int //列数
  428. TDNum int //td个数
  429. BPackage bool //是否有包
  430. SortKV *SortMap //带排序的KV值
  431. StandKV map[string]string //过滤后的标准化kv
  432. StandKVWeight map[string]int //过滤后的标准化kv
  433. StandRuleKV map[string]string //过滤后的规则kv
  434. kvscope map[int]map[int][]*TD //sortkey第几个元素的的第几个值的结束位置
  435. kTD map[int]*TD //根据索引找到key的TD元素
  436. SonTables []*Table //孩子表集合
  437. Tag string //表格的标签
  438. Desc string //表格描述内容
  439. Goquery *goquery.Selection //表格的goquery对象
  440. Html string //所属的文本内容
  441. BlockPackage *SortMap //子包数组
  442. TableResult *TableResult //父元素
  443. StartAndEndRation map[string]*TDRationScope //同行或同列的概率,截断的单独起算
  444. StartAndEndRationKSort *SortMap
  445. WinnerOrder []map[string]interface{}
  446. BSplit bool //是否是有一个表拆分成的多个表
  447. BHeader bool //拆分表是否有表头
  448. }
  449. func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Table {
  450. return &Table{
  451. Html: Html,
  452. SortKV: NewSortMap(),
  453. StandKV: map[string]string{},
  454. StandKVWeight: map[string]int{},
  455. kvscope: map[int]map[int][]*TD{},
  456. kTD: map[int]*TD{},
  457. SonTables: []*Table{},
  458. Goquery: tab,
  459. TRs: []*TR{},
  460. TableResult: TableResult,
  461. StartAndEndRation: map[string]*TDRationScope{},
  462. StartAndEndRationKSort: NewSortMap(),
  463. BlockPackage: NewSortMap(),
  464. }
  465. }
  466. func (t *Table) AddTR(tr *TR) {
  467. if len(tr.TDs) > 0 {
  468. if len(t.TRs) > 0 {
  469. tr.TopTR = t.TRs[len(t.TRs)-1]
  470. t.TRs[len(t.TRs)-1].BottomTR = tr
  471. }
  472. tr.RowPos = len(t.TRs)
  473. t.TRs = append(t.TRs, tr)
  474. }
  475. }
  476. func (t *Table) InsertTR(tr *TR) {
  477. if len(tr.TDs) > 0 {
  478. if len(t.TRs) > 0 {
  479. t.TRs[0].TopTR = tr
  480. }
  481. tr.RowPos = 0
  482. for _, _tr := range t.TRs {
  483. _tr.RowPos += 1
  484. }
  485. t.TRs = append([]*TR{tr}, t.TRs...)
  486. }
  487. }
  488. //支持排序的map
  489. type SortMap struct {
  490. Index map[string]int
  491. Keys []string
  492. Map map[string]interface{}
  493. Lock sync.Mutex
  494. }
  495. //快速创建排序map
  496. func NewSortMap() *SortMap {
  497. return &SortMap{
  498. Index: map[string]int{},
  499. Keys: []string{},
  500. Map: map[string]interface{}{},
  501. }
  502. }
  503. //增加值
  504. var nullVal = regexp.MustCompile("^[/无,.。;、]+$|^详见.{2,8}$")
  505. func (s *SortMap) AddKey(key string, val interface{}) {
  506. //判断val
  507. if v, ok := val.(string); ok && nullVal.ReplaceAllString(u.TrimLRSpace(v, ""), "") == "" {
  508. return
  509. }
  510. s.Lock.Lock()
  511. defer s.Lock.Unlock()
  512. //重复
  513. if s.Map[key] == nil {
  514. s.Index[key] = len(s.Keys)
  515. s.Keys = append(s.Keys, key)
  516. }
  517. s.Map[key] = val
  518. }
  519. //增加值
  520. func (s *SortMap) ReplaceKey(key string, val interface{}, replacekey string) {
  521. s.Lock.Lock()
  522. defer s.Lock.Unlock()
  523. //重复
  524. v := s.Index[replacekey]
  525. s.Index[key] = v
  526. delete(s.Index, replacekey)
  527. s.Keys = append(s.Keys[:v], append([]string{key}, s.Keys[v+1:]...)...)
  528. delete(s.Map, replacekey)
  529. s.Map[key] = val
  530. }
  531. //删除值
  532. func (s *SortMap) RemoveKey(key string) {
  533. s.Lock.Lock()
  534. defer s.Lock.Unlock()
  535. delete(s.Map, key)
  536. pos := s.Index[key]
  537. delete(s.Index, key)
  538. if len(s.Keys) > 0 {
  539. s.Keys = func() []string {
  540. newkeys := []string{}
  541. if len(s.Keys) > 1 {
  542. if pos == 0 {
  543. newkeys = append(newkeys, s.Keys[1:]...)
  544. //每一个都减一
  545. for k, v := range s.Index {
  546. s.Index[k] = v - 1
  547. }
  548. } else if pos == len(s.Keys) {
  549. newkeys = append(newkeys, s.Keys[:pos]...)
  550. } else {
  551. tmp := s.Keys[pos+1:]
  552. newkeys = append(append(newkeys, s.Keys[:pos]...), tmp...)
  553. for _, v := range tmp {
  554. s.Index[v] -= 1
  555. }
  556. }
  557. }
  558. return newkeys
  559. }()
  560. }
  561. }
  562. //判断表头是key的对象
  563. type TableKeyV1 struct {
  564. TMap map[string]interface{}
  565. TReg []*regexp.Regexp
  566. TRegReplStr []string
  567. }
  568. //判断表头时用到的顺序 正文、结果表头、正常表头
  569. var THeadStr = []string{
  570. "con",
  571. "jghead",
  572. "normalhead",
  573. }
  574. //存放敏感词
  575. var TKMaps = map[string]*TableKeyV1{}
  576. //过滤所有非汉字内容
  577. var filterThText = regexp.MustCompile("([((【\\[].*[))】\\]])|([^0-9a-zA-Z\\p{Han}]+)")
  578. var tLock = sync.Mutex{}
  579. //matchStro为tablev1.json文件中的key,txt为表格的内容也可以是表格的标签
  580. //主要实现表格是否是表头的判断,表格是否有用的判断(如人员情况等是无用的)
  581. func CheckCommon(txt string, matchStr ...string) (res, must bool, stype, reg, repl string) {
  582. txt = filterThText.ReplaceAllString(txt, "")
  583. stype = "con"
  584. if len([]rune(txt)) < 30 {
  585. tLock.Lock()
  586. defer tLock.Unlock()
  587. if len(TKMaps) == 0 {
  588. for k, v := range u.TableK1 {
  589. tk := &TableKeyV1{
  590. map[string]interface{}{},
  591. []*regexp.Regexp{},
  592. []string{},
  593. }
  594. thMap := map[string]interface{}{}
  595. for _, v1 := range v {
  596. v1s := strings.Split(v1, "__")
  597. if len(v1s) == 2 {
  598. tk.TReg = append(tk.TReg, regexp.MustCompile(v1s[0]))
  599. tk.TRegReplStr = append(tk.TRegReplStr, v1s[1])
  600. } else {
  601. key := v1
  602. nowMap := &thMap
  603. for i := 0; i < len(key); i++ {
  604. kc := key[i : i+1]
  605. if v, ok := (*nowMap)[kc]; ok {
  606. nowMap, _ = v.(*map[string]interface{})
  607. } else {
  608. newMap := map[string]interface{}{}
  609. newMap["Y"] = "0"
  610. (*nowMap)[kc] = &newMap
  611. nowMap = &newMap
  612. }
  613. if i == len(key)-1 {
  614. (*nowMap)["Y"] = "1"
  615. (*nowMap)["K"] = key
  616. //(*nowMap)["V"] = v
  617. }
  618. }
  619. }
  620. }
  621. tk.TMap = thMap
  622. TKMaps[k] = tk
  623. }
  624. }
  625. //先正则、后子串查找
  626. L1:
  627. for _, v := range matchStr {
  628. //u.Debug(v)
  629. for n, vreg := range TKMaps[v].TReg {
  630. if vreg.MatchString(txt) {
  631. //u.Debug(txt, v, vreg.String())
  632. reg = vreg.String()
  633. repl = TKMaps[v].TRegReplStr[n]
  634. if v != "con" {
  635. res = true
  636. if "M" == repl {
  637. must = true
  638. }
  639. }
  640. stype = v
  641. break L1
  642. }
  643. }
  644. //以下是敏感词子串查找匹配
  645. pos := 0
  646. thMap := TKMaps[v].TMap
  647. nowMap := &thMap
  648. for i := 0; i < len(txt); i++ {
  649. word := txt[i : i+1]
  650. nowMap, _ = (*nowMap)[word].(*map[string]interface{})
  651. if nowMap != nil { // 存在,则判断是否为最后一个
  652. if pos == 0 {
  653. pos = i
  654. }
  655. if "1" == qutil.ObjToString((*nowMap)["Y"]) {
  656. if v != "con" {
  657. res = true
  658. }
  659. stype = v
  660. pos = 0
  661. break L1
  662. }
  663. } else {
  664. nowMap = &thMap
  665. if pos > 0 {
  666. i = pos
  667. pos = 0
  668. }
  669. }
  670. }
  671. }
  672. return
  673. } else {
  674. return
  675. }
  676. }
  677. //根据td中的内容验证表头,根据tablev1.json中配置的三种规则(含正则和子串查找算法)
  678. func CheckHeader(txt string) (res, must bool, stype, reg, repl string) {
  679. return CheckCommon(txt, THeadStr...)
  680. }
  681. /**
  682. 计算表格占比,返回表格数组、占比
  683. con 文本
  684. strtype 1全文 2块文本
  685. **/
  686. func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) {
  687. defer qutil.Catch()
  688. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  689. tables := doc.Find("table")
  690. if tables.Size() > 0 {
  691. tabs = []*goquery.Selection{}
  692. for i := 0; i < tables.Size(); i++ {
  693. tmpt := tables.Eq(i)
  694. b := false
  695. for j := 0; j < len(tabs); j++ {
  696. if tabs[j].Contains(tmpt.Get(0)) {
  697. b = true
  698. }
  699. }
  700. if !b {
  701. tabs = append(tabs, tmpt)
  702. }
  703. }
  704. tlen := 0
  705. cons := doc.Text()
  706. for _, t := range tabs {
  707. tlen += len(t.Text())
  708. }
  709. ratio = float32(tlen) / float32(len(cons))
  710. }
  711. /**
  712. if ratio < float32(0.992) {
  713. //取出排除表格之外的文本
  714. txt =getTextAfterRemoveTable(con)
  715. }
  716. **/
  717. return
  718. }
  719. //取出排除表格之外的文本
  720. func TextAfterRemoveTable(con string) string {
  721. doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  722. doc2.Find("table").Remove()
  723. return doc2.Text()
  724. }
  725. func HtmlAfterRemoveTable(con string) string {
  726. doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  727. doc2.Find("table").Remove()
  728. html, _ := doc2.Html()
  729. return html
  730. }
  731. func If(condition bool, trueVal, falseVal interface{}) interface{} {
  732. if condition {
  733. return trueVal
  734. }
  735. return falseVal
  736. }