tablev2.go 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881
  1. package pretreated
  2. //定义表格对象
  3. import (
  4. "fmt"
  5. u "jy/util"
  6. "log"
  7. qutil "qfw/util"
  8. "regexp"
  9. "strings"
  10. "sync"
  11. "github.com/PuerkitoBio/goquery"
  12. )
  13. //所有中标候选人只取第一个
  14. type TableResult struct {
  15. Id interface{} //信息id
  16. Toptype string //信息类型
  17. Itype int //1全文 2是块
  18. BlockTag string //块标签
  19. Html string
  20. Tabs []*Table //子表集合,子表中包含标准化kv或原始kv
  21. GoqueryTabs []*goquery.Selection //goquery对象
  22. TableSize int //子表的个数0,1,n
  23. IsMultiPackage bool //是否有子包
  24. PackageMap *SortMap //子包对象的sortmap,含标准化过的
  25. SortKV *SortMap //全局KVmap值,标准化处理过的
  26. SortKVWeight map[string]int //全局KVmap值,标准化处理过的
  27. WinnerOrder []map[string]interface{}
  28. BrandData [][]map[string]string //品牌抽取结果
  29. HasKey int //有key
  30. HasBrand int //有品牌
  31. HasGoods int //有商品
  32. RuleBlock *u.RuleBlock
  33. }
  34. //快速创建TableResult对象
  35. func NewTableResult(Id interface{}, Toptype, BlockTag, con string, Itype int, ruleBlock *u.RuleBlock) *TableResult {
  36. return &TableResult{
  37. Id: Id,
  38. Toptype: Toptype,
  39. Html: con,
  40. Itype: Itype,
  41. BlockTag: BlockTag,
  42. Tabs: []*Table{},
  43. GoqueryTabs: []*goquery.Selection{},
  44. PackageMap: NewSortMap(),
  45. SortKV: NewSortMap(),
  46. SortKVWeight: map[string]int{},
  47. RuleBlock: ruleBlock,
  48. }
  49. }
  50. //td节点
  51. type TD struct {
  52. Goquery *goquery.Selection //文本对象
  53. TR *TR //所属TR对象
  54. LeftNode *TD //左临节点
  55. TopNode *TD //上临节点
  56. RightNode *TD //右节点
  57. BottomNode *TD //下节点
  58. Val string //值
  59. Text string //原始串
  60. SortKV *SortMap //存放kv值
  61. Html string //html值
  62. BH bool //是否是表头
  63. MustBH bool //不能修改的表头
  64. StandardKey string //标准表头
  65. Colspan int //合并列
  66. Rowspan int //合并行
  67. StartCol int //起始列
  68. EndCol int //终止列
  69. StartRow int //起始行
  70. EndRow int //终止行
  71. ColPos int //当前在TR中的位置
  72. HeadTd *TD //(是val元素)k节点
  73. KVDirect int //键-值方向,0未知,1横 2纵//指值和k的方向
  74. KeyDirect int //k方向,k纵值横,k横值纵 1横 2纵
  75. SonTds []*TD //(是key元素)值节点数组
  76. SonTableResult *TableResult //子值表格集
  77. ArrVal []string //数组值,当是左临元素是合并行的元素时!
  78. Valtype string //"BO=中标人顺序"
  79. }
  // Patterns used while classifying cell text.
//
// The full-width colon/comma are written as \x{FF1A}/\x{FF0C} escapes so they
// cannot be silently folded into their ASCII look-alikes.
// NOTE(review): the single ';' and ':' inside BHKey's negated class may also
// have had full-width counterparts originally — confirm against the upstream file.
var (
	// submatchreg matches "key:value" fragments, optionally prefixed by a
	// Chinese or Arabic ordinal followed by "、". Inside a raw string the class
	// must be \S (non-space); the previous [\\S] only matched '\' and 'S', and
	// 0-10 only covered the digits 0 and 1.
	submatchreg = regexp.MustCompile(`((?:[一二三四五六七八九十0-9]+[、])(\S{4,12})|(\S{2,12}))[:\x{FF1A}](\S{5,60})([一二三四五六七八九]+[、])?`)

	// BHKey matches a short "key:value" cell: 2-8 characters without
	// punctuation, up to 3 arbitrary characters, a colon, then the value.
	BHKey = regexp.MustCompile(`^[^,\x{FF0C};:。、.]{2,8}.{0,3}[:\x{FF1A}].+$`)

	// dwReg captures the money unit (万/亿/元) that follows "单位" and optional
	// separators; callers replace the whole match with the captured unit.
	dwReg = regexp.MustCompile("单位[:\uff1a/ \\s\u3000\u2003\u00a0\\n]*([万亿元]+)")
)
  83. func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
  84. defer qutil.Catch()
  85. td := &TD{
  86. ArrVal: []string{},
  87. Goquery: Goquery,
  88. SonTds: []*TD{},
  89. TR: tr,
  90. SortKV: NewSortMap(),
  91. }
  92. colspan, rowspan := 0, 0
  93. col, bcol := td.Goquery.Attr("colspan")
  94. if bcol {
  95. colspan = qutil.IntAllDef(col, 1)
  96. }
  97. if colspan == 0 {
  98. colspan = 1
  99. }
  100. row, brow := td.Goquery.Attr("rowspan")
  101. if brow {
  102. rowspan = qutil.IntAllDef(row, 1)
  103. }
  104. if rowspan == 0 {
  105. rowspan = 1
  106. }
  107. td.Colspan, td.Rowspan = colspan, rowspan //合并列,合并行
  108. td.Html, _ = td.Goquery.Html() //html值
  109. ht := td.Goquery.ChildrenFiltered("table") //获取td的table
  110. bsontable := false //默认td中没有table
  111. txt := ""
  112. //子table处理合并
  113. if ht.Size() > 0 {
  114. //qutil.Debug("有子表格")
  115. //格式化正文
  116. txt = TextAfterRemoveTable(td.Html)
  117. td.tdHasTable(&bsontable, tr, table) //处理td中的table,块标签处理,子表解析集处理
  118. //处理table外内容
  119. var ub []*u.Block
  120. ub, _ = DivideBlock("",txt, 2, table.TableResult.RuleBlock)
  121. //看是否划块
  122. if len(ub) > 0 {
  123. colonKvWeight := map[string]int{}
  124. spaceKvWeight := map[string]int{}
  125. for _, bl := range ub {
  126. //冒号kv
  127. for bl_ck, bl_cv := range bl.ColonKV.Kv {
  128. if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
  129. colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
  130. td.SortKV.AddKey(bl_ck, bl_cv)
  131. }
  132. }
  133. //空格kv
  134. for bl_sk, bl_sv := range bl.SpaceKV.Kv {
  135. if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
  136. spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
  137. td.SortKV.AddKey(bl_sk, bl_sv)
  138. }
  139. }
  140. }
  141. }else {
  142. //调用kv解析
  143. cKV := GetKVAll(txt, "", nil, 1)
  144. for k,v :=range cKV.Kv{
  145. td.SortKV.AddKey(k,v)
  146. }
  147. sKV := SspacekvEntity.Entrance(txt, "", nil)
  148. for k,v :=range sKV.Kv{
  149. td.SortKV.AddKey(k,v)
  150. }
  151. }
  152. } else {
  153. txt = strings.TrimSpace(td.Goquery.Text())
  154. }
  155. text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
  156. td.Val = text //值
  157. td.Text = txt //原始串
  158. //对td单元格值判断是否是表头和根据td内容长度进行分块处理
  159. td.tdIsHb(tr, table, bsontable)
  160. bhead := false
  161. if td.TR.RowPos == 0 { //第一行
  162. if td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头
  163. bhead = true
  164. }
  165. }
  166. if bhead && !bsontable {
  167. td.BH = true
  168. td.KeyDirect = 1 //k方向,k纵值横,k横值纵 1横 2纵
  169. td.KVDirect = 2 //键-值方向,0未知,1横 2纵//指值和k的方向
  170. }
  171. //u.Debug(td.BH, td.Val)
  172. return td
  173. }
  174. //处理td中的table,块标签处理,子表解析集处理
  175. func (td *TD) tdHasTable(bsontable *bool, tr *TR, table *Table) {
  176. ts := td.TR.Table.TableResult
  177. tabs, _ := ComputeConRatio(td.Html, 2) //计算表格占比
  178. if len(tabs) > 0 {
  179. (*bsontable) = true
  180. stag := ts.BlockTag //块标签
  181. if stag == "" {
  182. var tdleft *TD
  183. if len(tr.TDs) > 0 {
  184. tdleft = tr.TDs[len(tr.TDs)-1]
  185. if tdleft.BH {
  186. //u.Debug(tdleft.Val),如果不存在就是上一行的
  187. stag = tdleft.Val
  188. }
  189. } else if len(tr.Table.TRs) > 0 {
  190. lasttr := tr.Table.TRs[len(tr.Table.TRs)-1]
  191. str := ""
  192. for _, td3 := range lasttr.TDs {
  193. str += td3.Val
  194. if len([]rune(str)) > 14 {
  195. str = ""
  196. break
  197. }
  198. }
  199. stag = str
  200. }
  201. sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
  202. td.BH = false
  203. for k,v := range sonts.SortKV.Map{
  204. if td.SonTableResult == nil{
  205. td.SonTableResult = NewTableResult(sonts.Id,sonts.Toptype,sonts.BlockTag,sonts.Html,sonts.Itype,sonts.RuleBlock)
  206. }
  207. td.SonTableResult.SortKV.AddKey(k,v)
  208. }
  209. //td.SonTableResult = sonts
  210. //for _, k := range sonts.SortKV.Keys {
  211. //u.Debug(k, sonts.SortKV.Map[k])
  212. // td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
  213. // td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
  214. //}
  215. //增加brand (子表)
  216. //fmt.Println("sonsHasKey=============", sonts.HasKey)
  217. //fmt.Println("sonsHasGoods========", sonts.HasGoods)
  218. //fmt.Println("sonsHasBrand========", sonts.HasBrand)
  219. if sonts.HasKey != 0 {
  220. td.TR.Table.TableResult.HasKey = sonts.HasKey
  221. }
  222. if sonts.HasGoods != 0 {
  223. td.TR.Table.TableResult.HasGoods = sonts.HasGoods
  224. }
  225. if sonts.HasBrand != 0 {
  226. td.TR.Table.TableResult.HasBrand = sonts.HasBrand
  227. }
  228. if sonts.BrandData != nil && len(sonts.BrandData) > 0 { //子table
  229. for _, v := range sonts.BrandData {
  230. if len(v) > 0 {
  231. td.TR.Table.TableResult.BrandData = append(td.TR.Table.TableResult.BrandData, v)
  232. }
  233. }
  234. }
  235. if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
  236. td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
  237. }
  238. if sonts.IsMultiPackage {
  239. td.TR.Table.BPackage = true
  240. tb1 := td.TR.Table.BlockPackage
  241. for k, v := range sonts.PackageMap.Map {
  242. v1 := v.(*u.BlockPackage)
  243. if tb1.Map[k] == nil {
  244. tb1.AddKey(k, v)
  245. } else {
  246. bp := tb1.Map[k].(*u.BlockPackage)
  247. if bp != nil && v1.TableKV != nil && v1.TableKV.Kv != nil {
  248. for k2, v2 := range v1.TableKV.Kv {
  249. if bp.TableKV.Kv != nil && bp.TableKV.Kv[k2] == "" {
  250. bp.TableKV.Kv[k2] = v2
  251. bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
  252. }
  253. }
  254. }
  255. }
  256. }
  257. //u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
  258. }
  259. }
  260. }
  261. }
  262. //对td单元格值判断是否是表头和根据td内容长度进行分块处理
  263. func (td *TD) tdIsHb(tr *TR, table *Table, bsontable bool) {
  264. lenval := len([]rune(td.Val)) //经过处理的td内容长度
  265. //if lentxt > 9 {
  266. //td.KV = GetKVAll(txt, "")
  267. ub := []*u.Block{}
  268. //经过处理的td内容长度大于50,划块,分包
  269. if lenval > 50 { //看是否划块
  270. //u.Debug(txt)
  271. ub, _ = DivideBlock("", td.Text, 2, table.TableResult.RuleBlock) //对td的原始值
  272. //看是否划块
  273. if len(ub) > 0 {
  274. colonKvWeight := map[string]int{}
  275. spaceKvWeight := map[string]int{}
  276. for _, bl := range ub {
  277. //冒号kv
  278. for bl_ck, bl_cv := range bl.ColonKV.Kv {
  279. if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
  280. colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
  281. td.SortKV.AddKey(bl_ck, bl_cv)
  282. }
  283. }
  284. //空格kv
  285. for bl_sk, bl_sv := range bl.SpaceKV.Kv {
  286. if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
  287. spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
  288. td.SortKV.AddKey(bl_sk, bl_sv)
  289. }
  290. }
  291. }
  292. }
  293. //
  294. blockPackage := map[string]*u.BlockPackage{}
  295. isFindPkg := true
  296. /*if td.ColPos-1 >= 0 && excludeKey.MatchString(tr.TDs[td.ColPos-1].Text) {
  297. isFindPkg = false
  298. } else if len(tr.TDs) > 0 {
  299. tdleft = tr.TDs[len(tr.TDs)-1]
  300. if tdleft.BH && excludeKey.MatchString(tr.TDs[td.ColPos-1].Text) {
  301. isFindPkg = false
  302. }
  303. }*/
  304. if len(tr.TDs) > 0 {
  305. tdleft := tr.TDs[len(tr.TDs)-1]
  306. if tdleft.BH && excludeKey.MatchString(tdleft.Text) { //(涉及包号|包件号?|项目标号|规格|型号|招标范围|业绩|废标)|(^编号$)|([^包段标]编号)
  307. isFindPkg = false
  308. }
  309. }
  310. if isFindPkg {
  311. if len(ub) > 0 {
  312. blockPackage = FindPackageFromBlocks(&ub, "") //从块里面找分包
  313. } else {
  314. blockPackage = FindPackageFromText("", td.Val) //从正文里面找分包
  315. }
  316. }
  317. if len(blockPackage) > 0 {
  318. table.BPackage = true
  319. for bp_k, bp_v := range blockPackage {
  320. var bp *u.BlockPackage
  321. if table.TableResult.PackageMap.Map[bp_k] == nil {
  322. bp = bp_v
  323. } else {
  324. bp = table.TableResult.PackageMap.Map[bp_k].(*u.BlockPackage)
  325. bp.Text += "\n" + bp_v.Text
  326. }
  327. if bp.TableKV == nil {
  328. bp.TableKV = u.NewJobKv()
  329. }
  330. for k2, v2 := range bp_v.ColonKV.Kv {
  331. if bp.TableKV.Kv[k2] == "" {
  332. bp.TableKV.Kv[k2] = v2
  333. }
  334. }
  335. for k2, v2 := range bp_v.SpaceKV.Kv {
  336. if bp.TableKV.Kv[k2] == "" {
  337. bp.TableKV.Kv[k2] = v2
  338. }
  339. }
  340. table.TableResult.PackageMap.Map[bp_k] = bp
  341. }
  342. }
  343. }
  344. //经过处理的td内容长度小于50,冒号kv,td表头
  345. if lenval < 50 {
  346. // td.SortKV = FindKv(text, "")
  347. kvTitle := ""
  348. if len(td.TR.TDs) > 0 {
  349. kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
  350. }
  351. /*
  352. 预算总价
  353. (人民币:元)
  354. */
  355. if td.Text != "" && strings.Contains(td.Text, "预算总价") && (strings.Contains(td.Text, "(") || strings.Contains(td.Text, "(")) {
  356. tagindex := 0
  357. if tagindex = strings.Index(td.Text, "("); tagindex <= 0 {
  358. tagindex = strings.Index(td.Text, "(")
  359. }
  360. td.SortKV.AddKey(strings.TrimSpace(td.Text[:tagindex]), strings.TrimSpace(td.Text[tagindex:])) //存放kv值
  361. td.BH = true
  362. }
  363. _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3) //td冒号kv
  364. for k, v := range resm {
  365. if k != "" && v != "" {
  366. td.SortKV.AddKey(k, v) //存放kv值
  367. }
  368. }
  369. //u.Debug(td.SortKV.Keys, "-------2--------------------------------")
  370. // td.SortKV = FindKv(text, "") //GetKvFromtxt(text, "")
  371. //resm := GetKVAll(text, "")
  372. if len(td.SortKV.Keys) > 0 {
  373. //td.KVDirect = 3 //不当头也不当值,忽略
  374. if len(td.SortKV.Keys) == 1 && BHKey.MatchString(td.Val) && !MultiReg.MatchString(td.Val) {
  375. td.Val = td.SortKV.Keys[0]
  376. td.BH = true
  377. }
  378. } else if !bsontable {
  379. txt := repSpace.ReplaceAllString(td.Val, "")
  380. btw, must, _, _, repl := CheckHeader(txt)
  381. if lenval > 15 && !strings.Contains(txt, "采购代理机构名称、地址和联系方式") {
  382. btw = false
  383. }
  384. if strings.Contains(td.Val, "个项目") {
  385. must = false
  386. btw = false
  387. }
  388. td.Valtype = repl
  389. td.MustBH = must
  390. td.BH = btw
  391. }
  392. } else if len(ub) == 0 {
  393. //之前这里没加判断,现在加上判断,造成分块之后的kv被覆盖掉
  394. //u.Debug("----\n\n\n", txt, "\n\n\n----")
  395. //u.Debug(GetKVAll(txt, ""))
  396. /*
  397. subVal := submatchreg.FindAllStringSubmatch(txt, -1)
  398. if len(subVal) > 0 {
  399. for _, subv1 := range subVal {
  400. if len(subv1) == 6 {
  401. tr.Table.SortKV.AddKey(If(subv1[2] == "", subv1[3], subv1[2]).(string), subv1[4])
  402. //tr.Table.SortKV.AddKey(subv1[1], subv1[2])
  403. }
  404. }
  405. }
  406. */
  407. td.SortKV = FindKv(td.Val, "", 2)
  408. // td.LeftNode.Val
  409. // for _, vvv := range *td.TR {
  410. // u.Debug(">>>>>")
  411. // }
  412. kvTitle := ""
  413. if len(td.TR.TDs) > 0 {
  414. kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
  415. }
  416. _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 2) //获取冒号kv入口
  417. for k, v := range resm {
  418. td.SortKV.AddKey(k, v)
  419. }
  420. }
  421. }
  422. func (t *Table) Print() {
  423. for row, trs := range t.TRs {
  424. for col, td := range trs.TDs {
  425. log.Println(row, col, td.Val, td.BH, td.SortKV.Map)
  426. }
  427. }
  428. }
  429. type TR struct {
  430. TDs []*TD
  431. TopTR *TR //上临行
  432. BottomTR *TR //下临行
  433. Table *Table //所属表格对象
  434. RowPos int //当前在第几行
  435. //-----计算
  436. MaxRow int //最大跨行 Max(td.StartRow-td.EndRow)
  437. MinRow int //最小跨行
  438. StartRow int //起始行
  439. EndRow int //结束行
  440. MaxCol int //最大列
  441. MinCol int //最小列
  442. StartCol int //起始列
  443. EndCol int //结束列
  444. BDiffSpanRow bool //起始行,行中有没有不同跨行 - - - = -
  445. BDiffSpanCol bool //起始列,列中有没有不同跨列 |
  446. }
  447. func NewTR(Table *Table) *TR {
  448. return &TR{
  449. TDs: []*TD{},
  450. Table: Table,
  451. }
  452. }
  453. func (tr *TR) AddTD(td *TD) {
  454. /**对跨行没有意义
  455. if len(tr.TDs) > 0 {
  456. td.LeftNode = tr.TDs[len(tr.TDs)-1]
  457. tr.TDs[len(tr.TDs)-1].RightNode = td
  458. }
  459. **/
  460. td.ColPos = len(tr.TDs)
  461. tr.TDs = append(tr.TDs, td)
  462. }
  /*-- START --- header probability handling -------*/

// pos is an inclusive index range. The field order (Max first) matters:
// the file builds values positionally.
type pos struct {
	// Max is the upper bound of the range.
	Max int
	// Min is the lower bound of the range.
	Min int
}
  468. type TDRationScope struct {
  469. Rationmap map[*pos]float32
  470. Tdmap map[*pos][]*TD
  471. Poss []*pos
  472. Parentkey string
  473. }
  474. func NewTDRationScope(key string) *TDRationScope {
  475. return &TDRationScope{map[*pos]float32{}, map[*pos][]*TD{}, []*pos{}, key}
  476. }
  477. func (tdr *TDRationScope) GetPos(td *TD) (poss *pos) {
  478. k1 := tdr.Parentkey[:1]
  479. m1, m2 := td.StartRow, td.EndRow
  480. if k1 == "r" {
  481. m1, m2 = td.StartCol, td.EndCol
  482. }
  483. for _, v := range tdr.Poss {
  484. if v.Max >= m2 && v.Min <= m1 {
  485. poss = v
  486. return
  487. }
  488. }
  489. return
  490. }
  491. func (tdr *TDRationScope) GetTDRation(td *TD) (ration float32, tds []*TD) {
  492. poss := tdr.GetPos(td)
  493. if poss != nil {
  494. ration = tdr.Rationmap[poss]
  495. tds = tdr.Tdmap[poss]
  496. }
  497. return
  498. }
  499. func (tdr *TDRationScope) Addtd(td *TD) {
  500. k1 := tdr.Parentkey[:1]
  501. m1, m2 := td.StartRow, td.EndRow
  502. if k1 == "r" {
  503. m1, m2 = td.StartCol, td.EndCol
  504. }
  505. bfind := false
  506. for _, v := range tdr.Poss {
  507. if m1 == v.Max+1 { //找到
  508. bfind = true
  509. v.Max = m2
  510. tdr.Tdmap[v] = append(tdr.Tdmap[v], td)
  511. break
  512. }
  513. }
  514. if !bfind {
  515. pos1 := &pos{m2, m1}
  516. tdr.Tdmap[pos1] = []*TD{td}
  517. tdr.Poss = append(tdr.Poss, pos1)
  518. }
  519. }
  520. /*-- END --- 处理表头概率 -------*/
  521. //table表格
  522. type Table struct {
  523. Brule bool //是否规则
  524. TRs []*TR
  525. BFirstRow bool
  526. RowNum int //行数
  527. ColNum int //列数
  528. TDNum int //td个数
  529. BPackage bool //是否有包
  530. SortKV *SortMap //带排序的KV值
  531. StandKV map[string]string //过滤后的标准化kv
  532. StandKVWeight map[string]int //过滤后的标准化kv
  533. StandRuleKV map[string]string //过滤后的规则kv
  534. kvscope map[int]map[int][]*TD //sortkey第几个元素的的第几个值的结束位置
  535. kTD map[int]*TD //根据索引找到key的TD元素
  536. SonTables []*Table //孩子表集合
  537. Tag string //表格的标签
  538. Desc string //表格描述内容
  539. Goquery *goquery.Selection //表格的goquery对象
  540. Html string //所属的文本内容
  541. BlockPackage *SortMap //子包数组
  542. TableResult *TableResult //父元素
  543. StartAndEndRation map[string]*TDRationScope //同行或同列的概率,截断的单独起算
  544. StartAndEndRationKSort *SortMap
  545. WinnerOrder []map[string]interface{}
  546. BSplit bool //是否是有一个表拆分成的多个表
  547. BHeader bool //拆分表是否有表头
  548. BrandData [][]map[string]string //品牌抽取结果
  549. HasKey int //有key
  550. HasBrand int //有品牌
  551. HasGoods int //有商品
  552. }
  553. func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Table {
  554. return &Table{
  555. Html: Html,
  556. SortKV: NewSortMap(),
  557. StandKV: map[string]string{},
  558. StandKVWeight: map[string]int{},
  559. kvscope: map[int]map[int][]*TD{},
  560. kTD: map[int]*TD{},
  561. SonTables: []*Table{},
  562. Goquery: tab,
  563. TRs: []*TR{},
  564. TableResult: TableResult,
  565. StartAndEndRation: map[string]*TDRationScope{},
  566. StartAndEndRationKSort: NewSortMap(),
  567. BlockPackage: NewSortMap(),
  568. }
  569. }
  570. func (t *Table) AddTR(tr *TR) {
  571. if len(tr.TDs) > 0 {
  572. if len(t.TRs) > 0 {
  573. tr.TopTR = t.TRs[len(t.TRs)-1]
  574. t.TRs[len(t.TRs)-1].BottomTR = tr
  575. }
  576. tr.RowPos = len(t.TRs)
  577. t.TRs = append(t.TRs, tr)
  578. }
  579. }
  580. func (t *Table) InsertTR(tr *TR) {
  581. if len(tr.TDs) > 0 {
  582. if len(t.TRs) > 0 {
  583. t.TRs[0].TopTR = tr
  584. }
  585. tr.RowPos = 0
  586. for _, _tr := range t.TRs {
  587. _tr.RowPos += 1
  588. }
  589. t.TRs = append([]*TR{tr}, t.TRs...)
  590. }
  591. }
  // SortMap is a map that remembers the insertion order of its keys.
type SortMap struct {
	// Index maps a key to its position in Keys.
	Index map[string]int
	// Keys lists the keys in insertion order.
	Keys []string
	// Map holds the values.
	Map map[string]interface{}
	// Lock is taken by the mutating methods.
	Lock sync.Mutex
}
  599. //快速创建排序map
  600. func NewSortMap() *SortMap {
  601. return &SortMap{
  602. Index: map[string]int{},
  603. Keys: []string{},
  604. Map: map[string]interface{}{},
  605. }
  606. }
  607. //增加值
  608. var NullVal = regexp.MustCompile("^[/无,.。;、附]+$|^详见.{2,8}$|(详?见)?附(件|图)")
  609. func (s *SortMap) AddKey(key string, val interface{}) {
  610. //判断val
  611. // if v, ok := val.(string); ok && NullVal.ReplaceAllString(u.TrimLRSpace(v, ""), "") == "" {
  612. // return
  613. // }
  614. s.Lock.Lock()
  615. defer s.Lock.Unlock()
  616. //重复
  617. if s.Map[key] == nil {
  618. s.Index[key] = len(s.Keys)
  619. s.Keys = append(s.Keys, key)
  620. }
  621. s.Map[key] = val
  622. }
  623. //增加值
  624. func (s *SortMap) ReplaceKey(key string, val interface{}, replacekey string) {
  625. s.Lock.Lock()
  626. defer s.Lock.Unlock()
  627. //重复
  628. v := s.Index[replacekey]
  629. s.Index[key] = v
  630. delete(s.Index, replacekey)
  631. s.Keys = append(s.Keys[:v], append([]string{key}, s.Keys[v+1:]...)...)
  632. delete(s.Map, replacekey)
  633. s.Map[key] = val
  634. }
  635. //删除值
  636. func (s *SortMap) RemoveKey(key string) {
  637. s.Lock.Lock()
  638. defer s.Lock.Unlock()
  639. delete(s.Map, key)
  640. pos := s.Index[key]
  641. delete(s.Index, key)
  642. if len(s.Keys) > 0 {
  643. s.Keys = func() []string {
  644. newkeys := []string{}
  645. if len(s.Keys) > 1 {
  646. if pos == 0 {
  647. newkeys = append(newkeys, s.Keys[1:]...)
  648. //每一个都减一
  649. for k, v := range s.Index {
  650. s.Index[k] = v - 1
  651. }
  652. } else if pos == len(s.Keys) {
  653. newkeys = append(newkeys, s.Keys[:pos]...)
  654. } else {
  655. tmp := s.Keys[pos+1:]
  656. newkeys = append(append(newkeys, s.Keys[:pos]...), tmp...)
  657. for _, v := range tmp {
  658. s.Index[v] -= 1
  659. }
  660. }
  661. }
  662. return newkeys
  663. }()
  664. }
  665. }
  // TableKeyV1 holds the configuration used to decide whether a header is a key.
// The field order matters: the file builds values positionally.
type TableKeyV1 struct {
	// TMap is the keyword lookup tree.
	TMap map[string]interface{}
	// TReg holds the compiled patterns.
	TReg []*regexp.Regexp
	// TRegReplStr holds, per pattern in TReg, the associated replacement marker.
	TRegReplStr []string
}
  672. //判断表头时用到的顺序 正文、结果表头、正常表头
  673. var THeadStr = []string{
  674. "con",
  675. "jghead",
  676. "normalhead",
  677. }
  678. //存放敏感词
  679. var TKMaps = map[string]*TableKeyV1{}
  680. //过滤所有非汉字内容
  681. var filterThText = regexp.MustCompile("([((【\\[].*[))】\\]])|([^0-9a-zA-Z\\p{Han}]+)")
  682. var tLock = sync.Mutex{}
  683. //matchStro为tablev1.json文件中的key,txt为表格的内容也可以是表格的标签
  684. //主要实现表格是否是表头的判断,表格是否有用的判断(如人员情况等是无用的)
  685. func CheckCommon(txt string, matchStr ...string) (res, must bool, stype, reg, repl string) {
  686. txt = filterThText.ReplaceAllString(txt, "")
  687. stype = "con"
  688. if len([]rune(txt)) < 30 {
  689. tLock.Lock()
  690. defer tLock.Unlock()
  691. if len(TKMaps) == 0 {
  692. for k, v := range u.TableK1 {
  693. tk := &TableKeyV1{
  694. map[string]interface{}{},
  695. []*regexp.Regexp{},
  696. []string{},
  697. }
  698. thMap := map[string]interface{}{}
  699. for _, v1 := range v {
  700. v1s := strings.Split(v1, "__")
  701. if len(v1s) == 2 {
  702. tk.TReg = append(tk.TReg, regexp.MustCompile(v1s[0]))
  703. tk.TRegReplStr = append(tk.TRegReplStr, v1s[1])
  704. } else {
  705. key := v1
  706. nowMap := &thMap
  707. for i := 0; i < len(key); i++ {
  708. kc := key[i : i+1]
  709. if v, ok := (*nowMap)[kc]; ok {
  710. nowMap, _ = v.(*map[string]interface{})
  711. } else {
  712. newMap := map[string]interface{}{}
  713. newMap["Y"] = "0"
  714. (*nowMap)[kc] = &newMap
  715. nowMap = &newMap
  716. }
  717. if i == len(key)-1 {
  718. (*nowMap)["Y"] = "1"
  719. (*nowMap)["K"] = key
  720. //(*nowMap)["V"] = v
  721. }
  722. }
  723. }
  724. }
  725. tk.TMap = thMap
  726. TKMaps[k] = tk
  727. }
  728. }
  729. //先正则、后子串查找
  730. L1:
  731. for _, v := range matchStr {
  732. //u.Debug(v)
  733. for n, vreg := range TKMaps[v].TReg {
  734. if vreg.MatchString(txt) {
  735. //u.Debug(txt, v, vreg.String())
  736. reg = vreg.String()
  737. repl = TKMaps[v].TRegReplStr[n]
  738. if v != "con" {
  739. res = true
  740. if "M" == repl {
  741. must = true
  742. }
  743. }
  744. stype = v
  745. break L1
  746. }
  747. }
  748. //以下是敏感词子串查找匹配
  749. pos := 0
  750. thMap := TKMaps[v].TMap
  751. nowMap := &thMap
  752. for i := 0; i < len(txt); i++ {
  753. word := txt[i : i+1]
  754. nowMap, _ = (*nowMap)[word].(*map[string]interface{})
  755. if nowMap != nil { // 存在,则判断是否为最后一个
  756. if pos == 0 {
  757. pos = i
  758. }
  759. if "1" == qutil.ObjToString((*nowMap)["Y"]) {
  760. if v != "con" {
  761. res = true
  762. }
  763. stype = v
  764. pos = 0
  765. break L1
  766. }
  767. } else {
  768. nowMap = &thMap
  769. if pos > 0 {
  770. i = pos
  771. pos = 0
  772. }
  773. }
  774. }
  775. }
  776. return
  777. } else {
  778. return
  779. }
  780. }
  781. //根据td中的内容验证表头,根据tablev1.json中配置的三种规则(含正则和子串查找算法)
  782. func CheckHeader(txt string) (res, must bool, stype, reg, repl string) {
  783. return CheckCommon(txt, THeadStr...)
  784. }
  785. /**
  786. 计算表格占比,返回表格数组、占比
  787. con 文本
  788. strtype 1全文 2块文本
  789. **/
  790. func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) {
  791. defer qutil.Catch()
  792. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  793. cons := doc.Text()
  794. tables := doc.Find("table")
  795. doc = nil
  796. if tables.Size() > 0 {
  797. tabs = []*goquery.Selection{}
  798. for i := 0; i < tables.Size(); i++ {
  799. tmpt := tables.Eq(i)
  800. b := false
  801. for j := 0; j < len(tabs); j++ {
  802. if tabs[j].Contains(tmpt.Get(0)) {
  803. b = true
  804. }
  805. }
  806. if !b {
  807. tabs = append(tabs, tmpt)
  808. }
  809. }
  810. tlen := 0
  811. for _, t := range tabs {
  812. tlen += len(t.Text())
  813. }
  814. ratio = float32(tlen) / float32(len(cons))
  815. }
  816. /**
  817. if ratio < float32(0.992) {
  818. //取出排除表格之外的文本
  819. txt =getTextAfterRemoveTable(con)
  820. }
  821. **/
  822. return
  823. }
  824. //取出排除表格之外的文本
  825. func TextAfterRemoveTable(con string) string {
  826. doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  827. doc2.Find("table").Remove()
  828. return doc2.Text()
  829. }
  830. func HtmlAfterRemoveTable(con string) string {
  831. doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  832. doc2.Find("table").Remove()
  833. html, _ := doc2.Html()
  834. return html
  835. }
  // If is a ternary helper: it returns trueVal when condition holds and
// falseVal otherwise. Both values are evaluated by the caller.
func If(condition bool, trueVal, falseVal interface{}) interface{} {
	chosen := falseVal
	if condition {
		chosen = trueVal
	}
	return chosen
}