tablev2.go 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910
  1. package pretreated
  2. //定义表格对象
  3. import (
  4. "encoding/json"
  5. "fmt"
  6. u "jy/util"
  7. "log"
  8. qutil "qfw/util"
  9. "regexp"
  10. "strings"
  11. "sync"
  12. "github.com/PuerkitoBio/goquery"
  13. )
  14. //所有中标候选人只取第一个
  15. type TableResult struct {
  16. Id interface{} //信息id
  17. Toptype string //信息类型
  18. Itype int //1全文 2是块
  19. BlockTag string //块标签
  20. Html string
  21. Tabs []*Table //子表集合,子表中包含标准化kv或原始kv
  22. GoqueryTabs *goquery.Selection //goquery对象
  23. TableSize int //子表的个数0,1,n
  24. IsMultiPackage bool //是否有子包
  25. PackageMap *SortMap //子包对象的sortmap,含标准化过的
  26. KvTags map[string][]*u.Tag //全局KVmap值,标准化处理过的
  27. WinnerOrder []map[string]interface{}
  28. BrandData [][]map[string]string //品牌抽取结果
  29. HasKey int //有key
  30. HasBrand int //有品牌
  31. HasGoods int //有商品
  32. RuleBlock *u.RuleBlock
  33. }
  34. //快速创建TableResult对象
  35. func NewTableResult(Id interface{}, Toptype, BlockTag, con string, Itype int, ruleBlock *u.RuleBlock) *TableResult {
  36. return &TableResult{
  37. Id: Id,
  38. Toptype: Toptype,
  39. Html: con,
  40. Itype: Itype,
  41. BlockTag: BlockTag,
  42. Tabs: []*Table{},
  43. GoqueryTabs: &goquery.Selection{},
  44. PackageMap: NewSortMap(),
  45. KvTags: map[string][]*u.Tag{},
  46. RuleBlock: ruleBlock,
  47. }
  48. }
  49. //td节点
  50. type TD struct {
  51. Goquery *goquery.Selection //文本对象
  52. TR *TR //所属TR对象
  53. LeftNode *TD //左临节点
  54. TopNode *TD //上临节点
  55. RightNode *TD //右节点
  56. BottomNode *TD //下节点
  57. Val string //值
  58. Text string //原始串
  59. SortKV *SortMap //存放kv值
  60. Html string //html值
  61. BH bool //是否是表头
  62. MustBH bool //不能修改的表头
  63. StandardKey string //标准表头
  64. Colspan int //合并列
  65. Rowspan int //合并行
  66. StartCol int //起始列
  67. EndCol int //终止列
  68. StartRow int //起始行
  69. EndRow int //终止行
  70. ColPos int //当前在TR中的位置
  71. HeadTd *TD //(是val元素)k节点
  72. KVDirect int //键-值方向,0未知,1横 2纵//指值和k的方向
  73. KeyDirect int //k方向,k纵值横,k横值纵 1横 2纵
  74. SonTds []*TD //(是key元素)值节点数组
  75. SonTableResult *TableResult //子值表格集
  76. ArrVal []string //数组值,当是左临元素是合并行的元素时!
  77. Valtype string //"BO=中标人顺序"
  78. }
  79. var submatchreg = regexp.MustCompile(`((?:[一二三四五六七八九十0-10]+[、])([\\S]{4,12})|([\\S]{2,12}))[::]([\\S]{5,60})([一二三四五六七八九]+[、])?`)
  80. var BHKey = regexp.MustCompile(`^[^,,;:。、.]{2,8}.{0,3}[::].+$`)
  81. var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿元]+)")
  82. func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite string) *TD {
  83. defer qutil.Catch()
  84. td := &TD{
  85. ArrVal: []string{},
  86. Goquery: Goquery,
  87. SonTds: []*TD{},
  88. TR: tr,
  89. SortKV: NewSortMap(),
  90. }
  91. colspan, rowspan := 0, 0
  92. col, bcol := td.Goquery.Attr("colspan")
  93. if bcol {
  94. colspan = qutil.IntAllDef(col, 1)
  95. }
  96. if colspan == 0 {
  97. colspan = 1
  98. }
  99. row, brow := td.Goquery.Attr("rowspan")
  100. if brow {
  101. rowspan = qutil.IntAllDef(row, 1)
  102. }
  103. if rowspan == 0 {
  104. rowspan = 1
  105. }
  106. td.Colspan, td.Rowspan = colspan, rowspan //合并列,合并行
  107. td.Html, _ = td.Goquery.Html() //html值
  108. ht := td.Goquery.ChildrenFiltered("table") //获取td的table
  109. bsontable := false //默认td中没有table
  110. txt := ""
  111. //子table处理合并
  112. if ht.Size() > 0 {
  113. //qutil.Debug("有子表格")
  114. //格式化正文
  115. txt = TextAfterRemoveTable(td.Html)
  116. td.tdHasTable(&bsontable, tr,isSite,codeSite) //处理td中的table,块标签处理,子表解析集处理
  117. } else {
  118. txt = strings.TrimSpace(td.Goquery.Text())
  119. }
  120. text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
  121. td.Val = text //值
  122. td.Text = txt //原始串
  123. //处理table外内容
  124. var ub []*u.Block
  125. ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock,isSite,codeSite)
  126. //看是否划块
  127. if len(ub) > 0 {
  128. for _, bl := range ub {
  129. //冒号kv
  130. for bl_ck, bl_cv := range bl.ColonKV.KvTags {
  131. td.SortKV.AddKey(bl_ck, bl_cv)
  132. }
  133. //空格kv
  134. for bl_sk, bl_sv := range bl.SpaceKV.KvTags {
  135. td.SortKV.AddKey(bl_sk, bl_sv)
  136. }
  137. }
  138. } else {
  139. //for _, v := range GetKVAll(txt, "", nil, 2).KvTags {
  140. //for _, vv := range v {
  141. //td.SortKV.AddKey(vv.Key, vv.Value)
  142. //}
  143. //}
  144. }
  145. ////抽取不到走正则抽
  146. //proCode := projectcodeReg.FindString(text)
  147. //if proCode != "" {
  148. // ckv := GetKVAll(proCode, "", nil, 1)
  149. // for _, v := range ckv.KvTags {
  150. // for _, vv := range v {
  151. // td.SortKV.AddKey(vv.Key, vv.Value)
  152. // }
  153. // }
  154. //} else if proCode = projectcodeReg2.FindString(text); proCode != "" {
  155. // ckv := GetKVAll(proCode, "", nil, 1)
  156. // for _, v := range ckv.KvTags {
  157. // for _, vv := range v {
  158. // td.SortKV.AddKey(vv.Key, vv.Value)
  159. // }
  160. // }
  161. //}
  162. if proCode := jsonReg.FindString(text); proCode != "" {
  163. jsonMap := make(map[string]string)
  164. json.Unmarshal([]byte(proCode), &jsonMap)
  165. for k, v := range jsonMap {
  166. td.SortKV.AddKey(k, v)
  167. }
  168. }
  169. //对td单元格值判断是否是表头和根据td内容长度进行分块处理
  170. td.tdIsHb(tr, table, bsontable,isSite,codeSite)
  171. bhead := false
  172. if td.TR.RowPos == 0 { //第一行
  173. if td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头
  174. bhead = true
  175. }
  176. }
  177. if bhead && !bsontable {
  178. td.BH = true
  179. td.KeyDirect = 1 //k方向,k纵值横,k横值纵 1横 2纵
  180. td.KVDirect = 2 //键-值方向,0未知,1横 2纵//指值和k的方向
  181. }
  182. //u.Debug(td.BH, td.Val)
  183. return td
  184. }
  185. //处理td中的table,块标签处理,子表解析集处理
  186. func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
  187. ts := td.TR.Table.TableResult
  188. tabs, _ := ComputeConRatio(td.Html, 2) //计算表格占比
  189. if len(tabs) > 0 {
  190. (*bsontable) = true
  191. stag := ts.BlockTag //块标签
  192. if stag == "" {
  193. var tdleft *TD
  194. if len(tr.TDs) > 0 {
  195. tdleft = tr.TDs[len(tr.TDs)-1]
  196. if tdleft.BH {
  197. //u.Debug(tdleft.Val),如果不存在就是上一行的
  198. stag = tdleft.Val
  199. }
  200. } else if len(tr.Table.TRs) > 0 {
  201. lasttr := tr.Table.TRs[len(tr.Table.TRs)-1]
  202. str := ""
  203. for _, td3 := range lasttr.TDs {
  204. str += td3.Val
  205. if len([]rune(str)) > 14 {
  206. str = ""
  207. break
  208. }
  209. }
  210. stag = str
  211. }
  212. }
  213. for _, tv := range tabs {
  214. if IsHide(tv) {
  215. continue
  216. }
  217. sonts := NewTableResult(ts.Id, ts.Toptype, stag, td.Html, 2, td.TR.Table.TableResult.RuleBlock)
  218. sonts.GoqueryTabs = tv
  219. sonts.Analy(isSite,codeSite)
  220. //sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
  221. td.BH = false
  222. if td.TR.Table.TableResult == nil {
  223. td.TR.Table.TableResult = NewTableResult(sonts.Id, sonts.Toptype, sonts.BlockTag, sonts.Html, sonts.Itype, sonts.RuleBlock)
  224. }
  225. MergeKvTags(td.TR.Table.TableResult.KvTags, sonts.KvTags)
  226. td.SonTableResult = sonts
  227. //for _, k := range sonts.SortKV.Keys {
  228. //u.Debug(k, sonts.SortKV.Map[k])
  229. // td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
  230. // td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
  231. //}
  232. //增加brand (子表)
  233. //fmt.Println("sonsHasKey=============", sonts.HasKey)
  234. //fmt.Println("sonsHasGoods========", sonts.HasGoods)
  235. //fmt.Println("sonsHasBrand========", sonts.HasBrand)
  236. if sonts.HasKey != 0 {
  237. td.TR.Table.TableResult.HasKey = sonts.HasKey
  238. }
  239. if sonts.HasGoods != 0 {
  240. td.TR.Table.TableResult.HasGoods = sonts.HasGoods
  241. }
  242. if sonts.HasBrand != 0 {
  243. td.TR.Table.TableResult.HasBrand = sonts.HasBrand
  244. }
  245. if sonts.BrandData != nil && len(sonts.BrandData) > 0 { //子table
  246. for _, v := range sonts.BrandData {
  247. if len(v) > 0 {
  248. td.TR.Table.TableResult.BrandData = append(td.TR.Table.TableResult.BrandData, v)
  249. }
  250. }
  251. }
  252. if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
  253. td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
  254. }
  255. if sonts.IsMultiPackage {
  256. td.TR.Table.BPackage = true
  257. tb1 := td.TR.Table.BlockPackage
  258. for _, v := range sonts.PackageMap.Keys {
  259. v1 := sonts.PackageMap.Map[v].(*u.BlockPackage)
  260. if tb1.Map[v] == nil {
  261. tb1.AddKey(v, sonts.PackageMap.Map[v])
  262. } else {
  263. bp := tb1.Map[v].(*u.BlockPackage)
  264. if bp != nil && v1.TableKV != nil {
  265. for k2, v2 := range v1.TableKV.KvTags {
  266. if bp.TableKV == nil {
  267. bp.TableKV = u.NewJobKv()
  268. }
  269. isExists := false
  270. for _, v2v := range v2 {
  271. for _, v2vv := range bp.TableKV.KvTags[k2] {
  272. if v2v.Value == v2vv.Value {
  273. isExists = true
  274. break
  275. }
  276. }
  277. if !isExists {
  278. bp.TableKV.KvTags[k2] = append(bp.TableKV.KvTags[k2], v2v)
  279. bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
  280. }
  281. }
  282. }
  283. }
  284. }
  285. }
  286. //u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
  287. }
  288. }
  289. }
  290. }
  291. //对td单元格值判断是否是表头和根据td内容长度进行分块处理
  292. func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string) {
  293. lenval := len([]rune(td.Val)) //经过处理的td内容长度
  294. //if lentxt > 9 {
  295. //td.KV = GetKVAll(txt, "")
  296. ub := []*u.Block{}
  297. //经过处理的td内容长度大于50,划块,分包
  298. if lenval > 50 { //看是否划块
  299. //u.Debug(txt)
  300. ub, _ = DivideBlock("", td.Text, 2, table.TableResult.RuleBlock,isSite,codeSite) //对td的原始值
  301. //看是否划块
  302. if len(ub) > 0 {
  303. for _, bl := range ub {
  304. //冒号kv
  305. for bl_ck, bl_cv := range bl.ColonKV.KvTags {
  306. td.SortKV.AddKey(bl_ck, bl_cv)
  307. }
  308. //空格kv
  309. for bl_sk, bl_sv := range bl.SpaceKV.KvTags {
  310. td.SortKV.AddKey(bl_sk, bl_sv)
  311. }
  312. }
  313. }
  314. //
  315. blockPackage := map[string]*u.BlockPackage{}
  316. isFindPkg := true
  317. /*if td.ColPos-1 >= 0 && excludeKey.MatchString(tr.TDs[td.ColPos-1].Text) {
  318. isFindPkg = false
  319. } else if len(tr.TDs) > 0 {
  320. tdleft = tr.TDs[len(tr.TDs)-1]
  321. if tdleft.BH && excludeKey.MatchString(tr.TDs[td.ColPos-1].Text) {
  322. isFindPkg = false
  323. }
  324. }*/
  325. if len(tr.TDs) > 0 {
  326. tdleft := tr.TDs[len(tr.TDs)-1]
  327. if tdleft.BH && excludeKey.MatchString(tdleft.Text) { //(涉及包号|包件号?|项目标号|规格|型号|招标范围|业绩|废标)|(^编号$)|([^包段标]编号)
  328. isFindPkg = false
  329. }
  330. }
  331. if isFindPkg {
  332. if len(ub) > 0 {
  333. blockPackage = FindPackageFromBlocks(&ub,isSite,codeSite) //从块里面找分包
  334. } else {
  335. blockPackage = FindPackageFromText("", td.Val,isSite,codeSite) //从正文里面找分包
  336. }
  337. }
  338. if len(blockPackage) > 0 {
  339. table.BPackage = true
  340. for bp_k, bp_v := range blockPackage {
  341. var bp *u.BlockPackage
  342. if table.TableResult.PackageMap.Map[bp_k] == nil {
  343. bp = bp_v
  344. } else {
  345. bp = table.TableResult.PackageMap.Map[bp_k].(*u.BlockPackage)
  346. bp.Text += "\n" + bp_v.Text
  347. }
  348. if bp.TableKV == nil {
  349. bp.TableKV = u.NewJobKv()
  350. }
  351. MergeKvTags(bp.TableKV.KvTags, bp_v.ColonKV.KvTags)
  352. MergeKvTags(bp.TableKV.KvTags, bp_v.SpaceKV.KvTags)
  353. table.TableResult.PackageMap.AddKey(bp_k, bp)
  354. }
  355. }
  356. }
  357. //经过处理的td内容长度小于50,冒号kv,td表头
  358. if lenval < 50 {
  359. // td.SortKV = FindKv(text, "")
  360. kvTitle := ""
  361. if len(td.TR.TDs) > 0 {
  362. kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
  363. }
  364. /*
  365. 预算总价
  366. (人民币:元)
  367. */
  368. if td.Text != "" && strings.Contains(td.Text, "预算总价") && (strings.Contains(td.Text, "(") || strings.Contains(td.Text, "(")) {
  369. tagindex := 0
  370. if tagindex = strings.Index(td.Text, "("); tagindex <= 0 {
  371. tagindex = strings.Index(td.Text, "(")
  372. }
  373. td.SortKV.AddKey(strings.TrimSpace(td.Text[:tagindex]), strings.TrimSpace(td.Text[tagindex:])) //存放kv值
  374. td.BH = true
  375. }
  376. _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3,isSite,codeSite) //td冒号kv
  377. for k, v := range resm {
  378. if k != "" && v != "" {
  379. td.SortKV.AddKey(k, v) //存放kv值
  380. }
  381. }
  382. //u.Debug(td.SortKV.Keys, "-------2--------------------------------")
  383. // td.SortKV = FindKv(text, "") //GetKvFromtxt(text, "")
  384. //resm := GetKVAll(text, "")
  385. if len(td.SortKV.Keys) > 0 {
  386. //td.KVDirect = 3 //不当头也不当值,忽略
  387. if len(td.SortKV.Keys) == 1 && BHKey.MatchString(td.Val) && !MultiReg.MatchString(td.Val) {
  388. td.Val, _ = td.SortKV.Map[td.SortKV.Keys[0]].(string)
  389. td.BH = true
  390. }
  391. } else if !bsontable {
  392. txt := repSpace.ReplaceAllString(td.Val, "")
  393. btw, must, _, _, repl := CheckHeader(txt)
  394. if lenval > 15 && !strings.Contains(txt, "采购代理机构名称、地址和联系方式") {
  395. btw = false
  396. }
  397. if strings.Contains(td.Val, "个项目") {
  398. must = false
  399. btw = false
  400. }
  401. td.Valtype = repl
  402. td.MustBH = must
  403. td.BH = btw
  404. if strings.Contains(txt,"年估算额年(万元)"){
  405. td.MustBH = true
  406. td.BH = true
  407. }
  408. }
  409. } else if len(ub) == 0 {
  410. //之前这里没加判断,现在加上判断,造成分块之后的kv被覆盖掉
  411. //u.Debug("----\n\n\n", txt, "\n\n\n----")
  412. //u.Debug(GetKVAll(txt, ""))
  413. /*
  414. subVal := submatchreg.FindAllStringSubmatch(txt, -1)
  415. if len(subVal) > 0 {
  416. for _, subv1 := range subVal {
  417. if len(subv1) == 6 {
  418. tr.Table.SortKV.AddKey(If(subv1[2] == "", subv1[3], subv1[2]).(string), subv1[4])
  419. //tr.Table.SortKV.AddKey(subv1[1], subv1[2])
  420. }
  421. }
  422. }
  423. */
  424. fSortKV := FindKv(td.Val, "", 2)
  425. for _, v := range fSortKV.Keys {
  426. td.SortKV.AddKey(v, fSortKV.Map[v])
  427. }
  428. // td.LeftNode.Val
  429. // for _, vvv := range *td.TR {
  430. // u.Debug(">>>>>")
  431. // }
  432. kvTitle := ""
  433. if len(td.TR.TDs) > 0 {
  434. kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
  435. }
  436. _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 2,isSite,codeSite) //获取冒号kv入口
  437. for k, v := range resm {
  438. td.SortKV.AddKey(k, v)
  439. }
  440. }
  441. }
  442. func (t *Table) Print() {
  443. for row, trs := range t.TRs {
  444. for col, td := range trs.TDs {
  445. log.Println(row, col, td.Val, td.BH, td.SortKV.Map)
  446. }
  447. }
  448. }
  449. type TR struct {
  450. TDs []*TD
  451. TopTR *TR //上临行
  452. BottomTR *TR //下临行
  453. Table *Table //所属表格对象
  454. RowPos int //当前在第几行
  455. //-----计算
  456. MaxRow int //最大跨行 Max(td.StartRow-td.EndRow)
  457. MinRow int //最小跨行
  458. StartRow int //起始行
  459. EndRow int //结束行
  460. MaxCol int //最大列
  461. MinCol int //最小列
  462. StartCol int //起始列
  463. EndCol int //结束列
  464. BDiffSpanRow bool //起始行,行中有没有不同跨行 - - - = -
  465. BDiffSpanCol bool //起始列,列中有没有不同跨列 |
  466. }
  467. func NewTR(Table *Table) *TR {
  468. return &TR{
  469. TDs: []*TD{},
  470. Table: Table,
  471. }
  472. }
  473. func (tr *TR) AddTD(td *TD) {
  474. /**对跨行没有意义
  475. if len(tr.TDs) > 0 {
  476. td.LeftNode = tr.TDs[len(tr.TDs)-1]
  477. tr.TDs[len(tr.TDs)-1].RightNode = td
  478. }
  479. **/
  480. td.ColPos = len(tr.TDs)
  481. tr.TDs = append(tr.TDs, td)
  482. }
  483. /*-- START --- 处理表头概率开始 -------*/
  484. type pos struct {
  485. Max int
  486. Min int
  487. }
  488. type TDRationScope struct {
  489. Rationmap map[*pos]float32
  490. Tdmap map[*pos][]*TD
  491. Poss []*pos
  492. Parentkey string
  493. }
  494. func NewTDRationScope(key string) *TDRationScope {
  495. return &TDRationScope{map[*pos]float32{}, map[*pos][]*TD{}, []*pos{}, key}
  496. }
  497. func (tdr *TDRationScope) GetPos(td *TD) (poss *pos) {
  498. k1 := tdr.Parentkey[:1]
  499. m1, m2 := td.StartRow, td.EndRow
  500. if k1 == "r" {
  501. m1, m2 = td.StartCol, td.EndCol
  502. }
  503. for _, v := range tdr.Poss {
  504. if v.Max >= m2 && v.Min <= m1 {
  505. poss = v
  506. return
  507. }
  508. }
  509. return
  510. }
  511. func (tdr *TDRationScope) GetTDRation(td *TD) (ration float32, tds []*TD) {
  512. poss := tdr.GetPos(td)
  513. if poss != nil {
  514. ration = tdr.Rationmap[poss]
  515. tds = tdr.Tdmap[poss]
  516. }
  517. return
  518. }
  519. func (tdr *TDRationScope) Addtd(td *TD) {
  520. k1 := tdr.Parentkey[:1]
  521. m1, m2 := td.StartRow, td.EndRow
  522. if k1 == "r" {
  523. m1, m2 = td.StartCol, td.EndCol
  524. }
  525. bfind := false
  526. for _, v := range tdr.Poss {
  527. if m1 == v.Max+1 { //找到
  528. bfind = true
  529. v.Max = m2
  530. tdr.Tdmap[v] = append(tdr.Tdmap[v], td)
  531. break
  532. }
  533. }
  534. if !bfind {
  535. pos1 := &pos{m2, m1}
  536. tdr.Tdmap[pos1] = []*TD{td}
  537. tdr.Poss = append(tdr.Poss, pos1)
  538. }
  539. }
  540. /*-- END --- 处理表头概率 -------*/
  541. //table表格
  542. type Table struct {
  543. Brule bool //是否规则
  544. TRs []*TR
  545. BFirstRow bool
  546. RowNum int //行数
  547. ColNum int //列数
  548. TDNum int //td个数
  549. BPackage bool //是否有包
  550. SortKV *SortMap //带排序的KV值
  551. StandKV map[string][]*u.Tag //过滤后的标准化kv
  552. StandRuleKV map[string]string //过滤后的规则kv
  553. kvscope map[int]map[int][]*TD //sortkey第几个元素的的第几个值的结束位置
  554. kTD map[int]*TD //根据索引找到key的TD元素
  555. SonTables []*Table //孩子表集合
  556. Tag string //表格的标签
  557. Desc string //表格描述内容
  558. Goquery *goquery.Selection //表格的goquery对象
  559. Html string //所属的文本内容
  560. BlockPackage *SortMap //子包数组
  561. TableResult *TableResult //父元素
  562. StartAndEndRation map[string]*TDRationScope //同行或同列的概率,截断的单独起算
  563. StartAndEndRationKSort *SortMap
  564. WinnerOrder []map[string]interface{}
  565. BSplit bool //是否是有一个表拆分成的多个表
  566. BHeader bool //拆分表是否有表头
  567. BrandData [][]map[string]string //品牌抽取结果
  568. HasKey int //有key
  569. HasBrand int //有品牌
  570. HasGoods int //有商品
  571. }
  572. func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Table {
  573. return &Table{
  574. Html: Html,
  575. SortKV: NewSortMap(),
  576. StandKV: map[string][]*u.Tag{},
  577. kvscope: map[int]map[int][]*TD{},
  578. kTD: map[int]*TD{},
  579. SonTables: []*Table{},
  580. Goquery: tab,
  581. TRs: []*TR{},
  582. TableResult: TableResult,
  583. StartAndEndRation: map[string]*TDRationScope{},
  584. StartAndEndRationKSort: NewSortMap(),
  585. BlockPackage: NewSortMap(),
  586. }
  587. }
  588. func (t *Table) AddTR(tr *TR) {
  589. if len(tr.TDs) > 0 {
  590. if len(t.TRs) > 0 {
  591. tr.TopTR = t.TRs[len(t.TRs)-1]
  592. t.TRs[len(t.TRs)-1].BottomTR = tr
  593. }
  594. tr.RowPos = len(t.TRs)
  595. t.TRs = append(t.TRs, tr)
  596. }
  597. }
  598. func (t *Table) InsertTR(tr *TR) {
  599. if len(tr.TDs) > 0 {
  600. if len(t.TRs) > 0 {
  601. t.TRs[0].TopTR = tr
  602. }
  603. tr.RowPos = 0
  604. for _, _tr := range t.TRs {
  605. _tr.RowPos += 1
  606. }
  607. t.TRs = append([]*TR{tr}, t.TRs...)
  608. }
  609. }
  610. //支持排序的map
  611. type SortMap struct {
  612. Index map[string]int
  613. Keys []string
  614. Map map[string]interface{}
  615. Lock sync.Mutex
  616. }
  617. //快速创建排序map
  618. func NewSortMap() *SortMap {
  619. return &SortMap{
  620. Index: map[string]int{},
  621. Keys: []string{},
  622. Map: map[string]interface{}{},
  623. }
  624. }
  625. //增加值
  626. var NullVal = regexp.MustCompile("^[/无,.。;、附]+$|^详见.{2,8}$|(详?见)?附(件|图)")
  627. func (s *SortMap) AddKey(key string, val interface{}) {
  628. //判断val
  629. // if v, ok := val.(string); ok && NullVal.ReplaceAllString(u.TrimLRSpace(v, ""), "") == "" {
  630. // return
  631. // }
  632. s.Lock.Lock()
  633. defer s.Lock.Unlock()
  634. //重复
  635. if s.Map[key] == nil {
  636. s.Index[key] = len(s.Keys)
  637. s.Keys = append(s.Keys, key)
  638. }
  639. s.Map[key] = val
  640. }
  641. //增加值
  642. func (s *SortMap) ReplaceKey(key string, val interface{}, replacekey string) {
  643. s.Lock.Lock()
  644. defer s.Lock.Unlock()
  645. //重复
  646. v := s.Index[replacekey]
  647. s.Index[key] = v
  648. delete(s.Index, replacekey)
  649. s.Keys = append(s.Keys[:v], append([]string{key}, s.Keys[v+1:]...)...)
  650. delete(s.Map, replacekey)
  651. s.Map[key] = val
  652. }
  653. //删除值
  654. func (s *SortMap) RemoveKey(key string) {
  655. s.Lock.Lock()
  656. defer s.Lock.Unlock()
  657. delete(s.Map, key)
  658. pos := s.Index[key]
  659. delete(s.Index, key)
  660. if len(s.Keys) > 0 {
  661. s.Keys = func() []string {
  662. newkeys := []string{}
  663. if len(s.Keys) > 1 {
  664. if pos == 0 {
  665. newkeys = append(newkeys, s.Keys[1:]...)
  666. //每一个都减一
  667. for k, v := range s.Index {
  668. s.Index[k] = v - 1
  669. }
  670. } else if pos == len(s.Keys) {
  671. newkeys = append(newkeys, s.Keys[:pos]...)
  672. } else if len(s.Keys) > 1 {
  673. tmp := s.Keys[pos+1:]
  674. newkeys = append(append(newkeys, s.Keys[:pos]...), tmp...)
  675. for _, v := range tmp {
  676. s.Index[v] -= 1
  677. }
  678. }
  679. }
  680. return newkeys
  681. }()
  682. }
  683. }
  684. //判断表头是key的对象
  685. type TableKeyV1 struct {
  686. TMap map[string]interface{}
  687. TReg []*regexp.Regexp
  688. TRegReplStr []string
  689. }
  690. //判断表头时用到的顺序 正文、结果表头、正常表头
  691. var THeadStr = []string{
  692. "con",
  693. "jghead",
  694. "normalhead",
  695. }
  696. //存放敏感词
  697. var TKMaps = map[string]*TableKeyV1{}
  698. //过滤所有非汉字内容
  699. var filterThText = regexp.MustCompile("([((【\\[].*[))】\\]])|([^0-9a-zA-Z\\p{Han}]+)")
  700. var tLock = sync.Mutex{}
  701. //matchStro为tablev1.json文件中的key,txt为表格的内容也可以是表格的标签
  702. //主要实现表格是否是表头的判断,表格是否有用的判断(如人员情况等是无用的)
  703. func CheckCommon(txt string, matchStr ...string) (res, must bool, stype, reg, repl string) {
  704. txt = filterThText.ReplaceAllString(txt, "")
  705. stype = "con"
  706. if len([]rune(txt)) < 30 {
  707. tLock.Lock()
  708. defer tLock.Unlock()
  709. if len(TKMaps) == 0 {
  710. for k, v := range u.TableK1 {
  711. tk := &TableKeyV1{
  712. map[string]interface{}{},
  713. []*regexp.Regexp{},
  714. []string{},
  715. }
  716. thMap := map[string]interface{}{}
  717. for _, v1 := range v {
  718. v1s := strings.Split(v1, "__")
  719. if len(v1s) == 2 {
  720. tk.TReg = append(tk.TReg, regexp.MustCompile(v1s[0]))
  721. tk.TRegReplStr = append(tk.TRegReplStr, v1s[1])
  722. } else {
  723. key := v1
  724. nowMap := &thMap
  725. for i := 0; i < len(key); i++ {
  726. kc := key[i : i+1]
  727. if v, ok := (*nowMap)[kc]; ok {
  728. nowMap, _ = v.(*map[string]interface{})
  729. } else {
  730. newMap := map[string]interface{}{}
  731. newMap["Y"] = "0"
  732. (*nowMap)[kc] = &newMap
  733. nowMap = &newMap
  734. }
  735. if i == len(key)-1 {
  736. (*nowMap)["Y"] = "1"
  737. (*nowMap)["K"] = key
  738. //(*nowMap)["V"] = v
  739. }
  740. }
  741. }
  742. }
  743. tk.TMap = thMap
  744. TKMaps[k] = tk
  745. }
  746. }
  747. //先正则、后子串查找
  748. L1:
  749. for _, v := range matchStr {
  750. //u.Debug(v)
  751. for n, vreg := range TKMaps[v].TReg {
  752. if vreg.MatchString(txt) {
  753. //u.Debug(txt, v, vreg.String())
  754. reg = vreg.String()
  755. repl = TKMaps[v].TRegReplStr[n]
  756. if v != "con" {
  757. res = true
  758. if "M" == repl {
  759. must = true
  760. }
  761. }
  762. stype = v
  763. break L1
  764. }
  765. }
  766. //以下是敏感词子串查找匹配
  767. pos := 0
  768. thMap := TKMaps[v].TMap
  769. nowMap := &thMap
  770. for i := 0; i < len(txt); i++ {
  771. word := txt[i : i+1]
  772. nowMap, _ = (*nowMap)[word].(*map[string]interface{})
  773. if nowMap != nil { // 存在,则判断是否为最后一个
  774. if pos == 0 {
  775. pos = i
  776. }
  777. if "1" == qutil.ObjToString((*nowMap)["Y"]) {
  778. if v != "con" {
  779. res = true
  780. }
  781. stype = v
  782. pos = 0
  783. break L1
  784. }
  785. } else {
  786. nowMap = &thMap
  787. if pos > 0 {
  788. i = pos
  789. pos = 0
  790. }
  791. }
  792. }
  793. }
  794. return
  795. } else {
  796. return
  797. }
  798. }
  799. //根据td中的内容验证表头,根据tablev1.json中配置的三种规则(含正则和子串查找算法)
  800. func CheckHeader(txt string) (res, must bool, stype, reg, repl string) {
  801. return CheckCommon(txt, THeadStr...)
  802. }
  803. /**
  804. 计算表格占比,返回表格数组、占比
  805. con 文本
  806. strtype 1全文 2块文本
  807. **/
  808. func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) {
  809. defer qutil.Catch()
  810. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  811. cons := doc.Text()
  812. tables := doc.Find("table")
  813. doc = nil
  814. if tables.Size() > 0 {
  815. tabs = []*goquery.Selection{}
  816. for i := 0; i < tables.Size(); i++ {
  817. tmpt := tables.Eq(i)
  818. b := false
  819. for j := 0; j < len(tabs); j++ {
  820. if tabs[j].Contains(tmpt.Get(0)) {
  821. b = true
  822. }
  823. }
  824. if !b {
  825. tabs = append(tabs, tmpt)
  826. }
  827. }
  828. tlen := 0
  829. for _, t := range tabs {
  830. tlen += len(t.Text())
  831. }
  832. ratio = float32(tlen) / float32(len(cons))
  833. }
  834. /**
  835. if ratio < float32(0.992) {
  836. //取出排除表格之外的文本
  837. txt =getTextAfterRemoveTable(con)
  838. }
  839. **/
  840. return
  841. }
  842. //纯文本
  843. func HtmlToText(con string) string {
  844. doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  845. //log.Println(doc2.Html())
  846. doc2.Find("tr").Each(func(i int, selection *goquery.Selection) {
  847. selection.AfterHtml(string(rune(10)))
  848. })
  849. //log.Println(doc2.Html())
  850. return doc2.Text()
  851. }
  852. //取出排除表格之外的文本
  853. func TextAfterRemoveTable(con string) string {
  854. doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  855. doc2.Find("table").Remove()
  856. return doc2.Text()
  857. }
  858. func HtmlAfterRemoveTable(con string) string {
  859. doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  860. doc2.Find("table").Remove()
  861. html, _ := doc2.Html()
  862. return html
  863. }
  864. func If(condition bool, trueVal, falseVal interface{}) interface{} {
  865. if condition {
  866. return trueVal
  867. }
  868. return falseVal
  869. }