// util.go (23 KB)
  1. // util
  2. package main
  3. import (
  4. "fmt"
  5. "log"
  6. "regexp"
  7. "sort"
  8. "strconv"
  9. "strings"
  10. "unicode"
  11. "golang.org/x/net/html"
  12. )
  13. var (
  14. //常见表头
  15. theadWordsList = append(theadWordsList_Item, theadPackWordsList...)
  16. theadWordsListCom = []string{
  17. "包段", "标段", "标包", "名称", "包号", "包段", "子包号", "子项", "中标人",
  18. "包件号", "包件代码", "包件编号", "分包编码", "分标编号", "分标编码", "合同段",
  19. "标的", "标项", "采购合同", "包件编号", "项目编号", "评价机构",
  20. "地点", "日期", "单位", "是否"}
  21. theadWordsListComReg = regexp.MustCompile("(" + strings.Join(theadWordsListCom, "|") + ")(?:[^0-9a-zA-Z]|$)")
  22. //分包必含表头
  23. theadPackWordsList = []string{
  24. "包段", "标段", "标段名称", "标包", "标包名称", "标包号", "包号", "包段", "子包号", "子标段名称", "子项", "子项名称",
  25. "包件号", "包件代码", "包件编号", "分包编码", "分包名称", "分标编号", "分标编码", "合同段", "包件名称",
  26. "标的", "标的名称", "标项", "标项名称", "采购合同", "标段(包)名称",
  27. "项目/包件编号", "项目编号", "服务名称", "项目名称"}
  28. theadPackWords = "(" + strings.Join(theadPackWordsList, "|") + ")(?:[^0-9a-zA-Z]|$)"
  29. theadPackWordsReg = regexp.MustCompile(theadPackWords)
  30. //候选人表头
  31. theadWords_order = "(包件号|标的|标段|候选人|供应商)"
  32. theadWordsReg_order = regexp.MustCompile(theadWords_order)
  33. //删除干扰数据
  34. delRowKeys = "未成交|未中标原因"
  35. delRowKeysReg = regexp.MustCompile(delRowKeys)
  36. //负向表头,用于剔除干扰表格
  37. reverseTheadKeys = map[string][]string{
  38. "bidlist": []string{"品牌", "规格型号", "数量", "单价", "报价得分", "总分", "专家"},
  39. //"spotcheck": []string{"项目名称", "抽取家数"},
  40. }
  41. //干扰内容清理
  42. clearKeys = []string{"承包(一|二|三|四)级", "开标(\\d)(室|厅)", "\\d+\\.\\d+", "\\d+(.{1,10}).(pdf|doc|zip|rar)",
  43. "([一二三四五六七八九十0-9]+次)", "\\d+g.{0,20}包", "\\d+包.{0,20}(纸|箱)", "标段\\d+/标包\\d+", "\\d+年",
  44. "(\\d{1,2})-(\\d{1,2})(段|包|标)", "[一二三四五六七八九十]、[一二三四五六七八九十](段|包|标)",
  45. }
  46. clearKeysBack = []string{"上一篇", "下一篇", "历史业绩",
  47. "候选人业绩", "候选人企业业绩", "候选人类似业绩", "企业类似项目业绩",
  48. "投标业绩", "投标人业绩", "企业业绩", "工程业绩", "设计单位业绩", "施工单位业绩",
  49. "单位业绩情况", "投标文件中载明的业绩情况", "质量标准:"}
  50. //干扰内容替换
  51. replaceMap = map[string]string{
  52. "服务项目": "",
  53. "标项目": "标",
  54. "总承包": "",
  55. "三安小区": "",
  56. "I": "Ⅰ",
  57. "—": "",
  58. }
  59. //联合体投标判断
  60. consortium = "(联合体牵头人|联合体成员[:: ].{5,30}(公司|院|大学|研究所))|(中标单位[:: ].{5,60}(联合体))"
  61. consortiumKeysReg = regexp.MustCompile(consortium)
  62. )
  63. // 判断是否有分包数据
  64. func TableIsPackage(tables *AllTablesData) (bool, int, int) {
  65. ispack := false
  66. tablesNumRows, tablesStrRows := 0, 0
  67. allCellVal := map[int]map[int][]string{}
  68. for kt, tv := range tables.Tables {
  69. allCellVal[kt] = getPackAllCellVal_v1(tv)
  70. //log.Println("allCellVal sss", kt, len(allCellVal), allCellVal)
  71. if len(allCellVal[kt]) < 1 {
  72. allCellVal[kt] = getPackAllCellVal_v2(tv)
  73. //log.Println("allCellVal", allCellVal)
  74. }
  75. for _, cellsVal := range allCellVal {
  76. // log.Println("cellsVal", cellsVal)
  77. for _, cells := range cellsVal {
  78. numKey := map[string]string{}
  79. strKey := map[string]string{}
  80. L:
  81. for _, cellVal := range cells {
  82. for _, word := range theadWordsList { //过滤 theadWordsList 中的词
  83. if strings.EqualFold(word, cellVal) {
  84. break L
  85. }
  86. }
  87. if val, err := strconv.Atoi(cellVal); err == nil {
  88. numKey[fmt.Sprint(val)] = cellVal
  89. } else {
  90. if len(cellVal) > 0 {
  91. strKey[cellVal] = cellVal
  92. }
  93. }
  94. }
  95. if tablesNumRows < len(numKey) {
  96. tablesNumRows = len(numKey)
  97. }
  98. if tablesStrRows < len(strKey) {
  99. tablesStrRows = len(strKey)
  100. }
  101. if blog {
  102. log.Println(kt, "numKey", numKey)
  103. log.Println(kt, "strKey", strKey)
  104. }
  105. }
  106. }
  107. if tablesStrRows > 1 && tablesNumRows > 1 ||
  108. tablesStrRows < 1 && tablesNumRows > 1 ||
  109. tablesStrRows > 1 && tablesNumRows < 1 {
  110. ispack = true
  111. }
  112. }
  113. return ispack, tablesNumRows, tablesStrRows
  114. }
  115. // row.HeaderRote > 50,提取分包特征值
  116. func getPackAllCellVal_v1(rows TableData) map[int][]string {
  117. //如果是标的物、评分、抽查列表,放弃解析
  118. bidTheadNum := 0
  119. if len(rows.Rows) > 0 {
  120. for _, theadKeys := range reverseTheadKeys {
  121. for _, v := range theadKeys {
  122. for _, cell := range rows.Rows[0].Row {
  123. if strings.EqualFold(v, cell.Text) {
  124. bidTheadNum++
  125. }
  126. }
  127. }
  128. }
  129. }
  130. if bidTheadNum > 1 {
  131. if blog {
  132. log.Println("标的物、评分、抽查列表,放弃解析")
  133. }
  134. return nil
  135. }
  136. cellIndex_keyVals := map[int][]string{}
  137. kcell := []int{}
  138. startAdd := false //开始取数标识
  139. startRows := 0 //开始取数据行
  140. L:
  141. for kr, row := range rows.Rows {
  142. cellOk := 0 //如果单元格数据有效值不足3项,跳过
  143. for _, cell := range row.Row {
  144. if len(cell.Text) > 0 {
  145. cellOk++
  146. }
  147. }
  148. if cellOk < 2 {
  149. continue
  150. }
  151. if startAdd { //开始提取数据,并非从第二行开始取数据
  152. for i, k := range kcell {
  153. if row.HeaderRote < 100 {
  154. if startRows == 0 {
  155. startRows = i
  156. }
  157. cellIndex_keyVals[k] = append(cellIndex_keyVals[k], row.Row[k].Text)
  158. }
  159. //如果已有数据,再次碰到行表头行放弃数据
  160. if startRows > 0 && row.HeaderRote > 0 {
  161. if blog {
  162. log.Println("中断", row.HeaderRote, row)
  163. }
  164. break L
  165. }
  166. }
  167. }
  168. if blog {
  169. log.Println("整行是表头v1 row", startAdd, cellOk, bidTheadNum, kr, row.HeaderRote, row.Row)
  170. }
  171. //首次获取行表头中 分包索引号
  172. if !startAdd && row.HeaderRote > 50 {
  173. for i, cell := range row.Row {
  174. for _, word := range theadPackWordsList {
  175. if strings.EqualFold(word, cell.Text) {
  176. // log.Println("word", i, word, strings.EqualFold(word, cell.Text))
  177. kcell = append(kcell, i)
  178. startAdd = true
  179. }
  180. }
  181. }
  182. }
  183. }
  184. return cellIndex_keyVals
  185. }
  186. // row.HeaderRote <= 50,提取分包特征值
  187. func getPackAllCellVal_v2(rows TableData) map[int][]string {
  188. //如果是标的物、评分、抽查列表,放弃解析
  189. bidTheadNum := 0
  190. if len(rows.Rows) > 0 {
  191. for _, theadKeys := range reverseTheadKeys {
  192. for _, v := range theadKeys {
  193. for _, cell := range rows.Rows[0].Row {
  194. if strings.EqualFold(v, cell.Text) {
  195. bidTheadNum++
  196. }
  197. }
  198. }
  199. }
  200. }
  201. if bidTheadNum > 1 {
  202. return nil
  203. }
  204. cellIndex_keyVals := map[int][]string{}
  205. L:
  206. for _, row := range rows.Rows {
  207. cellOk := 0 //如果单元格数据有效值不足3项,跳过
  208. for _, cell := range row.Row {
  209. if len(cell.Text) > 0 {
  210. cellOk++
  211. }
  212. }
  213. if cellOk < 3 {
  214. continue
  215. }
  216. if row.HeaderRote <= 50 {
  217. for i, cell := range row.Row {
  218. for _, word := range theadPackWordsList {
  219. if strings.EqualFold(word, cell.Text) {
  220. if len(row.Row) > i+1 {
  221. cellIndex_keyVals[0] = append(cellIndex_keyVals[0], row.Row[i+1].Text)
  222. break L
  223. //log.Println("ssss", word, row.Row[i+1].Text)
  224. }
  225. }
  226. }
  227. }
  228. }
  229. }
  230. return cellIndex_keyVals
  231. }
  232. func setRowsHeaderRote(tables *AllTablesData) *AllTablesData {
  233. //判断表头模式
  234. for k, table := range tables.Tables {
  235. for i, row := range table.Rows {
  236. rowLen := len(row.Row)
  237. rowHeardNum := 0
  238. for _, cell := range row.Row {
  239. if cell.IsHeader {
  240. rowHeardNum++
  241. }
  242. }
  243. if rowLen == rowHeardNum || rowHeardNum > rowLen/2 {
  244. row.HeaderRote = 100
  245. } else if rowLen%2 == 0 && rowHeardNum == rowLen/2 {
  246. row.HeaderRote = 50
  247. } else if rowHeardNum > 0 { //有表头个数不定
  248. row.HeaderRote = 1
  249. // log.Println("row.HeaderRote", row.HeaderRote, row)
  250. } else {
  251. row.HeaderRote = 0
  252. }
  253. table.Rows[i] = row
  254. // if blog {
  255. // log.Println("setRowsHeaderRote", row.HeaderRote, row.Row)
  256. // }
  257. }
  258. tables.Tables[k] = table
  259. }
  260. return tables
  261. }
  262. // 匹配<table>标签及其内容的正则表达式
  263. func removeTables(html string) string {
  264. re := regexp.MustCompile(`(?i)<table[^>]*>[\s\S]*?</table>`)
  265. html = re.ReplaceAllString(html, "")
  266. // re = regexp.MustCompile(`<[^>]*>`)
  267. // html = re.ReplaceAllString(html, "")
  268. return html
  269. }
  270. // 表格检测,检查表格是否存在及是否存在合并单元格
  271. func CheckTableMerged(htmlContent string) (hasTable bool, hasMerged bool, err error) {
  272. doc, err := html.Parse(strings.NewReader(htmlContent))
  273. if err != nil {
  274. return false, false, err
  275. }
  276. // 递归查找所有表格
  277. tables := findTables(doc)
  278. hasTable = len(tables) > 0
  279. // 检查所有表格中的合并单元格
  280. for _, table := range tables {
  281. if checkTableForMergedCells(table) {
  282. hasMerged = true
  283. break
  284. }
  285. }
  286. return hasTable, hasMerged, nil
  287. }
  288. func findTables(n *html.Node) []*html.Node { // 递归查找文档中的所有<table>元素
  289. var tables []*html.Node
  290. if n.Type == html.ElementNode && n.Data == "table" {
  291. tables = append(tables, n)
  292. }
  293. for c := n.FirstChild; c != nil; c = c.NextSibling {
  294. tables = append(tables, findTables(c)...)
  295. }
  296. return tables
  297. }
  298. func checkTableForMergedCells(table *html.Node) bool { //检查单个表格中是否存在合并单元格
  299. // 使用栈进行非递归深度优先遍历
  300. stack := []*html.Node{table}
  301. for len(stack) > 0 {
  302. node := stack[len(stack)-1]
  303. stack = stack[:len(stack)-1]
  304. // 遇到嵌套表格则跳过
  305. if node != table && node.Type == html.ElementNode && node.Data == "table" {
  306. continue
  307. }
  308. // 检查当前节点是否为单元格
  309. if node.Type == html.ElementNode && (node.Data == "td" || node.Data == "th") {
  310. if hasMergeAttribute(node) {
  311. return true
  312. }
  313. }
  314. // 将子节点逆序压入栈中
  315. for child := node.LastChild; child != nil; child = child.PrevSibling {
  316. stack = append(stack, child)
  317. }
  318. }
  319. return false
  320. }
  321. func hasMergeAttribute(cell *html.Node) bool { // 检查单元格是否包含合并属性
  322. for _, attr := range cell.Attr {
  323. if attr.Key == "rowspan" || attr.Key == "colspan" {
  324. // 尝试解析属性值为整数
  325. if val, err := strconv.Atoi(attr.Val); err == nil {
  326. if val > 1 {
  327. return true
  328. }
  329. }
  330. // 如果值无法解析为整数,但属性存在且非"1",也视为合并
  331. if attr.Val != "1" {
  332. return true
  333. }
  334. }
  335. }
  336. return false
  337. }
  338. // 替换文本数据
  339. func repalceString(input string, replace map[string]string) string {
  340. for k, v := range replace {
  341. input = strings.Replace(input, k, v, -1)
  342. }
  343. return input
  344. }
  345. // cleanWebText 删除包含指定关键词及其后续的所有内容
  346. func cleanWebText(input string, keywords, keywordsback []string) string {
  347. // 构建关键词正则表达式(使用OR连接)
  348. keywordPattern := strings.Join(keywordsback, "|")
  349. re, err := regexp.Compile(fmt.Sprintf(`(?s)(%s).*`, keywordPattern))
  350. if err != nil {
  351. return input // 正则编译失败时返回原始文本
  352. }
  353. input = re.ReplaceAllString(input, "")
  354. for _, v := range keywords {
  355. re, err = regexp.Compile(v)
  356. if err != nil {
  357. continue
  358. }
  359. input = re.ReplaceAllString(input, "")
  360. }
  361. return input
  362. }
  363. // 支持中文数字(零一二三四五六七八九十)、阿拉伯数字(0-9)、罗马数字(Ⅰ-Ⅻ)
  364. func convertNumerals(input string) string {
  365. // 字符映射表
  366. chineseNumMap := map[rune]rune{
  367. '零': '0', '一': '1', '二': '2', '三': '3', '四': '4',
  368. '五': '5', '六': '6', '七': '7', '八': '8', '九': '9',
  369. '十': '1', // 仅处理个位,十位需特殊处理
  370. }
  371. romanNumMap := map[rune]rune{
  372. 'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5',
  373. 'Ⅵ': '6', 'Ⅶ': '7', 'Ⅷ': '8', 'Ⅸ': '9', 'Ⅹ': '1', // 仅处理个位
  374. 'Ⅺ': '1', 'Ⅻ': '1', // 罗马数字11和12仅处理十位
  375. // 'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5',
  376. // 'F': '6', 'G': '7', 'H': '8', 'J': '9', 'K': '1',
  377. // 'a': '1', 'b': '2', 'c': '3', 'd': '4', 'e': '5',
  378. // 'f': '6', 'g': '7', 'h': '8', 'j': '9', 'k': '1',
  379. }
  380. var result strings.Builder
  381. var result1 strings.Builder
  382. for _, char := range input {
  383. // 检查阿拉伯数字
  384. if char >= '0' && char <= '9' {
  385. result1.WriteRune(char)
  386. continue
  387. }
  388. }
  389. var result2 strings.Builder
  390. for _, char := range input {
  391. // 检查中文数字
  392. if num, exists := chineseNumMap[char]; exists {
  393. result2.WriteRune(num)
  394. continue
  395. }
  396. }
  397. var result3 strings.Builder
  398. for _, char := range input {
  399. // 检查罗马数字
  400. if num, exists := romanNumMap[char]; exists {
  401. result3.WriteRune(num)
  402. continue
  403. }
  404. }
  405. if result1.Len() > result.Len() {
  406. result = result1
  407. }
  408. if result2.Len() > result.Len() {
  409. result = result2
  410. }
  411. if result3.Len() > result.Len() {
  412. result = result3
  413. }
  414. return result.String()
  415. }
  416. // Unicode判断工具函数
  417. func isChineseRune(r rune) bool {
  418. // 基础汉字检测
  419. if r >= 0x4E00 && r <= 0x9FFF {
  420. return true
  421. }
  422. // CJK符号和标点
  423. if r >= 0x3000 && r <= 0x303F {
  424. return true
  425. }
  426. // 全角符号(过滤字母数字)
  427. if r >= 0xFF00 && r <= 0xFFEF {
  428. // 排除全角字母
  429. if (r >= 0xFF21 && r <= 0xFF3A) || // 大写字母
  430. (r >= 0xFF41 && r <= 0xFF5A) { // 小写字母
  431. return false
  432. }
  433. // 排除全角数字
  434. if r >= 0xFF10 && r <= 0xFF19 {
  435. return false
  436. }
  437. return true
  438. }
  439. // 特殊符号检测
  440. switch r {
  441. case 0x2018, 0x2019, 0x201C, 0x201D, // 引号
  442. 0x2014, 0x2026, // 破折号、省略号
  443. 0x3010, 0x3011, // 【】
  444. 0x3008, 0x3009, 0x300A, 0x300B: // 《》〈〉
  445. return true
  446. }
  447. return false
  448. }
  449. // CellData 存储单元格数据
  450. type CellData struct {
  451. Text string `json:"text"` // 单元格文本内容
  452. IsHeader bool `json:"isHeader"` // 是否为表头单元格
  453. }
  454. // RowData 存储单元格数据
  455. type RowData struct {
  456. Row []CellData `json:"row"` // 行数据
  457. HeaderRote int `json:"isHeader"` // 表头权重 100 50 0
  458. }
  459. // TableData 存储表格的行列数据
  460. type TableData struct {
  461. Rows []RowData `json:"rows"` // 表格行数据
  462. NestedLevel int `json:"nestedLevel"` // 表格的嵌套层级,0表示顶层表格
  463. ChildTables []int `json:"childTables"` // 子表格的索引列表
  464. ParentIndex int `json:"parentIndex"` // 父表格的索引,-1表示没有父表格
  465. HasMerged bool `json:"hasMerged"` // 是否存在合并单元格
  466. }
  467. // AllTablesData 存储所有表格的数据
  468. type AllTablesData struct {
  469. Tables []TableData `json:"tables"`
  470. }
  471. // TableExtractDatas 解析HTML中的表格,返回Markdown格式和所有表格的行列数据
  472. func TableExtractDatas(htmlStr string) (string, *AllTablesData, error) {
  473. doc, err := html.Parse(strings.NewReader(htmlStr))
  474. if err != nil {
  475. return "", nil, err
  476. }
  477. allTablesData := &AllTablesData{}
  478. var markdownBuilder strings.Builder
  479. tableIndex := 0
  480. // 递归解析表格
  481. var parseNode func(*html.Node, int, int)
  482. parseNode = func(n *html.Node, level int, parentIdx int) {
  483. if n.Type == html.ElementNode && n.Data == "table" {
  484. // 记录当前表格的父索引
  485. currentParent := parentIdx
  486. currentIndex := tableIndex
  487. // 解析当前表格
  488. tableMarkdown, tableData := parseTable(n, level, currentIndex, currentParent)
  489. tableData.NestedLevel = level
  490. tableData.ParentIndex = currentParent
  491. // 添加到结果集
  492. allTablesData.Tables = append(allTablesData.Tables, tableData)
  493. markdownBuilder.WriteString(tableMarkdown)
  494. markdownBuilder.WriteString("\n\n")
  495. // 更新父表格的子表格列表
  496. if currentParent != -1 {
  497. parentTable := &allTablesData.Tables[currentParent]
  498. parentTable.ChildTables = append(parentTable.ChildTables, currentIndex)
  499. }
  500. // 增加表格索引
  501. tableIndex++
  502. // 递归处理子节点(使用新的父索引)
  503. for c := n.FirstChild; c != nil; c = c.NextSibling {
  504. parseNode(c, level+1, currentIndex)
  505. }
  506. return
  507. }
  508. // 递归处理其他节点
  509. for c := n.FirstChild; c != nil; c = c.NextSibling {
  510. parseNode(c, level, parentIdx)
  511. }
  512. }
  513. // 从文档根节点开始解析
  514. parseNode(doc, 0, -1)
  515. return markdownBuilder.String(), allTablesData, nil
  516. }
  517. // 获取单元格的rowspan和colspan属性
  518. func getSpanTable(cell *html.Node) (int, int) {
  519. rowspan, colspan := 1, 1
  520. for _, attr := range cell.Attr {
  521. switch attr.Key {
  522. case "rowspan":
  523. if val, err := strconv.Atoi(attr.Val); err == nil && val > 1 {
  524. rowspan = val
  525. }
  526. case "colspan":
  527. if val, err := strconv.Atoi(attr.Val); err == nil && val > 1 {
  528. colspan = val
  529. }
  530. }
  531. }
  532. return rowspan, colspan
  533. }
  534. // 解析单个表格
  535. func parseTable(tableNode *html.Node, level int, currentIndex int, parentIndex int) (string, TableData) {
  536. var tableData TableData
  537. tableData.ParentIndex = parentIndex
  538. tableData.HasMerged = false
  539. // 使用网格(grid)构建表格结构
  540. var grid []RowData
  541. rows := getTableRows(tableNode)
  542. // 处理表格行
  543. for rowIdx, row := range rows {
  544. // 扩展grid到当前行
  545. if rowIdx >= len(grid) {
  546. grid = append(grid, RowData{})
  547. }
  548. // 跳过已被合并单元格占用的位置
  549. col := 0
  550. for col < len(grid[rowIdx].Row) && grid[rowIdx].Row[col].Text != "" {
  551. col++
  552. }
  553. cells := getRowCells(row)
  554. for _, cell := range cells {
  555. // 获取单元格的跨行跨列属性
  556. rowspan, colspan := getSpanTable(cell)
  557. if rowspan > 1 || colspan > 1 {
  558. tableData.HasMerged = true
  559. }
  560. // 确保grid有足够的行
  561. for len(grid) < rowIdx+rowspan {
  562. grid = append(grid, RowData{})
  563. }
  564. // 确保所有相关行有足够的列
  565. targetCol := col + colspan
  566. for r := rowIdx; r < rowIdx+rowspan; r++ {
  567. if len(grid[r].Row) < targetCol {
  568. // 扩展行
  569. newRow := make([]CellData, targetCol)
  570. copy(newRow, grid[r].Row)
  571. grid[r].Row = newRow
  572. }
  573. }
  574. // 提取单元格文本并确定是否为表头
  575. text := extractCellText(cell)
  576. text = RemoveAllSpaces(text)
  577. isHeader := cell.Data == "th"
  578. //根据text内容和常见关键词判断,是否是表头
  579. if !isHeader && len([]rune(text)) < 20 {
  580. //核心词走正则匹配
  581. if theadPackWordsReg.MatchString(text) {
  582. isHeader = true
  583. }
  584. if !isHeader {
  585. isHeader = theadWordsListComReg.MatchString(text)
  586. }
  587. //非核心词,走EqualFold匹配
  588. if !isHeader {
  589. for _, word := range theadWordsList {
  590. if strings.EqualFold(word, text) {
  591. isHeader = true
  592. break
  593. }
  594. }
  595. }
  596. }
  597. // log.Println("cellData IsHeader", isHeader, text)
  598. cellData := CellData{Text: text, IsHeader: isHeader}
  599. // 将单元格数据填充到所有合并位置
  600. for r := 0; r < rowspan; r++ {
  601. for c := 0; c < colspan; c++ {
  602. grid[rowIdx+r].Row[col+c] = cellData
  603. }
  604. }
  605. // 移动到下一列位置
  606. col += colspan
  607. }
  608. }
  609. // 设置最终的行数据
  610. tableData.Rows = grid
  611. // 生成Markdown表格 (保持原有逻辑,只使用文本内容)
  612. markdown := generateMarkdownTable(grid, level)
  613. return markdown, tableData
  614. }
  615. // 生成Markdown格式的表格 (只使用单元格文本)
  616. func generateMarkdownTable(grid []RowData, level int) string {
  617. if len(grid) == 0 {
  618. return ""
  619. }
  620. var builder strings.Builder
  621. builder.WriteString(fmt.Sprintf("### Table at level %d\n\n", level))
  622. // 添加表头
  623. for i, row := range grid {
  624. builder.WriteString("| ")
  625. for j, cell := range row.Row {
  626. text := cell.Text
  627. if text == "" {
  628. builder.WriteString(" ")
  629. } else {
  630. builder.WriteString(text)
  631. }
  632. if j < len(row.Row)-1 {
  633. builder.WriteString(" | ")
  634. }
  635. }
  636. builder.WriteString(" |\n")
  637. // 添加表头分隔线
  638. if i == 0 {
  639. builder.WriteString("|")
  640. for j := 0; j < len(row.Row); j++ {
  641. builder.WriteString(" --- |")
  642. }
  643. builder.WriteString("\n")
  644. }
  645. }
  646. return builder.String()
  647. }
  648. // 获取表格中的所有行
  649. func getTableRows(tableNode *html.Node) []*html.Node {
  650. var rows []*html.Node
  651. var traverse func(*html.Node)
  652. traverse = func(n *html.Node) {
  653. if n.Type == html.ElementNode {
  654. switch n.Data {
  655. case "tr":
  656. rows = append(rows, n)
  657. case "thead", "tbody", "tfoot", "table":
  658. // 继续遍历
  659. for c := n.FirstChild; c != nil; c = c.NextSibling {
  660. traverse(c)
  661. }
  662. }
  663. }
  664. }
  665. for c := tableNode.FirstChild; c != nil; c = c.NextSibling {
  666. traverse(c)
  667. }
  668. return rows
  669. }
  670. // 获取行中的所有单元格
  671. func getRowCells(rowNode *html.Node) []*html.Node {
  672. var cells []*html.Node
  673. for c := rowNode.FirstChild; c != nil; c = c.NextSibling {
  674. if c.Type == html.ElementNode && (c.Data == "td" || c.Data == "th") {
  675. cells = append(cells, c)
  676. }
  677. }
  678. return cells
  679. }
  680. // 提取单元格文本
  681. func extractCellText(cellNode *html.Node) string {
  682. var textBuilder strings.Builder
  683. var extract func(*html.Node)
  684. extract = func(n *html.Node) {
  685. if n.Type == html.TextNode {
  686. textBuilder.WriteString(strings.TrimSpace(n.Data))
  687. textBuilder.WriteString(" ")
  688. } else if n.Type == html.ElementNode {
  689. // 跳过嵌套表格
  690. if n.Data != "table" {
  691. for c := n.FirstChild; c != nil; c = c.NextSibling {
  692. extract(c)
  693. }
  694. } else {
  695. textBuilder.WriteString("[Table]")
  696. }
  697. }
  698. }
  699. for c := cellNode.FirstChild; c != nil; c = c.NextSibling {
  700. extract(c)
  701. }
  702. // 清理文本
  703. result := strings.TrimSpace(textBuilder.String())
  704. if result == "" {
  705. return " "
  706. }
  707. return result
  708. }
  709. // 提取文本汉字
  710. func GetChineseText(htmlContent string) string {
  711. // 移除HTML标签
  712. reHTML := regexp.MustCompile("<[^>]*>")
  713. cleanText := reHTML.ReplaceAllString(htmlContent, "")
  714. // 提取汉字(Unicode范围:\u4e00-\u9fa5)
  715. reChinese := regexp.MustCompile("[\u4e00-\u9fa5]")
  716. chineseChars := reChinese.FindAllString(cleanText, -1)
  717. return strings.Join(chineseChars, "")
  718. }
  719. // RemoveAllSpaces 移除字符串中的所有空白字符
  720. func RemoveAllSpaces(s string) string {
  721. // 使用 strings.Builder 高效构建新字符串
  722. var b strings.Builder
  723. b.Grow(len(s)) // 预分配空间,提高性能
  724. // 遍历字符串的每个字符
  725. for _, r := range s {
  726. // 如果不是空白字符,则添加到结果中
  727. if !unicode.IsSpace(r) {
  728. b.WriteRune(r)
  729. }
  730. }
  731. return b.String()
  732. }
  733. // 按字符串长度排序的类型
  734. type ByLength []string
  735. func (s ByLength) Len() int { return len(s) }
  736. func (s ByLength) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
  737. func (s ByLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
  738. // 分组函数
  739. func groupStrings(strings []string) [][]string {
  740. if len(strings) == 0 {
  741. return nil
  742. }
  743. // 按长度排序
  744. sorted := make(ByLength, len(strings))
  745. copy(sorted, strings)
  746. sort.Sort(sorted)
  747. var groups [][]string
  748. currentGroup := []string{sorted[0]}
  749. currentMaxLen := len(sorted[0])
  750. // 遍历排序后的字符串,构建分组
  751. for i := 1; i < len(sorted); i++ {
  752. currentLen := len(sorted[i])
  753. if currentLen-currentMaxLen <= 2 {
  754. // 当前字符串可以加入当前组
  755. currentGroup = append(currentGroup, sorted[i])
  756. } else {
  757. // 创建新组
  758. groups = append(groups, currentGroup)
  759. currentGroup = []string{sorted[i]}
  760. currentMaxLen = currentLen
  761. }
  762. }
  763. // 添加最后一个组
  764. groups = append(groups, currentGroup)
  765. return groups
  766. }
  767. // 判断是否是联合体中标
  768. func isConsortiumKeysReg(content string) bool {
  769. return consortiumKeysReg.MatchString(content)
  770. }