12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852 |
- // util
- package main
- import (
- "fmt"
- "log"
- "regexp"
- "sort"
- "strconv"
- "strings"
- "unicode"
- "golang.org/x/net/html"
- )
- var (
- // Common table-header words: theadWordsList_Item (declared outside this block)
- // followed by every entry of theadPackWordsList.
- theadWordsList = append(theadWordsList_Item, theadPackWordsList...)
- // Generic header keywords. A cell is treated as a header when one of these
- // words is followed by a non-alphanumeric character or the end of the text
- // (see theadWordsListComReg).
- // NOTE(review): "包段" and "包件编号" are listed twice; harmless in a regex alternation.
- theadWordsListCom = []string{
- "包段", "标段", "标包", "名称", "包号", "包段", "子包号", "子项", "中标人",
- "包件号", "包件代码", "包件编号", "分包编码", "分标编号", "分标编码", "合同段",
- "标的", "标项", "采购合同", "包件编号", "项目编号", "评价机构",
- "地点", "日期", "单位", "是否"}
- theadWordsListComReg = regexp.MustCompile("(" + strings.Join(theadWordsListCom, "|") + ")(?:[^0-9a-zA-Z]|$)")
- // Header words that a package (lot / bid-section) table must contain.
- // NOTE(review): "包段" is listed twice, so exact-match loops over this list
- // hit the same cell twice.
- theadPackWordsList = []string{
- "包段", "标段", "标段名称", "标包", "标包名称", "标包号", "包号", "包段", "子包号", "子标段名称", "子项", "子项名称",
- "包件号", "包件代码", "包件编号", "分包编码", "分包名称", "分标编号", "分标编码", "合同段", "包件名称",
- "标的", "标的名称", "标项", "标项名称", "采购合同", "标段(包)名称",
- "项目/包件编号", "项目编号", "服务名称", "项目名称"}
- theadPackWords = "(" + strings.Join(theadPackWordsList, "|") + ")(?:[^0-9a-zA-Z]|$)"
- theadPackWordsReg = regexp.MustCompile(theadPackWords)
- // Header words of candidate (bidder ranking) tables.
- theadWords_order = "(包件号|标的|标段|候选人|供应商)"
- theadWordsReg_order = regexp.MustCompile(theadWords_order)
- // Keywords marking rows to drop as noise ("not awarded" / "reason for not winning").
- delRowKeys = "未成交|未中标原因"
- delRowKeysReg = regexp.MustCompile(delRowKeys)
- // Negative header words, used to reject tables that are not package lists
- // (item lists, scoring sheets, ...). Keyed by table category.
- reverseTheadKeys = map[string][]string{
- "bidlist": []string{"品牌", "规格型号", "数量", "单价", "报价得分", "总分", "专家"},
- //"spotcheck": []string{"项目名称", "抽取家数"},
- }
- // Regular expressions for noise content to strip from text.
- // NOTE(review): in the fourth pattern the dot before (pdf|doc|zip|rar) is
- // unescaped and matches any character — confirm whether `\\.` was intended.
- clearKeys = []string{"承包(一|二|三|四)级", "开标(\\d)(室|厅)", "\\d+\\.\\d+", "\\d+(.{1,10}).(pdf|doc|zip|rar)",
- "([一二三四五六七八九十0-9]+次)", "\\d+g.{0,20}包", "\\d+包.{0,20}(纸|箱)", "标段\\d+/标包\\d+", "\\d+年",
- "(\\d{1,2})-(\\d{1,2})(段|包|标)", "[一二三四五六七八九十]、[一二三四五六七八九十](段|包|标)",
- }
- // Section keywords ("previous article", "past performance", ...); presumably
- // passed to cleanWebText as keywordsback so that everything from the first
- // occurrence onwards is removed — verify against callers.
- clearKeysBack = []string{"上一篇", "下一篇", "历史业绩",
- "候选人业绩", "候选人企业业绩", "候选人类似业绩", "企业类似项目业绩",
- "投标业绩", "投标人业绩", "企业业绩", "工程业绩", "设计单位业绩", "施工单位业绩",
- "单位业绩情况", "投标文件中载明的业绩情况", "质量标准:"}
- // Literal substitutions for noise content (old text -> new text).
- replaceMap = map[string]string{
- "服务项目": "",
- "标项目": "标",
- "总承包": "",
- "三安小区": "",
- "I": "Ⅰ",
- "—": "",
- }
- // Pattern used to detect consortium (joint-venture) bids.
- consortium = "(联合体牵头人|联合体成员[:: ].{5,30}(公司|院|大学|研究所))|(中标单位[:: ].{5,60}(联合体))"
- consortiumKeysReg = regexp.MustCompile(consortium)
- )
- // 判断是否有分包数据
- func TableIsPackage(tables *AllTablesData) (bool, int, int) {
- ispack := false
- tablesNumRows, tablesStrRows := 0, 0
- allCellVal := map[int]map[int][]string{}
- for kt, tv := range tables.Tables {
- allCellVal[kt] = getPackAllCellVal_v1(tv)
- //log.Println("allCellVal sss", kt, len(allCellVal), allCellVal)
- if len(allCellVal[kt]) < 1 {
- allCellVal[kt] = getPackAllCellVal_v2(tv)
- //log.Println("allCellVal", allCellVal)
- }
- for _, cellsVal := range allCellVal {
- // log.Println("cellsVal", cellsVal)
- for _, cells := range cellsVal {
- numKey := map[string]string{}
- strKey := map[string]string{}
- L:
- for _, cellVal := range cells {
- for _, word := range theadWordsList { //过滤 theadWordsList 中的词
- if strings.EqualFold(word, cellVal) {
- break L
- }
- }
- if val, err := strconv.Atoi(cellVal); err == nil {
- numKey[fmt.Sprint(val)] = cellVal
- } else {
- if len(cellVal) > 0 {
- strKey[cellVal] = cellVal
- }
- }
- }
- if tablesNumRows < len(numKey) {
- tablesNumRows = len(numKey)
- }
- if tablesStrRows < len(strKey) {
- tablesStrRows = len(strKey)
- }
- if blog {
- log.Println(kt, "numKey", numKey)
- log.Println(kt, "strKey", strKey)
- }
- }
- }
- if tablesStrRows > 1 && tablesNumRows > 1 ||
- tablesStrRows < 1 && tablesNumRows > 1 ||
- tablesStrRows > 1 && tablesNumRows < 1 {
- ispack = true
- }
- }
- return ispack, tablesNumRows, tablesStrRows
- }
- // row.HeaderRote > 50,提取分包特征值
- func getPackAllCellVal_v1(rows TableData) map[int][]string {
- //如果是标的物、评分、抽查列表,放弃解析
- bidTheadNum := 0
- if len(rows.Rows) > 0 {
- for _, theadKeys := range reverseTheadKeys {
- for _, v := range theadKeys {
- for _, cell := range rows.Rows[0].Row {
- if strings.EqualFold(v, cell.Text) {
- bidTheadNum++
- }
- }
- }
- }
- }
- if bidTheadNum > 1 {
- if blog {
- log.Println("标的物、评分、抽查列表,放弃解析")
- }
- return nil
- }
- cellIndex_keyVals := map[int][]string{}
- kcell := []int{}
- startAdd := false //开始取数标识
- startRows := 0 //开始取数据行
- L:
- for kr, row := range rows.Rows {
- cellOk := 0 //如果单元格数据有效值不足3项,跳过
- for _, cell := range row.Row {
- if len(cell.Text) > 0 {
- cellOk++
- }
- }
- if cellOk < 2 {
- continue
- }
- if startAdd { //开始提取数据,并非从第二行开始取数据
- for i, k := range kcell {
- if row.HeaderRote < 100 {
- if startRows == 0 {
- startRows = i
- }
- cellIndex_keyVals[k] = append(cellIndex_keyVals[k], row.Row[k].Text)
- }
- //如果已有数据,再次碰到行表头行放弃数据
- if startRows > 0 && row.HeaderRote > 0 {
- if blog {
- log.Println("中断", row.HeaderRote, row)
- }
- break L
- }
- }
- }
- if blog {
- log.Println("整行是表头v1 row", startAdd, cellOk, bidTheadNum, kr, row.HeaderRote, row.Row)
- }
- //首次获取行表头中 分包索引号
- if !startAdd && row.HeaderRote > 50 {
- for i, cell := range row.Row {
- for _, word := range theadPackWordsList {
- if strings.EqualFold(word, cell.Text) {
- // log.Println("word", i, word, strings.EqualFold(word, cell.Text))
- kcell = append(kcell, i)
- startAdd = true
- }
- }
- }
- }
- }
- return cellIndex_keyVals
- }
- // row.HeaderRote <= 50,提取分包特征值
- func getPackAllCellVal_v2(rows TableData) map[int][]string {
- //如果是标的物、评分、抽查列表,放弃解析
- bidTheadNum := 0
- if len(rows.Rows) > 0 {
- for _, theadKeys := range reverseTheadKeys {
- for _, v := range theadKeys {
- for _, cell := range rows.Rows[0].Row {
- if strings.EqualFold(v, cell.Text) {
- bidTheadNum++
- }
- }
- }
- }
- }
- if bidTheadNum > 1 {
- return nil
- }
- cellIndex_keyVals := map[int][]string{}
- L:
- for _, row := range rows.Rows {
- cellOk := 0 //如果单元格数据有效值不足3项,跳过
- for _, cell := range row.Row {
- if len(cell.Text) > 0 {
- cellOk++
- }
- }
- if cellOk < 3 {
- continue
- }
- if row.HeaderRote <= 50 {
- for i, cell := range row.Row {
- for _, word := range theadPackWordsList {
- if strings.EqualFold(word, cell.Text) {
- if len(row.Row) > i+1 {
- cellIndex_keyVals[0] = append(cellIndex_keyVals[0], row.Row[i+1].Text)
- break L
- //log.Println("ssss", word, row.Row[i+1].Text)
- }
- }
- }
- }
- }
- }
- return cellIndex_keyVals
- }
- func setRowsHeaderRote(tables *AllTablesData) *AllTablesData {
- //判断表头模式
- for k, table := range tables.Tables {
- for i, row := range table.Rows {
- rowLen := len(row.Row)
- rowHeardNum := 0
- for _, cell := range row.Row {
- if cell.IsHeader {
- rowHeardNum++
- }
- }
- if rowLen == rowHeardNum || rowHeardNum > rowLen/2 {
- row.HeaderRote = 100
- } else if rowLen%2 == 0 && rowHeardNum == rowLen/2 {
- row.HeaderRote = 50
- } else if rowHeardNum > 0 { //有表头个数不定
- row.HeaderRote = 1
- // log.Println("row.HeaderRote", row.HeaderRote, row)
- } else {
- row.HeaderRote = 0
- }
- table.Rows[i] = row
- // if blog {
- // log.Println("setRowsHeaderRote", row.HeaderRote, row.Row)
- // }
- }
- tables.Tables[k] = table
- }
- return tables
- }
// removeTablesReg matches a <table ...> opening tag (case-insensitive) together
// with everything up to the nearest closing </table> tag. It is compiled once
// instead of on every removeTables call.
var removeTablesReg = regexp.MustCompile(`(?i)<table[^>]*>[\s\S]*?</table>`)

// removeTables deletes every <table>...</table> span from html and returns the
// remaining markup.
//
// The match is lazy, so for nested tables the span ends at the first closing
// tag and the tail of the outer table is left in the result.
func removeTables(html string) string {
	return removeTablesReg.ReplaceAllString(html, "")
}
- // 表格检测,检查表格是否存在及是否存在合并单元格
- func CheckTableMerged(htmlContent string) (hasTable bool, hasMerged bool, err error) {
- doc, err := html.Parse(strings.NewReader(htmlContent))
- if err != nil {
- return false, false, err
- }
- // 递归查找所有表格
- tables := findTables(doc)
- hasTable = len(tables) > 0
- // 检查所有表格中的合并单元格
- for _, table := range tables {
- if checkTableForMergedCells(table) {
- hasMerged = true
- break
- }
- }
- return hasTable, hasMerged, nil
- }
- func findTables(n *html.Node) []*html.Node { // 递归查找文档中的所有<table>元素
- var tables []*html.Node
- if n.Type == html.ElementNode && n.Data == "table" {
- tables = append(tables, n)
- }
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- tables = append(tables, findTables(c)...)
- }
- return tables
- }
- func checkTableForMergedCells(table *html.Node) bool { //检查单个表格中是否存在合并单元格
- // 使用栈进行非递归深度优先遍历
- stack := []*html.Node{table}
- for len(stack) > 0 {
- node := stack[len(stack)-1]
- stack = stack[:len(stack)-1]
- // 遇到嵌套表格则跳过
- if node != table && node.Type == html.ElementNode && node.Data == "table" {
- continue
- }
- // 检查当前节点是否为单元格
- if node.Type == html.ElementNode && (node.Data == "td" || node.Data == "th") {
- if hasMergeAttribute(node) {
- return true
- }
- }
- // 将子节点逆序压入栈中
- for child := node.LastChild; child != nil; child = child.PrevSibling {
- stack = append(stack, child)
- }
- }
- return false
- }
- func hasMergeAttribute(cell *html.Node) bool { // 检查单元格是否包含合并属性
- for _, attr := range cell.Attr {
- if attr.Key == "rowspan" || attr.Key == "colspan" {
- // 尝试解析属性值为整数
- if val, err := strconv.Atoi(attr.Val); err == nil {
- if val > 1 {
- return true
- }
- }
- // 如果值无法解析为整数,但属性存在且非"1",也视为合并
- if attr.Val != "1" {
- return true
- }
- }
- }
- return false
- }
// repalceString applies every old -> new substitution of replace to input and
// returns the result.
//
// Go randomises map iteration, so applying the pairs in map order made the
// result vary between runs whenever one key overlaps another or a replacement
// creates text matched by another key. The keys are therefore applied in a
// fixed order: longest key first, ties broken lexicographically.
func repalceString(input string, replace map[string]string) string {
	keys := make([]string, 0, len(replace))
	for k := range replace {
		keys = append(keys, k)
	}
	sort.Slice(keys, func(i, j int) bool {
		if len(keys[i]) != len(keys[j]) {
			return len(keys[i]) > len(keys[j])
		}
		return keys[i] < keys[j]
	})
	for _, k := range keys {
		input = strings.Replace(input, k, replace[k], -1)
	}
	return input
}
// cleanWebText strips noise from input in two steps.
//
//  1. Everything from the first occurrence of any keywordsback entry to the end
//     of the text is removed. The entries are joined with "|" and compiled as a
//     regular expression, so they are interpreted as patterns.
//  2. Every match of each pattern in keywords is removed. Patterns that fail to
//     compile are skipped.
//
// Empty keywordsback entries are ignored. Previously an empty list or an empty
// entry produced a pattern that matched the whole text and wiped it. If the
// combined keywordsback pattern does not compile, input is returned unchanged
// and step 2 is not run.
func cleanWebText(input string, keywords, keywordsback []string) string {
	cutWords := make([]string, 0, len(keywordsback))
	for _, w := range keywordsback {
		if w != "" {
			cutWords = append(cutWords, w)
		}
	}
	if len(cutWords) > 0 {
		re, err := regexp.Compile(fmt.Sprintf(`(?s)(%s).*`, strings.Join(cutWords, "|")))
		if err != nil {
			return input
		}
		input = re.ReplaceAllString(input, "")
	}

	for _, pattern := range keywords {
		re, err := regexp.Compile(pattern)
		if err != nil {
			continue
		}
		input = re.ReplaceAllString(input, "")
	}
	return input
}
// convertNumerals pulls the numerals out of input and returns them as a string
// of ASCII digits. Three numeral systems are collected independently:
//
//   - Arabic digits 0-9, copied as they are;
//   - Chinese numerals 零一二三四五六七八九 mapped to 0-9, and 十 mapped to '1';
//   - Roman numeral characters Ⅰ-Ⅸ mapped to 1-9, and Ⅹ, Ⅺ, Ⅻ each mapped to '1'.
//
// The longest of the three digit strings is returned; on equal length Arabic
// wins over Chinese and Chinese over Roman. An empty string is returned when
// input has no numerals.
//
// Every character is converted on its own, so positional values are not
// interpreted: "二十" becomes "21" and "十", "Ⅹ", "Ⅺ", "Ⅻ" all become "1".
func convertNumerals(input string) string {
	const (
		chineseDigits = "零一二三四五六七八九" // index/3 is the digit value (3 bytes per rune)
		romanDigits   = "ⅠⅡⅢⅣⅤⅥⅦⅧⅨ"  // index/3 + 1 is the digit value
	)
	var arabic, chinese, roman []byte
	for _, r := range input {
		switch {
		case r >= '0' && r <= '9':
			arabic = append(arabic, byte(r))
		case r == '十':
			chinese = append(chinese, '1')
		case r == 'Ⅹ' || r == 'Ⅺ' || r == 'Ⅻ':
			roman = append(roman, '1')
		default:
			if i := strings.IndexRune(chineseDigits, r); i >= 0 {
				chinese = append(chinese, byte('0'+i/3))
			} else if i := strings.IndexRune(romanDigits, r); i >= 0 {
				roman = append(roman, byte('1'+i/3))
			}
		}
	}

	// Strictly-longer comparison keeps the Arabic > Chinese > Roman priority on ties.
	best := ""
	for _, candidate := range [][]byte{arabic, chinese, roman} {
		if len(candidate) > len(best) {
			best = string(candidate)
		}
	}
	return best
}
// isChineseRune reports whether r belongs to the character sets this file
// treats as Chinese text:
//
//   - U+4E00..U+9FFF, the basic CJK ideographs;
//   - U+3000..U+303F, CJK symbols and punctuation (this range already contains
//     【】《》〈〉);
//   - U+FF00..U+FFEF, full-width forms, except full-width Latin letters and
//     full-width digits;
//   - curly quotes U+2018/2019/201C/201D, the em dash U+2014 and the ellipsis
//     U+2026.
func isChineseRune(r rune) bool {
	switch {
	case r >= 0x4E00 && r <= 0x9FFF:
		return true
	case r >= 0x3000 && r <= 0x303F:
		return true
	case r >= 0xFF00 && r <= 0xFFEF:
		isUpper := r >= 0xFF21 && r <= 0xFF3A
		isLower := r >= 0xFF41 && r <= 0xFF5A
		isDigit := r >= 0xFF10 && r <= 0xFF19
		return !isUpper && !isLower && !isDigit
	}
	switch r {
	case 0x2018, 0x2019, 0x201C, 0x201D, 0x2014, 0x2026:
		return true
	}
	return false
}
- // CellData holds the data of a single table cell.
- type CellData struct {
- Text string `json:"text"` // text content of the cell
- IsHeader bool `json:"isHeader"` // whether the cell is a header cell
- }
- // RowData holds the cells of one table row together with its header weight.
- // NOTE(review): HeaderRote is serialised under the JSON key "isHeader", the
- // same key CellData uses for a bool — confirm this is intended.
- type RowData struct {
- Row []CellData `json:"row"` // cells of the row
- HeaderRote int `json:"isHeader"` // header weight; setRowsHeaderRote assigns 100, 50, 1 or 0
- }
- // TableData holds the row/column data of one table plus its nesting information.
- type TableData struct {
- Rows []RowData `json:"rows"` // table rows
- NestedLevel int `json:"nestedLevel"` // nesting depth of the table; 0 means a top-level table
- ChildTables []int `json:"childTables"` // indexes of the tables nested inside this one
- ParentIndex int `json:"parentIndex"` // index of the parent table; -1 means no parent
- HasMerged bool `json:"hasMerged"` // whether the table contains merged cells
- }
- // AllTablesData holds the data of all tables extracted from a document.
- // The ChildTables and ParentIndex fields of TableData are indexes into Tables.
- type AllTablesData struct {
- Tables []TableData `json:"tables"`
- }
- // TableExtractDatas 解析HTML中的表格,返回Markdown格式和所有表格的行列数据
- func TableExtractDatas(htmlStr string) (string, *AllTablesData, error) {
- doc, err := html.Parse(strings.NewReader(htmlStr))
- if err != nil {
- return "", nil, err
- }
- allTablesData := &AllTablesData{}
- var markdownBuilder strings.Builder
- tableIndex := 0
- // 递归解析表格
- var parseNode func(*html.Node, int, int)
- parseNode = func(n *html.Node, level int, parentIdx int) {
- if n.Type == html.ElementNode && n.Data == "table" {
- // 记录当前表格的父索引
- currentParent := parentIdx
- currentIndex := tableIndex
- // 解析当前表格
- tableMarkdown, tableData := parseTable(n, level, currentIndex, currentParent)
- tableData.NestedLevel = level
- tableData.ParentIndex = currentParent
- // 添加到结果集
- allTablesData.Tables = append(allTablesData.Tables, tableData)
- markdownBuilder.WriteString(tableMarkdown)
- markdownBuilder.WriteString("\n\n")
- // 更新父表格的子表格列表
- if currentParent != -1 {
- parentTable := &allTablesData.Tables[currentParent]
- parentTable.ChildTables = append(parentTable.ChildTables, currentIndex)
- }
- // 增加表格索引
- tableIndex++
- // 递归处理子节点(使用新的父索引)
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- parseNode(c, level+1, currentIndex)
- }
- return
- }
- // 递归处理其他节点
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- parseNode(c, level, parentIdx)
- }
- }
- // 从文档根节点开始解析
- parseNode(doc, 0, -1)
- return markdownBuilder.String(), allTablesData, nil
- }
- // 获取单元格的rowspan和colspan属性
- func getSpanTable(cell *html.Node) (int, int) {
- rowspan, colspan := 1, 1
- for _, attr := range cell.Attr {
- switch attr.Key {
- case "rowspan":
- if val, err := strconv.Atoi(attr.Val); err == nil && val > 1 {
- rowspan = val
- }
- case "colspan":
- if val, err := strconv.Atoi(attr.Val); err == nil && val > 1 {
- colspan = val
- }
- }
- }
- return rowspan, colspan
- }
- // 解析单个表格
- func parseTable(tableNode *html.Node, level int, currentIndex int, parentIndex int) (string, TableData) {
- var tableData TableData
- tableData.ParentIndex = parentIndex
- tableData.HasMerged = false
- // 使用网格(grid)构建表格结构
- var grid []RowData
- rows := getTableRows(tableNode)
- // 处理表格行
- for rowIdx, row := range rows {
- // 扩展grid到当前行
- if rowIdx >= len(grid) {
- grid = append(grid, RowData{})
- }
- // 跳过已被合并单元格占用的位置
- col := 0
- for col < len(grid[rowIdx].Row) && grid[rowIdx].Row[col].Text != "" {
- col++
- }
- cells := getRowCells(row)
- for _, cell := range cells {
- // 获取单元格的跨行跨列属性
- rowspan, colspan := getSpanTable(cell)
- if rowspan > 1 || colspan > 1 {
- tableData.HasMerged = true
- }
- // 确保grid有足够的行
- for len(grid) < rowIdx+rowspan {
- grid = append(grid, RowData{})
- }
- // 确保所有相关行有足够的列
- targetCol := col + colspan
- for r := rowIdx; r < rowIdx+rowspan; r++ {
- if len(grid[r].Row) < targetCol {
- // 扩展行
- newRow := make([]CellData, targetCol)
- copy(newRow, grid[r].Row)
- grid[r].Row = newRow
- }
- }
- // 提取单元格文本并确定是否为表头
- text := extractCellText(cell)
- text = RemoveAllSpaces(text)
- isHeader := cell.Data == "th"
- //根据text内容和常见关键词判断,是否是表头
- if !isHeader && len([]rune(text)) < 20 {
- //核心词走正则匹配
- if theadPackWordsReg.MatchString(text) {
- isHeader = true
- }
- if !isHeader {
- isHeader = theadWordsListComReg.MatchString(text)
- }
- //非核心词,走EqualFold匹配
- if !isHeader {
- for _, word := range theadWordsList {
- if strings.EqualFold(word, text) {
- isHeader = true
- break
- }
- }
- }
- }
- // log.Println("cellData IsHeader", isHeader, text)
- cellData := CellData{Text: text, IsHeader: isHeader}
- // 将单元格数据填充到所有合并位置
- for r := 0; r < rowspan; r++ {
- for c := 0; c < colspan; c++ {
- grid[rowIdx+r].Row[col+c] = cellData
- }
- }
- // 移动到下一列位置
- col += colspan
- }
- }
- // 设置最终的行数据
- tableData.Rows = grid
- // 生成Markdown表格 (保持原有逻辑,只使用文本内容)
- markdown := generateMarkdownTable(grid, level)
- return markdown, tableData
- }
- // 生成Markdown格式的表格 (只使用单元格文本)
- func generateMarkdownTable(grid []RowData, level int) string {
- if len(grid) == 0 {
- return ""
- }
- var builder strings.Builder
- builder.WriteString(fmt.Sprintf("### Table at level %d\n\n", level))
- // 添加表头
- for i, row := range grid {
- builder.WriteString("| ")
- for j, cell := range row.Row {
- text := cell.Text
- if text == "" {
- builder.WriteString(" ")
- } else {
- builder.WriteString(text)
- }
- if j < len(row.Row)-1 {
- builder.WriteString(" | ")
- }
- }
- builder.WriteString(" |\n")
- // 添加表头分隔线
- if i == 0 {
- builder.WriteString("|")
- for j := 0; j < len(row.Row); j++ {
- builder.WriteString(" --- |")
- }
- builder.WriteString("\n")
- }
- }
- return builder.String()
- }
- // 获取表格中的所有行
- func getTableRows(tableNode *html.Node) []*html.Node {
- var rows []*html.Node
- var traverse func(*html.Node)
- traverse = func(n *html.Node) {
- if n.Type == html.ElementNode {
- switch n.Data {
- case "tr":
- rows = append(rows, n)
- case "thead", "tbody", "tfoot", "table":
- // 继续遍历
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- traverse(c)
- }
- }
- }
- }
- for c := tableNode.FirstChild; c != nil; c = c.NextSibling {
- traverse(c)
- }
- return rows
- }
- // 获取行中的所有单元格
- func getRowCells(rowNode *html.Node) []*html.Node {
- var cells []*html.Node
- for c := rowNode.FirstChild; c != nil; c = c.NextSibling {
- if c.Type == html.ElementNode && (c.Data == "td" || c.Data == "th") {
- cells = append(cells, c)
- }
- }
- return cells
- }
- // 提取单元格文本
- func extractCellText(cellNode *html.Node) string {
- var textBuilder strings.Builder
- var extract func(*html.Node)
- extract = func(n *html.Node) {
- if n.Type == html.TextNode {
- textBuilder.WriteString(strings.TrimSpace(n.Data))
- textBuilder.WriteString(" ")
- } else if n.Type == html.ElementNode {
- // 跳过嵌套表格
- if n.Data != "table" {
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- extract(c)
- }
- } else {
- textBuilder.WriteString("[Table]")
- }
- }
- }
- for c := cellNode.FirstChild; c != nil; c = c.NextSibling {
- extract(c)
- }
- // 清理文本
- result := strings.TrimSpace(textBuilder.String())
- if result == "" {
- return " "
- }
- return result
- }
// Patterns used by GetChineseText, compiled once instead of on every call.
var (
	// chineseTextTagReg matches an HTML tag: "<", anything but ">", then ">".
	chineseTextTagReg = regexp.MustCompile("<[^>]*>")
	// chineseTextHanReg matches a run of Chinese characters in U+4E00..U+9FA5.
	chineseTextHanReg = regexp.MustCompile("[\u4e00-\u9fa5]+")
)

// GetChineseText removes the HTML tags from htmlContent and returns only the
// Chinese characters (U+4E00..U+9FA5) of the remaining text, concatenated in
// their original order.
func GetChineseText(htmlContent string) string {
	cleanText := chineseTextTagReg.ReplaceAllString(htmlContent, "")
	return strings.Join(chineseTextHanReg.FindAllString(cleanText, -1), "")
}
// RemoveAllSpaces returns s without any character for which unicode.IsSpace is
// true (spaces, tabs, line breaks, the ideographic space U+3000, ...).
func RemoveAllSpaces(s string) string {
	dropSpace := func(r rune) rune {
		if unicode.IsSpace(r) {
			return -1 // a negative rune tells strings.Map to drop the character
		}
		return r
	}
	return strings.Map(dropSpace, s)
}
// ByLength implements sort.Interface for a string slice, ordering the strings
// by ascending length in bytes.
type ByLength []string

// Len returns the number of strings.
func (b ByLength) Len() int {
	return len(b)
}

// Swap exchanges the strings at positions i and j.
func (b ByLength) Swap(i, j int) {
	b[j], b[i] = b[i], b[j]
}

// Less reports whether the string at i is shorter (in bytes) than the one at j.
func (b ByLength) Less(i, j int) bool {
	return len(b[j]) > len(b[i])
}
- // 分组函数
- func groupStrings(strings []string) [][]string {
- if len(strings) == 0 {
- return nil
- }
- // 按长度排序
- sorted := make(ByLength, len(strings))
- copy(sorted, strings)
- sort.Sort(sorted)
- var groups [][]string
- currentGroup := []string{sorted[0]}
- currentMaxLen := len(sorted[0])
- // 遍历排序后的字符串,构建分组
- for i := 1; i < len(sorted); i++ {
- currentLen := len(sorted[i])
- if currentLen-currentMaxLen <= 2 {
- // 当前字符串可以加入当前组
- currentGroup = append(currentGroup, sorted[i])
- } else {
- // 创建新组
- groups = append(groups, currentGroup)
- currentGroup = []string{sorted[i]}
- currentMaxLen = currentLen
- }
- }
- // 添加最后一个组
- groups = append(groups, currentGroup)
- return groups
- }
- // isConsortiumKeysReg reports whether content matches consortiumKeysReg, the
- // pattern used to recognise a consortium (joint-venture) winning bid.
- func isConsortiumKeysReg(content string) bool {
- return consortiumKeysReg.MatchString(content)
- }
|