// util package main import ( "fmt" "log" "regexp" "sort" "strconv" "strings" "unicode" "golang.org/x/net/html" ) var ( //常见表头 theadWordsList = append(theadWordsList_Item, theadPackWordsList...) theadWordsListCom = []string{ "包段", "标段", "标包", "名称", "包号", "包段", "子包号", "子项", "中标人", "包件号", "包件代码", "包件编号", "分包编码", "分标编号", "分标编码", "合同段", "标的", "标项", "采购合同", "包件编号", "项目编号", "评价机构", "地点", "日期", "单位", "是否"} theadWordsListComReg = regexp.MustCompile("(" + strings.Join(theadWordsListCom, "|") + ")(?:[^0-9a-zA-Z]|$)") //分包必含表头 theadPackWordsList = []string{ "包段", "标段", "标段名称", "标包", "标包名称", "标包号", "包号", "包段", "子包号", "子标段名称", "子项", "子项名称", "包件号", "包件代码", "包件编号", "分包编码", "分包名称", "分标编号", "分标编码", "合同段", "包件名称", "标的", "标的名称", "标项", "标项名称", "采购合同", "标段(包)名称", "项目/包件编号", "项目编号", "服务名称", "项目名称"} theadPackWords = "(" + strings.Join(theadPackWordsList, "|") + ")(?:[^0-9a-zA-Z]|$)" theadPackWordsReg = regexp.MustCompile(theadPackWords) //候选人表头 theadWords_order = "(包件号|标的|标段|候选人|供应商)" theadWordsReg_order = regexp.MustCompile(theadWords_order) //删除干扰数据 delRowKeys = "未成交|未中标原因" delRowKeysReg = regexp.MustCompile(delRowKeys) //负向表头,用于剔除干扰表格 reverseTheadKeys = map[string][]string{ "bidlist": []string{"品牌", "规格型号", "数量", "单价", "报价得分", "总分", "专家"}, //"spotcheck": []string{"项目名称", "抽取家数"}, } //干扰内容清理 clearKeys = []string{"承包(一|二|三|四)级", "开标(\\d)(室|厅)", "\\d+\\.\\d+", "\\d+(.{1,10}).(pdf|doc|zip|rar)", "([一二三四五六七八九十0-9]+次)", "\\d+g.{0,20}包", "\\d+包.{0,20}(纸|箱)", "标段\\d+/标包\\d+", "\\d+年", "(\\d{1,2})-(\\d{1,2})(段|包|标)", "[一二三四五六七八九十]、[一二三四五六七八九十](段|包|标)", } clearKeysBack = []string{"上一篇", "下一篇", "历史业绩", "候选人业绩", "候选人企业业绩", "候选人类似业绩", "企业类似项目业绩", "投标业绩", "投标人业绩", "企业业绩", "工程业绩", "设计单位业绩", "施工单位业绩", "单位业绩情况", "投标文件中载明的业绩情况", "质量标准:"} //干扰内容替换 replaceMap = map[string]string{ "服务项目": "", "标项目": "标", "总承包": "", "三安小区": "", "I": "Ⅰ", "—": "", } //联合体投标判断 consortium = "(联合体牵头人|联合体成员[:: ].{5,30}(公司|院|大学|研究所))|(中标单位[:: ].{5,60}(联合体))" consortiumKeysReg = regexp.MustCompile(consortium) ) // 判断是否有分包数据 func TableIsPackage(tables *AllTablesData) (bool, int, int) { ispack := false tablesNumRows, tablesStrRows := 0, 0 allCellVal := map[int]map[int][]string{} for kt, tv := range tables.Tables { allCellVal[kt] = getPackAllCellVal_v1(tv) //log.Println("allCellVal sss", kt, len(allCellVal), allCellVal) if len(allCellVal[kt]) < 1 { allCellVal[kt] = getPackAllCellVal_v2(tv) //log.Println("allCellVal", allCellVal) } for _, cellsVal := range allCellVal { // log.Println("cellsVal", cellsVal) for _, cells := range cellsVal { numKey := map[string]string{} strKey := map[string]string{} L: for _, cellVal := range cells { for _, word := range theadWordsList { //过滤 theadWordsList 中的词 if strings.EqualFold(word, cellVal) { break L } } if val, err := strconv.Atoi(cellVal); err == nil { numKey[fmt.Sprint(val)] = cellVal } else { if len(cellVal) > 0 { strKey[cellVal] = cellVal } } } if tablesNumRows < len(numKey) { tablesNumRows = len(numKey) } if tablesStrRows < len(strKey) { tablesStrRows = len(strKey) } if blog { log.Println(kt, "numKey", numKey) log.Println(kt, "strKey", strKey) } } } if tablesStrRows > 1 && tablesNumRows > 1 || tablesStrRows < 1 && tablesNumRows > 1 || tablesStrRows > 1 && tablesNumRows < 1 { ispack = true } } return ispack, tablesNumRows, tablesStrRows } // row.HeaderRote > 50,提取分包特征值 func getPackAllCellVal_v1(rows TableData) map[int][]string { //如果是标的物、评分、抽查列表,放弃解析 bidTheadNum := 0 if len(rows.Rows) > 0 { for _, theadKeys := range reverseTheadKeys { for _, v := range theadKeys { for _, cell := range rows.Rows[0].Row { if strings.EqualFold(v, cell.Text) { bidTheadNum++ } } } } } if bidTheadNum > 1 { if blog { log.Println("标的物、评分、抽查列表,放弃解析") } return nil } cellIndex_keyVals := map[int][]string{} kcell := []int{} startAdd := false //开始取数标识 startRows := 0 //开始取数据行 L: for kr, row := range rows.Rows { cellOk := 0 //如果单元格数据有效值不足3项,跳过 for _, cell := range row.Row { if len(cell.Text) > 0 { cellOk++ } } if cellOk < 2 { continue } if startAdd { //开始提取数据,并非从第二行开始取数据 for i, k := range kcell { if row.HeaderRote < 100 { if startRows == 0 { startRows = i } cellIndex_keyVals[k] = append(cellIndex_keyVals[k], row.Row[k].Text) } //如果已有数据,再次碰到行表头行放弃数据 if startRows > 0 && row.HeaderRote > 0 { if blog { log.Println("中断", row.HeaderRote, row) } break L } } } if blog { log.Println("整行是表头v1 row", startAdd, cellOk, bidTheadNum, kr, row.HeaderRote, row.Row) } //首次获取行表头中 分包索引号 if !startAdd && row.HeaderRote > 50 { for i, cell := range row.Row { for _, word := range theadPackWordsList { if strings.EqualFold(word, cell.Text) { // log.Println("word", i, word, strings.EqualFold(word, cell.Text)) kcell = append(kcell, i) startAdd = true } } } } } return cellIndex_keyVals } // row.HeaderRote <= 50,提取分包特征值 func getPackAllCellVal_v2(rows TableData) map[int][]string { //如果是标的物、评分、抽查列表,放弃解析 bidTheadNum := 0 if len(rows.Rows) > 0 { for _, theadKeys := range reverseTheadKeys { for _, v := range theadKeys { for _, cell := range rows.Rows[0].Row { if strings.EqualFold(v, cell.Text) { bidTheadNum++ } } } } } if bidTheadNum > 1 { return nil } cellIndex_keyVals := map[int][]string{} L: for _, row := range rows.Rows { cellOk := 0 //如果单元格数据有效值不足3项,跳过 for _, cell := range row.Row { if len(cell.Text) > 0 { cellOk++ } } if cellOk < 3 { continue } if row.HeaderRote <= 50 { for i, cell := range row.Row { for _, word := range theadPackWordsList { if strings.EqualFold(word, cell.Text) { if len(row.Row) > i+1 { cellIndex_keyVals[0] = append(cellIndex_keyVals[0], row.Row[i+1].Text) break L //log.Println("ssss", word, row.Row[i+1].Text) } } } } } } return cellIndex_keyVals } func setRowsHeaderRote(tables *AllTablesData) *AllTablesData { //判断表头模式 for k, table := range tables.Tables { for i, row := range table.Rows { rowLen := len(row.Row) rowHeardNum := 0 for _, cell := range row.Row { if cell.IsHeader { rowHeardNum++ } } if rowLen == rowHeardNum || rowHeardNum > rowLen/2 { row.HeaderRote = 100 } else if rowLen%2 == 0 && rowHeardNum == rowLen/2 { row.HeaderRote = 50 } else if rowHeardNum > 0 { //有表头个数不定 row.HeaderRote = 1 // log.Println("row.HeaderRote", row.HeaderRote, row) } else { row.HeaderRote = 0 } table.Rows[i] = row // if blog { // log.Println("setRowsHeaderRote", row.HeaderRote, row.Row) // } } tables.Tables[k] = table } return tables } // 匹配标签及其内容的正则表达式 func removeTables(html string) string { re := regexp.MustCompile(`(?i)]*>[\s\S]*?
`) html = re.ReplaceAllString(html, "") // re = regexp.MustCompile(`<[^>]*>`) // html = re.ReplaceAllString(html, "") return html } // 表格检测,检查表格是否存在及是否存在合并单元格 func CheckTableMerged(htmlContent string) (hasTable bool, hasMerged bool, err error) { doc, err := html.Parse(strings.NewReader(htmlContent)) if err != nil { return false, false, err } // 递归查找所有表格 tables := findTables(doc) hasTable = len(tables) > 0 // 检查所有表格中的合并单元格 for _, table := range tables { if checkTableForMergedCells(table) { hasMerged = true break } } return hasTable, hasMerged, nil } func findTables(n *html.Node) []*html.Node { // 递归查找文档中的所有元素 var tables []*html.Node if n.Type == html.ElementNode && n.Data == "table" { tables = append(tables, n) } for c := n.FirstChild; c != nil; c = c.NextSibling { tables = append(tables, findTables(c)...) } return tables } func checkTableForMergedCells(table *html.Node) bool { //检查单个表格中是否存在合并单元格 // 使用栈进行非递归深度优先遍历 stack := []*html.Node{table} for len(stack) > 0 { node := stack[len(stack)-1] stack = stack[:len(stack)-1] // 遇到嵌套表格则跳过 if node != table && node.Type == html.ElementNode && node.Data == "table" { continue } // 检查当前节点是否为单元格 if node.Type == html.ElementNode && (node.Data == "td" || node.Data == "th") { if hasMergeAttribute(node) { return true } } // 将子节点逆序压入栈中 for child := node.LastChild; child != nil; child = child.PrevSibling { stack = append(stack, child) } } return false } func hasMergeAttribute(cell *html.Node) bool { // 检查单元格是否包含合并属性 for _, attr := range cell.Attr { if attr.Key == "rowspan" || attr.Key == "colspan" { // 尝试解析属性值为整数 if val, err := strconv.Atoi(attr.Val); err == nil { if val > 1 { return true } } // 如果值无法解析为整数,但属性存在且非"1",也视为合并 if attr.Val != "1" { return true } } } return false } // 替换文本数据 func repalceString(input string, replace map[string]string) string { for k, v := range replace { input = strings.Replace(input, k, v, -1) } return input } // cleanWebText 删除包含指定关键词及其后续的所有内容 func cleanWebText(input string, keywords, keywordsback []string) string { // 构建关键词正则表达式(使用OR连接) keywordPattern := strings.Join(keywordsback, "|") re, err := regexp.Compile(fmt.Sprintf(`(?s)(%s).*`, keywordPattern)) if err != nil { return input // 正则编译失败时返回原始文本 } input = re.ReplaceAllString(input, "") for _, v := range keywords { re, err = regexp.Compile(v) if err != nil { continue } input = re.ReplaceAllString(input, "") } return input } // 支持中文数字(零一二三四五六七八九十)、阿拉伯数字(0-9)、罗马数字(Ⅰ-Ⅻ) func convertNumerals(input string) string { // 字符映射表 chineseNumMap := map[rune]rune{ '零': '0', '一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6', '七': '7', '八': '8', '九': '9', '十': '1', // 仅处理个位,十位需特殊处理 } romanNumMap := map[rune]rune{ 'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5', 'Ⅵ': '6', 'Ⅶ': '7', 'Ⅷ': '8', 'Ⅸ': '9', 'Ⅹ': '1', // 仅处理个位 'Ⅺ': '1', 'Ⅻ': '1', // 罗马数字11和12仅处理十位 // 'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5', // 'F': '6', 'G': '7', 'H': '8', 'J': '9', 'K': '1', // 'a': '1', 'b': '2', 'c': '3', 'd': '4', 'e': '5', // 'f': '6', 'g': '7', 'h': '8', 'j': '9', 'k': '1', } var result strings.Builder var result1 strings.Builder for _, char := range input { // 检查阿拉伯数字 if char >= '0' && char <= '9' { result1.WriteRune(char) continue } } var result2 strings.Builder for _, char := range input { // 检查中文数字 if num, exists := chineseNumMap[char]; exists { result2.WriteRune(num) continue } } var result3 strings.Builder for _, char := range input { // 检查罗马数字 if num, exists := romanNumMap[char]; exists { result3.WriteRune(num) continue } } if result1.Len() > result.Len() { result = result1 } if result2.Len() > result.Len() { result = result2 } if result3.Len() > result.Len() { result = result3 } return result.String() } // Unicode判断工具函数 func isChineseRune(r rune) bool { // 基础汉字检测 if r >= 0x4E00 && r <= 0x9FFF { return true } // CJK符号和标点 if r >= 0x3000 && r <= 0x303F { return true } // 全角符号(过滤字母数字) if r >= 0xFF00 && r <= 0xFFEF { // 排除全角字母 if (r >= 0xFF21 && r <= 0xFF3A) || // 大写字母 (r >= 0xFF41 && r <= 0xFF5A) { // 小写字母 return false } // 排除全角数字 if r >= 0xFF10 && r <= 0xFF19 { return false } return true } // 特殊符号检测 switch r { case 0x2018, 0x2019, 0x201C, 0x201D, // 引号 0x2014, 0x2026, // 破折号、省略号 0x3010, 0x3011, // 【】 0x3008, 0x3009, 0x300A, 0x300B: // 《》〈〉 return true } return false } // CellData 存储单元格数据 type CellData struct { Text string `json:"text"` // 单元格文本内容 IsHeader bool `json:"isHeader"` // 是否为表头单元格 } // RowData 存储单元格数据 type RowData struct { Row []CellData `json:"row"` // 行数据 HeaderRote int `json:"isHeader"` // 表头权重 100 50 0 } // TableData 存储表格的行列数据 type TableData struct { Rows []RowData `json:"rows"` // 表格行数据 NestedLevel int `json:"nestedLevel"` // 表格的嵌套层级,0表示顶层表格 ChildTables []int `json:"childTables"` // 子表格的索引列表 ParentIndex int `json:"parentIndex"` // 父表格的索引,-1表示没有父表格 HasMerged bool `json:"hasMerged"` // 是否存在合并单元格 } // AllTablesData 存储所有表格的数据 type AllTablesData struct { Tables []TableData `json:"tables"` } // TableExtractDatas 解析HTML中的表格,返回Markdown格式和所有表格的行列数据 func TableExtractDatas(htmlStr string) (string, *AllTablesData, error) { doc, err := html.Parse(strings.NewReader(htmlStr)) if err != nil { return "", nil, err } allTablesData := &AllTablesData{} var markdownBuilder strings.Builder tableIndex := 0 // 递归解析表格 var parseNode func(*html.Node, int, int) parseNode = func(n *html.Node, level int, parentIdx int) { if n.Type == html.ElementNode && n.Data == "table" { // 记录当前表格的父索引 currentParent := parentIdx currentIndex := tableIndex // 解析当前表格 tableMarkdown, tableData := parseTable(n, level, currentIndex, currentParent) tableData.NestedLevel = level tableData.ParentIndex = currentParent // 添加到结果集 allTablesData.Tables = append(allTablesData.Tables, tableData) markdownBuilder.WriteString(tableMarkdown) markdownBuilder.WriteString("\n\n") // 更新父表格的子表格列表 if currentParent != -1 { parentTable := &allTablesData.Tables[currentParent] parentTable.ChildTables = append(parentTable.ChildTables, currentIndex) } // 增加表格索引 tableIndex++ // 递归处理子节点(使用新的父索引) for c := n.FirstChild; c != nil; c = c.NextSibling { parseNode(c, level+1, currentIndex) } return } // 递归处理其他节点 for c := n.FirstChild; c != nil; c = c.NextSibling { parseNode(c, level, parentIdx) } } // 从文档根节点开始解析 parseNode(doc, 0, -1) return markdownBuilder.String(), allTablesData, nil } // 获取单元格的rowspan和colspan属性 func getSpanTable(cell *html.Node) (int, int) { rowspan, colspan := 1, 1 for _, attr := range cell.Attr { switch attr.Key { case "rowspan": if val, err := strconv.Atoi(attr.Val); err == nil && val > 1 { rowspan = val } case "colspan": if val, err := strconv.Atoi(attr.Val); err == nil && val > 1 { colspan = val } } } return rowspan, colspan } // 解析单个表格 func parseTable(tableNode *html.Node, level int, currentIndex int, parentIndex int) (string, TableData) { var tableData TableData tableData.ParentIndex = parentIndex tableData.HasMerged = false // 使用网格(grid)构建表格结构 var grid []RowData rows := getTableRows(tableNode) // 处理表格行 for rowIdx, row := range rows { // 扩展grid到当前行 if rowIdx >= len(grid) { grid = append(grid, RowData{}) } // 跳过已被合并单元格占用的位置 col := 0 for col < len(grid[rowIdx].Row) && grid[rowIdx].Row[col].Text != "" { col++ } cells := getRowCells(row) for _, cell := range cells { // 获取单元格的跨行跨列属性 rowspan, colspan := getSpanTable(cell) if rowspan > 1 || colspan > 1 { tableData.HasMerged = true } // 确保grid有足够的行 for len(grid) < rowIdx+rowspan { grid = append(grid, RowData{}) } // 确保所有相关行有足够的列 targetCol := col + colspan for r := rowIdx; r < rowIdx+rowspan; r++ { if len(grid[r].Row) < targetCol { // 扩展行 newRow := make([]CellData, targetCol) copy(newRow, grid[r].Row) grid[r].Row = newRow } } // 提取单元格文本并确定是否为表头 text := extractCellText(cell) text = RemoveAllSpaces(text) isHeader := cell.Data == "th" //根据text内容和常见关键词判断,是否是表头 if !isHeader && len([]rune(text)) < 20 { //核心词走正则匹配 if theadPackWordsReg.MatchString(text) { isHeader = true } if !isHeader { isHeader = theadWordsListComReg.MatchString(text) } //非核心词,走EqualFold匹配 if !isHeader { for _, word := range theadWordsList { if strings.EqualFold(word, text) { isHeader = true break } } } } // log.Println("cellData IsHeader", isHeader, text) cellData := CellData{Text: text, IsHeader: isHeader} // 将单元格数据填充到所有合并位置 for r := 0; r < rowspan; r++ { for c := 0; c < colspan; c++ { grid[rowIdx+r].Row[col+c] = cellData } } // 移动到下一列位置 col += colspan } } // 设置最终的行数据 tableData.Rows = grid // 生成Markdown表格 (保持原有逻辑,只使用文本内容) markdown := generateMarkdownTable(grid, level) return markdown, tableData } // 生成Markdown格式的表格 (只使用单元格文本) func generateMarkdownTable(grid []RowData, level int) string { if len(grid) == 0 { return "" } var builder strings.Builder builder.WriteString(fmt.Sprintf("### Table at level %d\n\n", level)) // 添加表头 for i, row := range grid { builder.WriteString("| ") for j, cell := range row.Row { text := cell.Text if text == "" { builder.WriteString(" ") } else { builder.WriteString(text) } if j < len(row.Row)-1 { builder.WriteString(" | ") } } builder.WriteString(" |\n") // 添加表头分隔线 if i == 0 { builder.WriteString("|") for j := 0; j < len(row.Row); j++ { builder.WriteString(" --- |") } builder.WriteString("\n") } } return builder.String() } // 获取表格中的所有行 func getTableRows(tableNode *html.Node) []*html.Node { var rows []*html.Node var traverse func(*html.Node) traverse = func(n *html.Node) { if n.Type == html.ElementNode { switch n.Data { case "tr": rows = append(rows, n) case "thead", "tbody", "tfoot", "table": // 继续遍历 for c := n.FirstChild; c != nil; c = c.NextSibling { traverse(c) } } } } for c := tableNode.FirstChild; c != nil; c = c.NextSibling { traverse(c) } return rows } // 获取行中的所有单元格 func getRowCells(rowNode *html.Node) []*html.Node { var cells []*html.Node for c := rowNode.FirstChild; c != nil; c = c.NextSibling { if c.Type == html.ElementNode && (c.Data == "td" || c.Data == "th") { cells = append(cells, c) } } return cells } // 提取单元格文本 func extractCellText(cellNode *html.Node) string { var textBuilder strings.Builder var extract func(*html.Node) extract = func(n *html.Node) { if n.Type == html.TextNode { textBuilder.WriteString(strings.TrimSpace(n.Data)) textBuilder.WriteString(" ") } else if n.Type == html.ElementNode { // 跳过嵌套表格 if n.Data != "table" { for c := n.FirstChild; c != nil; c = c.NextSibling { extract(c) } } else { textBuilder.WriteString("[Table]") } } } for c := cellNode.FirstChild; c != nil; c = c.NextSibling { extract(c) } // 清理文本 result := strings.TrimSpace(textBuilder.String()) if result == "" { return " " } return result } // 提取文本汉字 func GetChineseText(htmlContent string) string { // 移除HTML标签 reHTML := regexp.MustCompile("<[^>]*>") cleanText := reHTML.ReplaceAllString(htmlContent, "") // 提取汉字(Unicode范围:\u4e00-\u9fa5) reChinese := regexp.MustCompile("[\u4e00-\u9fa5]") chineseChars := reChinese.FindAllString(cleanText, -1) return strings.Join(chineseChars, "") } // RemoveAllSpaces 移除字符串中的所有空白字符 func RemoveAllSpaces(s string) string { // 使用 strings.Builder 高效构建新字符串 var b strings.Builder b.Grow(len(s)) // 预分配空间,提高性能 // 遍历字符串的每个字符 for _, r := range s { // 如果不是空白字符,则添加到结果中 if !unicode.IsSpace(r) { b.WriteRune(r) } } return b.String() } // 按字符串长度排序的类型 type ByLength []string func (s ByLength) Len() int { return len(s) } func (s ByLength) Swap(i, j int) { s[i], s[j] = s[j], s[i] } func (s ByLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) } // 分组函数 func groupStrings(strings []string) [][]string { if len(strings) == 0 { return nil } // 按长度排序 sorted := make(ByLength, len(strings)) copy(sorted, strings) sort.Sort(sorted) var groups [][]string currentGroup := []string{sorted[0]} currentMaxLen := len(sorted[0]) // 遍历排序后的字符串,构建分组 for i := 1; i < len(sorted); i++ { currentLen := len(sorted[i]) if currentLen-currentMaxLen <= 2 { // 当前字符串可以加入当前组 currentGroup = append(currentGroup, sorted[i]) } else { // 创建新组 groups = append(groups, currentGroup) currentGroup = []string{sorted[i]} currentMaxLen = currentLen } } // 添加最后一个组 groups = append(groups, currentGroup) return groups } // 判断是否是联合体中标 func isConsortiumKeysReg(content string) bool { return consortiumKeysReg.MatchString(content) }