|
@@ -5,207 +5,355 @@ import (
|
|
|
"fmt"
|
|
|
"log"
|
|
|
"regexp"
|
|
|
+ "sort"
|
|
|
+ "strconv"
|
|
|
"strings"
|
|
|
- "time"
|
|
|
+ "unicode"
|
|
|
|
|
|
- "github.com/PuerkitoBio/goquery"
|
|
|
"golang.org/x/net/html"
|
|
|
)
|
|
|
|
|
|
var (
	// Common header words. theadWordsList is the item-level header list
	// (theadWordsList_Item, declared elsewhere) followed by the package header
	// list below; it is compared against cell text with strings.EqualFold.
	theadWordsList = append(theadWordsList_Item, theadPackWordsList...)
	// Generic header fragments; matched through theadWordsListComReg.
	// NOTE(review): "包段" and "包件编号" appear twice; harmless inside a regex
	// alternation, but probably unintended.
	theadWordsListCom = []string{
		"包段", "标段", "标包", "名称", "包号", "包段", "子包号", "子项", "中标人",
		"包件号", "包件代码", "包件编号", "分包编码", "分标编号", "分标编码", "合同段",
		"标的", "标项", "采购合同", "包件编号", "项目编号", "评价机构",
		"地点", "日期", "单位", "是否"}
	// Matches any generic header fragment that is followed by a character other
	// than an ASCII letter/digit, or by the end of the text.
	theadWordsListComReg = regexp.MustCompile("(" + strings.Join(theadWordsListCom, "|") + ")(?:[^0-9a-zA-Z]|$)")

	// Header words that a package (lot / bid section) table must contain.
	// The list is used both literally (strings.EqualFold) and joined into a
	// regular expression (theadPackWords).
	// NOTE(review): "包段" appears twice. If the parentheses in the
	// "标段(包)名称" entry are ASCII, they act as a regex group in
	// theadPackWords rather than as literal characters — confirm.
	theadPackWordsList = []string{
		"包段", "标段", "标段名称", "标包", "标包名称", "标包号", "包号", "包段", "子包号", "子标段名称", "子项", "子项名称",
		"包件号", "包件代码", "包件编号", "分包编码", "分包名称", "分标编号", "分标编码", "合同段", "包件名称",
		"标的", "标的名称", "标项", "标项名称", "采购合同", "标段(包)名称",
		"项目/包件编号", "项目编号", "服务名称", "项目名称"}
	// Same trailing-context rule as theadWordsListComReg, built from the
	// package header list.
	theadPackWords    = "(" + strings.Join(theadPackWordsList, "|") + ")(?:[^0-9a-zA-Z]|$)"
	theadPackWordsReg = regexp.MustCompile(theadPackWords)

	// Candidate-table header words.
	theadWords_order    = "(包件号|标的|标段|候选人|供应商)"
	theadWordsReg_order = regexp.MustCompile(theadWords_order)
	// Keywords marking interfering rows that should be removed.
	delRowKeys    = "未成交|未中标原因"
	delRowKeysReg = regexp.MustCompile(delRowKeys)
	// Negative header words, used to discard interfering tables. A table whose
	// first row contains more than one of these words is not parsed.
	reverseTheadKeys = map[string][]string{
		"bidlist": []string{"品牌", "规格型号", "数量", "单价", "报价得分", "总分", "专家"},
		//"spotcheck": []string{"项目名称", "抽取家数"},
	}
	// Regular-expression patterns for interfering content that is stripped
	// from the text.
	clearKeys = []string{"承包(一|二|三|四)级", "开标(\\d)(室|厅)", "\\d+\\.\\d+", "\\d+(.{1,10}).(pdf|doc|zip|rar)",
		"([一二三四五六七八九十0-9]+次)", "\\d+g.{0,20}包", "\\d+包.{0,20}(纸|箱)", "标段\\d+/标包\\d+", "\\d+年",
		"(\\d{1,2})-(\\d{1,2})(段|包|标)", "[一二三四五六七八九十]、[一二三四五六七八九十](段|包|标)",
	}
	// Marker phrases for trailing content to clean (previous/next article
	// links, past-performance sections, ...); presumably everything from the
	// marker onwards is dropped — verify against cleanWebText.
	clearKeysBack = []string{"上一篇", "下一篇", "历史业绩",
		"候选人业绩", "候选人企业业绩", "候选人类似业绩", "企业类似项目业绩",
		"投标业绩", "投标人业绩", "企业业绩", "工程业绩", "设计单位业绩", "施工单位业绩",
		"单位业绩情况", "投标文件中载明的业绩情况", "质量标准:"}
	// Literal replacements for interfering content (key -> replacement).
	replaceMap = map[string]string{
		"服务项目": "",
		"标项目":  "标",
		"总承包":  "",
		"三安小区": "",
		"I":    "Ⅰ",
		"—":    "",
	}
	// Consortium (joint bid) detection.
	consortium        = "(联合体牵头人|联合体成员[:: ].{5,30}(公司|院|大学|研究所))|(中标单位[:: ].{5,60}(联合体))"
	consortiumKeysReg = regexp.MustCompile(consortium)
)
|
|
|
|
|
|
-// 判断是否有嵌套表格
|
|
|
-func tableIsPackage(htmlContent string) (bool, int) {
|
|
|
- //判断是否有多层表格嵌套
|
|
|
- if hasNestedTables(htmlContent) {
|
|
|
- //log.Println("表格嵌套")
|
|
|
- return false, 0
|
|
|
- }
|
|
|
+// 判断是否有分包数据
|
|
|
+func TableIsPackage(tables *AllTablesData) (bool, int, int) {
|
|
|
ispack := false
|
|
|
- tablesMixRows := 0
|
|
|
- tablesData := getPackAgeByTable(htmlContent)
|
|
|
- for _, dataRows := range tablesData {
|
|
|
- // for k, v := range dataRows {
|
|
|
- // log.Println(i, k, v)
|
|
|
- // }
|
|
|
- if len(dataRows) > 2 {
|
|
|
- ispack = true
|
|
|
+ tablesNumRows, tablesStrRows := 0, 0
|
|
|
+ allCellVal := map[int]map[int][]string{}
|
|
|
+ for kt, tv := range tables.Tables {
|
|
|
+ allCellVal[kt] = getPackAllCellVal_v1(tv)
|
|
|
+ //log.Println("allCellVal sss", kt, len(allCellVal), allCellVal)
|
|
|
+ if len(allCellVal[kt]) < 1 {
|
|
|
+ allCellVal[kt] = getPackAllCellVal_v2(tv)
|
|
|
+ //log.Println("allCellVal", allCellVal)
|
|
|
+ }
|
|
|
+ for _, cellsVal := range allCellVal {
|
|
|
+ // log.Println("cellsVal", cellsVal)
|
|
|
+ for _, cells := range cellsVal {
|
|
|
+ numKey := map[string]string{}
|
|
|
+ strKey := map[string]string{}
|
|
|
+ L:
|
|
|
+ for _, cellVal := range cells {
|
|
|
+ for _, word := range theadWordsList { //过滤 theadWordsList 中的词
|
|
|
+ if strings.EqualFold(word, cellVal) {
|
|
|
+ break L
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if val, err := strconv.Atoi(cellVal); err == nil {
|
|
|
+ numKey[fmt.Sprint(val)] = cellVal
|
|
|
+ } else {
|
|
|
+ if len(cellVal) > 0 {
|
|
|
+ strKey[cellVal] = cellVal
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if tablesNumRows < len(numKey) {
|
|
|
+ tablesNumRows = len(numKey)
|
|
|
+ }
|
|
|
+ if tablesStrRows < len(strKey) {
|
|
|
+ tablesStrRows = len(strKey)
|
|
|
+ }
|
|
|
+ if blog {
|
|
|
+ log.Println(kt, "numKey", numKey)
|
|
|
+ log.Println(kt, "strKey", strKey)
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
- if tablesMixRows < len(dataRows) {
|
|
|
- tablesMixRows = len(dataRows)
|
|
|
+ if tablesStrRows > 1 && tablesNumRows > 1 ||
|
|
|
+ tablesStrRows < 1 && tablesNumRows > 1 ||
|
|
|
+ tablesStrRows > 1 && tablesNumRows < 1 {
|
|
|
+ ispack = true
|
|
|
}
|
|
|
}
|
|
|
- //log.Println(ispack, tablesMixRows)
|
|
|
- return ispack, tablesMixRows
|
|
|
+ return ispack, tablesNumRows, tablesStrRows
|
|
|
}
|
|
|
|
|
|
-// 提取疑似表格分包数据
|
|
|
-func getPackAgeByTable(htmlContent string) map[string][]map[string]string {
|
|
|
- // 解析HTML文档
|
|
|
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
|
|
|
- if err != nil {
|
|
|
- log.Println(err)
|
|
|
- }
|
|
|
- // 遍历所有表格
|
|
|
- tableDataRows := map[string][]map[string]string{}
|
|
|
- doc.Find("table").Each(func(i int, table *goquery.Selection) {
|
|
|
- var headers []string
|
|
|
- var rows []map[string]string
|
|
|
- // 遍历表格行
|
|
|
- table.Find("tr").Each(func(rowIdx int, row *goquery.Selection) {
|
|
|
- // 自动识别表头行(根据单元格内容特征)
|
|
|
- if isHeaderRow(row) && len(headers) < 1 {
|
|
|
- isDelHeader := false
|
|
|
- tmphead := []string{}
|
|
|
-
|
|
|
- bidTheadKeysIndexNum := map[string]int{} //记录满足剔除的表头的阈值
|
|
|
- row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) {
|
|
|
- text := strings.TrimSpace(cell.Text())
|
|
|
- tmphead = append(tmphead, text)
|
|
|
- if delRowKeysReg.MatchString(text) {
|
|
|
- isDelHeader = true
|
|
|
+// row.HeaderRote > 50,提取分包特征值
|
|
|
+func getPackAllCellVal_v1(rows TableData) map[int][]string {
|
|
|
+ //如果是标的物、评分、抽查列表,放弃解析
|
|
|
+ bidTheadNum := 0
|
|
|
+ if len(rows.Rows) > 0 {
|
|
|
+ for _, theadKeys := range reverseTheadKeys {
|
|
|
+ for _, v := range theadKeys {
|
|
|
+ for _, cell := range rows.Rows[0].Row {
|
|
|
+ if strings.EqualFold(v, cell.Text) {
|
|
|
+ bidTheadNum++
|
|
|
}
|
|
|
- //如果是标的物、评分、抽查列表,剔除
|
|
|
- for k, theadKeys := range reverseTheadKeys {
|
|
|
- for _, v := range theadKeys {
|
|
|
- if strings.Contains(text, v) {
|
|
|
- bidTheadKeysIndexNum[k]++
|
|
|
- }
|
|
|
- }
|
|
|
- if bidTheadKeysIndexNum[k] > 1 { //满足一个以上的表头删除
|
|
|
- isDelHeader = true
|
|
|
- }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if bidTheadNum > 1 {
|
|
|
+ if blog {
|
|
|
+ log.Println("标的物、评分、抽查列表,放弃解析")
|
|
|
+ }
|
|
|
+ return nil
|
|
|
+ }
|
|
|
+ cellIndex_keyVals := map[int][]string{}
|
|
|
+ kcell := []int{}
|
|
|
+ startAdd := false //开始取数标识
|
|
|
+ startRows := 0 //开始取数据行
|
|
|
+L:
|
|
|
+ for kr, row := range rows.Rows {
|
|
|
+ cellOk := 0 //如果单元格数据有效值不足3项,跳过
|
|
|
+ for _, cell := range row.Row {
|
|
|
+ if len(cell.Text) > 0 {
|
|
|
+ cellOk++
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if cellOk < 2 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ if startAdd { //开始提取数据,并非从第二行开始取数据
|
|
|
+ for i, k := range kcell {
|
|
|
+ if row.HeaderRote < 100 {
|
|
|
+ if startRows == 0 {
|
|
|
+ startRows = i
|
|
|
+ }
|
|
|
+ cellIndex_keyVals[k] = append(cellIndex_keyVals[k], row.Row[k].Text)
|
|
|
+ }
|
|
|
+ //如果已有数据,再次碰到行表头行放弃数据
|
|
|
+ if startRows > 0 && row.HeaderRote > 0 {
|
|
|
+ if blog {
|
|
|
+ log.Println("中断", row.HeaderRote, row)
|
|
|
+ }
|
|
|
+ break L
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if blog {
|
|
|
+ log.Println("整行是表头v1 row", startAdd, cellOk, bidTheadNum, kr, row.HeaderRote, row.Row)
|
|
|
+ }
|
|
|
+ //首次获取行表头中 分包索引号
|
|
|
+ if !startAdd && row.HeaderRote > 50 {
|
|
|
+ for i, cell := range row.Row {
|
|
|
+ for _, word := range theadPackWordsList {
|
|
|
+ if strings.EqualFold(word, cell.Text) {
|
|
|
+ // log.Println("word", i, word, strings.EqualFold(word, cell.Text))
|
|
|
+ kcell = append(kcell, i)
|
|
|
+ startAdd = true
|
|
|
}
|
|
|
-
|
|
|
- })
|
|
|
- //log.Println("tmphead th", tmphead)
|
|
|
- if len(tmphead) < 1 {
|
|
|
- row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
|
|
|
- text := strings.TrimSpace(cell.Text())
|
|
|
- tmphead = append(tmphead, text)
|
|
|
- if delRowKeysReg.MatchString(text) {
|
|
|
- isDelHeader = true
|
|
|
- }
|
|
|
- //如果是标的物、评分、抽查列表,剔除
|
|
|
- for k, theadKeys := range reverseTheadKeys {
|
|
|
- for _, v := range theadKeys {
|
|
|
- if strings.Contains(text, v) {
|
|
|
- bidTheadKeysIndexNum[k]++
|
|
|
- }
|
|
|
- }
|
|
|
- if bidTheadKeysIndexNum[k] > 1 {
|
|
|
- isDelHeader = true
|
|
|
- }
|
|
|
- }
|
|
|
- })
|
|
|
}
|
|
|
- //log.Println("tmphead td", tmphead)
|
|
|
- if !isDelHeader {
|
|
|
- headers = append(headers, tmphead...)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return cellIndex_keyVals
|
|
|
+}
|
|
|
+
|
|
|
+// row.HeaderRote <= 50,提取分包特征值
|
|
|
+func getPackAllCellVal_v2(rows TableData) map[int][]string {
|
|
|
+ //如果是标的物、评分、抽查列表,放弃解析
|
|
|
+ bidTheadNum := 0
|
|
|
+ if len(rows.Rows) > 0 {
|
|
|
+ for _, theadKeys := range reverseTheadKeys {
|
|
|
+ for _, v := range theadKeys {
|
|
|
+ for _, cell := range rows.Rows[0].Row {
|
|
|
+ if strings.EqualFold(v, cell.Text) {
|
|
|
+ bidTheadNum++
|
|
|
+ }
|
|
|
}
|
|
|
- //log.Println("headers", headers)
|
|
|
}
|
|
|
- // 处理数据行
|
|
|
- if len(headers) > 0 {
|
|
|
- isDelRows := false //是否需要屏蔽词
|
|
|
- rowData := make(map[string]string)
|
|
|
- row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
|
|
|
- if cellIdx < len(headers) {
|
|
|
- header := headers[cellIdx]
|
|
|
- text := strings.TrimSpace(cell.Text())
|
|
|
- rowData[header] = text
|
|
|
- if delRowKeysReg.MatchString(text) {
|
|
|
- isDelRows = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if bidTheadNum > 1 {
|
|
|
+ return nil
|
|
|
+ }
|
|
|
+ cellIndex_keyVals := map[int][]string{}
|
|
|
+L:
|
|
|
+ for _, row := range rows.Rows {
|
|
|
+ cellOk := 0 //如果单元格数据有效值不足3项,跳过
|
|
|
+ for _, cell := range row.Row {
|
|
|
+ if len(cell.Text) > 0 {
|
|
|
+ cellOk++
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if cellOk < 3 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ if row.HeaderRote <= 50 {
|
|
|
+ for i, cell := range row.Row {
|
|
|
+ for _, word := range theadPackWordsList {
|
|
|
+ if strings.EqualFold(word, cell.Text) {
|
|
|
+ if len(row.Row) > i+1 {
|
|
|
+ cellIndex_keyVals[0] = append(cellIndex_keyVals[0], row.Row[i+1].Text)
|
|
|
+ break L
|
|
|
+ //log.Println("ssss", word, row.Row[i+1].Text)
|
|
|
}
|
|
|
}
|
|
|
- })
|
|
|
- //log.Println(isDelRows, rowData)
|
|
|
- if !isDelRows {
|
|
|
- rows = append(rows, rowData)
|
|
|
}
|
|
|
}
|
|
|
- })
|
|
|
- tableDataRows[fmt.Sprint(i)] = rows
|
|
|
- })
|
|
|
- return tableDataRows
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return cellIndex_keyVals
|
|
|
}
|
|
|
|
|
|
-// 自定义表头判断逻辑(根据单元格内容特征)
|
|
|
-func isHeaderRow(row *goquery.Selection) bool {
|
|
|
- // 判断条件示例 包含 theadWords 特定关键词
|
|
|
- hasAttributeKeyword := false
|
|
|
- matchNum := 0
|
|
|
- row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
|
|
|
- text := strings.TrimSpace(cell.Text())
|
|
|
- if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 {
|
|
|
- matchNum++
|
|
|
- hasAttributeKeyword = true
|
|
|
- } else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 {
|
|
|
- matchNum++
|
|
|
- hasAttributeKeyword = true
|
|
|
- }
|
|
|
- //log.Println(text, matchNum, hasAttributeKeyword)
|
|
|
- })
|
|
|
- row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) {
|
|
|
- text := strings.TrimSpace(cell.Text())
|
|
|
- if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 {
|
|
|
- matchNum++
|
|
|
- hasAttributeKeyword = true
|
|
|
- } else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 {
|
|
|
- matchNum++
|
|
|
- hasAttributeKeyword = true
|
|
|
- }
|
|
|
- //log.Println(text, matchNum, hasAttributeKeyword)
|
|
|
- })
|
|
|
- //log.Println("isHeaderRow", matchNum, hasAttributeKeyword, matchNum > 1 && hasAttributeKeyword)
|
|
|
- return matchNum > 1 && hasAttributeKeyword
|
|
|
+func setRowsHeaderRote(tables *AllTablesData) *AllTablesData {
|
|
|
+ //判断表头模式
|
|
|
+ for k, table := range tables.Tables {
|
|
|
+ for i, row := range table.Rows {
|
|
|
+ rowLen := len(row.Row)
|
|
|
+ rowHeardNum := 0
|
|
|
+ for _, cell := range row.Row {
|
|
|
+ if cell.IsHeader {
|
|
|
+ rowHeardNum++
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if rowLen == rowHeardNum || rowHeardNum > rowLen/2 {
|
|
|
+ row.HeaderRote = 100
|
|
|
+ } else if rowLen%2 == 0 && rowHeardNum == rowLen/2 {
|
|
|
+ row.HeaderRote = 50
|
|
|
+ } else if rowHeardNum > 0 { //有表头个数不定
|
|
|
+ row.HeaderRote = 1
|
|
|
+ // log.Println("row.HeaderRote", row.HeaderRote, row)
|
|
|
+ } else {
|
|
|
+ row.HeaderRote = 0
|
|
|
+ }
|
|
|
+ table.Rows[i] = row
|
|
|
+ // if blog {
|
|
|
+ // log.Println("setRowsHeaderRote", row.HeaderRote, row.Row)
|
|
|
+ // }
|
|
|
+ }
|
|
|
+ tables.Tables[k] = table
|
|
|
+ }
|
|
|
+ return tables
|
|
|
}
|
|
|
|
|
|
// tableElementReg matches a <table> element together with its content
// (case-insensitive, shortest match up to the first closing tag).
var tableElementReg = regexp.MustCompile(`(?i)<table[^>]*>[\s\S]*?</table>`)

// removeTables returns html with every <table>...</table> element removed.
//
// The match is non-greedy and not nesting-aware: for nested tables the match
// ends at the first </table>, so the tail of the outer table is left in place.
func removeTables(html string) string {
	// The pattern is compiled once at package level instead of on every call.
	html = tableElementReg.ReplaceAllString(html, "")

	// re = regexp.MustCompile(`<[^>]*>`)
	// html = re.ReplaceAllString(html, "")
	return html
}
|
|
|
+
|
|
|
+// 表格检测,检查表格是否存在及是否存在合并单元格
|
|
|
+func CheckTableMerged(htmlContent string) (hasTable bool, hasMerged bool, err error) {
|
|
|
+ doc, err := html.Parse(strings.NewReader(htmlContent))
|
|
|
+ if err != nil {
|
|
|
+ return false, false, err
|
|
|
+ }
|
|
|
+
|
|
|
+ // 递归查找所有表格
|
|
|
+ tables := findTables(doc)
|
|
|
+ hasTable = len(tables) > 0
|
|
|
+
|
|
|
+ // 检查所有表格中的合并单元格
|
|
|
+ for _, table := range tables {
|
|
|
+ if checkTableForMergedCells(table) {
|
|
|
+ hasMerged = true
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return hasTable, hasMerged, nil
|
|
|
+}
|
|
|
+func findTables(n *html.Node) []*html.Node { // 递归查找文档中的所有<table>元素
|
|
|
+ var tables []*html.Node
|
|
|
+ if n.Type == html.ElementNode && n.Data == "table" {
|
|
|
+ tables = append(tables, n)
|
|
|
+ }
|
|
|
+
|
|
|
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
|
+ tables = append(tables, findTables(c)...)
|
|
|
+ }
|
|
|
+ return tables
|
|
|
+}
|
|
|
+func checkTableForMergedCells(table *html.Node) bool { //检查单个表格中是否存在合并单元格
|
|
|
+ // 使用栈进行非递归深度优先遍历
|
|
|
+ stack := []*html.Node{table}
|
|
|
+ for len(stack) > 0 {
|
|
|
+ node := stack[len(stack)-1]
|
|
|
+ stack = stack[:len(stack)-1]
|
|
|
+
|
|
|
+ // 遇到嵌套表格则跳过
|
|
|
+ if node != table && node.Type == html.ElementNode && node.Data == "table" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ // 检查当前节点是否为单元格
|
|
|
+ if node.Type == html.ElementNode && (node.Data == "td" || node.Data == "th") {
|
|
|
+ if hasMergeAttribute(node) {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 将子节点逆序压入栈中
|
|
|
+ for child := node.LastChild; child != nil; child = child.PrevSibling {
|
|
|
+ stack = append(stack, child)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return false
|
|
|
+}
|
|
|
+func hasMergeAttribute(cell *html.Node) bool { // 检查单元格是否包含合并属性
|
|
|
+ for _, attr := range cell.Attr {
|
|
|
+ if attr.Key == "rowspan" || attr.Key == "colspan" {
|
|
|
+ // 尝试解析属性值为整数
|
|
|
+ if val, err := strconv.Atoi(attr.Val); err == nil {
|
|
|
+ if val > 1 {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // 如果值无法解析为整数,但属性存在且非"1",也视为合并
|
|
|
+ if attr.Val != "1" {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return false
|
|
|
}
|
|
|
|
|
|
// 替换文本数据
|
|
@@ -226,12 +374,14 @@ func cleanWebText(input string, keywords, keywordsback []string) string {
|
|
|
}
|
|
|
input = re.ReplaceAllString(input, "")
|
|
|
|
|
|
- keyword := strings.Join(keywords, "|")
|
|
|
- re, err = regexp.Compile(keyword)
|
|
|
- if err != nil {
|
|
|
- return input
|
|
|
+ for _, v := range keywords {
|
|
|
+ re, err = regexp.Compile(v)
|
|
|
+ if err != nil {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ input = re.ReplaceAllString(input, "")
|
|
|
}
|
|
|
- return re.ReplaceAllString(input, "")
|
|
|
+ return input
|
|
|
}
|
|
|
|
|
|
// 支持中文数字(零一二三四五六七八九十)、阿拉伯数字(0-9)、罗马数字(Ⅰ-Ⅻ)
|
|
@@ -248,13 +398,12 @@ func convertNumerals(input string) string {
|
|
|
'Ⅵ': '6', 'Ⅶ': '7', 'Ⅷ': '8', 'Ⅸ': '9', 'Ⅹ': '1', // 仅处理个位
|
|
|
'Ⅺ': '1', 'Ⅻ': '1', // 罗马数字11和12仅处理十位
|
|
|
|
|
|
- 'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5',
|
|
|
- 'F': '6', 'G': '7', 'H': '8', 'J': '9', 'K': '1',
|
|
|
+ // 'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5',
|
|
|
+ // 'F': '6', 'G': '7', 'H': '8', 'J': '9', 'K': '1',
|
|
|
|
|
|
- 'a': '1', 'b': '2', 'c': '3', 'd': '4', 'e': '5',
|
|
|
- 'f': '6', 'g': '7', 'h': '8', 'j': '9', 'k': '1',
|
|
|
+ // 'a': '1', 'b': '2', 'c': '3', 'd': '4', 'e': '5',
|
|
|
+ // 'f': '6', 'g': '7', 'h': '8', 'j': '9', 'k': '1',
|
|
|
}
|
|
|
-
|
|
|
var result strings.Builder
|
|
|
var result1 strings.Builder
|
|
|
for _, char := range input {
|
|
@@ -293,33 +442,6 @@ func convertNumerals(input string) string {
|
|
|
return result.String()
|
|
|
}
|
|
|
|
|
|
-// 检查HTML文本中是否存在多层表格嵌套
|
|
|
-func hasNestedTables(htmlContent string) bool {
|
|
|
- doc, err := html.Parse(strings.NewReader(htmlContent))
|
|
|
- if err != nil {
|
|
|
- return false
|
|
|
- }
|
|
|
-
|
|
|
- var hasNested bool
|
|
|
- var checkNested func(node *html.Node, depth int)
|
|
|
- checkNested = func(node *html.Node, depth int) {
|
|
|
- if node.Type == html.ElementNode && node.Data == "table" {
|
|
|
- if depth > 0 { // 非顶层表格
|
|
|
- hasNested = true
|
|
|
- return
|
|
|
- }
|
|
|
- depth++
|
|
|
- }
|
|
|
-
|
|
|
- for c := node.FirstChild; c != nil && !hasNested; c = c.NextSibling {
|
|
|
- checkNested(c, depth)
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- checkNested(doc, 0)
|
|
|
- return hasNested
|
|
|
-}
|
|
|
-
|
|
|
// Unicode判断工具函数
|
|
|
func isChineseRune(r rune) bool {
|
|
|
// 基础汉字检测
|
|
@@ -357,6 +479,373 @@ func isChineseRune(r rune) bool {
|
|
|
return false
|
|
|
}
|
|
|
|
|
|
// CellData holds the data of a single table cell.
type CellData struct {
	Text     string `json:"text"`     // text content of the cell
	IsHeader bool   `json:"isHeader"` // whether the cell is treated as a header cell
}
|
|
|
+
|
|
|
// RowData holds one table row together with its header weight.
type RowData struct {
	Row []CellData `json:"row"` // cells of the row
	// Header weight of the row: 100, 50 or 0 (setRowsHeaderRote also assigns 1).
	// NOTE(review): the JSON key is "isHeader" although the field is an integer
	// weight — looks copied from CellData.IsHeader; confirm before renaming,
	// since that would change the serialized format.
	HeaderRote int `json:"isHeader"`
}
|
|
|
+
|
|
|
// TableData holds the row/column data of one table plus its position in the
// table nesting hierarchy.
type TableData struct {
	Rows        []RowData `json:"rows"`        // rows of the table
	NestedLevel int       `json:"nestedLevel"` // nesting depth; 0 means a top-level table
	ChildTables []int     `json:"childTables"` // indexes of the tables nested in this one
	ParentIndex int       `json:"parentIndex"` // index of the parent table; -1 means no parent
	HasMerged   bool      `json:"hasMerged"`   // whether the table contains merged cells
}
|
|
|
+
|
|
|
// AllTablesData holds the data of all tables found in a document. The
// ParentIndex and ChildTables fields of TableData are indexes into Tables.
type AllTablesData struct {
	Tables []TableData `json:"tables"`
}
|
|
|
+
|
|
|
+// TableExtractDatas 解析HTML中的表格,返回Markdown格式和所有表格的行列数据
|
|
|
+func TableExtractDatas(htmlStr string) (string, *AllTablesData, error) {
|
|
|
+ doc, err := html.Parse(strings.NewReader(htmlStr))
|
|
|
+ if err != nil {
|
|
|
+ return "", nil, err
|
|
|
+ }
|
|
|
+
|
|
|
+ allTablesData := &AllTablesData{}
|
|
|
+ var markdownBuilder strings.Builder
|
|
|
+ tableIndex := 0
|
|
|
+
|
|
|
+ // 递归解析表格
|
|
|
+ var parseNode func(*html.Node, int, int)
|
|
|
+ parseNode = func(n *html.Node, level int, parentIdx int) {
|
|
|
+ if n.Type == html.ElementNode && n.Data == "table" {
|
|
|
+ // 记录当前表格的父索引
|
|
|
+ currentParent := parentIdx
|
|
|
+ currentIndex := tableIndex
|
|
|
+
|
|
|
+ // 解析当前表格
|
|
|
+ tableMarkdown, tableData := parseTable(n, level, currentIndex, currentParent)
|
|
|
+ tableData.NestedLevel = level
|
|
|
+ tableData.ParentIndex = currentParent
|
|
|
+
|
|
|
+ // 添加到结果集
|
|
|
+ allTablesData.Tables = append(allTablesData.Tables, tableData)
|
|
|
+ markdownBuilder.WriteString(tableMarkdown)
|
|
|
+ markdownBuilder.WriteString("\n\n")
|
|
|
+
|
|
|
+ // 更新父表格的子表格列表
|
|
|
+ if currentParent != -1 {
|
|
|
+ parentTable := &allTablesData.Tables[currentParent]
|
|
|
+ parentTable.ChildTables = append(parentTable.ChildTables, currentIndex)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 增加表格索引
|
|
|
+ tableIndex++
|
|
|
+
|
|
|
+ // 递归处理子节点(使用新的父索引)
|
|
|
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
|
+ parseNode(c, level+1, currentIndex)
|
|
|
+ }
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+ // 递归处理其他节点
|
|
|
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
|
+ parseNode(c, level, parentIdx)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 从文档根节点开始解析
|
|
|
+ parseNode(doc, 0, -1)
|
|
|
+ return markdownBuilder.String(), allTablesData, nil
|
|
|
+}
|
|
|
+
|
|
|
+// 获取单元格的rowspan和colspan属性
|
|
|
+func getSpanTable(cell *html.Node) (int, int) {
|
|
|
+ rowspan, colspan := 1, 1
|
|
|
+ for _, attr := range cell.Attr {
|
|
|
+ switch attr.Key {
|
|
|
+ case "rowspan":
|
|
|
+ if val, err := strconv.Atoi(attr.Val); err == nil && val > 1 {
|
|
|
+ rowspan = val
|
|
|
+ }
|
|
|
+ case "colspan":
|
|
|
+ if val, err := strconv.Atoi(attr.Val); err == nil && val > 1 {
|
|
|
+ colspan = val
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return rowspan, colspan
|
|
|
+}
|
|
|
+
|
|
|
+// 解析单个表格
|
|
|
+func parseTable(tableNode *html.Node, level int, currentIndex int, parentIndex int) (string, TableData) {
|
|
|
+ var tableData TableData
|
|
|
+ tableData.ParentIndex = parentIndex
|
|
|
+ tableData.HasMerged = false
|
|
|
+
|
|
|
+ // 使用网格(grid)构建表格结构
|
|
|
+ var grid []RowData
|
|
|
+ rows := getTableRows(tableNode)
|
|
|
+
|
|
|
+ // 处理表格行
|
|
|
+ for rowIdx, row := range rows {
|
|
|
+ // 扩展grid到当前行
|
|
|
+ if rowIdx >= len(grid) {
|
|
|
+ grid = append(grid, RowData{})
|
|
|
+ }
|
|
|
+
|
|
|
+ // 跳过已被合并单元格占用的位置
|
|
|
+ col := 0
|
|
|
+ for col < len(grid[rowIdx].Row) && grid[rowIdx].Row[col].Text != "" {
|
|
|
+ col++
|
|
|
+ }
|
|
|
+
|
|
|
+ cells := getRowCells(row)
|
|
|
+ for _, cell := range cells {
|
|
|
+ // 获取单元格的跨行跨列属性
|
|
|
+ rowspan, colspan := getSpanTable(cell)
|
|
|
+ if rowspan > 1 || colspan > 1 {
|
|
|
+ tableData.HasMerged = true
|
|
|
+ }
|
|
|
+
|
|
|
+ // 确保grid有足够的行
|
|
|
+ for len(grid) < rowIdx+rowspan {
|
|
|
+ grid = append(grid, RowData{})
|
|
|
+ }
|
|
|
+
|
|
|
+ // 确保所有相关行有足够的列
|
|
|
+ targetCol := col + colspan
|
|
|
+ for r := rowIdx; r < rowIdx+rowspan; r++ {
|
|
|
+ if len(grid[r].Row) < targetCol {
|
|
|
+ // 扩展行
|
|
|
+ newRow := make([]CellData, targetCol)
|
|
|
+ copy(newRow, grid[r].Row)
|
|
|
+ grid[r].Row = newRow
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 提取单元格文本并确定是否为表头
|
|
|
+ text := extractCellText(cell)
|
|
|
+ text = RemoveAllSpaces(text)
|
|
|
+ isHeader := cell.Data == "th"
|
|
|
+ //根据text内容和常见关键词判断,是否是表头
|
|
|
+ if !isHeader && len([]rune(text)) < 20 {
|
|
|
+ //核心词走正则匹配
|
|
|
+ if theadPackWordsReg.MatchString(text) {
|
|
|
+ isHeader = true
|
|
|
+ }
|
|
|
+ if !isHeader {
|
|
|
+ isHeader = theadWordsListComReg.MatchString(text)
|
|
|
+ }
|
|
|
+ //非核心词,走EqualFold匹配
|
|
|
+ if !isHeader {
|
|
|
+ for _, word := range theadWordsList {
|
|
|
+ if strings.EqualFold(word, text) {
|
|
|
+ isHeader = true
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // log.Println("cellData IsHeader", isHeader, text)
|
|
|
+ cellData := CellData{Text: text, IsHeader: isHeader}
|
|
|
+
|
|
|
+ // 将单元格数据填充到所有合并位置
|
|
|
+ for r := 0; r < rowspan; r++ {
|
|
|
+ for c := 0; c < colspan; c++ {
|
|
|
+ grid[rowIdx+r].Row[col+c] = cellData
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 移动到下一列位置
|
|
|
+ col += colspan
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 设置最终的行数据
|
|
|
+ tableData.Rows = grid
|
|
|
+
|
|
|
+ // 生成Markdown表格 (保持原有逻辑,只使用文本内容)
|
|
|
+ markdown := generateMarkdownTable(grid, level)
|
|
|
+
|
|
|
+ return markdown, tableData
|
|
|
+}
|
|
|
+
|
|
|
+// 生成Markdown格式的表格 (只使用单元格文本)
|
|
|
+func generateMarkdownTable(grid []RowData, level int) string {
|
|
|
+ if len(grid) == 0 {
|
|
|
+ return ""
|
|
|
+ }
|
|
|
+
|
|
|
+ var builder strings.Builder
|
|
|
+ builder.WriteString(fmt.Sprintf("### Table at level %d\n\n", level))
|
|
|
+
|
|
|
+ // 添加表头
|
|
|
+ for i, row := range grid {
|
|
|
+ builder.WriteString("| ")
|
|
|
+ for j, cell := range row.Row {
|
|
|
+ text := cell.Text
|
|
|
+ if text == "" {
|
|
|
+ builder.WriteString(" ")
|
|
|
+ } else {
|
|
|
+ builder.WriteString(text)
|
|
|
+ }
|
|
|
+ if j < len(row.Row)-1 {
|
|
|
+ builder.WriteString(" | ")
|
|
|
+ }
|
|
|
+ }
|
|
|
+ builder.WriteString(" |\n")
|
|
|
+
|
|
|
+ // 添加表头分隔线
|
|
|
+ if i == 0 {
|
|
|
+ builder.WriteString("|")
|
|
|
+ for j := 0; j < len(row.Row); j++ {
|
|
|
+ builder.WriteString(" --- |")
|
|
|
+ }
|
|
|
+ builder.WriteString("\n")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return builder.String()
|
|
|
+}
|
|
|
+
|
|
|
+// 获取表格中的所有行
|
|
|
+func getTableRows(tableNode *html.Node) []*html.Node {
|
|
|
+ var rows []*html.Node
|
|
|
+ var traverse func(*html.Node)
|
|
|
+
|
|
|
+ traverse = func(n *html.Node) {
|
|
|
+ if n.Type == html.ElementNode {
|
|
|
+ switch n.Data {
|
|
|
+ case "tr":
|
|
|
+ rows = append(rows, n)
|
|
|
+ case "thead", "tbody", "tfoot", "table":
|
|
|
+ // 继续遍历
|
|
|
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
|
+ traverse(c)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ for c := tableNode.FirstChild; c != nil; c = c.NextSibling {
|
|
|
+ traverse(c)
|
|
|
+ }
|
|
|
+ return rows
|
|
|
+}
|
|
|
+
|
|
|
+// 获取行中的所有单元格
|
|
|
+func getRowCells(rowNode *html.Node) []*html.Node {
|
|
|
+ var cells []*html.Node
|
|
|
+ for c := rowNode.FirstChild; c != nil; c = c.NextSibling {
|
|
|
+ if c.Type == html.ElementNode && (c.Data == "td" || c.Data == "th") {
|
|
|
+ cells = append(cells, c)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return cells
|
|
|
+}
|
|
|
+
|
|
|
+// 提取单元格文本
|
|
|
+func extractCellText(cellNode *html.Node) string {
|
|
|
+ var textBuilder strings.Builder
|
|
|
+ var extract func(*html.Node)
|
|
|
+
|
|
|
+ extract = func(n *html.Node) {
|
|
|
+ if n.Type == html.TextNode {
|
|
|
+ textBuilder.WriteString(strings.TrimSpace(n.Data))
|
|
|
+ textBuilder.WriteString(" ")
|
|
|
+ } else if n.Type == html.ElementNode {
|
|
|
+ // 跳过嵌套表格
|
|
|
+ if n.Data != "table" {
|
|
|
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
|
+ extract(c)
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ textBuilder.WriteString("[Table]")
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ for c := cellNode.FirstChild; c != nil; c = c.NextSibling {
|
|
|
+ extract(c)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 清理文本
|
|
|
+ result := strings.TrimSpace(textBuilder.String())
|
|
|
+ if result == "" {
|
|
|
+ return " "
|
|
|
+ }
|
|
|
+ return result
|
|
|
+}
|
|
|
+
|
|
|
// Patterns used by GetChineseText, compiled once at package level instead of
// on every call.
var (
	chineseTextTagReg = regexp.MustCompile("<[^>]*>")          // any HTML tag
	chineseTextHanReg = regexp.MustCompile("[\u4e00-\u9fa5]") // one Chinese character (U+4E00..U+9FA5)
)

// GetChineseText strips HTML tags from htmlContent and returns only the
// Chinese characters (U+4E00..U+9FA5) of the remaining text, concatenated in
// their original order. Everything else (letters, digits, punctuation,
// whitespace) is dropped.
func GetChineseText(htmlContent string) string {
	cleanText := chineseTextTagReg.ReplaceAllString(htmlContent, "")
	return strings.Join(chineseTextHanReg.FindAllString(cleanText, -1), "")
}
|
|
|
+
|
|
|
// RemoveAllSpaces returns s with every whitespace character (as defined by
// unicode.IsSpace) removed.
func RemoveAllSpaces(s string) string {
	return strings.Map(func(r rune) rune {
		if unicode.IsSpace(r) {
			// A negative result tells strings.Map to drop the character.
			return -1
		}
		return r
	}, s)
}
|
|
|
+
|
|
|
// ByLength implements sort.Interface for a string slice, ordering the
// strings by their length in bytes, shortest first.
type ByLength []string

// Len returns the number of strings.
func (b ByLength) Len() int {
	return len(b)
}

// Swap exchanges the strings at positions i and j.
func (b ByLength) Swap(i, j int) {
	b[j], b[i] = b[i], b[j]
}

// Less reports whether the string at i is shorter (in bytes) than the one at j.
func (b ByLength) Less(i, j int) bool {
	return len(b[i]) < len(b[j])
}
|
|
|
+
|
|
|
+// 分组函数
|
|
|
+func groupStrings(strings []string) [][]string {
|
|
|
+ if len(strings) == 0 {
|
|
|
+ return nil
|
|
|
+ }
|
|
|
+ // 按长度排序
|
|
|
+ sorted := make(ByLength, len(strings))
|
|
|
+ copy(sorted, strings)
|
|
|
+ sort.Sort(sorted)
|
|
|
+
|
|
|
+ var groups [][]string
|
|
|
+ currentGroup := []string{sorted[0]}
|
|
|
+ currentMaxLen := len(sorted[0])
|
|
|
+
|
|
|
+ // 遍历排序后的字符串,构建分组
|
|
|
+ for i := 1; i < len(sorted); i++ {
|
|
|
+ currentLen := len(sorted[i])
|
|
|
+ if currentLen-currentMaxLen <= 2 {
|
|
|
+ // 当前字符串可以加入当前组
|
|
|
+ currentGroup = append(currentGroup, sorted[i])
|
|
|
+ } else {
|
|
|
+ // 创建新组
|
|
|
+ groups = append(groups, currentGroup)
|
|
|
+ currentGroup = []string{sorted[i]}
|
|
|
+ currentMaxLen = currentLen
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // 添加最后一个组
|
|
|
+ groups = append(groups, currentGroup)
|
|
|
+ return groups
|
|
|
+}
|
|
|
+
|
|
|
// 判断是否是联合体中标
|
|
|
func isConsortiumKeysReg(content string) bool {
|
|
|
return consortiumKeysReg.MatchString(content)
|