|
@@ -0,0 +1,329 @@
|
|
|
+// util
|
|
|
+package main
|
|
|
+
|
|
|
+import (
|
|
|
+ "fmt"
|
|
|
+ "log"
|
|
|
+ "regexp"
|
|
|
+ "strings"
|
|
|
+ "time"
|
|
|
+
|
|
|
+ "github.com/PuerkitoBio/goquery"
|
|
|
+ "golang.org/x/net/html"
|
|
|
+)
|
|
|
+
|
|
|
// Keyword tables and pre-compiled patterns shared by the table and text
// helpers in this file.
var (
	// theadWords is a regexp alternation of column titles (lot/package,
	// winning bidder, supplier, item, price ...) that isHeaderRow uses to
	// recognise a header cell.
	theadWords = `(标段|标包|标包号|包号|包段|子包号|子标段名称|子项|包件号|包件代码|包件编号|分包编码|分包名称|分标编号|分标编码|合同段|包件名称|标包名称|` +
		`中标单位|中标人|中商人|成交人|成交人名称|供应商|供应商名称|项目名称|项目地址|标的|标的名称|标项名称|采购合同|` +
		`成交价格|中标报价|简要规格描述)`
	// theadWords_order is a second, shorter alternation that isHeaderRow
	// tries when theadWords does not match a cell.
	theadWords_order    = `(包件号|标的|标段|候选人|供应商)`
	theadWordsReg       = regexp.MustCompile(theadWords)
	theadWordsReg_order = regexp.MustCompile(theadWords_order)

	// delRowKeys marks "not awarded" content; getPackAgeByTable rejects a
	// header row and drops a data row when any cell matches it.
	delRowKeys    = `未成交|未中标原因`
	delRowKeysReg = regexp.MustCompile(delRowKeys)

	// reverseTheadKeys lists negative header keywords per table category.
	// getPackAgeByTable rejects a header row once a category collects more
	// than one keyword hit, which filters out distracting tables.
	reverseTheadKeys = map[string][]string{
		"bidlist": {"品牌", "规格型号", "数量", "单价", "报价得分", "总分"},
		//"spotcheck": {"项目名称", "抽取家数"},
	}

	// consortium detects consortium (joint-venture) bids; it backs
	// isConsortiumKeysReg.
	consortium        = `(联合体牵头人|联合体成员[:: ].{5,30}(公司|院|大学|研究所))|(中标单位[:: ].{5,60}(联合体))`
	consortiumKeysReg = regexp.MustCompile(consortium)

	// clearKeys holds regexp fragments for inline noise: contractor grade,
	// bid-opening room, decimal numbers and attachment file names.
	// The dot before the file extension is escaped so that only real
	// "name.ext" attachments match, not arbitrary text ending in "doc"/"pdf".
	// NOTE(review): presumably passed to cleanWebText as keywords — confirm.
	clearKeys = []string{`承包(一|二|三|四)级`, `开标(\d)(室|厅)`, `\d+\.\d+`, `\d+(.{1,10})\.(pdf|doc|zip|rar)`}
	// clearKeysBack holds markers (previous/next article, past performance)
	// after which the rest of a page is noise.
	// NOTE(review): presumably passed to cleanWebText as keywordsback — confirm.
	clearKeysBack = []string{"上一篇", "下一篇", "历史业绩", "候选人企业业绩"}
)
|
|
|
+
|
|
|
// getIdFromDate converts two "YYYY-MM-DD" dates into hexadecimal boundary
// strings: the Unix timestamp of each date rendered as upper-case hex and
// followed by sixteen '0' characters.
//
// The dates are parsed with time.Parse, which assumes UTC when the layout has
// no zone, so each boundary is midnight UTC of the given day.
// NOTE(review): timestamp-hex plus sixteen zeros looks like a lower-bound
// MongoDB ObjectID for that second — confirm against the callers.
//
// A date that cannot be parsed is logged and yields "" for that position,
// instead of an ID silently built from the zero time.
func getIdFromDate(startStr, endStr string) (string, string) {
	const layout = "2006-01-02"
	const idSuffix = "0000000000000000"

	toID := func(label, value string) string {
		t, err := time.Parse(layout, value)
		if err != nil {
			log.Printf("getIdFromDate: invalid %s date %q: %v", label, value, err)
			return ""
		}
		return fmt.Sprintf("%X", t.Unix()) + idSuffix
	}

	return toID("start", startStr), toID("end", endStr)
}
|
|
|
+
|
|
|
+// 判断是否有嵌套表格
|
|
|
+func tableIsPackage(htmlContent string) (bool, int) {
|
|
|
+ //判断是否有多层表格嵌套
|
|
|
+ if hasNestedTables(htmlContent) {
|
|
|
+ //log.Println("表格嵌套")
|
|
|
+ return false, 0
|
|
|
+ }
|
|
|
+ ispack := false
|
|
|
+ tablesMixRows := 0
|
|
|
+ tablesData := getPackAgeByTable(htmlContent)
|
|
|
+ for _, dataRows := range tablesData {
|
|
|
+ // for k, v := range dataRows {
|
|
|
+ // log.Println(i, k, v)
|
|
|
+ // }
|
|
|
+ if len(dataRows) > 2 {
|
|
|
+ ispack = true
|
|
|
+ }
|
|
|
+ if tablesMixRows < len(dataRows) {
|
|
|
+ tablesMixRows = len(dataRows)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //log.Println(ispack, tablesMixRows)
|
|
|
+ return ispack, tablesMixRows
|
|
|
+}
|
|
|
+
|
|
|
+// 提取疑似表格分包数据
|
|
|
+func getPackAgeByTable(htmlContent string) map[string][]map[string]string {
|
|
|
+ // 解析HTML文档
|
|
|
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
|
|
|
+ if err != nil {
|
|
|
+ log.Println(err)
|
|
|
+ }
|
|
|
+ // 遍历所有表格
|
|
|
+ tableDataRows := map[string][]map[string]string{}
|
|
|
+ doc.Find("table").Each(func(i int, table *goquery.Selection) {
|
|
|
+ var headers []string
|
|
|
+ var rows []map[string]string
|
|
|
+ // 遍历表格行
|
|
|
+ table.Find("tr").Each(func(rowIdx int, row *goquery.Selection) {
|
|
|
+ // 自动识别表头行(根据单元格内容特征)
|
|
|
+ if isHeaderRow(row) && len(headers) < 1 {
|
|
|
+ isDelHeader := false
|
|
|
+ tmphead := []string{}
|
|
|
+
|
|
|
+ bidTheadKeysIndexNum := map[string]int{} //记录满足剔除的表头的阈值
|
|
|
+ row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) {
|
|
|
+ text := strings.TrimSpace(cell.Text())
|
|
|
+ tmphead = append(tmphead, text)
|
|
|
+ if delRowKeysReg.MatchString(text) {
|
|
|
+ isDelHeader = true
|
|
|
+ }
|
|
|
+ //如果是标的物、评分、抽查列表,剔除
|
|
|
+ for k, theadKeys := range reverseTheadKeys {
|
|
|
+ for _, v := range theadKeys {
|
|
|
+ if strings.Contains(text, v) {
|
|
|
+ bidTheadKeysIndexNum[k]++
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if bidTheadKeysIndexNum[k] > 1 { //满足一个以上的表头删除
|
|
|
+ isDelHeader = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ })
|
|
|
+ //log.Println("tmphead th", tmphead)
|
|
|
+ if len(tmphead) < 1 {
|
|
|
+ row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
|
|
|
+ text := strings.TrimSpace(cell.Text())
|
|
|
+ tmphead = append(tmphead, text)
|
|
|
+ if delRowKeysReg.MatchString(text) {
|
|
|
+ isDelHeader = true
|
|
|
+ }
|
|
|
+ //如果是标的物、评分、抽查列表,剔除
|
|
|
+ for k, theadKeys := range reverseTheadKeys {
|
|
|
+ for _, v := range theadKeys {
|
|
|
+ if strings.Contains(text, v) {
|
|
|
+ bidTheadKeysIndexNum[k]++
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if bidTheadKeysIndexNum[k] > 1 {
|
|
|
+ isDelHeader = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ })
|
|
|
+ }
|
|
|
+ //log.Println("tmphead td", tmphead)
|
|
|
+ if !isDelHeader {
|
|
|
+ headers = append(headers, tmphead...)
|
|
|
+ }
|
|
|
+ //log.Println("headers", headers)
|
|
|
+ }
|
|
|
+ // 处理数据行
|
|
|
+ if len(headers) > 0 {
|
|
|
+ isDelRows := false //是否需要屏蔽词
|
|
|
+ rowData := make(map[string]string)
|
|
|
+ row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
|
|
|
+ if cellIdx < len(headers) {
|
|
|
+ header := headers[cellIdx]
|
|
|
+ text := strings.TrimSpace(cell.Text())
|
|
|
+ rowData[header] = text
|
|
|
+ if delRowKeysReg.MatchString(text) {
|
|
|
+ isDelRows = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ })
|
|
|
+ //log.Println(isDelRows, rowData)
|
|
|
+ if !isDelRows {
|
|
|
+ rows = append(rows, rowData)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ })
|
|
|
+ tableDataRows[fmt.Sprint(i)] = rows
|
|
|
+ })
|
|
|
+ return tableDataRows
|
|
|
+}
|
|
|
+
|
|
|
+// 自定义表头判断逻辑(根据单元格内容特征)
|
|
|
+func isHeaderRow(row *goquery.Selection) bool {
|
|
|
+ // 判断条件示例 包含 theadWords 特定关键词
|
|
|
+ hasAttributeKeyword := false
|
|
|
+ matchNum := 0
|
|
|
+ row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
|
|
|
+ text := strings.TrimSpace(cell.Text())
|
|
|
+ if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 {
|
|
|
+ matchNum++
|
|
|
+ hasAttributeKeyword = true
|
|
|
+ } else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 {
|
|
|
+ matchNum++
|
|
|
+ hasAttributeKeyword = true
|
|
|
+ }
|
|
|
+ //log.Println(text, matchNum, hasAttributeKeyword)
|
|
|
+ })
|
|
|
+ row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) {
|
|
|
+ text := strings.TrimSpace(cell.Text())
|
|
|
+ if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 {
|
|
|
+ matchNum++
|
|
|
+ hasAttributeKeyword = true
|
|
|
+ } else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 {
|
|
|
+ matchNum++
|
|
|
+ hasAttributeKeyword = true
|
|
|
+ }
|
|
|
+ //log.Println(text, matchNum, hasAttributeKeyword)
|
|
|
+ })
|
|
|
+ //log.Println("isHeaderRow", matchNum, hasAttributeKeyword, matchNum > 1 && hasAttributeKeyword)
|
|
|
+ return matchNum > 1 && hasAttributeKeyword
|
|
|
+}
|
|
|
// tableBlockReg matches a <table ...> opening tag through the nearest
// following </table>, case-insensitively and across line breaks.
var tableBlockReg = regexp.MustCompile(`(?i)<table[^>]*>[\s\S]*?</table>`)

// removeTables returns html with every <table>...</table> span removed.
//
// The match is lazy and ends at the first closing tag, so for nested tables
// the remainder of the outer table after the inner </table> is left in place.
func removeTables(html string) string {
	return tableBlockReg.ReplaceAllString(html, "")
}
|
|
|
+
|
|
|
// cleanWebText strips noise from input in two passes.
//
//  1. Tail cut: the first occurrence of any keywordsback entry and everything
//     after it (including later lines) is removed.
//  2. Inline removal: every match of any keywords entry is removed.
//
// Entries of both lists are joined with "|" and compiled as regular
// expressions, so they may contain regexp syntax. An empty list skips its
// pass; in particular an empty keywordsback no longer builds a pattern that
// matches, and therefore deletes, the whole input.
//
// If the tail-cut pattern fails to compile, input is returned unchanged; if
// the inline pattern fails to compile, the tail-cut result is returned.
func cleanWebText(input string, keywords, keywordsback []string) string {
	if len(keywordsback) > 0 {
		// (?s) lets ".*" run across newlines to the end of the text.
		tailRe, err := regexp.Compile("(?s)(" + strings.Join(keywordsback, "|") + ").*")
		if err != nil {
			return input
		}
		input = tailRe.ReplaceAllString(input, "")
	}

	if len(keywords) == 0 {
		return input
	}
	inlineRe, err := regexp.Compile(strings.Join(keywords, "|"))
	if err != nil {
		return input
	}
	return inlineRe.ReplaceAllString(input, "")
}
|
|
|
+
|
|
|
// numeralDigits maps single-rune Chinese digits and Roman numeral code points
// (Ⅰ–Ⅻ) to their Arabic spelling. '十' is handled positionally in
// convertNumerals and is therefore not listed here.
var numeralDigits = map[rune]string{
	'零': "0", '一': "1", '二': "2", '三': "3", '四': "4",
	'五': "5", '六': "6", '七': "7", '八': "8", '九': "9",
	'Ⅰ': "1", 'Ⅱ': "2", 'Ⅲ': "3", 'Ⅳ': "4", 'Ⅴ': "5", 'Ⅵ': "6",
	'Ⅶ': "7", 'Ⅷ': "8", 'Ⅸ': "9", 'Ⅹ': "10", 'Ⅺ': "11", 'Ⅻ': "12",
}

// isChineseUnitDigit reports whether r is one of the Chinese digits 一..九.
func isChineseUnitDigit(r rune) bool {
	return strings.ContainsRune("一二三四五六七八九", r)
}

// convertNumerals rewrites Chinese digits (零一二三四五六七八九十) and Roman
// numeral code points (Ⅰ–Ⅻ) in input as Arabic digits. Arabic digits and
// all other characters are copied through unchanged.
//
// '十' is converted by position so that values up to 99 come out right:
// 十 -> 10, 十二 -> 12, 二十 -> 20, 二十三 -> 23. The Roman code points Ⅹ, Ⅺ
// and Ⅻ become 10, 11 and 12. Larger units such as 百 or 千 are not
// interpreted and are copied through.
func convertNumerals(input string) string {
	runes := []rune(input)

	var result strings.Builder
	result.Grow(len(input))

	for i, r := range runes {
		if r == '十' {
			hasTens := i > 0 && isChineseUnitDigit(runes[i-1])
			hasUnits := i+1 < len(runes) && isChineseUnitDigit(runes[i+1])
			switch {
			case !hasTens && !hasUnits:
				result.WriteString("10") // bare 十
			case !hasTens:
				result.WriteString("1") // 十二: the next rune supplies the units
			case !hasUnits:
				result.WriteString("0") // 二十: the previous rune supplied the tens
			}
			// 二十三: both neighbours already carry the value, emit nothing.
			continue
		}

		if digits, ok := numeralDigits[r]; ok {
			result.WriteString(digits)
			continue
		}

		result.WriteRune(r)
	}

	return result.String()
}
|
|
|
+
|
|
|
+// 检查HTML文本中是否存在多层表格嵌套
|
|
|
+func hasNestedTables(htmlContent string) bool {
|
|
|
+ doc, err := html.Parse(strings.NewReader(htmlContent))
|
|
|
+ if err != nil {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+
|
|
|
+ var hasNested bool
|
|
|
+ var checkNested func(node *html.Node, depth int)
|
|
|
+ checkNested = func(node *html.Node, depth int) {
|
|
|
+ if node.Type == html.ElementNode && node.Data == "table" {
|
|
|
+ if depth > 0 { // 非顶层表格
|
|
|
+ hasNested = true
|
|
|
+ return
|
|
|
+ }
|
|
|
+ depth++
|
|
|
+ }
|
|
|
+
|
|
|
+ for c := node.FirstChild; c != nil && !hasNested; c = c.NextSibling {
|
|
|
+ checkNested(c, depth)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ checkNested(doc, 0)
|
|
|
+ return hasNested
|
|
|
+}
|
|
|
+
|
|
|
// isChineseRune reports whether r is a Chinese character or Chinese-style
// punctuation. It returns true for:
//   - U+4E00–U+9FFF (CJK Unified Ideographs);
//   - U+3000–U+303F (CJK Symbols and Punctuation, which already contains
//     【】《》〈〉, so those need no separate check);
//   - U+FF00–U+FFEF (full-width and half-width forms), except full-width
//     Latin letters and full-width digits;
//   - curly quotes U+2018/2019/201C/201D, em dash U+2014, ellipsis U+2026.
func isChineseRune(r rune) bool {
	switch {
	case r >= 0x4E00 && r <= 0x9FFF:
		return true
	case r >= 0x3000 && r <= 0x303F:
		return true
	case r >= 0xFF00 && r <= 0xFFEF:
		fullWidthLetter := (r >= 0xFF21 && r <= 0xFF3A) || (r >= 0xFF41 && r <= 0xFF5A)
		fullWidthDigit := r >= 0xFF10 && r <= 0xFF19
		return !fullWidthLetter && !fullWidthDigit
	}

	switch r {
	case 0x2018, 0x2019, 0x201C, 0x201D, // curly quotes
		0x2014, 0x2026: // em dash, ellipsis
		return true
	}
	return false
}
|
|
|
+
|
|
|
+// 判断是否是联合体中标
|
|
|
+func isConsortiumKeysReg(content string) bool {
|
|
|
+ return consortiumKeysReg.MatchString(content)
|
|
|
+}
|