// util: helpers for HTML table extraction and text clean-up.
- package main
import (
	"fmt"
	"log"
	"regexp"
	"sort"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
)
// Keyword tables and pre-compiled patterns shared by the table-extraction and
// text clean-up helpers in this file.
var (
	// theadWords is an alternation of column titles; a short cell matching it
	// counts towards classifying its row as a table header.
	theadWords = "(标段|标包|标包号|包号|包段|子包号|子标段名称|子项|包件号|包件代码|包件编号|分包编码|分包名称|分标编号|分标编码|合同段|包件名称|标包名称|" +
		"中标单位|中标人|中商人|成交人|成交人名称|供应商|供应商名称|项目名称|项目地址|标的|标的名称|标项名称|采购合同|" +
		"成交价格|中标报价|简要规格描述|预算金额|最高限价|中标、成交供应商|中标、成交金额|中标、成交结果情况说明)"

	// theadWords_order is a second, smaller alternation consulted when a cell
	// does not match theadWords.
	theadWords_order = "(包件号|标的|标段|候选人|供应商)"

	theadWordsReg       = regexp.MustCompile(theadWords)
	theadWordsReg_order = regexp.MustCompile(theadWords_order)

	// delRowKeys marks header or data cells whose row must be discarded.
	delRowKeys    = "未成交|未中标原因"
	delRowKeysReg = regexp.MustCompile(delRowKeys)

	// reverseTheadKeys holds negative header keywords, grouped by table kind,
	// used to reject tables that only look like package listings.
	reverseTheadKeys = map[string][]string{
		"bidlist": {"品牌", "规格型号", "数量", "单价", "报价得分", "总分"},
		//"spotcheck": []string{"项目名称", "抽取家数"},
	}

	// consortium detects wording that indicates a consortium (joint) bid.
	consortium        = "(联合体牵头人|联合体成员[:: ].{5,30}(公司|院|大学|研究所))|(中标单位[:: ].{5,60}(联合体))"
	consortiumKeysReg = regexp.MustCompile(consortium)

	// clearKeys are regular-expression fragments for noise to be removed.
	clearKeys = []string{
		"承包(一|二|三|四)级",
		"开标(\\d)(室|厅)",
		"\\d+\\.\\d+",
		"\\d+(.{1,10}).(pdf|doc|zip|rar)",
		"([一二三四五六七八九十0-9]+次)",
	}

	// clearKeysBack are markers for trailing noise sections.
	clearKeysBack = []string{
		"上一篇", "下一篇", "历史业绩", "候选人企业业绩", "候选人类似业绩", "成交候选人业绩", "企业类似项目业绩",
		"投标人业绩", "企业业绩", "投标文件中载明的业绩情况", "质量标准:", "评审专家",
	}

	// replaceMap lists literal substitutions applied to noisy text.
	replaceMap = map[string]string{
		"标项目": "标",
	}
)
// getIdFromDate converts two dates in "2006-01-02" layout into hexadecimal
// range boundaries: the upper-case hex Unix timestamp (seconds) of each date
// followed by sixteen '0' characters.
//
// The dates carry no zone, so time.Parse interprets them as midnight UTC.
// A date that fails to parse is logged and treated as the zero time.Time,
// which keeps the historical return value for such input.
// NOTE(review): the result looks like a timestamp-prefixed ID lower bound
// (e.g. a MongoDB ObjectID range) — confirm against callers.
func getIdFromDate(startStr, endStr string) (string, string) {
	return dateToHexID(startStr), dateToHexID(endStr)
}

// dateToHexID renders one "2006-01-02" date as hex seconds plus sixteen zeros.
func dateToHexID(date string) string {
	t, err := time.Parse("2006-01-02", date)
	if err != nil {
		// Keep going with the zero time so the output is unchanged for
		// callers, but make the bad input visible.
		log.Printf("getIdFromDate: invalid date %q: %v", date, err)
	}
	return fmt.Sprintf("%X", t.Unix()) + "0000000000000000"
}
- // 判断是否有嵌套表格
- func tableIsPackage(htmlContent string) (bool, int) {
- //判断是否有多层表格嵌套
- if hasNestedTables(htmlContent) {
- //log.Println("表格嵌套")
- return false, 0
- }
- ispack := false
- tablesMixRows := 0
- tablesData := getPackAgeByTable(htmlContent)
- for _, dataRows := range tablesData {
- // for k, v := range dataRows {
- // log.Println(i, k, v)
- // }
- if len(dataRows) > 2 {
- ispack = true
- }
- if tablesMixRows < len(dataRows) {
- tablesMixRows = len(dataRows)
- }
- }
- //log.Println(ispack, tablesMixRows)
- return ispack, tablesMixRows
- }
- // 提取疑似表格分包数据
- func getPackAgeByTable(htmlContent string) map[string][]map[string]string {
- // 解析HTML文档
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
- if err != nil {
- log.Println(err)
- }
- // 遍历所有表格
- tableDataRows := map[string][]map[string]string{}
- doc.Find("table").Each(func(i int, table *goquery.Selection) {
- var headers []string
- var rows []map[string]string
- // 遍历表格行
- table.Find("tr").Each(func(rowIdx int, row *goquery.Selection) {
- // 自动识别表头行(根据单元格内容特征)
- if isHeaderRow(row) && len(headers) < 1 {
- isDelHeader := false
- tmphead := []string{}
- bidTheadKeysIndexNum := map[string]int{} //记录满足剔除的表头的阈值
- row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) {
- text := strings.TrimSpace(cell.Text())
- tmphead = append(tmphead, text)
- if delRowKeysReg.MatchString(text) {
- isDelHeader = true
- }
- //如果是标的物、评分、抽查列表,剔除
- for k, theadKeys := range reverseTheadKeys {
- for _, v := range theadKeys {
- if strings.Contains(text, v) {
- bidTheadKeysIndexNum[k]++
- }
- }
- if bidTheadKeysIndexNum[k] > 1 { //满足一个以上的表头删除
- isDelHeader = true
- }
- }
- })
- //log.Println("tmphead th", tmphead)
- if len(tmphead) < 1 {
- row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
- text := strings.TrimSpace(cell.Text())
- tmphead = append(tmphead, text)
- if delRowKeysReg.MatchString(text) {
- isDelHeader = true
- }
- //如果是标的物、评分、抽查列表,剔除
- for k, theadKeys := range reverseTheadKeys {
- for _, v := range theadKeys {
- if strings.Contains(text, v) {
- bidTheadKeysIndexNum[k]++
- }
- }
- if bidTheadKeysIndexNum[k] > 1 {
- isDelHeader = true
- }
- }
- })
- }
- //log.Println("tmphead td", tmphead)
- if !isDelHeader {
- headers = append(headers, tmphead...)
- }
- //log.Println("headers", headers)
- }
- // 处理数据行
- if len(headers) > 0 {
- isDelRows := false //是否需要屏蔽词
- rowData := make(map[string]string)
- row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
- if cellIdx < len(headers) {
- header := headers[cellIdx]
- text := strings.TrimSpace(cell.Text())
- rowData[header] = text
- if delRowKeysReg.MatchString(text) {
- isDelRows = true
- }
- }
- })
- //log.Println(isDelRows, rowData)
- if !isDelRows {
- rows = append(rows, rowData)
- }
- }
- })
- tableDataRows[fmt.Sprint(i)] = rows
- })
- return tableDataRows
- }
- // 自定义表头判断逻辑(根据单元格内容特征)
- func isHeaderRow(row *goquery.Selection) bool {
- // 判断条件示例 包含 theadWords 特定关键词
- hasAttributeKeyword := false
- matchNum := 0
- row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
- text := strings.TrimSpace(cell.Text())
- if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 {
- matchNum++
- hasAttributeKeyword = true
- } else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 {
- matchNum++
- hasAttributeKeyword = true
- }
- //log.Println(text, matchNum, hasAttributeKeyword)
- })
- row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) {
- text := strings.TrimSpace(cell.Text())
- if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 {
- matchNum++
- hasAttributeKeyword = true
- } else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 {
- matchNum++
- hasAttributeKeyword = true
- }
- //log.Println(text, matchNum, hasAttributeKeyword)
- })
- //log.Println("isHeaderRow", matchNum, hasAttributeKeyword, matchNum > 1 && hasAttributeKeyword)
- return matchNum > 1 && hasAttributeKeyword
- }
// tableBlockReg matches a <table ...> opening tag, case-insensitively, up to
// the nearest following </table>. The match is non-greedy, so for nested
// tables it ends at the first closing tag.
var tableBlockReg = regexp.MustCompile(`(?i)<table[^>]*>[\s\S]*?</table>`)

// removeTables returns html with every <table>...</table> block removed.
// The pattern is compiled once at package scope rather than on every call.
func removeTables(html string) string {
	return tableBlockReg.ReplaceAllString(html, "")
}
// repalceString applies every old→new pair in replace to input as a literal,
// replace-all substitution and returns the result.
//
// Pairs are applied in a fixed order — longest key first, ties broken
// lexicographically — so the result is deterministic even when keys overlap.
// Ranging over the map directly would apply them in random order.
// A nil or empty map returns input unchanged.
func repalceString(input string, replace map[string]string) string {
	keys := make([]string, 0, len(replace))
	for k := range replace {
		keys = append(keys, k)
	}
	sort.Slice(keys, func(i, j int) bool {
		if len(keys[i]) != len(keys[j]) {
			return len(keys[i]) > len(keys[j])
		}
		return keys[i] < keys[j]
	})
	for _, k := range keys {
		input = strings.Replace(input, k, replace[k], -1)
	}
	return input
}
// cleanWebText strips noise from input in two stages.
//
//  1. Everything from the first occurrence of any keywordsback entry to the
//     end of the text is removed (entries are joined with "|" into a regular
//     expression, and "." also matches newlines).
//  2. Every match of any keywords entry (also joined with "|" as a regular
//     expression) is removed from what remains.
//
// An empty keywordsback or keywords list skips the corresponding stage. For
// keywordsback this matters: the joined pattern would otherwise be `().*`,
// which matches, and deletes, the entire input. If a pattern fails to
// compile, the text as processed so far is returned.
func cleanWebText(input string, keywords, keywordsback []string) string {
	if len(keywordsback) > 0 {
		tailRe, err := regexp.Compile(fmt.Sprintf(`(?s)(%s).*`, strings.Join(keywordsback, "|")))
		if err != nil {
			return input // invalid pattern: leave the text untouched
		}
		input = tailRe.ReplaceAllString(input, "")
	}

	if len(keywords) == 0 {
		return input
	}
	inlineRe, err := regexp.Compile(strings.Join(keywords, "|"))
	if err != nil {
		return input
	}
	return inlineRe.ReplaceAllString(input, "")
}
// chineseDigitMap maps Chinese numerals to ASCII digits. '十' maps to '1':
// each character is translated on its own, with no positional arithmetic.
var chineseDigitMap = map[rune]rune{
	'零': '0', '一': '1', '二': '2', '三': '3', '四': '4',
	'五': '5', '六': '6', '七': '7', '八': '8', '九': '9',
	'十': '1',
}

// symbolDigitMap maps Roman-numeral code points (Ⅰ–Ⅻ) and the Latin letters
// A–H, J, K (both cases; 'I' is not mapped) to ASCII digits. Ⅹ, Ⅺ, Ⅻ, K and
// k all map to '1'.
var symbolDigitMap = map[rune]rune{
	'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5',
	'Ⅵ': '6', 'Ⅶ': '7', 'Ⅷ': '8', 'Ⅸ': '9', 'Ⅹ': '1',
	'Ⅺ': '1', 'Ⅻ': '1',
	'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5',
	'F': '6', 'G': '7', 'H': '8', 'J': '9', 'K': '1',
	'a': '1', 'b': '2', 'c': '3', 'd': '4', 'e': '5',
	'f': '6', 'g': '7', 'h': '8', 'j': '9', 'k': '1',
}

// convertNumerals extracts a digit string from input using three independent
// readings and returns the longest one:
//
//   - the ASCII digits 0-9 found in input,
//   - the Chinese numerals found in input, translated via chineseDigitMap,
//   - the Roman numerals / letters found in input, translated via
//     symbolDigitMap.
//
// On equal length the earlier reading in the list above wins. Characters are
// translated one by one, so "十二" yields "12" and "二十" yields "21". An
// input with no recognised character yields "".
//
// The readings are compared as strings; a strings.Builder must not be copied
// once it has been written to.
func convertNumerals(input string) string {
	var arabic, chinese, symbolic strings.Builder
	for _, r := range input {
		if r >= '0' && r <= '9' {
			arabic.WriteRune(r)
		}
		if d, ok := chineseDigitMap[r]; ok {
			chinese.WriteRune(d)
		}
		if d, ok := symbolDigitMap[r]; ok {
			symbolic.WriteRune(d)
		}
	}

	best := arabic.String()
	for _, candidate := range []string{chinese.String(), symbolic.String()} {
		// Strictly longer only: ties keep the earlier reading.
		if len(candidate) > len(best) {
			best = candidate
		}
	}
	return best
}
- // 检查HTML文本中是否存在多层表格嵌套
- func hasNestedTables(htmlContent string) bool {
- doc, err := html.Parse(strings.NewReader(htmlContent))
- if err != nil {
- return false
- }
- var hasNested bool
- var checkNested func(node *html.Node, depth int)
- checkNested = func(node *html.Node, depth int) {
- if node.Type == html.ElementNode && node.Data == "table" {
- if depth > 0 { // 非顶层表格
- hasNested = true
- return
- }
- depth++
- }
- for c := node.FirstChild; c != nil && !hasNested; c = c.NextSibling {
- checkNested(c, depth)
- }
- }
- checkNested(doc, 0)
- return hasNested
- }
// isChineseRune reports whether r is treated as a Chinese character or
// Chinese-style punctuation. It accepts:
//
//   - U+4E00–U+9FFF (basic CJK ideographs),
//   - U+3000–U+303F (CJK symbols and punctuation),
//   - U+FF00–U+FFEF (full-width forms), except full-width Latin letters and
//     full-width digits,
//   - a fixed set of quotation marks, the em dash and the ellipsis.
func isChineseRune(r rune) bool {
	switch {
	case r >= 0x4E00 && r <= 0x9FFF:
		return true
	case r >= 0x3000 && r <= 0x303F:
		return true
	case r >= 0xFF00 && r <= 0xFFEF:
		fullWidthLetter := (r >= 0xFF21 && r <= 0xFF3A) || (r >= 0xFF41 && r <= 0xFF5A)
		fullWidthDigit := r >= 0xFF10 && r <= 0xFF19
		return !fullWidthLetter && !fullWidthDigit
	}

	// Individually listed punctuation. The bracket code points also fall in
	// the U+3000–U+303F range handled above.
	for _, special := range []rune{
		0x2018, 0x2019, 0x201C, 0x201D, // quotation marks
		0x2014, 0x2026, // em dash, ellipsis
		0x3010, 0x3011, // 【】
		0x3008, 0x3009, 0x300A, 0x300B, // 〈〉《》
	} {
		if r == special {
			return true
		}
	}
	return false
}
- // 判断是否是联合体中标
- func isConsortiumKeysReg(content string) bool {
- return consortiumKeysReg.MatchString(content)
- }
|