// util package main import ( "fmt" "log" "regexp" "strings" "time" "github.com/PuerkitoBio/goquery" "golang.org/x/net/html" ) var ( theadWords = "(标段|标包|标包号|包号|包段|子包号|子标段名称|子项|包件号|包件代码|包件编号|分包编码|分包名称|分标编号|分标编码|合同段|包件名称|标包名称|" + "中标单位|中标人|中商人|成交人|成交人名称|供应商|供应商名称|项目名称|项目地址|标的|标的名称|标项名称|采购合同|" + "成交价格|中标报价|简要规格描述|预算金额|最高限价|中标、成交供应商|中标、成交金额|中标、成交结果情况说明)" theadWords_order = "(包件号|标的|标段|候选人|供应商)" theadWordsReg = regexp.MustCompile(theadWords) theadWordsReg_order = regexp.MustCompile(theadWords_order) delRowKeys = "未成交|未中标原因" delRowKeysReg = regexp.MustCompile(delRowKeys) //负向表头,用于剔除干扰表格 reverseTheadKeys = map[string][]string{ "bidlist": []string{"品牌", "规格型号", "数量", "单价", "报价得分", "总分"}, //"spotcheck": []string{"项目名称", "抽取家数"}, } //联合体投标判断 consortium = "(联合体牵头人|联合体成员[:: ].{5,30}(公司|院|大学|研究所))|(中标单位[:: ].{5,60}(联合体))" consortiumKeysReg = regexp.MustCompile(consortium) //干扰内容清理 clearKeys = []string{"承包(一|二|三|四)级", "开标(\\d)(室|厅)", "\\d+\\.\\d+", "\\d+(.{1,10}).(pdf|doc|zip|rar)", "([一二三四五六七八九十0-9]+次)"} clearKeysBack = []string{"上一篇", "下一篇", "历史业绩", "候选人企业业绩", "候选人类似业绩", "成交候选人业绩", "企业类似项目业绩", "投标人业绩", "企业业绩", "投标文件中载明的业绩情况", "质量标准:", "评审专家"} //干扰内容替换 replaceMap = map[string]string{ "标项目": "标", } ) func getIdFromDate(startStr, endStr string) (string, string) { start, _ := time.Parse("2006-01-02", startStr) end, _ := time.Parse("2006-01-02", endStr) // 昨天凌晨0点时间戳 hexTimestamp1 := fmt.Sprintf("%X", start.Unix()) + "0000000000000000" // 今天凌晨0点时间戳 hexTimestamp2 := fmt.Sprintf("%X", end.Unix()) + "0000000000000000" return hexTimestamp1, hexTimestamp2 } // 判断是否有嵌套表格 func tableIsPackage(htmlContent string) (bool, int) { //判断是否有多层表格嵌套 if hasNestedTables(htmlContent) { //log.Println("表格嵌套") return false, 0 } ispack := false tablesMixRows := 0 tablesData := getPackAgeByTable(htmlContent) for _, dataRows := range tablesData { // for k, v := range dataRows { // log.Println(i, k, v) // } if len(dataRows) > 2 { ispack = true } if tablesMixRows < len(dataRows) { tablesMixRows = len(dataRows) } } //log.Println(ispack, tablesMixRows) return ispack, tablesMixRows } // 提取疑似表格分包数据 func getPackAgeByTable(htmlContent string) map[string][]map[string]string { // 解析HTML文档 doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent)) if err != nil { log.Println(err) } // 遍历所有表格 tableDataRows := map[string][]map[string]string{} doc.Find("table").Each(func(i int, table *goquery.Selection) { var headers []string var rows []map[string]string // 遍历表格行 table.Find("tr").Each(func(rowIdx int, row *goquery.Selection) { // 自动识别表头行(根据单元格内容特征) if isHeaderRow(row) && len(headers) < 1 { isDelHeader := false tmphead := []string{} bidTheadKeysIndexNum := map[string]int{} //记录满足剔除的表头的阈值 row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) { text := strings.TrimSpace(cell.Text()) tmphead = append(tmphead, text) if delRowKeysReg.MatchString(text) { isDelHeader = true } //如果是标的物、评分、抽查列表,剔除 for k, theadKeys := range reverseTheadKeys { for _, v := range theadKeys { if strings.Contains(text, v) { bidTheadKeysIndexNum[k]++ } } if bidTheadKeysIndexNum[k] > 1 { //满足一个以上的表头删除 isDelHeader = true } } }) //log.Println("tmphead th", tmphead) if len(tmphead) < 1 { row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) { text := strings.TrimSpace(cell.Text()) tmphead = append(tmphead, text) if delRowKeysReg.MatchString(text) { isDelHeader = true } //如果是标的物、评分、抽查列表,剔除 for k, theadKeys := range reverseTheadKeys { for _, v := range theadKeys { if strings.Contains(text, v) { bidTheadKeysIndexNum[k]++ } } if bidTheadKeysIndexNum[k] > 1 { isDelHeader = true } } }) } //log.Println("tmphead td", tmphead) if !isDelHeader { headers = append(headers, tmphead...) } //log.Println("headers", headers) } // 处理数据行 if len(headers) > 0 { isDelRows := false //是否需要屏蔽词 rowData := make(map[string]string) row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) { if cellIdx < len(headers) { header := headers[cellIdx] text := strings.TrimSpace(cell.Text()) rowData[header] = text if delRowKeysReg.MatchString(text) { isDelRows = true } } }) //log.Println(isDelRows, rowData) if !isDelRows { rows = append(rows, rowData) } } }) tableDataRows[fmt.Sprint(i)] = rows }) return tableDataRows } // 自定义表头判断逻辑(根据单元格内容特征) func isHeaderRow(row *goquery.Selection) bool { // 判断条件示例 包含 theadWords 特定关键词 hasAttributeKeyword := false matchNum := 0 row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) { text := strings.TrimSpace(cell.Text()) if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 { matchNum++ hasAttributeKeyword = true } else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 { matchNum++ hasAttributeKeyword = true } //log.Println(text, matchNum, hasAttributeKeyword) }) row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) { text := strings.TrimSpace(cell.Text()) if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 { matchNum++ hasAttributeKeyword = true } else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 { matchNum++ hasAttributeKeyword = true } //log.Println(text, matchNum, hasAttributeKeyword) }) //log.Println("isHeaderRow", matchNum, hasAttributeKeyword, matchNum > 1 && hasAttributeKeyword) return matchNum > 1 && hasAttributeKeyword } // 匹配