|
@@ -15,7 +15,7 @@ import (
|
|
var (
|
|
var (
|
|
theadWords = "(标段|标包|标包号|包号|包段|子包号|子标段名称|子项|包件号|包件代码|包件编号|分包编码|分包名称|分标编号|分标编码|合同段|包件名称|标包名称|" +
|
|
theadWords = "(标段|标包|标包号|包号|包段|子包号|子标段名称|子项|包件号|包件代码|包件编号|分包编码|分包名称|分标编号|分标编码|合同段|包件名称|标包名称|" +
|
|
"中标单位|中标人|中商人|成交人|成交人名称|供应商|供应商名称|项目名称|项目地址|标的|标的名称|标项名称|采购合同|" +
|
|
"中标单位|中标人|中商人|成交人|成交人名称|供应商|供应商名称|项目名称|项目地址|标的|标的名称|标项名称|采购合同|" +
|
|
- "成交价格|中标报价|简要规格描述)"
|
|
|
|
|
|
+ "成交价格|中标报价|简要规格描述|预算金额|最高限价|中标、成交供应商|中标、成交金额|中标、成交结果情况说明)"
|
|
theadWords_order = "(包件号|标的|标段|候选人|供应商)"
|
|
theadWords_order = "(包件号|标的|标段|候选人|供应商)"
|
|
theadWordsReg = regexp.MustCompile(theadWords)
|
|
theadWordsReg = regexp.MustCompile(theadWords)
|
|
theadWordsReg_order = regexp.MustCompile(theadWords_order)
|
|
theadWordsReg_order = regexp.MustCompile(theadWords_order)
|
|
@@ -28,13 +28,21 @@ var (
|
|
"bidlist": []string{"品牌", "规格型号", "数量", "单价", "报价得分", "总分"},
|
|
"bidlist": []string{"品牌", "规格型号", "数量", "单价", "报价得分", "总分"},
|
|
//"spotcheck": []string{"项目名称", "抽取家数"},
|
|
//"spotcheck": []string{"项目名称", "抽取家数"},
|
|
}
|
|
}
|
|
-
|
|
|
|
|
|
+
|
|
//联合体投标判断
|
|
//联合体投标判断
|
|
consortium = "(联合体牵头人|联合体成员[:: ].{5,30}(公司|院|大学|研究所))|(中标单位[:: ].{5,60}(联合体))"
|
|
consortium = "(联合体牵头人|联合体成员[:: ].{5,30}(公司|院|大学|研究所))|(中标单位[:: ].{5,60}(联合体))"
|
|
consortiumKeysReg = regexp.MustCompile(consortium)
|
|
consortiumKeysReg = regexp.MustCompile(consortium)
|
|
|
|
|
|
- clearKeys = []string{"承包(一|二|三|四)级", "开标(\\d)(室|厅)", "\\d+\\.\\d+", "\\d+(.{1,10}).(pdf|doc|zip|rar)"}
|
|
|
|
- clearKeysBack = []string{"上一篇", "下一篇", "历史业绩", "候选人企业业绩"}
|
|
|
|
|
|
+ //干扰内容清理
|
|
|
|
+ clearKeys = []string{"承包(一|二|三|四)级", "开标(\\d)(室|厅)", "\\d+\\.\\d+", "\\d+(.{1,10}).(pdf|doc|zip|rar)",
|
|
|
|
+ "([一二三四五六七八九十0-9]+次)"}
|
|
|
|
+ clearKeysBack = []string{"上一篇", "下一篇", "历史业绩", "候选人企业业绩", "候选人类似业绩", "成交候选人业绩", "企业类似项目业绩",
|
|
|
|
+ "投标人业绩", "企业业绩", "投标文件中载明的业绩情况", "质量标准:", "评审专家"}
|
|
|
|
+
|
|
|
|
+ //干扰内容替换
|
|
|
|
+ replaceMap = map[string]string{
|
|
|
|
+ "标项目": "标",
|
|
|
|
+ }
|
|
)
|
|
)
|
|
|
|
|
|
func getIdFromDate(startStr, endStr string) (string, string) {
|
|
func getIdFromDate(startStr, endStr string) (string, string) {
|
|
@@ -193,12 +201,21 @@ func isHeaderRow(row *goquery.Selection) bool {
|
|
//log.Println("isHeaderRow", matchNum, hasAttributeKeyword, matchNum > 1 && hasAttributeKeyword)
|
|
//log.Println("isHeaderRow", matchNum, hasAttributeKeyword, matchNum > 1 && hasAttributeKeyword)
|
|
return matchNum > 1 && hasAttributeKeyword
|
|
return matchNum > 1 && hasAttributeKeyword
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
// tableBlockReg matches an opening <table ...> tag through the nearest
// following </table>, case-insensitively and across line breaks. It is compiled
// once at package scope so removeTables does not recompile it on every call.
var tableBlockReg = regexp.MustCompile(`(?i)<table[^>]*>[\s\S]*?</table>`)

// removeTables returns html with every <table>...</table> span deleted.
//
// Input without a complete table (no opening tag, or no closing tag after it)
// is returned unchanged.
//
// NOTE: the match is non-greedy, so it ends at the first </table> after an
// opening tag. For nested tables this removes the text up to the inner closing
// tag and leaves the remainder of the outer table (including its </table>) in
// the output.
func removeTables(html string) string {
	return tableBlockReg.ReplaceAllString(html, "")
}
|
|
|
|
|
|
|
|
// repalceString returns input with every occurrence of each key of replace
// substituted by that key's value. (The misspelt name is kept so existing
// callers continue to compile.)
//
// Substitutions are applied one key after another, so the output of an earlier
// substitution can be rewritten by a later key. Because Go's map iteration
// order is unspecified, the keys are first put into a fixed order — longest key
// first, equal-length keys in ascending byte order — so the same input and map
// always produce the same result, and a longer key is never pre-empted by a
// shorter key it contains.
//
// A nil or empty map returns input unchanged.
func repalceString(input string, replace map[string]string) string {
	if len(replace) == 0 {
		return input
	}

	keys := make([]string, 0, len(replace))
	for k := range replace {
		keys = append(keys, k)
	}

	// Insertion sort: replacement tables are tiny, and this avoids requiring
	// an additional package in the file's import list.
	for i := 1; i < len(keys); i++ {
		for j := i; j > 0; j-- {
			cur, prev := keys[j], keys[j-1]
			curFirst := len(cur) > len(prev) || (len(cur) == len(prev) && cur < prev)
			if !curFirst {
				break
			}
			keys[j], keys[j-1] = prev, cur
		}
	}

	for _, k := range keys {
		input = strings.ReplaceAll(input, k, replace[k])
	}
	return input
}
|
|
|
|
+
|
|
// cleanWebText 删除包含指定关键词及其后续的所有内容
|
|
// cleanWebText 删除包含指定关键词及其后续的所有内容
|
|
func cleanWebText(input string, keywords, keywordsback []string) string {
|
|
func cleanWebText(input string, keywords, keywordsback []string) string {
|
|
// 构建关键词正则表达式(使用OR连接)
|
|
// 构建关键词正则表达式(使用OR连接)
|
|
@@ -230,32 +247,49 @@ func convertNumerals(input string) string {
|
|
'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5',
|
|
'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5',
|
|
'Ⅵ': '6', 'Ⅶ': '7', 'Ⅷ': '8', 'Ⅸ': '9', 'Ⅹ': '1', // 仅处理个位
|
|
'Ⅵ': '6', 'Ⅶ': '7', 'Ⅷ': '8', 'Ⅸ': '9', 'Ⅹ': '1', // 仅处理个位
|
|
'Ⅺ': '1', 'Ⅻ': '1', // 罗马数字11和12仅处理十位
|
|
'Ⅺ': '1', 'Ⅻ': '1', // 罗马数字11和12仅处理十位
|
|
|
|
+
|
|
|
|
+ 'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5',
|
|
|
|
+ 'F': '6', 'G': '7', 'H': '8', 'J': '9', 'K': '1',
|
|
|
|
+
|
|
|
|
+ 'a': '1', 'b': '2', 'c': '3', 'd': '4', 'e': '5',
|
|
|
|
+ 'f': '6', 'g': '7', 'h': '8', 'j': '9', 'k': '1',
|
|
}
|
|
}
|
|
|
|
|
|
var result strings.Builder
|
|
var result strings.Builder
|
|
|
|
+ var result1 strings.Builder
|
|
for _, char := range input {
|
|
for _, char := range input {
|
|
- // 直接检查阿拉伯数字
|
|
|
|
|
|
+ // 检查阿拉伯数字
|
|
if char >= '0' && char <= '9' {
|
|
if char >= '0' && char <= '9' {
|
|
- result.WriteRune(char)
|
|
|
|
|
|
+ result1.WriteRune(char)
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
-
|
|
|
|
|
|
+ }
|
|
|
|
+ var result2 strings.Builder
|
|
|
|
+ for _, char := range input {
|
|
// 检查中文数字
|
|
// 检查中文数字
|
|
if num, exists := chineseNumMap[char]; exists {
|
|
if num, exists := chineseNumMap[char]; exists {
|
|
- result.WriteRune(num)
|
|
|
|
|
|
+ result2.WriteRune(num)
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
-
|
|
|
|
|
|
+ }
|
|
|
|
+ var result3 strings.Builder
|
|
|
|
+ for _, char := range input {
|
|
// 检查罗马数字
|
|
// 检查罗马数字
|
|
if num, exists := romanNumMap[char]; exists {
|
|
if num, exists := romanNumMap[char]; exists {
|
|
- result.WriteRune(num)
|
|
|
|
|
|
+ result3.WriteRune(num)
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
-
|
|
|
|
- // 非数字字符保持不变
|
|
|
|
- result.WriteRune(char)
|
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ if result1.Len() > result.Len() {
|
|
|
|
+ result = result1
|
|
|
|
+ }
|
|
|
|
+ if result2.Len() > result.Len() {
|
|
|
|
+ result = result2
|
|
|
|
+ }
|
|
|
|
+ if result3.Len() > result.Len() {
|
|
|
|
+ result = result3
|
|
|
|
+ }
|
|
return result.String()
|
|
return result.String()
|
|
}
|
|
}
|
|
|
|
|