Ver código fonte

分包优化

zhangjinkun 1 mês atrás
pai
commit
c572a3aca3
5 arquivos alterados com 1280 adições e 237 exclusões
  1. 25 6
      bidding/main.go
  2. 133 35
      bidding/package.go
  3. 12 1
      bidding/quotemode.go
  4. 426 0
      bidding/theadKey.go
  5. 684 195
      bidding/util.go

+ 25 - 6
bidding/main.go

@@ -10,6 +10,7 @@ import (
 var (
 	classifier       *MultiPackageClassifier
 	quote_classifier *QuoteClassifier
+	blog             = false
 )
 
 func init() {
@@ -20,7 +21,7 @@ func init() {
 
 func main() {
 	//packAgeDemo()
-	//quoteDemo()
+	// quoteDemo()
 	// 注册路由
 	http.HandleFunc("/classify", classifyHandler)
 	http.HandleFunc("/quote_classify", quoteClassifyHandler)
@@ -35,6 +36,17 @@ func quoteDemo() {
 	listContent := []string{
 		"下浮 20.22%",
 		"上浮动:百分之三十",
+		"报价费率%: 7.00",
+		"费率(%):92.30",
+		"投标报价(折扣报价):92.00%",
+		"报价折扣:82.80%。",
+		"中标价(费率或单价等):",
+		`中标价
+		75.00%`,
+		"应答报价(综合折扣):87.50%",
+		"报价折扣(不含税):98%。",
+		"费率2.69%",
+		"下浮率(%):5.0000000",
 	}
 	for k, content := range listContent {
 		// 执行分类判断
@@ -43,7 +55,7 @@ func quoteDemo() {
 		}
 		// 执行分类判断
 		modenum, _ := quote_classifier.QuoteMode(doc)
-		log.Println(modenum, k)
+		log.Println(k, modenum, content)
 	}
 }
 
@@ -140,15 +152,22 @@ func classifyHandler(w http.ResponseWriter, r *http.Request) {
 
 // 分类逻辑
 func classifyBid(data map[string]interface{}) (int, bool) {
-	content := fmt.Sprint(data["title"]) + "\n" + fmt.Sprint(data["detail"])
-	// 文本清理
+	content := fmt.Sprint(data["detail"])
+	//文本清理
+	replaceMap[fmt.Sprint(data["title"])] = ""
 	content = cleanWebText(content, clearKeys, clearKeysBack)
-	content_rmtable := removeTables(content)
+	content = repalceString(content, replaceMap)
 
+	content_1 := removeTables(fmt.Sprint(data["detail"]))
+	content_1 = cleanWebText(content_1, clearKeys, clearKeysBack)
+	content_1 = repalceString(content_1, replaceMap)
+	if len(content_1) < 100 { //清除表格后,如果文本太短,使用原文本
+		content_1 = content
+	}
 	// 执行分类判断
 	doc := BidDocument{
 		Content:         content,
-		Content_NoTable: content_rmtable,
+		Content_NoTable: content_1,
 		Budget:          content,
 		AwardNotice:     content,
 		BidderOptions:   content,

+ 133 - 35
bidding/package.go

@@ -40,7 +40,7 @@ func NewClassifier() *MultiPackageClassifier {
 	coreKeys := "包,分标,标段,子标段,标段(包),分标,分段招标,多标段,分标段,标段划分表" +
 		"包件,分包,包号,包划分,标包,多包,分项招标" +
 		"分包方案,包别,分段实施,独立投标,兼投,兼中,№.1标包,№.2标包,№.3标包"
-	packKeys := `(标段|子标段|标段\(包\)|包|包号|采购包|分标|包件|包组编号|标项)`
+	packKeys := `(子项|标段|子标段|标段\(包\)|包|包号|采购包|分标|包件|包组编号|标项|标包)`
 	packMatch := `(\s)?[\(\)一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+(\s)?`
 	packTeMatch := `(:|:|\s)[0-9A-Za-z]{1,}`
 	packNumMatch := `[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]`
@@ -54,6 +54,9 @@ func NewClassifier() *MultiPackageClassifier {
 			regexp.MustCompile(packKeys + packMatch),
 			regexp.MustCompile(packKeys + packTeMatch),
 			regexp.MustCompile(packMatch + packKeys),
+			regexp.MustCompile(`(第|工程|项目|施工)(.{0,3})` + packNumMatch + `(标|包|段)`),
+			regexp.MustCompile(`(项目|标)` + packNumMatch + `{1,}[::]+`),
+			regexp.MustCompile(`[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ](标|包|段)`),
 		},
 
 		//3、 排除条件正则(整体招标相关表述)
@@ -77,52 +80,79 @@ func NewClassifier() *MultiPackageClassifier {
 }
 
 // IsMultiPackage 主判断方法,返回是否多包项目及判断依据
+// 1多包,0不确定,-1无多包
 func (c *MultiPackageClassifier) IsMultiPackage(doc BidDocument) (int, map[string]interface{}) {
-	result := make(map[string]interface{})
-
+	result := map[string]interface{}{}
 	// 第一步:排除条件检查(具有一票否决权)
 	if c.hasExclusion(doc.Content) {
 		result["exclusion"] = "存在排除关键词"
 		//log.Println("存在排除关键词")
 		return -1, result
 	}
-
 	// 第二步:核心特征检查,匹配数量
-	//1、表格检查分包
-	isTablePack, tablePackNums := tableIsPackage(doc.Content)
+	regMatchResult := map[string]MatchResult{}
+	//1、表格检测分包
+	isTablePack := false                                //表格分包
+	tablePackNumRows, tablePackStrRows := 0, 0          //表格数据
+	_, isMegerTable, _ := CheckTableMerged(doc.Content) //检查是否存在表格、表格是否有合并
+	//log.Println("isMegerTable", isMegerTable)
+	if !isMegerTable {
+		_, tableData, _ := TableExtractDatas(doc.Content)
+		tableData = setRowsHeaderRote(tableData)
+		isTablePack, tablePackNumRows, tablePackStrRows = TableIsPackage(tableData)
+		if blog {
+			log.Printf("简单表格,是否分包%t,数值类包%d,字符类包%d", isTablePack, tablePackNumRows, tablePackStrRows)
+		}
+	} else { //复杂表格,正则提取文本数据
+		_, tableData, _ := TableExtractDatas(doc.Content)
+		tableData = setRowsHeaderRote(tableData)
+		isTablePack, tablePackNumRows, tablePackStrRows = TableIsPackage(tableData)
+		if blog {
+			log.Printf("复杂表格,是否分包%t,数值类包%d,字符类包%d", isTablePack, tablePackNumRows, tablePackStrRows)
+		}
+	}
 	//2、核心关键词数据检测
-	coreFeaturesNum := c.checkCoreFeatures(doc, result)
-	//3、分包编码检测
-	packCodeNum, packCodeKeys := c.getPackNumCode(c.regMatchPackagePatterns(doc, result))
+	coreFeaturesNum := c.checkCoreFeatures(doc, isMegerTable, result)
+	//3、提取分包编码数据
+	if tablePackNumRows+tablePackStrRows < 1 { //如果表格未匹配到任何数据,使用原文匹配
+		regMatchResult = c.regMatchPackagePatterns(doc.Content, regMatchResult)
+	} else { //使用清理表格后的数据匹配
+		regMatchResult = c.regMatchPackagePatterns(doc.Content_NoTable, regMatchResult)
+	}
+	//4、获取分包编码
+	packCodeNum, packCodeKeys := c.getPackNumCode(regMatchResult)
 
 	// 第三步:辅助特征检查
-	auxFeatures := c.checkAuxFeatures(doc, result)
+	auxFeatures := c.checkAuxFeatures(doc)
 
-	isPackage := 0 // 1多包,0不确定,-1无多包
-	// 决策逻辑
+	// 第四步:决策逻辑,输入出结果
+	isPackage := 0
 	if packCodeNum > 1 || isTablePack || (coreFeaturesNum >= 1 && auxFeatures >= 1) { //1多包  分包编码大于1  | 表格判定多包 | (核心特征大于等于1 && 1个辅助特征)
 		isPackage = 1
+	} else if tablePackNumRows == 1 && tablePackStrRows > 1 || tablePackNumRows > 1 && tablePackStrRows == 1 {
+		isPackage = 0
 	} else if coreFeaturesNum <= 2 && packCodeNum <= 1 && auxFeatures < 1 { //单包 核心特征<2 && 分包编码<2 && 无辅助特征
 		isPackage = -1
 	}
-	result["isPackage"] = isPackage
-	result["core_count"] = coreFeaturesNum
-	result["aux_count"] = auxFeatures
-	if 1 == 0 {
-		log.Printf("核心关键词:%d,分包编码数量:%d,表格分包:%t,表格数据量:%d,辅助词:%d \n", coreFeaturesNum, packCodeNum, isTablePack, tablePackNums, auxFeatures)
+	result = map[string]interface{}{
+		"isPackage":  isPackage,
+		"core_count": coreFeaturesNum,
+	}
+	if blog {
+		printStr := "核心关键词:%d,分包编码数量:%d,表格分包:%t,表格数据量(Num):%d,表格数据量(Str):%d,辅助词:%d \n"
+		log.Printf(printStr, coreFeaturesNum, packCodeNum, isTablePack, tablePackNumRows, tablePackStrRows, auxFeatures)
 		log.Println("分包编码", packCodeKeys, result)
 		//log.Println(doc.Content)
 	}
-	//log.Println(result)
 	return isPackage, result
 }
 
 // checkCoreFeatures 检查核心特征并返回符合数量
-func (c *MultiPackageClassifier) checkCoreFeatures(doc BidDocument, result map[string]interface{}) int {
+func (c *MultiPackageClassifier) checkCoreFeatures(doc BidDocument, isMegerTable bool, result map[string]interface{}) int {
 	count := 0
 	text := doc.Content_NoTable
 	// 特征1:存在标段相关关键词
-	if hasNestedTables(doc.Content) { //如果有表格嵌套,使用全文
+	if isMegerTable { //如果有表格嵌套,使用全文
 		text = doc.Content
 	}
 	exitstKeys := c.checkKeywords(text)
@@ -146,9 +176,9 @@ func (c *MultiPackageClassifier) checkCoreFeatures(doc BidDocument, result map[s
 }
 
 // checkAuxFeatures 检查辅助特征并返回符合数量
-func (c *MultiPackageClassifier) checkAuxFeatures(doc BidDocument, result map[string]interface{}) int {
+func (c *MultiPackageClassifier) checkAuxFeatures(doc BidDocument) int {
+	result := map[string]interface{}{}
 	count := 0
-
 	// 辅助特征1:存在分项预算
 	if matches := c.budgetPattern.FindAllString(doc.Budget, -1); len(matches) > 1 {
 		result["split_budget"] = matches
@@ -160,7 +190,9 @@ func (c *MultiPackageClassifier) checkAuxFeatures(doc BidDocument, result map[st
 		result["multiple_awards"] = matches
 		count++
 	}
-
+	if blog {
+		log.Println("辅助特征", count, result)
+	}
 	return count
 }
 
@@ -188,8 +220,10 @@ func (c *MultiPackageClassifier) checkPackageNumbers(text string) bool {
 
 // checkBidderOptions 检查投标人选项
 func (c *MultiPackageClassifier) checkBidderOptions(options string) bool {
-	return strings.Contains(options, "可选投") ||
+	return strings.Contains(
+		options, "可选投") ||
 		strings.Contains(options, "兼投") ||
+		strings.Contains(options, "兼中") ||
 		strings.Contains(options, "可投多个")
 }
 
@@ -199,21 +233,20 @@ func (c *MultiPackageClassifier) hasExclusion(text string) bool {
 }
 
 // regMatchPackagePatterns 匹配结果
-func (c *MultiPackageClassifier) regMatchPackagePatterns(doc BidDocument, result map[string]interface{}) map[string]MatchResult {
+func (c *MultiPackageClassifier) regMatchPackagePatterns(htmlContent string, results map[string]MatchResult) map[string]MatchResult {
 	// 查找所有匹配项的位置和内容
-	results := map[string]MatchResult{}
 	for _, reg := range c.packagePatterns {
 		//log.Println(reg)
-		matches := reg.FindAllStringSubmatchIndex(doc.Content, -1)
+		matches := reg.FindAllStringSubmatchIndex(htmlContent, -1)
 		// 遍历所有匹配项
 		for _, match := range matches {
-			full := doc.Content[match[0]:match[1]] // 提取完整匹配内容
+			full := htmlContent[match[0]:match[1]] // 提取完整匹配内容
 			groups := make([]string, 0)            // 提取分组内容
 			for i := 2; i < len(match); i += 2 {
 				start := match[i]
 				end := match[i+1]
 				if start >= 0 { // 分组可能不存在
-					groups = append(groups, doc.Content[start:end])
+					groups = append(groups, htmlContent[start:end])
 				}
 			}
 			// 保存结果
@@ -223,6 +256,9 @@ func (c *MultiPackageClassifier) regMatchPackagePatterns(doc BidDocument, result
 				Start:     match[0],
 				End:       match[1],
 			}
+			if blog {
+				log.Println(full)
+			}
 		}
 	}
 	return results
@@ -230,18 +266,80 @@ func (c *MultiPackageClassifier) regMatchPackagePatterns(doc BidDocument, result
 
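 // getPackNumCode buckets the matched package codes by keyword (包/段/标/项目/other) and returns a code count derived from the largest bucket, plus that bucket's keys.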
 func (c *MultiPackageClassifier) getPackNumCode(match map[string]MatchResult) (int, map[string]bool) {
-	keys := map[string]bool{}
+	keysPack := map[string]bool{}
+	keysDuan := map[string]bool{}
+	keysBiao := map[string]bool{}
+	keysItem := map[string]bool{}
+	keysOther1 := map[string]bool{}
+	keysOther2 := map[string]bool{}
+	result := map[string]bool{}
 	for k, _ := range match {
 		ss := c.packNumCode.FindAllString(k, -1)
 		key := ""
 		for _, v := range ss {
 			key = key + v
 		}
-		if len(key) > 0 {
-			keys[convertNumerals(key)] = true
+		if strings.Contains(k, "包") && convertNumerals(key) != "" {
+			//需要区分数字包OR字母包
+			keysPack[convertNumerals(key)] = true
+		} else if strings.Contains(k, "段") && convertNumerals(key) != "" {
+			//需要区分数字包OR字母包
+			keysDuan[convertNumerals(key)] = true
+		} else if strings.Contains(k, "标") && convertNumerals(key) != "" {
+			keysBiao[convertNumerals(key)] = true
+		} else if strings.Contains(k, "项目") && convertNumerals(key) != "" {
+			keysItem[convertNumerals(key)] = true
+		} else {
+			if convertNumerals(key) != "" {
+				keysOther1[convertNumerals(key)] = true
+			} else {
+				keysOther2[key] = true
+			}
+		}
+		if blog {
+			log.Println(k, key)
+		}
+	}
+	keysNum := len(keysPack)
+	result = keysPack
+	if keysNum < len(keysDuan) {
+		keysNum = len(keysDuan)
+		result = keysDuan
+	}
+	if keysNum < len(keysBiao) {
+		keysNum = len(keysBiao)
+		result = keysBiao
+	}
+	if keysNum < len(keysItem) {
+		keysNum = len(keysItem)
+		result = keysItem
+	}
+	if keysNum < len(keysOther1) {
+		keysNum = len(keysOther1)
+		result = keysOther1
+	}
+	if keysNum < len(keysOther2) {
+		keysNum = len(keysOther2)
+		result = keysOther2
+	}
+
+	//如果分包编码长度相差太大,进行分组,取分组最大个数
+	if len(result) > 1 {
+		packStr := []string{}
+		for key, _ := range result {
+			// log.Println("result", key)
+			packStr = append(packStr, key)
+		}
+		maxGroupNum := 0
+		groupPackStr := groupStrings(packStr)
+		for _, grop := range groupPackStr {
+			if len(grop) > maxGroupNum {
+				maxGroupNum = len(grop)
+			}
+		}
+		if maxGroupNum < keysNum {
+			keysNum = maxGroupNum
 		}
-		//log.Println(k, key, convertNumerals(key))
 	}
-	//log.Println("keys", keys)
-	return len(keys), keys
+	return keysNum, result
 }

+ 12 - 1
bidding/quotemode.go

@@ -30,12 +30,16 @@ func NewQuoteClassifier() *QuoteClassifier {
 		unitPricePattern: []*regexp.Regexp{},
 
 		//②费率正则模式
-		ratePattern: []*regexp.Regexp{},
+		ratePattern: []*regexp.Regexp{
+			regexp.MustCompile(`(费率|折扣|中标价)(.{0,10})%(.{0,10})([\d.]+)`),
+			regexp.MustCompile(`(费率|折扣|中标价)(.{0,10})[::\s]?(.{0,10})([\d.]+%)`),
+		},
 
 		//③上浮下浮率正则模式
 		floatingRatePattern: []*regexp.Regexp{
 			regexp.MustCompile(`([上下](浮|浮率|浮动)[::\s]?.{0,30}百分之)`),
 			regexp.MustCompile(`([上下](浮|浮率|浮动)[::\s]?([\d.]+%))`),
+			regexp.MustCompile(`([上下](浮|浮率|浮动)(.{0,3})%(.{0,3})[::\s]?([\d.]+))`),
 		},
 
 		//④整标报价正则模式
@@ -47,6 +51,13 @@ func NewQuoteClassifier() *QuoteClassifier {
 func (c *QuoteClassifier) QuoteMode(doc BidDocument) (int, map[string]interface{}) {
 	result := make(map[string]interface{})
 	quoteType := 0 //默认整标报价
+
+	for _, reg := range c.ratePattern {
+		if reg.MatchString(doc.Content) {
+			quoteType = 2
+		}
+	}
+
 	for _, reg := range c.floatingRatePattern {
 		if reg.MatchString(doc.Content) {
 			quoteType = 3

+ 426 - 0
bidding/theadKey.go

@@ -0,0 +1,426 @@
+// theadKey
+package main
+
+// 常见表头
+var theadWordsList_Item = []string{
+	"序号",
+	"编号",
+	"顺序",
+	"名称",
+	"项目地点",
+	"住址",
+	"地址",
+	"单位",
+	"备注信息",
+	"最高限价",
+	"成交金额",
+	"得分",
+	"服务期限",
+	"品牌",
+	"单价",
+	"数量",
+	"规格",
+	"型号",
+	"招标人",
+	"中标",
+	"供应商",
+	"成交人",
+	"中标人",
+	"候选人",
+	"联系人",
+	"联系电话",
+	"联系方式",
+	"招标内容",
+	"计量单位",
+	"服务要求",
+	"服务范围",
+	"服务标准",
+	"挂牌价格",
+	"招标时间",
+	"挂牌时间",
+	"交货时间",
+	"投标保证金",
+	"最终评审价(元)",
+	"最终得分",
+	"最终报价时间",
+	"最终报价(元)",
+	"最终报价(元)",
+	"最少供应量",
+	"最高限价(万元)",
+	"最高限价(万元)",
+	"最高投标限价(元)",
+	"最低限价",
+	"组织询问核实情况",
+	"总金额(含税)",
+	"总价(元)",
+	"总价(元)",
+	"总价(大写)",
+	"总价",
+	"总分",
+	"综合评估法",
+	"资质等级",
+	"资信业绩部分得分",
+	"资信情况",
+	"资信分",
+	"资信标得分(权重)",
+	"资审不通过具体原因",
+	"资格条件响应情况",
+	"资格审查结果",
+	"资格评审情况",
+	"资格能力条件",
+	"资格能力",
+	"资产标的基本信息",
+	"专用资格要求",
+	"专业",
+	"专家姓名",
+	"专家",
+	"中选人",
+	"中选份额(%)",
+	"中标折扣率(%)",
+	"中标人数量及份额",
+	"中标人候选人排序",
+	"中标金额(元/次)",
+	"中标金额(元)",
+	"中标金额(元)",
+	"中标金额(万元)",
+	"中标金额(万元)",
+	"中标价格(元元)/费率",
+	"中标价格(元)/费率",
+	"中标价格(元)",
+	"中标价格(人民币)",
+	"中标候选人排序",
+	"中标候选人名单",
+	"中标候选人名称",
+	"中标候选人单位名称",
+	"中标单价(含税)",
+	"中标/成交折扣率",
+	"中标、成交供应商",
+	"中标(成交金额)",
+	"中标(成交)折扣率",
+	"中标(成交)金额(元)",
+	"中标(成交)金额",
+	"中标(成交)供应商",
+	"中标(成交)公告",
+	"质保期",
+	"原因及依据",
+	"原因",
+	"元",
+	"预算总金额",
+	"预算金额(元)",
+	"预算金额(万元)",
+	"预算金额(含税,万元)",
+	"预算(费率)",
+	"预估站点数量(个)",
+	"业绩类型",
+	"业绩",
+	"要求的资格能力条件",
+	"要求",
+	"样品",
+	"验收金额(元)",
+	"验收结果",
+	"宣传形式",
+	"序",
+	"需求内容",
+	"需求描述",
+	"修正原因",
+	"姓名",
+	"性别",
+	"小计",
+	"项目总监",
+	"项目周期",
+	"项目需求概况",
+	"项目信息",
+	"项目投资代码(编号)",
+	"项目所在地",
+	"项目名称",
+	"项目类物资设备",
+	"项目类别",
+	"项目经理",
+	"项目及标段名称",
+	"项目管理机构人员",
+	"项目管理机构得分",
+	"项目概算(亿元)",
+	"项目概况",
+	"项目编号",
+	"项目报价(元)",
+	"项目班子人员信息",
+	"项目班子成员",
+	"项目(标段)编号",
+	"项目",
+	"响应总报价(元)",
+	"响应招标文件要求",
+	"响应招标文件",
+	"响应文件响应",
+	"响应人",
+	"响应情况",
+	"响应评审情况",
+	"响应报价(元/升)",
+	"响应报价(元)",
+	"响应报价(%)",
+	"详细评审得分情况",
+	"下浮率",
+	"物资描述",
+	"物资编码",
+	"物料描述(材料名称)",
+	"物料类别",
+	"无效投标原因",
+	"问题内容",
+	"万元",
+	"推荐情况",
+	"推荐排名",
+	"推荐的中标候选人",
+	"推荐的成交候选人",
+	"推荐承包入库单位",
+	"图号材质",
+	"投资项目代码",
+	"投资金额(人民币)",
+	"投诉电话",
+	"投标总价(元)",
+	"投标总价(元)",
+	"投标总报价(元)",
+	"投标总报价(元)",
+	"投标总报价(费率)",
+	"投标总报价",
+	"投标质量",
+	"投标下浮率",
+	"投标文件密封情况",
+	"投标文件递交时间",
+	"投标人资格条件",
+	"投标人资格其它要求",
+	"投标人名称",
+	"投标人",
+	"投标金额",
+	"投标价格(元)",
+	"投标价格(万元)",
+	"投标价格",
+	"投标价(元)",
+	"投标工期",
+	"投标单价(元/吨)",
+	"投标报价说明",
+	"投标报价金额",
+	"投标报价(元元)",
+	"投标报价(元/人)",
+	"投标报价(元)",
+	"投标报价(元)",
+	"投标报价(万元)",
+	"投标报价(万元)",
+	"投标报价(含税)",
+	"投标报价(费率)",
+	"投标报价(%)",
+	"投标报价",
+	"投标保证金(元)",
+	"投标保证金(元)",
+	"投标保证金(万元)",
+	"投标保证",
+	"套",
+	"说明",
+	"税率%",
+	"税率",
+	"收费金额(元)",
+	"实际报价(元)",
+	"时长/形式",
+	"生产厂家",
+	"审计项目全称",
+	"审计期间",
+	"审计类型",
+	"审查结果",
+	"设计费总报价(元)",
+	"设计费",
+	"设备状态",
+	"商务要求",
+	"商务项目",
+	"商务分",
+	"商务标得分(权重)",
+	"商品描述",
+	"商品价格",
+	"商品分类",
+	"商城展示价",
+	"入围商品价格",
+	"入围商品标项名称",
+	"入围情况",
+	"入围期限",
+	"入围金额",
+	"入围家数",
+	"入围代理商",
+	"日期",
+	"任职阶段",
+	"人员资质要求",
+	"人员类别",
+	"人",
+	"权重分",
+	"其它",
+	"其他",
+	"评委",
+	"评审小组职务",
+	"评审项目",
+	"评审日期",
+	"评审情况",
+	"评审内容",
+	"评审结论(√/×)",
+	"评审结果",
+	"评审汇总结果",
+	"评审合格标准",
+	"评审分数",
+	"评审步骤",
+	"评审标准",
+	"评审报价/评审得分",
+	"评审报价(万元)",
+	"评审报价",
+	"评审办法条款号",
+	"评价内容",
+	"评估金额(元)",
+	"评分项",
+	"评分汇总表",
+	"评分标准",
+	"评分",
+	"评标总分",
+	"评标委员会成员",
+	"评标情况资料",
+	"评标情况",
+	"评标明细",
+	"评标开始时间",
+	"评标结束时间",
+	"评标结果",
+	"评标价格(元)",
+	"评标价格",
+	"评标价(元)",
+	"评标价",
+	"品目号",
+	"品目分类",
+	"品目编号及品目名称",
+	"品目",
+	"品名",
+	"排序",
+	"排名价(元)",
+	"排名",
+	"年份",
+	"内容",
+	"控制金额(元)",
+	"控制价(元)",
+	"控制价(元)",
+	"控制单价(元)",
+	"开标异常情况说明",
+	"开标异常记录",
+	"开标时间",
+	"开标日期",
+	"开标记录表",
+	"开标地点",
+	"开标备注",
+	"竞价类型",
+	"金额(元)",
+	"金额(元)",
+	"金额",
+	"价格分",
+	"价格",
+	"技术指标",
+	"技术要求",
+	"技术评分",
+	"技术分",
+	"技术方案编号",
+	"技术参数",
+	"技术标平均分(权重)",
+	"技术标平均分",
+	"技术标得分",
+	"计划投资(万元)",
+	"货物类",
+	"合同总价",
+	"合同金额(万元)",
+	"合同金额(万元)",
+	"合同金额",
+	"合同估算价(万元)",
+	"合同估算价",
+	"合同公示信息",
+	"合同包",
+	"合计金额",
+	"合计",
+	"含税预算(万元)",
+	"含税响应总价(元)",
+	"含税报价金额(元)",
+	"关联范围内容",
+	"各项费用合计",
+	"各评委评分平均值",
+	"各评委评分合计",
+	"份额占比",
+	"份额及占比",
+	"份额比例",
+	"份额",
+	"分值",
+	"分数",
+	"分类",
+	"分标标号",
+	"废标原因及依据",
+	"废标原因",
+	"废标依据",
+	"发包人",
+	"电话",
+	"地市(建设地点)",
+	"地区",
+	"单价(元/KG)",
+	"成交总金额(元)",
+	"成交年租金(元/年)",
+	"成交内容",
+	"成交金额/人民币",
+	"成交金额(元)",
+	"成交金额(万元)",
+	"成交价格(元/年)",
+	"成交价格(元):",
+	"成交价格",
+	"成交价不含税(元)",
+	"成交价(元含税)",
+	"成交价(万元)",
+	"成交价(含税)",
+	"成交价",
+	"成交候选人排",
+	"成交候选人名称",
+	"成交候选排序",
+	"成交服务商",
+	"成交份额",
+	"成交报价(元/年)",
+	"成交报价",
+	"车型",
+	"厂家(品牌)要求",
+	"参数要求",
+	"参数",
+	"采购状态",
+	"采购预算价",
+	"采购预算(人民币)",
+	"采购预算",
+	"采购文件费(元)",
+	"采购数量(kg)",
+	"采购时间",
+	"采购人",
+	"采购企业",
+	"采购内容",
+	"采购类型",
+	"采购计划信息",
+	"采购计划文号信息",
+	"采购计划金额",
+	"采购工厂",
+	"采购代理机构",
+	"采购包",
+	"标候选人",
+	"标段名称",
+	"标段金额(万元)",
+	"标包金额(万元)",
+	"编列内容",
+	"比例",
+	"备注",
+	"报价人",
+	"报价金额",
+	"报价分",
+	"报价(元)",
+	"报价(元)",
+	"报价(万元)",
+	"报价",
+	"保证金形式",
+	"包最高限价(元)",
+	"包组",
+	"包预算(元)",
+	"包名",
+	"包件数量",
+	"包件售价(元)",
+	"包件号",
+	"包件/标段号",
+	"包件(标段)编号",
+	"包件",
+}

+ 684 - 195
bidding/util.go

@@ -5,207 +5,355 @@ import (
 	"fmt"
 	"log"
 	"regexp"
+	"sort"
+	"strconv"
 	"strings"
-	"time"
+	"unicode"
 
-	"github.com/PuerkitoBio/goquery"
 	"golang.org/x/net/html"
 )
 
 var (
-	theadWords = "(标段|标包|标包号|包号|包段|子包号|子标段名称|子项|包件号|包件代码|包件编号|分包编码|分包名称|分标编号|分标编码|合同段|包件名称|标包名称|" +
-		"中标单位|中标人|中商人|成交人|成交人名称|供应商|供应商名称|项目名称|项目地址|标的|标的名称|标项名称|采购合同|" +
-		"成交价格|中标报价|简要规格描述|预算金额|最高限价|中标、成交供应商|中标、成交金额|中标、成交结果情况说明)"
+	//常见表头
+	theadWordsList    = append(theadWordsList_Item, theadPackWordsList...)
+	theadWordsListCom = []string{
+		"包段", "标段", "标包", "名称", "包号", "包段", "子包号", "子项", "中标人",
+		"包件号", "包件代码", "包件编号", "分包编码", "分标编号", "分标编码", "合同段",
+		"标的", "标项", "采购合同", "包件编号", "项目编号", "评价机构",
+		"地点", "日期", "单位", "是否"}
+	theadWordsListComReg = regexp.MustCompile("(" + strings.Join(theadWordsListCom, "|") + ")(?:[^0-9a-zA-Z]|$)")
+
+	//分包必含表头
+	theadPackWordsList = []string{
+		"包段", "标段", "标段名称", "标包", "标包名称", "标包号", "包号", "包段", "子包号", "子标段名称", "子项", "子项名称",
+		"包件号", "包件代码", "包件编号", "分包编码", "分包名称", "分标编号", "分标编码", "合同段", "包件名称",
+		"标的", "标的名称", "标项", "标项名称", "采购合同", "标段(包)名称",
+		"项目/包件编号", "项目编号", "服务名称", "项目名称"}
+	theadPackWords    = "(" + strings.Join(theadPackWordsList, "|") + ")(?:[^0-9a-zA-Z]|$)"
+	theadPackWordsReg = regexp.MustCompile(theadPackWords)
+
+	//候选人表头
 	theadWords_order    = "(包件号|标的|标段|候选人|供应商)"
-	theadWordsReg       = regexp.MustCompile(theadWords)
 	theadWordsReg_order = regexp.MustCompile(theadWords_order)
-
+	//删除干扰数据
 	delRowKeys    = "未成交|未中标原因"
 	delRowKeysReg = regexp.MustCompile(delRowKeys)
-
 	//负向表头,用于剔除干扰表格
 	reverseTheadKeys = map[string][]string{
-		"bidlist": []string{"品牌", "规格型号", "数量", "单价", "报价得分", "总分"},
+		"bidlist": []string{"品牌", "规格型号", "数量", "单价", "报价得分", "总分", "专家"},
 		//"spotcheck": []string{"项目名称", "抽取家数"},
 	}
-	
-	//联合体投标判断
-	consortium        = "(联合体牵头人|联合体成员[:: ].{5,30}(公司|院|大学|研究所))|(中标单位[:: ].{5,60}(联合体))"
-	consortiumKeysReg = regexp.MustCompile(consortium)
-
 	//干扰内容清理
 	clearKeys = []string{"承包(一|二|三|四)级", "开标(\\d)(室|厅)", "\\d+\\.\\d+", "\\d+(.{1,10}).(pdf|doc|zip|rar)",
-		"([一二三四五六七八九十0-9]+次)"}
-	clearKeysBack = []string{"上一篇", "下一篇", "历史业绩", "候选人企业业绩", "候选人类似业绩", "成交候选人业绩", "企业类似项目业绩",
-		"投标人业绩", "企业业绩", "投标文件中载明的业绩情况", "质量标准:", "评审专家"}
-
+		"([一二三四五六七八九十0-9]+次)", "\\d+g.{0,20}包", "\\d+包.{0,20}(纸|箱)", "标段\\d+/标包\\d+", "\\d+年",
+		"(\\d{1,2})-(\\d{1,2})(段|包|标)", "[一二三四五六七八九十]、[一二三四五六七八九十](段|包|标)",
+	}
+	clearKeysBack = []string{"上一篇", "下一篇", "历史业绩",
+		"候选人业绩", "候选人企业业绩", "候选人类似业绩", "企业类似项目业绩",
+		"投标业绩", "投标人业绩", "企业业绩", "工程业绩", "设计单位业绩", "施工单位业绩",
+		"单位业绩情况", "投标文件中载明的业绩情况", "质量标准:"}
 	//干扰内容替换
 	replaceMap = map[string]string{
-		"标项目": "标",
+		"服务项目": "",
+		"标项目":  "标",
+		"总承包":  "",
+		"三安小区": "",
+		"I":    "Ⅰ",
+		"—":    "",
 	}
-)
 
-func getIdFromDate(startStr, endStr string) (string, string) {
-	start, _ := time.Parse("2006-01-02", startStr)
-	end, _ := time.Parse("2006-01-02", endStr)
-	// 昨天凌晨0点时间戳
-	hexTimestamp1 := fmt.Sprintf("%X", start.Unix()) + "0000000000000000"
-	// 今天凌晨0点时间戳
-	hexTimestamp2 := fmt.Sprintf("%X", end.Unix()) + "0000000000000000"
-	return hexTimestamp1, hexTimestamp2
-}
+	//联合体投标判断
+	consortium        = "(联合体牵头人|联合体成员[:: ].{5,30}(公司|院|大学|研究所))|(中标单位[:: ].{5,60}(联合体))"
+	consortiumKeysReg = regexp.MustCompile(consortium)
+)
 
-// 判断是否有嵌套表格
-func tableIsPackage(htmlContent string) (bool, int) {
-	//判断是否有多层表格嵌套
-	if hasNestedTables(htmlContent) {
-		//log.Println("表格嵌套")
-		return false, 0
-	}
+// 判断是否有分包数据
+func TableIsPackage(tables *AllTablesData) (bool, int, int) {
 	ispack := false
-	tablesMixRows := 0
-	tablesData := getPackAgeByTable(htmlContent)
-	for _, dataRows := range tablesData {
-		// for k, v := range dataRows {
-		// 	log.Println(i, k, v)
-		// }
-		if len(dataRows) > 2 {
-			ispack = true
+	tablesNumRows, tablesStrRows := 0, 0
+	allCellVal := map[int]map[int][]string{}
+	for kt, tv := range tables.Tables {
+		allCellVal[kt] = getPackAllCellVal_v1(tv)
+		//log.Println("allCellVal sss", kt, len(allCellVal), allCellVal)
+		if len(allCellVal[kt]) < 1 {
+			allCellVal[kt] = getPackAllCellVal_v2(tv)
+			//log.Println("allCellVal", allCellVal)
+		}
+		for _, cellsVal := range allCellVal {
+			// log.Println("cellsVal", cellsVal)
+			for _, cells := range cellsVal {
+				numKey := map[string]string{}
+				strKey := map[string]string{}
+			L:
+				for _, cellVal := range cells {
+					for _, word := range theadWordsList { //过滤 theadWordsList 中的词
+						if strings.EqualFold(word, cellVal) {
+							break L
+						}
+					}
+					if val, err := strconv.Atoi(cellVal); err == nil {
+						numKey[fmt.Sprint(val)] = cellVal
+					} else {
+						if len(cellVal) > 0 {
+							strKey[cellVal] = cellVal
+						}
+					}
+				}
+				if tablesNumRows < len(numKey) {
+					tablesNumRows = len(numKey)
+				}
+				if tablesStrRows < len(strKey) {
+					tablesStrRows = len(strKey)
+				}
+				if blog {
+					log.Println(kt, "numKey", numKey)
+					log.Println(kt, "strKey", strKey)
+				}
+			}
 		}
-		if tablesMixRows < len(dataRows) {
-			tablesMixRows = len(dataRows)
+		if tablesStrRows > 1 && tablesNumRows > 1 ||
+			tablesStrRows < 1 && tablesNumRows > 1 ||
+			tablesStrRows > 1 && tablesNumRows < 1 {
+			ispack = true
 		}
 	}
-	//log.Println(ispack, tablesMixRows)
-	return ispack, tablesMixRows
+	return ispack, tablesNumRows, tablesStrRows
 }
 
-// 提取疑似表格分包数据
-func getPackAgeByTable(htmlContent string) map[string][]map[string]string {
-	// 解析HTML文档
-	doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
-	if err != nil {
-		log.Println(err)
-	}
-	// 遍历所有表格
-	tableDataRows := map[string][]map[string]string{}
-	doc.Find("table").Each(func(i int, table *goquery.Selection) {
-		var headers []string
-		var rows []map[string]string
-		// 遍历表格行
-		table.Find("tr").Each(func(rowIdx int, row *goquery.Selection) {
-			// 自动识别表头行(根据单元格内容特征)
-			if isHeaderRow(row) && len(headers) < 1 {
-				isDelHeader := false
-				tmphead := []string{}
-
-				bidTheadKeysIndexNum := map[string]int{} //记录满足剔除的表头的阈值
-				row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) {
-					text := strings.TrimSpace(cell.Text())
-					tmphead = append(tmphead, text)
-					if delRowKeysReg.MatchString(text) {
-						isDelHeader = true
+// row.HeaderRote > 50,提取分包特征值
+func getPackAllCellVal_v1(rows TableData) map[int][]string {
+	//如果是标的物、评分、抽查列表,放弃解析
+	bidTheadNum := 0
+	if len(rows.Rows) > 0 {
+		for _, theadKeys := range reverseTheadKeys {
+			for _, v := range theadKeys {
+				for _, cell := range rows.Rows[0].Row {
+					if strings.EqualFold(v, cell.Text) {
+						bidTheadNum++
 					}
-					//如果是标的物、评分、抽查列表,剔除
-					for k, theadKeys := range reverseTheadKeys {
-						for _, v := range theadKeys {
-							if strings.Contains(text, v) {
-								bidTheadKeysIndexNum[k]++
-							}
-						}
-						if bidTheadKeysIndexNum[k] > 1 { //满足一个以上的表头删除
-							isDelHeader = true
-						}
+				}
+			}
+		}
+	}
+	if bidTheadNum > 1 {
+		if blog {
+			log.Println("标的物、评分、抽查列表,放弃解析")
+		}
+		return nil
+	}
+	cellIndex_keyVals := map[int][]string{}
+	kcell := []int{}
+	startAdd := false //开始取数标识
+	startRows := 0    //开始取数据行
+L:
+	for kr, row := range rows.Rows {
+		cellOk := 0 //如果单元格数据有效值不足3项,跳过
+		for _, cell := range row.Row {
+			if len(cell.Text) > 0 {
+				cellOk++
+			}
+		}
+		if cellOk < 2 {
+			continue
+		}
+		if startAdd { //开始提取数据,并非从第二行开始取数据
+			for i, k := range kcell {
+				if row.HeaderRote < 100 {
+					if startRows == 0 {
+						startRows = i
+					}
+					cellIndex_keyVals[k] = append(cellIndex_keyVals[k], row.Row[k].Text)
+				}
+				//如果已有数据,再次碰到行表头行放弃数据
+				if startRows > 0 && row.HeaderRote > 0 {
+					if blog {
+						log.Println("中断", row.HeaderRote, row)
+					}
+					break L
+				}
+			}
+		}
+		if blog {
+			log.Println("整行是表头v1 row", startAdd, cellOk, bidTheadNum, kr, row.HeaderRote, row.Row)
+		}
+		//首次获取行表头中 分包索引号
+		if !startAdd && row.HeaderRote > 50 {
+			for i, cell := range row.Row {
+				for _, word := range theadPackWordsList {
+					if strings.EqualFold(word, cell.Text) {
+						// log.Println("word", i, word, strings.EqualFold(word, cell.Text))
+						kcell = append(kcell, i)
+						startAdd = true
 					}
-
-				})
-				//log.Println("tmphead th", tmphead)
-				if len(tmphead) < 1 {
-					row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
-						text := strings.TrimSpace(cell.Text())
-						tmphead = append(tmphead, text)
-						if delRowKeysReg.MatchString(text) {
-							isDelHeader = true
-						}
-						//如果是标的物、评分、抽查列表,剔除
-						for k, theadKeys := range reverseTheadKeys {
-							for _, v := range theadKeys {
-								if strings.Contains(text, v) {
-									bidTheadKeysIndexNum[k]++
-								}
-							}
-							if bidTheadKeysIndexNum[k] > 1 {
-								isDelHeader = true
-							}
-						}
-					})
 				}
-				//log.Println("tmphead td", tmphead)
-				if !isDelHeader {
-					headers = append(headers, tmphead...)
+			}
+		}
+	}
+	return cellIndex_keyVals
+}
+
+// row.HeaderRote <= 50,提取分包特征值
+func getPackAllCellVal_v2(rows TableData) map[int][]string {
+	//如果是标的物、评分、抽查列表,放弃解析
+	bidTheadNum := 0
+	if len(rows.Rows) > 0 {
+		for _, theadKeys := range reverseTheadKeys {
+			for _, v := range theadKeys {
+				for _, cell := range rows.Rows[0].Row {
+					if strings.EqualFold(v, cell.Text) {
+						bidTheadNum++
+					}
 				}
-				//log.Println("headers", headers)
 			}
-			// 处理数据行
-			if len(headers) > 0 {
-				isDelRows := false //是否需要屏蔽词
-				rowData := make(map[string]string)
-				row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
-					if cellIdx < len(headers) {
-						header := headers[cellIdx]
-						text := strings.TrimSpace(cell.Text())
-						rowData[header] = text
-						if delRowKeysReg.MatchString(text) {
-							isDelRows = true
+		}
+	}
+	if bidTheadNum > 1 {
+		return nil
+	}
+	cellIndex_keyVals := map[int][]string{}
+L:
+	for _, row := range rows.Rows {
+		cellOk := 0 //如果单元格数据有效值不足3项,跳过
+		for _, cell := range row.Row {
+			if len(cell.Text) > 0 {
+				cellOk++
+			}
+		}
+		if cellOk < 3 {
+			continue
+		}
+		if row.HeaderRote <= 50 {
+			for i, cell := range row.Row {
+				for _, word := range theadPackWordsList {
+					if strings.EqualFold(word, cell.Text) {
+						if len(row.Row) > i+1 {
+							cellIndex_keyVals[0] = append(cellIndex_keyVals[0], row.Row[i+1].Text)
+							break L
+							//log.Println("ssss", word, row.Row[i+1].Text)
 						}
 					}
-				})
-				//log.Println(isDelRows, rowData)
-				if !isDelRows {
-					rows = append(rows, rowData)
 				}
 			}
-		})
-		tableDataRows[fmt.Sprint(i)] = rows
-	})
-	return tableDataRows
+		}
+	}
+	return cellIndex_keyVals
 }
 
-// 自定义表头判断逻辑(根据单元格内容特征)
-func isHeaderRow(row *goquery.Selection) bool {
-	// 判断条件示例 包含 theadWords 特定关键词
-	hasAttributeKeyword := false
-	matchNum := 0
-	row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
-		text := strings.TrimSpace(cell.Text())
-		if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 {
-			matchNum++
-			hasAttributeKeyword = true
-		} else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 {
-			matchNum++
-			hasAttributeKeyword = true
-		}
-		//log.Println(text, matchNum, hasAttributeKeyword)
-	})
-	row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) {
-		text := strings.TrimSpace(cell.Text())
-		if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 {
-			matchNum++
-			hasAttributeKeyword = true
-		} else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 {
-			matchNum++
-			hasAttributeKeyword = true
-		}
-		//log.Println(text, matchNum, hasAttributeKeyword)
-	})
-	//log.Println("isHeaderRow", matchNum, hasAttributeKeyword, matchNum > 1 && hasAttributeKeyword)
-	return matchNum > 1 && hasAttributeKeyword
+func setRowsHeaderRote(tables *AllTablesData) *AllTablesData {
+	//判断表头模式
+	for k, table := range tables.Tables {
+		for i, row := range table.Rows {
+			rowLen := len(row.Row)
+			rowHeardNum := 0
+			for _, cell := range row.Row {
+				if cell.IsHeader {
+					rowHeardNum++
+				}
+			}
+			if rowLen == rowHeardNum || rowHeardNum > rowLen/2 {
+				row.HeaderRote = 100
+			} else if rowLen%2 == 0 && rowHeardNum == rowLen/2 {
+				row.HeaderRote = 50
+			} else if rowHeardNum > 0 { //有表头个数不定
+				row.HeaderRote = 1
+				// log.Println("row.HeaderRote", row.HeaderRote, row)
+			} else {
+				row.HeaderRote = 0
+			}
+			table.Rows[i] = row
+			// if blog {
+			// 	log.Println("setRowsHeaderRote", row.HeaderRote, row.Row)
+			// }
+		}
+		tables.Tables[k] = table
+	}
+	return tables
 }
 
 // 匹配<table>标签及其内容的正则表达式
 func removeTables(html string) string {
 	re := regexp.MustCompile(`(?i)<table[^>]*>[\s\S]*?</table>`)
-	return re.ReplaceAllString(html, "")
+	html = re.ReplaceAllString(html, "")
+
+	// re = regexp.MustCompile(`<[^>]*>`)
+	// html = re.ReplaceAllString(html, "")
+	return html
+}
+
+// CheckTableMerged parses htmlContent and reports whether it contains any
+// <table> element (hasTable) and whether at least one of those tables has a
+// cell carrying a merge attribute (hasMerged). If parsing fails it returns
+// (false, false, err).
+func CheckTableMerged(htmlContent string) (hasTable bool, hasMerged bool, err error) {
+	root, parseErr := html.Parse(strings.NewReader(htmlContent))
+	if parseErr != nil {
+		return false, false, parseErr
+	}
+
+	// Collect every table in the document, nested ones included.
+	found := findTables(root)
+	if len(found) == 0 {
+		return false, false, nil
+	}
+
+	// A single table with a merged cell is enough; stop at the first hit.
+	for i := range found {
+		if checkTableForMergedCells(found[i]) {
+			return true, true, nil
+		}
+	}
+	return true, false, nil
+}
+// findTables returns every <table> element found at or below n, in document
+// (pre-order) order. Tables nested inside other tables are included as well.
+func findTables(n *html.Node) []*html.Node {
+	var found []*html.Node
+	var walk func(node *html.Node)
+	walk = func(node *html.Node) {
+		if node.Type == html.ElementNode && node.Data == "table" {
+			found = append(found, node)
+		}
+		// Keep descending even below a table so nested tables are collected too.
+		for child := node.FirstChild; child != nil; child = child.NextSibling {
+			walk(child)
+		}
+	}
+	walk(n)
+	return found
+}
+// checkTableForMergedCells reports whether any <td>/<th> belonging to table
+// has a merge attribute. Subtrees rooted at a nested <table> are not entered.
+func checkTableForMergedCells(table *html.Node) bool {
+	var visit func(node *html.Node) bool
+	visit = func(node *html.Node) bool {
+		if node.Type == html.ElementNode {
+			switch node.Data {
+			case "table":
+				// A table other than the one being inspected is a nested table: skip it.
+				if node != table {
+					return false
+				}
+			case "td", "th":
+				if hasMergeAttribute(node) {
+					return true
+				}
+			}
+		}
+		// Depth-first over the children, stopping at the first merged cell.
+		for child := node.FirstChild; child != nil; child = child.NextSibling {
+			if visit(child) {
+				return true
+			}
+		}
+		return false
+	}
+	return visit(table)
+}
+// hasMergeAttribute reports whether cell carries a rowspan or colspan
+// attribute that marks it as a merged cell.
+//
+// The attribute value is trimmed and parsed as an integer. A parsed value of
+// exactly 1 is the unmerged default; any other parsed value counts as merged.
+// A value that is not an integer at all (including an empty one) is still
+// treated as merged, which keeps the previous conservative behaviour.
+//
+// Previously a value such as "01", "+1" or " 1 " was reported as merged
+// because the check fell back to comparing the raw text with "1".
+func hasMergeAttribute(cell *html.Node) bool {
+	for _, attr := range cell.Attr {
+		if attr.Key != "rowspan" && attr.Key != "colspan" {
+			continue
+		}
+		span, err := strconv.Atoi(strings.TrimSpace(attr.Val))
+		if err != nil {
+			// Not an integer: keep treating the cell as merged.
+			return true
+		}
+		if span != 1 {
+			return true
+		}
+	}
+	return false
+}
 
 // 替换文本数据
@@ -226,12 +374,14 @@ func cleanWebText(input string, keywords, keywordsback []string) string {
 	}
 	input = re.ReplaceAllString(input, "")
 
-	keyword := strings.Join(keywords, "|")
-	re, err = regexp.Compile(keyword)
-	if err != nil {
-		return input
+	for _, v := range keywords {
+		re, err = regexp.Compile(v)
+		if err != nil {
+			continue
+		}
+		input = re.ReplaceAllString(input, "")
 	}
-	return re.ReplaceAllString(input, "")
+	return input
 }
 
 // 支持中文数字(零一二三四五六七八九十)、阿拉伯数字(0-9)、罗马数字(Ⅰ-Ⅻ)
@@ -248,13 +398,12 @@ func convertNumerals(input string) string {
 		'Ⅵ': '6', 'Ⅶ': '7', 'Ⅷ': '8', 'Ⅸ': '9', 'Ⅹ': '1', // 仅处理个位
 		'Ⅺ': '1', 'Ⅻ': '1', // 罗马数字11和12仅处理十位
 
-		'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5',
-		'F': '6', 'G': '7', 'H': '8', 'J': '9', 'K': '1',
+		// 'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5',
+		// 'F': '6', 'G': '7', 'H': '8', 'J': '9', 'K': '1',
 
-		'a': '1', 'b': '2', 'c': '3', 'd': '4', 'e': '5',
-		'f': '6', 'g': '7', 'h': '8', 'j': '9', 'k': '1',
+		// 'a': '1', 'b': '2', 'c': '3', 'd': '4', 'e': '5',
+		// 'f': '6', 'g': '7', 'h': '8', 'j': '9', 'k': '1',
 	}
-
 	var result strings.Builder
 	var result1 strings.Builder
 	for _, char := range input {
@@ -293,33 +442,6 @@ func convertNumerals(input string) string {
 	return result.String()
 }
 
-// 检查HTML文本中是否存在多层表格嵌套
-func hasNestedTables(htmlContent string) bool {
-	doc, err := html.Parse(strings.NewReader(htmlContent))
-	if err != nil {
-		return false
-	}
-
-	var hasNested bool
-	var checkNested func(node *html.Node, depth int)
-	checkNested = func(node *html.Node, depth int) {
-		if node.Type == html.ElementNode && node.Data == "table" {
-			if depth > 0 { // 非顶层表格
-				hasNested = true
-				return
-			}
-			depth++
-		}
-
-		for c := node.FirstChild; c != nil && !hasNested; c = c.NextSibling {
-			checkNested(c, depth)
-		}
-	}
-
-	checkNested(doc, 0)
-	return hasNested
-}
-
 // Unicode判断工具函数
 func isChineseRune(r rune) bool {
 	// 基础汉字检测
@@ -357,6 +479,373 @@ func isChineseRune(r rune) bool {
 	return false
 }
 
+// CellData holds the data of a single table cell.
+type CellData struct {
+	Text     string `json:"text"`     // text content of the cell
+	IsHeader bool   `json:"isHeader"` // whether the cell is a header cell
+}
+
+// RowData holds the data of a single table row.
+type RowData struct {
+	Row        []CellData `json:"row"`      // cells of the row
+	HeaderRote int        `json:"isHeader"` // header weight: 100, 50 or 0. NOTE(review): the JSON key "isHeader" is the same key CellData.IsHeader uses — confirm it is intended.
+}
+
+// TableData holds the row/column data of one table.
+type TableData struct {
+	Rows        []RowData `json:"rows"`        // rows of the table
+	NestedLevel int       `json:"nestedLevel"` // nesting level of the table; 0 means a top-level table
+	ChildTables []int     `json:"childTables"` // indexes of the child tables
+	ParentIndex int       `json:"parentIndex"` // index of the parent table; -1 means there is no parent
+	HasMerged   bool      `json:"hasMerged"`   // whether the table contains merged cells
+}
+
+// AllTablesData holds the data of every table found in a document.
+type AllTablesData struct {
+	Tables []TableData `json:"tables"`
+}
+
+// TableExtractDatas parses the tables in htmlStr and returns the Markdown of all tables (each followed by a blank line), the row/column data of all tables, and the html.Parse error, if any.
+func TableExtractDatas(htmlStr string) (string, *AllTablesData, error) {
+	doc, err := html.Parse(strings.NewReader(htmlStr))
+	if err != nil {
+		return "", nil, err
+	}
+
+	allTablesData := &AllTablesData{}
+	var markdownBuilder strings.Builder
+	tableIndex := 0
+
+	// Walk the tree recursively; level is the table nesting depth and parentIdx the index of the enclosing table (-1 if none).
+	var parseNode func(*html.Node, int, int)
+	parseNode = func(n *html.Node, level int, parentIdx int) {
+		if n.Type == html.ElementNode && n.Data == "table" {
+			// Remember the parent index and the index this table will occupy in allTablesData.Tables.
+			currentParent := parentIdx
+			currentIndex := tableIndex
+
+			// Parse the current table.
+			tableMarkdown, tableData := parseTable(n, level, currentIndex, currentParent)
+			tableData.NestedLevel = level
+			tableData.ParentIndex = currentParent
+
+			// Append to the result set.
+			allTablesData.Tables = append(allTablesData.Tables, tableData)
+			markdownBuilder.WriteString(tableMarkdown)
+			markdownBuilder.WriteString("\n\n")
+
+			// Register this table in its parent's list of child tables.
+			if currentParent != -1 {
+				parentTable := &allTablesData.Tables[currentParent]
+				parentTable.ChildTables = append(parentTable.ChildTables, currentIndex)
+			}
+
+			// Advance the table index.
+			tableIndex++
+
+			// Recurse into the children, with this table as the new parent and the level increased by one.
+			for c := n.FirstChild; c != nil; c = c.NextSibling {
+				parseNode(c, level+1, currentIndex)
+			}
+			return
+		}
+
+		// Recurse into non-table nodes without changing level or parent.
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			parseNode(c, level, parentIdx)
+		}
+	}
+
+	// Start parsing from the document root.
+	parseNode(doc, 0, -1)
+	return markdownBuilder.String(), allTablesData, nil
+}
+
+// getSpanTable returns the cell's rowspan and colspan; each defaults to 1 and is only overridden by an attribute value that parses as an integer greater than 1.
+func getSpanTable(cell *html.Node) (int, int) {
+	rowspan, colspan := 1, 1
+	for _, attr := range cell.Attr {
+		switch attr.Key {
+		case "rowspan":
+			if val, err := strconv.Atoi(attr.Val); err == nil && val > 1 {
+				rowspan = val
+			}
+		case "colspan":
+			if val, err := strconv.Atoi(attr.Val); err == nil && val > 1 {
+				colspan = val
+			}
+		}
+	}
+	return rowspan, colspan
+}
+
+// parseTable builds the cell grid of a single table and returns its Markdown rendering together with the TableData; level is only passed to the Markdown generator and currentIndex is not used in the body.
+func parseTable(tableNode *html.Node, level int, currentIndex int, parentIndex int) (string, TableData) {
+	var tableData TableData
+	tableData.ParentIndex = parentIndex
+	tableData.HasMerged = false
+
+	// Build the table structure as a grid of rows.
+	var grid []RowData
+	rows := getTableRows(tableNode)
+
+	// Process the table rows.
+	for rowIdx, row := range rows {
+		// Extend the grid up to the current row.
+		if rowIdx >= len(grid) {
+			grid = append(grid, RowData{})
+		}
+
+		// Skip leading positions already filled by merged cells. NOTE(review): this runs once per row before the first cell and only treats a non-empty Text as occupied, so filled positions further along the row are not skipped and may be overwritten.
+		col := 0
+		for col < len(grid[rowIdx].Row) && grid[rowIdx].Row[col].Text != "" {
+			col++
+		}
+
+		cells := getRowCells(row)
+		for _, cell := range cells {
+			// Read the cell's rowspan/colspan.
+			rowspan, colspan := getSpanTable(cell)
+			if rowspan > 1 || colspan > 1 {
+				tableData.HasMerged = true
+			}
+
+			// Make sure the grid has enough rows.
+			for len(grid) < rowIdx+rowspan {
+				grid = append(grid, RowData{})
+			}
+
+			// Make sure every affected row has enough columns.
+			targetCol := col + colspan
+			for r := rowIdx; r < rowIdx+rowspan; r++ {
+				if len(grid[r].Row) < targetCol {
+					// Grow the row, keeping its existing cells.
+					newRow := make([]CellData, targetCol)
+					copy(newRow, grid[r].Row)
+					grid[r].Row = newRow
+				}
+			}
+
+			// Extract the cell text and decide whether it is a header cell.
+			text := extractCellText(cell)
+			text = RemoveAllSpaces(text)
+			isHeader := cell.Data == "th"
+			// A non-<th> cell shorter than 20 runes may still be classified as a header from its text and common keywords.
+			if !isHeader && len([]rune(text)) < 20 {
+				// Core keywords are matched by regexp.
+				if theadPackWordsReg.MatchString(text) {
+					isHeader = true
+				}
+				if !isHeader {
+					isHeader = theadWordsListComReg.MatchString(text)
+				}
+				// Non-core keywords are matched as whole strings with EqualFold (case-insensitive).
+				if !isHeader {
+					for _, word := range theadWordsList {
+						if strings.EqualFold(word, text) {
+							isHeader = true
+							break
+						}
+					}
+				}
+			}
+			// log.Println("cellData IsHeader", isHeader, text)
+			cellData := CellData{Text: text, IsHeader: isHeader}
+
+			// Copy the cell data into every grid position covered by the span.
+			for r := 0; r < rowspan; r++ {
+				for c := 0; c < colspan; c++ {
+					grid[rowIdx+r].Row[col+c] = cellData
+				}
+			}
+
+			// Move on to the next column position.
+			col += colspan
+		}
+	}
+
+	// Store the final row data.
+	tableData.Rows = grid
+
+	// Generate the Markdown table (text content only).
+	markdown := generateMarkdownTable(grid, level)
+
+	return markdown, tableData
+}
+
+// generateMarkdownTable renders grid as a Markdown table using only the cell text; it returns "" for an empty grid and otherwise starts with a "### Table at level N" heading.
+func generateMarkdownTable(grid []RowData, level int) string {
+	if len(grid) == 0 {
+		return ""
+	}
+
+	var builder strings.Builder
+	builder.WriteString(fmt.Sprintf("### Table at level %d\n\n", level))
+
+	// Emit one Markdown row per grid row; the first row is treated as the header.
+	for i, row := range grid {
+		builder.WriteString("| ")
+		for j, cell := range row.Row {
+			text := cell.Text
+			if text == "" {
+				builder.WriteString(" ")
+			} else {
+				builder.WriteString(text)
+			}
+			if j < len(row.Row)-1 {
+				builder.WriteString(" | ")
+			}
+		}
+		builder.WriteString(" |\n")
+
+		// Emit the header separator line after the first row, one "---" per column of that row.
+		if i == 0 {
+			builder.WriteString("|")
+			for j := 0; j < len(row.Row); j++ {
+				builder.WriteString(" --- |")
+			}
+			builder.WriteString("\n")
+		}
+	}
+
+	return builder.String()
+}
+
+// getTableRows returns the <tr> elements of tableNode, descending only through thead/tbody/tfoot/table elements and never into a <tr> itself.
+func getTableRows(tableNode *html.Node) []*html.Node {
+	var rows []*html.Node
+	var traverse func(*html.Node)
+
+	traverse = func(n *html.Node) {
+		if n.Type == html.ElementNode {
+			switch n.Data {
+			case "tr":
+				rows = append(rows, n)
+			case "thead", "tbody", "tfoot", "table":
+				// Keep descending into section/table containers.
+				for c := n.FirstChild; c != nil; c = c.NextSibling {
+					traverse(c)
+				}
+			}
+		}
+	}
+
+	for c := tableNode.FirstChild; c != nil; c = c.NextSibling {
+		traverse(c)
+	}
+	return rows
+}
+
+// getRowCells returns the direct td/th element children of rowNode, in order.
+func getRowCells(rowNode *html.Node) []*html.Node {
+	var cells []*html.Node
+	for c := rowNode.FirstChild; c != nil; c = c.NextSibling {
+		if c.Type == html.ElementNode && (c.Data == "td" || c.Data == "th") {
+			cells = append(cells, c)
+		}
+	}
+	return cells
+}
+
+// extractCellText returns the trimmed text nodes of cellNode, each followed by a space; a nested table is replaced by the marker "[Table]", and a cell with no text yields a single space " ".
+func extractCellText(cellNode *html.Node) string {
+	var textBuilder strings.Builder
+	var extract func(*html.Node)
+
+	extract = func(n *html.Node) {
+		if n.Type == html.TextNode {
+			textBuilder.WriteString(strings.TrimSpace(n.Data))
+			textBuilder.WriteString(" ")
+		} else if n.Type == html.ElementNode {
+			// Do not descend into nested tables; emit a placeholder instead.
+			if n.Data != "table" {
+				for c := n.FirstChild; c != nil; c = c.NextSibling {
+					extract(c)
+				}
+			} else {
+				textBuilder.WriteString("[Table]")
+			}
+		}
+	}
+
+	for c := cellNode.FirstChild; c != nil; c = c.NextSibling {
+		extract(c)
+	}
+
+	// Trim the collected text; an empty result is returned as a single space.
+	result := strings.TrimSpace(textBuilder.String())
+	if result == "" {
+		return " "
+	}
+	return result
+}
+
+// GetChineseText strips HTML tags from htmlContent and returns the remaining characters in the range U+4E00-U+9FA5, concatenated in order. NOTE(review): both regexps are compiled on every call.
+func GetChineseText(htmlContent string) string {
+	// Remove HTML tags.
+	reHTML := regexp.MustCompile("<[^>]*>")
+	cleanText := reHTML.ReplaceAllString(htmlContent, "")
+	// Extract Chinese characters (Unicode range \u4e00-\u9fa5).
+	reChinese := regexp.MustCompile("[\u4e00-\u9fa5]")
+	chineseChars := reChinese.FindAllString(cleanText, -1)
+	return strings.Join(chineseChars, "")
+}
+
+// RemoveAllSpaces returns s with every rune for which unicode.IsSpace is true removed.
+func RemoveAllSpaces(s string) string {
+	// Build the result with a strings.Builder.
+	var b strings.Builder
+	b.Grow(len(s)) // pre-allocate: the result is never longer than s
+
+	// Walk the string rune by rune.
+	for _, r := range s {
+		// Keep only the runes that are not whitespace.
+		if !unicode.IsSpace(r) {
+			b.WriteRune(r)
+		}
+	}
+	return b.String()
+}
+
+// ByLength is a string slice with Len/Swap/Less methods that order its elements by ascending byte length (used with sort.Sort).
+type ByLength []string
+
+func (s ByLength) Len() int           { return len(s) }
+func (s ByLength) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
+func (s ByLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
+
+// groupStrings sorts a copy of the input by byte length and splits it into consecutive groups: a string joins the current group while it is at most 2 bytes longer than that group's first element. It returns nil for empty input. The parameter name shadows the strings package inside this function.
+func groupStrings(strings []string) [][]string {
+	if len(strings) == 0 {
+		return nil
+	}
+	// Sort a copy by length so the caller's slice is left untouched.
+	sorted := make(ByLength, len(strings))
+	copy(sorted, strings)
+	sort.Sort(sorted)
+
+	var groups [][]string
+	currentGroup := []string{sorted[0]}
+	currentMaxLen := len(sorted[0])
+
+	// Walk the sorted strings and build the groups; despite its name, currentMaxLen holds the length of the current group's first (shortest) string.
+	for i := 1; i < len(sorted); i++ {
+		currentLen := len(sorted[i])
+		if currentLen-currentMaxLen <= 2 {
+			// The current string fits into the current group.
+			currentGroup = append(currentGroup, sorted[i])
+		} else {
+			// Close the current group and start a new one.
+			groups = append(groups, currentGroup)
+			currentGroup = []string{sorted[i]}
+			currentMaxLen = currentLen
+		}
+	}
+	// Append the last group.
+	groups = append(groups, currentGroup)
+	return groups
+}
+
 // 判断是否是联合体中标
 func isConsortiumKeysReg(content string) bool {
 	return consortiumKeysReg.MatchString(content)