package extract

import (
	"regexp"
	"strings"
)

// Patterns used by CleanText and ExcludeYeJi. They are compiled once at
// package initialization.
var (
	// clean1 matches one bidding-record row: a company name (5-30 CJK
	// characters followed by "公司"), a run of whitespace/digits/dashes/colons
	// (the timestamp in the sample below), the literal "提交报价", an optional
	// currency sign, the quoted amount, and the literal "竞价成交" followed by
	// a newline. Group 2 is the company name, group 3 the amount.
	//
	// Sample row:
	//
	//	中国电信集团有限公司驻马店分公司 2025-3-08 17:10:30 提交报价¥266000 竞价成交
	clean1 = regexp.MustCompile("(([\u4E00-\u9FA5]{5,30}公司)[\\s-0-9::]+提交报价[¥]?([\\s0-9]+)竞价成交\n)")

	// clean2 matches two digit runs separated and surrounded by whitespace.
	// Groups 2 and 3 are the two digit runs.
	clean2 = regexp.MustCompile("([\\s ]+([0-9]+)[\\s ]+([0-9]+)[\\s ]+)")

	// blTextReg is the blacklist: section titles that make ExcludeYeJi
	// consider truncating the text.
	blTextReg = regexp.MustCompile("(打分表|负责人|单位|个人|投标人|项目|企业)业绩|业绩奖项|主要人员相关资料|唱标记录|否决投标的?情况说明")

	// unblTextReg is the whitelist: if it matches anywhere in the text,
	// ExcludeYeJi leaves the text untouched even when blTextReg matches.
	unblTextReg = regexp.MustCompile("(项目业绩案例|类似项目业绩)")

	// beforeTextReg matches a "tendering agency / tendering unit / tenderer"
	// label, a colon, 4-25 characters and the terminating newline.
	beforeTextReg = regexp.MustCompile("(招标代理机构|招标单位|招标人)[::].{4,25}\n")
)

// CleanText normalizes detail in three steps and returns the result:
//
//  1. ExcludeYeJi may truncate the text at the "业绩" marker.
//  2. Every row matched by clean1 is rewritten to
//     "\n中标单位:<company>\n中标金额:<amount>".
//  3. Every clean2 match is replaced by its two digit runs concatenated,
//     which also drops the whitespace around them.
func CleanText(detail string) string {
	detail = ExcludeYeJi(detail)
	// The whole clean1 match is replaced, including the trailing newline and
	// everything between the company name and the amount.
	detail = clean1.ReplaceAllString(detail, "\n中标单位:${2}\n中标金额:${3}")
	return clean2.ReplaceAllString(detail, "${2}${3}")
}

// ExcludeYeJi truncates detail at the first occurrence of "业绩" when the text
// matches blTextReg and does not match unblTextReg. Otherwise, or when the
// marker is found at byte offset 0 or 1 (or not at all), detail is returned
// unchanged.
//
// Before truncating, every beforeTextReg match in the full text is collected;
// if there are any, they are joined with "\n" and prepended to the truncated
// text.
func ExcludeYeJi(detail string) string {
	if !blTextReg.MatchString(detail) || unblTextReg.MatchString(detail) {
		return detail
	}

	const marker = "业绩"
	// cut is a byte offset, not a rune count.
	cut := strings.Index(detail, marker)
	if cut <= 1 {
		return detail
	}

	// NOTE(review): headers are searched in the whole text, so a header that
	// sits before the cut point ends up in the result twice. Presumably
	// harmless for downstream extraction; confirm before changing.
	headers := beforeTextReg.FindAllString(detail, -1)
	truncated := detail[:cut]
	if len(headers) == 0 {
		return truncated
	}
	return strings.Join(headers, "\n") + truncated
}