12345678910111213141516171819202122232425262728293031323334353637383940414243 |
- package extract
- import (
- "regexp"
- "strings"
- )
// Patterns shared by the text-cleaning helpers in this file. They are compiled
// once at package initialisation.
var (
	// clean1 matches one bid-record line such as
	//
	//	中国电信集团有限公司驻马店分公司 2025-3-08 17:10:30 提交报价¥266000 竞价成交
	//
	// Capture group 2 is the company name (5-30 CJK characters followed by
	// "公司"); group 3 is the quoted amount (digits and whitespace). The run of
	// whitespace, digits, '-' and colons between them covers the timestamp.
	// The match includes the line's terminating "\n".
	clean1 = regexp.MustCompile("(([\u4E00-\u9FA5]{5,30}公司)[\\s-0-9::]+提交报价[¥]?([\\s0-9]+)竞价成交\n)")

	// clean2 matches two digit runs separated by whitespace and surrounded by
	// whitespace on both sides. Groups 2 and 3 are the two digit runs.
	clean2 = regexp.MustCompile("([\\s ]+([0-9]+)[\\s ]+([0-9]+)[\\s ]+)")

	// blTextReg lists the phrases that make ExcludeYeJi consider truncating the
	// text (track-record headings, score sheets, bid-opening records, ...).
	blTextReg = regexp.MustCompile("(打分表|负责人|单位|个人|投标人|项目|企业)业绩|业绩奖项|主要人员相关资料|唱标记录|否决投标的?情况说明")

	// unblTextReg lists phrases whose presence stops ExcludeYeJi from
	// truncating even when blTextReg matches.
	unblTextReg = regexp.MustCompile("(项目业绩案例|类似项目业绩)")

	// beforeTextReg matches a "招标代理机构", "招标单位" or "招标人" label followed
	// by an ASCII or full-width colon, 4-25 characters, and a newline.
	beforeTextReg = regexp.MustCompile("(招标代理机构|招标单位|招标人)[::].{4,25}\n")
)
- // 清洗文本
- func CleanText(detail string) string {
- //业绩排除
- detail = ExcludeYeJi(detail)
- //特殊文本结构转换
- detail = clean1.ReplaceAllString(detail, "\n中标单位:${2}\n中标金额:${3}")
- //对数字空格进行转换
- detail = clean2.ReplaceAllString(detail, "${2}${3}")
- return detail
- }
- // 排除业绩
- func ExcludeYeJi(detail string) string {
- if blTextReg.MatchString(detail) && !unblTextReg.MatchString(detail) {
- if strings.Index(detail, "业绩") > 1 {
- before_arr := []string{} //如果有采购单位信息-文本置前
- if beforeTextReg.MatchString(detail) {
- before_arr = beforeTextReg.FindAllString(detail, -1)
- }
- detail = detail[:strings.Index(detail, "业绩")]
- if len(before_arr) > 0 {
- detail = strings.Join(before_arr, "\n") + detail
- }
- }
- }
- return detail
- }
|