clean.go 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. package extract
  2. import (
  3. "regexp"
  4. "strings"
  5. )
  6. // 中国电信集团有限公司驻马店分公司 2025-3-08 17:10:30 提交报价¥266000 竞价成交
  7. var clean1 = regexp.MustCompile("(([\u4E00-\u9FA5]{5,30}公司)[\\s-0-9::]+提交报价[¥]?([\\s0-9]+)竞价成交\n)")
  8. var clean2 = regexp.MustCompile("([\\s ]+([0-9]+)[\\s ]+([0-9]+)[\\s ]+)")
  9. var blTextReg *regexp.Regexp = regexp.MustCompile("(打分表|负责人|单位|个人|投标人|项目|企业)业绩|业绩奖项|主要人员相关资料|唱标记录|否决投标的?情况说明")
  10. var unblTextReg *regexp.Regexp = regexp.MustCompile("(项目业绩案例|类似项目业绩)")
  11. var beforeTextReg *regexp.Regexp = regexp.MustCompile("(招标代理机构|招标单位|招标人)[::].{4,25}\n")
  12. // 清洗文本
  13. func CleanText(detail string) string {
  14. //业绩排除
  15. detail = ExcludeYeJi(detail)
  16. //特殊文本结构转换
  17. detail = clean1.ReplaceAllString(detail, "\n中标单位:${2}\n中标金额:${3}")
  18. //对数字空格进行转换
  19. detail = clean2.ReplaceAllString(detail, "${2}${3}")
  20. return detail
  21. }
  22. // 排除业绩
  23. func ExcludeYeJi(detail string) string {
  24. if blTextReg.MatchString(detail) && !unblTextReg.MatchString(detail) {
  25. if strings.Index(detail, "业绩") > 1 {
  26. before_arr := []string{} //如果有采购单位信息-文本置前
  27. if beforeTextReg.MatchString(detail) {
  28. before_arr = beforeTextReg.FindAllString(detail, -1)
  29. }
  30. detail = detail[:strings.Index(detail, "业绩")]
  31. if len(before_arr) > 0 {
  32. detail = strings.Join(before_arr, "\n") + detail
  33. }
  34. }
  35. }
  36. return detail
  37. }