multipackage.go 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. package pretreated
  2. import (
  3. "regexp"
  4. "sort"
  5. )
  // Regular expressions used by CheckMultiPackage to detect and extract
// package / lot (标段, 包) identifiers from tender text.
//
// NOTE(review): several character classes list the same ASCII character twice
// ("[::]", "[((]", "[,,。...]", and a doubled space in the whitespace classes).
// Presumably one of each pair was a full-width or other non-ASCII form that was
// lost when the text was normalised - confirm against the upstream file.
var (
	// NOTE: the 监理 (supervision) / 施工 (construction) variants are not
	// handled.

	// PreReg matches phrases that contain package wording without naming a
	// package (for example 每一个标段, 标段名称, 包件号). CheckMultiPackage
	// deletes them so they are not mistaken for identifiers.
	PreReg = regexp.MustCompile("(同|每|对|[^其]中|仅|分|任意)[一二三四五六七八九十\\d]个?(子|合同|分|施工|监理)?(标段?|包)|项目标号|文件A包|涉及包号|包件号?|标段(名称|编号)|0\\s?个标段|1\\-[\\d]标段|子包(\\d、)+\\d|\\d\\.\\d(标段|包)[^一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]|[1-9]标。")
	// PreReg1 matches an enumeration such as "1、2、3标段" that is preceded by
	// a character other than a newline.
	PreReg1 = regexp.MustCompile("[^\n]([A-Z]?([一二三四五六七八九十]|\\d)、)+[A-Z]?([一二三四五六七八九十]|\\d)(标段?|包)")

	// PreCheckMulti matches an explicit statement of how many packages the
	// tender is divided into. The count is capture group 1 (first alternative)
	// or group 6 (second alternative); a count of 1 means the tender is
	// certainly not multi-package.
	PreCheckMulti = regexp.MustCompile("[^第]([一二三四五六七八九十两0-9ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)[  \u3000\u2003\u00a0]*个?((子|合同|分|施工|监理)?(标段?|包|合同段|标包))进行|(划分|分[设为成]?|共[分设有计]?)[::]?[  \u3000\u2003\u00a0]*([一二三四五六七八九十两0-9ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)[  \u3000\u2003\u00a0]*个?((子|合同|分|施工|监理)?(标段?|包|合同段|标包|项目))")

	// PreCon matches easily confused text (section numbers, "标段划分"
	// headings, remarks, "三包" warranty wording, quantities, ...).
	// CheckMultiPackage replaces each match with a newline.
	PreCon = regexp.MustCompile("([\r\n]|^)[\u3000\u2003\u00a0\\s]*(\\d\\.)+\\d|[一二三四五1-9、.]+[  \u3000\u2003\u00a0]*((标段|分包)(划分|情况)|(标书))|([上下]一[条页篇][::]?[^,,。\\n]{0,120}|备注[::][^\\n]{0,120}|业绩[::][^\\n,。,]{0,80}|三包(手册|服务|政策|凭证|期|标准|规定|责任|要求|售后)|(要求|提供|质量|国家|享受|负责|实行|执行|承诺|门前|法定|规定).{0,6}三包|“三包”|\\d+万?([个套只支分名][^标包])|[?]|[((]请?注意[::][^((]+[))])")
	// PreCon2 matches common words that merely contain 标 or 包 (标准, 投标,
	// 包括, 承包, ...). CheckMultiPackage deletes them.
	PreCon2 = regexp.MustCompile("[评中开定]\\s?标\\s?[0-9一二三四五六七八九十]+|标[准尺高书注]|[^中]标价|[开鼠投招军指企目]标|包[括含装为内]|[承树]包|CA证书")

	// PreCon1 matches amounts of money ending in 元 or 万元, another source of
	// confusing digits. CheckMultiPackage deletes them.
	PreCon1 = regexp.MustCompile("(\\d+\\.?)+万?元")

	// MultiReg extracts package identifiers. In the "identifier then label"
	// form the identifier is group 1 and the label group 2; in the "label then
	// identifier" form the label is group 5 and the identifier group 9. The
	// literal 操作系统 is matched with every group empty.
	MultiReg = regexp.MustCompile("[第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)#?((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|分|合同|分|施工|监理)?(标|包件?)(段|号)?)[  \u3000\u2003\u00a0]*((\\d[.])+\\d|[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)[::]?|操作系统")

	// keyregs classifies an extracted identifier by its format so that the
	// formats can be counted: Chinese numerals are class 8, decimal digits
	// class 7, letters optionally followed by digits class 6.
	keyregs = []map[*regexp.Regexp]int{
		{regexp.MustCompile("^[一二三四五六七八九十]+$"): 8},
		{regexp.MustCompile("^[0-9]+$"): 7},
		{regexp.MustCompile("^[A-Za-z]+[0-9]*$"): 6},
	}

	// MH matches a colon. A match that contains a colon, such as "标段一:",
	// takes priority when choosing the identifier format.
	MH = regexp.MustCompile("[::]")

	// ignoreReg matches identifiers such as "2LN2" (from "包2LN2"); replacing
	// with "$1" keeps only the leading digits.
	ignoreReg = regexp.MustCompile("^(\\d+)[A-Za-z].*")

	// TitleReg matches package markers in a title, such as "标段一、二、三" or
	// "包1、包2". (The original comment on this pattern was cut off.)
	//
	// The separator class of the last alternative lists its characters
	// individually; written as "~-至" it would form a range from U+007E to
	// U+81F3 and accept almost any CJK character as a separator.
	TitleReg = regexp.MustCompile("([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ、\\-~至]+(子|合同|分|施工|监理|标)?[包标段][号段]?[、]?)+|((子|合同|分|施工|监理|标)?[包标段][号段]?[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ、\\-~至]+[、]?)+|(子|合同|分|施工|监理|标)?[包标段][号段]?[a-zA-Z0-9]+[\\-~至、](子|合同|分|施工|监理|标)?[包标段][号段]?[a-zA-Z0-9]+")

	// clearPkgFlag matches runs of dashes at the start or end of a string.
	clearPkgFlag = regexp.MustCompile("^[\\-]+|[\\-]+$")
)
  42. //判断分包
  43. func CheckMultiPackage(con, title string) (content string, m map[string][]string, b bool) {
  44. m = map[string][]string{}
  45. //if TitleReg.MatchString(title) {
  46. //log.Println(title+"\n------------------", TitleReg.FindAllStringSubmatch(title, -1))
  47. //}
  48. con = PreReg.ReplaceAllString(con, "")
  49. con = PreReg1.ReplaceAllString(con, "")
  50. pres := PreCheckMulti.FindStringSubmatch(con)
  51. if len(pres) == 10 {
  52. //log.Println(pres)
  53. k := pres[1]
  54. if k == "" {
  55. k = pres[6]
  56. }
  57. if k == "1" || k == "一" {
  58. return
  59. } else {
  60. //log.Println("all: ", k)
  61. }
  62. }
  63. con = PreCheckMulti.ReplaceAllString(con, "")
  64. con = PreCon.ReplaceAllString(con, "\n")
  65. content = con
  66. con = PreCon2.ReplaceAllString(con, "")
  67. con = PreCon1.ReplaceAllString(con, "")
  68. res := MultiReg.FindAllStringSubmatch(con, -1)
  69. if len(res) > 0 { //5 6
  70. mindex := map[string]int{}
  71. for index, v := range res {
  72. k := v[1]
  73. vindex := 2
  74. if k == "" {
  75. k = v[9]
  76. vindex = 5
  77. }
  78. if len(m[k]) == 0 && k != "" {
  79. k = ignoreReg.ReplaceAllString(k, "$1")
  80. k = clearPkgFlag.ReplaceAllString(k, "")
  81. //log.Println(k, "----")
  82. m[k] = []string{clearPkgFlag.ReplaceAllString(v[0], ""), v[vindex]}
  83. mindex[k] = index
  84. }
  85. }
  86. if len(m) > 1 {
  87. //对k优先级进行处理过滤
  88. SEL := -1 //确定以哪种类型为标段标识,没有去判断v相同不相同,存在一定的误判!如 1:合同包1 1:包
  89. mapclassstr := map[int]map[string][]string{}
  90. mapclass := map[int]int{}
  91. for k, v := range m {
  92. str := res[mindex[k]][0]
  93. mk := 5
  94. for _, keyreg := range keyregs {
  95. for reg, pos := range keyreg {
  96. if reg.MatchString(k) {
  97. mk = pos
  98. break
  99. }
  100. }
  101. }
  102. mapclass[mk]++
  103. if mapclassstr[mk] == nil {
  104. mapclassstr[mk] = map[string][]string{}
  105. }
  106. mapclassstr[mk][k] = []string{v[0], v[1]}
  107. if MH.MatchString(str) { //如果有冒号直接确定
  108. SEL = mk
  109. }
  110. }
  111. //log.Println(mapclassstr, mapclass, SEL)
  112. if SEL > 0 {
  113. m = mapclassstr[SEL]
  114. } else {
  115. //比较出哪个最多,倒排,如果都一样,按 9 8 7 6 5来处理
  116. max := 0
  117. maxk := []int{}
  118. for k, v := range mapclass {
  119. if v > max {
  120. max = v
  121. maxk = []int{}
  122. maxk = append(maxk, k)
  123. } else if v == max {
  124. maxk = append(maxk, k)
  125. }
  126. }
  127. if len(maxk) > 0 {
  128. sort.Ints(maxk)
  129. m = mapclassstr[maxk[len(maxk)-1]]
  130. }
  131. }
  132. }
  133. if len(m) > 0 {
  134. b = true
  135. }
  136. //log.Println(m, res)
  137. }
  138. return
  139. }