cutspace.go 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. package clear
  2. import (
  3. "fmt"
  4. "regexp"
  5. "strings"
  6. )
  7. var (
  8. cutSpace *regexp.Regexp
  9. cutAllSpace *regexp.Regexp
  10. catSymbol *regexp.Regexp
  11. separateSymbol *regexp.Regexp
  12. placeReg *regexp.Regexp
  13. )
  14. var spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n"}
  15. func init() {
  16. cutSpace, _ = regexp.Compile(`^\s*|\s*$`)
  17. cutAllSpace, _ = regexp.Compile(`\s*`)
  18. catSymbol, _ = regexp.Compile(`[]+`)
  19. separateSymbol, _ = regexp.Compile("[\\s\u3000\u2003\u00a0\\n,,、/。|]")
  20. placeReg, _ = regexp.Compile("^.*(公司|学(校)?|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体)|工作室)$")
  21. }
  22. var LableStr = "&?(amp|nbsp|#8266);?|(<).*?(>?)"
  23. var at = rune('&')
  24. var ed = rune(';')
  25. var lableMap = map[string]rune{
  26. "&": rune('&'),
  27. " ": rune(' '),
  28. ">": rune('>'),
  29. "&lt;": rune('<'),
  30. }
  31. //处理转义标签
  32. func CutLableStr(con string) string {
  33. for i := 0; i < 3; i++ {
  34. runes := []rune{}
  35. pools := []rune{}
  36. bpool := false
  37. strings.IndexFunc(con, func(s rune) bool {
  38. if !bpool && s == at {
  39. bpool = true
  40. pools = []rune{}
  41. }
  42. if bpool {
  43. pools = append(pools, s)
  44. if s == ed { //结束
  45. lb := lableMap[string(pools)]
  46. if lb != 0 {
  47. runes = append(runes, lb)
  48. } else {
  49. runes = append(runes, pools...)
  50. }
  51. bpool = false
  52. } else if len(pools) > 6 {
  53. bpool = false
  54. runes = append(runes, pools...)
  55. }
  56. } else {
  57. runes = append(runes, s)
  58. }
  59. return false
  60. })
  61. str1 := string(runes)
  62. if i > 0 && con == str1 {
  63. break
  64. }
  65. con = str1
  66. }
  67. return con
  68. }
  69. //清理开始、结尾的空白字符
  70. func CutSpace(data []interface{}) []interface{} {
  71. tmp := cutSpace.ReplaceAllString(strings.Replace(fmt.Sprint(data[0]), " ", " ", -1), "")
  72. tmp = replaceSymbol(tmp, spaces)
  73. //fmt.Println("cutspace", tmp)
  74. data[0] = tmp
  75. return data
  76. }
  77. //清理所有空白符
  78. func CutAllSpace(data []interface{}) []interface{} {
  79. tmp := cutAllSpace.ReplaceAllString(fmt.Sprint(data[0]), "")
  80. tmp = replaceSymbol(tmp, spaces)
  81. data[0] = tmp
  82. return data
  83. }
  84. //清理符号
  85. func CutSymbol(data []interface{}) []interface{} {
  86. value := fmt.Sprint(CutSpace(data)[0])
  87. symbol := ",,;;::'\"“”。.\\??、/+=\\_—*&……\\^%$¥@!!`~·"
  88. startSymbol := "^[" + ")\\)>》】\\]}}〕" + symbol + "]+"
  89. endSymbol := "[" + "(\\(<《【\\[{{〔" + symbol + "]+$"
  90. startReg := regexp.MustCompile(startSymbol)
  91. endReg := regexp.MustCompile(endSymbol)
  92. value = startReg.ReplaceAllString(value, "")
  93. value = endReg.ReplaceAllString(value, "")
  94. value = fmt.Sprint(CutSpace([]interface{}{value, data[1]})[0])
  95. return []interface{}{value, data[1]}
  96. }
  97. //不成对出现的符号,把符号后面的内容清理掉
  98. func CutNotPrs(data []interface{}) []interface{} {
  99. return childCutNotPrs(data, 1)
  100. }
  101. //不成对出现的符号,把符号后面的内容清理掉
  102. func childCutNotPrs(data []interface{}, count int) []interface{} {
  103. value := fmt.Sprint(data[0])
  104. if count >= 50 || value == "" {
  105. return data
  106. }
  107. startChars := []string{"[((]", "[\\[【]", "[{{]", "[<《]", "〔"}
  108. endChars := []string{"[))]", "[\\]】]", "[}}]", "[>》]", "〕"}
  109. for k, v := range startChars {
  110. sReg := regexp.MustCompile(v)
  111. eReg := regexp.MustCompile(endChars[k])
  112. sIndex := sReg.FindAllStringIndex(value, -1)
  113. eIndex := eReg.FindAllStringIndex(value, -1)
  114. sCount := len(sIndex)
  115. eCount := len(eIndex)
  116. if sCount == eCount {
  117. continue
  118. }
  119. //清理前面
  120. if sCount > eCount {
  121. value = value[sIndex[eCount][1]:]
  122. }
  123. //清理后面
  124. if sCount < eCount {
  125. value = value[:eIndex[sCount][0]]
  126. }
  127. }
  128. //交叉出现情况处理
  129. sReplReg := regexp.MustCompile("[((\\[【{{〔<《][^))\\]】}}〕>》]*$")
  130. eReplReg := regexp.MustCompile("^[^((\\[【{{〔<《]*[))\\]】}}〕>》]")
  131. if sReplReg.MatchString(value) || eReplReg.MatchString(value) {
  132. value = sReplReg.ReplaceAllString(value, "")
  133. value = eReplReg.ReplaceAllString(value, "")
  134. value = fmt.Sprint(childCutNotPrs([]interface{}{value, data[1]}, count+1)[0])
  135. }
  136. data[0] = value
  137. return data
  138. }
  139. //全部是汉字或者特殊符号的情况,清理掉
  140. func ClearAllWord(data []interface{}) []interface{} {
  141. value := fmt.Sprint(data[0])
  142. reg := regexp.MustCompile("^[\u4e00-\u9fa5、,,.。??'\"“”‘’·~!@#¥$%…&*()()\\-—+=【】\\[\\]{}{}<>《》|\\/\\s]+$")
  143. data[0] = reg.ReplaceAllString(value, "")
  144. return data
  145. }
  146. //中文符号转英文
  147. func ChiToEng(data []interface{}) []interface{} {
  148. value := fmt.Sprint(data[0])
  149. startChars := []string{"(", "【", "{", "“", ")", "】", "}", "”"}
  150. endChars := []string{"(", "[", "{", "\"", ")", "]", "}", "\""}
  151. for i, v := range startChars {
  152. sReg := regexp.MustCompile(v)
  153. sIndex := sReg.FindAllStringIndex(value, -1)
  154. for j := 1; j <= len(sIndex); j++ {
  155. value = sReg.ReplaceAllString(value, endChars[i])
  156. }
  157. }
  158. data[0] = value
  159. return data
  160. }
  161. func ClearBuyerPerson(data []interface{}) []interface{} {
  162. value := fmt.Sprint(data[0])
  163. //tmp := []string{}
  164. if len([]rune(value)) > 4 { //名字默认最长4
  165. tmp := ""
  166. valuearr := separateSymbol.Split(value, -1)
  167. length := len(valuearr)
  168. for i, v := range valuearr {
  169. if v == "" {
  170. continue
  171. }
  172. if i == 0 && placeReg.MatchString(v) {
  173. if length == 1 {
  174. if len([]rune(v)) >= 4 {
  175. tmp = ""
  176. } else {
  177. tmp = tmp + v
  178. }
  179. } else {
  180. tmp = tmp + v + "-"
  181. }
  182. } else if len([]rune(v)) <= 4 {
  183. if i+1 != length {
  184. tmp = tmp + v + ","
  185. } else {
  186. tmp = tmp + v
  187. }
  188. }
  189. }
  190. data[0] = tmp
  191. } else {
  192. value = separateSymbol.ReplaceAllString(value, "")
  193. data[0] = value
  194. }
  195. return data
  196. }