cutspace.go 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. package clear
  2. import (
  3. "fmt"
  4. "regexp"
  5. "strings"
  6. )
  7. var cutSpace *regexp.Regexp
  8. var cutAllSpace *regexp.Regexp
  9. var catSymbol *regexp.Regexp
  10. var spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n"}
  11. func init() {
  12. cutSpace, _ = regexp.Compile(`^\s*|\s*$`)
  13. cutAllSpace, _ = regexp.Compile(`\s*`)
  14. catSymbol, _ = regexp.Compile(`[]+`)
  15. }
  16. var LableStr = "&?(amp|nbsp|#8266);?|(<).*?(>?)"
  17. var at = rune('&')
  18. var ed = rune(';')
  19. var lableMap = map[string]rune{
  20. "&": rune('&'),
  21. " ": rune(' '),
  22. ">": rune('>'),
  23. "&lt;": rune('<'),
  24. }
  25. //处理转义标签
  26. func CutLableStr(con string) string {
  27. for i := 0; i < 3; i++ {
  28. runes := []rune{}
  29. pools := []rune{}
  30. bpool := false
  31. strings.IndexFunc(con, func(s rune) bool {
  32. if !bpool && s == at {
  33. bpool = true
  34. pools = []rune{}
  35. }
  36. if bpool {
  37. pools = append(pools, s)
  38. if s == ed { //结束
  39. lb := lableMap[string(pools)]
  40. if lb != 0 {
  41. runes = append(runes, lb)
  42. } else {
  43. runes = append(runes, pools...)
  44. }
  45. bpool = false
  46. } else if len(pools) > 6 {
  47. bpool = false
  48. runes = append(runes, pools...)
  49. }
  50. } else {
  51. runes = append(runes, s)
  52. }
  53. return false
  54. })
  55. str1 := string(runes)
  56. if i > 0 && con == str1 {
  57. break
  58. }
  59. con = str1
  60. }
  61. return con
  62. }
  63. //清理开始、结尾的空白字符
  64. func CutSpace(data []interface{}) []interface{} {
  65. tmp := cutSpace.ReplaceAllString(strings.Replace(fmt.Sprint(data[0]), " ", " ", -1), "")
  66. tmp = replaceSymbol(tmp, spaces)
  67. //fmt.Println("cutspace", tmp)
  68. data[0] = tmp
  69. return data
  70. }
  71. //清理所有空白符
  72. func CutAllSpace(data []interface{}) []interface{} {
  73. tmp := cutAllSpace.ReplaceAllString(fmt.Sprint(data[0]), "")
  74. tmp = replaceSymbol(tmp, spaces)
  75. data[0] = tmp
  76. return data
  77. }
  78. //清理符号
  79. func CutSymbol(data []interface{}) []interface{} {
  80. value := fmt.Sprint(CutSpace(data)[0])
  81. symbol := ",,;;::'\"“”。.\\??、/+=\\_—*&……\\^%$¥@!!`~·"
  82. startSymbol := "^[" + ")\\)>》】\\]}}〕" + symbol + "]+"
  83. endSymbol := "[" + "(\\(<《【\\[{{〔" + symbol + "]+$"
  84. startReg := regexp.MustCompile(startSymbol)
  85. endReg := regexp.MustCompile(endSymbol)
  86. value = startReg.ReplaceAllString(value, "")
  87. value = endReg.ReplaceAllString(value, "")
  88. value = fmt.Sprint(CutSpace([]interface{}{value, data[1]})[0])
  89. return []interface{}{value, data[1]}
  90. }
  91. //不成对出现的符号,把符号后面的内容清理掉
  92. func CutNotPrs(data []interface{}) []interface{} {
  93. return childCutNotPrs(data, 1)
  94. }
  95. //不成对出现的符号,把符号后面的内容清理掉
  96. func childCutNotPrs(data []interface{}, count int) []interface{} {
  97. value := fmt.Sprint(data[0])
  98. if count >= 50 || value == "" {
  99. return data
  100. }
  101. startChars := []string{"[((]", "[\\[【]", "[{{]", "[<《]", "[>》]", "〔"}
  102. endChars := []string{"[))]", "[\\]】]", "[}}]", "[<《]", "[>》]", "〕"}
  103. for k, v := range startChars {
  104. sReg := regexp.MustCompile(v)
  105. eReg := regexp.MustCompile(endChars[k])
  106. sIndex := sReg.FindAllStringIndex(value, -1)
  107. eIndex := eReg.FindAllStringIndex(value, -1)
  108. sCount := len(sIndex)
  109. eCount := len(eIndex)
  110. if sCount == eCount {
  111. continue
  112. }
  113. //清理前面
  114. if sCount > eCount {
  115. value = value[sIndex[eCount][1]:]
  116. }
  117. //清理后面
  118. if sCount < eCount {
  119. value = value[:eIndex[sCount][0]]
  120. }
  121. }
  122. //交叉出现情况处理
  123. sReplReg := regexp.MustCompile("[((\\[【{{〔<《][^))\\]】}}〕>》]*$")
  124. eReplReg := regexp.MustCompile("^[^((\\[【{{〔<《]*[))\\]】}}〕>》]")
  125. if sReplReg.MatchString(value) || eReplReg.MatchString(value) {
  126. value = sReplReg.ReplaceAllString(value, "")
  127. value = eReplReg.ReplaceAllString(value, "")
  128. value = fmt.Sprint(childCutNotPrs([]interface{}{value, data[1]}, count+1)[0])
  129. }
  130. data[0] = value
  131. return data
  132. }
  133. //全部是汉字或者特殊符号的情况,清理掉
  134. func ClearAllWord(data []interface{}) []interface{} {
  135. value := fmt.Sprint(data[0])
  136. reg := regexp.MustCompile("^[\u4e00-\u9fa5、,,.。??'\"“”‘’·~!@#¥$%…&*()()\\-—+=【】\\[\\]{}{}<>《》|\\/\\s]+$")
  137. data[0] = reg.ReplaceAllString(value, "")
  138. return data
  139. }