123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150 |
- package clear
- import (
- "fmt"
- "regexp"
- "strings"
- )
// Pre-compiled patterns and character lists shared by the cleaning helpers.
//
// The patterns are compiled with MustCompile at package scope so that an
// invalid pattern is reported at start-up instead of leaving a nil *Regexp
// behind that only fails when first used.
var (
	// cutSpace matches the whitespace run at the very start or the very end
	// of a string. Go's \s is ASCII-only (tab, newline, form feed, carriage
	// return, space).
	cutSpace = regexp.MustCompile(`^\s*|\s*$`)

	// cutAllSpace matches every whitespace run. It can also match the empty
	// string, which is harmless when the replacement is "".
	cutAllSpace = regexp.MustCompile(`\s*`)

	// NOTE(review): the pattern below is not valid RE2 syntax (the character
	// class is never closed), so compilation fails and catSymbol is nil, as
	// it was before. The characters that belonged inside the brackets appear
	// to have been lost; restore them (and switch to MustCompile) before
	// using this variable. The error is ignored on purpose so that package
	// initialisation keeps its previous behaviour.
	catSymbol, _ = regexp.Compile(`[]+`)

	// spaces lists whitespace-like characters, several of which \s does not
	// cover: ideographic space, em space, no-break space, tab, CR and LF.
	spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n"}
)
// LableStr is the source text of a pattern that describes HTML entities and
// tags. It is not referenced by the code in this part of the file.
var LableStr = "&?(amp|nbsp|#8266);?|(<).*?(>?)"

// at and ed are the delimiters of an HTML character entity: '&' opens a
// candidate and ';' closes it.
var at = rune('&')
var ed = rune(';')

// lableMap maps each supported entity, spelled in full with its leading '&'
// and trailing ';', to the character it stands for. The keys must use this
// full form because CutLableStr looks up the whole candidate, delimiters
// included.
var lableMap = map[string]rune{
	"&amp;":  '&',
	"&nbsp;": ' ',
	"&gt;":   '>',
	"&lt;":   '<',
}

// CutLableStr decodes the HTML entities listed in lableMap that occur in con
// and returns the result. Everything else, including unknown entities, is
// copied through unchanged.
//
// Decoding is repeated up to three times so that multiply-escaped input such
// as "&amp;lt;" is fully unwound; it stops early once a pass changes nothing.
func CutLableStr(con string) string {
	const maxPasses = 3
	for pass := 0; pass < maxPasses; pass++ {
		decoded := decodeLableOnce(con)
		if decoded == con {
			break
		}
		con = decoded
	}
	return con
}

// decodeLableOnce performs a single left-to-right entity-decoding pass over s.
func decodeLableOnce(s string) string {
	// A candidate longer than this many runes without a closing ';' is
	// given up on and emitted literally.
	const maxEntityRunes = 6

	var out strings.Builder
	out.Grow(len(s))

	// pending holds the runes of the entity candidate being collected; it is
	// empty whenever no candidate is open.
	var pending []rune

	for _, r := range s {
		if r == at {
			// A new '&' abandons any unfinished candidate: emit it as-is
			// and start collecting again from this '&'.
			out.WriteString(string(pending))
			pending = append(pending[:0], r)
			continue
		}
		if len(pending) == 0 {
			out.WriteRune(r)
			continue
		}

		pending = append(pending, r)
		switch {
		case r == ed:
			if decoded, ok := lableMap[string(pending)]; ok {
				out.WriteRune(decoded)
			} else {
				out.WriteString(string(pending))
			}
			pending = pending[:0]
		case len(pending) > maxEntityRunes:
			out.WriteString(string(pending))
			pending = pending[:0]
		}
	}

	// Input ended in the middle of a candidate (e.g. "AT&T"): keep the text
	// rather than dropping it.
	out.WriteString(string(pending))
	return out.String()
}
- //清理开始、结尾的空白字符
- func CutSpace(data []interface{}) []interface{} {
- tmp := cutSpace.ReplaceAllString(strings.Replace(fmt.Sprint(data[0]), " ", " ", -1), "")
- tmp = replaceSymbol(tmp, spaces)
- //fmt.Println("cutspace", tmp)
- data[0] = tmp
- return data
- }
- //清理所有空白符
- func CutAllSpace(data []interface{}) []interface{} {
- tmp := cutAllSpace.ReplaceAllString(fmt.Sprint(data[0]), "")
- tmp = replaceSymbol(tmp, spaces)
- data[0] = tmp
- return data
- }
- //清理符号
- func CutSymbol(data []interface{}) []interface{} {
- value := fmt.Sprint(CutSpace(data)[0])
- symbol := ",,;;::'\"“”。.\\??、/+=\\_—*&……\\^%$¥@!!`~·"
- startSymbol := "^[" + ")\\)>》】\\]}}〕" + symbol + "]+"
- endSymbol := "[" + "(\\(<《【\\[{{〔" + symbol + "]+$"
- startReg := regexp.MustCompile(startSymbol)
- endReg := regexp.MustCompile(endSymbol)
- value = startReg.ReplaceAllString(value, "")
- value = endReg.ReplaceAllString(value, "")
- value = fmt.Sprint(CutSpace([]interface{}{value, data[1]})[0])
- return []interface{}{value, data[1]}
- }
// notPrsMaxDepth bounds how many clean-up rounds childCutNotPrs may run.
const notPrsMaxDepth = 50

// notPrsPairs lists the opening/closing bracket patterns whose occurrence
// counts are compared, in the order they are applied.
//
// NOTE(review): the fourth and fifth entries compare a pattern with itself,
// so their counts are always equal and they never change the value. Angle
// brackets are therefore only handled by the two patterns further below.
// They are kept as-is to preserve current results; pairing "[<《]" with
// "[>》]" would change the output for inputs such as "a<b".
var notPrsPairs = []struct{ opener, closer *regexp.Regexp }{
	{regexp.MustCompile("[((]"), regexp.MustCompile("[))]")},
	{regexp.MustCompile("[\\[【]"), regexp.MustCompile("[\\]】]")},
	{regexp.MustCompile("[{{]"), regexp.MustCompile("[}}]")},
	{regexp.MustCompile("[<《]"), regexp.MustCompile("[<《]")},
	{regexp.MustCompile("[>》]"), regexp.MustCompile("[>》]")},
	{regexp.MustCompile("〔"), regexp.MustCompile("〕")},
}

var (
	// notPrsOpenTail matches an opening bracket that is followed, up to the
	// end of the string, by no closing bracket of any kind.
	notPrsOpenTail = regexp.MustCompile("[((\\[【{{〔<《][^))\\]】}}〕>》]*$")

	// notPrsCloseHead matches a closing bracket that is preceded, from the
	// start of the string, by no opening bracket of any kind.
	notPrsCloseHead = regexp.MustCompile("^[^((\\[【{{〔<《]*[))\\]】}}〕>》]")
)

// CutNotPrs removes the parts of data[0] that are cut off by unpaired
// brackets. It is the entry point for childCutNotPrs, starting at round 1.
func CutNotPrs(data []interface{}) []interface{} {
	return childCutNotPrs(data, 1)
}

// childCutNotPrs removes the parts of data[0] that are cut off by unpaired
// brackets, starting at round number count.
//
// data[0] is formatted with fmt.Sprint. If count has already reached the
// round limit, or the text is empty, data is returned untouched. Otherwise
// the cleaned string is stored back into data[0] and the same slice is
// returned. An empty slice is returned unchanged.
func childCutNotPrs(data []interface{}, count int) []interface{} {
	if len(data) == 0 {
		return data
	}
	value := fmt.Sprint(data[0])
	if count >= notPrsMaxDepth || value == "" {
		return data
	}
	data[0] = trimUnpaired(value, count)
	return data
}

// trimUnpaired runs clean-up rounds on value, numbered from depth, until a
// round leaves no dangling bracket, the value becomes empty, or the round
// limit is reached.
func trimUnpaired(value string, depth int) string {
	for depth < notPrsMaxDepth && value != "" {
		for _, pair := range notPrsPairs {
			opens := pair.opener.FindAllStringIndex(value, -1)
			closes := pair.closer.FindAllStringIndex(value, -1)
			switch {
			case len(opens) > len(closes):
				// More openers than closers: keep only what follows the
				// first opener that has no closer to match it by count.
				value = value[opens[len(closes)][1]:]
			case len(opens) < len(closes):
				// More closers than openers: keep only what precedes the
				// first closer that has no opener to match it by count.
				value = value[:closes[len(opens)][0]]
			}
		}

		// Counts can balance while the brackets still cross (e.g. ")a(").
		// Strip a dangling opener tail / closer head and go another round.
		if !notPrsOpenTail.MatchString(value) && !notPrsCloseHead.MatchString(value) {
			break
		}
		value = notPrsOpenTail.ReplaceAllString(value, "")
		value = notPrsCloseHead.ReplaceAllString(value, "")
		depth++
	}
	return value
}
// allWordReg matches a non-empty string made up solely of characters in the
// range U+4E00..U+9FA5 (Chinese characters), the listed punctuation and
// brackets, and whitespace. It is compiled once instead of on every call.
var allWordReg = regexp.MustCompile("^[\u4e00-\u9fa5、,,.。??'\"“”‘’·~!@#¥$%…&*()()\\-—+=【】\\[\\]{}{}<>《》|\\/\\s]+$")

// ClearAllWord blanks data[0] when it consists entirely of Chinese
// characters, punctuation and whitespace.
//
// data[0] is formatted with fmt.Sprint. If the whole text matches allWordReg
// it is replaced by the empty string; otherwise the text is kept. Either way
// the string form is stored back into data[0] and the same slice is returned.
// An empty slice is returned unchanged.
func ClearAllWord(data []interface{}) []interface{} {
	if len(data) == 0 {
		return data
	}
	data[0] = allWordReg.ReplaceAllString(fmt.Sprint(data[0]), "")
	return data
}
|