// Package clear provides helpers that strip whitespace, HTML entities,
// stray punctuation and unbalanced brackets from extracted text values.
//
// Most helpers take and return a []interface{} whose element 0 is the value
// being cleaned (formatted with fmt.Sprint); element 1, when present, is
// passed through untouched.
package clear

import (
	"fmt"
	"regexp"
	"strings"
)

var (
	// cutSpace matches leading or trailing whitespace.
	cutSpace = regexp.MustCompile(`^\s*|\s*$`)
	// cutAllSpace matches any run of whitespace (and the empty string).
	cutAllSpace = regexp.MustCompile(`\s*`)
	// catSymbol is nil: see the note in init.
	catSymbol *regexp.Regexp
)

// spaces lists whitespace-like characters handed to replaceSymbol in
// addition to what the \s based patterns already cover.
var spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n"}

func init() {
	// NOTE(review): `[]+` is an unterminated character class, so Compile
	// fails and catSymbol stays nil. The characters inside the class were
	// presumably lost at some point; restore them before using catSymbol.
	// The error is deliberately ignored to keep the package loadable.
	catSymbol, _ = regexp.Compile(`[]+`)
}

// LableStr is a pattern source describing HTML entities and tags.
var LableStr = "&?(amp|nbsp|#8266);?|(<).*?(>?)"

var at = rune('&')
var ed = rune(';')

// maxLableLen is the longest entity candidate, in runes, that is still
// looked up in lableMap ("&nbsp;" is exactly this long).
const maxLableLen = 6

// lableMap maps a complete HTML entity ('&' through ';') to the rune it
// stands for. Keys must include the delimiters because CutLableStr looks up
// the whole candidate, delimiters included.
var lableMap = map[string]rune{
	"&amp;":  rune('&'),
	"&nbsp;": rune(' '),
	"&gt;":   rune('>'),
	"&lt;":   rune('<'),
}

// CutLableStr replaces the HTML entities listed in lableMap with their
// characters. Decoding is repeated up to three times so that doubly escaped
// input such as "&amp;lt;" ends up as "<"; it stops early once a pass
// changes nothing.
func CutLableStr(con string) string {
	for i := 0; i < 3; i++ {
		decoded := decodeLableOnce(con)
		if decoded == con {
			break
		}
		con = decoded
	}
	return con
}

// decodeLableOnce performs a single left-to-right entity decoding pass.
func decodeLableOnce(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	// pending holds the runes of an entity candidate that started with '&'
	// and has not yet been terminated by ';'.
	pending := make([]rune, 0, maxLableLen+1)
	for _, r := range s {
		if r == at {
			// A new '&' starts a new candidate; whatever was pending
			// cannot be an entity and is emitted verbatim.
			out.WriteString(string(pending))
			pending = append(pending[:0], r)
			continue
		}
		if len(pending) == 0 {
			out.WriteRune(r)
			continue
		}
		pending = append(pending, r)
		switch {
		case r == ed:
			if decodedRune, ok := lableMap[string(pending)]; ok {
				out.WriteRune(decodedRune)
			} else {
				out.WriteString(string(pending))
			}
			pending = pending[:0]
		case len(pending) > maxLableLen:
			// Too long to be a known entity: give up on the candidate.
			out.WriteString(string(pending))
			pending = pending[:0]
		}
	}
	// Emit an unterminated candidate at end of input instead of dropping it.
	out.WriteString(string(pending))
	return out.String()
}

// cutSpaceString trims leading and trailing whitespace from s and then
// passes the result through replaceSymbol together with spaces.
func cutSpaceString(s string) string {
	// NOTE(review): as written this replaces a space with a space. The first
	// argument was presumably a non-breaking space or "&nbsp;" - confirm
	// against the upstream source before changing it.
	s = strings.Replace(s, " ", " ", -1)
	s = cutSpace.ReplaceAllString(s, "")
	return replaceSymbol(s, spaces)
}

// CutSpace removes leading and trailing whitespace from data[0]. data[0] is
// overwritten with the cleaned string and data itself is returned. An empty
// data slice is returned unchanged.
func CutSpace(data []interface{}) []interface{} {
	if len(data) == 0 {
		return data
	}
	data[0] = cutSpaceString(fmt.Sprint(data[0]))
	return data
}

// CutAllSpace removes every whitespace character from data[0]. data[0] is
// overwritten with the cleaned string and data itself is returned. An empty
// data slice is returned unchanged.
func CutAllSpace(data []interface{}) []interface{} {
	if len(data) == 0 {
		return data
	}
	tmp := cutAllSpace.ReplaceAllString(fmt.Sprint(data[0]), "")
	data[0] = replaceSymbol(tmp, spaces)
	return data
}

// cutSymbolChars is the punctuation stripped from both ends by CutSymbol.
const cutSymbolChars = ",,;;::'\"“”。.\\??、/+=\\_—*&……\\^%$¥@!!`~·"

var (
	// cutSymbolLeadReg matches a leading run of punctuation or closing brackets.
	cutSymbolLeadReg = regexp.MustCompile("^[" + ")\\)>》】\\]}}〕" + cutSymbolChars + "]+")
	// cutSymbolTailReg matches a trailing run of punctuation or opening brackets.
	cutSymbolTailReg = regexp.MustCompile("[" + "(\\(<《【\\[{{〔" + cutSymbolChars + "]+$")
)

// CutSymbol trims whitespace from data[0] (in place, via CutSpace), then
// strips leading punctuation/closing brackets and trailing
// punctuation/opening brackets, and trims whitespace once more.
//
// It returns a new slice holding the cleaned string followed by data[1] when
// data has a second element. An empty data slice is returned unchanged.
func CutSymbol(data []interface{}) []interface{} {
	if len(data) == 0 {
		return data
	}
	value := fmt.Sprint(CutSpace(data)[0])
	value = cutSymbolLeadReg.ReplaceAllString(value, "")
	value = cutSymbolTailReg.ReplaceAllString(value, "")
	value = cutSpaceString(value)
	if len(data) < 2 {
		return []interface{}{value}
	}
	return []interface{}{value, data[1]}
}

// notPrsMaxDepth bounds the number of clean-up rounds in childCutNotPrs.
const notPrsMaxDepth = 50

// notPrsPairs lists the opening/closing bracket classes whose counts are
// compared by trimUnpaired, in the order they are applied.
//
// NOTE(review): the "<" and ">" entries use the same class for open and
// close, so their counts are always equal and they never trim anything;
// they were presumably meant to be a single "<"/">" pair - confirm.
var notPrsPairs = []struct {
	open, close *regexp.Regexp
}{
	{regexp.MustCompile("[((]"), regexp.MustCompile("[))]")},
	{regexp.MustCompile("[\\[【]"), regexp.MustCompile("[\\]】]")},
	{regexp.MustCompile("[{{]"), regexp.MustCompile("[}}]")},
	{regexp.MustCompile("[<《]"), regexp.MustCompile("[<《]")},
	{regexp.MustCompile("[>》]"), regexp.MustCompile("[>》]")},
	{regexp.MustCompile("〔"), regexp.MustCompile("〕")},
}

var (
	// notPrsOpenTailReg matches an opening bracket that is followed, up to
	// the end of the string, by no closing bracket.
	notPrsOpenTailReg = regexp.MustCompile("[((\\[【{{〔<《][^))\\]】}}〕>》]*$")
	// notPrsCloseHeadReg matches a closing bracket that is preceded, from the
	// start of the string, by no opening bracket.
	notPrsCloseHeadReg = regexp.MustCompile("^[^((\\[【{{〔<《]*[))\\]】}}〕>》]")
)

// CutNotPrs removes the text around brackets that do not appear in matching
// pairs in data[0]. See childCutNotPrs for details.
func CutNotPrs(data []interface{}) []interface{} {
	return childCutNotPrs(data, 1)
}

// childCutNotPrs cleans unbalanced brackets out of data[0] and returns data.
//
// count is the current round number; once it reaches notPrsMaxDepth, or when
// the value is empty, data is returned without being modified. Otherwise
// data[0] is overwritten with the cleaned string. Each round first trims by
// bracket counts (trimUnpaired) and then, if a dangling opener remains at
// the tail or a dangling closer at the head, removes those spans and runs
// another round. An empty data slice is returned unchanged.
func childCutNotPrs(data []interface{}, count int) []interface{} {
	if len(data) == 0 {
		return data
	}
	value := fmt.Sprint(data[0])
	if count >= notPrsMaxDepth || value == "" {
		return data
	}
	for round := count; round < notPrsMaxDepth && value != ""; round++ {
		value = trimUnpaired(value)
		// Handle interleaved brackets that equal counts cannot detect.
		if !notPrsOpenTailReg.MatchString(value) && !notPrsCloseHeadReg.MatchString(value) {
			break
		}
		value = notPrsOpenTailReg.ReplaceAllString(value, "")
		value = notPrsCloseHeadReg.ReplaceAllString(value, "")
	}
	data[0] = value
	return data
}

// trimUnpaired compares, for every entry of notPrsPairs, how many openers and
// closers value contains. With surplus openers it keeps only the text after
// the first opener that has no closer to match by count; with surplus
// closers it keeps only the text before the first surplus closer.
func trimUnpaired(value string) string {
	for _, pair := range notPrsPairs {
		opens := pair.open.FindAllStringIndex(value, -1)
		closes := pair.close.FindAllStringIndex(value, -1)
		switch {
		case len(opens) > len(closes):
			// Drop everything up to and including the surplus opener.
			value = value[opens[len(closes)][1]:]
		case len(opens) < len(closes):
			// Drop the surplus closer and everything after it.
			value = value[:closes[len(opens)][0]]
		}
	}
	return value
}

// allWordReg matches a string made up solely of CJK ideographs
// (U+4E00-U+9FA5), the listed punctuation and whitespace.
var allWordReg = regexp.MustCompile("^[\u4e00-\u9fa5、,,.。??'\"“”‘’·~!@#¥$%…&*()()\\-—+=【】\\[\\]{}{}<>《》|\\/\\s]+$")

// ClearAllWord blanks data[0] when it consists entirely of Chinese
// characters, the listed punctuation and whitespace; any other value is
// kept (as its fmt.Sprint string). data itself is returned. An empty data
// slice is returned unchanged.
func ClearAllWord(data []interface{}) []interface{} {
	if len(data) == 0 {
		return data
	}
	data[0] = allWordReg.ReplaceAllString(fmt.Sprint(data[0]), "")
	return data
}