package clear import ( "fmt" "regexp" "strings" ) var ( cutSpace *regexp.Regexp cutAllSpace *regexp.Regexp catSymbol *regexp.Regexp separateSymbol *regexp.Regexp placeReg *regexp.Regexp ) var spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n"} func init() { cutSpace, _ = regexp.Compile(`^\s*|\s*$`) cutAllSpace, _ = regexp.Compile(`\s*`) catSymbol, _ = regexp.Compile(`[]+`) separateSymbol, _ = regexp.Compile("[\\s\u3000\u2003\u00a0\\n,,、/。|]") placeReg, _ = regexp.Compile("^.*(公司|学(校)?|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体)|工作室)$") } var LableStr = "&?(amp|nbsp|#8266);?|(<).*?(>?)" var at = rune('&') var ed = rune(';') var lableMap = map[string]rune{ "&": rune('&'), " ": rune(' '), ">": rune('>'), "<": rune('<'), } //处理转义标签 func CutLableStr(con string) string { for i := 0; i < 3; i++ { runes := []rune{} pools := []rune{} bpool := false strings.IndexFunc(con, func(s rune) bool { if !bpool && s == at { bpool = true pools = []rune{} } if bpool { pools = append(pools, s) if s == ed { //结束 lb := lableMap[string(pools)] if lb != 0 { runes = append(runes, lb) } else { runes = append(runes, pools...) } bpool = false } else if len(pools) > 6 { bpool = false runes = append(runes, pools...) } } else { runes = append(runes, s) } return false }) str1 := string(runes) if i > 0 && con == str1 { break } con = str1 } return con } //清理开始、结尾的空白字符 func CutSpace(data []interface{}) []interface{} { tmp := cutSpace.ReplaceAllString(strings.Replace(fmt.Sprint(data[0]), " ", " ", -1), "") tmp = replaceSymbol(tmp, spaces) //fmt.Println("cutspace", tmp) data[0] = tmp return data } //清理所有空白符 func CutAllSpace(data []interface{}) []interface{} { tmp := cutAllSpace.ReplaceAllString(fmt.Sprint(data[0]), "") tmp = replaceSymbol(tmp, spaces) data[0] = tmp return data } //清理符号 func CutSymbol(data []interface{}) []interface{} { value := fmt.Sprint(CutSpace(data)[0]) symbol := ",,;;::'\"“”。.\\??、/+=\\_—*&……\\^%$¥@!!`~·" startSymbol := "^[" + ")\\)>》】\\]}}〕" + symbol + "]+" endSymbol := "[" + "(\\(<《【\\[{{〔" + symbol + "]+$" startReg := regexp.MustCompile(startSymbol) endReg := regexp.MustCompile(endSymbol) value = startReg.ReplaceAllString(value, "") value = endReg.ReplaceAllString(value, "") value = fmt.Sprint(CutSpace([]interface{}{value, data[1]})[0]) return []interface{}{value, data[1]} } //不成对出现的符号,把符号后面的内容清理掉 func CutNotPrs(data []interface{}) []interface{} { return childCutNotPrs(data, 1) } //不成对出现的符号,把符号后面的内容清理掉 func childCutNotPrs(data []interface{}, count int) []interface{} { value := fmt.Sprint(data[0]) if count >= 50 || value == "" { return data } startChars := []string{"[((]", "[\\[【]", "[{{]", "[<《]", "〔"} endChars := []string{"[))]", "[\\]】]", "[}}]", "[>》]", "〕"} for k, v := range startChars { sReg := regexp.MustCompile(v) eReg := regexp.MustCompile(endChars[k]) sIndex := sReg.FindAllStringIndex(value, -1) eIndex := eReg.FindAllStringIndex(value, -1) sCount := len(sIndex) eCount := len(eIndex) if sCount == eCount { continue } //清理前面 if sCount > eCount { value = value[sIndex[eCount][1]:] } //清理后面 if sCount < eCount { value = value[:eIndex[sCount][0]] } } //交叉出现情况处理 sReplReg := regexp.MustCompile("[((\\[【{{〔<《][^))\\]】}}〕>》]*$") eReplReg := regexp.MustCompile("^[^((\\[【{{〔<《]*[))\\]】}}〕>》]") if sReplReg.MatchString(value) || eReplReg.MatchString(value) { value = sReplReg.ReplaceAllString(value, "") value = eReplReg.ReplaceAllString(value, "") value = fmt.Sprint(childCutNotPrs([]interface{}{value, data[1]}, count+1)[0]) } data[0] = value return data } //全部是汉字或者特殊符号的情况,清理掉 func ClearAllWord(data []interface{}) []interface{} { value := fmt.Sprint(data[0]) reg := regexp.MustCompile("^[\u4e00-\u9fa5、,,.。??'\"“”‘’·~!@#¥$%…&*()()\\-—+=【】\\[\\]{}{}<>《》|\\/\\s]+$") data[0] = reg.ReplaceAllString(value, "") return data } //中文符号转英文 func ChiToEng(data []interface{}) []interface{} { value := fmt.Sprint(data[0]) startChars := []string{"(", "【", "{", "“", ")", "】", "}", "”"} endChars := []string{"(", "[", "{", "\"", ")", "]", "}", "\""} for i, v := range startChars { sReg := regexp.MustCompile(v) sIndex := sReg.FindAllStringIndex(value, -1) for j := 1; j <= len(sIndex); j++ { value = sReg.ReplaceAllString(value, endChars[i]) } } data[0] = value return data } func ClearBuyerPerson(data []interface{}) []interface{} { value := fmt.Sprint(data[0]) //tmp := []string{} if len([]rune(value)) > 4 { //名字默认最长4 tmp := "" valuearr := separateSymbol.Split(value, -1) length := len(valuearr) for i, v := range valuearr { if v == "" { continue } if i == 0 && placeReg.MatchString(v) { if length == 1 { if len([]rune(v)) >= 4 { tmp = "" } else { tmp = tmp + v } } else { tmp = tmp + v + "-" } } else if len([]rune(v)) <= 4 { if i+1 != length { tmp = tmp + v + "," } else { tmp = tmp + v } } } data[0] = tmp } else { value = separateSymbol.ReplaceAllString(value, "") data[0] = value } return data }