fengweiqiang 4 năm trước cách đây
mục cha
commit
b7f130fb6c
1 tập tin đã thay đổi với 136 bổ sung và 0 xóa
  1. 136 0
      udpdataclear/udpSensitiveWords/util/words.go

+ 136 - 0
udpdataclear/udpSensitiveWords/util/words.go

@@ -0,0 +1,136 @@
+package util
+
+import (
+	"encoding/json"
+	"log"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+func dealWithNameScoreRules(name string) (string, bool, []map[string]interface{}) {
+	new_name, new_score, isok := "", float64(0), false
+	old_name := escapeNew(name)
+	if old_name == "" {
+		return "", false, nil
+	}
+	query := `{"query":{"bool":{"must":[{"query_string":{"default_field":"unique_qy.name_word","query":"` + old_name + `"}}],"must_not":[],"should":[]}},"from":"0","size":"300"}`
+	tmp := make(map[string]interface{})
+	json.Unmarshal([]byte(query), &tmp)
+	searchResult, err := Client_Es.Search().Index(es_index).Type(es_type).Source(tmp).Do()
+	if err != nil {
+		log.Println("ES查询出错", name, old_name,err)
+		return "", false, nil
+	}
+	if searchResult.Hits!= nil{
+		resNum := len(searchResult.Hits.Hits)
+		res := make([]map[string]interface{}, resNum)
+		if searchResult.Hits != nil {
+			if resNum < 1000 {
+				for i, hit := range searchResult.Hits.Hits {
+					data := make(map[string]interface{}, 0)
+					json.Unmarshal(*hit.Source, &data)
+					res[i] = map[string]interface{}{
+						"name":  data["name"],
+						"score": *hit.Score,
+					}
+				}
+			} else {
+				log.Println("查询结果太多,查询到:", resNum, "条")
+			}
+		}
+		if len(res) > 0 {
+			//分析分数...取最大
+
+			new_name = ObjToString(res[0]["name"])
+			new_score = Float64All(res[0]["score"])
+		}
+		if new_name != "" { //分析hit比例
+			total, hit := dealWithWordsRules(name, new_name)
+			proportion := float64(hit) / float64(total)
+			if proportion >= 1.0 {
+				isok = true
+			} else {
+				if float64(hit)/float64(total) >= 0.8 && new_score > 4.0 {
+					isok = true
+				}
+			}
+		}
+		return new_name, isok, res
+	}
+	return new_name,isok,nil
+}
+
+// dealWithWordsRules reports how many two-rune slices of info_name occur in
+// source_name.
+//
+// Parentheses are removed from both names first. It returns (total, hit):
+// total is the number of two-rune slices in source_name, and hit is the number
+// of two-rune slices of info_name found as substrings of source_name.
+// Repeated slices in info_name are counted each time, so hit can exceed total.
+func dealWithWordsRules(info_name string, source_name string) (int, int) {
+	// Strip ASCII and full-width parentheses in one pass. The full-width forms are
+	// written as escapes so they survive any width normalisation of this file.
+	// NOTE(review): the original repeated the ASCII replacements twice, which
+	// looks like full-width parentheses that had been folded to ASCII — confirm.
+	stripParens := strings.NewReplacer(
+		"(", "",
+		")", "",
+		"\uFF08", "",
+		"\uFF09", "",
+	)
+	info_name = stripParens.Replace(info_name)
+	source_name = stripParens.Replace(source_name)
+
+	nameArr, _ := calculateWordCount(info_name)
+	_, total := calculateWordCount(source_name)
+
+	hit := 0
+	for _, gram := range nameArr {
+		if strings.Contains(source_name, gram) {
+			hit++
+		}
+	}
+	return total, hit
+}
+
+//分词结果
+func calculateWordCount(name string) ([]string, int) {
+	arr, space := make([]string, 0), 2
+	total := utf8.RuneCountInString(name) - (space - 1)
+	if name == "" || total <= 0 {
+		return arr, 0
+	}
+	nameRune := []rune(name)
+	for i := 0; i < total; i++ {
+		new_str := string(nameRune[i : space+i])
+		arr = append(arr, new_str)
+	}
+	return arr, len(arr)
+}
+
+//func escape(s string) string {
+//	news := ""
+//	s = strings.ReplaceAll(s," ","")
+//	for _, c := range s {
+//		//if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) {
+//		//	news = news + string(c)
+//		//}else if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '"' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
+//		//	a := string([]rune{os.PathSeparator, '\\'})
+//		//	news = news + a + string(c)
+//		//} else {
+//		//	return ""
+//		//}
+//		if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
+//			a := string([]rune{os.PathSeparator,'\\'})
+//			//news = news + a + `\` + string(c)
+//			news = news + a  + string(c)
+//		} else {
+//			news = news + string(c)
+//		}
+//
+//	}
+//	return news
+//}
+
+func escapeNew(s string) string {
+	news := ""
+	s = strings.ReplaceAll(s, " ", "")
+	for _, c := range s {
+		if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) {
+			news = news + string(c)
+		}
+	}
+	return news
+}