package util import ( "encoding/json" "log" "regexp" "strings" "unicode" "unicode/utf8" ) func dealWithNameScoreRules(name string) (string, bool, float64 , []map[string]interface{}) { new_name, new_score, isok := "", float64(0), false old_name := escapeNew(name) if old_name == "" { return "", false, new_score ,nil } //标准分: standard_score := float64(4.0) query_name := old_name endstr := endWordReg.FindString(query_name) if endstr !="" { standard_score = float64(3.0) query_name = strings.ReplaceAll(query_name,endstr,"") } query := `{"query":{"bool":{"must":[{"query_string":{"default_field":"unique_qy.name_word","query":"` + query_name + `"}}],"must_not":[],"should":[]}},"from":"0","size":"200"}` tmp := make(map[string]interface{}) json.Unmarshal([]byte(query), &tmp) searchResult, err := Client_Es.Search().Index(es_index).Type(es_type).Source(tmp).Do() if err != nil { log.Println("ES查询出错", name, old_name,err) return "", false,new_score, nil } if searchResult.Hits!= nil{ resNum := len(searchResult.Hits.Hits) res := make([]map[string]interface{}, resNum) if searchResult.Hits != nil { if resNum < 1000 { for i, hit := range searchResult.Hits.Hits { data := make(map[string]interface{}, 0) json.Unmarshal(*hit.Source, &data) res[i] = map[string]interface{}{ "name": data["name"], "score": *hit.Score, } } } else { log.Println("查询结果太多,查询到:", resNum, "条") } } if len(res) > 0 { //针对特殊情况-特殊处理 公司结尾 new_name = ObjToString(res[0]["name"]) new_score = Float64All(res[0]["score"]) if endComReg.MatchString(name) { new_name,new_score = dealWithSpecialName(name,res) if new_name=="" { new_name = ObjToString(res[0]["name"]) new_score = Float64All(res[0]["score"]) } } } if new_name != "" { //分析hit比例 total, hit := dealWithWordsRules(name, new_name) proportion := float64(hit) / float64(total) if proportion >= 1.0 { isok = true } else { //前置规则-与分数无关 江苏凤凰出版社--江苏凤凰出版社有限公司 //吉林省彩虹城市建设工程有限公司--吉林彩虹城市建设工程有限公司 if dealWithPreRule(query_name,new_name) { return new_name, true,new_score, res } if float64(hit)/float64(total) >= 0.8 && new_score > standard_score { str1,str2:=startWordReg_1.FindString(name),startWordReg_1.FindString(new_name) if str1!="" && str2!="" { if strings.Contains(str1,str2)||strings.Contains(str2,str1) { }else { return new_name, false,new_score, res } } str1,str2 = startWordReg_2.FindString(name),startWordReg_2.FindString(new_name) if str1!="" && str2!=""{ if str1 != str2 { return new_name, false,new_score, res } } isok = true }else {} } } return new_name, isok,new_score, res } return new_name,isok,new_score,nil } func dealWithPreRule(name string , new_name string) bool { //log.Println("规则时:",name,new_name) endstr := endWordReg.FindString(new_name) if endstr !="" { new_name = strings.ReplaceAll(new_name,endstr,"") } if name==new_name { return true } //去掉开头 全程简称 str1,str2 := startWordReg_2.FindString(name),startWordReg_2.FindString(new_name) if str1!="" && str2!="" && str1==str2 { //在清全程简称 start_str_all_1 := startWordReg_3.FindString(name) if start_str_all_1 !="" { name = strings.ReplaceAll(name,start_str_all_1,"") }else { start_str_sim_1 := startWordReg_2.FindString(name) name = strings.ReplaceAll(name,start_str_sim_1,"") } start_str_all_2 := startWordReg_3.FindString(new_name) if start_str_all_2 !="" { new_name = strings.ReplaceAll(new_name,start_str_all_2,"") }else { start_str_sim_2 := startWordReg_2.FindString(new_name) new_name = strings.ReplaceAll(new_name,start_str_sim_2,"") } if name==new_name { return true } } //去掉指定维文字 name = strings.ReplaceAll(name,"科技","") new_name = strings.ReplaceAll(new_name,"科技","") name = strings.ReplaceAll(name,"建筑工程","") new_name = strings.ReplaceAll(new_name,"建筑工程","") name = strings.ReplaceAll(name,"工程","") new_name = strings.ReplaceAll(new_name,"工程","") name = strings.ReplaceAll(name,"标识","") new_name = strings.ReplaceAll(new_name,"标识","") name = strings.ReplaceAll(name,"工业","") new_name = strings.ReplaceAll(new_name,"工业","") name = strings.ReplaceAll(name,"公司","") new_name = strings.ReplaceAll(new_name,"公司","") name = strings.ReplaceAll(name,"(","") new_name = strings.ReplaceAll(new_name,"(","") name = strings.ReplaceAll(name,")","") new_name = strings.ReplaceAll(new_name,")","") name = strings.ReplaceAll(name,"(","") new_name = strings.ReplaceAll(new_name,"(","") name = strings.ReplaceAll(name,")","") new_name = strings.ReplaceAll(new_name,")","") name = strings.ReplaceAll(name,"信息技术","信息") new_name = strings.ReplaceAll(new_name,"信息技术","信息") name = strings.ReplaceAll(name,"电子科技","电子") new_name = strings.ReplaceAll(new_name,"电子科技","电子") name = strings.ReplaceAll(name,"电子技术","电子") new_name = strings.ReplaceAll(new_name,"电子技术","电子") name = strings.ReplaceAll(name,"建设集团","建设") new_name = strings.ReplaceAll(new_name,"建设集团","建设") //log.Println("最终清理后-",name,new_name) if name==new_name { return true } return false } //击中数量以及比例 func dealWithWordsRules(info_name string, source_name string) (int, int) { total, hit := 0, 0 //字符串处理,替换指定字符 source_name = strings.ReplaceAll(source_name, "(微型企业)", "") source_name = strings.ReplaceAll(source_name, "(有限合伙)", "") source_name = strings.ReplaceAll(source_name, "(普通合伙)", "") info_name = strings.ReplaceAll(info_name, "(", "") info_name = strings.ReplaceAll(info_name, ")", "") info_name = strings.ReplaceAll(info_name, "(", "") info_name = strings.ReplaceAll(info_name, ")", "") info_name = strings.ReplaceAll(info_name, "〉", "") source_name = strings.ReplaceAll(source_name, "(", "") source_name = strings.ReplaceAll(source_name, ")", "") source_name = strings.ReplaceAll(source_name, "(", "") source_name = strings.ReplaceAll(source_name, ")", "") source_name = strings.ReplaceAll(source_name, "〉", "") nameArr, _ := calculateWordCount(info_name) _, total = calculateWordCount(source_name) for _, v1 := range nameArr { if strings.Contains(source_name, v1) { hit++ } } return total, hit } //分词结果 func calculateWordCount(name string) ([]string, int) { arr, space := make([]string, 0), 2 total := utf8.RuneCountInString(name) - (space - 1) if name == "" || total <= 0 { return arr, 0 } nameRune := []rune(name) for i := 0; i < total; i++ { new_str := string(nameRune[i : space+i]) arr = append(arr, new_str) } return arr, len(arr) } //func escape(s string) string { // news := "" // s = strings.ReplaceAll(s," ","") // for _, c := range s { // //if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) { // // news = news + string(c) // //}else if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '"' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' { // // a := string([]rune{os.PathSeparator, '\\'}) // // news = news + a + string(c) // //} else { // // return "" // //} // if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' { // a := string([]rune{os.PathSeparator,'\\'}) // //news = news + a + `\` + string(c) // news = news + a + string(c) // } else { // news = news + string(c) // } // // } // return news //} func escapeNew(s string) string { news := "" s = strings.ReplaceAll(s, " ", "") for _, c := range s { if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) { news = news + string(c) } } return news } //处理特殊情况-分析最终 func dealWithSpecialName(name string,res []map[string]interface{})(string ,float64) { //log.Println("特殊...") new_name ,new_score,proportion := "",float64(0),float64(0) for k,v:=range res { tmp_name := ObjToString(v["name"]) tmp_score := Float64All(v["score"]) if k>=20 || tmp_score<3.0 { break } if endComReg.MatchString(tmp_name) { if endFComReg.MatchString(name) { if !endFComReg.MatchString(tmp_name) { continue } }else { if endFComReg.MatchString(tmp_name) { continue } } }else { continue } total, hit := dealWithWordsRules(tmp_name, name) tmp_proportion := float64(hit) / float64(total) //log.Println(tmp_proportion,tmp_name,name) if tmp_proportion > proportion { proportion = tmp_proportion new_name = tmp_name new_score = tmp_score } } return new_name,float64(new_score) } var endComReg *regexp.Regexp = regexp.MustCompile("(公司)$") var endFComReg *regexp.Regexp = regexp.MustCompile("(分公司)$")