123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278 |
- package util
- import (
- "encoding/json"
- "log"
- "strings"
- "unicode"
- "unicode/utf8"
- )
- func dealWithNameScoreRules(name string) (string, bool, float64 , []map[string]interface{}) {
- new_name, new_score, isok := "", float64(0), false
- old_name := escapeNew(name)
- if old_name == "" {
- return "", false, new_score ,nil
- }
- //标准分:
- standard_score := float64(4.0)
- query_name := old_name
- endstr := endWordReg.FindString(query_name)
- if endstr !="" {
- standard_score = 3.0
- query_name = strings.ReplaceAll(query_name,endstr,"")
- }
- query := `{"query":{"bool":{"must":[{"query_string":{"default_field":"unique_qy.name_word","query":"` + query_name + `"}}],"must_not":[],"should":[]}},"from":"0","size":"200"}`
- tmp := make(map[string]interface{})
- json.Unmarshal([]byte(query), &tmp)
- searchResult, err := Client_Es.Search().Index(es_index).Type(es_type).Source(tmp).Do()
- if err != nil {
- log.Println("ES查询出错", name, old_name,err)
- return "", false,new_score, nil
- }
- if searchResult.Hits!= nil{
- resNum := len(searchResult.Hits.Hits)
- res := make([]map[string]interface{}, resNum)
- if searchResult.Hits != nil {
- if resNum < 1000 {
- for i, hit := range searchResult.Hits.Hits {
- data := make(map[string]interface{}, 0)
- json.Unmarshal(*hit.Source, &data)
- res[i] = map[string]interface{}{
- "name": data["name"],
- "score": *hit.Score,
- }
- }
- } else {
- log.Println("查询结果太多,查询到:", resNum, "条")
- }
- }
- if len(res) > 0 {
- //分析分数...取最大
- new_name = ObjToString(res[0]["name"])
- new_score = Float64All(res[0]["score"])
- }
- if new_name != "" { //分析hit比例
- total, hit := dealWithWordsRules(name, new_name)
- proportion := float64(hit) / float64(total)
- if proportion >= 1.0 {
- isok = true
- } else {
- //前置规则-与分数无关 江苏凤凰出版社--江苏凤凰出版社有限公司
- //吉林省彩虹城市建设工程有限公司--吉林彩虹城市建设工程有限公司
- if dealWithPreRule(query_name,new_name) {
- return new_name, true,new_score, res
- }
- if float64(hit)/float64(total) >= 0.8 && new_score > standard_score {
- str1,str2:=startWordReg_1.FindString(name),startWordReg_1.FindString(new_name)
- if str1!="" && str2!="" {
- if strings.Contains(str1,str2)||strings.Contains(str2,str1) {
- }else {
- return new_name, false,new_score, res
- }
- }
- str1,str2 = startWordReg_2.FindString(name),startWordReg_2.FindString(new_name)
- if str1!="" && str2!=""{
- if str1 != str2 {
- return new_name, false,new_score, res
- }
- }
- isok = true
- }else {}
- }
- }
- return new_name, isok,new_score, res
- }
- return new_name,isok,new_score,nil
- }
- func dealWithPreRule(name string , new_name string) bool {
- //log.Println("规则时:",name,new_name)
- endstr := endWordReg.FindString(new_name)
- if endstr !="" {
- new_name = strings.ReplaceAll(new_name,endstr,"")
- }
- if name==new_name {
- return true
- }
- //去掉开头 全程简称
- str1,str2 := startWordReg_2.FindString(name),startWordReg_2.FindString(new_name)
- if str1!="" && str2!="" && str1==str2 {
- //在清全程简称
- start_str_all_1 := startWordReg_3.FindString(name)
- if start_str_all_1 !="" {
- name = strings.ReplaceAll(name,start_str_all_1,"")
- }else {
- start_str_sim_1 := startWordReg_2.FindString(name)
- name = strings.ReplaceAll(name,start_str_sim_1,"")
- }
- start_str_all_2 := startWordReg_3.FindString(new_name)
- if start_str_all_2 !="" {
- new_name = strings.ReplaceAll(new_name,start_str_all_2,"")
- }else {
- start_str_sim_2 := startWordReg_2.FindString(new_name)
- new_name = strings.ReplaceAll(new_name,start_str_sim_2,"")
- }
- if name==new_name {
- return true
- }
- }
- //去掉指定维文字
- name = strings.ReplaceAll(name,"科技","")
- new_name = strings.ReplaceAll(new_name,"科技","")
- name = strings.ReplaceAll(name,"建筑工程","")
- new_name = strings.ReplaceAll(new_name,"建筑工程","")
- name = strings.ReplaceAll(name,"工程","")
- new_name = strings.ReplaceAll(new_name,"工程","")
- name = strings.ReplaceAll(name,"标识","")
- new_name = strings.ReplaceAll(new_name,"标识","")
- name = strings.ReplaceAll(name,"工业","")
- new_name = strings.ReplaceAll(new_name,"工业","")
- name = strings.ReplaceAll(name,"公司","")
- new_name = strings.ReplaceAll(new_name,"公司","")
- name = strings.ReplaceAll(name,"(","")
- new_name = strings.ReplaceAll(new_name,"(","")
- name = strings.ReplaceAll(name,")","")
- new_name = strings.ReplaceAll(new_name,")","")
- name = strings.ReplaceAll(name,"(","")
- new_name = strings.ReplaceAll(new_name,"(","")
- name = strings.ReplaceAll(name,")","")
- new_name = strings.ReplaceAll(new_name,")","")
- name = strings.ReplaceAll(name,"信息技术","信息")
- new_name = strings.ReplaceAll(new_name,"信息技术","信息")
- name = strings.ReplaceAll(name,"电子科技","电子")
- new_name = strings.ReplaceAll(new_name,"电子科技","电子")
- name = strings.ReplaceAll(name,"电子技术","电子")
- new_name = strings.ReplaceAll(new_name,"电子技术","电子")
- name = strings.ReplaceAll(name,"建设集团","建设")
- new_name = strings.ReplaceAll(new_name,"建设集团","建设")
- //log.Println("最终清理后-",name,new_name)
- if name==new_name {
- return true
- }
- return false
- }
- //击中数量以及比例
- func dealWithWordsRules(info_name string, source_name string) (int, int) {
- total, hit := 0, 0
- //字符串处理,替换指定字符
- source_name = strings.ReplaceAll(source_name, "(微型企业)", "")
- source_name = strings.ReplaceAll(source_name, "(有限合伙)", "")
- source_name = strings.ReplaceAll(source_name, "(普通合伙)", "")
- info_name = strings.ReplaceAll(info_name, "(", "")
- info_name = strings.ReplaceAll(info_name, ")", "")
- info_name = strings.ReplaceAll(info_name, "(", "")
- info_name = strings.ReplaceAll(info_name, ")", "")
- info_name = strings.ReplaceAll(info_name, "〉", "")
- source_name = strings.ReplaceAll(source_name, "(", "")
- source_name = strings.ReplaceAll(source_name, ")", "")
- source_name = strings.ReplaceAll(source_name, "(", "")
- source_name = strings.ReplaceAll(source_name, ")", "")
- source_name = strings.ReplaceAll(source_name, "〉", "")
- nameArr, _ := calculateWordCount(info_name)
- _, total = calculateWordCount(source_name)
- for _, v1 := range nameArr {
- if strings.Contains(source_name, v1) {
- hit++
- }
- }
- return total, hit
- }
// calculateWordCount splits name into overlapping two-rune fragments, one
// starting at every rune position that has a successor, and returns the
// fragments together with their count. A name shorter than two runes yields
// an empty (non-nil) slice and 0.
func calculateWordCount(name string) ([]string, int) {
	const window = 2

	count := utf8.RuneCountInString(name) - window + 1
	if count <= 0 {
		return []string{}, 0
	}

	runes := []rune(name)
	fragments := make([]string, 0, count)
	for start := 0; start+window <= len(runes); start++ {
		fragments = append(fragments, string(runes[start:start+window]))
	}
	return fragments, len(fragments)
}
- //func escape(s string) string {
- // news := ""
- // s = strings.ReplaceAll(s," ","")
- // for _, c := range s {
- // //if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) {
- // // news = news + string(c)
- // //}else if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '"' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
- // // a := string([]rune{os.PathSeparator, '\\'})
- // // news = news + a + string(c)
- // //} else {
- // // return ""
- // //}
- // if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
- // a := string([]rune{os.PathSeparator,'\\'})
- // //news = news + a + `\` + string(c)
- // news = news + a + string(c)
- // } else {
- // news = news + string(c)
- // }
- //
- // }
- // return news
- //}
// escapeNew returns s with every rune removed except Han characters, letters
// and numbers. Whitespace, punctuation and symbols are all dropped, so the
// result may be empty.
func escapeNew(s string) string {
	// strings.Map builds the result in a single pass; the previous
	// rune-by-rune string concatenation re-copied the result on every step.
	return strings.Map(func(r rune) rune {
		if unicode.Is(unicode.Han, r) || unicode.IsNumber(r) || unicode.IsLetter(r) {
			return r
		}
		return -1 // a negative value drops the rune
	}, s)
}
|