words.go 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. package util
  2. import (
  3. "encoding/json"
  4. "log"
  5. "strings"
  6. "unicode"
  7. "unicode/utf8"
  8. )
  9. func dealWithNameScoreRules(name string) (string, bool, float64 , []map[string]interface{}) {
  10. new_name, new_score, isok := "", float64(0), false
  11. old_name := escapeNew(name)
  12. if old_name == "" {
  13. return "", false, new_score ,nil
  14. }
  15. //标准分:
  16. standard_score := float64(4.0)
  17. query_name := old_name
  18. endstr := endWordReg.FindString(query_name)
  19. if endstr !="" {
  20. standard_score = 3.0
  21. query_name = strings.ReplaceAll(query_name,endstr,"")
  22. }
  23. query := `{"query":{"bool":{"must":[{"query_string":{"default_field":"unique_qy.name_word","query":"` + query_name + `"}}],"must_not":[],"should":[]}},"from":"0","size":"200"}`
  24. tmp := make(map[string]interface{})
  25. json.Unmarshal([]byte(query), &tmp)
  26. searchResult, err := Client_Es.Search().Index(es_index).Type(es_type).Source(tmp).Do()
  27. if err != nil {
  28. log.Println("ES查询出错", name, old_name,err)
  29. return "", false,new_score, nil
  30. }
  31. if searchResult.Hits!= nil{
  32. resNum := len(searchResult.Hits.Hits)
  33. res := make([]map[string]interface{}, resNum)
  34. if searchResult.Hits != nil {
  35. if resNum < 1000 {
  36. for i, hit := range searchResult.Hits.Hits {
  37. data := make(map[string]interface{}, 0)
  38. json.Unmarshal(*hit.Source, &data)
  39. res[i] = map[string]interface{}{
  40. "name": data["name"],
  41. "score": *hit.Score,
  42. }
  43. }
  44. } else {
  45. log.Println("查询结果太多,查询到:", resNum, "条")
  46. }
  47. }
  48. if len(res) > 0 {
  49. //分析分数...取最大
  50. new_name = ObjToString(res[0]["name"])
  51. new_score = Float64All(res[0]["score"])
  52. }
  53. if new_name != "" { //分析hit比例
  54. total, hit := dealWithWordsRules(name, new_name)
  55. proportion := float64(hit) / float64(total)
  56. if proportion >= 1.0 {
  57. isok = true
  58. } else {
  59. if float64(hit)/float64(total) >= 0.8 && new_score > standard_score {
  60. str1,str2:=startWordReg_1.FindString(name),startWordReg_1.FindString(new_name)
  61. if str1!="" && str2!="" {
  62. if strings.Contains(str1,str2)||strings.Contains(str2,str1) {
  63. }else {
  64. return new_name, false,new_score, res
  65. }
  66. }
  67. str1,str2 = startWordReg_2.FindString(name),startWordReg_2.FindString(new_name)
  68. if str1!="" && str2!=""{
  69. if str1 != str2 {
  70. return new_name, false,new_score, res
  71. }
  72. }
  73. isok = true
  74. }else if new_score > standard_score {
  75. str1,str2:=name,new_name
  76. str1 = strings.ReplaceAll(str1,"责任","")
  77. str2 = strings.ReplaceAll(str2,"责任","")
  78. str1 = strings.ReplaceAll(str1,"有限","")
  79. str2 = strings.ReplaceAll(str2,"有限","")
  80. str1 = strings.ReplaceAll(str1,"科技","")
  81. str2 = strings.ReplaceAll(str2,"科技","")
  82. str1 = strings.ReplaceAll(str1,"工程","")
  83. str2 = strings.ReplaceAll(str2,"工程","")
  84. if str1==str2 {
  85. return new_name, true,new_score, res
  86. }
  87. }else {
  88. }
  89. }
  90. }
  91. return new_name, isok,new_score, res
  92. }
  93. return new_name,isok,new_score,nil
  94. }
  95. //击中数量以及比例
  96. func dealWithWordsRules(info_name string, source_name string) (int, int) {
  97. total, hit := 0, 0
  98. //字符串处理,替换指定字符
  99. source_name = strings.ReplaceAll(source_name, "(微型企业)", "")
  100. source_name = strings.ReplaceAll(source_name, "(有限合伙)", "")
  101. source_name = strings.ReplaceAll(source_name, "(普通合伙)", "")
  102. info_name = strings.ReplaceAll(info_name, "(", "")
  103. info_name = strings.ReplaceAll(info_name, ")", "")
  104. info_name = strings.ReplaceAll(info_name, "(", "")
  105. info_name = strings.ReplaceAll(info_name, ")", "")
  106. source_name = strings.ReplaceAll(source_name, "(", "")
  107. source_name = strings.ReplaceAll(source_name, ")", "")
  108. source_name = strings.ReplaceAll(source_name, "(", "")
  109. source_name = strings.ReplaceAll(source_name, ")", "")
  110. nameArr, _ := calculateWordCount(info_name)
  111. _, total = calculateWordCount(source_name)
  112. for _, v1 := range nameArr {
  113. if strings.Contains(source_name, v1) {
  114. hit++
  115. }
  116. }
  117. return total, hit
  118. }
  119. //分词结果
  120. func calculateWordCount(name string) ([]string, int) {
  121. arr, space := make([]string, 0), 2
  122. total := utf8.RuneCountInString(name) - (space - 1)
  123. if name == "" || total <= 0 {
  124. return arr, 0
  125. }
  126. nameRune := []rune(name)
  127. for i := 0; i < total; i++ {
  128. new_str := string(nameRune[i : space+i])
  129. arr = append(arr, new_str)
  130. }
  131. return arr, len(arr)
  132. }
  133. //func escape(s string) string {
  134. // news := ""
  135. // s = strings.ReplaceAll(s," ","")
  136. // for _, c := range s {
  137. // //if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) {
  138. // // news = news + string(c)
  139. // //}else if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '"' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
  140. // // a := string([]rune{os.PathSeparator, '\\'})
  141. // // news = news + a + string(c)
  142. // //} else {
  143. // // return ""
  144. // //}
  145. // if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
  146. // a := string([]rune{os.PathSeparator,'\\'})
  147. // //news = news + a + `\` + string(c)
  148. // news = news + a + string(c)
  149. // } else {
  150. // news = news + string(c)
  151. // }
  152. //
  153. // }
  154. // return news
  155. //}
  156. func escapeNew(s string) string {
  157. news := ""
  158. s = strings.ReplaceAll(s, " ", "")
  159. for _, c := range s {
  160. if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) {
  161. news = news + string(c)
  162. }
  163. }
  164. return news
  165. }