words.go 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. package util
  2. import (
  3. "encoding/json"
  4. "log"
  5. "regexp"
  6. "strings"
  7. "unicode"
  8. "unicode/utf8"
  9. )
  10. func dealWithNameScoreRules(name string) (string, bool, float64 , []map[string]interface{}) {
  11. new_name, new_score, isok := "", float64(0), false
  12. old_name := escapeNew(name)
  13. if old_name == "" {
  14. return "", false, new_score ,nil
  15. }
  16. //标准分:
  17. standard_score := float64(4.0)
  18. query_name := old_name
  19. endstr := endWordReg.FindString(query_name)
  20. if endstr !="" {
  21. standard_score = float64(3.0)
  22. query_name = strings.ReplaceAll(query_name,endstr,"")
  23. }
  24. query := `{"query":{"bool":{"must":[{"query_string":{"default_field":"unique_qy.name_word","query":"` + query_name + `"}}],"must_not":[],"should":[]}},"from":"0","size":"200"}`
  25. tmp := make(map[string]interface{})
  26. json.Unmarshal([]byte(query), &tmp)
  27. searchResult, err := Client_Es.Search().Index(es_index).Type(es_type).Source(tmp).Do()
  28. if err != nil {
  29. log.Println("ES查询出错", name, old_name,err)
  30. return "", false,new_score, nil
  31. }
  32. if searchResult.Hits!= nil{
  33. resNum := len(searchResult.Hits.Hits)
  34. res := make([]map[string]interface{}, resNum)
  35. if searchResult.Hits != nil {
  36. if resNum < 1000 {
  37. for i, hit := range searchResult.Hits.Hits {
  38. data := make(map[string]interface{}, 0)
  39. json.Unmarshal(*hit.Source, &data)
  40. res[i] = map[string]interface{}{
  41. "name": data["name"],
  42. "score": *hit.Score,
  43. }
  44. }
  45. } else {
  46. log.Println("查询结果太多,查询到:", resNum, "条")
  47. }
  48. }
  49. if len(res) > 0 {
  50. //针对特殊情况-特殊处理 公司结尾
  51. new_name = ObjToString(res[0]["name"])
  52. new_score = Float64All(res[0]["score"])
  53. if endComReg.MatchString(name) {
  54. new_name,new_score = dealWithSpecialName(name,res)
  55. if new_name=="" {
  56. new_name = ObjToString(res[0]["name"])
  57. new_score = Float64All(res[0]["score"])
  58. }
  59. }
  60. }
  61. if new_name != "" { //分析hit比例
  62. total, hit := dealWithWordsRules(name, new_name)
  63. proportion := float64(hit) / float64(total)
  64. if proportion >= 1.0 {
  65. isok = true
  66. } else {
  67. //前置规则-与分数无关 江苏凤凰出版社--江苏凤凰出版社有限公司
  68. //吉林省彩虹城市建设工程有限公司--吉林彩虹城市建设工程有限公司
  69. if dealWithPreRule(query_name,new_name) {
  70. return new_name, true,new_score, res
  71. }
  72. if float64(hit)/float64(total) >= 0.8 && new_score > standard_score {
  73. str1,str2:=startWordReg_1.FindString(name),startWordReg_1.FindString(new_name)
  74. if str1!="" && str2!="" {
  75. if strings.Contains(str1,str2)||strings.Contains(str2,str1) {
  76. }else {
  77. return new_name, false,new_score, res
  78. }
  79. }
  80. str1,str2 = startWordReg_2.FindString(name),startWordReg_2.FindString(new_name)
  81. if str1!="" && str2!=""{
  82. if str1 != str2 {
  83. return new_name, false,new_score, res
  84. }
  85. }
  86. isok = true
  87. }else {}
  88. }
  89. }
  90. return new_name, isok,new_score, res
  91. }
  92. return new_name,isok,new_score,nil
  93. }
  94. func dealWithPreRule(name string , new_name string) bool {
  95. //log.Println("规则时:",name,new_name)
  96. endstr := endWordReg.FindString(new_name)
  97. if endstr !="" {
  98. new_name = strings.ReplaceAll(new_name,endstr,"")
  99. }
  100. if name==new_name {
  101. return true
  102. }
  103. //去掉开头 全程简称
  104. str1,str2 := startWordReg_2.FindString(name),startWordReg_2.FindString(new_name)
  105. if str1!="" && str2!="" && str1==str2 {
  106. //在清全程简称
  107. start_str_all_1 := startWordReg_3.FindString(name)
  108. if start_str_all_1 !="" {
  109. name = strings.ReplaceAll(name,start_str_all_1,"")
  110. }else {
  111. start_str_sim_1 := startWordReg_2.FindString(name)
  112. name = strings.ReplaceAll(name,start_str_sim_1,"")
  113. }
  114. start_str_all_2 := startWordReg_3.FindString(new_name)
  115. if start_str_all_2 !="" {
  116. new_name = strings.ReplaceAll(new_name,start_str_all_2,"")
  117. }else {
  118. start_str_sim_2 := startWordReg_2.FindString(new_name)
  119. new_name = strings.ReplaceAll(new_name,start_str_sim_2,"")
  120. }
  121. if name==new_name {
  122. return true
  123. }
  124. }
  125. //去掉指定维文字
  126. name = strings.ReplaceAll(name,"科技","")
  127. new_name = strings.ReplaceAll(new_name,"科技","")
  128. name = strings.ReplaceAll(name,"建筑工程","")
  129. new_name = strings.ReplaceAll(new_name,"建筑工程","")
  130. name = strings.ReplaceAll(name,"工程","")
  131. new_name = strings.ReplaceAll(new_name,"工程","")
  132. name = strings.ReplaceAll(name,"标识","")
  133. new_name = strings.ReplaceAll(new_name,"标识","")
  134. name = strings.ReplaceAll(name,"工业","")
  135. new_name = strings.ReplaceAll(new_name,"工业","")
  136. name = strings.ReplaceAll(name,"公司","")
  137. new_name = strings.ReplaceAll(new_name,"公司","")
  138. name = strings.ReplaceAll(name,"(","")
  139. new_name = strings.ReplaceAll(new_name,"(","")
  140. name = strings.ReplaceAll(name,")","")
  141. new_name = strings.ReplaceAll(new_name,")","")
  142. name = strings.ReplaceAll(name,"(","")
  143. new_name = strings.ReplaceAll(new_name,"(","")
  144. name = strings.ReplaceAll(name,")","")
  145. new_name = strings.ReplaceAll(new_name,")","")
  146. name = strings.ReplaceAll(name,"信息技术","信息")
  147. new_name = strings.ReplaceAll(new_name,"信息技术","信息")
  148. name = strings.ReplaceAll(name,"电子科技","电子")
  149. new_name = strings.ReplaceAll(new_name,"电子科技","电子")
  150. name = strings.ReplaceAll(name,"电子技术","电子")
  151. new_name = strings.ReplaceAll(new_name,"电子技术","电子")
  152. name = strings.ReplaceAll(name,"建设集团","建设")
  153. new_name = strings.ReplaceAll(new_name,"建设集团","建设")
  154. //log.Println("最终清理后-",name,new_name)
  155. if name==new_name {
  156. return true
  157. }
  158. return false
  159. }
  160. //击中数量以及比例
  161. func dealWithWordsRules(info_name string, source_name string) (int, int) {
  162. total, hit := 0, 0
  163. //字符串处理,替换指定字符
  164. source_name = strings.ReplaceAll(source_name, "(微型企业)", "")
  165. source_name = strings.ReplaceAll(source_name, "(有限合伙)", "")
  166. source_name = strings.ReplaceAll(source_name, "(普通合伙)", "")
  167. info_name = strings.ReplaceAll(info_name, "(", "")
  168. info_name = strings.ReplaceAll(info_name, ")", "")
  169. info_name = strings.ReplaceAll(info_name, "(", "")
  170. info_name = strings.ReplaceAll(info_name, ")", "")
  171. info_name = strings.ReplaceAll(info_name, "〉", "")
  172. source_name = strings.ReplaceAll(source_name, "(", "")
  173. source_name = strings.ReplaceAll(source_name, ")", "")
  174. source_name = strings.ReplaceAll(source_name, "(", "")
  175. source_name = strings.ReplaceAll(source_name, ")", "")
  176. source_name = strings.ReplaceAll(source_name, "〉", "")
  177. nameArr, _ := calculateWordCount(info_name)
  178. _, total = calculateWordCount(source_name)
  179. for _, v1 := range nameArr {
  180. if strings.Contains(source_name, v1) {
  181. hit++
  182. }
  183. }
  184. return total, hit
  185. }
  186. //分词结果
  187. func calculateWordCount(name string) ([]string, int) {
  188. arr, space := make([]string, 0), 2
  189. total := utf8.RuneCountInString(name) - (space - 1)
  190. if name == "" || total <= 0 {
  191. return arr, 0
  192. }
  193. nameRune := []rune(name)
  194. for i := 0; i < total; i++ {
  195. new_str := string(nameRune[i : space+i])
  196. arr = append(arr, new_str)
  197. }
  198. return arr, len(arr)
  199. }
  200. //func escape(s string) string {
  201. // news := ""
  202. // s = strings.ReplaceAll(s," ","")
  203. // for _, c := range s {
  204. // //if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) {
  205. // // news = news + string(c)
  206. // //}else if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '"' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
  207. // // a := string([]rune{os.PathSeparator, '\\'})
  208. // // news = news + a + string(c)
  209. // //} else {
  210. // // return ""
  211. // //}
  212. // if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
  213. // a := string([]rune{os.PathSeparator,'\\'})
  214. // //news = news + a + `\` + string(c)
  215. // news = news + a + string(c)
  216. // } else {
  217. // news = news + string(c)
  218. // }
  219. //
  220. // }
  221. // return news
  222. //}
  223. func escapeNew(s string) string {
  224. news := ""
  225. s = strings.ReplaceAll(s, " ", "")
  226. for _, c := range s {
  227. if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) {
  228. news = news + string(c)
  229. }
  230. }
  231. return news
  232. }
  233. //处理特殊情况-分析最终
  234. func dealWithSpecialName(name string,res []map[string]interface{})(string ,float64) {
  235. //log.Println("特殊...")
  236. new_name ,new_score,proportion := "",float64(0),float64(0)
  237. for k,v:=range res {
  238. tmp_name := ObjToString(v["name"])
  239. tmp_score := Float64All(v["score"])
  240. if k>=20 || tmp_score<3.0 {
  241. break
  242. }
  243. if endComReg.MatchString(tmp_name) {
  244. if endFComReg.MatchString(name) {
  245. if !endFComReg.MatchString(tmp_name) {
  246. continue
  247. }
  248. }else {
  249. if endFComReg.MatchString(tmp_name) {
  250. continue
  251. }
  252. }
  253. }else {
  254. continue
  255. }
  256. total, hit := dealWithWordsRules(tmp_name, name)
  257. tmp_proportion := float64(hit) / float64(total)
  258. //log.Println(tmp_proportion,tmp_name,name)
  259. if tmp_proportion > proportion {
  260. proportion = tmp_proportion
  261. new_name = tmp_name
  262. new_score = tmp_score
  263. }
  264. }
  265. return new_name,float64(new_score)
  266. }
  267. var endComReg *regexp.Regexp = regexp.MustCompile("(公司)$")
  268. var endFComReg *regexp.Regexp = regexp.MustCompile("(分公司)$")