digital.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. # -*- coding: utf-8 -*-
  2. # @Time : 2023/7/10
  3. # @Author : lkj
  4. # @description : 流程1 只查询
  5. import re
  6. import time
  7. from collections import Counter
  8. from numpy import dot
  9. from numpy.linalg import norm
  10. from utils.milvus_hlper import Milvus
  11. from utils.request_fun import text_to_vector
  12. from config import milvus_config
  13. import numpy as np
  14. m = Milvus(table='zc_classify', **milvus_config)
  15. def cosine_similarity(vector1, vector2):
  16. """
  17. 余弦相似计算
  18. """
  19. dot_product = np.dot(vector1, vector2)
  20. norm1 = np.linalg.norm(vector1)
  21. norm2 = np.linalg.norm(vector2)
  22. similarity = dot_product / (norm1 * norm2)
  23. return round(similarity,4)
  24. def check(text_vec,data_es: list):
  25. '''
  26. 规则打可信度
  27. :param text_vec:
  28. :param data_es:
  29. :return:
  30. '''
  31. try:
  32. sim_list = [i[2] for i in data_es] # 相似度列表
  33. sim_mean = sum(sim_list)/len(sim_list) # 平均相似度
  34. data = [item for item in data_es if item[2] >= sim_mean-0.025] # 删除es结果中差异性大且相似度低的值
  35. count = 3 # 统计个数阈值
  36. if len(data) <= 4:
  37. count = 2
  38. if data[0][2] > 0.965: # 查询结果为正序如果满足极大值或者较大差异直接返回第一个数据
  39. return data[0][1], 'mode01'
  40. if data[0][2] - data[1][2] > 0.025 and data[0][2] > 0.9: # 如果第一个可信度大于第二个0.025并且第一个相似度大于0.9
  41. return data[0][1], 'mode1'
  42. # 新判断当出现多个大于0.9判断每个上一级分类的选取最高的
  43. score_list = [i[1] for i in data_es if i[2] > 0.9]
  44. if len(score_list) >= 2:
  45. best_code = ''
  46. best_sim = 0
  47. for end_code in score_list:
  48. father_code = end_code[:-2] # 父级的id
  49. father_code_name = m.get_name(father_code) # 父级的name
  50. father_vec = text_to_vector(father_code_name)
  51. similarity = round(dot(text_vec, father_vec) / (norm(text_vec) * norm(father_vec)), 4) # 相似度计算
  52. if similarity > best_sim: # 循环查找父级相似度最大的
  53. best_code = end_code
  54. best_sim = similarity
  55. return best_code, 'mode1'
  56. # 第三档的可信度规则
  57. elif data[0][2] - data[1][2] > 0.015 and data[0][2] > 0.9:
  58. pcode = data[0][1][:-2]
  59. pname = m.get_name(pcode)
  60. pvec = text_to_vector(pname)
  61. similarity = round(dot(text_vec, pvec) / (norm(text_vec) * norm(pvec)), 4) # 相似度计算
  62. if similarity > 0.8: # 第一个值的父级相似度大于0.8直接返回该值
  63. return data[0][1], 'mode1'
  64. else:
  65. # 统计整个向量库中返回的数据的出现的频率做规则
  66. code_list = []
  67. for row in data:
  68. if len(row[1][:-2]) > 1:
  69. code_list.append(row[1][:-2])
  70. code_list.append(row[1])
  71. word_count = dict(Counter(code_list)) # 统计对应分类及其父类的词频,如果某个词的父类频率高则定位到该类
  72. max_word = [(k, v) for k, v in word_count.items() if v >= count]
  73. if len(max_word) == 1:
  74. return max_word[0][0], 'mode2'
  75. else: # 如果存在多个值进行对比
  76. code = ''
  77. code_sim = 0
  78. for word in max_word:
  79. code_ = word[0]
  80. code_name = m.get_name(code_)
  81. vec = text_to_vector(code_name)
  82. sim_ = round(dot(text_vec, vec) / (norm(text_vec) * norm(vec)), 4) # 相似度计算
  83. if sim_ > code_sim:
  84. code_sim = sim_
  85. code = code_
  86. return code, 'mode3'
  87. return '', 'mode0'
  88. except Exception as e:
  89. print('check_error', e)
  90. return '', 'error'
  91. def run_mode1(text, baseclass):
  92. """
  93. 标的物数字化主函数
  94. :param text:
  95. :param baseclass
  96. :param es_classify_name
  97. :return: result_name:结果名称, similarity:结果与输入文本相似度, mode:流程模式, code:结果编码, credibility:结果可信度
  98. """
  99. vec = text_to_vector(text) # 转成向量
  100. search_result = m.search_china(vec, baseclass) # 查询结果
  101. if not search_result:
  102. return '', 0, 'mode0','',0
  103. similarity = 0 # 文本与结果相似度
  104. result_name = ''
  105. mode = 'mode0'
  106. credibility = 0
  107. code = ''
  108. if search_result:
  109. check_result = check(vec, search_result) # 结果筛选
  110. if check_result[0]:
  111. code = check_result[0]
  112. result_name = m.get_name(check_result[0]) # 名称映射
  113. mode = check_result[1]
  114. if result_name:
  115. res_vec = text_to_vector(result_name)
  116. similarity = round(dot(vec, res_vec) / (norm(vec) * norm(res_vec)), 4) # 相似度计算
  117. if mode == 'mode01':
  118. credibility = 0.99
  119. if mode == 'mode1':
  120. credibility = 0.95
  121. if mode == 'mode3' and similarity > 0.85:
  122. credibility = 0.90
  123. if mode == 'mode2' and similarity > 0.8:
  124. credibility = 0.85
  125. if mode in ['mode2','mode3']:
  126. pcode = code[:-2]
  127. if not pcode:
  128. pcode = code
  129. p_name = m.get_name(pcode) # 父类名称
  130. p_name_vec = text_to_vector(p_name) # 父类向量
  131. p_similarity = round(dot(p_name_vec, vec) / (norm(p_name_vec) * norm(vec)), 4) # 文本与父类计算
  132. if p_similarity > 0.85 and similarity > 0.9 or similarity == 0.99:
  133. mode = 'mode4'
  134. credibility = 0.99
  135. if credibility == 0:
  136. result_name = search_result[0][0]
  137. similarity = search_result[0][2]
  138. mode = ''
  139. code = search_result[0][1]
  140. return result_name, similarity, mode, code, credibility
  141. def run_mode1_main(text, baseclass=None):
  142. """
  143. """
  144. try:
  145. result = list(run_mode1(text, baseclass))
  146. if not result[0]:
  147. return ['', '', '', 0, '']
  148. result.pop(2)
  149. code = result[2]
  150. route = m.get_root_zc(code)
  151. result.append(route)
  152. if result[1] > 0.9 and result[-2] == 0:
  153. result[-2] = 0.85
  154. if result[1] == 1.0:
  155. result[-2] = 0.99
  156. return result
  157. except Exception as e:
  158. print('政采分类错误--->',e)
  159. return ['', '', '', '', '']
  160. if __name__ == '__main__':
  161. print(run_mode1_main('成型设备'
  162. '','工程'))
  163. exit()
  164. while True:
  165. t = input('输入文本:')
  166. print(run_mode1_main(t))
  167. # exit()
  168. # import pandas as pd
  169. # data = pd.read_csv('./data/test.csv',encoding='utf-8',sep='\t')
  170. # for name in data['name']:
  171. # print('intput--->', name)
  172. # run_result = run_mode1(name)
  173. #
  174. # china_name = run_result[0]
  175. # china_name_code = run_result[3]
  176. # china_name_dis = run_result[1]
  177. # score = 0.99
  178. # root = ''
  179. # stop = 2
  180. # for i in range(0, len(china_name_code), 2):
  181. # root = root + name_maps.get(china_name_code[0:stop], '') + '/'
  182. # stop += 2
  183. # res = [name,china_name,china_name_code,root,run_result[4]]
  184. # print('相似度:',run_result[4])
  185. # with open('data/result3.csv', 'a', newline='', encoding='utf-8') as f:
  186. # witer = csv.writer(f)
  187. # witer.writerow(res)
  188. # print('output--->',run_result[0])
  189. # # print('相似度--->',res[1])
  190. # print('模式--->',run_result[2])
  191. # # print('分类解释:', i[3] + '\n')
  192. # print('*' * 30)