topic_extract.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
  1. # -*- coding: utf-8 -*-
  2. # @Time : 2023/3/3 14:04
  3. # @Author : lkj
  4. import json
  5. import re
  6. from pathlib import Path
  7. import jieba
  8. from LAC import LAC
# Register '等保' in jieba's dictionary at import time so the segmenter used
# by Topic.stop_word / Topic.tract keeps it as a single token.
jieba.add_word('等保')
  10. class Topic(object):
  11. def __init__(self):
  12. self.base_dir = Path(__file__).resolve().parent.parent
  13. self.lac = LAC(mode='lac')
  14. self.lac.load_customization('./data/lac_dict.txt')
  15. with open('./data/stopwords_topic.txt', 'r', encoding='utf-8') as f:
  16. stopwords = f.readlines()
  17. self.stopwords = [i.replace('\n', '') for i in stopwords]
  18. with open('./data/stoptext.txt', 'r', encoding='utf-8') as f:
  19. self.stopcontent = f.readlines()
  20. self.hw = open('./data/hw.txt', 'r', encoding='utf-8').readlines()
  21. self.gcs = open('./data/gc.txt', 'r', encoding='utf-8').readlines()
  22. self.fws = open('./data/fw.txt', 'r', encoding='utf-8').readlines()
  23. def classify(self, text):
  24. """
  25. 分类规则
  26. :param text:
  27. :return:
  28. """
  29. class_name = []
  30. flag = text[-4::]
  31. for good in self.hw: # 货物
  32. good = good.replace('\n', '')
  33. if good in flag:
  34. class_name.append('货物')
  35. for gc in self.gcs: # 工程
  36. gc = gc.replace('\n', '')
  37. if gc in flag:
  38. class_name.append('工程')
  39. for fw in self.fws: # 服务
  40. fw = fw.replace('\n', '')
  41. if fw in flag:
  42. class_name.append('服务')
  43. class_name = list(set(class_name))
  44. for i in ['及', '建设', '系统', '升级']: # 不能确定规则
  45. if i in text[-8::]:
  46. class_name.clear()
  47. if len(class_name) > 1:
  48. class_name.clear()
  49. return class_name
  50. def lac_cut(self, text):
  51. """
  52. lac 切除头部数据
  53. :param text:
  54. :return:
  55. """
  56. lac_result = self.lac.run(text)
  57. lac_res = []
  58. index_list = []
  59. for index, pos in enumerate(lac_result[1]):
  60. if pos in ['PER', 'LOC', 'ORG']:
  61. index_list.append(index)
  62. if index_list: # 识别到地点等词性直接去除前边所有
  63. del lac_result[0][0:max(index_list) + 1]
  64. del lac_result[1][0:max(index_list) + 1]
  65. for index, pos in enumerate(lac_result[1]):
  66. if pos in ['w', 't', 'ns', ]: # 判断如果词性保留w词性中‘.’
  67. start = index - 1
  68. if start < 0:
  69. start = 0
  70. end = index + 1
  71. if end == len(lac_result[1]):
  72. end = index
  73. if lac_result[1][start] and lac_result[1][end] == 'm': # 小数点定位
  74. lac_res.append(lac_result[0][index])
  75. continue
  76. lac_res.append(lac_result[0][index])
  77. lac_res = "".join(lac_res)
  78. return lac_res
  79. @staticmethod
  80. def re_process(text):
  81. """
  82. 正则匹配规则
  83. :param text:
  84. :return:
  85. """
  86. text = re.sub('第.*?包', '', text)
  87. re_list2 = re.findall('\(.*?\)', text)
  88. for i in re_list2:
  89. if i not in ['(勘察)', '(测绘)', '(监理)']:
  90. text = text.replace(i, '')
  91. text = re.sub('\[.*?\]', '', text)
  92. text = re.sub('(.*?)', '', text)
  93. text = re.sub('.*大楼', '', text)
  94. text = re.sub('.*号楼', '', text)
  95. text = re.sub(r"\d{4}年\d{1,2}至\d{1,2}月", '', text)
  96. text = re.sub('.*?-竞争性磋商-[a-zA\--Z0-9_]{4,20}', '', text)
  97. text = re.sub('.*?-竞争性谈判-[a-zA\--Z0-9_]{4,20}', '', text)
  98. text = re.sub('.*?-公开招标-[a-zA\--Z0-9_]{4,20}', '', text)
  99. text = re.sub('[0-9]{1,9}年度', '', text)
  100. text = re.sub('[0-9]{4,9}年', '', text)
  101. text = re.sub('[0-9]{1,2}月', '', text)
  102. text = re.sub('[0-9]{1,2}日', '', text)
  103. text = re.sub('[!#%&()*+,/\-·$¥::;;,()|=?@\t—?★【】《》?、!\[\[^_`{|}~]', '', text)
  104. text = re.sub('[a-zA-Z0-9_]{5,30}', '', text)
  105. text = re.sub('工字.*', '', text)
  106. text = re.sub('.*县', '', text)
  107. text = re.sub('.*委员会', '', text)
  108. text = re.sub('第[0-9]{0,4}包', '', text)
  109. text = re.sub('.*村委会', '', text)
  110. text = re.sub('.*州界', '', text)
  111. text = re.sub('.*大学', '', text)
  112. text = re.sub('.*学院', '', text)
  113. text = re.sub('20[0-9]{2}级', '', text)
  114. text = re.sub('20[0-9]{2}', '', text)
  115. return text
  116. def stop_word(self, text: str):
  117. """
  118. 停用词
  119. :param text:
  120. :return:
  121. """
  122. jieba_cut = jieba.lcut(text)
  123. new_text = []
  124. for ind, i in enumerate(jieba_cut):
  125. if i not in self.stopwords:
  126. new_text.append(i)
  127. text = ''.join(new_text)
  128. return text
  129. def stop_content(self, text: str):
  130. """
  131. 停用文本--->当一些固定的词需要切除但是可能会被切词工具切错如:重采购,重招标
  132. :param text:
  133. :return:
  134. """
  135. for sw in self.stopcontent:
  136. sw = sw.replace('\n', '')
  137. if sw in text:
  138. text = text.replace(sw, '')
  139. return text
  140. def tract(self, text):
  141. """
  142. main 函数
  143. :param text:
  144. :return:
  145. """
  146. try:
  147. old_text = text
  148. text = self.re_process(text) # 正则
  149. text = self.stop_content(text) # 停特定文本词汇
  150. text = self.lac_cut(text) # lac去loc,org等词性
  151. text = self.stop_word(text) # 停用词
  152. cls = ''
  153. if text:
  154. if jieba.lcut(text)[0] in ['及', '至', '和', '与', '所', '并']:
  155. text = text[1::]
  156. cls = ''.join(self.classify(text))
  157. # print('类别-->', cls)
  158. return text, cls
  159. except Exception as e:
  160. print('规则error',e)
  161. return '',''
  162. if __name__ == '__main__':
  163. t = Topic()
  164. while True:
  165. a = input('>>>>>')
  166. print(t.tract(a))