# -*- coding: utf-8 -*-
# @Time : 2023/3/3 14:04
# @Author : lkj
import json
import re
from pathlib import Path

import jieba
from LAC import LAC
# Register '等保' in jieba's dictionary so the segmenter can emit it as one
# word. This runs once, at import time.
jieba.add_word('等保')
class Topic(object):
    """Rule-based cleaner that reduces a title to a subject phrase and a category.

    ``tract`` is the entry point. It chains regex clean-up, fixed-phrase
    removal, LAC-based trimming and jieba stop-word filtering, then looks up
    a category from the tail of the remaining text.

    NOTE(review): judging by the patterns used, the inputs look like
    procurement / tender announcement titles -- confirm with the caller.
    """

    # Category label -> name of the instance attribute holding its keyword
    # lines (goods / works / services).
    _CATEGORY_SOURCES = (('货物', 'hw'), ('工程', 'gcs'), ('服务', 'fws'))

    # If any of these occurs in the last eight characters the category is
    # treated as undecidable.
    _AMBIGUOUS_MARKERS = ('及', '建设', '系统', '升级')

    # Parenthesised qualifiers that are not removed together with their
    # content (their brackets are still stripped by the punctuation pass).
    _KEPT_PARENTHETICALS = ('(勘察)', '(测绘)', '(监理)')

    # Clean-up patterns applied, in this exact order, after the parenthesis
    # pass. Several are greedy prefixes/suffixes, so the order matters.
    _CLEANUP_PATTERNS = (
        r'\[.*?\]',
        # Full-width parentheses. The previous pattern here was '(.*?)', an
        # unescaped capture group that only ever matched the empty string and
        # therefore removed nothing.
        r'（.*?）',
        r'.*大楼',
        r'.*号楼',
        r'\d{4}年\d{1,2}至\d{1,2}月',
        # NOTE: the class [a-zA\--Z0-9_] contains the range '-'..'Z', so in
        # addition to letters, digits and '_' it also accepts . / : ; < = > ? @
        # It is kept unchanged so existing matches are not narrowed.
        r'.*?-竞争性磋商-[a-zA\--Z0-9_]{4,20}',
        r'.*?-竞争性谈判-[a-zA\--Z0-9_]{4,20}',
        r'.*?-公开招标-[a-zA\--Z0-9_]{4,20}',
        r'[0-9]{1,9}年度',
        r'[0-9]{4,9}年',
        r'[0-9]{1,2}月',
        r'[0-9]{1,2}日',
        # Punctuation. The trailing full-width forms were added because the
        # original class listed several ASCII characters twice, which
        # suggests the full-width variants were meant to be covered too.
        r'[!#%&()*+,/\-·$¥:;|=?@\t—★【】《》、\[^_`{|}~！（）：；，？]',
        r'[a-zA-Z0-9_]{5,30}',
        r'工字.*',
        r'.*县',
        r'.*委员会',
        r'第[0-9]{0,4}包',
        r'.*村委会',
        r'.*州界',
        r'.*大学',
        r'.*学院',
        r'20[0-9]{2}级',
        r'20[0-9]{2}',
    )

    # Tokens removed from the very start of the cleaned text.
    _LEADING_CONNECTIVES = ('及', '至', '和', '与', '所', '并')

    def __init__(self):
        """Load the LAC tagger and the rule dictionaries.

        All data files are opened as ``./data/...``, i.e. relative to the
        current working directory, not relative to ``base_dir``.
        """
        self.base_dir = Path(__file__).resolve().parent.parent
        self.lac = LAC(mode='lac')
        self.lac.load_customization('./data/lac_dict.txt')
        # Stop words are stored without their trailing newline.
        self.stopwords = [
            line.replace('\n', '')
            for line in self._read_lines('./data/stopwords_topic.txt')
        ]
        # The remaining lists keep the raw lines; consumers strip '\n'.
        self.stopcontent = self._read_lines('./data/stoptext.txt')
        self.hw = self._read_lines('./data/hw.txt')
        self.gcs = self._read_lines('./data/gc.txt')
        self.fws = self._read_lines('./data/fw.txt')

    @staticmethod
    def _read_lines(path):
        """Return the raw lines of a UTF-8 text file, closing it afterwards."""
        with open(path, 'r', encoding='utf-8') as f:
            return f.readlines()

    def classify(self, text):
        """Derive the category from the tail of ``text``.

        :param text: cleaned title text
        :return: a list holding exactly one of '货物', '工程', '服务', or an
            empty list when nothing matches, several categories match, or an
            ambiguous marker occurs in the last eight characters
        """
        tail = text[-4:]
        matched = set()
        for label, attr in self._CATEGORY_SOURCES:
            for raw in getattr(self, attr):
                keyword = raw.replace('\n', '')
                # A blank dictionary line would yield '' which is "in" every
                # string and would match every title, so skip it.
                if keyword and keyword in tail:
                    matched.add(label)
                    break
        if any(marker in text[-8:] for marker in self._AMBIGUOUS_MARKERS):
            return []
        if len(matched) != 1:
            return []
        return list(matched)

    def lac_cut(self, text):
        """Trim ``text`` using LAC tags.

        Everything up to and including the last token tagged PER, LOC or ORG
        is dropped. Of the remaining tokens, those tagged 'w', 't' or 'ns'
        are dropped as well, unless both neighbours are tagged 'm' (e.g. a
        decimal point between two numbers).

        :param text: text to trim
        :return: the surviving tokens joined into one string
        """
        result = self.lac.run(text)
        words, tags = result[0], result[1]

        entity_positions = [
            i for i, tag in enumerate(tags) if tag in ('PER', 'LOC', 'ORG')
        ]
        if entity_positions:
            cut = entity_positions[-1] + 1
            words, tags = words[cut:], tags[cut:]

        last = len(tags) - 1
        kept = []
        for i, tag in enumerate(tags):
            if tag in ('w', 't', 'ns'):
                # Neighbour indexes are clamped to the sequence bounds.
                prev_tag = tags[max(i - 1, 0)]
                next_tag = tags[min(i + 1, last)]
                # Both sides must be numbers. The previous condition
                # `tags[start] and tags[end] == 'm'` only tested the right
                # side because a non-empty tag is always truthy.
                if prev_tag == 'm' and next_tag == 'm':
                    kept.append(words[i])
                continue
            kept.append(words[i])
        return ''.join(kept)

    @staticmethod
    def re_process(text):
        """Apply the regex clean-up rules to ``text``.

        :param text: raw title
        :return: the title with package numbers, bracketed notes, dates,
            codes, punctuation and several greedy prefixes removed
        """
        text = re.sub(r'第.*?包', '', text)
        # Remove ASCII-parenthesised notes together with their content,
        # except for the whitelisted qualifiers.
        for group in re.findall(r'\(.*?\)', text):
            if group not in Topic._KEPT_PARENTHETICALS:
                text = text.replace(group, '')
        for pattern in Topic._CLEANUP_PATTERNS:
            text = re.sub(pattern, '', text)
        return text

    def stop_word(self, text: str):
        """Remove stop words.

        :param text: text to filter
        :return: ``text`` re-joined from its jieba tokens, without the tokens
            listed in ``self.stopwords``
        """
        return ''.join(
            token for token in jieba.lcut(text) if token not in self.stopwords
        )

    def stop_content(self, text: str):
        """Remove fixed phrases by plain substring replacement.

        This is used for phrases that a tokenizer might split incorrectly
        (e.g. 重采购, 重招标), so they are removed before tokenization.

        :param text: text to filter
        :return: ``text`` without any phrase listed in ``self.stopcontent``
        """
        for raw in self.stopcontent:
            phrase = raw.replace('\n', '')
            if phrase:
                text = text.replace(phrase, '')
        return text

    def tract(self, text):
        """Run the whole pipeline on one title.

        :param text: raw title
        :return: ``(cleaned_text, category)``; ``category`` is '' when it
            cannot be decided. On any error the problem is printed and
            ``('', '')`` is returned.
        """
        try:
            text = self.re_process(text)  # regex rules
            text = self.stop_content(text)  # fixed phrases
            text = self.lac_cut(text)  # drop PER/LOC/ORG prefix etc.
            text = self.stop_word(text)  # stop words
            cls = ''
            if text:
                # Every connective is one character long, so dropping the
                # first character drops the first token.
                if jieba.lcut(text)[0] in self._LEADING_CONNECTIVES:
                    text = text[1:]
                cls = ''.join(self.classify(text))
            return text, cls
        except Exception as e:
            # Deliberate best-effort boundary: one bad title must not stop
            # the caller.
            print('规则error', e)
            return '', ''
if __name__ == '__main__':
    # Interactive loop: read a title from stdin and print the
    # (cleaned_text, category) tuple returned by Topic.tract.
    t = Topic()
    while True:
        try:
            a = input('>>>>>')
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C: leave the loop instead of dumping a
            # traceback.
            break
        print(t.tract(a))
|