# -*- coding: utf-8 -*-
# @Time : 2023/3/3 14:04
# @Author : lkj
import json
import re
from pathlib import Path

import jieba
from LAC import LAC

jieba.add_word('等保')

# Parenthesised fragments that re_process must keep in the title.
_KEPT_PARENTHESES = ('(勘察)', '(测绘)', '(监理)')

# Patterns removed (replaced by '') by re_process, applied strictly in this
# order after the parenthesis step. The order matters: later patterns see the
# output of earlier ones.
_STRIP_PATTERNS = (
    r'\[.*?\]',
    # NOTE(review): with ASCII parentheses this is a capture group that only
    # ever matches the empty string, so the substitution changes nothing.
    # Presumably it was meant to target full-width brackets - confirm before
    # changing; kept as-is to preserve behaviour.
    r'(.*?)',
    r'.*大楼',
    r'.*号楼',
    r'\d{4}年\d{1,2}至\d{1,2}月',
    # NOTE(review): inside the class below, `\--Z` is parsed as the range
    # '-'..'Z' (which also covers ./:;<=>?@ and digits), not as a literal
    # hyphen plus A-Z. Kept byte-for-byte; verify whether that is intended.
    r'.*?-竞争性磋商-[a-zA\--Z0-9_]{4,20}',
    r'.*?-竞争性谈判-[a-zA\--Z0-9_]{4,20}',
    r'.*?-公开招标-[a-zA\--Z0-9_]{4,20}',
    r'[0-9]{1,9}年度',
    r'[0-9]{4,9}年',
    r'[0-9]{1,2}月',
    r'[0-9]{1,2}日',
    r'[!#%&()*+,/\-·$¥::;;,()|=?@\t—?★【】《》?、!\[\[^_`{|}~]',
    r'[a-zA-Z0-9_]{5,30}',
    r'工字.*',
    r'.*县',
    r'.*委员会',
    r'第[0-9]{0,4}包',
    r'.*村委会',
    r'.*州界',
    r'.*大学',
    r'.*学院',
    r'20[0-9]{2}级',
    r'20[0-9]{2}',
)


class Topic(object):
    """Rule-based cleaner and classifier for title-like text.

    ``tract`` is the entry point: it strips noise from a title with regular
    expressions, stop phrases, LAC part-of-speech tags and jieba stop words,
    then assigns at most one of the categories 货物 / 工程 / 服务 based on the
    keywords found at the end of the cleaned text.
    """

    # LAC tags whose last occurrence marks the end of the prefix to discard.
    _ENTITY_TAGS = ('PER', 'LOC', 'ORG')
    # LAC tags whose tokens are dropped unless they sit between two 'm' tokens.
    _DROPPED_TAGS = ('w', 't', 'ns')
    # Leading tokens removed from the cleaned text before classification.
    _LEADING_CONNECTIVES = ('及', '至', '和', '与', '所', '并')
    # Words near the end of the text that make the category undecidable.
    _UNCERTAIN_MARKERS = ('及', '建设', '系统', '升级')

    def __init__(self):
        """Load the LAC model and every dictionary file under ``./data``.

        All data paths are relative to the current working directory (not to
        ``base_dir``), exactly as written in the path literals below.
        """
        self.base_dir = Path(__file__).resolve().parent.parent
        self.lac = LAC(mode='lac')
        self.lac.load_customization('./data/lac_dict.txt')
        self.stopwords = [
            line.replace('\n', '')
            for line in self._read_lines('./data/stopwords_topic.txt')
        ]
        # The remaining lists keep their trailing newlines; consumers strip
        # them when the entries are used.
        self.stopcontent = self._read_lines('./data/stoptext.txt')
        self.hw = self._read_lines('./data/hw.txt')
        self.gcs = self._read_lines('./data/gc.txt')
        self.fws = self._read_lines('./data/fw.txt')

    @staticmethod
    def _read_lines(path):
        """Return the lines of a UTF-8 text file, closing the file afterwards."""
        with open(path, 'r', encoding='utf-8') as f:
            return f.readlines()

    @staticmethod
    def _has_keyword(fragment, lines):
        """Return True if any non-empty keyword from ``lines`` occurs in ``fragment``."""
        for line in lines:
            keyword = line.replace('\n', '')
            # A blank dictionary line would otherwise match every fragment,
            # because '' is a substring of any string.
            if keyword and keyword in fragment:
                return True
        return False

    def classify(self, text):
        """Classify ``text`` by the keywords found in its last four characters.

        :param text: cleaned title text.
        :return: a list holding exactly one of '货物', '工程', '服务', or an
            empty list when no category matches, more than one matches, or one
            of the uncertainty markers occurs in the last eight characters.
        """
        tail = text[-4:]
        keyword_sets = (
            ('货物', self.hw),   # goods
            ('工程', self.gcs),  # construction works
            ('服务', self.fws),  # services
        )
        matched = [
            label for label, lines in keyword_sets
            if self._has_keyword(tail, lines)
        ]
        if len(matched) != 1:
            return []
        wider_tail = text[-8:]
        if any(marker in wider_tail for marker in self._UNCERTAIN_MARKERS):
            return []
        return matched

    def lac_cut(self, text):
        """Drop the leading entity prefix and unwanted tokens using LAC tags.

        Everything up to and including the last token tagged PER / LOC / ORG
        is removed. Of the remaining tokens, those tagged 'w', 't' or 'ns' are
        dropped unless both neighbouring tokens are tagged 'm' (the case the
        original comment describes as locating a decimal point).

        :param text: text to segment with ``self.lac``.
        :return: the kept tokens joined into one string.
        """
        lac_result = self.lac.run(text)
        words, tags = lac_result[0], lac_result[1]

        entity_positions = [
            pos for pos, tag in enumerate(tags) if tag in self._ENTITY_TAGS
        ]
        if entity_positions:
            # Positions are ascending, so the last one is the maximum.
            cut = entity_positions[-1] + 1
            words, tags = words[cut:], tags[cut:]

        kept = []
        last = len(tags) - 1
        for pos, (word, tag) in enumerate(zip(words, tags)):
            if tag in self._DROPPED_TAGS:
                # Neighbour indices are clamped to the token itself at either
                # end of the sequence, as in the original bounds handling.
                prev_tag = tags[max(pos - 1, 0)]
                next_tag = tags[min(pos + 1, last)]
                # Both neighbours must be 'm'. The previous
                # `tags[start] and tags[end] == 'm'` only tested the truthiness
                # of the left tag, which is always true for a non-empty tag.
                if prev_tag == 'm' and next_tag == 'm':
                    kept.append(word)
                continue
            kept.append(word)
        return ''.join(kept)

    @staticmethod
    def re_process(text):
        """Strip noise from ``text`` with an ordered series of regex rules.

        The rules remove '第…包' fragments, parenthesised fragments other than
        the whitelisted ones, bracketed fragments, everything up to certain
        suffix words, date fragments, punctuation and long alphanumeric runs.

        :param text: raw title text.
        :return: the text after all substitutions.
        """
        text = re.sub(r'第.*?包', '', text)
        # Remove each parenthesised fragment except the whitelisted ones.
        for fragment in re.findall(r'\(.*?\)', text):
            if fragment not in _KEPT_PARENTHESES:
                text = text.replace(fragment, '')
        for pattern in _STRIP_PATTERNS:
            text = re.sub(pattern, '', text)
        return text

    def stop_word(self, text: str):
        """Remove stop words from ``text``.

        :param text: text to segment with jieba.
        :return: the tokens not listed in ``self.stopwords``, joined back
            together without separators.
        """
        # Build the lookup set once per call instead of scanning the list for
        # every token.
        stopwords = set(self.stopwords)
        return ''.join(
            token for token in jieba.lcut(text) if token not in stopwords
        )

    def stop_content(self, text: str):
        """Remove fixed stop phrases from ``text`` by plain substring replacement.

        Used for phrases that must be removed but that the tokenizer might
        split incorrectly (the original docstring cites 重采购 and 重招标).

        :param text: text to clean.
        :return: the text with every phrase from ``self.stopcontent`` removed.
        """
        for line in self.stopcontent:
            phrase = line.replace('\n', '')
            if phrase:
                text = text.replace(phrase, '')
        return text

    def tract(self, text):
        """Run the whole cleaning pipeline and classify the result.

        :param text: raw title text.
        :return: a ``(cleaned_text, category)`` tuple; ``category`` is '' when
            nothing could be determined. If any step raises, the error is
            printed and ``('', '')`` is returned.
        """
        try:
            text = self.re_process(text)    # regex rules
            text = self.stop_content(text)  # fixed stop phrases
            text = self.lac_cut(text)       # LAC-based prefix and token removal
            text = self.stop_word(text)     # jieba stop words
            cls = ''
            if text:
                # Every connective is a single character, so removing the
                # first character removes the whole leading token.
                if jieba.lcut(text)[0] in self._LEADING_CONNECTIVES:
                    text = text[1:]
                cls = ''.join(self.classify(text))
            return text, cls
        except Exception as e:
            # Deliberate best-effort boundary: report and return empty results.
            print('规则error', e)
            return '', ''


if __name__ == '__main__':
    t = Topic()
    while True:
        try:
            a = input('>>>>>')
        except (EOFError, KeyboardInterrupt):
            # End the interactive loop cleanly on Ctrl-D / Ctrl-C.
            break
        print(t.tract(a))