# liukangjia/text_to_vector
# -*- coding: utf-8 -*-
# @Time    : 2023/3/3 14:04
# @Author  : lkj
import json
import re
from pathlib import Path

import jieba
from LAC import LAC

# Import-time side effect: register '等保' in jieba's dictionary so the
# segmenter can emit it as a single token.
jieba.add_word('等保')


class Topic(object):
    """Rule-based cleaner that reduces a title string to its topic phrase.

    The pipeline (see :meth:`tract`) strips boilerplate with regular
    expressions, removes fixed stop phrases, drops everything up to the last
    person/location/organisation token found by LAC, filters stop words, and
    finally tries to label the result as 货物 / 工程 / 服务 (goods / works /
    services) from keywords near the end of the text.

    NOTE(review): the patterns (竞争性磋商, 公开招标, ...) suggest the input is
    procurement announcement titles - confirm with the data owner.
    """

    # LAC tags treated as "header" entities; everything up to and including
    # the last such token is discarded.
    _ENTITY_TAGS = ('PER', 'LOC', 'ORG')
    # Tags whose tokens are dropped unless they sit between two numerals.
    _DROP_TAGS = ('w', 't', 'ns')
    # Half-width parentheticals that survive the parenthesis-removal step
    # (their brackets are still removed later by the punctuation pattern).
    _KEPT_PARENTHETICALS = ('(勘察)', '(测绘)', '(监理)')
    # If any of these occurs in the last 8 characters the category is
    # considered undecidable.
    _UNDECIDABLE_MARKERS = ('及', '建设', '系统', '升级')
    # Single-character connectives stripped from the very start of the result.
    _LEADING_CONNECTIVES = ('及', '至', '和', '与', '所', '并')
    # Patterns removed, in this exact order, after the parenthesis step.
    # NOTE(review): in '[a-zA\--Z0-9_]' the fragment '\--Z' is parsed as the
    # range '-'..'Z' (which also covers './:;<=>?@'); kept as-is to preserve
    # behaviour - confirm whether '[a-zA-Z0-9_-]' was intended.
    _STRIP_PATTERNS = (
        r'\[.*?\]',
        r'（.*?）',
        r'.*大楼',
        r'.*号楼',
        r'\d{4}年\d{1,2}至\d{1,2}月',
        r'.*?-竞争性磋商-[a-zA\--Z0-9_]{4,20}',
        r'.*?-竞争性谈判-[a-zA\--Z0-9_]{4,20}',
        r'.*?-公开招标-[a-zA\--Z0-9_]{4,20}',
        r'[0-9]{1,9}年度',
        r'[0-9]{4,9}年',
        r'[0-9]{1,2}月',
        r'[0-9]{1,2}日',
        r'[!#%&()*+,/\-·$￥:：;；，（）|=?@\t—?★【】《》？、！\[\[^_`{|}~]',
        r'[a-zA-Z0-9_]{5,30}',
        r'工字.*',
        r'.*县',
        r'.*委员会',
        r'第[0-9]{0,4}包',
        r'.*村委会',
        r'.*州界',
        r'.*大学',
        r'.*学院',
        r'20[0-9]{2}级',
        r'20[0-9]{2}',
    )

    def __init__(self):
        """Load the LAC model and the rule/keyword files.

        All data files are opened relative to the current working directory
        (``./data/...``), not relative to ``base_dir``.
        """
        self.base_dir = Path(__file__).resolve().parent.parent
        self.lac = LAC(mode='lac')
        self.lac.load_customization('./data/lac_dict.txt')
        # Stop words are stored without their trailing newline.
        self.stopwords = [
            line.replace('\n', '')
            for line in self._read_lines('./data/stopwords_topic.txt')
        ]
        # The remaining lists keep the raw lines (newline included); the
        # methods that consume them strip the newline themselves.
        self.stopcontent = self._read_lines('./data/stoptext.txt')
        self.hw = self._read_lines('./data/hw.txt')
        self.gcs = self._read_lines('./data/gc.txt')
        self.fws = self._read_lines('./data/fw.txt')

    @staticmethod
    def _read_lines(path):
        """Return the lines of a UTF-8 text file, closing the file afterwards."""
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.readlines()

    def classify(self, text):
        """Guess the category of ``text`` from keywords near its end.

        :param text: cleaned topic string
        :return: a list holding exactly one of '货物', '工程', '服务', or an
            empty list when no keyword matches, when more than one category
            matches, or when an undecidable marker occurs in the last 8
            characters
        """
        tail = text[-4:]
        matched = set()
        keyword_sources = (
            ('货物', self.hw),   # goods
            ('工程', self.gcs),  # works / engineering
            ('服务', self.fws),  # services
        )
        for label, lines in keyword_sources:
            for line in lines:
                keyword = line.replace('\n', '')
                # A blank line would yield '' which is "in" every string and
                # would wrongly match this category for all inputs.
                if keyword and keyword in tail:
                    matched.add(label)
                    break
        ending = text[-8:]
        if any(marker in ending for marker in self._UNDECIDABLE_MARKERS):
            return []
        # Ambiguous (several categories) or no match: report nothing.
        if len(matched) != 1:
            return []
        return list(matched)

    def lac_cut(self, text):
        """Trim the leading entity part of ``text`` using LAC tags.

        Everything up to and including the last PER/LOC/ORG token is removed.
        Of the remaining tokens, those tagged 'w', 't' or 'ns' are dropped
        unless both neighbours are numerals (tag 'm'), which preserves a
        decimal point inside a number.

        :param text: input string
        :return: the remaining tokens joined into one string
        """
        if not text:
            return ''
        lac_result = self.lac.run(text)
        words, tags = lac_result[0], lac_result[1]

        entity_positions = [
            index for index, tag in enumerate(tags) if tag in self._ENTITY_TAGS
        ]
        if entity_positions:
            cut = max(entity_positions) + 1
            words, tags = words[cut:], tags[cut:]

        kept = []
        last = len(tags) - 1
        for index, tag in enumerate(tags):
            if tag in self._DROP_TAGS:
                # Neighbour indices are clamped to the list bounds; a clamped
                # neighbour is the token itself and therefore never 'm'.
                prev_tag = tags[max(index - 1, 0)]
                next_tag = tags[min(index + 1, last)]
                # Both sides must be numerals (the original only tested the
                # right-hand side, keeping e.g. a comma before a number).
                if prev_tag == 'm' and next_tag == 'm':
                    kept.append(words[index])
                continue
            kept.append(words[index])
        return ''.join(kept)

    @staticmethod
    def re_process(text):
        """Strip boilerplate from ``text`` with an ordered list of regexes.

        :param text: raw title
        :return: the title with package numbers, bracketed remarks, dates,
            punctuation, long alphanumeric codes and several leading
            "...县 / ...大学 / ...委员会"-style prefixes removed
        """
        text = re.sub(r'第.*?包', '', text)
        # Remove half-width parentheticals except a few meaningful ones.
        for group in re.findall(r'\(.*?\)', text):
            if group not in Topic._KEPT_PARENTHETICALS:
                text = text.replace(group, '')
        # Order matters: later patterns operate on what earlier ones left.
        for pattern in Topic._STRIP_PATTERNS:
            text = re.sub(pattern, '', text)
        return text

    def stop_word(self, text: str):
        """Remove stop words from ``text``.

        :param text: input string
        :return: the jieba tokens of ``text`` that are not in
            ``self.stopwords``, joined back into one string
        """
        stopwords = set(self.stopwords)  # O(1) membership per token
        return ''.join(
            token for token in jieba.lcut(text) if token not in stopwords
        )

    def stop_content(self, text: str):
        """Remove fixed stop phrases from ``text`` by plain substring match.

        Used for phrases that must be cut but that a tokenizer may split
        incorrectly (e.g. 重采购, 重招标).

        :param text: input string
        :return: ``text`` with every phrase listed in ``self.stopcontent``
            removed
        """
        for line in self.stopcontent:
            phrase = line.replace('\n', '')
            if phrase in text:
                text = text.replace(phrase, '')
        return text

    def tract(self, text):
        """Run the full pipeline on ``text``.

        :param text: raw title
        :return: ``(topic, category)`` where ``category`` is '' when it could
            not be decided; ``('', '')`` if any step raises
        """
        try:
            text = self.re_process(text)  # regex cleanup
            text = self.stop_content(text)  # fixed stop phrases
            text = self.lac_cut(text)  # drop leading PER/LOC/ORG part
            text = self.stop_word(text)  # stop words
            cls = ''
            if text:
                # Drop a dangling connective left at the very start.
                if jieba.lcut(text)[0] in self._LEADING_CONNECTIVES:
                    text = text[1:]
                cls = ''.join(self.classify(text))
            return text, cls
        except Exception as e:
            # Deliberate best-effort: report the failure and return blanks.
            print('规则error',e)
            return '',''

if __name__ == '__main__':
    # Interactive smoke test: read a title per line and print the
    # (topic, category) pair produced by Topic.tract.
    t = Topic()
    while True:
        try:
            a = input('>>>>>')
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session without a traceback.
            break
        print(t.tract(a))