from pathlib import Path

from .models.predict import PredictModel
from .utils.cut_word import CutWord
from .utils.remove_tags import deal_tag_a

# Model artifacts are resolved relative to this file.
_base_path = Path(__file__).parent
_dictionary_path = (_base_path / 'docs/topic_model/ztb_small_data').resolve()
_model_path = (_base_path / 'docs/topic_model/SGD_valid_pre.model').resolve()

# Module-level instances, created once at import time and shared by the
# functions below.
topic_model = PredictModel(_dictionary_path, _model_path, threshold_val=0.6)
cut_word = CutWord(drop_seg=["x", "m", "eng"], stop_words=[])

__all__ = ['replace_word', 'cut_data', 'exists_ztb']

# Texts that segment into fewer tokens than this are treated as too short to
# classify (see ``cut_data``).
_MIN_TOKEN_COUNT = 20

# Every character listed here is mapped to a plain ASCII space.
# NOTE(review): the plain-space entry is a no-op as written; confirm whether a
# different whitespace character (e.g. a non-breaking space or a tab) was
# originally intended.
_WHITESPACE_TABLE = str.maketrans({
    " ": " ",
    "\r": " ",
    "\n": " ",
    "\u3000": " ",
})


def replace_word(text: str) -> str:
    """Normalize whitespace characters in *text* to plain spaces.

    Carriage returns, line feeds and the ideographic space (U+3000) are each
    replaced with a single ASCII space. Runs of spaces are not collapsed.

    :param text: raw text to normalize
    :return: the text with the listed whitespace characters replaced
    """
    # One pass over the string instead of one ``.replace()`` call per character.
    return text.translate(_WHITESPACE_TABLE)


def cut_data(content) -> str:
    """Segment *content* into tokens and join them with single spaces.

    :param content: text passed to the module-level ``cut_word`` segmenter
    :return: the space-joined tokens, or ``""`` when segmentation yields fewer
        than ``_MIN_TOKEN_COUNT`` tokens
    """
    tokens = cut_word.cut_word(content)
    if len(tokens) < _MIN_TOKEN_COUNT:
        return ""
    return " ".join(tokens)


def exists_ztb(contents: dict) -> dict:
    """Decide, for each document, whether it is a bidding/tender (招投标) text.

    Each content string is whitespace-normalized, passed through
    ``deal_tag_a``, and segmented. Documents that come out too short are
    assigned ``0`` without consulting the model; the rest are predicted in a
    single batch.

    :param contents: mapping of ``{unique id: content}``
    :return: mapping of ``{unique id: prediction}``. Too-short documents map
        to ``0``; other values are whatever ``topic_model.predict`` yields
        (presumably a thresholded label -- confirm against ``PredictModel``).
    """
    result = {}
    body = []
    # Kept index-aligned with ``body`` so predictions can be mapped back to ids.
    id_list = []

    # Segment each document.
    for no_key, content in contents.items():
        source = replace_word(content)
        texts = deal_tag_a(source)
        space_contents = cut_data(texts)
        if not space_contents:
            result[no_key] = 0
            continue
        body.append(space_contents)
        id_list.append(no_key)

    # Nothing survived segmentation (or ``contents`` was empty): skip the model
    # call rather than handing it an empty batch.
    if not body:
        return result

    # Predict the remaining documents in one batch.
    predict_threshold = topic_model.predict(body, threshold=True)
    result.update(zip(id_list, predict_threshold))
    return result