123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263 |
- from pathlib import Path
- from .models.predict import PredictModel
- from .utils.cut_word import CutWord
- from .utils.remove_tags import deal_tag_a
# Public helpers exported by this module.
__all__ = ['replace_word', 'cut_data', 'exists_ztb']

# Model artifacts are located relative to this file's directory, so loading
# does not depend on the process's current working directory.
_base_path = Path(__file__).parent
_dictionary_path = _base_path.joinpath('docs/topic_model/ztb_small_data').resolve()
_model_path = _base_path.joinpath('docs/topic_model/SGD_valid_pre.model').resolve()

# Shared, module-level instances: both are built once at import time and
# reused by every call to the helpers below.
topic_model = PredictModel(
    _dictionary_path,
    _model_path,
    threshold_val=0.6,
)
cut_word = CutWord(
    drop_seg=["x", "m", "eng"],
    stop_words=[],
)
# Built once at import time. Carriage return, line feed and the full-width
# (ideographic) space all map to a plain ASCII space. The original list also
# contained " " itself, but replacing a space with a space is a no-op, so it
# is omitted here.
_SPACE_LIKE_TABLE = str.maketrans(dict.fromkeys("\r\n\u3000", " "))


def replace_word(text: str) -> str:
    """
    Normalize space-like characters to ASCII spaces.

    Each ``\\r``, ``\\n`` and ``\\u3000`` character is replaced by exactly one
    space; consecutive occurrences are not collapsed.

    :param text: the text to normalize
    :return: a copy of ``text`` with the characters above replaced by spaces
    """
    # A single translate pass replaces the chain of str.replace calls.
    return text.translate(_SPACE_LIKE_TABLE)
def cut_data(content, min_tokens: int = 20) -> str:
    """
    Tokenize ``content`` and return the tokens joined by single spaces.

    Tokenization is delegated to the module-level ``cut_word`` instance.

    :param content: the text to tokenize
    :param min_tokens: minimum number of tokens required for the text to be
        kept; defaults to 20, the value that was previously hard-coded
    :return: the space-joined tokens, or ``""`` when the tokenizer yields
        fewer than ``min_tokens`` tokens
    """
    tokens = cut_word.cut_word(content)
    # Texts with too few tokens are reported as empty so callers can skip them.
    if len(tokens) < min_tokens:
        return ""
    return " ".join(tokens)
def exists_ztb(contents: dict) -> dict:
    """
    Decide, per document, whether it is bidding/tendering content.

    Each document is whitespace-normalized with ``replace_word``, passed
    through ``deal_tag_a``, and tokenized with ``cut_data``. Documents for
    which ``cut_data`` returns an empty string are assigned ``0`` without
    consulting the model; the remaining ones are sent to ``topic_model`` in a
    single batch.

    :param contents: mapping of ``{unique id: content}``
    :return: mapping of ``{unique id: prediction}``; the prediction is ``0``
        for documents skipped as too short, otherwise the value produced by
        ``topic_model.predict(..., threshold=True)`` for that document
    """
    result = {}
    body = []
    id_list = []  # parallel to ``body``; also indexes the prediction results

    # Tokenize every document, setting aside those too short to classify.
    for no_key, content in contents.items():
        texts = deal_tag_a(replace_word(content))
        space_contents = cut_data(texts)
        if not space_contents:
            result[no_key] = 0
            continue
        body.append(space_contents)
        id_list.append(no_key)

    # Nothing left to classify (empty input, or every document was too
    # short): skip the model call instead of predicting on an empty batch.
    if not body:
        return result

    # Predictions come back in the same order as ``body`` / ``id_list``.
    predictions = topic_model.predict(body, threshold=True)
    result.update(zip(id_list, predictions))
    return result
|