|
@@ -0,0 +1,63 @@
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
+from .models.predict import PredictModel
|
|
|
+from .utils.cut_word import CutWord
|
|
|
+from .utils.remove_tags import deal_tag_a
|
|
|
+
|
|
|
# Model assets are located relative to this file, so lookups do not depend
# on the working directory of the importing process.
_base_path = Path(__file__).parent
_dictionary_path = _base_path.joinpath('docs/topic_model/ztb_small_data').resolve()
_model_path = _base_path.joinpath('docs/topic_model/SGD_valid_pre.model').resolve()

# Shared instances, constructed once when this module is imported.
topic_model = PredictModel(_dictionary_path, _model_path, threshold_val=0.6)
# NOTE(review): the drop_seg entries look like part-of-speech tags to
# discard during segmentation -- confirm against CutWord.
cut_word = CutWord(drop_seg=["x", "m", "eng"], stop_words=[])

__all__ = ['replace_word', 'cut_data', 'exists_ztb']
|
|
|
+
|
|
|
+
|
|
|
def replace_word(text: str):
    """Replace whitespace-like characters with a plain space.

    Carriage returns, newlines and the ideographic space (U+3000) each
    become a single ASCII space; no runs are collapsed, so the length of
    the text is unchanged.

    :param text: the text to normalise.
    :return: ``text`` with every listed character replaced by ``" "``.
    """
    blanks = [" ", "\r", "\n", "\u3000"]
    # One translate pass is equivalent to replacing each character in turn,
    # because the replacement (" ") is never turned into anything else.
    table = str.maketrans(dict.fromkeys(blanks, " "))
    return text.translate(table)
|
|
|
+
|
|
|
+
|
|
|
def cut_data(content):
    """Segment ``content`` into words and join them with single spaces.

    :param content: the text handed to ``cut_word.cut_word``.
    :return: the space-joined segments, or ``""`` when segmentation yields
        fewer than 20 items.
    """
    tokens = cut_word.cut_word(content)
    # Texts with fewer than 20 segments are rejected with an empty string.
    return " ".join(tokens) if len(tokens) >= 20 else ""
|
|
|
+
|
|
|
+
|
|
|
def exists_ztb(contents: dict) -> dict:
    """Decide, per document, whether it is bidding/tendering ("ztb") content.

    Each text is normalised with ``replace_word``, passed through
    ``deal_tag_a`` and segmented with ``cut_data``.  Texts for which
    ``cut_data`` returns an empty string are labelled ``0`` without being
    sent to the model; all remaining texts are predicted in one batch.

    :param contents: mapping of ``{unique id: content}``.
    :return: dict mapping every id in ``contents`` either to ``0`` (text
        skipped) or to the matching item produced by
        ``topic_model.predict(body, threshold=True)``.
    """
    result = {}
    body = []
    id_list = []  # index-aligned with ``body``; also indexes the predictions

    # Word segmentation.
    for no_key, content in contents.items():
        space_contents = cut_data(deal_tag_a(replace_word(content)))
        if not space_contents:
            result[no_key] = 0
            continue
        body.append(space_contents)
        id_list.append(no_key)

    # Nothing survived segmentation (or ``contents`` was empty): skip the
    # model call rather than handing it an empty batch.
    # NOTE(review): presumably the underlying estimator rejects zero-sample
    # input -- confirm against PredictModel.predict.
    if not body:
        return result

    # Generate the predictions; they come back in the same order as ``body``.
    predictions = topic_model.predict(body, threshold=True)
    result.update(zip(id_list, predictions))
    return result
|