__init__.py 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. from pathlib import Path
  2. from .models.predict import PredictModel
  3. from .utils.cut_word import CutWord
  4. from .utils.remove_tags import deal_tag_a
# Directory that contains this __init__.py; the bundled model files are
# located relative to it.
_base_path = Path(__file__).parent
# Absolute paths of the two files handed to PredictModel below.
# NOTE(review): judging by the names only, the first looks like a
# dictionary/vocabulary file and the second like a trained SGD model -
# confirm against PredictModel.
_dictionary_path = (_base_path / 'docs/topic_model/ztb_small_data').resolve()
_model_path = (_base_path / 'docs/topic_model/SGD_valid_pre.model').resolve()
# Shared predictor instance, created once at import time and used by
# exists_ztb().
# NOTE(review): threshold_val=0.6 is presumably the decision cut-off applied
# when predict(..., threshold=True) is called - confirm in PredictModel.
topic_model = PredictModel(_dictionary_path, _model_path, threshold_val=0.6)
# Shared tokenizer instance, created once at import time and used by
# cut_data().
# NOTE(review): drop_seg presumably lists part-of-speech tags whose tokens
# are discarded, and stop_words is an (empty) stop-word list - confirm in
# CutWord.
cut_word = CutWord(drop_seg=["x", "m", "eng"], stop_words=[])
# Names exported by `from <package> import *`.
__all__ = ['replace_word', 'cut_data', 'exists_ztb']
  11. def replace_word(text: str):
  12. """
  13. 替换空格
  14. :param text:
  15. :return:
  16. """
  17. words = [" ", "\r", "\n", "\u3000"]
  18. for word in words:
  19. text = text.replace(word, " ")
  20. return text
  21. def cut_data(content):
  22. """
  23. 切词
  24. :param content:
  25. :return:
  26. """
  27. cut_ret = cut_word.cut_word(content)
  28. if len(cut_ret) < 20:
  29. return ""
  30. return " ".join(cut_ret)
  31. def exists_ztb(contents: dict):
  32. """
  33. 是否招投标判断
  34. :param contents:{唯一id:content}
  35. :return:
  36. """
  37. result = {}
  38. body = []
  39. id_list = [] # 与body中的文本做索引关联,同时也是预测结果的索引
  40. # 切词
  41. for no_key, content in contents.items():
  42. source = replace_word(content)
  43. texts = deal_tag_a(source)
  44. space_contents = cut_data(texts)
  45. if not space_contents:
  46. result[no_key] = 0
  47. continue
  48. body.append(space_contents)
  49. id_list.append(no_key)
  50. # 生成预测结果
  51. predict_threshold = topic_model.predict(body, threshold=True)
  52. for no_key, pre_ret in zip(id_list, predict_threshold):
  53. result[no_key] = pre_ret
  54. return result