# data_spider/topic_spider
from pathlib import Path

from .models.predict import PredictModel
from .utils.cut_word import CutWord
from .utils.remove_tags import deal_tag_a

# Directory containing this module; the model artifacts are located relative to it.
_base_path = Path(__file__).parent
# Absolute paths of the two artifacts handed to PredictModel below.
# NOTE(review): judging by the names, the first is the dictionary/vocabulary data
# and the second the trained classifier -- confirm against PredictModel.
_dictionary_path = (_base_path / 'docs/topic_model/ztb_small_data').resolve()
_model_path = (_base_path / 'docs/topic_model/SGD_valid_pre.model').resolve()
# Shared predictor, constructed once when this module is imported.
# NOTE(review): threshold_val=0.6 is presumably the decision cut-off used when
# predict(..., threshold=True) is called -- confirm in PredictModel.
topic_model = PredictModel(_dictionary_path, _model_path, threshold_val=0.6)
# Shared tokenizer, constructed once when this module is imported, with an
# empty stop-word list.
# NOTE(review): drop_seg looks like a list of part-of-speech tags whose tokens
# are discarded -- confirm in CutWord.
cut_word = CutWord(drop_seg=["x", "m", "eng"], stop_words=[])

# Names exported by `from <module> import *`.
__all__ = ['replace_word', 'cut_data', 'exists_ztb']


def replace_word(text: str):
    """
    Normalise whitespace-like sequences in ``text`` to plain spaces.

    Each double space becomes a single space, then every carriage return,
    line feed and ideographic space (U+3000) becomes a single space.

    :param text: raw text to clean
    :return: the cleaned text
    """
    # The double-space pass runs first and only once, so spaces introduced by
    # the single-character substitutions below are not collapsed afterwards.
    squeezed = text.replace("  ", " ")
    to_space = str.maketrans({"\r": " ", "\n": " ", "\u3000": " "})
    return squeezed.translate(to_space)


def cut_data(content):
    """
    Tokenise ``content`` and return the tokens joined by single spaces.

    :param content: text handed to the module-level ``cut_word`` tokenizer
    :return: the space-joined tokens, or ``""`` when the tokenizer yields
        fewer than 20 tokens
    """
    tokens = cut_word.cut_word(content)
    # Inputs with fewer than 20 tokens are reported as an empty string.
    return " ".join(tokens) if len(tokens) >= 20 else ""


def exists_ztb(contents: dict):
    """
    Decide, for each document, whether it is tender/bidding related.

    Every content is passed through ``replace_word``, ``deal_tag_a`` and
    ``cut_data`` in that order. Documents for which ``cut_data`` returns an
    empty string are assigned ``0`` without consulting the model; all other
    documents are sent to ``topic_model.predict`` in a single batch.

    :param contents: mapping of unique id -> raw content text
    :return: dict mapping every id in ``contents`` either to ``0`` (skipped
        document) or to the value ``topic_model.predict`` produced for it
    """
    result = {}
    body = []
    # Ids aligned index-by-index with ``body``, and therefore with the
    # prediction results as well.
    id_list = []
    # Clean and tokenise each document.
    for no_key, content in contents.items():
        source = replace_word(content)
        # NOTE(review): deal_tag_a presumably handles <a> tags -- confirm in
        # utils.remove_tags.
        texts = deal_tag_a(source)
        space_contents = cut_data(texts)
        if not space_contents:
            result[no_key] = 0
            continue
        body.append(space_contents)
        id_list.append(no_key)
    # Nothing left to classify (empty input, or every document was skipped):
    # do not invoke the model on an empty batch.
    if not body:
        return result
    # Run the model once over the whole batch and map results back to ids.
    predict_threshold = topic_model.predict(body, threshold=True)
    result.update(zip(id_list, predict_threshold))
    return result