
new add - bidding prediction model

dongzhaorui · 2 years ago · commit c85a415764

+ 9 - 0
find_source/crawler/utils.py

@@ -17,6 +17,7 @@ from crawler.defaults import (
     PAGE_TEXT_CHECK_WORDS,
     PAGE_TEXT_FILTER_WORDS
 )
+from predict_bidding_model import exists_ztb
 
 
 def err_details(worker):
@@ -266,3 +267,11 @@ def check_page_by_words(val: str):
             if search is not None:
                 return True
     return False
+
+
+def predict_bidding_model(item: dict):
+    result = {**item}
+    predict_result = exists_ztb(item)
+    predict = any(predict_result.values())  # True if any document is flagged as a tender notice
+    result['predict'] = int(predict)
+    return result
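A minimal usage sketch of the new helper (the document id and HTML are hypothetical placeholders; this assumes the working directory is find_source/ so that crawler and predict_bidding_model are importable):

from crawler.utils import predict_bidding_model

# hypothetical scraped page keyed by a unique document id
item = {'doc_001': '<div><p>某某学校设备采购项目 公开招标公告</p><a href="#">附件下载</a></div>'}
result = predict_bidding_model(item)
# result keeps the original key/value pair and adds 'predict': 1 or 0
# (a sample this short falls below the 20-token cut-off and comes back as 0)
print(result['predict'])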

+ 63 - 0
find_source/predict_bidding_model/__init__.py

@@ -0,0 +1,63 @@
+from pathlib import Path
+
+from .models.predict import PredictModel
+from .utils.cut_word import CutWord
+from .utils.remove_tags import deal_tag_a
+
+_base_path = Path(__file__).parent
+_dictionary_path = (_base_path / 'docs/topic_model/ztb_small_data').resolve()
+_model_path = (_base_path / 'docs/topic_model/SGD_valid_pre.model').resolve()
+topic_model = PredictModel(_dictionary_path, _model_path, threshold_val=0.6)
+cut_word = CutWord(drop_seg=["x", "m", "eng"], stop_words=[])
+
+__all__ = ['replace_word', 'cut_data', 'exists_ztb']
+
+
+def replace_word(text: str):
+    """
+    替换空格
+    :param text:
+    :return:
+    """
+    words = ["  ", "\r", "\n", "\u3000"]
+    for word in words:
+        text = text.replace(word, " ")
+    return text
+
+
+def cut_data(content):
+    """
+    切词
+    :param content:
+    :return:
+    """
+    cut_ret = cut_word.cut_word(content)
+    if len(cut_ret) < 20:
+        return ""
+    return " ".join(cut_ret)
+
+
+def exists_ztb(contents: dict):
+    """
+    是否招投标判断
+    :param contents:{唯一id:content}
+    :return:
+    """
+    result = {}
+    body = []
+    id_list = []  # keys aligned with the texts in body; also indexes the prediction results
+    # tokenize
+    for no_key, content in contents.items():
+        source = replace_word(content)
+        texts = deal_tag_a(source)
+        space_contents = cut_data(texts)
+        if not space_contents:
+            result[no_key] = 0
+            continue
+        body.append(space_contents)
+        id_list.append(no_key)
+    # generate prediction results
+    predict_threshold = topic_model.predict(body, threshold=True)
+    for no_key, pre_ret in zip(id_list, predict_threshold):
+        result[no_key] = pre_ret
+    return result

BIN
find_source/predict_bidding_model/docs/topic_model/SGD_valid_pre.model


+ 21 - 0
find_source/predict_bidding_model/docs/topic_model/report.txt

@@ -0,0 +1,21 @@
+train score:  0.996954 ;  test score:  0.996009
+With threshold enabled:
+              precision    recall  f1-score   support
+
+           0       0.99      1.00      1.00     38245
+           1       1.00      0.99      0.99     30413
+
+    accuracy                           1.00     68658
+   macro avg       1.00      1.00      1.00     68658
+weighted avg       1.00      1.00      1.00     68658
+
+Without threshold:
+              precision    recall  f1-score   support
+
+           0       0.99      1.00      1.00     38245
+           1       1.00      0.99      1.00     30413
+
+    accuracy                           1.00     68658
+   macro avg       1.00      1.00      1.00     68658
+weighted avg       1.00      1.00      1.00     68658
+

BIN
find_source/predict_bidding_model/docs/topic_model/ztb_small_data


+ 1 - 0
find_source/predict_bidding_model/models/__init__.py

@@ -0,0 +1 @@
+from .predict import PredictModel

+ 33 - 0
find_source/predict_bidding_model/models/predict.py

@@ -0,0 +1,33 @@
+# coding:utf-8
+import joblib
+
+
+class PredictModel(object):
+    def __init__(self, dictionary_path, model_path, threshold_val=0.8):
+        self.dictionary = joblib.load(dictionary_path)
+        self.model = joblib.load(model_path)
+        self._threshold_val = threshold_val
+
+    def predict(self, contents, threshold=True):
+        """
+        结果预测
+        :param contents: 需要预测的文本列表
+        :param threshold:
+        :return:
+        """
+        if not contents:
+            return []
+        content_vec = self.dictionary.transform(contents)
+        if threshold:
+            predict_result = self.model.predict_proba(content_vec)
+            predict_result = list(map(self.threshold, predict_result))
+        else:
+            predict_result = self.model.predict(content_vec)
+        return predict_result
+
+    def threshold(self, x):
+        # return 1 when the positive-class probability exceeds the configured threshold
+        if x[1] > self._threshold_val:
+            return 1
+        else:
+            return 0
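A hedged sketch of instantiating PredictModel directly (paths assume the repo layout above with find_source/ as the working directory; the sample string is a placeholder of pre-segmented, space-joined tokens such as cut_data produces):

from pathlib import Path

from predict_bidding_model.models.predict import PredictModel

# the dictionary (vectorizer) and classifier are the bundled joblib artifacts
base = Path('predict_bidding_model/docs/topic_model')
model = PredictModel(base / 'ztb_small_data', base / 'SGD_valid_pre.model', threshold_val=0.6)
print(model.predict(['招标 公告 项目 采购 预算 供应商 投标 截止 时间'], threshold=True))  # e.g. [1] or [0]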

+ 5 - 0
find_source/predict_bidding_model/readme.md

@@ -0,0 +1,5 @@
+1. Install the required packages:
+pip3 install jieba==0.42.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip3 install joblib==1.1.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip3 install sklearn==0.0 -i https://pypi.tuna.tsinghua.edu.cn/simple 
+pip3 install scikit-learn==0.24.2 -i https://pypi.tuna.tsinghua.edu.cn/simple 
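After installing the packages, a minimal (hypothetical) smoke test of the module could be:

from predict_bidding_model import exists_ztb

# the key and text are placeholders; very short texts are tokenised to fewer
# than 20 words and therefore come back as 0 without hitting the classifier
print(exists_ztb({'doc_001': '<p>某某项目 公开招标公告</p>'}))  # -> {'doc_001': 0 or 1}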

+ 2 - 0
find_source/predict_bidding_model/utils/__init__.py

@@ -0,0 +1,2 @@
+from .cut_word import CutWord
+from .remove_tags import deal_tag_a

+ 29 - 0
find_source/predict_bidding_model/utils/cut_word.py

@@ -0,0 +1,29 @@
+# coding:utf-8
+
+import multiprocessing
+import platform
+
+import jieba
+import jieba.posseg as psg
+
+if platform.system() == 'Linux':
+    jieba.enable_parallel(multiprocessing.cpu_count())
+
+
+class CutWord(object):
+
+    def __init__(self, stop_words=None, drop_seg=None):
+        self.stop_words = stop_words if stop_words else []  # stop words to filter out
+        self.drop_seg = drop_seg if drop_seg else []  # POS tags to filter out
+
+    def cut_word(self, content):
+        words = []
+        for w, x in psg.cut(content):
+            if w not in self.stop_words and x not in self.drop_seg:
+                words.append(w)
+        return words
+
+
+# if __name__ == '__main__':
+#     cut_obj = CutWord(drop_seg=["x"])
+#     cut_obj.cut_word("名称:计算机教室教师机终端及教室管理软件 品牌(如有):联想 规格型号:")

+ 35 - 0
find_source/predict_bidding_model/utils/remove_tags.py

@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+from html import unescape
+
+from lxml.html import fromstring, tostring
+
+
+def deal_tag_a(html):
+    """
+    清理标签,去除干扰
+    :param html:
+    :return:
+    """
+    if not html:
+        return ""
+    element = fromstring(html)
+    # remove <a> tags
+    links = element.xpath('//a')
+    for d in links:
+        d.getparent().remove(d)
+    # remove <style> elements
+    style = element.xpath('//style')
+    for s in style:
+        s.getparent().remove(s)
+    # remove the <head> element
+    head = element.xpath('//head')
+    for s in head:
+        s.getparent().remove(s)
+    # remove <script> elements
+    script = element.xpath('//script')
+    for s in script:
+        s.getparent().remove(s)
+    # serialize back to a string and unescape HTML entities
+    text = tostring(element, encoding='utf-8').decode()
+    text = unescape(text)
+    return text

+ 2 - 0
find_source/settings.py

@@ -3,6 +3,8 @@ from common.databases import mongo_table, redis_client
 
 '''Mongo'''
 MGO_DATABASE = 'shujuziyuan'
+'''Bidding prediction results'''
+Dzr = mongo_table(db=MGO_DATABASE, name='predict_results')
'''Garbage table'''
 MGO_DATA_GARBAGE = mongo_table(db=MGO_DATABASE, name='data_garbage')
'''Source-finding results table'''