
new add - bidding prediction model

dongzhaorui · 2 years ago · commit c85a415764

+ 9 - 0
find_source/crawler/utils.py

@@ -17,6 +17,7 @@ from crawler.defaults import (
     PAGE_TEXT_CHECK_WORDS,
     PAGE_TEXT_FILTER_WORDS
 )
+from predict_bidding_model import exists_ztb
 
 
 def err_details(worker):
@@ -266,3 +267,11 @@ def check_page_by_words(val: str):
             if search is not None:
                 return True
     return False
+
+
+def predict_bidding_model(item: dict):
+    result = {**item}
+    predict_result = exists_ztb(item)
+    predict = any(predict_result.values())  # True if any document is flagged as a tender notice
+    result['predict'] = int(predict)
+    return result
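A minimal usage sketch of the new helper (the document id and HTML are hypothetical placeholders; this assumes the working directory is find_source/ so that crawler and predict_bidding_model are importable):

from crawler.utils import predict_bidding_model

# hypothetical scraped page keyed by a unique document id
item = {'doc_001': '<div><p>某某学校设备采购项目 公开招标公告</p><a href="#">附件下载</a></div>'}
result = predict_bidding_model(item)
# result keeps the original key/value pair and adds 'predict': 1 or 0
# (a sample this short falls below the 20-token cut-off and comes back as 0)
print(result['predict'])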

+ 63 - 0
find_source/predict_bidding_model/__init__.py

@@ -0,0 +1,63 @@
+from pathlib import Path
+
+from .models.predict import PredictModel
+from .utils.cut_word import CutWord
+from .utils.remove_tags import deal_tag_a
+
+_base_path = Path(__file__).parent
+_dictionary_path = (_base_path / 'docs/topic_model/ztb_small_data').resolve()
+_model_path = (_base_path / 'docs/topic_model/SGD_valid_pre.model').resolve()
+topic_model = PredictModel(_dictionary_path, _model_path, threshold_val=0.6)
+cut_word = CutWord(drop_seg=["x", "m", "eng"], stop_words=[])
+
+__all__ = ['replace_word', 'cut_data', 'exists_ztb']
+
+
+def replace_word(text: str):
+    """
+    替换空格
+    :param text:
+    :return:
+    """
+    words = ["  ", "\r", "\n", "\u3000"]
+    for word in words:
+        text = text.replace(word, " ")
+    return text
+
+
+def cut_data(content):
+    """
+    切词
+    :param content:
+    :return:
+    """
+    cut_ret = cut_word.cut_word(content)
+    if len(cut_ret) < 20:
+        return ""
+    return " ".join(cut_ret)
+
+
+def exists_ztb(contents: dict):
+    """
+    是否招投标判断
+    :param contents:{唯一id:content}
+    :return:
+    """
+    result = {}
+    body = []
+    id_list = []  # keys aligned with the texts in body; also indexes the prediction results
+    # tokenize
+    for no_key, content in contents.items():
+        source = replace_word(content)
+        texts = deal_tag_a(source)
+        space_contents = cut_data(texts)
+        if not space_contents:
+            result[no_key] = 0
+            continue
+        body.append(space_contents)
+        id_list.append(no_key)
+    # generate prediction results
+    predict_threshold = topic_model.predict(body, threshold=True)
+    for no_key, pre_ret in zip(id_list, predict_threshold):
+        result[no_key] = pre_ret
+    return result

BIN
find_source/predict_bidding_model/docs/topic_model/SGD_valid_pre.model


+ 21 - 0
find_source/predict_bidding_model/docs/topic_model/report.txt

@@ -0,0 +1,21 @@
+train score:  0.996954 ;  test score:  0.996009
+With threshold enabled:
+              precision    recall  f1-score   support
+
+           0       0.99      1.00      1.00     38245
+           1       1.00      0.99      0.99     30413
+
+    accuracy                           1.00     68658
+   macro avg       1.00      1.00      1.00     68658
+weighted avg       1.00      1.00      1.00     68658
+
+Without threshold:
+              precision    recall  f1-score   support
+
+           0       0.99      1.00      1.00     38245
+           1       1.00      0.99      1.00     30413
+
+    accuracy                           1.00     68658
+   macro avg       1.00      1.00      1.00     68658
+weighted avg       1.00      1.00      1.00     68658
+

BIN
find_source/predict_bidding_model/docs/topic_model/ztb_small_data


+ 1 - 0
find_source/predict_bidding_model/models/__init__.py

@@ -0,0 +1 @@
+from .predict import PredictModel

+ 33 - 0
find_source/predict_bidding_model/models/predict.py

@@ -0,0 +1,33 @@
+# coding:utf-8
+import joblib
+
+
+class PredictModel(object):
+    def __init__(self, dictionary_path, model_path, threshold_val=0.8):
+        self.dictionary = joblib.load(dictionary_path)
+        self.model = joblib.load(model_path)
+        self._threshold_val = threshold_val
+
+    def predict(self, contents, threshold=True):
+        """
+        结果预测
+        :param contents: 需要预测的文本列表
+        :param threshold:
+        :return:
+        """
+        if not contents:
+            return []
+        content_vec = self.dictionary.transform(contents)
+        if threshold:
+            predict_result = self.model.predict_proba(content_vec)
+            predict_result = list(map(self.threshold, predict_result))
+        else:
+            predict_result = self.model.predict(content_vec)
+        return predict_result
+
+    def threshold(self, x):
+        # return 1 when the positive-class probability exceeds the configured threshold
+        if x[1] > self._threshold_val:
+            return 1
+        else:
+            return 0
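A hedged sketch of instantiating PredictModel directly (paths assume the repo layout above with find_source/ as the working directory; the sample string is a placeholder of pre-segmented, space-joined tokens such as cut_data produces):

from pathlib import Path

from predict_bidding_model.models.predict import PredictModel

# the dictionary (vectorizer) and classifier are the bundled joblib artifacts
base = Path('predict_bidding_model/docs/topic_model')
model = PredictModel(base / 'ztb_small_data', base / 'SGD_valid_pre.model', threshold_val=0.6)
print(model.predict(['招标 公告 项目 采购 预算 供应商 投标 截止 时间'], threshold=True))  # e.g. [1] or [0]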

+ 5 - 0
find_source/predict_bidding_model/readme.md

@@ -0,0 +1,5 @@
+1. Install the required packages:
+pip3 install jieba==0.42.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip3 install joblib==1.1.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip3 install sklearn==0.0 -i https://pypi.tuna.tsinghua.edu.cn/simple 
+pip3 install scikit-learn==0.24.2 -i https://pypi.tuna.tsinghua.edu.cn/simple 
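After installing the packages, a minimal (hypothetical) smoke test of the module could be:

from predict_bidding_model import exists_ztb

# the key and text are placeholders; very short texts are tokenised to fewer
# than 20 words and therefore come back as 0 without hitting the classifier
print(exists_ztb({'doc_001': '<p>某某项目 公开招标公告</p>'}))  # -> {'doc_001': 0 or 1}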

+ 2 - 0
find_source/predict_bidding_model/utils/__init__.py

@@ -0,0 +1,2 @@
+from .cut_word import CutWord
+from .remove_tags import deal_tag_a

+ 29 - 0
find_source/predict_bidding_model/utils/cut_word.py

@@ -0,0 +1,29 @@
+# coding:utf-8
+
+import multiprocessing
+import platform
+
+import jieba
+import jieba.posseg as psg
+
+if platform.system() == 'Linux':
+    jieba.enable_parallel(multiprocessing.cpu_count())
+
+
+class CutWord(object):
+
+    def __init__(self, stop_words=None, drop_seg=None):
+        self.stop_words = stop_words if stop_words else []  # stop words to filter out
+        self.drop_seg = drop_seg if drop_seg else []  # POS tags to filter out
+
+    def cut_word(self, content):
+        words = []
+        for w, x in psg.cut(content):
+            if w not in self.stop_words and x not in self.drop_seg:
+                words.append(w)
+        return words
+
+
+# if __name__ == '__main__':
+#     cut_obj = CutWord(drop_seg=["x"])
+#     cut_obj.cut_word("名称:计算机教室教师机终端及教室管理软件 品牌(如有):联想 规格型号:")

+ 35 - 0
find_source/predict_bidding_model/utils/remove_tags.py

@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+from html import unescape
+
+from lxml.html import fromstring, tostring
+
+
+def deal_tag_a(html):
+    """
+    清理标签,去除干扰
+    :param html:
+    :return:
+    """
+    if not html:
+        return ""
+    element = fromstring(html)
+    # remove <a> tags
+    links = element.xpath('//a')
+    for d in links:
+        d.getparent().remove(d)
+    # remove <style> elements
+    style = element.xpath('//style')
+    for s in style:
+        s.getparent().remove(s)
+    # remove the <head> element
+    head = element.xpath('//head')
+    for s in head:
+        s.getparent().remove(s)
+    # remove <script> elements
+    script = element.xpath('//script')
+    for s in script:
+        s.getparent().remove(s)
+    # serialize back to a string and unescape HTML entities
+    text = tostring(element, encoding='utf-8').decode()
+    text = unescape(text)
+    return text

+ 2 - 0
find_source/settings.py

@@ -3,6 +3,8 @@ from common.databases import mongo_table, redis_client
 
 '''Mongo'''
 MGO_DATABASE = 'shujuziyuan'
+'''Bidding prediction results'''
+Dzr = mongo_table(db=MGO_DATABASE, name='predict_results')
'''Garbage table'''
 MGO_DATA_GARBAGE = mongo_table(db=MGO_DATABASE, name='data_garbage')
'''Source-finding results table'''