dzr 5 months ago
parent
commit
3af84b8d2d
21 changed files with 1761 additions and 0 deletions
  1. dbs/RedisDB.py  +99 -0
  2. dbs/__init__.py  +8 -0
  3. docker-compose.yml  +20 -0
  4. login_cookie.json  +16 -0
  5. node_modules.zip  BIN
  6. requirements.txt  +15 -0
  7. setting.py  +37 -0
  8. start.sh  +4 -0
  9. utils/aliyun.py  +43 -0
  10. utils/check_utils.py  +112 -0
  11. utils/clean_html.py  +147 -0
  12. utils/es_query.py  +55 -0
  13. utils/execptions.py  +37 -0
  14. utils/login.py  +427 -0
  15. utils/title_participle.py  +50 -0
  16. utils/tools.py  +66 -0
  17. ybw_crontab.txt  +5 -0
  18. ybw_details.py  +316 -0
  19. ybw_esquery.py  +103 -0
  20. ybw_query_list.py  +183 -0
  21. ybw_release_account.py  +18 -0

+ 99 - 0
dbs/RedisDB.py

@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-27
+---------
+@summary: redis 去重
+---------
+@author: Lzz
+"""
+import hashlib
+
+import redis
+
+import setting
+
+
+class RedisFilter:
+
+    def __init__(self, url=None, expire_time=None):
+        if not url:
+            url = setting.REDIS_URL
+        self.redis_db = redis.StrictRedis.from_url(url)
+        self._ex = expire_time or setting.REDIS_EXPIRE_TIME
+
+    def __repr__(self):
+        return "<RedisFilter: {}>".format(self.redis_db)
+
+    def exists(self, key):
+        """全量检索"""
+        if self.redis_db.exists(key) > 0:
+            return True
+        return False
+
+    def add(self, keys):
+        """
+        添加数据
+
+        @param keys: 检查关键词在 redis 中是否存在,支持列表批量
+        @return: list / 单个值(添加失败返回False, 添加成功返回True)
+        """
+        is_list = isinstance(keys, list)
+        keys = keys if is_list else [keys]
+
+        is_added = []
+        for key in keys:
+            pkey = "pylist_" + self.fingerprint(key)
+            if not self.exists(pkey):
+                is_added.append(self.redis_db.set(pkey, 1, ex=self._ex))
+            else:
+                is_added.append(False)
+
+        return is_added if is_list else is_added[0]
+
+    def delete(self, keys):
+        is_list = isinstance(keys, list)
+        keys = keys if is_list else [keys]
+
+        for key in keys:
+            pkey = "pylist_" + self.fingerprint(key)
+            self.redis_db.delete(pkey)
+
+        return True
+
+    def get(self, keys):
+        """
+        检查数据是否存在
+        @param keys: list / 单个值
+        @return: list / 单个值 (存在返回True 不存在返回False)
+        """
+        is_list = isinstance(keys, list)
+        keys = keys if is_list else [keys]
+
+        is_exist = []
+        for key in keys:
+            pkey = "pylist_" + self.fingerprint(key)
+            is_exist.append(self.exists(pkey))
+
+        # 判断数据本身是否重复
+        temp_set = set()
+        for i, key in enumerate(keys):
+            if key in temp_set:
+                is_exist[i] = True
+            else:
+                temp_set.add(key)
+
+        return is_exist if is_list else is_exist[0]
+
+    def fingerprint(self, *args):
+        """
+        @summary: 获取唯一的64位值,获取唯一数据指纹
+        ---------
+        @param args: 去重数据集合
+        ---------
+        @result: 5580c91ea29bf5bd963f4c08dfcacd983566e44ecea1735102bc380576fd6f30
+        """
+        args = sorted(args)
+        sha256 = hashlib.sha256()
+        for arg in args:
+            sha256.update(str(arg).encode())
+        return sha256.hexdigest()
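
A minimal usage sketch for the filter above (assuming the REDIS_URL in setting.py is reachable; the key below is made up):

from dbs.RedisDB import RedisFilter

dedup = RedisFilter()                      # uses setting.REDIS_URL and REDIS_EXPIRE_TIME
href = "https://example.com/notice/1"      # hypothetical dedup key
if not dedup.get(href):                    # False -> not seen before
    dedup.add(href)                        # stores "pylist_" + sha256 fingerprint with a TTL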

+ 8 - 0
dbs/__init__.py

@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-02-25 
+---------
+@summary:  
+---------
+@author: Dzr
+"""

+ 20 - 0
docker-compose.yml

@@ -0,0 +1,20 @@
+version: "3"
+services:
+  ybw-spider:
+    image: 172.17.189.142:8081/pyspider/payappcrawl:latest
+    volumes:
+      - /mnt/ybw:/mnt
+    restart: always
+    privileged: true
+    tty: true
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "50M"
+        max-file: "1"
+    deploy:
+      resources:
+        limits:
+          memory: 4G
+        reservations:
+          memory: 10M

+ 16 - 0
login_cookie.json

@@ -0,0 +1,16 @@
+{
+    "zzxyxm2024": {
+        "Hm_lpvt_0bf7d2e4ce4104fa77e95b012f750771": "1740537212",
+        "Hm_lvt_0bf7d2e4ce4104fa77e95b012f750771": "1740537094",
+        "b5897e326c6777f3_gr_session_id": "2c9d935c-ecf9-4ecb-9cbe-71a3e47bc1fc",
+        "b5897e326c6777f3_gr_session_id_2c9d935c-ecf9-4ecb-9cbe-71a3e47bc1fc": "true",
+        "gr_user_id": "d230297d-3397-45d1-b5cf-7e4ce93f0c36",
+        "CBL_SESSION": "a8d1fdd39640794b282235f958637f89b8525344-___TS=1776537214279&___ID=6994e797-8e6f-45c8-b503-c59b879f0cdf",
+        "acw_tc": "2760820017405370876417012ec57b187f91a901c74a6d00ac4fe5c907e723",
+        "browser_id": "-1772104107",
+        "pop_status": "1",
+        "b5897e326c6777f3_gr_cs1": "300310393",
+        "b5897e326c6777f3_gr_last_sent_cs1": "300310393",
+        "b5897e326c6777f3_gr_last_sent_sid_with_cs1": "2c9d935c-ecf9-4ecb-9cbe-71a3e47bc1fc"
+    }
+}

BIN
node_modules.zip


+ 15 - 0
requirements.txt

@@ -0,0 +1,15 @@
+elasticsearch==7.10.1
+loguru==0.5.3
+lxml>=4.9.1
+PyYAML==5.4.1
+pymongo==3.12.0
+redis==3.5.3
+requests==2.28.1
+beautifulsoup4==4.9.3
+bs4==0.0.1
+jsbeautifier==1.14.7
+oss2==2.14.0
+urllib3==1.26.13
+parsel==1.7.0
+PyExecJS>=1.5.1
+PySocks

+ 37 - 0
setting.py

@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-28 
+---------
+@summary:  全局配置
+---------
+@author: Dzr
+"""
+import pathlib
+
+ROOT_PATH = pathlib.Path(__file__).absolute().parent
+
+MONGO_IP = "172.17.4.87"
+MONGO_PORT = 27080
+MONGO_DB = "py_spider"
+
+REDIS_URL = "redis://:k5ZJR5KV4q7DRZ92DQ@172.17.162.34:8361/0"
+REDIS_EXPIRE_TIME = 86400 * 30 * 6  # 6个月 = 86400 * 30 * 6
+
+ES_IP = "172.17.4.184"
+ES_PORT = 19905
+ES_USERNAME = "jybid"
+ES_PASSWORD = "Top2023_JEB01i@31"
+ES_INDEX = "bidding_v1"  # es库别名
+
+PROXY_TOKEN = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
+PROXY_API = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
+
+# 详情页采集代理
+PROXIES = {
+    'https': 'socks5://58.221.59.179:8860',
+    'http': 'socks5://58.221.59.179:8860'
+}
+
+# 采集用账密
+ACCOUNT = "zzxyxm2024"
+PASSWORD = "zzxy2025116"

+ 4 - 0
start.sh

@@ -0,0 +1,4 @@
+#!/bin/bash
+
+ps -ef | grep python3 | grep "ybw_details.py" | grep -v grep | awk '{print $2}' | xargs -r kill -9
+nohup python3 ybw_details.py > ybw_details.out 2>&1 &

+ 43 - 0
utils/aliyun.py

@@ -0,0 +1,43 @@
+import oss2
+
+
+# 远程bucket配置
+oss_conf = {
+    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
+    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
+    # "endpoint": "oss-cn-beijing-internal.aliyuncs.com",
+    "endpoint": "oss-cn-beijing.aliyuncs.com",
+    "bucket_name": "jy-datafile"
+}
+
+
+class AliYunService:
+
+    def __init__(self):
+        self.__acc_key_id = oss_conf['key_id']
+        self.__acc_key_secret = oss_conf['key_secret']
+        self.__endpoint = oss_conf['endpoint']
+        self.__bucket_name = oss_conf['bucket_name']
+
+    def push_oss_from_local(self, key, filename):
+        """
+        上传一个本地文件到OSS的普通文件
+
+        :param str key: 上传到OSS的文件名
+        :param str filename: 本地文件名,需要有可读权限
+        """
+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
+        bucket.put_object_from_file(key, filename)
+
+    def push_oss_from_stream(self, key, data):
+        """
+        流式上传oss
+
+        :param str key: 上传到OSS的文件名
+        :param data: 待上传的内容。
+        :type data: bytes,str或file-like object
+        """
+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
+        bucket.put_object(key, data)
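
A short sketch of how AliYunService might be called; the object key and payload below are invented:

from utils.aliyun import AliYunService

oss = AliYunService()
# stream an in-memory attachment to the bucket under a made-up key
oss.push_oss_from_stream("ybw/attachments/demo.html", b"<html>...</html>")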

+ 112 - 0
utils/check_utils.py

@@ -0,0 +1,112 @@
+import re
+
+from utils.execptions import (
+    AccountError,
+    CheckError,
+)
+
+__all__ = ['CheckText', 'CheckTask']
+
+
+class CheckContent:
+
+    def __init__(self):
+        self.sensitive_words = {
+            '正式会员', '账户充值', 'VIP会员查阅', '>(注册)<', '>(登录)<', '高级会员',
+            '标准会员', '点击支付',
+            # '隐私政策及用户服务协议','.*<a href=\"(.*?)\">点击查看内容'
+        }
+
+    @staticmethod
+    def check_text_length(val: str):
+        if len(val) == 0:
+            raise CheckError(code=10101, reason='文本内容为空')
+        elif not re.findall(r'[\u4e00-\u9fa5]+', val, re.S):
+            raise CheckError(code=10102, reason='不存在中文字符')
+        else:
+            '''清洗数字、字母、中文之外的干扰元素'''
+            sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
+            for pattern in sub_pattern:
+                val = re.sub(pattern, '', val)
+            # 若文本长度小于2,表示页面内容无详情内容
+            if len(val) < 2:
+                raise CheckError(code=10102, reason='页面无有效内容')
+
+    @staticmethod
+    def check_content(val: str):
+        if val.count("部分文件可能不支持在线浏览"):
+            raise CheckError(code=10103, reason='文件不支持在线浏览')
+
+    @staticmethod
+    def check_account_privilege(val: str):
+        if val.count("高级会员"):
+            raise AccountError(code=10011, reason='账号权限等级过低')
+        elif "本招标项目仅供正式会员查阅" in val:
+            raise AccountError(code=10012, reason='账号无会员访问权限')
+
+    def check_sensitive_word(self, val: str):
+        total = set()
+        for word in self.sensitive_words:
+            result = re.search(word, val)
+            if result is not None:
+                total.add(word)
+
+        if len(total) > 0:
+            raise CheckError(code=10104, reason='敏感词过滤')
+
+    def __check(self, text):
+        self.check_sensitive_word(text)
+        self.check_text_length(text)
+        self.check_content(text)
+        self.check_account_privilege(text)
+
+    def __call__(self, text: str, *args, **kwargs):
+        self.__check(text)
+
+
+class CheckPrePareRequest:
+
+    def __init__(self):
+        self.crawl_keywords = {
+            '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选', '意见征询',
+            '更正公告', '废标', '补遗', '议价', '邀请', '资格预审', '竞标', '变更', '遴选',
+            '磋商', '项目', '评审', '询比', '开标', '澄清', '比选', '中止', '采购', '竟价',
+            '招投标', '拟建', '成交', '中标', '竞争性谈判', '工程', '验收公告', '更正',
+            '单一来源', '变更公告', '合同', '违规', '评判', '监理', '竞价', '答疑',
+            '终止', '系统'
+        }
+
+    @staticmethod
+    def check_es_cache(title: str, publish_time: int, rows: dict):
+        """
+
+        :param title:  标题
+        :param publish_time: 发布时间的时间戳(l_np_publishtime)
+        :param rows: 采集内容
+        """
+        # retrieved_result = es_search(title, publish_time)
+        retrieved_result = 0
+        if retrieved_result != 0:
+            '''es查询数据结果'''
+            rows['count'] = retrieved_result
+            raise CheckError(code=10105, reason='es已收录标题')
+
+    def check_crawl_title(self, title: str):
+        for keyword in self.crawl_keywords:
+            valid_keyword = re.search(keyword, title)
+            if valid_keyword is not None:
+                break
+        else:
+            raise CheckError(code=10106, reason='标题未检索到采集关键词', title=title)
+
+    def __check(self, rows: dict):
+        title, publish_time = rows['title'], rows['l_np_publishtime']
+        self.check_crawl_title(title)
+        self.check_es_cache(title, publish_time, rows)
+
+    def __call__(self, rows: dict, *args, **kwargs):
+        self.__check(rows)
+
+
+CheckText = CheckContent()
+CheckTask = CheckPrePareRequest()
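
Both module-level checkers raise YbwCrawlError subclasses on failure; a rough sketch with an invented task row:

from utils.check_utils import CheckText, CheckTask
from utils.execptions import AccountError, CheckError

row = {"title": "某单位设备采购招标公告", "l_np_publishtime": 1718582400}  # hypothetical list item
try:
    CheckTask(row)                           # keyword / es-cache checks on the list item
    CheckText("<div>招标公告正文……</div>")     # content checks on the detail html
except (AccountError, CheckError) as e:
    print(e.code, e.reason)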

+ 147 - 0
utils/clean_html.py

@@ -0,0 +1,147 @@
+import re
+__all__ = ['cleaner']
+
+# 独立元素
+INDEPENDENT_TAGS = {
+    '<head>[\s\S]*?</head>': '',
+    '<html>|<html [^>]*>|</html>': '',
+    '<body>|<body [^>]*>|</body>': '',
+    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # 元数据
+    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # 空格
+    '\\xa0|\\u3000': '',  # 空格
+    '<!--[\s\S]*?-->': '',  # 注释
+    '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
+    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+    '<input>': '',  # 输入框
+    '<img[^>]*>': '<br>',  # 图片
+}
+# 行内元素
+INLINE_TAGS = {
+    '<a>|<a [^>]*>|</a>': '',  # 超链接
+    '<link>|<link [^>]*>|</link>': '',  # 超链接
+    '<span>|<span [^>]*>|</span>': '',  # span
+    '<label>|<label [^>]*>|</label>': '<br>',  # label
+    '<font>|<font [^>]*>|</font>': '',  # font
+    'data:image(.*?) ': '',            # 图片base64
+}
+# 块级元素
+BLOCK_TAGS = {
+    '<div>\s*?</div>':'',
+    '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
+    '<p>|<p [^>]*>': '<br>',  # 段落
+    '</p>': '',  # 段落
+    '<div>|<div [^>]*>': '<br>',  # 分割 division
+    '</div>': '',  # 分割 division
+    '<o:p>|<o:p [^>]*>|</o:p>': ''  # OFFICE微软WORD段落
+}
+# 其他
+OTHER = {
+    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
+    '<epointform>': '',
+    '<!doctype html>|<!doctype html [^>]*>': '',
+    '【关闭】|关闭': '',
+    '【打印】|打印本页': '',
+    '【字体:[\s\S]*】': '',
+    '文章来源:[\u4e00-\u9fa5]+': '',
+    '浏览次数:.*[<]+': '',
+    '(责任编辑:.*?)': '',
+    '分享到[:]': '',
+
+}
+# 样式
+CSS_STYLE = {
+    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
+    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
+    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
+    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
+    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
+    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
+
+}
+# 空白符
+BLANKS = {
+    '\n\s*\n': '\n',
+    '\s*\n\s*': '\n',
+    '[^\S\n]': ' ',
+    '\s+': ' ',
+}
+# css标签集合
+TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
+# css属性集合
+ATTRS = {'id', 'class', 'style', 'width'}
+
+
+def _repair_tag():
+    """异常的标签组合,用来替换非标准页面的标签"""
+    _repairs = {}
+    for tag in TAGS:
+        for attr in ATTRS:
+            key = '{}{}'.format(tag, attr)
+            val = '{} {}'.format(tag, attr)
+            _repairs[key] = val
+    return _repairs
+
+
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    # 不显示输入框边框
+    html = html.replace('<input', '<input style="border-color: transparent;"')
+    return html
+
+
+def _lowercase_tag(html):
+    """标签归一化处理(全部小写 + 标签修复)"""
+    tags = re.findall("<[^>]+>", html)
+    tag_sets = set(tags)
+
+    if len(tag_sets) > 10000:
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(html, "lxml")
+        html = str(soup.body.next_element)
+    else:
+        for tag in tag_sets:
+            html = html.replace(tag, str(tag).lower())
+
+    repair_tags = _repair_tag()
+    for err, right in repair_tags.items():
+        html = html.replace(err, right)
+
+    return html
+
+
+def cleaner(html, special=None, completely=False):
+    """
+    数据清洗
+
+    :param html: 清洗的页面
+    :param special: 额外指定页面清洗规则
+    :param completely: 是否完全清洗页面
+    :return: 清洗后的页面源码
+    """
+    if special is None:
+        special = {}
+
+    OTHER.update(special)
+    remove_tags = {
+        **INDEPENDENT_TAGS,
+        **INLINE_TAGS,
+        **BLOCK_TAGS,
+        **OTHER,
+        **CSS_STYLE,
+        **BLANKS,
+    }
+    html = _lowercase_tag(html)
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
+
+    if completely:
+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # 画布
+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
+
+    html = _escape_character(html)
+    return html
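
A quick example of calling cleaner on a fragment; the extra rule passed via special is only illustrative:

from utils.clean_html import cleaner

raw = '<div class="detail"><p style="color:red">正文内容</p><script>alert(1)</script></div>'
print(cleaner(raw, special={r'阅读次数:\d+': ''}))  # scripts, styles and class/style attributes are stripped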

+ 55 - 0
utils/es_query.py

@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-02
+---------
+@summary: es
+---------
+@author: Lzz
+"""
+from elasticsearch import Elasticsearch
+from utils.title_participle import get_should
+
+
+# es:
+#   host: 172.17.4.184
+#   usename: "jybid"
+#   pwd: "Top2023_JEB01i@31"
+#   port: !!int 19905
+#   db: biddingall # es库别名
+
+
+def es_client():
+    cfg = {"host": "172.17.4.184",
+           "port": 19905,
+           "username": "jybid",
+           "pwd": "Top2023_JEB01i@31"}
+    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}], http_auth=(cfg['username'], cfg['pwd']))
+
+
+def es_search(title: str, publish_time: int):
+    """
+    查询es
+
+    :param title: 标题
+    :param publish_time: 发布时间
+    :return:
+    """
+    client = es_client()
+    stime = publish_time - 432000  # 往前推5天
+    etime = publish_time + 432000
+
+    time_limit = {"range": {"publishtime": {"from": stime, "to": etime}}}
+    should_list = get_should(title)   # 对标题进行分词组合query语句
+    # 通过发布标题和发布时间范围查询
+    query = {
+        "query": {
+            "bool": {
+                "must": [time_limit],
+                "should": should_list,
+                "minimum_should_match": "10<80%",
+            }
+        }
+    }
+    result = client.search(index="biddingall", body=query, request_timeout=100)
+    total = int(result['hits']['total']['value'])
+    return total
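
es_search can be exercised roughly like this (the title and timestamp are placeholders, and the ES host in the config must be reachable):

from utils.es_query import es_search

# count near-duplicate titles within ±5 days of the publish timestamp
total = es_search("某某公司2024年服务器采购项目招标公告", 1718582400)
print("hits in es:", total)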

+ 37 - 0
utils/execptions.py

@@ -0,0 +1,37 @@
+class YbwCrawlError(Exception):
+
+    def __init__(self, *args, **kwargs):
+        self.code = kwargs.get('code', 10000)
+        self.reason = kwargs.get('reason', '元博网采集未知错误,请手动处理')
+
+        if 'code' not in kwargs:
+            kwargs['code'] = self.code
+        if 'reason' not in kwargs:
+            kwargs['reason'] = self.reason
+
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+        super(YbwCrawlError, self).__init__(*args, kwargs)
+
+
+class AccountError(YbwCrawlError):
+
+    def __init__(self, reason='账号异常', code=10001, **kwargs):
+        super(AccountError, self).__init__(code=code, reason=reason, **kwargs)
+
+
+class CheckError(YbwCrawlError):
+
+    def __init__(self, reason='数据检查异常', code=10002, **kwargs):
+        super(CheckError, self).__init__(code=code, reason=reason, **kwargs)
+
+
+class CrawlError(YbwCrawlError):
+
+    def __init__(self, reason='数据采集异常', code=10003, **kwargs):
+        super(CrawlError, self).__init__(code=code, reason=reason, **kwargs)
+
+
+class AttachmentError(YbwCrawlError):
+
+    def __init__(self, reason='附件异常', code=10004, **kwargs):
+        super(AttachmentError, self).__init__(code=code, reason=reason, **kwargs)

+ 427 - 0
utils/login.py

@@ -0,0 +1,427 @@
+import io
+import json
+import threading
+import time
+import uuid
+from collections import namedtuple
+from pathlib import Path
+
+import execjs
+import requests
+from requests import Session
+from requests.utils import dict_from_cookiejar
+
+from utils.execptions import CrawlError
+from loguru import logger
+import setting
+
+LOCK = threading.RLock()
+
+_node_modules = (setting.ROOT_PATH / 'node_modules').resolve()
+JSON_LOGIN_COOKIE = (setting.ROOT_PATH / 'login_cookie.json').resolve()
+
+User = namedtuple('User', ['phone', 'passwd'])
+
+
+def _open_file():
+    try:
+        fp = open(JSON_LOGIN_COOKIE, encoding='utf-8')
+    except FileNotFoundError:
+        fp = open(JSON_LOGIN_COOKIE, 'w+', encoding='utf-8')
+    return fp
+
+
+def load_login_cookies(user_name: str):
+    fp = _open_file()
+    try:
+        cookies: dict = json.load(fp).get(user_name)
+        return cookies
+    except json.decoder.JSONDecodeError:
+        return None
+    finally:
+        fp.close()
+
+
+def save_login_cookies(user_name: str, login_cookie: dict):
+    with LOCK:
+        # 文件存在就读取,不存在就创建
+        fp = _open_file()
+        # 内容存在就加载到内存,不存在就设置为空字典
+        try:
+            user_maps: dict = json.load(fp)
+        except json.decoder.JSONDecodeError:
+            user_maps = {}
+        # print(user_maps)
+
+        if user_name not in user_maps:
+            user_maps.setdefault(user_name, login_cookie)
+        else:
+            cookies = {user_name: login_cookie}
+            user_maps.update(cookies)
+
+        wp = open(JSON_LOGIN_COOKIE, 'w+', encoding='utf-8')
+        wp.write(json.dumps(user_maps, indent=4))
+        fp.close()
+        wp.close()
+
+
+def update_login_cookies(user_name: str, update_val: dict):
+    """
+    更新登录 cookie 内容
+
+    Args:
+        user_name: 账号
+        update_val: 需要更新的cookie内容
+
+    """
+    with LOCK:
+        fp = open(JSON_LOGIN_COOKIE, encoding='utf-8')
+        user_maps: dict = json.load(fp)
+        login_cookies: dict = user_maps.get(user_name)
+        if login_cookies is not None and len(update_val) > 0:
+            login_cookies.update(update_val)
+            user_login_info = {user_name: login_cookies}
+            user_maps.update(user_login_info)
+            wp = open(JSON_LOGIN_COOKIE, 'w+', encoding='utf-8')
+            wp.write(json.dumps(user_maps, indent=4))
+            wp.close()
+        fp.close()
+
+
+def convert1(plaintext):
+    """
+    AES CBC模式 Pkcs7填充 加密
+
+    @param plaintext: 加密文本
+    @return:
+    """
+    js_str = '''
+    const CryptoJS = require('crypto-js');
+    function convert1(txt) {
+        var a = '434D643932666D644B454E304E646C616535334D6435666E';
+        a = CryptoJS.enc.Hex.parse(a)
+        b = CryptoJS.enc.Hex.parse("30393138313633304D4D474C435A5059")
+        var enc = CryptoJS.AES.encrypt(txt, a, {
+            iv: b,
+            mode: CryptoJS.mode.CBC,
+            padding: CryptoJS.pad.Pkcs7
+        })
+        return enc.ciphertext.toString()
+    }
+    '''
+    ctx = execjs.compile(js_str, cwd=_node_modules)
+    return ctx.call('convert1', plaintext)
+
+
+def recognition_captcha(image_stream, proxies=None, timeout=None):
+    """
+    验证码识别
+
+    @param image_stream: 验证码图片流
+    @param proxies: 代理
+    @param timeout: 超时时间
+    @return:
+    """
+    url = "http://pycaptcha.spdata.jianyu360.com/v1/images/verify"
+    img_headers = {'accept': 'application/json'}
+    image_file = {'file': image_stream}
+    r = requests.post(url,
+                      headers=img_headers,
+                      files=image_file,
+                      stream=True,
+                      proxies=proxies,
+                      timeout=timeout)
+    json_resp = r.json()
+    if "msg" in json_resp and "success" == json_resp["msg"]:
+        return str(json_resp["r"]["code"]).upper()
+    return None
+
+
+def download_captcha(image, session: Session, save_to_local=False, proxies=None, timeout=None):
+    """下载验证码"""
+    js_str = '''
+        function changeYzmL() {
+            var randomNum = ('000000' + Math.floor(Math.random() * 999999)).slice(-6);
+            var time = new Date();
+            var nowTime = String(time.getFullYear()) + String(time.getMonth() + 1) + String(time.getDate()) + String(
+            time.getHours()) + String(time.getMinutes()) + String(time.getSeconds());
+            return "https://www.chinabidding.cn/cblcn/member.Login/captcha?randomID=" + randomNum + "&t=" + nowTime
+        }
+    '''
+    ctx = execjs.compile(js_str)
+    url = ctx.call('changeYzmL')
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
+        'Referer': 'https://www.chinabidding.cn/public/2020/html/login.html?source=1',
+    }
+    r = session.get(url, headers=headers, stream=True, proxies=proxies, timeout=timeout)
+    stream = io.BytesIO()
+    stream.write(r.content)
+    if save_to_local:
+        with open(image, 'wb') as f:
+            f.write(r.content)
+    logger.info(f'[验证码]下载成功')
+    return stream
+
+
+def captcha(session, phone, proxies=None, timeout=None):
+    """
+    验证码下载与识别
+    @param session: requests.session会话对象
+    @param phone: 验证码图片命名规则
+    @param proxies: 代理
+    @param timeout: 超时时间
+    @return:
+    """
+    name = f'{phone}.jpg'
+    img_stream = download_captcha(name, session, proxies=proxies, timeout=timeout)
+    code = recognition_captcha(img_stream.getvalue(), proxies=proxies, timeout=timeout)
+    logger.info(f'[验证码识别]{code}')
+    return convert1(code)
+
+
+def login_session(phone: str, password: str, proxies=None, timeout=None):
+    """
+    登录会话
+
+    @param phone: 登录手机号
+    @param password: 登录密码
+    @param proxies: 代理
+    @param timeout: 超时时间
+    @return: requests.session()
+    """
+    logger.info('账号登录:{phone}', phone=phone)
+    session = requests.session()
+    # 生成浏览器身份id
+    gr_user_id = uuid.uuid4()
+    gr_session_id = uuid.uuid4()
+    login_ts = int(time.time())
+    session.cookies['gr_user_id'] = str(gr_user_id)
+    session.cookies['b5897e326c6777f3_gr_session_id'] = str(gr_session_id)
+    session.cookies[f'b5897e326c6777f3_gr_session_id_{gr_session_id}'] = 'true'
+    session.cookies['Hm_lvt_0bf7d2e4ce4104fa77e95b012f750771'] = str(login_ts)
+    session.cookies['Hm_lpvt_0bf7d2e4ce4104fa77e95b012f750771'] = str(login_ts)
+
+    # 验证码识别
+    yzm = captcha(session, phone, proxies=proxies, timeout=timeout)
+    '''
+        1、下载验证码操作行为时,会导致服务器记录该次请求的客户端身份id;(服务器保存处理大约需要1s以上)        
+    '''
+    time.sleep(2)
+    now_time_str = '''
+        function t() {
+            var time = new Date();
+            var nowTime = String(time.getFullYear()) + String(time.getMonth() + 1) + String(time.getDate()) + String(
+            time.getHours()) + String(time.getMinutes()) + String(time.getSeconds());
+            return nowTime
+        }
+        '''
+    now_time_ctx = execjs.compile(now_time_str)
+    now_time = now_time_ctx.call('t')
+    data = {
+        'phone': convert1(phone),
+        'password': convert1(password),
+        'yzm': yzm,
+        't': now_time
+    }
+    '''
+        2、提交登录用户信息时,必须保证提交的会话与下载验证码的会话保持一致,否则服务器验证码无法通过验证
+    '''
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
+        'Referer': 'https://www.chinabidding.cn/public/2020/html/login.html?source=1',
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'X-Requested-With': 'XMLHttpRequest',
+    }
+    url = 'https://www.chinabidding.cn/yuan/login/loginnew/login'
+    r = session.post(url, headers=headers, data=data, proxies=proxies, timeout=timeout)
+    assert r.status_code == 200
+    logger.info(f'登录信息: {r.json()}')
+    return r, session
+
+
+def login_session_by_cookies(cookies, url, headers, data=None, proxies=None, timeout=None):
+    """
+    使用cookies获取 login session
+
+    @param dict cookies: 用户登录后的cookies
+    @param str url: 登录检查地址
+    @param dict headers: 请求头
+    @param data: 毫秒级时间戳
+    @param proxies: 代理
+    @param timeout: 超时时间
+    @return: 身份信息和请求结束的响应对象
+    """
+    session = requests.session()
+    r = session.post(url,
+                     headers=headers,
+                     data=data,
+                     cookies=cookies,
+                     proxies=proxies,
+                     timeout=timeout)
+    assert r.status_code == 200
+    return r, session
+
+
+def login_check_and_get_meta(session=None, allow_output_log=True, proxies=None, timeout=None):
+    """
+    检查账号登录状态和获取账号身份数据
+
+    @param Session session: 账号登录后的 session
+    @param allow_output_log: 是否打印日志
+    @param proxies: 代理
+    @param timeout: 超时时间
+    @return: 账号身份数据
+    """
+    url = "https://www.chinabidding.cn/cblcn/Home/logincheckAndGetMeta"
+    payload = f"t={int(round(time.time() * 1000))}"
+    headers = {
+        'Host': 'www.chinabidding.cn',
+        'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
+        'Accept': 'application/json, text/javascript, */*; q=0.01',
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'X-Requested-With': 'XMLHttpRequest',
+        'sec-ch-ua-mobile': '?0',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
+        'sec-ch-ua-platform': '"Windows"',
+        'Origin': 'https://www.chinabidding.cn',
+        'Sec-Fetch-Site': 'same-origin',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Dest': 'empty',
+        'Referer': 'https://www.chinabidding.cn/public/2020/html/login.html?source=1',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+    }
+    r = session.post(url, headers=headers, data=payload, proxies=proxies, timeout=timeout)
+    assert r.status_code == 200
+    member: dict = r.json().get('member')
+    if allow_output_log:
+        logger.info("账号信息:{}", json.dumps(member, indent=4, ensure_ascii=False))
+    return member, r
+
+
+def login_check(account=None, refer=None, allow_output_log=True, proxies=None, timeout=None):
+    """
+    用户身份信息状态检查
+
+    Args:
+        account(str): 用户账号
+        refer: 引用页
+        allow_output_log: 是否打印日志
+        proxies: 代理
+        timeout: 超时时间
+
+    Returns:
+        登录有效时返回False,登录无效时返回True
+    """
+    url = "https://www.chinabidding.cn/cblcn/Home/newLoginCheck"
+    ts = int(time.time())
+    ts2tms = int(round(ts * 1000))
+    payload = f"t={ts2tms}"
+    headers = {
+        'Host': 'www.chinabidding.cn',
+        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
+        'Accept': 'application/json, text/javascript, */*; q=0.01',
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'sec-ch-ua-mobile': '?0',
+        'X-Requested-With': 'XMLHttpRequest',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
+        'sec-ch-ua-platform': '"Windows"',
+        'Origin': 'https://www.chinabidding.cn',
+        'Sec-Fetch-Site': 'same-origin',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Dest': 'empty',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+    }
+    if refer is not None:
+        headers.update({'Referer': refer})
+
+    cookies = load_login_cookies(account)
+    if cookies is None:
+        '''没有该账号的cookies信息,请检查 login_cookie.json 配置文件'''
+        return True
+
+    ts = int(time.time())
+    r, session = login_session_by_cookies(cookies, url, headers, data=payload, proxies=proxies, timeout=timeout)
+
+    try:
+        member = r.json()
+    except json.decoder.JSONDecodeError:
+        raise CrawlError(code=10021, reason="系统繁忙,请等待一会儿,自动刷新。")
+
+    if allow_output_log:
+        logger.info("账号信息:{}", json.dumps(member, indent=4, ensure_ascii=False))
+
+    '''处理本地 cookies'''
+    login_cookies: dict = dict_from_cookiejar(r.cookies)
+    request_ts = dict(
+        # 上一次时间访问时间戳(秒)
+        Hm_lvt_0bf7d2e4ce4104fa77e95b012f750771=cookies.get("Hm_lpvt_0bf7d2e4ce4104fa77e95b012f750771",""),
+        # 当前访问时间戳(秒)
+        Hm_lpvt_0bf7d2e4ce4104fa77e95b012f750771=str(ts)
+    )
+    login_cookies.update(request_ts)
+    update_login_cookies(account, login_cookies)
+    if member is not None and len(member) > 1:
+        '''登录有效'''
+        return False
+    else:
+        '''登录失效'''
+        return True
+
+
+def login_session_check(session, account, allow_output_log=True, proxies=None, timeout=None):
+    """
+    账号登录状态
+
+    @param Session session: 登录后的用户 session
+    @param str account: 账号
+    @param allow_output_log: 是否打印日志
+    @param proxies: 代理
+    @param timeout: 超时时间
+    @return: 身份检查是否有效的布尔值
+    """
+    member, r = login_check_and_get_meta(session,
+                                         allow_output_log,
+                                         proxies=proxies,
+                                         timeout=timeout)
+    if member is not None and len(member) > 0:
+        login_cookies: dict = dict_from_cookiejar(r.cookies)
+        update_login_cookies(account, login_cookies)
+        '''身份有效'''
+        return False
+    else:
+        '''身份无效'''
+        return True
+
+
+def login(phone, password, proxies=None, timeout=None):
+    """
+    登录
+
+    @param str phone: 账号
+    @param str password: 密码
+    @param dict proxies: 代理
+    @param int|tuple timeout: 超时时间
+    @return: 登录会话和网络状态码
+    """
+    r, session = login_session(phone, password, proxies=proxies, timeout=timeout)
+    if r.json()['code'] == 200:
+        member, _ = login_check_and_get_meta(session)
+        login_cookies: dict = dict_from_cookiejar(session.cookies)
+        if member is not None:
+            record_id = str(member['record_id'])
+            login_meta = dict(
+                b5897e326c6777f3_gr_cs1=record_id,
+                b5897e326c6777f3_gr_last_sent_cs1=record_id,
+                b5897e326c6777f3_gr_last_sent_sid_with_cs1=login_cookies['b5897e326c6777f3_gr_session_id']
+            )
+            login_cookies.update(login_meta)
+        save_login_cookies(phone, login_cookies)
+        return session, 200
+    else:
+        '''
+            514 IP限制
+        '''
+        logger.error(f'[登录失败]{r.json()["code"]}-{r.json()["msg"]},账号:{phone}')
+        if r.json()["code"] == 514:
+            time.sleep(300)
+        return requests.session(), int(r.json()["code"])
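
A condensed sketch of the intended login flow (credentials and proxies come from setting.py; retries and error handling are omitted):

import setting
from utils.login import load_login_cookies, login, login_check

# True means the stored cookies are missing or no longer valid
if login_check(setting.ACCOUNT, proxies=setting.PROXIES, timeout=60):
    session, code = login(setting.ACCOUNT, setting.PASSWORD,
                          proxies=setting.PROXIES, timeout=180)
cookies = load_login_cookies(setting.ACCOUNT)  # refreshed cookies for later requests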

+ 50 - 0
utils/title_participle.py

@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-10-10 
+---------
+@summary: 标题分词,组合es查询语句
+---------
+@author: Lzz
+"""
+from requests.auth import HTTPBasicAuth
+import requests
+import json
+
+
+def get_should(title):
+
+    # url = "http://192.168.3.149:9201/_analyze"  # 测试
+    url = "http://172.17.4.184:19905/_analyze"  # 线上
+    username = "jybid"
+    password = "Top2023_JEB01i@31"
+
+    headers = {"Content-Type": "application/json"}
+    auth = HTTPBasicAuth(username, password)
+    data = {
+        "analyzer": "ik_smart",
+        "text": title
+    }
+
+    res = requests.post(url, headers=headers, auth=auth, json=data, timeout=10)
+
+    try:
+        res_text = json.loads(res.text).get('tokens') or [{"token":title}]
+    except Exception:
+        res_text = [{"token":title}]
+
+    should_list = []
+    for key in res_text:
+        single_dict = {
+            "multi_match": {
+                "query": f"{key.get('token')}",
+                "type": "phrase",
+                "fields": [
+                    "title"
+                ]
+            }
+        }
+        should_list.append(single_dict)
+
+    return should_list
+
+

+ 66 - 0
utils/tools.py

@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-02
+---------
+@summary: 工具类
+---------
+@author: Lzz
+"""
+import datetime
+import re
+import time
+
+import bson
+import requests
+from loguru import logger
+import setting
+
+
+def clean_title(title):
+    if title:
+        rule_list = [
+            '\(\d{1,20}\)',
+            '\[[\u4e00-\u9fa5]{1,9}\]',
+            '【[\u4e00-\u9fa5]{1,9}】',
+        ]
+        for rule in rule_list:
+            title = re.sub(rule, '', title)
+
+    return title
+
+
+def get_proxy():
+    url = setting.PROXY_API
+    headers = {"Authorization": setting.PROXY_TOKEN}
+    proxy = requests.get(url, headers=headers, timeout=10).json().get("data")
+    logger.info("切换代理:{}".format(proxy))
+    return proxy
+
+
+def int2long(param: int):
+    """int 转换成 long """
+    return bson.int64.Int64(param)
+
+
+def get_current_date(date_format="%Y-%m-%d %H:%M:%S"):
+    return datetime.datetime.now().strftime(date_format)
+
+
+def date_to_timestamp(date, fmt="%Y-%m-%d %H:%M:%S"):
+    """
+    @summary:
+    ---------
+    @param date:将"2011-09-28 10:00:00"时间格式转化为时间戳
+    @param fmt:时间格式
+    ---------
+    @result: 返回时间戳
+    """
+    if ":" in date:
+        timestamp = time.mktime(time.strptime(date, fmt))
+    else:
+        timestamp = time.mktime(time.strptime(date, "%Y-%m-%d"))
+    return int(timestamp)
+
+
+def get_today_of_day(day_offset=0):
+    return str(datetime.date.today() + datetime.timedelta(days=day_offset))
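
A couple of quick examples of the helpers above (values are illustrative):

import utils.tools as tool

ts = tool.date_to_timestamp("2024-06-17 09:30:00")   # seconds since the epoch
print(tool.int2long(ts), tool.get_today_of_day(-1))  # bson Int64 and yesterday's date string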

+ 5 - 0
ybw_crontab.txt

@@ -0,0 +1,5 @@
+# 元博网 限量采集
+30 9 * * * cd /mnt && python3 ybw_query_list.py > ybw_list.out 2>&1
+0 3 * * * python3 /mnt/ybw_release_account.py
+0 9-16/2 * * 1-5 cd /mnt && ./start.sh
+50 8-18/2 * * * cd /mnt && python3 ybw_esquery.py > esquery.out 2>&1

+ 316 - 0
ybw_details.py

@@ -0,0 +1,316 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-17
+---------
+@summary: 元博网 详情页采集
+---------
+@author: Lzz
+"""
+import random
+import re
+import time
+
+import requests.exceptions
+from loguru import logger
+from lxml.html import fromstring, HtmlElement, tostring
+from lxml.html.clean import Cleaner
+from pymongo import MongoClient
+
+import setting
+import utils.tools as tool
+from dbs.RedisDB import RedisFilter
+from utils.check_utils import CheckText, CheckTask
+from utils.clean_html import cleaner
+from utils.login import User, load_login_cookies, login, login_check
+
+_proxies = setting.PROXIES
+
+
+def iter_node(element: HtmlElement):
+    yield element
+    for sub_element in element:
+        if isinstance(sub_element, HtmlElement):
+            yield from iter_node(sub_element)
+
+
+def pre_parse(element: HtmlElement):
+    """对 HTML 进行预处理可能会破坏 HTML 原有的结构,导致根据原始 HTML 编写的 XPath 不可用"""
+    pre_remove = {
+        'log_col2', 'log_col1', 'cz', 'iconfont closei', 'p2 p1', 'cnxh_b',
+        'msg_error', 'r_gg TB-focus', 'april', 'cont2', 'to_login', 'regtxt',
+        'shouchang an_n sc', 'april_title red', 'cn_lt', 'dayin an_n',
+        'dl_zc vip_t free_member', 'rmbq', 'login-form cl', 'dian_g fr',
+        'di_n', 'd_fx', 'd_tub', 'd_dy', 'anniu1', 'cnxh_list', 'btns cl',
+        'active', 'close', 'd_an fr', 'avatar', 'toolbar', 'deng_l',
+        'cen_right fr', 'log_col5', 'agreement', 'log_col3',
+        'shouchang_af an_n sc_after', 'fast_box', 'di_nr fl', 'xgfj', 'dianh',
+        'cnxh_list tab_b2 city_list', 'contract cl', 'zb_cen_r fr', 'd_zsms',
+        'sc_after active', 'dl_k', 'ewm_b', 'fl', 'wypj', 'rukou', 'p1',
+        'dl_zc', 'success', 'daoh h_30', 'bd', 'april_content', 'print',
+        'foot', 'cnxh zbgg', 'april_first', 'fastlog', 'tx_mc user_name',
+        'tab_h2', 'fanding an_n', 'toux', 'log_col4 cl', 'hangy rem_1', 'red',
+        'regshadow', 'bottom', 'dl_zc vip_t fee_member', 'xszn fl', 'no-print',
+        'cnxh_b zbgg_b', 'rem rem_1', 'logshadowz', 'd_pj fl', 'tjgjc',
+        'spdujaiwlohh', 'di_ewm fr', 'dian_h fl',
+        'tab_h2 zbgg_b_gray', 'fanshou an_n fs', 'login-btn', 'fl gjc',
+        'agreeshadow', 'guang_db', 'footer_1', 'log_p', 'cnxh_list tab_b2',
+        'd_sw', 'april_close', 'd_sc', 'erweima no-print', 'qgzx', 'p2', 'sc',
+        'hd', 'log_col6', 'dh_b', 'dian_guang', 'zhu_c', 'ck cai_k', 'april_box',
+        'display:none'
+    }
+    for node in iter_node(element):
+        id_attr = node.attrib.get('id')
+        class_attr = node.attrib.get('class')
+        style_attr = node.attrib.get('style')
+        if any([id_attr in pre_remove,
+                class_attr in pre_remove,
+                style_attr in pre_remove]):
+            node.drop_tree()
+    return element
+
+
+def page_source(element: HtmlElement):
+    clear = Cleaner(
+        forms=False,
+        style=True
+    )
+    return clear.clean_html(tostring(element, encoding="utf-8").decode())
+
+
+class DetailSpider:
+
+    def __init__(self):
+        _mgo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
+        self.ybw_list = _mgo[setting.MONGO_DB]["ybw_list"]
+        self.ybw_info = _mgo[setting.MONGO_DB]["ybw_info"]
+        self.save_tab = _mgo[setting.MONGO_DB]["data_bak"]
+
+        self.dedup = RedisFilter()
+
+        self.user = User(phone=setting.ACCOUNT, passwd=setting.PASSWORD)
+        self.login_times = 0
+
+    def json_request(self, fid, request_params):
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9",
+            "Connection": "keep-alive",
+            "Referer": "https://www.chinabidding.cn/public/bidagency/index.html",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
+        }
+        url = "https://www.chinabidding.cn/agency.info.Detail/show"
+        params = {
+            "fid": f"{fid}"
+        }
+
+        res = requests.get(url, headers=headers, params=params, **request_params)
+        return res
+
+    def crawl_request(self, item: dict):
+        url = item['competehref']
+        headers = {
+            'Host': 'www.chinabidding.cn',
+            'Upgrade-Insecure-Requests': '1',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+        }
+
+        request_params = {}
+        request_params.setdefault('headers', headers)
+        request_params.setdefault('timeout', 30)
+        request_params.setdefault('proxies', _proxies)
+
+        # 登录代理
+        proxy_params = dict(proxies=_proxies, timeout=180)
+
+        retries = 0
+        retries_502, max_retries_502 = 0, 3
+        while retries < 3:
+            if retries_502 > max_retries_502:
+                # 网站已移除该数据
+                self.ybw_list.update_one({'_id': item["_id"]}, {'$set': {"crawl_status": "remove"}})
+                break
+
+            login_cookies = load_login_cookies(self.user.phone)
+            if login_cookies is None:
+                login(*self.user, **proxy_params)
+                self.login_times += 1
+                continue
+            elif 'cookies' not in request_params:
+                request_params.setdefault('cookies', login_cookies)
+            else:
+                request_params.update({'cookies': login_cookies})
+
+            fid = "".join(re.findall('\?fid=(.*)', url)).split('&')[0]
+            if fid:
+                try:
+                    request_params.pop('headers', None)
+                    r = self.json_request(fid, request_params)
+                    # 账号登录状态检查
+                    retry_login = login_check(self.user.phone, url, False, **proxy_params)
+                    if retry_login:
+                        logger.info(f"[重新登录]{self.user.phone}")
+                        _, code = login(*self.user, **proxy_params)
+                        self.login_times += 1
+                        if code == 200:
+                            retries += 1
+                        else:
+                            time.sleep(600)
+                            retries += 1
+                        continue
+                    logger.info(f'[采集正文] fid_{fid}')
+                    return r
+                except:
+                    retries += 1
+                    continue
+            else:
+                try:
+                    r = requests.get(url, **request_params)
+                    # 账号登录状态检查
+                    retry_login = login_check(self.user.phone, url, False, **proxy_params)
+                    if retry_login:
+                        logger.info(f"[重新登录]{self.user.phone}")
+                        _, code = login(*self.user, **proxy_params)
+                        self.login_times += 1
+                        if code == 200:
+                            retries += 1
+                        else:
+                            time.sleep(1800)
+                            retries += 1
+                        continue
+
+                    element = fromstring(r.text)
+                    nodes = element.xpath('//*[@id="main_dom"]/div[1]')
+                    if len(nodes) != 1:
+                        retries_502 += 1
+                        logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
+                        continue
+                    else:
+                        node = nodes[0]
+                        logger.info(f'[采集正文] id={node.attrib.get("id")}')
+                        return r
+                except requests.RequestException:
+                    retries += 1
+                    continue
+
+        return None
+
+    def crawl_response(self, response, item):
+        if re.match('^\{', response.text):
+            html = response.json().get('c_info').get('content')
+        else:
+            element: HtmlElement = fromstring(response.text)
+            node = element.xpath('//*[@id="infoDescription"]')[0]
+            node = pre_parse(node)
+            features = {
+                './div[@class="ckgys_cont"]',
+                './/div[@class="detail-title ng-scope"]',
+                './/table[@class="detail_Table"]',
+            }
+            for feature in features:
+                extract_node = node.xpath(feature)
+                if len(extract_node) > 0:
+                    valid_node = extract_node[0]
+                    break
+            else:
+                valid_node = node
+
+            html = page_source(valid_node)
+
+        '''检查原始页面内容'''
+        CheckText(html)
+        item["contenthtml"] = html
+        special = {
+            '若附件无法下载,你可以尝试使用360极速浏览器进行下载!': '',
+            # 'DD000E;|EE000F;|FF000E;': '',
+            '[(]?[)]?[A-Z]{2}000[A-Z]{1};[(]?[\d{1,4}]*[;]?[)]?[;]?': '',
+        }
+        item["detail"] = cleaner(html, special)
+        item["comeintime"] = tool.int2long(int(time.time()))
+
+        '''检查清洗之后的详情'''
+        CheckText(item["detail"])
+        insert = {}
+        for key, val in item.items():
+            if key not in ['crawl_status', 'crawl', 'count', '_id']:
+                insert[key] = val
+
+        self.save_tab.insert_one(insert)
+        logger.info('[采集成功]{}-{}'.format(item['title'], item['publishtime']))
+
+    def crawl_spider(self, schedule, item):
+        count = schedule['count']
+        self.login_times = schedule['login_times']
+        if count >= schedule['total'] or self.login_times >= 3:
+            ''' 账号限制 '''
+            logger.warning("账号限制")
+            return '账号限制'
+
+        _id = item["_id"]
+        err = "error"
+        for _ in range(3):
+            try:
+                CheckTask(item)  # 检查请求采集任务
+                response = self.crawl_request(item)
+                if response is not None:
+                    self.crawl_response(response, item)
+                    count += 1
+                    self.ybw_list.update_one({"_id": _id}, {"$set": {"crawl_status": "finished"}})
+                    self.ybw_info.update_one(
+                        {"account": self.user.phone},
+                        {"$set": {
+                            "count": count,
+                            "update_time": tool.get_current_date(),
+                            "login_times": self.login_times
+                        }}
+                    )
+                    return True
+            except Exception as e:
+                err = e
+                logger.error(f"请求错误:{err}")
+
+        self.ybw_list.update_one({'_id': _id}, {'$set': {'crawl_status': f'{err}'}})
+        return False
+
+    def start(self):
+        logger.debug(" *** start ***")
+
+        schedule = self.ybw_info.find_one({"account": self.user.phone})
+        if schedule is None:
+            logger.error(f"数据库无此账号信息|{self.user.phone}")
+            return
+
+        query = {"crawl_status": {"$exists": False}, "es_count": 0}
+        sort = [('publishtime', -1)]
+        limit = 100
+        with self.ybw_list.find(query, sort=sort).limit(limit) as cursor:
+            tasks = [doc for doc in cursor]
+
+        download_count = 0
+        rdm = random.randint(30, 50)
+        for item in tasks:
+            publish_ts = tool.date_to_timestamp(item['publishtime'])
+            if publish_ts > int(time.time()) - 43200:
+                logger.warning("未到采集时间")
+                continue
+
+            fp = 'detail_' + item.get('competehref')
+            if not self.dedup.get(fp):
+                self.dedup.add(fp)
+
+                download_count += 1
+                rst = self.crawl_spider(schedule, item)
+                if not rst or '账号限制' in str(rst):
+                    self.dedup.delete(fp)
+
+                if download_count >= rdm or '账号限制' in str(rst):
+                    break
+
+                time.sleep(random.randint(80, 180))
+
+        logger.debug(" *** end ***")
+
+
+if __name__ == '__main__':
+    DetailSpider().start()

+ 103 - 0
ybw_esquery.py

@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-18
+---------
+@summary: 在发布时间范围内精准检索标题
+---------
+@author: Dzr
+"""
+import datetime
+import warnings
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from elasticsearch import Elasticsearch
+from loguru import logger
+from pymongo import MongoClient
+
+import setting
+import utils.tools as tool
+
+warnings.filterwarnings('ignore')
+
+
+def es_client():
+    cfg = {
+        "host": setting.ES_IP,
+        "port": setting.ES_PORT,
+        "username": setting.ES_USERNAME,
+        "pwd": setting.ES_PASSWORD,
+    }
+    hosts = [{"host": cfg['host'], "port": cfg['port']}]
+    return Elasticsearch(hosts, http_auth=(cfg['username'], cfg['pwd']))
+
+
+def es_search(title, publish_time):
+    """
+    查询es
+
+    :param str title: 标题
+    :param int publish_time: 发布时间
+    :return:
+    """
+    dt = datetime.datetime.fromtimestamp(publish_time)
+    day_0am = datetime.datetime(dt.year, dt.month, dt.day)
+    gte = int(day_0am.timestamp())
+    lt = int(day_0am.timestamp()) + 24 * 3600
+
+    with es_client() as client:
+        body = {
+            "query": {
+                "bool": {
+                    "must": [
+                        {"term": {"title.mtitle": title}},
+                        {"range": {"publishtime": {"gte": gte, "lt": lt}}}
+                    ]
+                }
+            },
+            "from": 0,
+            "size": 10
+        }
+        result = client.search(index=setting.ES_INDEX, body=body, request_timeout=100)
+        return int(result['hits']['total']['value'])
+
+
+def retrieval(item):
+    ts = tool.date_to_timestamp(item["publishtime"], "%Y-%m-%d %H:%M:%S")
+    try:
+        es_count = es_search(title=item["title"], publish_time=ts)
+    except Exception as e:
+        logger.exception(e)
+        es_count = 0
+
+    item["es_count"] = es_count
+    return item
+
+
+def start(threads):
+    logger.info("es数据检索")
+
+    to_mongo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
+    collection = to_mongo[setting.MONGO_DB]["ybw_list"]
+
+    p = {"crawl_status": {"$exists": False}, "es_count": {"$exists": False}}
+    with collection.find(p, no_cursor_timeout=True) as cursor:
+        data_lst = [item for item in cursor]
+
+    data_count = 0
+    with ThreadPoolExecutor(max_workers=threads) as executor:
+        fs = [executor.submit(retrieval, data) for data in data_lst]
+        for f in as_completed(fs):
+            data_count += 1
+            ret = f.result()
+            es_count = ret["es_count"]
+            collection.update_one(
+                {"_id": ret["_id"]},
+                {"$set": {"es_count": es_count}}
+            )
+
+    to_mongo.close()
+    logger.info(f"本次es共检索数据 {data_count} 条")
+
+
+if __name__ == '__main__':
+    start(threads=1)

+ 183 - 0
ybw_query_list.py

@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-17
+---------
+@summary: 元博网 - 列表页信息检索
+---------
+@author: Lzz
+"""
+import math
+import random
+import time
+import warnings
+from collections import namedtuple
+
+import requests
+from loguru import logger
+from pymongo import MongoClient
+
+import setting
+import utils.tools as tool
+from dbs.RedisDB import RedisFilter
+
+warnings.filterwarnings('ignore')
+
+
+class Spider:
+    def __init__(self):
+        _mgo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
+        self.ybw_list = _mgo[setting.MONGO_DB]["ybw_list"]
+
+        self.dedup = RedisFilter()
+
+        self.total = 0
+        self.crawl_page = 1
+        self.areas_dict = {1: '北京', 2: '上海', 3: '天津', 4: '重庆', 5: '河北', 6: '山西', 7: '内蒙古', 8: '辽宁',
+                           9: '吉林', 10: '黑龙江', 11: '江苏', 12: '浙江', 13: '安徽', 14: '福建', 15: '江西',
+                           16: '山东', 17: '河南', 18: '湖北', 19: '湖南', 20: '广东', 21: '广西', 22: '海南',
+                           23: '贵州', 24: '云南', 25: '西藏', 26: '陕西', 27: '四川', 28: '甘肃', 29: '青海',
+                           30: '新疆', 31: '宁夏'}
+
+    def fetch_request(self, page, key, proxies=False):
+        url = "https://www.chinabidding.cn/302e302e7379675f73736f/datax/json/gj_zbcg_daylimit"
+        headers = {
+            "accept": "application/json, text/javascript, */*; q=0.01",
+            "accept-language": "zh-CN,zh;q=0.9",
+            "cache-control": "no-cache",
+            "pragma": "no-cache",
+            "priority": "u=1, i",
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
+            "x-requested-with": "XMLHttpRequest"
+        }
+        params = {
+            "device": "es",
+            "cpcode": "es001",
+            "keywords": f"{key}",
+            "table_type": "4,",
+            "search_type": "CONTEXT",
+            "areaid": "17,",
+            "categoryid": "",
+            "b_date": "week",
+            "time_start": "",
+            "time_end": "",
+            "page": f"{page}",
+            "rp": "30",
+            "usrecord_id": "",
+        }
+        request_params = dict(
+            headers=headers,
+            params=params,
+            proxies=proxies,
+            timeout=60,
+            verify=False
+        )
+        return requests.get(url, **request_params)
+
+    def parse(self, response, query_date):
+        total = response.json().get('result').get('total', 0)
+        self.crawl_page = math.ceil(total / 30)
+
+        results = []
+        info_list = response.json().get('result').get('list', [])
+        for info in info_list:
+            publish_time = info.get('fields').get('publish_date')
+            title = tool.clean_title(info.get('fields').get('title').strip())
+            competehref = info.get('fields').get('url')
+            if "chinabidding" not in competehref:
+                competehref = 'https://www.chinabidding.cn{}'.format(competehref)
+
+            area = self.areas_dict.get(int(info.get('fields').get('area_id')), "全国")
+
+            if title is None:
+                logger.error(f"[标题为空]{competehref}")
+                return
+
+            if not self.dedup.get(competehref) and query_date in publish_time:
+                item = {
+                    "site": "元博网(采购与招标网)",
+                    "channel": "政府采购",
+                    "area": area if area != '跨省' else '全国',
+                    "_d": "comeintime",
+                    "comeintime": tool.int2long(int(time.time())),
+                    "T": "bidding",
+                    "sendflag": "false",
+                    "spidercode": "a_ybwcgyzbw_zfcg",
+                    "city": "",
+                    "infoformat": 1,
+                    "type": "",
+                    "publishdept": "",
+                    "title": title,
+                    "competehref": competehref,
+                    "href": "#",
+                    "publishtime": publish_time,
+                    "l_np_publishtime": tool.int2long(tool.date_to_timestamp(publish_time)),
+                }
+                self.ybw_list.insert_one(item)
+                self.dedup.add(competehref)
+                results.append(item)
+                self.total += 1
+
+        logger.info(
+            f' *** 检索完成:去重 {len(info_list) - len(results)} 条 - 入库 {len(results)} 条 *** <{self.total}>')
+
+    def crawl_list_spider(self, page, key, query_date):
+        retry_times = 0
+        while retry_times < 3:
+            proxies = tool.get_proxy()
+            try:
+                response = self.fetch_request(page=page, key=key, proxies=proxies)
+                response.raise_for_status()  # requests 自检
+                if response is not None and response.status_code == 200:
+                    self.parse(response, query_date)
+                    logger.debug(f"[检索完成] {key}")
+                    time.sleep(random.random())
+                    return
+                else:
+                    retry_times += 1
+                    time.sleep(1)
+            except Exception as e:
+                logger.error(f"采集异常:{e}")
+                retry_times += 1
+                time.sleep(2)
+
+        logger.warning(f"[检索失败] {key}")
+
+    def start(self, query_date):
+        logger.debug("********** 检索开始 **********")
+
+        data_sets = {
+            "中国移动河南分公司",
+            "中国移动通信集团",
+            "中移建设有限公司",
+            "中移铁通有限公司",
+            "中移系统集成有限公司",
+            "中移信息系统集成有限公司",
+            "中移在线服务有限公司",
+            "联通(河南)产业互联网有限公司",
+            "联通数字科技有限公司",
+            "中国联合网络通信",
+            "中国联合网络通信有限公司",
+            "中讯邮电咨询设计院有限公司",
+            "天翼云科技有限公司",
+            "中电信数智科技有限公司",
+            "中国电信股份有限公司",
+            "中国电信集团有限公司",
+            "中国电信数智科技有限公司",
+            "中国联合网络通信有限公司"
+        }
+        for key in data_sets:
+            self.crawl_list_spider(1, key, query_date)
+            if self.crawl_page != 1:
+                for page in range(2, self.crawl_page + 1):
+                    self.crawl_list_spider(page, key, query_date)
+
+        logger.debug("********** 检索结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple(
+        'Menu',
+        ['channel', 'code', 'types', 'rout', 'query_date', 'crawl_page']
+    )
+    query_date = tool.get_today_of_day(-1)
+    Spider().start(query_date)

+ 18 - 0
ybw_release_account.py

@@ -0,0 +1,18 @@
+from pymongo import MongoClient
+
+import setting
+import utils.tools as tool
+
+if __name__ == '__main__':
+    to_mongo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
+    account_coll = to_mongo[setting.MONGO_DB]['ybw_info']
+
+    # 重置 账号信息
+    with account_coll.find() as cursor:
+        for item in cursor:
+            update_date = tool.get_current_date()
+            account_coll.update_one(
+                {"_id": item["_id"]},
+                {"$set": {"count": 0, "login_times": 0, "update_time": update_date}}
+            )
+            print(f" {item['account']} 已更新 < {update_date} >")