dzr 5 months ago
parent
commit
3af84b8d2d
21 changed files with 1761 additions and 0 deletions
  1. dbs/RedisDB.py  +99 -0
  2. dbs/__init__.py  +8 -0
  3. docker-compose.yml  +20 -0
  4. login_cookie.json  +16 -0
  5. node_modules.zip  BIN
  6. requirements.txt  +15 -0
  7. setting.py  +37 -0
  8. start.sh  +4 -0
  9. utils/aliyun.py  +43 -0
  10. utils/check_utils.py  +112 -0
  11. utils/clean_html.py  +147 -0
  12. utils/es_query.py  +55 -0
  13. utils/execptions.py  +37 -0
  14. utils/login.py  +427 -0
  15. utils/title_participle.py  +50 -0
  16. utils/tools.py  +66 -0
  17. ybw_crontab.txt  +5 -0
  18. ybw_details.py  +316 -0
  19. ybw_esquery.py  +103 -0
  20. ybw_query_list.py  +183 -0
  21. ybw_release_account.py  +18 -0

+ 99 - 0
dbs/RedisDB.py

@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-27
+---------
+@summary: redis 去重
+---------
+@author: Lzz
+"""
+import hashlib
+
+import redis
+
+import setting
+
+
+class RedisFilter:
+
+    def __init__(self, url=None, expire_time=None):
+        if not url:
+            url = setting.REDIS_URL
+        self.redis_db = redis.StrictRedis.from_url(url)
+        self._ex = expire_time or setting.REDIS_EXPIRE_TIME
+
+    def __repr__(self):
+        return "<RedisFilter: {}>".format(self.redis_db)
+
+    def exists(self, key):
+        """全量检索"""
+        if self.redis_db.exists(key) > 0:
+            return True
+        return False
+
+    def add(self, keys):
+        """
+        添加数据
+
+        @param keys: 检查关键词在 redis 中是否存在,支持列表批量
+        @return: list / 单个值(添加失败返回False, 添加成功返回True)
+        """
+        is_list = isinstance(keys, list)
+        keys = keys if is_list else [keys]
+
+        is_added = []
+        for key in keys:
+            pkey = "pylist_" + self.fingerprint(key)
+            if not self.exists(pkey):
+                is_added.append(self.redis_db.set(pkey, 1, ex=self._ex))
+            else:
+                is_added.append(False)
+
+        return is_added if is_list else is_added[0]
+
+    def delete(self, keys):
+        is_list = isinstance(keys, list)
+        keys = keys if is_list else [keys]
+
+        for key in keys:
+            pkey = "pylist_" + self.fingerprint(key)
+            self.redis_db.delete(pkey)
+
+        return True
+
+    def get(self, keys):
+        """
+        检查数据是否存在
+        @param keys: list / 单个值
+        @return: list / 单个值 (存在返回True 不存在返回False)
+        """
+        is_list = isinstance(keys, list)
+        keys = keys if is_list else [keys]
+
+        is_exist = []
+        for key in keys:
+            pkey = "pylist_" + self.fingerprint(key)
+            is_exist.append(self.exists(pkey))
+
+        # 判断数据本身是否重复
+        temp_set = set()
+        for i, key in enumerate(keys):
+            if key in temp_set:
+                is_exist[i] = True
+            else:
+                temp_set.add(key)
+
+        return is_exist if is_list else is_exist[0]
+
+    def fingerprint(self, *args):
+        """
+        @summary: 获取唯一的64位值,获取唯一数据指纹
+        ---------
+        @param args: 去重数据集合
+        ---------
+        @result: 5580c91ea29bf5bd963f4c08dfcacd983566e44ecea1735102bc380576fd6f30
+        """
+        args = sorted(args)
+        sha256 = hashlib.sha256()
+        for arg in args:
+            sha256.update(str(arg).encode())
+        return sha256.hexdigest()
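
A minimal usage sketch for the filter above (assuming the REDIS_URL in setting.py is reachable; the key below is made up):

from dbs.RedisDB import RedisFilter

dedup = RedisFilter()                      # uses setting.REDIS_URL and REDIS_EXPIRE_TIME
href = "https://example.com/notice/1"      # hypothetical dedup key
if not dedup.get(href):                    # False -> not seen before
    dedup.add(href)                        # stores "pylist_" + sha256 fingerprint with a TTL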

+ 8 - 0
dbs/__init__.py

@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-02-25 
+---------
+@summary:  
+---------
+@author: Dzr
+"""

+ 20 - 0
docker-compose.yml

@@ -0,0 +1,20 @@
+version: "3"
+services:
+  ybw-spider:
+    image: 172.17.189.142:8081/pyspider/payappcrawl:latest
+    volumes:
+      - /mnt/ybw:/mnt
+    restart: always
+    privileged: true
+    tty: true
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "50M"
+        max-file: "1"
+    deploy:
+      resources:
+        limits:
+          memory: 4G
+        reservations:
+          memory: 10M

+ 16 - 0
login_cookie.json

@@ -0,0 +1,16 @@
+{
+    "zzxyxm2024": {
+        "Hm_lpvt_0bf7d2e4ce4104fa77e95b012f750771": "1740537212",
+        "Hm_lvt_0bf7d2e4ce4104fa77e95b012f750771": "1740537094",
+        "b5897e326c6777f3_gr_session_id": "2c9d935c-ecf9-4ecb-9cbe-71a3e47bc1fc",
+        "b5897e326c6777f3_gr_session_id_2c9d935c-ecf9-4ecb-9cbe-71a3e47bc1fc": "true",
+        "gr_user_id": "d230297d-3397-45d1-b5cf-7e4ce93f0c36",
+        "CBL_SESSION": "a8d1fdd39640794b282235f958637f89b8525344-___TS=1776537214279&___ID=6994e797-8e6f-45c8-b503-c59b879f0cdf",
+        "acw_tc": "2760820017405370876417012ec57b187f91a901c74a6d00ac4fe5c907e723",
+        "browser_id": "-1772104107",
+        "pop_status": "1",
+        "b5897e326c6777f3_gr_cs1": "300310393",
+        "b5897e326c6777f3_gr_last_sent_cs1": "300310393",
+        "b5897e326c6777f3_gr_last_sent_sid_with_cs1": "2c9d935c-ecf9-4ecb-9cbe-71a3e47bc1fc"
+    }
+}

BIN
node_modules.zip


+ 15 - 0
requirements.txt

@@ -0,0 +1,15 @@
+elasticsearch==7.10.1
+loguru==0.5.3
+lxml>=4.9.1
+PyYAML==5.4.1
+pymongo==3.12.0
+redis==3.5.3
+requests==2.28.1
+beautifulsoup4==4.9.3
+bs4==0.0.1
+jsbeautifier==1.14.7
+oss2==2.14.0
+urllib3==1.26.13
+parsel==1.7.0
+PyExecJS>=1.5.1
+PySocks

+ 37 - 0
setting.py

@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-28 
+---------
+@summary:  全局配置
+---------
+@author: Dzr
+"""
+import pathlib
+
+ROOT_PATH = pathlib.Path(__file__).absolute().parent
+
+MONGO_IP = "172.17.4.87"
+MONGO_PORT = 27080
+MONGO_DB = "py_spider"
+
+REDIS_URL = "redis://:k5ZJR5KV4q7DRZ92DQ@172.17.162.34:8361/0"
+REDIS_EXPIRE_TIME = 86400 * 30 * 6  # 6个月 = 86400 * 30 * 6
+
+ES_IP = "172.17.4.184"
+ES_PORT = 19905
+ES_USERNAME = "jybid"
+ES_PASSWORD = "Top2023_JEB01i@31"
+ES_INDEX = "bidding_v1"  # es库别名
+
+PROXY_TOKEN = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
+PROXY_API = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
+
+# 详情页采集代理
+PROXIES = {
+    'https': 'socks5://58.221.59.179:8860',
+    'http': 'socks5://58.221.59.179:8860'
+}
+
+# 采集用账密
+ACCOUNT = "zzxyxm2024"
+PASSWORD = "zzxy2025116"

+ 4 - 0
start.sh

@@ -0,0 +1,4 @@
+#!/bin/bash
+
+ps -ef | grep python3 | grep "ybw_details.py" | grep -v grep | awk '{print $2}' | xargs -r kill -9
+nohup python3 ybw_details.py > ybw_details.out 2>&1 &

+ 43 - 0
utils/aliyun.py

@@ -0,0 +1,43 @@
+import oss2
+
+
+# 远程bucket配置
+oss_conf = {
+    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
+    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
+    # "endpoint": "oss-cn-beijing-internal.aliyuncs.com",
+    "endpoint": "oss-cn-beijing.aliyuncs.com",
+    "bucket_name": "jy-datafile"
+}
+
+
+class AliYunService:
+
+    def __init__(self):
+        self.__acc_key_id = oss_conf['key_id']
+        self.__acc_key_secret = oss_conf['key_secret']
+        self.__endpoint = oss_conf['endpoint']
+        self.__bucket_name = oss_conf['bucket_name']
+
+    def push_oss_from_local(self, key, filename):
+        """
+        上传一个本地文件到OSS的普通文件
+
+        :param str key: 上传到OSS的文件名
+        :param str filename: 本地文件名,需要有可读权限
+        """
+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
+        bucket.put_object_from_file(key, filename)
+
+    def push_oss_from_stream(self, key, data):
+        """
+        流式上传oss
+
+        :param str key: 上传到OSS的文件名
+        :param data: 待上传的内容。
+        :type data: bytes,str或file-like object
+        """
+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
+        bucket.put_object(key, data)
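
A short sketch of how AliYunService might be called; the object key and payload below are invented:

from utils.aliyun import AliYunService

oss = AliYunService()
# stream an in-memory attachment to the bucket under a made-up key
oss.push_oss_from_stream("ybw/attachments/demo.html", b"<html>...</html>")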

+ 112 - 0
utils/check_utils.py

@@ -0,0 +1,112 @@
+import re
+
+from utils.execptions import (
+    AccountError,
+    CheckError,
+)
+
+__all__ = ['CheckText', 'CheckTask']
+
+
+class CheckContent:
+
+    def __init__(self):
+        self.sensitive_words = {
+            '正式会员', '账户充值', 'VIP会员查阅', '>(注册)<', '>(登录)<', '高级会员',
+            '标准会员', '点击支付',
+            # '隐私政策及用户服务协议','.*<a href=\"(.*?)\">点击查看内容'
+        }
+
+    @staticmethod
+    def check_text_length(val: str):
+        if len(val) == 0:
+            raise CheckError(code=10101, reason='文本内容为空')
+        elif not re.findall(r'[\u4e00-\u9fa5]+', val, re.S):
+            raise CheckError(code=10102, reason='不存在中文字符')
+        else:
+            '''清洗数字、字母、中文之外的干扰元素'''
+            sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
+            for pattern in sub_pattern:
+                val = re.sub(pattern, '', val)
+            # 若文本长度小于2,表示页面内容无详情内容
+            if len(val) < 2:
+                raise CheckError(code=10102, reason='页面无有效内容')
+
+    @staticmethod
+    def check_content(val: str):
+        if val.count("部分文件可能不支持在线浏览"):
+            raise CheckError(code=10103, reason='文件不支持在线浏览')
+
+    @staticmethod
+    def check_account_privilege(val: str):
+        if val.count("高级会员"):
+            raise AccountError(code=10011, reason='账号权限等级过低')
+        elif "本招标项目仅供正式会员查阅" in val:
+            raise AccountError(code=10012, reason='账号无会员访问权限')
+
+    def check_sensitive_word(self, val: str):
+        total = set()
+        for word in self.sensitive_words:
+            result = re.search(word, val)
+            if result is not None:
+                total.add(word)
+
+        if len(total) > 0:
+            raise CheckError(code=10104, reason='敏感词过滤')
+
+    def __check(self, text):
+        self.check_sensitive_word(text)
+        self.check_text_length(text)
+        self.check_content(text)
+        self.check_account_privilege(text)
+
+    def __call__(self, text: str, *args, **kwargs):
+        self.__check(text)
+
+
+class CheckPrePareRequest:
+
+    def __init__(self):
+        self.crawl_keywords = {
+            '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选', '意见征询',
+            '更正公告', '废标', '补遗', '议价', '邀请', '资格预审', '竞标', '变更', '遴选',
+            '磋商', '项目', '评审', '询比', '开标', '澄清', '比选', '中止', '采购', '竟价',
+            '招投标', '拟建', '成交', '中标', '竞争性谈判', '工程', '验收公告', '更正',
+            '单一来源', '变更公告', '合同', '违规', '评判', '监理', '竞价', '答疑',
+            '终止', '系统'
+        }
+
+    @staticmethod
+    def check_es_cache(title: str, publish_time: int, rows: dict):
+        """
+
+        :param title:  标题
+        :param publish_time: 发布时间的时间戳(l_np_publishtime)
+        :param rows: 采集内容
+        """
+        # retrieved_result = es_search(title, publish_time)
+        retrieved_result = 0
+        if retrieved_result != 0:
+            '''es查询数据结果'''
+            rows['count'] = retrieved_result
+            raise CheckError(code=10105, reason='es已收录标题')
+
+    def check_crawl_title(self, title: str):
+        for keyword in self.crawl_keywords:
+            valid_keyword = re.search(keyword, title)
+            if valid_keyword is not None:
+                break
+        else:
+            raise CheckError(code=10106, reason='标题未检索到采集关键词', title=title)
+
+    def __check(self, rows: dict):
+        title, publish_time = rows['title'], rows['l_np_publishtime']
+        self.check_crawl_title(title)
+        self.check_es_cache(title, publish_time, rows)
+
+    def __call__(self, rows: dict, *args, **kwargs):
+        self.__check(rows)
+
+
+CheckText = CheckContent()
+CheckTask = CheckPrePareRequest()
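
Both module-level checkers raise YbwCrawlError subclasses on failure; a rough sketch with an invented task row:

from utils.check_utils import CheckText, CheckTask
from utils.execptions import AccountError, CheckError

row = {"title": "某单位设备采购招标公告", "l_np_publishtime": 1718582400}  # hypothetical list item
try:
    CheckTask(row)                           # keyword / es-cache checks on the list item
    CheckText("<div>招标公告正文……</div>")     # content checks on the detail html
except (AccountError, CheckError) as e:
    print(e.code, e.reason)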

+ 147 - 0
utils/clean_html.py

@@ -0,0 +1,147 @@
+import re
+__all__ = ['cleaner']
+
+# 独立元素
+INDEPENDENT_TAGS = {
+    '<head>[\s\S]*?</head>': '',
+    '<html>|<html [^>]*>|</html>': '',
+    '<body>|<body [^>]*>|</body>': '',
+    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # 元数据
+    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # 空格
+    '\\xa0|\\u3000': '',  # 空格
+    '<!--[\s\S]*?-->': '',  # 注释
+    '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
+    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+    '<input>': '',  # 输入框
+    '<img[^>]*>': '<br>',  # 图片
+}
+# 行内元素
+INLINE_TAGS = {
+    '<a>|<a [^>]*>|</a>': '',  # 超链接
+    '<link>|<link [^>]*>|</link>': '',  # 超链接
+    '<span>|<span [^>]*>|</span>': '',  # span
+    '<label>|<label [^>]*>|</label>': '<br>',  # label
+    '<font>|<font [^>]*>|</font>': '',  # font
+    'data:image(.*?) ': '',            # 图片base64
+}
+# 块级元素
+BLOCK_TAGS = {
+    '<div>\s*?</div>':'',
+    '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
+    '<p>|<p [^>]*>': '<br>',  # 段落
+    '</p>': '',  # 段落
+    '<div>|<div [^>]*>': '<br>',  # 分割 division
+    '</div>': '',  # 分割 division
+    '<o:p>|<o:p [^>]*>|</o:p>': ''  # OFFICE微软WORD段落
+}
+# 其他
+OTHER = {
+    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
+    '<epointform>': '',
+    '<!doctype html>|<!doctype html [^>]*>': '',
+    '【关闭】|关闭': '',
+    '【打印】|打印本页': '',
+    '【字体:[\s\S]*】': '',
+    '文章来源:[\u4e00-\u9fa5]+': '',
+    '浏览次数:.*[<]+': '',
+    '(责任编辑:.*?)': '',
+    '分享到[:]': '',
+
+}
+# 样式
+CSS_STYLE = {
+    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
+    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
+    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
+    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
+    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
+    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
+
+}
+# 空白符
+BLANKS = {
+    '\n\s*\n': '\n',
+    '\s*\n\s*': '\n',
+    '[^\S\n]': ' ',
+    '\s+': ' ',
+}
+# css标签集合
+TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
+# css属性集合
+ATTRS = {'id', 'class', 'style', 'width'}
+
+
+def _repair_tag():
+    """异常的标签组合,用来替换非标准页面的标签"""
+    _repairs = {}
+    for tag in TAGS:
+        for attr in ATTRS:
+            key = '{}{}'.format(tag, attr)
+            val = '{} {}'.format(tag, attr)
+            _repairs[key] = val
+    return _repairs
+
+
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    # 不显示输入框边框
+    html = html.replace('<input', '<input style="border-color: transparent;"')
+    return html
+
+
+def _lowercase_tag(html):
+    """标签归一化处理(全部小写 + 标签修复)"""
+    tags = re.findall("<[^>]+>", html)
+    tag_sets = set(tags)
+
+    if len(tag_sets) > 10000:
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(html, "lxml")
+        html = str(soup.body.next_element)
+    else:
+        for tag in tag_sets:
+            html = html.replace(tag, str(tag).lower())
+
+    repair_tags = _repair_tag()
+    for err, right in repair_tags.items():
+        html = html.replace(err, right)
+
+    return html
+
+
+def cleaner(html, special=None, completely=False):
+    """
+    数据清洗
+
+    :param html: 清洗的页面
+    :param special: 额外指定页面清洗规则
+    :param completely: 是否完全清洗页面
+    :return: 清洗后的页面源码
+    """
+    if special is None:
+        special = {}
+
+    OTHER.update(special)
+    remove_tags = {
+        **INDEPENDENT_TAGS,
+        **INLINE_TAGS,
+        **BLOCK_TAGS,
+        **OTHER,
+        **CSS_STYLE,
+        **BLANKS,
+    }
+    html = _lowercase_tag(html)
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
+
+    if completely:
+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # 画布
+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
+
+    html = _escape_character(html)
+    return html
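
A quick example of calling cleaner on a fragment; the extra rule passed via special is only illustrative:

from utils.clean_html import cleaner

raw = '<div class="detail"><p style="color:red">正文内容</p><script>alert(1)</script></div>'
print(cleaner(raw, special={r'阅读次数:\d+': ''}))  # scripts, styles and class/style attributes are stripped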

+ 55 - 0
utils/es_query.py

@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-02
+---------
+@summary: es
+---------
+@author: Lzz
+"""
+from elasticsearch import Elasticsearch
+from utils.title_participle import get_should
+
+
+# es:
+#   host: 172.17.4.184
+#   usename: "jybid"
+#   pwd: "Top2023_JEB01i@31"
+#   port: !!int 19905
+#   db: biddingall # es库别名
+
+
+def es_client():
+    cfg = {"host": "172.17.4.184",
+           "port": 19905,
+           "username": "jybid",
+           "pwd": "Top2023_JEB01i@31"}
+    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}], http_auth=(cfg['username'], cfg['pwd']))
+
+
+def es_search(title: str, publish_time: int):
+    """
+    查询es
+
+    :param title: 标题
+    :param publish_time: 发布时间
+    :return:
+    """
+    client = es_client()
+    stime = publish_time - 432000  # 往前推5天
+    etime = publish_time + 432000
+
+    time_limit = {"range": {"publishtime": {"from": stime, "to": etime}}}
+    should_list = get_should(title)   # 对标题进行分词组合query语句
+    # 通过发布标题和发布时间范围查询
+    query = {
+        "query": {
+            "bool": {
+                "must": [time_limit],
+                "should": should_list,
+                "minimum_should_match": "10<80%",
+            }
+        }
+    }
+    result = client.search(index="biddingall", body=query, request_timeout=100)
+    total = int(result['hits']['total']['value'])
+    return total
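
es_search can be exercised roughly like this (the title and timestamp are placeholders, and the ES host in the config must be reachable):

from utils.es_query import es_search

# count near-duplicate titles within ±5 days of the publish timestamp
total = es_search("某某公司2024年服务器采购项目招标公告", 1718582400)
print("hits in es:", total)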

+ 37 - 0
utils/execptions.py

@@ -0,0 +1,37 @@
+class YbwCrawlError(Exception):
+
+    def __init__(self, *args, **kwargs):
+        self.code = kwargs.get('code', 10000)
+        self.reason = kwargs.get('reason', '元博网采集未知错误,请手动处理')
+
+        if 'code' not in kwargs:
+            kwargs['code'] = self.code
+        if 'reason' not in kwargs:
+            kwargs['reason'] = self.reason
+
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+        super(YbwCrawlError, self).__init__(*args, kwargs)
+
+
+class AccountError(YbwCrawlError):
+
+    def __init__(self, reason='账号异常', code=10001, **kwargs):
+        super(AccountError, self).__init__(code=code, reason=reason, **kwargs)
+
+
+class CheckError(YbwCrawlError):
+
+    def __init__(self, reason='数据检查异常', code=10002, **kwargs):
+        super(CheckError, self).__init__(code=code, reason=reason, **kwargs)
+
+
+class CrawlError(YbwCrawlError):
+
+    def __init__(self, reason='数据采集异常', code=10003, **kwargs):
+        super(CrawlError, self).__init__(code=code, reason=reason, **kwargs)
+
+
+class AttachmentError(YbwCrawlError):
+
+    def __init__(self, reason='附件异常', code=10004, **kwargs):
+        super(AttachmentError, self).__init__(code=code, reason=reason, **kwargs)

+ 427 - 0
utils/login.py

@@ -0,0 +1,427 @@
+import io
+import json
+import threading
+import time
+import uuid
+from collections import namedtuple
+from pathlib import Path
+
+import execjs
+import requests
+from requests import Session
+from requests.utils import dict_from_cookiejar
+
+from utils.execptions import CrawlError
+from loguru import logger
+import setting
+
+LOCK = threading.RLock()
+
+_node_modules = (setting.ROOT_PATH / 'node_modules').resolve()
+JSON_LOGIN_COOKIE = (setting.ROOT_PATH / 'login_cookie.json').resolve()
+
+User = namedtuple('User', ['phone', 'passwd'])
+
+
+def _open_file():
+    try:
+        fp = open(JSON_LOGIN_COOKIE, encoding='utf-8')
+    except FileNotFoundError:
+        fp = open(JSON_LOGIN_COOKIE, 'w+', encoding='utf-8')
+    return fp
+
+
+def load_login_cookies(user_name: str):
+    fp = _open_file()
+    try:
+        cookies: dict = json.load(fp).get(user_name)
+        return cookies
+    except json.decoder.JSONDecodeError:
+        return None
+    finally:
+        fp.close()
+
+
+def save_login_cookies(user_name: str, login_cookie: dict):
+    with LOCK:
+        # 文件存在就读取,不存在就创建
+        fp = _open_file()
+        # 内容存在就加载到内存,不存在就设置为空字典
+        try:
+            user_maps: dict = json.load(fp)
+        except json.decoder.JSONDecodeError:
+            user_maps = {}
+        # print(user_maps)
+
+        if user_name not in user_maps:
+            user_maps.setdefault(user_name, login_cookie)
+        else:
+            cookies = {user_name: login_cookie}
+            user_maps.update(cookies)
+
+        wp = open(JSON_LOGIN_COOKIE, 'w+', encoding='utf-8')
+        wp.write(json.dumps(user_maps, indent=4))
+        fp.close()
+        wp.close()
+
+
+def update_login_cookies(user_name: str, update_val: dict):
+    """
+    更新登录 cookie 内容
+
+    Args:
+        user_name: 账号
+        update_val: 需要更新的cookie内容
+
+    """
+    with LOCK:
+        fp = open(JSON_LOGIN_COOKIE, encoding='utf-8')
+        user_maps: dict = json.load(fp)
+        login_cookies: dict = user_maps.get(user_name)
+        if login_cookies is not None and len(update_val) > 0:
+            login_cookies.update(update_val)
+            user_login_info = {user_name: login_cookies}
+            user_maps.update(user_login_info)
+            wp = open(JSON_LOGIN_COOKIE, 'w+', encoding='utf-8')
+            wp.write(json.dumps(user_maps, indent=4))
+            wp.close()
+        fp.close()
+
+
+def convert1(plaintext):
+    """
+    AES CBC模式 Pkcs7填充 加密
+
+    @param plaintext: 加密文本
+    @return:
+    """
+    js_str = '''
+    const CryptoJS = require('crypto-js');
+    function convert1(txt) {
+        var a = '434D643932666D644B454E304E646C616535334D6435666E';
+        a = CryptoJS.enc.Hex.parse(a)
+        b = CryptoJS.enc.Hex.parse("30393138313633304D4D474C435A5059")
+        var enc = CryptoJS.AES.encrypt(txt, a, {
+            iv: b,
+            mode: CryptoJS.mode.CBC,
+            padding: CryptoJS.pad.Pkcs7
+        })
+        return enc.ciphertext.toString()
+    }
+    '''
+    ctx = execjs.compile(js_str, cwd=_node_modules)
+    return ctx.call('convert1', plaintext)
+
+
+def recognition_captcha(image_stream, proxies=None, timeout=None):
+    """
+    验证码识别
+
+    @param image_stream: 验证码图片流
+    @param proxies: 代理
+    @param timeout: 超时时间
+    @return:
+    """
+    url = "http://pycaptcha.spdata.jianyu360.com/v1/images/verify"
+    img_headers = {'accept': 'application/json'}
+    image_file = {'file': image_stream}
+    r = requests.post(url,
+                      headers=img_headers,
+                      files=image_file,
+                      stream=True,
+                      proxies=proxies,
+                      timeout=timeout)
+    json_resp = r.json()
+    if "msg" in json_resp and "success" == json_resp["msg"]:
+        return str(json_resp["r"]["code"]).upper()
+    return None
+
+
+def download_captcha(image, session: Session, save_to_local=False, proxies=None, timeout=None):
+    """下载验证码"""
+    js_str = '''
+        function changeYzmL() {
+            var randomNum = ('000000' + Math.floor(Math.random() * 999999)).slice(-6);
+            var time = new Date();
+            var nowTime = String(time.getFullYear()) + String(time.getMonth() + 1) + String(time.getDate()) + String(
+            time.getHours()) + String(time.getMinutes()) + String(time.getSeconds());
+            return "https://www.chinabidding.cn/cblcn/member.Login/captcha?randomID=" + randomNum + "&t=" + nowTime
+        }
+    '''
+    ctx = execjs.compile(js_str)
+    url = ctx.call('changeYzmL')
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
+        'Referer': 'https://www.chinabidding.cn/public/2020/html/login.html?source=1',
+    }
+    r = session.get(url, headers=headers, stream=True, proxies=proxies, timeout=timeout)
+    stream = io.BytesIO()
+    stream.write(r.content)
+    if save_to_local:
+        with open(image, 'wb') as f:
+            f.write(r.content)
+    logger.info(f'[验证码]下载成功')
+    return stream
+
+
+def captcha(session, phone, proxies=None, timeout=None):
+    """
+    验证码下载与识别
+    @param session: requests.session会话对象
+    @param phone: 验证码图片命名规则
+    @param proxies: 代理
+    @param timeout: 超时时间
+    @return:
+    """
+    name = f'{phone}.jpg'
+    img_stream = download_captcha(name, session, proxies=proxies, timeout=timeout)
+    code = recognition_captcha(img_stream.getvalue(), proxies=proxies, timeout=timeout)
+    logger.info(f'[验证码识别]{code}')
+    return convert1(code)
+
+
+def login_session(phone: str, password: str, proxies=None, timeout=None):
+    """
+    登录会话
+
+    @param phone: 登录手机号
+    @param password: 登录密码
+    @param proxies: 代理
+    @param timeout: 超时时间
+    @return: requests.session()
+    """
+    logger.info('账号登录:{phone}', phone=phone)
+    session = requests.session()
+    # 生成浏览器身份id
+    gr_user_id = uuid.uuid4()
+    gr_session_id = uuid.uuid4()
+    login_ts = int(time.time())
+    session.cookies['gr_user_id'] = str(gr_user_id)
+    session.cookies['b5897e326c6777f3_gr_session_id'] = str(gr_session_id)
+    session.cookies[f'b5897e326c6777f3_gr_session_id_{gr_session_id}'] = 'true'
+    session.cookies['Hm_lvt_0bf7d2e4ce4104fa77e95b012f750771'] = str(login_ts)
+    session.cookies['Hm_lpvt_0bf7d2e4ce4104fa77e95b012f750771'] = str(login_ts)
+
+    # 验证码识别
+    yzm = captcha(session, phone, proxies=proxies, timeout=timeout)
+    '''
+        1、下载验证码操作行为时,会导致服务器记录该次请求的客户端身份id;(服务器保存处理大约需要1s以上)        
+    '''
+    time.sleep(2)
+    now_time_str = '''
+        function t() {
+            var time = new Date();
+            var nowTime = String(time.getFullYear()) + String(time.getMonth() + 1) + String(time.getDate()) + String(
+            time.getHours()) + String(time.getMinutes()) + String(time.getSeconds());
+            return nowTime
+        }
+        '''
+    now_time_ctx = execjs.compile(now_time_str)
+    now_time = now_time_ctx.call('t')
+    data = {
+        'phone': convert1(phone),
+        'password': convert1(password),
+        'yzm': yzm,
+        't': now_time
+    }
+    '''
+        2、提交登录用户信息时,必须保证提交的会话与下载验证码的会话保持一致,否则服务器验证码无法通过验证
+    '''
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
+        'Referer': 'https://www.chinabidding.cn/public/2020/html/login.html?source=1',
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'X-Requested-With': 'XMLHttpRequest',
+    }
+    url = 'https://www.chinabidding.cn/yuan/login/loginnew/login'
+    r = session.post(url, headers=headers, data=data, proxies=proxies, timeout=timeout)
+    assert r.status_code == 200
+    logger.info(f'登录信息: {r.json()}')
+    return r, session
+
+
+def login_session_by_cookies(cookies, url, headers, data=None, proxies=None, timeout=None):
+    """
+    使用cookies获取 login session
+
+    @param dict cookies: 用户登录后的cookies
+    @param str url: 登录检查地址
+    @param dict headers: 请求头
+    @param data: 毫秒级时间戳
+    @param proxies: 代理
+    @param timeout: 超时时间
+    @return: 身份信息和请求结束的响应对象
+    """
+    session = requests.session()
+    r = session.post(url,
+                     headers=headers,
+                     data=data,
+                     cookies=cookies,
+                     proxies=proxies,
+                     timeout=timeout)
+    assert r.status_code == 200
+    return r, session
+
+
+def login_check_and_get_meta(session=None, allow_output_log=True, proxies=None, timeout=None):
+    """
+    检查账号登录状态和获取账号身份数据
+
+    @param Session session: 账号登录后的 session
+    @param allow_output_log: 是否打印日志
+    @param proxies: 代理
+    @param timeout: 超时时间
+    @return: 账号身份数据
+    """
+    url = "https://www.chinabidding.cn/cblcn/Home/logincheckAndGetMeta"
+    payload = f"t={int(round(time.time() * 1000))}"
+    headers = {
+        'Host': 'www.chinabidding.cn',
+        'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
+        'Accept': 'application/json, text/javascript, */*; q=0.01',
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'X-Requested-With': 'XMLHttpRequest',
+        'sec-ch-ua-mobile': '?0',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
+        'sec-ch-ua-platform': '"Windows"',
+        'Origin': 'https://www.chinabidding.cn',
+        'Sec-Fetch-Site': 'same-origin',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Dest': 'empty',
+        'Referer': 'https://www.chinabidding.cn/public/2020/html/login.html?source=1',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+    }
+    r = session.post(url, headers=headers, data=payload, proxies=proxies, timeout=timeout)
+    assert r.status_code == 200
+    member: dict = r.json().get('member')
+    if allow_output_log:
+        logger.info("账号信息:{}", json.dumps(member, indent=4, ensure_ascii=False))
+    return member, r
+
+
+def login_check(account=None, refer=None, allow_output_log=True, proxies=None, timeout=None):
+    """
+    用户身份信息状态检查
+
+    Args:
+        account(str): 用户账号
+        refer: 引用页
+        allow_output_log: 是否打印日志
+        proxies: 代理
+        timeout: 超时时间
+
+    Returns:
+        登录有效时返回False,登录无效时返回True
+    """
+    url = "https://www.chinabidding.cn/cblcn/Home/newLoginCheck"
+    ts = int(time.time())
+    ts2tms = int(round(ts * 1000))
+    payload = f"t={ts2tms}"
+    headers = {
+        'Host': 'www.chinabidding.cn',
+        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
+        'Accept': 'application/json, text/javascript, */*; q=0.01',
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'sec-ch-ua-mobile': '?0',
+        'X-Requested-With': 'XMLHttpRequest',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
+        'sec-ch-ua-platform': '"Windows"',
+        'Origin': 'https://www.chinabidding.cn',
+        'Sec-Fetch-Site': 'same-origin',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Dest': 'empty',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+    }
+    if refer is not None:
+        headers.update({'Referer': refer})
+
+    cookies = load_login_cookies(account)
+    if cookies is None:
+        '''没有该账号的cookies信息,请检查 login_cookie.json 配置文件'''
+        return True
+
+    ts = int(time.time())
+    r, session = login_session_by_cookies(cookies, url, headers, data=payload, proxies=proxies, timeout=timeout)
+
+    try:
+        member = r.json()
+    except json.decoder.JSONDecodeError:
+        raise CrawlError(code=10021, reason="系统繁忙,请等待一会儿,自动刷新。")
+
+    if allow_output_log:
+        logger.info("账号信息:{}", json.dumps(member, indent=4, ensure_ascii=False))
+
+    '''处理本地 cookies'''
+    login_cookies: dict = dict_from_cookiejar(r.cookies)
+    request_ts = dict(
+        # 上一次时间访问时间戳(秒)
+        Hm_lvt_0bf7d2e4ce4104fa77e95b012f750771=cookies.get("Hm_lpvt_0bf7d2e4ce4104fa77e95b012f750771",""),
+        # 当前访问时间戳(秒)
+        Hm_lpvt_0bf7d2e4ce4104fa77e95b012f750771=str(ts)
+    )
+    login_cookies.update(request_ts)
+    update_login_cookies(account, login_cookies)
+    if member is not None and len(member) > 1:
+        '''登录有效'''
+        return False
+    else:
+        '''登录失效'''
+        return True
+
+
+def login_session_check(session, account, allow_output_log=True, proxies=None, timeout=None):
+    """
+    账号登录状态
+
+    @param Session session: 登录后的用户 session
+    @param str account: 账号
+    @param allow_output_log: 是否打印日志
+    @param proxies: 代理
+    @param timeout: 超时时间
+    @return: 身份检查是否有效的布尔值
+    """
+    member, r = login_check_and_get_meta(session,
+                                         allow_output_log,
+                                         proxies=proxies,
+                                         timeout=timeout)
+    if member is not None and len(member) > 0:
+        login_cookies: dict = dict_from_cookiejar(r.cookies)
+        update_login_cookies(account, login_cookies)
+        '''身份有效'''
+        return False
+    else:
+        '''身份无效'''
+        return True
+
+
+def login(phone, password, proxies=None, timeout=None):
+    """
+    登录
+
+    @param str phone: 账号
+    @param str password: 密码
+    @param dict proxies: 代理
+    @param int|tuple timeout: 超时时间
+    @return: 登录会话和网络状态码
+    """
+    r, session = login_session(phone, password, proxies=proxies, timeout=timeout)
+    if r.json()['code'] == 200:
+        member, _ = login_check_and_get_meta(session)
+        login_cookies: dict = dict_from_cookiejar(session.cookies)
+        if member is not None:
+            record_id = str(member['record_id'])
+            login_meta = dict(
+                b5897e326c6777f3_gr_cs1=record_id,
+                b5897e326c6777f3_gr_last_sent_cs1=record_id,
+                b5897e326c6777f3_gr_last_sent_sid_with_cs1=login_cookies['b5897e326c6777f3_gr_session_id']
+            )
+            login_cookies.update(login_meta)
+        save_login_cookies(phone, login_cookies)
+        return session, 200
+    else:
+        '''
+            514 IP限制
+        '''
+        logger.error(f'[登录失败]{r.json()["code"]}-{r.json()["msg"]},账号:{phone}')
+        if r.json()["code"] == 514:
+            time.sleep(300)
+        return requests.session(), int(r.json()["code"])
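
A condensed sketch of the intended login flow (credentials and proxies come from setting.py; retries and error handling are omitted):

import setting
from utils.login import load_login_cookies, login, login_check

# True means the stored cookies are missing or no longer valid
if login_check(setting.ACCOUNT, proxies=setting.PROXIES, timeout=60):
    session, code = login(setting.ACCOUNT, setting.PASSWORD,
                          proxies=setting.PROXIES, timeout=180)
cookies = load_login_cookies(setting.ACCOUNT)  # refreshed cookies for later requests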

+ 50 - 0
utils/title_participle.py

@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-10-10 
+---------
+@summary: 标题分词,组合es查询语句
+---------
+@author: Lzz
+"""
+from requests.auth import HTTPBasicAuth
+import requests
+import json
+
+
+def get_should(title):
+
+    # url = "http://192.168.3.149:9201/_analyze"  # 测试
+    url = "http://172.17.4.184:19905/_analyze"  # 线上
+    username = "jybid"
+    password = "Top2023_JEB01i@31"
+
+    headers = {"Content-Type": "application/json"}
+    auth = HTTPBasicAuth(username, password)
+    data = {
+        "analyzer": "ik_smart",
+        "text": title
+    }
+
+    res = requests.post(url, headers=headers, auth=auth, json=data, timeout=10)
+
+    try:
+        res_text = json.loads(res.text).get('tokens') or [{"token":title}]
+    except Exception:
+        res_text = [{"token":title}]
+
+    should_list = []
+    for key in res_text:
+        single_dict = {
+            "multi_match": {
+                "query": f"{key.get('token')}",
+                "type": "phrase",
+                "fields": [
+                    "title"
+                ]
+            }
+        }
+        should_list.append(single_dict)
+
+    return should_list
+
+

+ 66 - 0
utils/tools.py

@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-02
+---------
+@summary: 工具类
+---------
+@author: Lzz
+"""
+import datetime
+import re
+import time
+
+import bson
+import requests
+from loguru import logger
+import setting
+
+
+def clean_title(title):
+    if title:
+        rule_list = [
+            '\(\d{1,20}\)',
+            '\[[\u4e00-\u9fa5]{1,9}\]',
+            '【[\u4e00-\u9fa5]{1,9}】',
+        ]
+        for rule in rule_list:
+            title = re.sub(rule, '', title)
+
+    return title
+
+
+def get_proxy():
+    url = setting.PROXY_API
+    headers = {"Authorization": setting.PROXY_TOKEN}
+    proxy = requests.get(url, headers=headers, timeout=10).json().get("data")
+    logger.info("切换代理:{}".format(proxy))
+    return proxy
+
+
+def int2long(param: int):
+    """int 转换成 long """
+    return bson.int64.Int64(param)
+
+
+def get_current_date(date_format="%Y-%m-%d %H:%M:%S"):
+    return datetime.datetime.now().strftime(date_format)
+
+
+def date_to_timestamp(date, fmt="%Y-%m-%d %H:%M:%S"):
+    """
+    @summary:
+    ---------
+    @param date:将"2011-09-28 10:00:00"时间格式转化为时间戳
+    @param fmt:时间格式
+    ---------
+    @result: 返回时间戳
+    """
+    if ":" in date:
+        timestamp = time.mktime(time.strptime(date, fmt))
+    else:
+        timestamp = time.mktime(time.strptime(date, "%Y-%m-%d"))
+    return int(timestamp)
+
+
+def get_today_of_day(day_offset=0):
+    return str(datetime.date.today() + datetime.timedelta(days=day_offset))
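
A couple of quick examples of the helpers above (values are illustrative):

import utils.tools as tool

ts = tool.date_to_timestamp("2024-06-17 09:30:00")   # seconds since the epoch
print(tool.int2long(ts), tool.get_today_of_day(-1))  # bson Int64 and yesterday's date string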

+ 5 - 0
ybw_crontab.txt

@@ -0,0 +1,5 @@
+# 元博网 限量采集
+30 9 * * * cd /mnt && python3 ybw_query_list.py > ybw_list.out 2>&1
+0 3 * * * python3 /mnt/ybw_release_account.py
+0 9-16/2 * * 1-5 cd /mnt && ./start.sh
+50 8-18/2 * * * cd /mnt && python3 ybw_esquery.py > esquery.out 2>&1

+ 316 - 0
ybw_details.py

@@ -0,0 +1,316 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-17
+---------
+@summary: 元博网 详情页采集
+---------
+@author: Lzz
+"""
+import random
+import re
+import time
+
+import requests.exceptions
+from loguru import logger
+from lxml.html import fromstring, HtmlElement, tostring
+from lxml.html.clean import Cleaner
+from pymongo import MongoClient
+
+import setting
+import utils.tools as tool
+from dbs.RedisDB import RedisFilter
+from utils.check_utils import CheckText, CheckTask
+from utils.clean_html import cleaner
+from utils.login import User, load_login_cookies, login, login_check
+
+_proxies = setting.PROXIES
+
+
+def iter_node(element: HtmlElement):
+    yield element
+    for sub_element in element:
+        if isinstance(sub_element, HtmlElement):
+            yield from iter_node(sub_element)
+
+
+def pre_parse(element: HtmlElement):
+    """对 HTML 进行预处理可能会破坏 HTML 原有的结构,导致根据原始 HTML 编写的 XPath 不可用"""
+    pre_remove = {
+        'log_col2', 'log_col1', 'cz', 'iconfont closei', 'p2 p1', 'cnxh_b',
+        'msg_error', 'r_gg TB-focus', 'april', 'cont2', 'to_login', 'regtxt',
+        'shouchang an_n sc', 'april_title red', 'cn_lt', 'dayin an_n',
+        'dl_zc vip_t free_member', 'rmbq', 'login-form cl', 'dian_g fr',
+        'di_n', 'd_fx', 'd_tub', 'd_dy', 'anniu1', 'cnxh_list', 'btns cl',
+        'active', 'close', 'd_an fr', 'avatar', 'toolbar', 'deng_l',
+        'cen_right fr', 'log_col5', 'agreement', 'log_col3',
+        'shouchang_af an_n sc_after', 'fast_box', 'di_nr fl', 'xgfj', 'dianh',
+        'cnxh_list tab_b2 city_list', 'contract cl', 'zb_cen_r fr', 'd_zsms',
+        'sc_after active', 'dl_k', 'ewm_b', 'fl', 'wypj', 'rukou', 'p1',
+        'dl_zc', 'success', 'daoh h_30', 'bd', 'april_content', 'print',
+        'foot', 'cnxh zbgg', 'april_first', 'fastlog', 'tx_mc user_name',
+        'tab_h2', 'fanding an_n', 'toux', 'log_col4 cl', 'hangy rem_1', 'red',
+        'regshadow', 'bottom', 'dl_zc vip_t fee_member', 'xszn fl', 'no-print',
+        'cnxh_b zbgg_b', 'rem rem_1', 'logshadowz', 'd_pj fl', 'tjgjc',
+        'spdujaiwlohh', 'di_ewm fr', 'dian_h fl',
+        'tab_h2 zbgg_b_gray', 'fanshou an_n fs', 'login-btn', 'fl gjc',
+        'agreeshadow', 'guang_db', 'footer_1', 'log_p', 'cnxh_list tab_b2',
+        'd_sw', 'april_close', 'd_sc', 'erweima no-print', 'qgzx', 'p2', 'sc',
+        'hd', 'log_col6', 'dh_b', 'dian_guang', 'zhu_c', 'ck cai_k', 'april_box',
+        'display:none'
+    }
+    for node in iter_node(element):
+        id_attr = node.attrib.get('id')
+        class_attr = node.attrib.get('class')
+        style_attr = node.attrib.get('style')
+        if any([id_attr in pre_remove,
+                class_attr in pre_remove,
+                style_attr in pre_remove]):
+            node.drop_tree()
+    return element
+
+
+def page_source(element: HtmlElement):
+    clear = Cleaner(
+        forms=False,
+        style=True
+    )
+    return clear.clean_html(tostring(element, encoding="utf-8").decode())
+
+
+class DetailSpider:
+
+    def __init__(self):
+        _mgo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
+        self.ybw_list = _mgo[setting.MONGO_DB]["ybw_list"]
+        self.ybw_info = _mgo[setting.MONGO_DB]["ybw_info"]
+        self.save_tab = _mgo[setting.MONGO_DB]["data_bak"]
+
+        self.dedup = RedisFilter()
+
+        self.user = User(phone=setting.ACCOUNT, passwd=setting.PASSWORD)
+        self.login_times = 0
+
+    def json_request(self, fid, request_params):
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9",
+            "Connection": "keep-alive",
+            "Referer": "https://www.chinabidding.cn/public/bidagency/index.html",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
+        }
+        url = "https://www.chinabidding.cn/agency.info.Detail/show"
+        params = {
+            "fid": f"{fid}"
+        }
+
+        res = requests.get(url, headers=headers, params=params, **request_params)
+        return res
+
+    def crawl_request(self, item: dict):
+        url = item['competehref']
+        headers = {
+            'Host': 'www.chinabidding.cn',
+            'Upgrade-Insecure-Requests': '1',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+        }
+
+        request_params = {}
+        request_params.setdefault('headers', headers)
+        request_params.setdefault('timeout', 30)
+        request_params.setdefault('proxies', _proxies)
+
+        # 登录代理
+        proxy_params = dict(proxies=_proxies, timeout=180)
+
+        retries = 0
+        retries_502, max_retries_502 = 0, 3
+        while retries < 3:
+            if retries_502 > max_retries_502:
+                # 网站已移除该数据
+                self.ybw_list.update_one({'_id': item["_id"]}, {'$set': {"crawl_status": "remove"}})
+                break
+
+            login_cookies = load_login_cookies(self.user.phone)
+            if login_cookies is None:
+                login(*self.user, **proxy_params)
+                self.login_times += 1
+                continue
+            elif 'cookies' not in request_params:
+                request_params.setdefault('cookies', login_cookies)
+            else:
+                request_params.update({'cookies': login_cookies})
+
+            fid = "".join(re.findall('\?fid=(.*)', url)).split('&')[0]
+            if fid:
+                try:
+                    request_params.pop('headers', None)
+                    r = self.json_request(fid, request_params)
+                    # 账号登录状态检查
+                    retry_login = login_check(self.user.phone, url, False, **proxy_params)
+                    if retry_login:
+                        logger.info(f"[重新登录]{self.user.phone}")
+                        _, code = login(*self.user, **proxy_params)
+                        self.login_times += 1
+                        if code == 200:
+                            retries += 1
+                        else:
+                            time.sleep(600)
+                            retries += 1
+                        continue
+                    logger.info(f'[采集正文] fid_{fid}')
+                    return r
+                except:
+                    retries += 1
+                    continue
+            else:
+                try:
+                    r = requests.get(url, **request_params)
+                    # 账号登录状态检查
+                    retry_login = login_check(self.user.phone, url, False, **proxy_params)
+                    if retry_login:
+                        logger.info(f"[重新登录]{self.user.phone}")
+                        _, code = login(*self.user, **proxy_params)
+                        self.login_times += 1
+                        if code == 200:
+                            retries += 1
+                        else:
+                            time.sleep(1800)
+                            retries += 1
+                        continue
+
+                    element = fromstring(r.text)
+                    nodes = element.xpath('//*[@id="main_dom"]/div[1]')
+                    if len(nodes) != 1:
+                        retries_502 += 1
+                        logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
+                        continue
+                    else:
+                        node = nodes[0]
+                        logger.info(f'[采集正文] id={node.attrib.get("id")}')
+                        return r
+                except requests.RequestException:
+                    retries += 1
+                    continue
+
+        return None
+
+    def crawl_response(self, response, item):
+        if re.match('^\{', response.text):
+            html = response.json().get('c_info').get('content')
+        else:
+            element: HtmlElement = fromstring(response.text)
+            node = element.xpath('//*[@id="infoDescription"]')[0]
+            node = pre_parse(node)
+            features = {
+                './div[@class="ckgys_cont"]',
+                './/div[@class="detail-title ng-scope"]',
+                './/table[@class="detail_Table"]',
+            }
+            for feature in features:
+                extract_node = node.xpath(feature)
+                if len(extract_node) > 0:
+                    valid_node = extract_node[0]
+                    break
+            else:
+                valid_node = node
+
+            html = page_source(valid_node)
+
+        '''检查原始页面内容'''
+        CheckText(html)
+        item["contenthtml"] = html
+        special = {
+            '若附件无法下载,你可以尝试使用360极速浏览器进行下载!': '',
+            # 'DD000E;|EE000F;|FF000E;': '',
+            '[(]?[)]?[A-Z]{2}000[A-Z]{1};[(]?[\d{1,4}]*[;]?[)]?[;]?': '',
+        }
+        item["detail"] = cleaner(html, special)
+        item["comeintime"] = tool.int2long(int(time.time()))
+
+        '''检查清洗之后的详情'''
+        CheckText(item["detail"])
+        insert = {}
+        for key, val in item.items():
+            if key not in ['crawl_status', 'crawl', 'count', '_id']:
+                insert[key] = val
+
+        self.save_tab.insert_one(insert)
+        logger.info('[采集成功]{}-{}'.format(item['title'], item['publishtime']))
+
+    def crawl_spider(self, schedule, item):
+        count = schedule['count']
+        self.login_times = schedule['login_times']
+        if count >= schedule['total'] or self.login_times >= 3:
+            ''' 账号限制 '''
+            logger.warning("账号限制")
+            return '账号限制'
+
+        _id = item["_id"]
+        err = "error"
+        for _ in range(3):
+            try:
+                CheckTask(item)  # 检查请求采集任务
+                response = self.crawl_request(item)
+                if response is not None:
+                    self.crawl_response(response, item)
+                    count += 1
+                    self.ybw_list.update_one({"_id": _id}, {"$set": {"crawl_status": "finished"}})
+                    self.ybw_info.update_one(
+                        {"account": self.user.phone},
+                        {"$set": {
+                            "count": count,
+                            "update_time": tool.get_current_date(),
+                            "login_times": self.login_times
+                        }}
+                    )
+                    return True
+            except Exception as e:
+                err = e
+                logger.error(f"请求错误:{err}")
+
+        self.ybw_list.update_one({'_id': _id}, {'$set': {'crawl_status': f'{err}'}})
+        return False
+
+    def start(self):
+        logger.debug(" *** start ***")
+
+        schedule = self.ybw_info.find_one({"account": self.user.phone})
+        if schedule is None:
+            logger.error(f"数据库无此账号信息|{self.user.phone}")
+            return
+
+        query = {"crawl_status": {"$exists": False}, "es_count": 0}
+        sort = [('publishtime', -1)]
+        limit = 100
+        with self.ybw_list.find(query, sort=sort).limit(limit) as cursor:
+            tasks = [doc for doc in cursor]
+
+        download_count = 0
+        rdm = random.randint(30, 50)
+        for item in tasks:
+            publish_ts = tool.date_to_timestamp(item['publishtime'])
+            if publish_ts > int(time.time()) - 43200:
+                logger.warning("未到采集时间")
+                continue
+
+            fp = 'detail_' + item.get('competehref')
+            if not self.dedup.get(fp):
+                self.dedup.add(fp)
+
+                download_count += 1
+                rst = self.crawl_spider(schedule, item)
+                if not rst or '账号限制' in str(rst):
+                    self.dedup.delete(fp)
+
+                if download_count >= rdm or '账号限制' in str(rst):
+                    break
+
+                time.sleep(random.randint(80, 180))
+
+        logger.debug(" *** end ***")
+
+
+if __name__ == '__main__':
+    DetailSpider().start()

+ 103 - 0
ybw_esquery.py

@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-18
+---------
+@summary: 在发布时间范围内精准检索标题
+---------
+@author: Dzr
+"""
+import datetime
+import warnings
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from elasticsearch import Elasticsearch
+from loguru import logger
+from pymongo import MongoClient
+
+import setting
+import utils.tools as tool
+
+warnings.filterwarnings('ignore')
+
+
+def es_client():
+    cfg = {
+        "host": setting.ES_IP,
+        "port": setting.ES_PORT,
+        "username": setting.ES_USERNAME,
+        "pwd": setting.ES_PASSWORD,
+    }
+    hosts = [{"host": cfg['host'], "port": cfg['port']}]
+    return Elasticsearch(hosts, http_auth=(cfg['username'], cfg['pwd']))
+
+
+def es_search(title, publish_time):
+    """
+    查询es
+
+    :param str title: 标题
+    :param int publish_time: 发布时间
+    :return:
+    """
+    dt = datetime.datetime.fromtimestamp(publish_time)
+    day_0am = datetime.datetime(dt.year, dt.month, dt.day)
+    gte = int(day_0am.timestamp())
+    lt = int(day_0am.timestamp()) + 24 * 3600
+
+    with es_client() as client:
+        body = {
+            "query": {
+                "bool": {
+                    "must": [
+                        {"term": {"title.mtitle": title}},
+                        {"range": {"publishtime": {"gte": gte, "lt": lt}}}
+                    ]
+                }
+            },
+            "from": 0,
+            "size": 10
+        }
+        result = client.search(index=setting.ES_INDEX, body=body, request_timeout=100)
+        return int(result['hits']['total']['value'])
+
+
+def retrieval(item):
+    ts = tool.date_to_timestamp(item["publishtime"], "%Y-%m-%d %H:%M:%S")
+    try:
+        es_count = es_search(title=item["title"], publish_time=ts)
+    except Exception as e:
+        logger.exception(e)
+        es_count = 0
+
+    item["es_count"] = es_count
+    return item
+
+
+def start(threads):
+    logger.info("es数据检索")
+
+    to_mongo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
+    collection = to_mongo[setting.MONGO_DB]["ybw_list"]
+
+    p = {"crawl_status": {"$exists": False}, "es_count": {"$exists": False}}
+    with collection.find(p, no_cursor_timeout=True) as cursor:
+        data_lst = [item for item in cursor]
+
+    data_count = 0
+    with ThreadPoolExecutor(max_workers=threads) as executor:
+        fs = [executor.submit(retrieval, data) for data in data_lst]
+        for f in as_completed(fs):
+            data_count += 1
+            ret = f.result()
+            es_count = ret["es_count"]
+            collection.update_one(
+                {"_id": ret["_id"]},
+                {"$set": {"es_count": es_count}}
+            )
+
+    to_mongo.close()
+    logger.info(f"本次es共检索数据 {data_count} 条")
+
+
+if __name__ == '__main__':
+    start(threads=1)

+ 183 - 0
ybw_query_list.py

@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-17
+---------
+@summary: 元博网 - 列表页信息检索
+---------
+@author: Lzz
+"""
+import math
+import random
+import time
+import warnings
+from collections import namedtuple
+
+import requests
+from loguru import logger
+from pymongo import MongoClient
+
+import setting
+import utils.tools as tool
+from dbs.RedisDB import RedisFilter
+
+warnings.filterwarnings('ignore')
+
+
+class Spider:
+    def __init__(self):
+        _mgo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
+        self.ybw_list = _mgo[setting.MONGO_DB]["ybw_list"]
+
+        self.dedup = RedisFilter()
+
+        self.total = 0
+        self.crawl_page = 1
+        self.areas_dict = {1: '北京', 2: '上海', 3: '天津', 4: '重庆', 5: '河北', 6: '山西', 7: '内蒙古', 8: '辽宁',
+                           9: '吉林', 10: '黑龙江', 11: '江苏', 12: '浙江', 13: '安徽', 14: '福建', 15: '江西',
+                           16: '山东', 17: '河南', 18: '湖北', 19: '湖南', 20: '广东', 21: '广西', 22: '海南',
+                           23: '贵州', 24: '云南', 25: '西藏', 26: '陕西', 27: '四川', 28: '甘肃', 29: '青海',
+                           30: '新疆', 31: '宁夏'}
+
+    def fetch_request(self, page, key, proxies=False):
+        url = "https://www.chinabidding.cn/302e302e7379675f73736f/datax/json/gj_zbcg_daylimit"
+        headers = {
+            "accept": "application/json, text/javascript, */*; q=0.01",
+            "accept-language": "zh-CN,zh;q=0.9",
+            "cache-control": "no-cache",
+            "pragma": "no-cache",
+            "priority": "u=1, i",
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
+            "x-requested-with": "XMLHttpRequest"
+        }
+        params = {
+            "device": "es",
+            "cpcode": "es001",
+            "keywords": f"{key}",
+            "table_type": "4,",
+            "search_type": "CONTEXT",
+            "areaid": "17,",
+            "categoryid": "",
+            "b_date": "week",
+            "time_start": "",
+            "time_end": "",
+            "page": f"{page}",
+            "rp": "30",
+            "usrecord_id": "",
+        }
+        request_params = dict(
+            headers=headers,
+            params=params,
+            proxies=proxies,
+            timeout=60,
+            verify=False
+        )
+        return requests.get(url, **request_params)
+
+    def parse(self, response, query_date):
+        total = response.json().get('result').get('total', 0)
+        self.crawl_page = math.ceil(total / 30)
+
+        results = []
+        info_list = response.json().get('result').get('list', [])
+        for info in info_list:
+            publish_time = info.get('fields').get('publish_date')
+            title = tool.clean_title(info.get('fields').get('title').strip())
+            competehref = info.get('fields').get('url')
+            if "chinabidding" not in competehref:
+                competehref = 'https://www.chinabidding.cn{}'.format(competehref)
+
+            area = self.areas_dict.get(int(info.get('fields').get('area_id')), "全国")
+
+            if title is None:
+                logger.error(f"[标题为空]{competehref}")
+                return
+
+            if not self.dedup.get(competehref) and query_date in publish_time:
+                item = {
+                    "site": "元博网(采购与招标网)",
+                    "channel": "政府采购",
+                    "area": area if area != '跨省' else '全国',
+                    "_d": "comeintime",
+                    "comeintime": tool.int2long(int(time.time())),
+                    "T": "bidding",
+                    "sendflag": "false",
+                    "spidercode": "a_ybwcgyzbw_zfcg",
+                    "city": "",
+                    "infoformat": 1,
+                    "type": "",
+                    "publishdept": "",
+                    "title": title,
+                    "competehref": competehref,
+                    "href": "#",
+                    "publishtime": publish_time,
+                    "l_np_publishtime": tool.int2long(tool.date_to_timestamp(publish_time)),
+                }
+                self.ybw_list.insert_one(item)
+                self.dedup.add(competehref)
+                results.append(item)
+                self.total += 1
+
+        logger.info(
+            f' *** 检索完成:去重 {len(info_list) - len(results)} 条 - 入库 {len(results)} 条 *** <{self.total}>')
+
+    def crawl_list_spider(self, page, key, query_date):
+        retry_times = 0
+        while retry_times < 3:
+            proxies = tool.get_proxy()
+            try:
+                response = self.fetch_request(page=page, key=key, proxies=proxies)
+                response.raise_for_status()  # requests 自检
+                if response is not None and response.status_code == 200:
+                    self.parse(response, query_date)
+                    logger.debug(f"[检索完成] {key}")
+                    time.sleep(random.random())
+                    return
+                else:
+                    retry_times += 1
+                    time.sleep(1)
+            except Exception as e:
+                logger.error(f"采集异常:{e}")
+                retry_times += 1
+                time.sleep(2)
+
+        logger.warning(f"[检索失败] {key}")
+
+    def start(self, query_date):
+        logger.debug("********** 检索开始 **********")
+
+        data_sets = {
+            "中国移动河南分公司",
+            "中国移动通信集团",
+            "中移建设有限公司",
+            "中移铁通有限公司",
+            "中移系统集成有限公司",
+            "中移信息系统集成有限公司",
+            "中移在线服务有限公司",
+            "联通(河南)产业互联网有限公司",
+            "联通数字科技有限公司",
+            "中国联合网络通信",
+            "中国联合网络通信有限公司",
+            "中讯邮电咨询设计院有限公司",
+            "天翼云科技有限公司",
+            "中电信数智科技有限公司",
+            "中国电信股份有限公司",
+            "中国电信集团有限公司",
+            "中国电信数智科技有限公司",
+            "中国联合网络通信有限公司"
+        }
+        for key in data_sets:
+            self.crawl_list_spider(1, key, query_date)
+            if self.crawl_page != 1:
+                for page in range(2, self.crawl_page + 1):
+                    self.crawl_list_spider(page, key, query_date)
+
+        logger.debug("********** 检索结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple(
+        'Menu',
+        ['channel', 'code', 'types', 'rout', 'query_date', 'crawl_page']
+    )
+    query_date = tool.get_today_of_day(-1)
+    Spider().start(query_date)

+ 18 - 0
ybw_release_account.py

@@ -0,0 +1,18 @@
+from pymongo import MongoClient
+
+import setting
+import utils.tools as tool
+
+if __name__ == '__main__':
+    to_mongo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
+    account_coll = to_mongo[setting.MONGO_DB]['ybw_info']
+
+    # 重置 账号信息
+    with account_coll.find() as cursor:
+        for item in cursor:
+            update_date = tool.get_current_date()
+            account_coll.update_one(
+                {"_id": item["_id"]},
+                {"$set": {"count": 0, "login_times": 0, "update_time": update_date}}
+            )
+            print(f" {item['account']} 已更新 < {update_date} >")