
new project

dongzhaorui@topnet.net.cn committed 3 years ago
parent commit d930f04108
50 files changed with 3732 additions and 0 deletions
  1. ybw/__init__.py (+0 -0)
  2. ybw/config/__init__.py (+0 -0)
  3. ybw/config/areas.yaml (+38 -0)
  4. ybw/config/conf.yaml (+31 -0)
  5. ybw/config/constants.yaml (+19 -0)
  6. ybw/config/load.py (+33 -0)
  7. ybw/crawler/__init__.py (+0 -0)
  8. ybw/crawler/check_utils.py (+104 -0)
  9. ybw/crawler/clean_html.py (+84 -0)
  10. ybw/crawler/crawl_record.py (+99 -0)
  11. ybw/crawler/crawl_scheduler.py (+133 -0)
  12. ybw/crawler/login.py (+396 -0)
  13. ybw/detail_spider.py (+248 -0)
  14. ybw/list_spider.py (+206 -0)
  15. ybw/requirements.txt (+79 -0)
  16. ybw/start.sh (+7 -0)
  17. ybw/utils/__init__.py (+0 -0)
  18. ybw/utils/databases.py (+47 -0)
  19. ybw/utils/es_query.py (+46 -0)
  20. ybw/utils/execptions.py (+49 -0)
  21. ybw/utils/log.py (+19 -0)
  22. ybw/utils/socks5.py (+153 -0)
  23. ybw/utils/tools.py (+10 -0)
  24. zbytb/config/__init__.py (+0 -0)
  25. zbytb/config/conf.yaml (+44 -0)
  26. zbytb/config/constants.yaml (+2 -0)
  27. zbytb/config/load.py (+30 -0)
  28. zbytb/crawler/__init__.py (+2 -0)
  29. zbytb/crawler/check_utils.py (+103 -0)
  30. zbytb/crawler/clean_html.py (+90 -0)
  31. zbytb/crawler/crawl_scheduler.py (+145 -0)
  32. zbytb/crawler/defaults.py (+92 -0)
  33. zbytb/crawler/login.py (+118 -0)
  34. zbytb/crawler/sessions_521.py (+159 -0)
  35. zbytb/crawler/spiders/DetailPageSpider.py (+297 -0)
  36. zbytb/crawler/spiders/ListPageSpider.py (+150 -0)
  37. zbytb/crawler/spiders/__init__.py (+2 -0)
  38. zbytb/error_html/error_page.html (+57 -0)
  39. zbytb/main.py (+32 -0)
  40. zbytb/requirements.txt (+79 -0)
  41. zbytb/start.sh (+7 -0)
  42. zbytb/utils/__init__.py (+0 -0)
  43. zbytb/utils/aliyun.py (+23 -0)
  44. zbytb/utils/attachment.py (+198 -0)
  45. zbytb/utils/databases.py (+47 -0)
  46. zbytb/utils/es_query.py (+45 -0)
  47. zbytb/utils/execptions.py (+35 -0)
  48. zbytb/utils/log.py (+11 -0)
  49. zbytb/utils/socks5.py (+153 -0)
  50. zbytb/utils/tools.py (+10 -0)

+ 0 - 0
ybw/__init__.py


+ 0 - 0
ybw/config/__init__.py


+ 38 - 0
ybw/config/areas.yaml

@@ -0,0 +1,38 @@
+area:
+  1: '北京'
+  2: '上海'
+  3: '天津'
+  4: '重庆'
+  5: '河北'
+  6: '山西'
+  7: '内蒙古'
+  8: '辽宁'
+  9: '吉林'
+  10: '黑龙江'
+  11: '江苏'
+  12: '浙江'
+  13: '安徽'
+  14: '福建'
+  15: '江西'
+  16: '山东'
+  17: '河南'
+  18: '湖北'
+  19: '湖南'
+  20: '广东'
+  21: '广西'
+  22: '海南'
+  23: '贵州'
+  24: '云南'
+  25: '西藏'
+  26: '陕西'
+  27: '四川'
+  28: '甘肃'
+  29: '青海'
+  30: '新疆'
+  31: '宁夏'
+  32: '香港'
+  33: '澳门'
+  34: '台湾'
+  36: '跨省'
+  40: '海外'
+

+ 31 - 0
ybw/config/conf.yaml

@@ -0,0 +1,31 @@
+# mongo
+mongo:
+  host: 172.17.4.87
+  port: !!int 27080
+#  host: 127.0.0.1
+#  port: !!int 27017
+
+
+# redis
+redis:
+  host: 127.0.0.1
+  port: !!int 6379
+  pwd: ""
+  db: !!int 10
+
+
+# es
+es:
+  host: 172.17.145.170
+#  host: 127.0.0.1
+#  host: 192.168.3.206
+  port: !!int 9800
+
+
+# Aliyun OSS
+ali_oss:
+  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
+  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
+#  endpoint: oss-cn-beijing.aliyuncs.com    # for public network access
+  endpoint: oss-cn-beijing-internal.aliyuncs.com    # for internal (VPC) access
+  bucket_name: jy-datafile

+ 19 - 0
ybw/config/constants.yaml

@@ -0,0 +1,19 @@
+headers:
+  User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36
+  Accept: '*/*'
+
+proxy:
+  socks5:
+    url: http://socks.spdata.jianyu360.com/socks/getips?limit=10
+    decrypt: ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/
+
+
+node_module:
+  windows: C:\Users\dell\AppData\Roaming\npm\node_modules
+  linux: /usr/lib/node_modules
+
+
+crawler_url:
+  home_page: https://www.chinabidding.cn/search/searchgj/zbcg?areaid={}&keywords=&time_start={}&time_end={}&search_type=CONTEXT&categoryid=&rp={}&table_type={}&b_date=custom
+  list_url: https://www.chinabidding.cn/search/searchgj/zbcg?areaid={}&keywords=&time_start={}&time_end={}&page={}&search_type=CONTEXT&categoryid=&rp={}&table_type={}&b_date=custom
+  refer: https://www.chinabidding.cn/search/searchgj/zbcg?keywords={}
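Note: the `{}` placeholders in home_page and list_url are filled positionally by ybw/list_spider.py with the area id, start date, end date, (page number for list_url), page size (rp) and table type. A minimal sketch with illustrative values only (area 1 = 北京, 100 rows per page, table type '6%2C' for 政府采购):

    from urllib.parse import quote
    from config.load import crawler_url

    # first list page for one area on a single day
    url = crawler_url['home_page'].format(1, '2022-01-01', '2022-01-01', 100, '6%2C')
    refer = crawler_url['refer'].format(quote('北京'))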

+ 33 - 0
ybw/config/load.py

@@ -0,0 +1,33 @@
+from pathlib import Path
+
+import yaml
+
+__all__ = [
+    'mongo_conf', 'redis_conf', 'oss_conf', 'es_conf',
+    'constants',
+    'headers', 'jy_proxy', 'node_module', 'crawler_url',
+    'region'
+]
+
+base_path = Path(__file__).parent
+yaml_conf = (base_path / 'conf.yaml').resolve()
+yaml_constants = (base_path / 'constants.yaml').resolve()
+yaml_areas = (base_path / 'areas.yaml').resolve()
+
+with open(yaml_conf, encoding="utf-8") as f:
+    conf = yaml.safe_load(f)
+    mongo_conf = conf['mongo']
+    redis_conf = conf['redis']
+    es_conf: dict = conf['es']
+    oss_conf: dict = conf['ali_oss']
+
+with open(yaml_constants, encoding="utf-8") as fp:
+    constants = yaml.safe_load(fp)
+    headers: dict = constants['headers']
+    jy_proxy: dict = constants['proxy']
+    node_module: dict = constants['node_module']
+    crawler_url: dict = constants['crawler_url']
+
+with open(yaml_areas, encoding="utf-8") as fr:
+    areas = yaml.safe_load(fr)
+    region: dict = areas['area']
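For reference, the rest of the ybw package imports these names directly; a minimal usage sketch:

    from config.load import mongo_conf, region, crawler_url

    print(mongo_conf['host'], mongo_conf['port'])  # values from conf.yaml
    print(region[1])                               # '北京', from areas.yaml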

+ 0 - 0
ybw/crawler/__init__.py


+ 104 - 0
ybw/crawler/check_utils.py

@@ -0,0 +1,104 @@
+import re
+
+from utils.es_query import get_es
+from utils.execptions import (
+    CustomAccountPrivilegeError,
+    CustomCheckError
+)
+
+__all__ = ['CheckText', 'CheckTask']
+
+
+class CheckContent:
+
+    def __init__(self):
+        self.sensitive_words = {
+            '正式会员', '账户充值', 'VIP会员查阅', '>(注册)<', '>(登录)<', '高级会员',
+            '标准会员', '点击支付', '隐私政策及用户服务协议',
+            '.*<a href=\"(.*?)\">点击查看内容'
+        }
+
+    @staticmethod
+    def check_text_length(val: str):
+        if len(val) == 0:
+            raise CustomCheckError(code=10101, reason='文本内容为空')
+        elif not re.findall(r'[\u4e00-\u9fa5]', val, re.S):
+            raise CustomCheckError(code=10102, reason='不存在中文字符')
+
+    @staticmethod
+    def check_content(val: str):
+        if val.count("部分文件可能不支持在线浏览"):
+            raise CustomCheckError(code=10103, reason='文件不支持在线浏览')
+
+    @staticmethod
+    def check_account_privilege(val: str):
+        if val.count("高级会员"):
+            raise CustomAccountPrivilegeError
+        elif "本招标项目仅供正式会员查阅" in val:
+            raise CustomAccountPrivilegeError
+
+    def check_sensitive_word(self, val: str):
+        total = set()
+        for word in self.sensitive_words:
+            result = re.search(word, val)
+            if result is not None:
+                total.add(word)
+
+        if len(total) > 0:
+            raise CustomCheckError(code=10104, reason='详情内容包含敏感词')
+
+    def __check(self, text):
+        self.check_sensitive_word(text)
+        self.check_text_length(text)
+        self.check_content(text)
+        self.check_account_privilege(text)
+
+    def __call__(self, text: str, *args, **kwargs):
+        self.__check(text)
+
+
+class CheckPrePareRequest:
+
+    def __init__(self):
+        self.crawl_keywords = {
+            '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选', '意见征询',
+            '更正公告', '废标', '补遗', '议价', '邀请', '资格预审', '竞标', '变更', '遴选',
+            '磋商', '项目', '评审', '询比', '开标', '澄清', '比选', '中止', '采购', '竟价',
+            '招投标', '拟建', '成交', '中标', '竞争性谈判', '工程', '验收公告', '更正',
+            '单一来源', '变更公告', '合同', '违规', '评判', '监理', '竞价', '答疑',
+            '终止', '系统'
+        }
+
+    @staticmethod
+    def check_es_cache(title: str, publish_time: int, rows: dict):
+        """
+
+        :param title: announcement title
+        :param publish_time: publish timestamp (l_np_publishtime)
+        :param rows: crawled record
+        """
+        retrieved_result = get_es(title, publish_time)
+        if retrieved_result != 0:
+            '''the es query found existing documents'''
+            rows['count'] = retrieved_result
+            raise CustomCheckError(code=10105, reason='标题内容已存在es')
+
+    def check_crawl_title(self, title: str):
+        for keyword in self.crawl_keywords:
+            valid_keyword = re.search(keyword, title)
+            if valid_keyword is not None:
+                break
+        else:
+            raise CustomCheckError(code=10106, reason='标题未检索到采集关键词', title=title)
+
+    def __check(self, rows: dict):
+        title, publish_time = rows['title'], rows['l_np_publishtime']
+        self.check_crawl_title(title)
+        self.check_es_cache(title, publish_time, rows)
+
+    def __call__(self, rows: dict, *args, **kwargs):
+        self.__check(rows)
+
+
+CheckText = CheckContent()
+CheckTask = CheckPrePareRequest()
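Both CheckText and CheckTask are module-level singletons used as callable validation gates; the spiders invoke them and catch the raised exceptions. A sketch of the calling convention (sample values are illustrative, and CheckTask also needs a reachable es instance for its duplicate check):

    from crawler.check_utils import CheckText, CheckTask
    from utils.execptions import JyBasicException

    try:
        CheckTask({'title': '某设备采购招标公告', 'l_np_publishtime': 1640966400})
        CheckText('<br>招标公告正文,包含中文内容<br>')
    except JyBasicException as e:
        print(e.code, e.reason)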

+ 84 - 0
ybw/crawler/clean_html.py

@@ -0,0 +1,84 @@
+import re
+
+
+def clean_html(html_str):
+    """HTML 替换"""
+    patterns = {
+        '<!--.*?-->': '',
+        '"': "'",
+        '\n': '',
+        '\xa0': "",
+        '<span .*?>': '',
+        '<link .*?>': '',
+        '</span> ': '',
+        '</span>': '',
+        '<span>': '',
+        '<p.*?>': '<br>',
+        '</p>': '<br>',
+        '<div>': '<br>',
+        '<div .*?>': '<br>',
+        '</div>': '<br>',
+        '<img .*?>': '<br>',
+        '<style.*?</style>': '',
+        '<EpointForm>': '',
+        '<html.*?</head>': '',
+        '<!DOCTYPE.*?>': '',
+        '</meta>': '',
+        '<?xml:.*?>': '',
+        '<label.*?>': '<br>',
+        '</label>': '',
+        'style=".*?"': '',
+        "style='.*?'": '',
+        'class=".*?"': '',
+        "class='.*?'": '',
+        "align='.*?'": '',
+        'align=".*?"': '',
+        'border=".*?"': '',
+        "border='.*?'": '',
+        'cellpadding=".*?"': '',
+        "cellpadding='.*?'": '',
+        'cellspacing=".*?"': '',
+        "cellspacing='.*?'": '',
+        'center=".*?"': '',
+        "center='.*?'": '',
+        'width=".*?"': '',
+        "width='.*?'": '',
+        "bordercolor='.*?'": '',
+        'bgcolor=".*?"': '',
+        'BORDERCOLOR=".*?"': '',
+        '<a name=".*?">': '',
+        '<o:p>': '',
+        '</o:p>': '',
+        '<A name=.*?>': '',
+        '<a .*?>': '',
+        '</a>': '',
+        '<font .*?>': '',
+        '</font>': '',
+        '<body.*?>': '',
+        '</body>': '',
+        '<script.*?>': '',
+        '</script>': '',
+        '【关闭】': '',
+        '【打印】': '',
+        '若附件无法下载,你可以尝试使用360极速浏览器进行下载!': '',
+    }
+
+    all_tag = re.findall("<[^>]+>", html_str)
+    for tag in all_tag:
+        html_str = html_str.replace(tag, str(tag).lower())
+
+    repl_str = [
+        '中国采购与招标网',
+        '采购与招标网',
+        'www.chinabidding.com.cn',
+        'www.chinabidding.cn'
+    ]
+    for repl in repl_str:
+        # note: the 4th positional argument of re.sub is `count`, so the
+        # regex flags must be passed via the `flags=` keyword
+        html_str = re.sub(repl, '___', html_str, flags=re.S | re.M)
+
+    for k, v in patterns.items():
+        html_str = re.sub(k, v, html_str, flags=re.S | re.M)
+    return html_str
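A quick sanity check of the cleaning behaviour, assuming the `flags=` fix above (input is illustrative):

    from crawler.clean_html import clean_html

    print(clean_html('<div class="x"><p>公告内容</p></div>'))
    # expected: '<br><br>公告内容<br><br>'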

+ 99 - 0
ybw/crawler/crawl_record.py

@@ -0,0 +1,99 @@
+import datetime
+import json
+import threading
+from pathlib import Path
+
+LOCK = threading.RLock()
+ROOT_PATH = Path(__file__).parent.parent
+CRAWL_RECORD = (ROOT_PATH / 'crawl_records.json').resolve()
+
+
+def _open_file():
+    try:
+        fp = open(CRAWL_RECORD, encoding='utf-8')
+    except FileNotFoundError:
+        fp = open(CRAWL_RECORD, 'w+', encoding='utf-8')
+    return fp
+
+
+def load_records(account: str, date_str: str):
+    """
+    Read the number of detail pages crawled on a given day
+
+    Args:
+        account: account name
+        date_str: crawl date, 'YYYY-mm-dd'
+
+    Returns:
+        number of crawled detail pages
+    """
+    fp = _open_file()
+    try:
+        records: dict = json.load(fp)
+        return int(records.get(account, {}).get(date_str, '0'))
+    except json.decoder.JSONDecodeError:
+        return 0
+
+
+def reset_records(account: str, date_str=None):
+    """
+    Reset today's crawl record
+
+    Args:
+        account: account name
+        date_str: crawl date, 'YYYY-mm-dd'
+
+    """
+    if date_str is None:
+        today = datetime.datetime.today().strftime('%Y-%m-%d')
+    else:
+        today = date_str
+
+    with LOCK:
+        fp = _open_file()
+        try:
+            records: dict = json.load(fp)
+            records[account][today] = '0'
+        except json.decoder.JSONDecodeError:
+            records = {account: {today: '0'}}
+
+        wp = open(CRAWL_RECORD, 'w+', encoding='utf-8')
+        wp.write(json.dumps(records, indent=4))
+        wp.close()
+        fp.close()
+    # print(f"{today}采集记录重置完成")
+
+
+def update_records(account: str, number: int):
+    """
+    Update the count of crawled detail pages
+
+    Args:
+        account: account name
+        number: number of newly crawled pages
+
+    """
+    today = datetime.datetime.today().strftime('%Y-%m-%d')
+    with LOCK:
+        fp = _open_file()
+        try:
+            records: dict = json.load(fp)
+        except json.decoder.JSONDecodeError:
+            records = {account: {today: '0'}}
+
+        if account not in records:
+            info = {today: str(number)}
+        else:
+            info: dict = records.get(account)
+            record = int(info.get(today, '0'))
+            if len(info) > 7:
+                '''keep at most 7 days of crawl records'''
+                info = {today: str(record)}
+            record += number
+            info.update({today: str(record)})
+
+        records.update({account: info})
+        wp = open(CRAWL_RECORD, 'w+', encoding='utf-8')
+        wp.write(json.dumps(records, indent=4))
+        wp.close()
+        fp.close()
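The record file keeps one counter per account per day, and detail_spider.py uses these helpers to enforce its daily quota. A small usage sketch (the account value is illustrative):

    import datetime
    from crawler.crawl_record import load_records, update_records

    today = datetime.datetime.today().strftime('%Y-%m-%d')
    update_records('13800000000', 1)           # one more detail page crawled
    print(load_records('13800000000', today))  # today's crawled count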

+ 133 - 0
ybw/crawler/crawl_scheduler.py

@@ -0,0 +1,133 @@
+import datetime
+import random
+import time
+import traceback
+from datetime import date, timedelta
+
+import requests
+
+from crawler.login import User
+from utils.databases import MongoDBS
+from utils.execptions import JyBasicException
+from utils.log import logger
+from utils.tools import int2long, object_id
+
+
+class Scheduler:
+
+    def __init__(self, query: dict):
+        self.query = query
+        self.crawl_account_tab = MongoDBS('py_spider', 'match_account').coll
+        self.crawl_error_tab = MongoDBS('py_spider', 'crawl_error').coll
+        self.crawl_start = False
+        self.account_id = None
+        self.user = None
+        self.spider_code = None
+        self.crawl_url = None
+        self.crawl_params = None
+        self.crawl_exception = None
+        self.crawl_type = None
+        self.__records = None
+
+    def _release_account(self):
+        rows = dict(
+            used=False,
+            update_time=datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
+        )
+        if self.account_id is not None:
+            self.crawl_account_tab.update_one(
+                {'_id': self.account_id},
+                {'$set': rows}
+            )
+
+    def __enter__(self):
+        # pick an idle account with the lowest usage count
+        rows = self.crawl_account_tab.find_one(self.query, sort=[('usage', 1)])
+        if rows is not None:
+            self.account_id = rows['_id']
+            self.user = User(rows['account'], rows['password'])
+            logger.info(f'[开启调度]启用账号: {self.user.phone}')
+            usage = int(rows['usage'])
+            rows['usage'] = usage + 1
+            use_time = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
+            rows['update_time'] = use_time
+            rows['used'] = True
+            self.crawl_account_tab.update_one(
+                {'_id': self.account_id},
+                {'$set': rows}
+            )
+            self.crawl_start = True  # flag that controls the scheduler state
+        else:
+            # TODO when no idle account is available, fall back to the least-used one (not implemented yet)
+            logger.warning(f'请检查mongo表 {self.crawl_account_tab.name} 账号状态')
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        logger.info(f'[关闭调度]')
+        self._release_account()
+        self.crawl_start = False
+
+        if exc_type is not None:
+            errmsg = traceback.extract_tb(exc_tb)
+            e = JyBasicException(
+                code=10500,
+                reason=str(exc_type),
+                title='未知系统错误'
+            )
+            self.err_record(e)
+            logger.error(f'错误类型: {exc_type}, 错误内容: {exc_val}, 错误详情: {errmsg}')
+        return True
+
+    def finished(self, execute_next_time=None):
+        logger.info("任务完成")
+        self._release_account()
+        self.sleep(execute_next_time)
+
+    @staticmethod
+    def wait_for_next_task(wait_time=None):
+        _sleep = (wait_time or random.choice(range(5, 15)))
+        time.sleep(_sleep)
+
+    @staticmethod
+    def sleep(wait_time=None):
+        sleep_time = (wait_time or 600)
+        time.sleep(sleep_time)
+
+    @property
+    def today(self):
+        return datetime.datetime.today().strftime('%Y-%m-%d')
+
+    @property
+    def yesterday(self):
+        return (date.today() - timedelta(days=1)).strftime("%Y-%m-%d")
+
+    def err_record(self, e: JyBasicException):
+        rows = {
+            'account': self.user.phone if self.user is not None else '',
+            'spidercode': self.spider_code,
+            'url': self.crawl_url,
+            'status_code': e.code,
+            'reason': e.reason,
+            'params': getattr(e, 'title', ''),
+            'crawl_time': int2long(int(time.time())),
+            'crawl_type': self.crawl_type,
+        }
+        self.crawl_error_tab.insert_one(rows)
+
+    @property
+    def crawl_task(self):
+        results = {}
+        url = 'http://cc.spdata.jianyu360.com/schedule/crawl_task/ybw_scheduler'
+        # url = 'http://127.0.0.1:1405/schedule/crawl_task/ybw_scheduler'
+        try:
+            response = requests.get(url, timeout=10)
+            if response.status_code == 200:
+                data = response.json()['data']
+                if len(data) > 0:
+                    results['_id'] = object_id(data['_id'])
+                    for key, val in data.items():
+                        if key != '_id':
+                            results[key] = val
+            return results
+        except requests.RequestException:
+            return results
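Scheduler is meant to be used as a context manager so the borrowed account is always released on exit; this mirrors how detail_spider.py drives it:

    from crawler.crawl_scheduler import Scheduler

    query = {'used': False, 'site': '元博网', 'class': 'detail'}
    with Scheduler(query) as sc:
        sc.crawl_type = 'detail'
        if sc.crawl_start:
            task = sc.crawl_task  # pull one task from the scheduling service
            print(sc.user.phone, task)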

+ 396 - 0
ybw/crawler/login.py

@@ -0,0 +1,396 @@
+import json
+import sys
+import threading
+import time
+import uuid
+from collections import namedtuple
+from pathlib import Path
+
+import execjs
+import requests
+from requests import Session
+from requests.utils import dict_from_cookiejar
+
+from config.load import node_module
+from utils.log import logger
+
+if sys.platform == 'linux':
+    node_module_path = node_module['linux']
+else:
+    node_module_path = node_module['windows']
+
+LOCK = threading.RLock()
+
+ROOT_PATH = Path(__file__).parent.parent
+JSON_LOGIN_COOKIE = (ROOT_PATH / 'config/login_cookie.json').resolve()
+
+User = namedtuple('User', ['phone', 'passwd'])
+
+
+def _open_file():
+    try:
+        fp = open(JSON_LOGIN_COOKIE, encoding='utf-8')
+    except FileNotFoundError:
+        fp = open(JSON_LOGIN_COOKIE, 'w+', encoding='utf-8')
+    return fp
+
+
+def load_login_cookies(user_name: str):
+    fp = _open_file()
+    try:
+        cookies: dict = json.load(fp).get(user_name)
+        return cookies
+    except json.decoder.JSONDecodeError:
+        pass
+    fp.close()
+
+
+def save_login_cookies(user_name: str, login_cookie: dict):
+    with LOCK:
+        # read the file if it exists, otherwise create it
+        fp = _open_file()
+        # load the existing content, or fall back to an empty dict
+        try:
+            user_maps: dict = json.load(fp)
+        except json.decoder.JSONDecodeError:
+            user_maps = {}
+        # print(user_maps)
+
+        if user_name not in user_maps:
+            user_maps.setdefault(user_name, login_cookie)
+        else:
+            cookies = {user_name: login_cookie}
+            user_maps.update(cookies)
+
+        wp = open(JSON_LOGIN_COOKIE, 'w+', encoding='utf-8')
+        wp.write(json.dumps(user_maps, indent=4))
+        fp.close()
+        wp.close()
+
+
+def update_login_cookies(user_name: str, update_val: dict):
+    """
+    Update the stored login cookies
+
+    Args:
+        user_name: account name
+        update_val: cookie fields to update
+
+    """
+    with LOCK:
+        fp = open(JSON_LOGIN_COOKIE, encoding='utf-8')
+        user_maps: dict = json.load(fp)
+        login_cookies: dict = user_maps.get(user_name)
+        if login_cookies is not None and len(update_val) > 0:
+            login_cookies.update(update_val)
+            user_login_info = {user_name: login_cookies}
+            user_maps.update(user_login_info)
+            wp = open(JSON_LOGIN_COOKIE, 'w+', encoding='utf-8')
+            wp.write(json.dumps(user_maps, indent=4))
+            wp.close()
+        fp.close()
+
+
+def convert1(plaintext):
+    """
+    AES encryption, CBC mode with PKCS7 padding (via CryptoJS)
+
+    @param plaintext: text to encrypt
+    @return: hex-encoded ciphertext
+    """
+    js_str = '''
+    const CryptoJS = require('crypto-js');
+    function convert1(txt) {
+        var a = '434D643932666D644B454E304E646C616535334D6435666E';
+        a = CryptoJS.enc.Hex.parse(a)
+        b = CryptoJS.enc.Hex.parse("30393138313633304D4D474C435A5059")
+        var enc = CryptoJS.AES.encrypt(txt, a, {
+            iv: b,
+            mode: CryptoJS.mode.CBC,
+            padding: CryptoJS.pad.Pkcs7
+        })
+        return enc.ciphertext.toString()
+    }
+    '''
+    ctx = execjs.compile(js_str, cwd=node_module_path)
+    return ctx.call('convert1', plaintext)
+
+
+def recognition_captcha(image):
+    """
+    Captcha recognition
+
+    @param image: captcha image file name or path
+    @return: recognized code (upper-cased) or None
+    """
+    url = "http://123.57.163.80:2119/v1/images/verify"
+    img_headers = {'accept': 'application/json'}
+    with open(image, 'rb') as f:
+        image_bytes = f.read()
+    image_file = {'file': image_bytes}
+    r = requests.post(url, headers=img_headers, files=image_file, stream=True)
+    json_resp = r.json()
+    if "msg" in json_resp and "success" == json_resp["msg"]:
+        return str(json_resp["r"]["code"]).upper()
+    return None
+
+
+def download_captcha(image, session: Session):
+    """下载验证码"""
+    js_str = '''
+        function changeYzmL() {
+            var randomNum = ('000000' + Math.floor(Math.random() * 999999)).slice(-6);
+            var time = new Date();
+            var nowTime = String(time.getFullYear()) + String(time.getMonth() + 1) + String(time.getDate()) + String(
+            time.getHours()) + String(time.getMinutes()) + String(time.getSeconds());
+            return "https://www.chinabidding.cn/cblcn/member.Login/captcha?randomID=" + randomNum + "&t=" + nowTime
+        }
+    '''
+    ctx = execjs.compile(js_str)
+    url = ctx.call('changeYzmL')
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
+        'Referer': 'https://www.chinabidding.cn/public/2020/html/login.html?source=1',
+    }
+    r = session.get(url, headers=headers, stream=True)
+    with open(image, 'wb') as f:
+        f.write(r.content)
+    logger.info('验证码下载成功')
+
+
+def captcha(session, phone):
+    """
+    Download and recognize a captcha
+    @param session: requests session object
+    @param phone: phone number, used to name the captcha image file
+    @return: encrypted captcha code
+    """
+    name = f'{phone}.jpg'
+    download_captcha(name, session)
+    code = recognition_captcha(name)
+    logger.info(f'验证码识别: {code}')
+    return convert1(code)
+
+
+def login_session(phone: str, password: str, proxies=None):
+    """
+    Create a login session
+
+    @param phone: login phone number
+    @param password: login password
+    @param proxies: proxies
+    @return: login response and requests session
+    """
+    logger.info('账号登录:{phone}', phone=phone)
+    session = requests.session()
+    # generate browser identity ids
+    gr_user_id = uuid.uuid4()
+    gr_session_id = uuid.uuid4()
+    login_ts = int(time.time())
+    session.cookies['gr_user_id'] = str(gr_user_id)
+    session.cookies['b5897e326c6777f3_gr_session_id'] = str(gr_session_id)
+    session.cookies[f'b5897e326c6777f3_gr_session_id_{gr_session_id}'] = 'true'
+    session.cookies['Hm_lvt_0bf7d2e4ce4104fa77e95b012f750771'] = str(login_ts)
+    session.cookies['Hm_lpvt_0bf7d2e4ce4104fa77e95b012f750771'] = str(login_ts)
+    # captcha recognition
+    yzm = captcha(session, phone)
+    '''
+        1. Downloading the captcha makes the server record the client identity id of that request (server-side persistence takes roughly 1s or more)
+    '''
+    time.sleep(2)
+    now_time_str = '''
+        function t() {
+            var time = new Date();
+            var nowTime = String(time.getFullYear()) + String(time.getMonth() + 1) + String(time.getDate()) + String(
+            time.getHours()) + String(time.getMinutes()) + String(time.getSeconds());
+            return nowTime
+        }
+        '''
+    now_time_ctx = execjs.compile(now_time_str)
+    now_time = now_time_ctx.call('t')
+    data = {
+        'phone': convert1(phone),
+        'password': convert1(password),
+        'yzm': yzm,
+        't': now_time
+    }
+    '''
+        2. The login form must be submitted with the same session that downloaded the captcha, otherwise the captcha fails server-side validation
+    '''
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
+        'Referer': 'https://www.chinabidding.cn/public/2020/html/login.html?source=1',
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'X-Requested-With': 'XMLHttpRequest',
+    }
+    url = 'https://www.chinabidding.cn/yuan/login/loginnew/login'
+    r = session.post(url, headers=headers, data=data, proxies=proxies)
+    assert r.status_code == 200
+    logger.info(f'登录信息: {r.json()}')
+    return r, session
+
+
+def login_session_by_cookies(cookies: dict, url: str, headers: dict, data=None):
+    """
+    Build a login session from saved cookies
+
+    @param cookies: cookies saved after the user logged in
+    @param url: login-check endpoint
+    @param headers: request headers
+    @param data: millisecond timestamp payload
+    @return: response object and the requests session
+    """
+    session = requests.session()
+    r = session.post(url, headers=headers, data=data, cookies=cookies)
+    assert r.status_code == 200
+    return r, session
+
+
+def login_check_and_get_meta(session: Session = None, allow_output_log=True):
+    """
+    Check the login state and fetch the account identity data
+
+    @param session: session of a logged-in account
+    @param allow_output_log: whether to print the log
+    @return: account identity data (member) and the response
+    """
+    url = "https://www.chinabidding.cn/cblcn/Home/logincheckAndGetMeta"
+    payload = f"t={int(round(time.time() * 1000))}"
+    headers = {
+        'Host': 'www.chinabidding.cn',
+        'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
+        'Accept': 'application/json, text/javascript, */*; q=0.01',
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'X-Requested-With': 'XMLHttpRequest',
+        'sec-ch-ua-mobile': '?0',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
+        'sec-ch-ua-platform': '"Windows"',
+        'Origin': 'https://www.chinabidding.cn',
+        'Sec-Fetch-Site': 'same-origin',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Dest': 'empty',
+        'Referer': 'https://www.chinabidding.cn/public/2020/html/login.html?source=1',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+    }
+    r = session.post(url, headers=headers, data=payload)
+    assert r.status_code == 200
+    member: dict = r.json().get('member')
+    if allow_output_log:
+        logger.info("账号信息:{}", json.dumps(member, indent=4, ensure_ascii=False))
+    return member, r
+
+
+def login_check(account: str = None, refer=None, allow_output_log=True):
+    """
+    Check the state of the user's identity information
+
+    Args:
+        account: user account
+        refer: referer page
+        allow_output_log: whether to print the log
+
+    Returns:
+        False if the login is still valid, True if it has expired
+    """
+    url = "https://www.chinabidding.cn/cblcn/Home/newLoginCheck"
+    ts = int(time.time())
+    ts2tms = int(round(ts * 1000))
+    payload = f"t={ts2tms}"
+    headers = {
+        'Host': 'www.chinabidding.cn',
+        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
+        'Accept': 'application/json, text/javascript, */*; q=0.01',
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'sec-ch-ua-mobile': '?0',
+        'X-Requested-With': 'XMLHttpRequest',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
+        'sec-ch-ua-platform': '"Windows"',
+        'Origin': 'https://www.chinabidding.cn',
+        'Sec-Fetch-Site': 'same-origin',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Dest': 'empty',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+    }
+    if refer is not None:
+        headers.update({'Referer': refer})
+
+    cookies = load_login_cookies(account)
+    if cookies is None:
+        '''no cookies stored for this account; check the login_cookie.json config file'''
+        return True
+
+    ts = int(time.time())
+    r, session = login_session_by_cookies(cookies, url, headers, data=payload)
+    member = r.json()
+    if allow_output_log:
+        logger.info("账号信息:{}", json.dumps(member, indent=4, ensure_ascii=False))
+
+    '''update the locally stored cookies'''
+    login_cookies: dict = dict_from_cookiejar(r.cookies)
+    request_ts = dict(
+        # timestamp of the previous visit (seconds)
+        Hm_lvt_0bf7d2e4ce4104fa77e95b012f750771=cookies['Hm_lpvt_0bf7d2e4ce4104fa77e95b012f750771'],
+        # timestamp of the current visit (seconds)
+        Hm_lpvt_0bf7d2e4ce4104fa77e95b012f750771=str(ts)
+    )
+    login_cookies.update(request_ts)
+    update_login_cookies(account, login_cookies)
+    if member is not None and len(member) > 1:
+        '''login still valid'''
+        return False
+    else:
+        '''login expired'''
+        return True
+
+
+def login_session_check(session: Session, account: str, allow_output_log=True):
+    """
+    Check the login state of an account session
+
+    @param session: session of the logged-in user
+    @param account: account name
+    @param allow_output_log: whether to print the log
+    @return: bool, False if the identity is still valid, True otherwise
+    """
+    member, r = login_check_and_get_meta(session, allow_output_log)
+    if member is not None and len(member) > 0:
+        login_cookies: dict = dict_from_cookiejar(r.cookies)
+        update_login_cookies(account, login_cookies)
+        '''identity valid'''
+        return False
+    else:
+        '''identity invalid'''
+        return True
+
+
+def login(phone: str, password: str, proxies=None):
+    """
+    Log in
+
+    @param phone: account (phone number)
+    @param password: password
+    @param proxies: proxies
+    @return: login session and status code
+    """
+    r, session = login_session(phone, password, proxies)
+    if r.json()['code'] == 200:
+        member, _ = login_check_and_get_meta(session)
+        login_cookies: dict = dict_from_cookiejar(session.cookies)
+        if member is not None:
+            record_id = str(member['record_id'])
+            login_meta = dict(
+                b5897e326c6777f3_gr_cs1=record_id,
+                b5897e326c6777f3_gr_last_sent_cs1=record_id,
+                b5897e326c6777f3_gr_last_sent_sid_with_cs1=login_cookies['b5897e326c6777f3_gr_session_id']
+            )
+            login_cookies.update(login_meta)
+        save_login_cookies(phone, login_cookies)
+        return session, 200
+    else:
+        '''
+            514 = IP restricted
+        '''
+        logger.error(f'[登录失败]{r.json()["code"]}-{r.json()["msg"]},账号:{phone}')
+        if r.json()["code"] == 514:
+            time.sleep(300)
+        return requests.session(), int(r.json()["code"])
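convert1() shells out to Node via execjs only to run a standard AES encryption (24-byte hex key, CBC mode, PKCS7 padding, hex output). For reference, an equivalent pure-Python sketch using pycryptodome (already pinned in requirements.txt) could look like the following; it is untested against the site, so treat it as an assumption rather than a drop-in replacement:

    from binascii import unhexlify
    from Crypto.Cipher import AES
    from Crypto.Util.Padding import pad

    KEY = unhexlify('434D643932666D644B454E304E646C616535334D6435666E')  # 24 bytes -> AES-192
    IV = unhexlify('30393138313633304D4D474C435A5059')                    # 16-byte IV

    def convert1_py(plaintext: str) -> str:
        # AES-CBC with PKCS7 padding, hex-encoded output (matches CryptoJS ciphertext.toString())
        cipher = AES.new(KEY, AES.MODE_CBC, IV)
        return cipher.encrypt(pad(plaintext.encode(), AES.block_size)).hex()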

+ 248 - 0
ybw/detail_spider.py

@@ -0,0 +1,248 @@
+import time
+
+import requests.exceptions
+from lxml.html import fromstring, HtmlElement, tostring
+from lxml.html.clean import Cleaner
+from pymongo.errors import DuplicateKeyError
+
+from crawler.check_utils import CheckText, CheckTask
+from crawler.clean_html import clean_html
+from crawler.crawl_record import update_records, load_records
+from crawler.crawl_scheduler import Scheduler
+from crawler.login import login, load_login_cookies, login_check
+from utils.databases import MongoDBS
+from utils.execptions import VoidCrawlError, JyBasicException
+from utils.log import logger
+from utils.socks5 import Proxy
+from utils.tools import int2long
+
+
+def iter_node(element: HtmlElement):
+    yield element
+    for sub_element in element:
+        if isinstance(sub_element, HtmlElement):
+            yield from iter_node(sub_element)
+
+
+def pre_parse(element: HtmlElement):
+    """对 HTML 进行预处理可能会破坏 HTML 原有的结构,导致根据原始 HTML 编写的 XPath 不可用"""
+    pre_remove = {
+        'log_col2', 'log_col1', 'cz', 'iconfont closei', 'p2 p1', 'cnxh_b',
+        'msg_error', 'r_gg TB-focus', 'april', 'cont2', 'to_login', 'regtxt',
+        'shouchang an_n sc', 'april_title red', 'cn_lt', 'dayin an_n',
+        'dl_zc vip_t free_member', 'rmbq', 'login-form cl', 'dian_g fr',
+        'di_n', 'd_fx', 'd_tub', 'd_dy', 'anniu1', 'cnxh_list', 'btns cl',
+        'active', 'close', 'd_an fr', 'avatar', 'toolbar', 'deng_l',
+        'cen_right fr', 'log_col5', 'agreement', 'log_col3',
+        'shouchang_af an_n sc_after', 'fast_box', 'di_nr fl', 'xgfj', 'dianh',
+        'cnxh_list tab_b2 city_list', 'contract cl', 'zb_cen_r fr', 'd_zsms',
+        'sc_after active', 'dl_k', 'ewm_b', 'fl', 'wypj', 'rukou', 'p1',
+        'dl_zc', 'success', 'daoh h_30', 'bd', 'april_content', 'print',
+        'foot', 'cnxh zbgg', 'april_first', 'fastlog', 'tx_mc user_name',
+        'tab_h2', 'fanding an_n', 'toux', 'log_col4 cl', 'hangy rem_1', 'red',
+        'regshadow', 'bottom', 'dl_zc vip_t fee_member', 'xszn fl', 'no-print',
+        'cnxh_b zbgg_b', 'rem rem_1', 'logshadowz', 'd_pj fl', 'tjgjc',
+        'spdujaiwlohh', 'di_ewm fr', 'dian_h fl',
+        'tab_h2 zbgg_b_gray', 'fanshou an_n fs', 'login-btn', 'fl gjc',
+        'agreeshadow', 'guang_db', 'footer_1', 'log_p', 'cnxh_list tab_b2',
+        'd_sw', 'april_close', 'd_sc', 'erweima no-print', 'qgzx', 'p2', 'sc',
+        'hd', 'log_col6', 'dh_b', 'dian_guang', 'zhu_c', 'ck cai_k', 'april_box',
+        'display:none'
+    }
+    for node in iter_node(element):
+        id_attr = node.attrib.get('id')
+        class_attr = node.attrib.get('class')
+        style_attr = node.attrib.get('style')
+        if any([id_attr in pre_remove,
+                class_attr in pre_remove,
+                style_attr in pre_remove]):
+            node.drop_tree()
+    return element
+
+
+def page_source(element: HtmlElement):
+    clear = Cleaner(
+        forms=False,
+        style=True
+    )
+    return clear.clean_html(tostring(element, encoding="utf-8").decode())
+
+
+class DetailSpider:
+
+    def __init__(
+            self,
+            db: str,
+            crawl_tab: str,
+            save_tab: str,
+            crawl_total=None,
+    ):
+        self.crawl_tab = MongoDBS(db, crawl_tab).coll
+        self.save_tab = MongoDBS(db, save_tab).coll
+        self.crawl_total = crawl_total or 6000
+        self.user = None
+
+    def crawl_request(self, url):
+        headers = {
+            'Host': 'www.chinabidding.cn',
+            'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"Windows"',
+            'Upgrade-Insecure-Requests': '1',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+            'Sec-Fetch-Site': 'none',
+            'Sec-Fetch-Mode': 'navigate',
+            'Sec-Fetch-User': '?1',
+            'Sec-Fetch-Dest': 'document',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+        }
+        request_params = {}
+        request_params.setdefault('headers', headers)
+        request_params.setdefault('timeout', 60)
+
+        retries = 0
+        proxy, proxies = None, None
+        while retries < 3:
+            login_cookies = load_login_cookies(self.user.phone)
+            if login_cookies is None:
+                login(*self.user)
+                continue
+            elif 'cookies' not in request_params:
+                request_params.setdefault('cookies', login_cookies)
+            else:
+                request_params.update({'cookies': login_cookies})
+
+            try:
+                r = requests.get(url, **request_params)
+                '''check the account login state'''
+                retry_login = login_check(self.user.phone, url, False)
+                if retry_login:
+                    logger.info(f"[重新登录]{self.user.phone}")
+                    _, code = login(*self.user, proxies=proxies)
+                    if code == 200:
+                        retries += 1
+                    else:
+                        if proxy is None:
+                            proxy = Proxy(True)
+                        else:
+                            proxy.switch()
+                        proxies = proxy.proxies
+                        retries += 1
+                    continue
+                element = fromstring(r.text)
+                nodes = element.xpath('//*[@id="main_dom"]/div[1]')
+                if len(nodes) != 1:
+                    raise VoidCrawlError
+                else:
+                    node = nodes[0]
+                    logger.info(f'[采集正文] id={node.attrib.get("id")}')
+                    return r
+            except requests.RequestException:
+                retries += 1
+                continue
+
+        return None
+
+    def crawl_response(self, response, item):
+        element: HtmlElement = fromstring(response.text)
+        node = element.xpath('//*[@id="infoDescription"]')[0]
+        node = pre_parse(node)
+        features = {
+            './div[@class="ckgys_cont"]',
+            './/div[@class="detail-title ng-scope"]',
+            './/table[@class="detail_Table"]',
+        }
+        for feature in features:
+            extract_node = node.xpath(feature)
+            if len(extract_node) > 0:
+                valid_node = extract_node[0]
+                break
+        else:
+            valid_node = node
+
+        html = page_source(valid_node)
+
+        '''validate the text content'''
+        CheckText(html)
+        item["contenthtml"] = html
+        item["detail"] = clean_html(html)
+        item["comeintime"] = int2long(int(time.time()))
+        del item['count'], item['crawl']
+        if 'crawl_status' in item:
+            del item['crawl_status']
+        try:
+            self.save_tab.insert_one(item)
+        except DuplicateKeyError:
+            pass
+        logger.info('[采集成功]{}-{}'.format(item['title'], item['publishtime']))
+
+    def set_crawl_status(self, item: dict, status: bool):
+        self.crawl_tab.update_one(
+            {'_id': item['_id']},
+            {'$set': {'crawl': status}}
+        )
+
+    def crawl_spider(self, sc: Scheduler):
+        while True:
+            if load_records(self.user.phone, sc.today) >= self.crawl_total:
+                return True
+            item = sc.crawl_task
+            if len(item) == 0:
+                return False
+            self.set_crawl_status(item, True)
+            '''record crawl context via the scheduler; errors are written to the database'''
+            sc.spider_code = item['spidercode']
+            sc.crawl_url = item['competehref']
+            try:
+                '''validate the crawl task before requesting'''
+                CheckTask(item)
+                url = item['competehref']
+                response = self.crawl_request(url)
+                if response is not None:
+                    self.crawl_response(response, item)
+                    self.crawl_tab.update_one(
+                        {"_id": item["_id"]},
+                        {'$set': {'crawl_status': 'finished'}}
+                    )
+                    update_records(self.user.phone, 1)
+            except JyBasicException as e:
+                if e.code == 10105:
+                    '''on this error the program writes the es query result back to the crawl table'''
+                    self.crawl_tab.update_one(
+                        {"_id": item["_id"]},
+                        {'$set': {'count': item['count']}}
+                    )
+                else:
+                    sc.err_record(e)
+                    self.crawl_tab.update_one(
+                        {"_id": item["_id"]},
+                        {'$set': {'crawl_status': 'error'}}
+                    )
+            finally:
+                self.set_crawl_status(item, False)
+                sc.wait_for_next_task()
+
+    def start(self):
+        query = {'used': False, 'site': '元博网', 'class': 'detail'}
+        while True:
+            with Scheduler(query) as scheduler:
+                scheduler.crawl_type = 'detail'
+                if scheduler.crawl_start:
+                    self.user = scheduler.user
+                    finished = self.crawl_spider(scheduler)
+                    if finished:
+                        '''crawl task finished'''
+                        scheduler.finished()
+                    else:
+                        '''no crawl task available'''
+                        scheduler.wait_for_next_task()
+
+
+if __name__ == '__main__':
+    DetailSpider(
+        db='py_spider',
+        crawl_tab='ybw_list',
+        save_tab='data_bak',
+        crawl_total=6000,
+    ).start()

+ 206 - 0
ybw/list_spider.py

@@ -0,0 +1,206 @@
+import random
+import time
+from collections import namedtuple
+from urllib.parse import quote
+
+import requests
+from lxml.html import fromstring, HtmlElement
+
+from config.load import crawler_url, region
+from crawler.crawl_scheduler import Scheduler
+from crawler.login import login, load_login_cookies, login_session_check
+from utils.databases import MongoDBS
+from utils.es_query import get_es
+from utils.execptions import CustomCheckError, VoidCrawlError, JyBasicException
+from utils.log import logger
+from utils.socks5 import Proxy
+from utils.tools import int2long
+
+CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode', 'table_type'])
+
+
+class ListSpider:
+
+    def __init__(self, db: str, crawl_tab: str, crawl_max_page=None):
+        self.crawl_menus = [
+            # CrawlMenu('企业采购', 'a_ybwcgyzbw_qycg', '7%2C'),
+            CrawlMenu('政府采购', 'a_ybwcgyzbw_zfcg', '6%2C'),
+            CrawlMenu('招标预告', 'a_ybwcgyzbw_zbyg', '5%2C'),
+            CrawlMenu('中标公示', 'a_ybwcgyzbw_zbgs', '4%2C'),
+            CrawlMenu('服务招标', 'a_ybwcgyzbw_fwzb', '3%2C'),
+            CrawlMenu('货物招标', 'a_ybwcgyzbw_hwzb', '2%2C'),
+            CrawlMenu('工程招标', 'a_ybwcgyzbw_gczb', '1%2C'),
+        ]
+        self.crawl_max_page = crawl_max_page or 1
+        self.crawl_tab = MongoDBS(db, crawl_tab).coll
+        self.user = None
+        self.session = None
+
+    def crawl_request(self, url: str, refer: str, **kwargs):
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+            'Referer': refer
+        }
+        request_params = {}
+        request_params.setdefault('headers', headers)
+        request_params.setdefault('timeout', 60)
+        if kwargs.get('cookies') is not None:
+            request_params.setdefault('cookies', kwargs.get('cookies'))
+
+        retries = 0
+        proxy, proxies = None, None
+        while retries < 5:
+            try:
+                response = self.session.get(url, **request_params)
+            except requests.exceptions.Timeout:
+                time.sleep(10)
+                retries += 1
+                continue
+
+            element = fromstring(response.text)
+            feature = '//div[@id="pages"]/following-sibling::table//tr'
+            if element.xpath('//*[@id="password"]|//*[@id="renew_pop"]'):
+                '''a login or registration popup appeared; attach the account identity info and check the account state'''
+                retry_login = login_session_check(self.session, self.user.phone)
+                if retry_login:
+                    logger.info(f"[重新登录]{self.user.phone}")
+                    self.session, code = login(*self.user, proxies=proxies)
+                    if code != 200:
+                        '''frequent logins within one hour get the ip restricted; log the account in through a proxy'''
+                        if proxy is None:
+                            proxy = Proxy(True)
+                        else:
+                            proxy.switch()
+                        proxies = proxy.proxies
+                        retries += 1
+                else:
+                    login_cookies = load_login_cookies(self.user.phone)
+                    request_params.update({'cookies': login_cookies})
+            elif element.xpath('//*[@id="pages"]') and len(element.xpath(feature)) > 0:
+                return response
+            else:
+                '''the page contains no search results'''
+                return None
+
+        raise VoidCrawlError(code=100020, reason='列表页采集异常')
+
+    def crawl_response(self, response, menu: CrawlMenu):
+        results = []
+        element: HtmlElement = fromstring(response.text)
+        feature = '//div[@id="pages"]/following-sibling::table//tr'
+        for node in element.xpath(feature):
+            publish_time = "".join(node.xpath('./td[6]/text()')).strip()
+            if '-' not in publish_time:
+                publish_time = "".join(node.xpath('./td[7]/text()')).strip()
+            area = "".join("".join(node.xpath('./td[5]/text()')).split())
+            title = "".join("".join(node.xpath('./td[2]/a/text()')).split())
+            competehref = 'https://www.chinabidding.cn{}'.format("".join(node.xpath('./td[2]/a/@href')))
+            item = {
+                "site": "元博网(采购与招标网)",
+                "channel": menu.channel,
+                "area": area if area != '跨省' else '全国',
+                "_d": "comeintime",
+                "comeintime": int2long(int(time.time())),
+                "T": "bidding",
+                "sendflag": "false",
+                "spidercode": menu.spidercode,
+                "city": "",
+                "type": "",
+                "publishdept": "",
+                "title": title,
+                "competehref": competehref,
+                "href": "#",
+                "publishtime": publish_time,
+                "l_np_publishtime": int2long(int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))),
+            }
+            if title is None:
+                raise CustomCheckError(code=10107, reason='发布标题解析空值错误')
+            item['count'] = get_es(item["title"], item["l_np_publishtime"])
+            item['crawl'] = False
+            # print(f'>>> {title} - {competehref}')
+            results.append(item)
+
+        if len(results) > 0:
+            self.crawl_tab.insert_many(results)
+        return len(results)
+
+    def crawl_spider(self, sc: Scheduler, menu: CrawlMenu):
+        for region_id, region_name in region.items():
+            previous_url = None
+            crawl_total, cookies = 1, None
+            self.session = requests.session()
+            '''a regular account can query at most 4000 records; page size is set to the maximum of 100, i.e. 40 pages'''
+            page_size = 100
+            for page in range(1, self.crawl_max_page + 1):
+                '''build the url and referer'''
+                if page == 1:
+                    url = crawler_url['home_page'].format(
+                        region_id,
+                        sc.yesterday,
+                        sc.yesterday,
+                        page_size,
+                        menu.table_type
+                    )
+                    previous_url = url
+                    refer = crawler_url['refer'].format(quote(region_name))
+                else:
+                    url = crawler_url['list_url'].format(
+                        region_id,
+                        sc.yesterday,
+                        sc.yesterday,
+                        page,
+                        page_size,
+                        menu.table_type
+                    )
+                    refer = previous_url
+                    previous_url = url
+                sc.crawl_url = url
+                sc.spider_code = menu.spidercode
+                print(">>> ", url)
+                if crawl_total >= 4:
+                    '''from page 4 onward the list data is only returned to a logged-in regular account'''
+                    cookies = load_login_cookies(self.user.phone)
+                try:
+                    response = self.crawl_request(url, refer, cookies=cookies)
+                    if response is None:
+                        logger.info(f'[采集成功]{menu.channel}-{region_name}-第{page}页-0条')
+                        break
+                    item_size = self.crawl_response(response, menu)
+                    logger.info(f'[采集成功]{menu.channel}-{region_name}-第{page}页-{item_size}条')
+                    if item_size < page_size:
+                        '''fewer items than the site's fixed page size, so the next page has no data; stop here'''
+                        break
+                    else:
+                        crawl_total += 1
+                except JyBasicException as e:
+                    sc.err_record(e)
+
+                sc.wait_for_next_task(random.choice(range(2, 8)))
+            self.session.close()
+
+    def start(self):
+        query = {'used': False, 'site': '元博网', 'class': 'list'}
+        with Scheduler(query) as scheduler:
+            scheduler.crawl_type = 'list'
+            if scheduler.crawl_start:
+                for menu in self.crawl_menus:
+                    self.user = scheduler.user
+                    while True:
+                        try:
+                            self.crawl_spider(scheduler, menu)
+                            break
+                        except Exception as e:
+                            logger.error('采集分类的名称:{} 错误类型:{} '.format(
+                                menu.channel,
+                                e.__class__.__name__,
+                            ))
+                    scheduler.finished()
+
+
+if __name__ == '__main__':
+    ListSpider(
+        db='py_spider',
+        crawl_tab='ybw_list',
+        crawl_max_page=40,
+    ).start()

+ 79 - 0
ybw/requirements.txt

@@ -0,0 +1,79 @@
+aliyun-python-sdk-core==2.13.35
+aliyun-python-sdk-kms==2.14.0
+aniso8601==9.0.1
+backcall==0.2.0
+beautifulsoup4==4.9.3
+better-exceptions==0.3.3
+bs4==0.0.1
+certifi==2021.10.8
+cffi==1.15.0
+chardet==3.0.4
+click==8.0.3
+colorama==0.4.4
+copyheaders==0.0.2
+crcmod==1.7
+cryptography==35.0.0
+cssselect==1.1.0
+DBUtils==2.0.2
+ddddocr==1.1.0
+decorator==5.1.0
+EditorConfig==0.12.3
+elasticsearch==7.10.1
+et-xmlfile==1.1.0
+fake-useragent==0.1.11
+Flask==2.0.2
+flatbuffers==2.0
+greenlet==1.1.2
+idna==2.8
+influxdb==5.3.1
+ipython==7.30.1
+itsdangerous==2.0.1
+jedi==0.18.1
+jieba==0.42.1
+Jinja2==3.0.3
+jmespath==0.10.0
+jsbeautifier==1.14.0
+loguru==0.5.3
+lxml==4.6.3
+MarkupSafe==2.0.1
+matplotlib-inline==0.1.3
+msgpack==1.0.3
+numpy==1.21.4
+onnxruntime==1.9.0
+openpyxl==3.0.9
+oss2==2.14.0
+parsel==1.6.0
+parso==0.8.3
+pickleshare==0.7.5
+Pillow==8.2.0
+pinyin==0.4.0
+playwright==1.17.0
+prompt-toolkit==3.0.24
+protobuf==3.19.1
+pycparser==2.21
+pycryptodome==3.11.0
+pyee==8.2.2
+PyExecJS==1.5.1
+Pygments==2.10.0
+pymongo==3.8.0
+PyMySQL==1.0.2
+PySocks==1.7.1
+python-dateutil==2.8.2
+pytz==2021.3
+PyYAML==5.4.1
+redis==3.3.6
+redis-py-cluster==2.1.3
+requests==2.22.0
+retrying==1.3.3
+selenium==3.141.0
+six==1.16.0
+soupsieve==2.3.1
+traitlets==5.1.1
+typing_extensions==4.0.0
+urllib3==1.25.11
+w3lib==1.22.0
+wcwidth==0.2.5
+websockets==10.1
+Werkzeug==2.0.2
+win32-setctime==1.0.3
+wincertstore==0.2

+ 7 - 0
ybw/start.sh

@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# change to the working directory
+cd /mnt/ybw
+/usr/bin/python3 /mnt/ybw/detail_spider.py
+# keep a foreground process so the container does not exit
+/usr/sbin/init

+ 0 - 0
ybw/utils/__init__.py


+ 47 - 0
ybw/utils/databases.py

@@ -0,0 +1,47 @@
+from typing import Optional
+
+import pymongo
+import redis
+from pymongo.collection import Collection
+from pymongo.database import Database
+
+from config.load import mongo_conf, redis_conf
+
+__all__ = ['MongoDBS', 'RedisDBS']
+
+
+class MongoDBS:
+    """ Mongo """
+
+    def __init__(self, db: str, collection: str, cfg: dict = mongo_conf):
+        self.client = pymongo.MongoClient(host=cfg['host'], port=cfg['port'])
+        self.db: Database = self.client[db]
+        self.coll: Collection = self.db[collection]
+
+    def __enter__(self):
+        return self.coll
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # context manager: close the client connection on exit
+        self.client.close()
+
+    def __del__(self):
+        # close the client connection when the instance is destroyed
+        self.client.close()
+
+
+class RedisDBS:
+    """ redis """
+
+    def __init__(self, cfg: Optional[dict] = redis_conf):
+        pool = redis.ConnectionPool(
+            host=cfg['host'],
+            port=cfg['port'],
+            password=cfg['pwd'],
+            db=cfg['db']
+        )
+        self.__r = redis.Redis(connection_pool=pool, decode_responses=True)
+
+    @property
+    def redis(self):
+        return self.__r

+ 46 - 0
ybw/utils/es_query.py

@@ -0,0 +1,46 @@
+import requests
+from elasticsearch import Elasticsearch
+
+from config.load import es_conf
+
+es = Elasticsearch([{"host": es_conf['host'], "port": es_conf['port']}])
+
+
+def httpAz(title):
+    url = "http://{}:{}/bidding/_analyze".format(es_conf['host'], es_conf['port'])
+    params = {"text": title, "analyzer": "ik_smart"}
+    arr = []
+    # TODO the es query latency is high and hurts throughput; needs testing
+    res = requests.get(url=url, params=params, timeout=60)
+    if res.status_code == 200:
+        tokens = res.json().get('tokens', [])
+        for x in tokens:
+            if x["token"].encode('utf-8').isalpha():
+                continue
+            arr.append(x["token"])
+
+    q = [{"multi_match": {"query": v, "type": "phrase", "fields": ["title"]}} for v in arr]
+    return q
+
+
+def get_es(title, publishtime):
+    """
+    :param title: title
+    :param publishtime: publish time (timestamp)
+    :return:
+    """
+    stime = publishtime - 432000    # 5 days before the publish time
+    etime = publishtime + 432000
+    q1 = httpAz(title)
+    q1.append({"range": {"publishtime": {"from": stime, "to": etime}}})
+    esQuery = {
+        "query": {
+            "bool": {
+                "must": q1,
+                "minimum_should_match": 1
+            }
+        }
+    }
+    result = es.search(index='bidding', body=esQuery, request_timeout=100)
+    count = len(result['hits']['hits'])
+    return count

+ 49 - 0
ybw/utils/execptions.py

@@ -0,0 +1,49 @@
+
+class JyBasicException(Exception):
+
+    def __init__(self, code: int, reason: str, **kwargs):
+        self.code = code
+        self.reason = reason
+        self.err_details = kwargs
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+
+
+class CustomAccountPrivilegeError(JyBasicException):
+
+    def __init__(self, code: int = 10001, reason: str = '账号权限登录异常', **kwargs):
+        self.code = code
+        self.reason = reason
+        self.err_details = kwargs
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+
+
+class CustomCheckError(JyBasicException):
+
+    def __init__(self, code: int = 10002, reason: str = '特征条件检查异常', **kwargs):
+        self.code = code
+        self.reason = reason
+        self.err_details = kwargs
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+
+
+class VoidCrawlError(JyBasicException):
+
+    def __init__(self, code: int = 10003, reason: str = '空页面采集错误', **kwargs):
+        self.code = code
+        self.reason = reason
+        self.err_details = kwargs
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+
+
+class AttachmentNullError(JyBasicException):
+
+    def __init__(self, code: int = 10004, reason: str = '附件下载异常', **kwargs):
+        self.code = code
+        self.reason = reason
+        self.err_details = kwargs
+        for key, val in kwargs.items():
+            setattr(self, key, val)

+ 19 - 0
ybw/utils/log.py

@@ -0,0 +1,19 @@
+from loguru import logger
+
+logger.add(
+    'logs/crawl_{time:YYYY-MM-DD}.log',
+    format='{time:YYYY-MM-DD HH:mm:ss} - {file}:{line} - {level} - {message}',
+    level='INFO',
+    rotation='00:00',
+    retention='1 week',
+    encoding='utf-8',
+    filter=lambda x: '采集' in x['message']
+)
+
+logger.add(
+    'logs/error.log',
+    format='{time:YYYY-MM-DD HH:mm:ss} - {file}:{line} - {level} - {message}',
+    rotation="500 MB",
+    encoding='utf-8',
+    level='ERROR'
+)
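Note that the first sink only keeps messages containing the substring '采集' (see the filter above), which is why the spiders tag their progress logs that way; ERROR-level messages go to logs/error.log regardless. For example:

    from utils.log import logger

    logger.info('[采集成功]示例标题-2022-01-01')  # written to logs/crawl_*.log
    logger.info('heartbeat')                      # dropped by the filter
    logger.error('database down')                 # written to logs/error.log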

+ 153 - 0
ybw/utils/socks5.py

@@ -0,0 +1,153 @@
+import threading
+import time
+from collections import deque
+from urllib.parse import urlparse
+
+import requests
+
+from config.load import jy_proxy, headers
+from utils.log import logger
+
+__all__ = ['Proxy']
+
+
+def decrypt(input_str: str) -> str:
+    """
+    Base64-style decode using a custom (scrambled) alphabet
+
+    :param input_str:
+    :return:
+    """
+    # take the alphabet index of every non-'=' character and convert it to 6-bit binary
+    key = jy_proxy['socks5']['decrypt']
+    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
+    output_str = ''
+    # count the '=' padding characters
+    equal_num = input_str.count('=')
+    while ascii_list:
+        temp_list = ascii_list[:4]
+        # 转换成2进制字符串
+        temp_str = ''.join(temp_list)
+        # 对没有8位2进制的字符串补够8位2进制
+        if len(temp_str) % 8 != 0:
+            temp_str = temp_str[0:-1 * equal_num * 2]
+        # 4个6字节的二进制  转换  为三个8字节的二进制
+        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
+        # 二进制转为10进制
+        temp_str_list = [int(x, 2) for x in temp_str_list if x]
+        # 连接成字符串
+        output_str += ''.join([chr(x) for x in temp_str_list])
+        ascii_list = ascii_list[4:]
+    return output_str
+
+
+class Socks5Proxy:
+
+    __instance = None
+
+    def __new__(cls, *args, **kwargs):
+        if cls.__instance is None:
+            cls.__instance = super().__new__(cls)
+        return cls.__instance
+
+    def __init__(self):
+        self.seconds = 60
+        self._lock = threading.RLock()
+        self._url = jy_proxy['socks5']['url']
+        self._dq = deque([])
+        self._proxies = {}
+        self._pool = []
+        self._counter = {}
+
+    def _init(self):
+        while not self._proxies:
+            if len(self._dq) > 0:
+                '''队列左边取值'''
+                self._proxies = self._dq.popleft()
+                '''添加到队尾'''
+                self._dq.append(self._proxies)
+            else:
+                self.__request_service()
+                self.__check_proxies()
+
+    @property
+    def proxies(self):
+        with self._lock:
+            return self._proxies if len(self._proxies) > 0 else None
+
+    def switch(self, reset=False):
+        with self._lock:
+            if reset is True:
+                self.__flush_proxy_pool()
+            elif len(self._counter) > 0:
+                end_time = self._counter[self.get_netloc(self._proxies)]
+                current_time = int(time.time())
+                if end_time - current_time < self.seconds:
+                    logger.info(f"[移除socks5代理]{self.get_netloc(self._proxies)}")
+                    self._dq.remove(self._proxies)
+                    del self._counter[self.get_netloc(self._proxies)]
+                    logger.info(f"[socks5代理]剩余 {len(self._dq)} 个")
+
+            self._proxies = {}  # 重置代理
+            while len(self._proxies) == 0:
+                if len(self._dq) > 0:
+                    self._proxies = self._dq.popleft()
+                    self._dq.append(self._proxies)
+                else:
+                    self.__flush_proxy_pool()
+
+    @staticmethod
+    def get_netloc(item: dict):
+        parser = urlparse(item.get('http'))
+        return parser.netloc
+
+    def __request_service(self):
+        try:
+            response = requests.get(self._url, timeout=10)
+            self.__extract_ip(response)
+        except requests.RequestException:
+            pass
+
+    def __extract_ip(self, response):
+        for proxy in response.json():
+            host = decrypt(proxy['host'])
+            port = int(proxy['port'])
+            end_time = proxy['EndTime']
+            items = {
+                'http': 'socks5://{}:{}'.format(host, port),
+                'https': 'socks5://{}:{}'.format(host, port)
+            }
+            self._pool.append(items)
+            self._counter.setdefault(self.get_netloc(items), end_time)
+
+    def __check_proxies(self):
+        check_ip = 'https://myip.ipip.net'
+        logger.info(f"[socks5代理检验]访问地址-{check_ip}")
+        for proxies in self._pool:
+            try:
+                requests_param = {
+                    "headers": headers,
+                    "proxies": proxies,
+                    "timeout": 2
+                }
+                requests.get(check_ip, **requests_param)
+                self._dq.append(proxies)
+            except requests.RequestException:
+                del self._counter[self.get_netloc(proxies)]
+
+    def __flush_proxy_pool(self):
+        logger.info(f"[socks5代理]刷新代理池")
+        self._pool.clear()
+        self._dq.clear()
+        self._counter.clear()
+        self.__request_service()
+        self.__check_proxies()
+
+    def __call__(self, enable_proxy: bool = False, *args, **kwargs):
+        if enable_proxy:
+            logger.info("[加载socks5代理]")
+            self._init()
+        return self
+
+
+Proxy = Socks5Proxy()
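A usage sketch for the singleton above (assuming the proxy service url in conf.yaml is reachable and PySocks is installed so requests can speak socks5):

    import requests

    proxy = Proxy(enable_proxy=True)   # first call fills and validates the pool
    resp = requests.get('https://myip.ipip.net', proxies=proxy.proxies, timeout=10)
    print(resp.text)
    proxy.switch()                     # rotate to the next endpoint when one misbehaves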

+ 10 - 0
ybw/utils/tools.py

@@ -0,0 +1,10 @@
+import bson
+
+
+def int2long(param: int):
+    """int 转换成 long """
+    return bson.int64.Int64(param)
+
+
+def object_id(_id: str):
+    return bson.objectid.ObjectId(_id)

+ 0 - 0
zbytb/config/__init__.py


+ 44 - 0
zbytb/config/conf.yaml

@@ -0,0 +1,44 @@
+# mongo
+mongo:
+  host: 172.17.4.87
+  port: !!int 27080
+#  host: 127.0.0.1
+#  port: !!int 27017
+
+
+# redis
+redis:
+  host: 127.0.0.1
+  port: !!int 6379
+  pwd: ""
+  db: !!int 10
+
+
+# 阿里oss
+ali_oss:
+  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
+  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
+#  endpoint: oss-cn-beijing.aliyuncs.com    # 公网使用
+  endpoint: oss-cn-beijing-internal.aliyuncs.com    # 内网使用
+  bucket_name: jy-datafile
+
+
+# es
+es:
+  host: 172.17.145.170
+#  host: 192.168.3.206
+#  host: 127.0.0.1
+  port: !!int 9800
+
+
+# 代理
+proxy:
+  socks5:
+    url: http://socks.spdata.jianyu360.com/socks/getips?limit=10
+    decrypt: ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/
+
+
+# node库位置
+node_module:
+  windows: C:\Users\dell\AppData\Roaming\npm\node_modules
+  linux: /usr/lib/node_modules

+ 2 - 0
zbytb/config/constants.yaml

@@ -0,0 +1,2 @@
+headers:
+  User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36

+ 30 - 0
zbytb/config/load.py

@@ -0,0 +1,30 @@
+from pathlib import Path
+
+import yaml
+
+__all__ = [
+    'mongo_conf',
+    'redis_conf',
+    'oss_conf',
+    'jy_proxy',
+    'node_module',
+    'es_conf',
+    'headers'
+]
+
+base_path = Path(__file__).parent
+yaml_conf = (base_path / 'conf.yaml').resolve()
+yaml_constants = (base_path / 'constants.yaml').resolve()
+
+with open(yaml_conf, encoding="utf-8") as f:
+    conf = yaml.safe_load(f)
+    mongo_conf = conf['mongo']
+    redis_conf = conf['redis']
+    oss_conf: dict = conf['ali_oss']
+    es_conf: dict = conf['es']
+    jy_proxy: dict = conf['proxy']
+    node_module: dict = conf['node_module']
+
+with open(yaml_constants, encoding="utf-8") as fp:
+    constants = yaml.safe_load(fp)
+    headers: dict = constants['headers']

+ 2 - 0
zbytb/crawler/__init__.py

@@ -0,0 +1,2 @@
+from crawler.spiders.DetailPageSpider import CrawlDetailPageSpider
+from crawler.spiders.ListPageSpider import CrawlListPageSpider

+ 103 - 0
zbytb/crawler/check_utils.py

@@ -0,0 +1,103 @@
+import re
+
+from utils.es_query import get_es
+from utils.execptions import (
+    CustomAccountPrivilegeError,
+    CustomCheckError
+)
+
+__all__ = ['CheckText', 'CheckTask']
+
+
+class CheckContent:
+
+    def __init__(self):
+        self.sensitive_words = {
+            '正式会员', '账户充值', 'VIP会员查阅', '>(注册)<', '>(登录)<', '高级会员',
+            '标准会员', '点击支付'
+        }
+
+    @staticmethod
+    def check_text_length(val: str):
+        if len(val) == 0:
+            raise CustomCheckError(code=10101, reason='文本内容为空')
+        elif not re.findall(r'[\u4e00-\u9fa5]', val, re.S):
+            raise CustomCheckError(code=10102, reason='不存在中文字符')
+
+    @staticmethod
+    def check_content(val: str):
+        if val.count("部分文件可能不支持在线浏览"):
+            raise CustomCheckError(code=10103, reason='文件不支持在线浏览')
+
+    @staticmethod
+    def check_account_privilege(val: str):
+        if val.count("高级会员"):
+            raise CustomAccountPrivilegeError
+        elif "本招标项目仅供正式会员查阅" in val:
+            raise CustomAccountPrivilegeError
+
+    def check_sensitive_word(self, val: str):
+        total = set()
+        for word in self.sensitive_words:
+            result = re.search(word, val)
+            if result is not None:
+                total.add(word)
+
+        if len(total) > 0:
+            raise CustomCheckError(code=10104, reason='详情内容包含敏感词')
+
+    def __check(self, text):
+        self.check_sensitive_word(text)
+        self.check_text_length(text)
+        self.check_content(text)
+        self.check_account_privilege(text)
+
+    def __call__(self, text: str, *args, **kwargs):
+        self.__check(text)
+
+
+class CheckPrePareRequest:
+
+    def __init__(self):
+        self.crawl_keywords = {
+            '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选', '意见征询',
+            '更正公告', '废标', '补遗', '议价', '邀请', '资格预审', '竞标', '变更', '遴选',
+            '磋商', '项目', '评审', '询比', '开标', '澄清', '比选', '中止', '采购', '竟价',
+            '招投标', '拟建', '成交', '中标', '竞争性谈判', '工程', '验收公告', '更正',
+            '单一来源', '变更公告', '合同', '违规', '评判', '监理', '竞价', '答疑',
+            '终止', '系统'
+        }
+
+    @staticmethod
+    def check_es_cache(title: str, publish_time: int, rows: dict):
+        """
+
+        :param title:  标题
+        :param publish_time: 发布时间的时间戳(l_np_publishtime)
+        :param rows: 采集内容
+        """
+        retrieved_result = get_es(title, publish_time)
+        if retrieved_result != 0:
+            '''es查询数据结果'''
+            rows['count'] = retrieved_result
+            raise CustomCheckError(code=10105, reason='标题内容已存在es')
+
+    def check_crawl_title(self, title: str):
+        for keyword in self.crawl_keywords:
+            valid_keyword = re.search(keyword, title)
+            if valid_keyword is not None:
+                break
+        else:
+            raise CustomCheckError(code=10106, reason='标题未检索到采集关键词', title=title)
+
+    def __check(self, rows: dict):
+        title, publish_time = rows['title'], rows['l_np_publishtime']
+        self.check_crawl_title(title)
+        self.check_es_cache(title, publish_time, rows)
+
+    def __call__(self, rows: dict, *args, **kwargs):
+        self.__check(rows)
+
+
+CheckText = CheckContent()
+CheckTask = CheckPrePareRequest()
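A minimal pre-flight sketch using the two singletons above (the row values are hypothetical, and the es duplicate check assumes a reachable es cluster): CheckTask is run against a list-page row before the detail request, CheckText against the fetched detail content, and both raise on failure.

    row = {'title': '某中心实验设备采购招标公告', 'l_np_publishtime': 1640995200}
    try:
        CheckTask(row)                           # keyword + es duplicate check
        CheckText('<div>招标公告正文……</div>')    # length / sensitive-word / privilege check
    except CustomCheckError as e:
        print('skip:', e.code, e.reason)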

+ 90 - 0
zbytb/crawler/clean_html.py

@@ -0,0 +1,90 @@
+# HTML 替换
+import re
+
+
+def th(neirong):
+    tihuan = {
+        '<!--.*?-->': '',
+        '"': "'",
+        '\n': '',
+        '\xa0': "",
+        '<script .*?>': '',
+        '</script>': '',
+        '<span .*?>': '',
+        '</span> ': '',
+        '<p.*?>': '<br>',
+        '</p>': '<br>',
+        '<div>': '<br>',
+        '<div .*?>': '<br>',
+        '<img .*?>': '<br>',
+        '</div>': '<br>',
+        '<style.*?</style>': '',
+        '<EpointForm>': '',
+        '<html.*?</head>': '',
+        '<input .*?>': '',
+        '<!DOCTYPE.*?>': '',
+        '</meta>': '',
+        '<?xml:.*?>': '',
+        '<label.*?>': '<br>',
+        '</label>': '',
+        'style=".*?"': '',
+        "style='.*?'": '',
+        'class=".*?"': '',
+        "class='.*?'": '',
+        "bordercolor='.*?'": '',
+        'bgcolor=".*?"': '',
+        'BORDERCOLOR=".*?"': '',
+        'width=".*?"': '',
+        '<a name=".*?">': '',
+        '<o:p>': '',
+        '</o:p>': '',
+        '<A name=.*?>': '',
+        '<a .*?>': '',
+        '</a>': '',
+        '<font .*?>': '',
+        '</font>': '',
+        '<body>': '',
+        '</body>': '',
+        '<h\d{1}\s{0,10}>.*</h\d{1}\s{0,10}>': '',
+        '</h\d{1}\s{0,10}>': '',
+        '<h\d{1}\s{0,10}}>': '',
+        '【关闭】': '',
+        '【打印】': '',
+    }
+
+    nr = neirong
+
+    all_tag = re.findall("<[^>]+>", nr)
+    for tag in all_tag:
+        nr = nr.replace(tag, str(tag).lower())
+
+    def thh(k, v, c):
+        # flags must be passed by keyword: the 4th positional argument of re.sub is count
+        return re.sub(k, v, c, flags=re.S | re.M)
+
+    for k, v in tihuan.items():
+        nr = thh(k, v, nr)
+    return nr
+
+
+def th_1(neirong):
+    tihuan = {
+        '<!--.*?-->': '',
+        '"': "'",
+        '\n': '',
+        '\xa0': "",
+        '<script .*?>': '',
+        '</script>': '',
+    }
+
+    nr = neirong
+
+    all_tag = re.findall("<[^>]+>", nr)
+    for tag in all_tag:
+        nr = nr.replace(tag, str(tag).lower())
+
+    def thh(k, v, c):
+        # flags must be passed by keyword: the 4th positional argument of re.sub is count
+        return re.sub(k, v, c, flags=re.S | re.M)
+
+    for k, v in tihuan.items():
+        nr = thh(k, v, nr)
+    return nr
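A small illustration of the two cleaners above (output shown only descriptively, since the exact result depends on the replacement order):

    raw = '<div style="color:red"><p>公告正文</p><script src="a.js"></script></div>'
    print(th(raw))    # block tags collapsed to <br>, scripts/styles/attributes stripped
    print(th_1(raw))  # lighter pass: comments/scripts removed, double quotes normalised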

+ 145 - 0
zbytb/crawler/crawl_scheduler.py

@@ -0,0 +1,145 @@
+import datetime
+import random
+import time
+import traceback
+from datetime import date, timedelta
+
+import requests
+
+from crawler.login import User
+from utils.databases import MongoDBS
+from utils.execptions import JyBasicException
+from utils.log import logger
+from utils.tools import int2long, object_id
+
+
+class Scheduler:
+
+    def __init__(self, query: dict):
+        self.query = query
+        self.crawl_account_tab = MongoDBS('py_spider', 'match_account').coll
+        self.crawl_error_tab = MongoDBS('py_spider', 'crawl_error').coll
+        self.crawl_start = False
+        self.account_id = None
+        self.user = None
+        self.spider_code = None
+        self.crawl_url = None
+        self.crawl_params = None
+        self.crawl_exception = None
+        self.crawl_type = None
+        self.__records = None
+
+    def _release_account(self):
+        rows = dict(
+            used=False,
+            update_time=datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
+        )
+        if self.account_id is not None:
+            self.crawl_account_tab.update_one(
+                {'_id': self.account_id},
+                {'$set': rows}
+            )
+
+    def _set_account(self, item: dict):
+        self.account_id = item['_id']
+        self.user = User(item['account'], item['password'])
+        logger.info(f'[开启调度]启用账号: {self.user.username}')
+        usage = int(item['usage'])
+        item['usage'] = usage + 1
+        use_time = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
+        item['update_time'] = use_time
+        item['used'] = True
+        self.crawl_account_tab.update_one(
+            {'_id': self.account_id},
+            {'$set': item}
+        )
+
+    def _query_account(self, query: dict):
+        return self.crawl_account_tab.find_one(query, sort=[('usage', 1)])
+
+    def __enter__(self):
+        rows = self._query_account(self.query)
+        if rows is not None:
+            self._set_account(rows)
+            self.crawl_start = True  # 控制调度的状态
+        else:
+            # TODO 没有空闲账号时,取出使用次数最少的账号,暂未实现
+            logger.warning(f'请检查mongo表 {self.crawl_account_tab.name} 账号状态')
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        logger.info(f'[关闭调度]')
+        self._release_account()
+        self.crawl_start = False
+
+        if exc_type is not None:
+            errmsg = traceback.extract_tb(exc_tb)
+            e = JyBasicException(
+                code=10500,
+                reason=str(exc_type),
+                title='未知系统错误'
+            )
+            self.err_record(e)
+            logger.error(f'错误类型: {exc_type}, 错误内容: {exc_val}, 错误详情: {errmsg}')
+        return True
+
+    def finished(self, execute_next_time=None):
+        logger.info("任务结束")
+        self._release_account()
+        self.sleep(execute_next_time)
+
+    @staticmethod
+    def sleep(wait_time=None):
+        sleep_time = (wait_time or 600)
+        time.sleep(sleep_time)
+
+    @staticmethod
+    def wait_for_next_task(wait_time=None):
+        _sleep = (wait_time or random.choice(range(5, 15)))
+        time.sleep(_sleep)
+
+    @property
+    def today(self):
+        return datetime.datetime.today().strftime('%Y-%m-%d')
+
+    @property
+    def yesterday(self):
+        return (date.today() - timedelta(days=1)).strftime("%Y-%m-%d")
+
+    def err_record(self, e: JyBasicException):
+        rows = {
+            'account': self.user.username if self.user is not None else '',
+            'spidercode': self.spider_code,
+            'url': self.crawl_url,
+            'status_code': e.code,
+            'reason': e.reason,
+            'params': getattr(e, 'title', ''),
+            'crawl_time': int2long(int(time.time())),
+            'crawl_type': self.crawl_type,
+        }
+        self.crawl_error_tab.insert_one(rows)
+
+    @property
+    def crawl_task(self):
+        results = {}
+        url = 'http://cc.spdata.jianyu360.com/schedule/crawl_task/zgytb_scheduler'
+        # url = 'http://127.0.0.1:1405/schedule/crawl_task/zgytb_scheduler'
+        try:
+            response = requests.get(url, timeout=10)
+            if response.status_code == 200:
+                data = response.json()['data']
+                if len(data) > 0:
+                    results['_id'] = object_id(data['_id'])
+                    for key, val in data.items():
+                        if key != '_id':
+                            results[key] = val
+            return results
+        except requests.RequestException:
+            return results
+
+    def query_user(self, account: str):
+        query = {'account': account}
+        rows = self.crawl_account_tab.find_one(query)
+        if rows is None:
+            raise ValueError(f'account not found: {account}')
+        return User(rows['account'], rows['password'])

+ 92 - 0
zbytb/crawler/defaults.py

@@ -0,0 +1,92 @@
+import requests
+import urllib3
+from requests.models import Response, Request
+
+from crawler.login import update_login_cookies
+from crawler.sessions_521 import http_session_521
+
+urllib3.disable_warnings()
+
+
+def prepare_request(
+        headers: dict = None,
+        proxies: dict = None,
+        timeout: int = None,
+        verify: bool = None,
+        cookies=None,
+):
+    request_params = {}
+    request_params.setdefault('headers', headers)
+    request_params.setdefault('timeout', timeout or 60)
+    request_params.setdefault('proxies', proxies)
+    if cookies is not None:
+        request_params.setdefault('cookies', cookies)
+    if verify is not None:
+        request_params.setdefault('verify', verify)
+    return request_params
+
+
+def http_request_get(url, account=None, **kwargs):
+    request_params = prepare_request(**kwargs)
+    retries = 0
+    response = Response()
+    session = requests.Session()
+    while retries < 3:
+        try:
+            response = session.get(url, **request_params)
+            if response.status_code == 200:
+                response.encoding = response.apparent_encoding
+                return True, response
+            elif response.status_code == 521:
+                print("****** 521 ******")
+                response.status_code = 10521
+                _, session, _jsl_clearance_s = http_session_521(url, **request_params)
+                # TODO 验证是否需要将这个临时变量写入等录cookies,临时写入内存,如有必要再添加
+                # if account is not None:
+                #     update_login_cookies(account, _jsl_clearance_s)
+                if 'cookies' not in request_params:
+                    request_params.setdefault('cookies', _jsl_clearance_s)
+                else:
+                    _cookies: dict = request_params.get('cookies')
+                    _cookies.update(_jsl_clearance_s)
+                    request_params.update({'cookies': _cookies})
+                continue
+            elif response.status_code in [404, 301]:
+                response.status_code = response.status_code
+                response.reason = '网站反爬'
+                response.request = Request(
+                    method='get',
+                    url=url,
+                    headers=request_params.get('headers'),
+                )
+                return False, response
+            elif 500 <= response.status_code < 521:
+                response.status_code = response.status_code
+                response.reason = '网站页面无法访问'
+                response.request = Request(
+                    method='get',
+                    url=url,
+                    headers=request_params.get('headers'),
+                )
+                return False, response
+            else:
+                retries += 1
+        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
+            response.status_code = 10000
+            response.reason = e.__class__.__name__
+            response.request = Request(
+                method='get',
+                url=url,
+                headers=request_params.get('headers'),
+            )
+            return False, response
+        except requests.RequestException as e:
+            response.status_code = 10001
+            response.reason = e.__class__.__name__
+            response.request = Request(
+                method='get',
+                url=url,
+                headers=request_params.get('headers'),
+            )
+            retries += 1
+    return False, response
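A usage sketch for the wrapper above (header value is a placeholder): it returns a (success, response) pair and transparently solves the 521 anti-bot challenge via http_session_521 before retrying.

    ok, resp = http_request_get(
        'https://www.zbytb.com/search/?moduleid=25&page=1',
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=30,
    )
    if ok:
        print(resp.status_code, len(resp.text))
    else:
        print('failed:', resp.status_code, resp.reason)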

+ 118 - 0
zbytb/crawler/login.py

@@ -0,0 +1,118 @@
+import json
+import re
+import threading
+from collections import namedtuple
+from pathlib import Path
+
+import requests
+from requests.utils import dict_from_cookiejar
+
+_lock = threading.Lock()
+ROOT_PATH = Path(__file__).parent.parent
+
+JSON_LOGIN_COOKIE = (ROOT_PATH / 'config/login_cookie.json').resolve()
+
+User = namedtuple('User', ['username', 'password'])
+
+
+def _open_file():
+    try:
+        fp = open(JSON_LOGIN_COOKIE, encoding='utf-8')
+    except FileNotFoundError:
+        fp = open(JSON_LOGIN_COOKIE, 'w+', encoding='utf-8')
+    return fp
+
+
+def load_login_cookies(user_name: str):
+    fp = _open_file()
+    try:
+        cookies: dict = json.load(fp).get(user_name)
+        return cookies
+    except json.decoder.JSONDecodeError:
+        return None
+    finally:
+        fp.close()
+
+
+def update_login_cookies(user_name: str, update_val: dict):
+    """
+    Update the stored login cookies for an account.
+
+    Args:
+        user_name: account name
+        update_val: cookie fields to merge into the stored cookies
+
+    """
+    with _lock:
+        fp = open(JSON_LOGIN_COOKIE, encoding='utf-8')
+        user_maps: dict = json.load(fp)
+        login_cookies: dict = user_maps.get(user_name)
+        if login_cookies is not None and len(update_val) > 0:
+            login_cookies.update(update_val)
+            user_login_info = {user_name: login_cookies}
+            user_maps.update(user_login_info)
+            wp = open(JSON_LOGIN_COOKIE, 'w+', encoding='utf-8')
+            wp.write(json.dumps(user_maps, indent=4))
+            wp.close()
+        fp.close()
+
+
+def save_login_cookies(user_name: str, login_cookie: dict):
+    with _lock:
+        fp = _open_file()
+        '''内容存在就加载到内存,不存在就设置为空字典'''
+        try:
+            user_maps: dict = json.load(fp)
+        except json.decoder.JSONDecodeError:
+            user_maps = {}
+        # print(user_maps)
+
+        if user_name not in user_maps:
+            user_maps.setdefault(user_name, login_cookie)
+        else:
+            cookies = {user_name: login_cookie}
+            user_maps.update(cookies)
+
+        wp = open(JSON_LOGIN_COOKIE, 'w+', encoding='utf-8')
+        wp.write(json.dumps(user_maps, indent=4))
+        fp.close()
+        wp.close()
+
+
+def login_status_check(response):
+    userid = re.findall('var destoon_userid = (\d+);', response.text)
+    userid = "".join(userid).strip()
+    username = re.findall('var destoon_username = \'(.*?)\'', response.text)
+    username = "".join(username).strip()
+    # print(">>> _user ", (userid, username))
+    if userid == '0' and username == '':
+        # 登录失效
+        return True
+    else:
+        # 登录有效
+        return False
+
+
+def login(user_name: str, password: str):
+    headers = {
+        "Connection": "keep-alive",
+        "Accept": "application/json, text/javascript, */*; q=0.01",
+        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
+        "X-Requested-With": "XMLHttpRequest",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
+        "Origin": "https://www.zbytb.com",
+        "Referer": "https://www.zbytb.com/member/login.php",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
+    }
+    data = {
+        "username": str(user_name),
+        "password": str(password),
+        "cookietime": "on"
+    }
+    login_url = "https://www.zbytb.com/member/ajaxlogin.php"
+    response = requests.post(login_url, headers=headers, data=data)
+    login_status = response.json()
+    # print(user_name + ' ' + login_status.get('msg'))
+    if login_status.get('msg') == '登录成功':
+        login_cookie = dict_from_cookiejar(response.cookies)
+        save_login_cookies(user_name, login_cookie)
+        return login_cookie
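A typical flow with the helpers above ('my_account' / 'my_password' are placeholders): reuse the cookies persisted in config/login_cookie.json when they exist, otherwise log in and persist a fresh set.

    cookies = load_login_cookies('my_account') or login('my_account', 'my_password')
    print(cookies)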

+ 159 - 0
zbytb/crawler/sessions_521.py

@@ -0,0 +1,159 @@
+import copy
+import re
+import sys
+
+import execjs
+import jsbeautifier
+import requests
+from requests.utils import dict_from_cookiejar
+
+from config.load import node_module
+
+if sys.platform == 'linux':
+    node_module_path = node_module['linux']
+else:
+    node_module_path = node_module['windows']
+
+
+def save_js_script(js_code: str, allow_beautify_code=False):
+    with open('etx.js', 'w', encoding='utf-8') as f:
+        if allow_beautify_code:
+            # write the beautified (de-minified) js instead of the raw source
+            js_code = jsbeautifier.beautify(js_code)
+        f.write(js_code)
+
+
+def load_js_script():
+    with open('etx.js', encoding='utf-8') as f:
+        return f.read()
+
+
+def modify_go_func(repl_js: str, js_func: str):
+    document_code = re.search('document\[.*?\]\s{0,1}=.*?;', repl_js).group()
+    property_name = re.search('document\[.*?\]\s{0,1}?', document_code).group()
+    return_back = 'return {};'.format(property_name)
+    new_js = '\n{a}{b}\n{a}{c}\n{a}'.format(
+        a=' ' * 6,
+        b=document_code,
+        c=return_back
+    )
+    return js_func.replace(repl_js, new_js)
+
+
+def execute_js_script(script_js: str):
+    js_header = '''
+            const jsdom = require("jsdom");
+            const { JSDOM } = jsdom;
+            const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
+            window = dom.window;
+            document = window.document;
+        '''
+    js_script = js_header + script_js
+    # 格式化js代码
+    beautify_js = jsbeautifier.beautify(js_script)
+    # 将js代码弹窗事件修改为控制台输出
+    js_script = beautify_js.replace('alert', 'console.log')
+    # 将源js脚本代码中go函数回调逻辑修改为 return
+    go_etx = re.search('go\(\{.*?\}\)', js_script, flags=re.S).group()
+    return_go_etx = 'return ' + go_etx
+    js_script = js_script.replace(go_etx, return_go_etx)
+    # 替换js代码中setTimeout事件
+    go_func = re.search('function go\(.*\};', js_script, flags=re.S).group()
+    patterns = {
+        'p1': '\n[ ]+.{10,100}\(setTimeout.*\n[ ]+document.*\n[ ]+location.*?\n.*?\n[ ]+',
+        'p2': '\n[ ]+setTimeout.*\n[ ]+.*document.*\n[ ]+location.*?\n.*?\n[ ]+',
+        'p3': '[ ]+.{10,100}\(setTimeout.*[ ]+if \(.*\) \{.*\}.*, _\w{8}\);',
+        'p4': '[ ]+setTimeout.*_\w{8}\);',
+    }
+    go_func_new = copy.deepcopy(go_func)
+    for p, pattern in patterns.items():
+        if p in ['p3', 'p4']:
+            # p1 会误判 p3 情况
+            results = re.findall(pattern, go_func_new, flags=re.S)
+        else:
+            results = re.findall(pattern, go_func_new)
+
+        if len(results) > 0:
+            # print(f"清洗规则:{p}")
+            for obj_js in results:
+                go_func_new = modify_go_func(obj_js, go_func_new)
+
+    js_script = js_script.replace(go_func, go_func_new)
+    js_script = 'function getCookies(){' + js_script + '}'
+    # print(js_script)
+    cwd = node_module_path
+    etx = execjs.compile(js_script, cwd=cwd)
+    return etx.call('getCookies')
+
+
+def extract_clearance(js_code: str):
+    result = re.search('_s=.*%3D', js_code)
+    '''
+    '__jsl_clearance_s=1641259814.553|-1|LlKSd3QgHj0KliuCI5cEMbwU7HU%3D;max-age=3600;path=/'
+    '__jsl_clearance_s=1641259382.821|0|ThOeD4stO5usoh9oC0MP5%2Fx3SPc%3D'
+    '''
+    if result is None:
+        return None
+    result = result.group()
+    result = result.replace('_s=', '')
+    return result
+
+
+def extract_cookies_js(js_html: str):
+    patterns = [
+        '<script>document\.cookie=(.*);location\.href=location\.pathname\+location\.search</script>',
+        '<script>(.*)</script>'
+    ]
+    for pattern in patterns:
+        result = re.search(pattern, js_html)
+        if result is not None:
+            return result.group(1)
+    else:
+        return None
+
+
+def http_session_521(url: str, headers: dict, **kwargs):
+    if 'Cookie' in headers:
+        del headers['Cookie']
+    if 'cookies' in kwargs:
+        del kwargs['cookies']
+
+    request_params = {}
+    request_params.setdefault('proxies', kwargs.get('proxies'))
+    request_params.setdefault('timeout', kwargs.get('timeout') or 60)
+    http_session = requests.Session()
+    try:
+        resp1 = http_session.get(url, headers=headers, **request_params)
+    except requests.RequestException:
+        # print("代理超时")
+        return False, http_session, None
+    else:
+        if resp1.status_code != 521:
+            # print(dict_from_cookiejar(resp1.cookies))
+            return True, http_session, dict_from_cookiejar(resp1.cookies)
+
+        cookies_js1 = extract_cookies_js(resp1.text)
+        if cookies_js1 is None:
+            return False, http_session, None
+
+    resp1_jsl_clearance_s = execjs.eval(cookies_js1)
+    clearance1 = extract_clearance(resp1_jsl_clearance_s)
+    resp1_cookies = dict_from_cookiejar(resp1.cookies)
+    resp1_cookies.update({'__jsl_clearance_s': clearance1})
+    try:
+        resp2 = http_session.get(url, headers=headers, cookies=resp1_cookies, **request_params)
+    except requests.RequestException:
+        # print("代理超时")
+        return False, http_session, None
+    else:
+        cookies_js2 = extract_cookies_js(resp2.text)
+        if cookies_js2 is None:
+            return False, http_session, None
+
+    # save_js_script(cookies_js2)
+    js_script = cookies_js2.replace('失败', '')
+    resp2_jsl_clearance_s = execute_js_script(js_script)
+    clearance2 = extract_clearance(resp2_jsl_clearance_s)
+    resp2_cookies = dict_from_cookiejar(resp2.cookies)
+    resp2_cookies.update({'__jsl_clearance_s': clearance2})
+    return True, http_session, resp2_cookies
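A usage sketch for the 521 (jsl clearance) flow above (it assumes node with the jsdom package is installed at the path configured under node_module): solve the challenge once, then reuse the session and cookies for the real request.

    ok, session, cookies = http_session_521(
        'https://www.zbytb.com/s-zb-20147673.html',
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=30,
    )
    if ok:
        resp = session.get('https://www.zbytb.com/s-zb-20147673.html',
                           cookies=cookies, timeout=30)
        print(resp.status_code)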

+ 297 - 0
zbytb/crawler/spiders/DetailPageSpider.py

@@ -0,0 +1,297 @@
+import random
+import re
+import time
+from urllib.parse import urlencode, urlparse
+
+from bs4 import BeautifulSoup
+from pymongo.errors import DuplicateKeyError
+
+from crawler.check_utils import CheckText, CheckTask
+from crawler.clean_html import th_1, th
+from crawler.crawl_scheduler import Scheduler
+from crawler.defaults import http_request_get
+from crawler.login import load_login_cookies, login, User, login_status_check
+from utils.attachment import (
+    extract_file_type,
+    AttachmentDownloader,
+    extract_file_name, extract_file_name_by_href
+)
+from utils.databases import MongoDBS
+from utils.execptions import (
+    CustomAccountPrivilegeError,
+    AttachmentNullError,
+    CustomCheckError, JyBasicException
+)
+from utils.log import logger
+from utils.tools import int2long
+
+
+class CrawlDetailPageSpider:
+
+    def __init__(
+            self,
+            db: str,
+            crawl_tab: str,
+            save_tab: str,
+            error_tab: str,
+    ):
+        self.crawl_tab = MongoDBS(db, crawl_tab).coll
+        self.save_tab = MongoDBS(db, save_tab).coll
+        self.crawl_error_tab = MongoDBS(db, error_tab).coll
+
+        self.senior_account = 'runhekeji'
+
+        self.attachment_downloader = AttachmentDownloader()
+
+    @staticmethod
+    def select_user(rows: dict, sc: Scheduler):
+        """
+        选择用户账号,并在采集内容中添加用户账号
+
+        :param rows: 采集内容
+        :param sc: 采集账号任务分配调度器
+        :return: 用户账号和账号cookie
+        """
+        account = rows.get('account', sc.user.username)
+        rows.update({'account': account})
+        return account, load_login_cookies(account)
+
+    @staticmethod
+    def extract_response_content(response):
+        results = re.findall(r'Inner(.*?);Inner', response.text)
+        if len(results) > 0:
+            return results[0][13:-1]
+        else:
+            return ''
+
+    @staticmethod
+    def prepare_url(rows: dict):
+        host = "https://www.zbytb.com/api/task.js.php"
+        params = {
+            "moduleid": rows["type_code"],
+            "html": "show",
+            "itemid": re.findall(r"\d+\.?\d*", rows["competehref"])[0][:-1],
+            "page": "1",
+            "es": "",
+            "refresh": "{}.js".format(random.random())
+        }
+        url = host + '?' + urlencode(params)
+        return url
+
+    def handler_error(self, response, reason: str, code: int, rows: dict):
+        logger.error(reason)
+        if code == 10104 and rows.get('account') != self.senior_account:
+            self.set_senior_privilege(rows)
+            return
+        else:
+            self.crawl_tab.update_one(
+                {'_id': rows['_id']},
+                {'$set': {'crawl_status': 'error'}}
+            )
+        response.status_code = code
+        err_msg = response.reason = reason
+        response.request.url = rows['competehref']
+        self.crawl_error(
+            response=response,
+            spider_code=rows['spidercode'],
+            account=rows.get('account'),
+            err_msg=err_msg,
+        )
+
+    def process_attachment(self, content: str, rows: dict):
+        soup = BeautifulSoup(content, "lxml")
+        all_a = soup.findAll("a")
+        attachments = {}
+        index = 0
+        for tag_a in all_a:
+            file_name, file_type = (tag_a.string or tag_a.text), None
+            file_path = tag_a.attrs.get("href", "")
+            if file_type is None:
+                # 抽取文件类型
+                file_type = (extract_file_type(file_name)
+                             or extract_file_type(file_path))
+            # 抽取文件名称
+            parser = urlparse(file_path)
+            if parser.scheme in ['https', 'http'] and file_type is not None:
+                if not file_name:
+                    name = extract_file_name_by_href(file_path, file_type)
+                    if name is not None:
+                        file_name = name
+                    else:
+                        file_name = f"{rows['title']}_{index}"
+
+                attachment = self.attachment_downloader.download(
+                    file_name=file_name,
+                    file_type=file_type,
+                    download_url=file_path,
+                )
+                attachments[str(index + 1)] = attachment
+                index += 1
+
+        if attachments:
+            rows["projectinfo"] = {"attachments": attachments}
+
+    def process_content(self, content, rows: dict):
+        self.process_attachment(content, rows)
+        rows["contenthtml"] = th_1(content)
+        rows["detail"] = th(content)
+        rows["comeintime"] = int2long(int(time.time()))
+        '''清除采集字段'''
+        if 'crawl_status' in rows:
+            del rows['crawl_status']
+        del rows['type_code'], rows['account'], rows['crawl'], rows['count']
+        try:
+            self.save_tab.insert_one(rows)
+        except DuplicateKeyError:
+            pass
+        logger.info("[采集成功]{}-{}".format(rows['title'], rows['publishtime']))
+
+    def set_senior_privilege(self, rows: dict):
+        """
+        设置高级账号
+
+        :param rows: 采集数据内容
+        """
+        '''需要高级会员才能查询的招标信息,设置高级账号'''
+        self.crawl_tab.update_one(
+            {"_id": rows["_id"]},
+            {'$set': {'account': self.senior_account}}
+        )
+        '''释放采集状态'''
+        self.set_crawl_status(rows, False)
+
+    def crawl_error(
+            self,
+            *,
+            spider_code: str,
+            account: str,
+            err_msg='采集失败',
+            response=None,
+            rows=None,
+    ):
+        items = {
+            'account': account,
+            'spidercode': spider_code,
+            'crawl_time': int2long(int(time.time())),
+            'crawl_type': 'detail'
+        }
+        if response is not None:
+            items.update({
+                'url': response.request.url,
+                'status_code': response.status_code,
+                'reason': response.reason,
+                'params': getattr(response.request, 'params', None),
+            })
+        elif rows is not None:
+            items.update({
+                'url': rows['url'],
+                'status_code': rows['status_code'],
+                'reason': rows['reason'],
+                'params': rows['params'],
+            })
+        self.crawl_error_tab.insert_one(items)
+        logger.error(err_msg)
+
+    def crawl_success(self, response, rows: dict):
+        content = self.extract_response_content(response)
+        try:
+            CheckText(content)
+            self.process_content(content, rows)
+            self.crawl_tab.update_one(
+                {'_id': rows['_id']},
+                {'$set': {'crawl_status': 'finished'}}
+            )
+        except (AttachmentNullError, CustomCheckError) as e:
+            self.handler_error(response, e.reason, e.code, rows)
+        except CustomAccountPrivilegeError:
+            self.set_senior_privilege(rows)
+
+    @staticmethod
+    def crawl_request(user: User, url: str, headers: dict, cookies: dict):
+        retries = 0
+        while True:
+            success, response = http_request_get(
+                url,
+                headers=headers,
+                cookies=cookies,
+                verify=False,
+            )
+            if not success and response.status_code == 10000 and retries < 3:
+                retries += 1
+            else:
+                retry_login = login_status_check(response)
+                if retry_login:
+                    cookies = login(*user)
+                    logger.info(f"重新登录:{user.username}")
+                else:
+                    break
+        return success, response
+
+    def set_crawl_status(self, rows: dict, status: bool):
+        self.crawl_tab.update_one(
+            {'_id': rows['_id']},
+            {'$set': {'crawl': status}}
+        )
+
+    def crawl_spider(self, rows: dict, sc: Scheduler):
+        headers = {
+            'Host': 'www.zbytb.com',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
+            'Accept': '*/*',
+            'Referer': 'https://www.zbytb.com/s-zb-20147673.html',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+        }
+        headers.update({'Referer': rows['competehref']})
+        url = self.prepare_url(rows)
+        account, cookies = self.select_user(rows, sc)
+        user = sc.query_user(account)
+        success, response = self.crawl_request(user, url, headers, cookies)
+        print(rows['competehref'])
+        if success:
+            self.crawl_success(response, rows)
+        else:
+            self.crawl_error(
+                spider_code=rows['spidercode'],
+                account=account,
+                response=response
+            )
+
+    def _spider(self, sc: Scheduler):
+        while True:
+            item = sc.crawl_task
+            if len(item) == 0:
+                return False
+            self.set_crawl_status(item, True)
+            sc.spider_code = item['spidercode']
+            sc.crawl_url = item['competehref']
+            try:
+                CheckTask(item)
+                self.crawl_spider(item, sc)
+                self.set_crawl_status(item, False)
+                sc.wait_for_next_task(10)
+            except JyBasicException as e:
+                if e.code == 10105:
+                    '''检查出该异常时,程序会将es查询结果更新采集表'''
+                    self.crawl_tab.update_one(
+                        {"_id": item["_id"]},
+                        {'$set': {'count': item['count']}}
+                    )
+                else:
+                    sc.err_record(e)
+                    self.crawl_tab.update_one(
+                        {"_id": item["_id"]},
+                        {'$set': {'crawl_status': 'error'}}
+                    )
+                self.set_crawl_status(item, False)
+
+    def start(self):
+        query = {'used': False, 'site': '中国招标与采购网'}
+        while True:
+            with Scheduler(query) as scheduler:
+                scheduler.crawl_type = 'detail'
+                if scheduler.crawl_start:
+                    finished = self._spider(scheduler)
+                    if not finished:
+                        scheduler.wait_for_next_task(2)
+                else:
+                    scheduler.wait_for_next_task(60)

+ 150 - 0
zbytb/crawler/spiders/ListPageSpider.py

@@ -0,0 +1,150 @@
+import time
+from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import urlencode
+
+from lxml.html import fromstring
+
+from config.load import headers
+from crawler.defaults import http_request_get
+from utils.databases import MongoDBS
+from utils.es_query import get_es
+from utils.log import logger
+from utils.socks5 import Proxy
+from utils.tools import int2long
+
+
+class CrawlListPageSpider:
+
+    def __init__(
+            self,
+            db: str,
+            crawl_tab: str,
+            error_tab: str,
+            enable_proxy=False,
+            **kwargs
+    ):
+        self.crawl_tab = MongoDBS(db, crawl_tab).coll
+        self.crawl_error_tab = MongoDBS(db, error_tab).coll
+
+        self.host = 'https://www.zbytb.com/search'
+        self.headers = kwargs.get('headers') or headers
+        self.enable_proxy = enable_proxy
+        self.proxy = Proxy(enable_proxy)
+        self.proxies = None
+
+    def crawl_request(self, url, **kwargs):
+        retries = 0
+        while True:
+            success, response = http_request_get(
+                url,
+                headers=kwargs.get('headers'),
+                proxies=self.proxies,
+            )
+            if not success and response.status_code == 10000 and retries < 3:
+                self.proxy.switch()
+                self.proxies = self.proxy.proxies
+                retries += 1
+            else:
+                break
+        return success, response
+
+    def crawl_success(self, response, **kwargs):
+        label_info: dict = kwargs.get('label_info')
+        element = fromstring(response.text)
+        nodes = element.xpath('//*[@class="zblist_table"]//tr[position()>1]')
+        results = []
+        for node in nodes:
+            publish_time = str(node.xpath("./td[4]/text()")[0])
+            l_np_publishtime = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
+            info = {
+                "title": node.xpath("./td[2]/a/text()")[0],
+                "competehref": node.xpath("./td[2]/a/@href")[0],
+                "area": node.xpath("./td[1]/a/text()")[0],
+                "publishtime": publish_time,
+                "l_np_publishtime": int2long(l_np_publishtime),
+                "site": "中国招标与采购网",
+                "comeintime": int2long(int(time.time())),
+                "href": "#",
+                "T": "bidding",
+                "sendflag": "false",
+                "_d": "comeintime",
+                **label_info,
+                "crawl": False,
+            }
+            info["count"] = get_es(info["title"], info["l_np_publishtime"])
+            # print('>>> ', info['competehref'])
+            results.append(info)
+        if results:
+            self.crawl_tab.insert_many(results)
+        logger.info(f'[采集成功]{len(results)}条')
+
+    def crawl_error(self, response, **kwargs):
+        items = {
+            'url': response.request.url,
+            'status_code': response.status_code,
+            'reason': response.reason,
+            'params': response.request.params,
+            'spidercode': kwargs.get('label_info').get('spidercode'),
+            'crawl_time': int2long(int(time.time())),
+            'crawl_type': 'list'
+        }
+        self.crawl_error_tab.insert_one(items)
+        logger.error('采集失败')
+
+    def crawl_spider(self, task: tuple):
+        module_id, area_id, page = task
+        label_info = {}
+        if module_id == 27:
+            label_info = {
+                'type_code': 25,  # 详情api中使用参数
+                'channel': '询价',
+                'spidercode': 'a_zgzbycgw_zbxx_xj',
+            }
+        elif module_id == 24:
+            label_info = {
+                'type_code': 24,
+                'channel': '中标',
+                'spidercode': 'a_zgzbycgw_zbxx_zb',
+            }
+        elif module_id == 25:
+            label_info = {
+                'type_code': 25,
+                'channel': '招标信息',
+                'spidercode': 'a_zgzbycgw_zbxx_zbxx',
+            }
+        elif module_id == 26:
+            label_info = {
+                'type_code': 26,
+                'channel': '拟在建',
+                'infoformat': 2,  # 拟建爬虫标记
+                'spidercode': 'a_zgzbycgw_zbxx_nzj',
+            }
+        params = {
+            'moduleid': module_id,
+            'areaids': area_id,
+            'page': page,
+        }
+        referer = url = '{}/?{}'.format(self.host, urlencode(params))
+        self.headers.update({'Referer': referer})
+        logger.info(f"开始请求: {url}")
+        success, response = self.crawl_request(url, headers=self.headers)
+        if not success:
+            self.crawl_error(response, label_info=label_info)
+        else:
+            self.crawl_success(response, label_info=label_info, page=page)
+
+    @property
+    def crawl_tasks(self):
+        tasks = []
+        for module_id in [24, 25, 26, 27]:
+            # 遍历省份
+            for area_id in range(1, 32):
+                # 遍历页码
+                for page in range(1, 10):
+                    tasks.append((module_id, area_id, page))
+        yield from tasks
+
+    def start(self, workers=1):
+        if self.enable_proxy:
+            self.proxies = self.proxy.proxies
+        with ThreadPoolExecutor(max_workers=workers) as Executor:
+            Executor.map(self.crawl_spider, self.crawl_tasks)

+ 2 - 0
zbytb/crawler/spiders/__init__.py

@@ -0,0 +1,2 @@
+from .ListPageSpider import CrawlListPageSpider
+from .DetailPageSpider import CrawlDetailPageSpider

+ 57 - 0
zbytb/error_html/error_page.html

@@ -0,0 +1,57 @@
+<!doctype html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport"
+          content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0">
+    <meta http-equiv="X-UA-Compatible" content="ie=edge">
+    <title>Document</title>
+</head>
+<body>
+<script type="text/javascript"
+        src="https://www.zbytb.com/skin/default/flexpaper/flexpaper_flash.js"></script>
+<div class="content">
+    <div class="texts">
+        <p>本招标项目仅供高级会员、VIP会员查阅,您的权限不能浏览详细信息,请点击<a
+                href="https://www.zbytb.com/member/register.php">注册</a>/<a
+                href="https://www.zbytb.com/member/login.php">登录</a>,或联系工作人员办理会员入网事宜,成为正式会员后方可获取详细的招标公告、报名表格、项目附件及部分项目招标文件等。<br>联系人:徐畅<br/>电话:010-53602088<br/>手机:13683389727
+            (微信同号)<br/>邮箱:xuchang@zbytb.com<br>传真:010-53602088
+        <p>
+    </div>
+    <br/>
+    <p style="width:800px; height:30px;padding:8px;margin:auto; margin-bottom:15px;background:#eef7fe; line-height:30px; font-weight:bold; text-align:center">
+        此项目必须是高级会员及以上查看,请点击<a href="https://www.zbytb.com/member/register.php"
+                              style="color:red;">注册</a>或<a
+            href="https://www.zbytb.com/member/grade.php?groupid=#up"
+            style="color:red;">升级</a>为<font color="red"><b>高级会员</b></font>查看公告详情!
+    </p>
+    <p class="t_c f_18 h_40">您可以通过以下两种方式来查看本项目全部内容。</p>
+    <div class="fangfa clearfix">
+        <div class="fangfa_l">
+            <p class=" f_18">方式一</p>
+            <p>查看详情需要支付资金<span class="color_r">9000</span>元</p>
+            <p>您的资金余额 <span class="color_r">0.00</span>元</p>
+            <p>请点击支付按钮支付后查看</p>
+            <div class="hc_10">&nbsp;</div>
+            <p>
+                <a href="https://www.zbytb.com/member/pay.php?mid=25&itemid=20076620&username=xuchang&fee_back=0&fee=9000&currency=money&sign=CBB9D2160F1DA534F3C49B842FF08636&title=%E5%A4%A9%E6%B4%A5%E5%85%89%E5%A4%A7%E5%85%B4%E8%BE%B0%E7%8E%AF%E4%BF%9D%E8%83%BD%E6%BA%90%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8%E9%A4%90%E5%8E%A8%E5%9C%B0%E6%B2%9F%E6%B2%B9%E8%84%82%E9%94%80%E5%94%AE%E8%AF%A2%E4%BB%B7%E5%85%AC%E5%91%8A&forward=https%3A%2F%2Fwww.zbytb.com%2Fzb%2Fhttps%3A%2F%2Fwww.zbytb.com%2Fs-zb-20076620.html"
+                   class="zhifu" target="_blank" rel="nofollow">立即支付</a> &nbsp;&nbsp;
+                <a href="https://www.zbytb.com/member/charge.php?action=pay"
+                   class="chongzhi" target="_blank" rel="nofollow">账户充值</a></p>
+        </div>
+        <div class="fangfa_l">
+            <p class=" f_18">方式二</p>
+            <p>请升级成正式会员后查看</p>
+            <p>您当前所在会员组标准会员</p>
+            <p>升级为<span class="color_r">高级会员</span>后即可查看</p>
+            <div class="hc_10">&nbsp;</div>
+            <p><a href="https://www.zbytb.com/member/grade.php?groupid=#up"
+                  class="zhifu" target="_blank" rel="nofollow">现在升级</a></p>
+        </div>
+    </div>
+    <div class="hc_20">&nbsp;</div>
+</div>
+<br/>
+<br/>
+</body>
+</html>

+ 32 - 0
zbytb/main.py

@@ -0,0 +1,32 @@
+from crawler.spiders.DetailPageSpider import CrawlDetailPageSpider
+from crawler.spiders.ListPageSpider import CrawlListPageSpider
+
+
+def list_page_spider():
+    headers = {
+        'Host': 'www.zbytb.com',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+    }
+    CrawlListPageSpider(
+        db='py_spider',
+        crawl_tab='zbytb_list',
+        error_tab='crawl_error',
+        enable_proxy=True,
+        headers=headers,
+    ).start(workers=4)
+
+
+def detail_page_spider():
+    CrawlDetailPageSpider(
+        db='py_spider',
+        crawl_tab='zbytb_list',
+        save_tab='data_bak',
+        error_tab='crawl_error',
+    ).start()
+
+
+if __name__ == '__main__':
+    detail_page_spider()

+ 79 - 0
zbytb/requirements.txt

@@ -0,0 +1,79 @@
+aliyun-python-sdk-core==2.13.35
+aliyun-python-sdk-kms==2.14.0
+aniso8601==9.0.1
+backcall==0.2.0
+beautifulsoup4==4.9.3
+better-exceptions==0.3.3
+bs4==0.0.1
+certifi==2021.10.8
+cffi==1.15.0
+chardet==3.0.4
+click==8.0.3
+colorama==0.4.4
+copyheaders==0.0.2
+crcmod==1.7
+cryptography==35.0.0
+cssselect==1.1.0
+DBUtils==2.0.2
+ddddocr==1.1.0
+decorator==5.1.0
+EditorConfig==0.12.3
+elasticsearch==7.10.1
+et-xmlfile==1.1.0
+fake-useragent==0.1.11
+Flask==2.0.2
+flatbuffers==2.0
+greenlet==1.1.2
+idna==2.8
+influxdb==5.3.1
+ipython==7.30.1
+itsdangerous==2.0.1
+jedi==0.18.1
+jieba==0.42.1
+Jinja2==3.0.3
+jmespath==0.10.0
+jsbeautifier==1.14.0
+loguru==0.5.3
+lxml==4.6.3
+MarkupSafe==2.0.1
+matplotlib-inline==0.1.3
+msgpack==1.0.3
+numpy==1.21.4
+onnxruntime==1.9.0
+openpyxl==3.0.9
+oss2==2.14.0
+parsel==1.6.0
+parso==0.8.3
+pickleshare==0.7.5
+Pillow==8.2.0
+pinyin==0.4.0
+playwright==1.17.0
+prompt-toolkit==3.0.24
+protobuf==3.19.1
+pycparser==2.21
+pycryptodome==3.11.0
+pyee==8.2.2
+PyExecJS==1.5.1
+Pygments==2.10.0
+pymongo==3.8.0
+PyMySQL==1.0.2
+PySocks==1.7.1
+python-dateutil==2.8.2
+pytz==2021.3
+PyYAML==5.4.1
+redis==3.3.6
+redis-py-cluster==2.1.3
+requests==2.22.0
+retrying==1.3.3
+selenium==3.141.0
+six==1.16.0
+soupsieve==2.3.1
+traitlets==5.1.1
+typing_extensions==4.0.0
+urllib3==1.25.11
+w3lib==1.22.0
+wcwidth==0.2.5
+websockets==10.1
+Werkzeug==2.0.2
+win32-setctime==1.0.3
+wincertstore==0.2

+ 7 - 0
zbytb/start.sh

@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# switch to the project directory
+cd /mnt/zbytb
+/usr/bin/python3 /mnt/zbytb/main.py
+# keep a foreground process so the container does not exit
+/usr/sbin/init

+ 0 - 0
zbytb/utils/__init__.py


+ 23 - 0
zbytb/utils/aliyun.py

@@ -0,0 +1,23 @@
+import oss2
+
+from config.load import oss_conf
+
+
+class AliYunService:
+
+    def __init__(self):
+        self.__acc_key_id = oss_conf['key_id']
+        self.__acc_key_secret = oss_conf['key_secret']
+        self.__endpoint = oss_conf['endpoint']
+        self.__bucket_name = oss_conf['bucket_name']
+
+    def _push_oss_from_local(self, key, filename):
+        """
+        上传一个本地文件到OSS的普通文件
+
+        :param str key: 上传到OSS的文件名
+        :param str filename: 本地文件名,需要有可读权限
+        """
+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
+        bucket.put_object_from_file(key, filename)

+ 198 - 0
zbytb/utils/attachment.py

@@ -0,0 +1,198 @@
+import hashlib
+import os
+import re
+import traceback
+import uuid
+import warnings
+from urllib.parse import urlparse, unquote
+
+import requests
+import urllib3
+
+from config.load import headers
+from utils.aliyun import AliYunService
+from utils.execptions import AttachmentNullError
+from utils.socks5 import Proxy
+
+urllib3.disable_warnings()
+
+
+def sha1(val):
+    _sha1 = hashlib.sha1()
+    if isinstance(val, bytes):
+        _sha1.update(str(val).encode("utf-8"))
+    elif isinstance(val, str):
+        _sha1.update(val.encode("utf-8"))
+    return _sha1.hexdigest()
+
+
+def remove(file_path: str):
+    os.remove(file_path)
+
+
+def getsize(file_path: str):
+    try:
+        return os.path.getsize(file_path)
+    except FileNotFoundError:
+        return 0
+
+
+def discern_file_format(text):
+    file_types = {
+        'pdf', 'doc', 'docx', 'rar', 'zip', 'gzzb', 'jpg', 'png', 'swf'
+    }
+    for file_type in file_types:
+        all_file_format = [file_type, file_type.upper()]
+        for t in all_file_format:
+            result = re.match(f'.*{t}$', text, re.S)
+            if result is not None:
+                return t
+    else:
+        return None
+
+
+def extract_file_type(text):
+    if text is None:
+        return None
+    return discern_file_format(text)
+
+
+def extract_file_name_by_href(href: str, file_type: str):
+    """从url中抽取文件名称"""
+    # 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
+    # 中文字符:[\u4e00 -\u9fa5]
+    zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
+    parser = urlparse(href)
+    query = (parser.query or parser.path)
+    result = re.search(f'.*\\.{file_type}', query, re.S)
+    if result is not None:
+        encode_str = unquote(result.group())
+        name = re.search(zh_char_pattern, encode_str)
+        if name is not None:
+            return unquote(name.group())
+    return None
+
+
+def extract_file_name(text):
+    file_type = discern_file_format(text)
+    if file_type is not None:
+        repl = '.{}'.format(file_type)
+        text = text.replace(repl, '')
+    return text
+
+
+def verify_file_name(name):
+    if extract_file_type(name) is None:
+        raise ValueError
+
+
+class AttachmentDownloader(AliYunService):
+
+    def __init__(self):
+        super(AttachmentDownloader, self).__init__()
+        self.dir_name = 'file'
+
+    def _create_file(self, filename, filetype):
+        os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
+        file = "{filename}.{filetype}".format(
+            filename=sha1("{}_{}".format(filename, uuid.uuid4())),
+            filetype=filetype
+        )
+        return "{}/{}".format(self.dir_name, file)
+
+    @staticmethod
+    def _create_fid(file_stream: bytes):
+        return sha1(file_stream)
+
+    @staticmethod
+    def _origin_filename(fid: str, filetype: str):
+        return "{}.{}".format(fid, filetype)
+
+    @staticmethod
+    def _file_size(file: str):
+        _kb = float(getsize(file)) / 1024
+        if _kb >= 1024:
+            _M = _kb / 1024
+            if _M >= 1024:
+                _G = _M / 1024
+                return "{:.1f} G".format(_G)
+            else:
+                return "{:.1f} M".format(_M)
+        else:
+            return "{:.1f} kb".format(_kb)
+
+    @staticmethod
+    def _download(
+            url: str,
+            file: str,
+            enable_proxy=False,
+            allow_show_exception=False,
+            **kwargs
+    ):
+        request_params = {}
+        request_params.setdefault('headers', kwargs.get('headers') or headers)
+        request_params.setdefault('proxies', kwargs.get('proxies'))
+        request_params.setdefault('timeout', kwargs.get('timeout') or 60)
+        request_params.setdefault('stream', kwargs.get('stream') or True)
+        request_params.setdefault('verify', kwargs.get('verify') or False)
+        proxy = Proxy(enable_proxy)
+        retries = 0
+        while retries < 3:
+            try:
+                with requests.get(url, **request_params) as req:
+                    if req.status_code == 200:
+                        stream = req.content
+                        with open(file, 'wb') as f:
+                            f.write(stream)
+                        return stream
+                    else:
+                        retries += 1
+            except requests.RequestException:
+                if allow_show_exception:
+                    traceback.print_exc()
+                if enable_proxy:
+                    proxy.switch()
+                    request_params.update({'proxies': proxy.proxies})
+                retries += 1
+        return b''
+
+    def download(
+            self,
+            file_name: str,
+            file_type: str,
+            download_url: str,
+            enable_proxy=False,
+            allow_request_exception=False,
+            **kwargs
+    ):
+        if not file_name or not file_type or not download_url:
+            raise AttachmentNullError
+
+        local_tmp_file = self._create_file(file_name, file_type)
+        file_stream = self._download(
+            download_url,
+            local_tmp_file,
+            enable_proxy,
+            allow_request_exception,
+            **kwargs
+        )
+        result = {
+            'filename': '{}.{}'.format(file_name, file_type),
+            'org_url': download_url
+        }
+        if len(file_stream) > 0:
+            try:
+                fid = self._create_fid(file_stream)
+                key = self._origin_filename(fid, file_type)
+                result.setdefault('fid', key)
+                result.setdefault('ftype', file_type)
+                result.setdefault('size', self._file_size(local_tmp_file))
+                result.setdefault('url', 'oss')
+                super()._push_oss_from_local(key, local_tmp_file)
+            except Exception as e:
+                warnings.warn(
+                    "[{}]下载异常,原因:{}".format(file_name, e.__class__.__name__)
+                )
+        remove(local_tmp_file)
+        # the attachment info is returned whether the download/upload succeeded or not
+        return result
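
A minimal usage sketch for the downloader above, not taken from the spiders themselves; the URL, display name and flags are placeholders:

    from utils.attachment import AttachmentDownloader, extract_file_type

    downloader = AttachmentDownloader()
    href = 'http://example.com/files/%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6.pdf'  # placeholder URL
    attachment = downloader.download(
        file_name='招标文件',                       # display name stored in `filename`
        file_type=extract_file_type(href) or 'pdf',
        download_url=href,
        enable_proxy=False,
    )
    # `filename` and `org_url` are always present; `fid`, `ftype`, `size` and `url`
    # only appear when both the download and the OSS upload succeed.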

+ 47 - 0
zbytb/utils/databases.py

@@ -0,0 +1,47 @@
+from typing import Optional
+
+import pymongo
+import redis
+from pymongo.collection import Collection
+from pymongo.database import Database
+
+from config.load import mongo_conf, redis_conf
+
+__all__ = ['MongoDBS', 'RedisDBS']
+
+
+class MongoDBS:
+    """ Mongo """
+
+    def __init__(self, db: str, collection: str, cfg: dict = mongo_conf):
+        self.client = pymongo.MongoClient(host=cfg['host'], port=cfg['port'])
+        self.db: Database = self.client[db]
+        self.coll: Collection = self.db[collection]
+
+    def __enter__(self):
+        return self.coll
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # context-manager exit: close the client connection
+        self.client.close()
+
+    def __del__(self):
+        # close the client connection when the instance is destroyed
+        self.client.close()
+
+
+class RedisDBS:
+    """ redis """
+
+    def __init__(self, cfg: Optional[dict] = redis_conf):
+        pool = redis.ConnectionPool(
+            host=cfg['host'],
+            port=cfg['port'],
+            password=cfg['pwd'],
+            db=cfg['db']
+        )
+        self.__r = redis.Redis(connection_pool=pool, decode_responses=True)
+
+    @property
+    def redis(self):
+        return self.__r
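
For reference, a short sketch of how these wrappers are used (the database, collection and key names below are placeholders):

    from utils.databases import MongoDBS, RedisDBS
    from utils.tools import int2long

    # MongoDBS is a context manager: it yields the collection and closes the client on exit
    with MongoDBS('py_spider', 'zbytb_list') as coll:
        coll.insert_one({'title': 'demo', 'comeintime': int2long(1650000000)})

    # RedisDBS just exposes a pooled client through the `redis` property
    RedisDBS().redis.sadd('zbytb_dedup', 'some-key')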

+ 45 - 0
zbytb/utils/es_query.py

@@ -0,0 +1,45 @@
+import requests
+from elasticsearch import Elasticsearch
+
+from config.load import es_conf
+
+es = Elasticsearch([{"host": es_conf['host'], "port": es_conf['port']}])
+
+
+def httpAz(title):
+    url = "http://{}:{}/bidding/_analyze".format(es_conf['host'], es_conf['port'])
+    params = {"text": title, "analyzer": "ik_smart"}
+    arr = []
+    res = requests.get(url=url, params=params, timeout=60)
+    if res.status_code == 200:
+        tokens = res.json().get('tokens', [])
+        for x in tokens:
+            if x["token"].encode('utf-8').isalpha():
+                continue
+            arr.append(x["token"])
+
+    q = [{"multi_match": {"query": v, "type": "phrase", "fields": ["title"]}} for v in arr]
+    return q
+
+
+def get_es(title, publishtime):
+    """
+    :param title: announcement title
+    :param publishtime: publish time (unix timestamp)
+    :return: number of documents in the `bidding` index with a similar title
+             within ±5 days of the publish time
+    """
+    stime = publishtime - 432000    # 5 days earlier
+    etime = publishtime + 432000    # 5 days later
+    q1 = httpAz(title)
+    q1.append({"range": {"publishtime": {"from": stime, "to": etime}}})
+    esQuery = {
+        "query": {
+            "bool": {
+                "must": q1,
+                "minimum_should_match": 1
+            }
+        }
+    }
+    result = es.search(index='bidding', body=esQuery, request_timeout=100)
+    count = len(result['hits']['hits'])
+    return count
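
A quick sketch of the intended call (title and timestamp are placeholders); the returned count is the number of already-indexed documents with a similar title within ±5 days, so a non-zero value can be treated as a duplicate:

    from utils.es_query import get_es

    count = get_es('某某项目招标公告', 1650000000)  # placeholder title / unix timestamp
    if count > 0:
        pass  # a similar announcement already exists in the `bidding` index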

+ 35 - 0
zbytb/utils/execptions.py

@@ -0,0 +1,35 @@
+
+class JyBasicException(Exception):
+
+    def __init__(self, code: int, reason: str, **kwargs):
+        self.code = code
+        self.reason = reason
+        self.err_details = kwargs
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+
+
+class CustomCheckError(JyBasicException):
+
+    def __init__(self, code: int = 10002, reason: str = '特征条件检查异常', **kwargs):
+        super().__init__(code, reason, **kwargs)
+
+
+class AttachmentNullError(JyBasicException):
+
+    def __init__(self, code: int = 10004, reason: str = '附件下载异常', **kwargs):
+        super().__init__(code, reason, **kwargs)
+
+
+class CustomAccountPrivilegeError(JyBasicException):
+
+    def __init__(self, *args, **kwargs):
+        pass
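
A small sketch of how these exceptions are meant to be raised and handled; the check itself is a made-up placeholder:

    from utils.execptions import CustomCheckError, JyBasicException

    def check_item(item: dict):
        if not item.get('title'):
            raise CustomCheckError(reason='empty title', href=item.get('href'))

    try:
        check_item({})
    except JyBasicException as e:
        print(e.code, e.reason, e.err_details)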

+ 11 - 0
zbytb/utils/log.py

@@ -0,0 +1,11 @@
+from loguru import logger
+
+logger.add(
+    'logs/crawl-{time:YYYY-MM-DD}.log',
+    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
+    level='INFO',
+    rotation='00:00',
+    retention='1 week',
+    encoding='utf-8',
+    # filter=lambda x: '采集' in x['message']
+)
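
Callers only need the shared instance; messages go both to stderr (loguru's default sink) and to the daily-rotated file configured above:

    from utils.log import logger

    logger.info('zbytb list crawl started')  # placeholder message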

+ 153 - 0
zbytb/utils/socks5.py

@@ -0,0 +1,153 @@
+import threading
+import time
+from collections import deque
+from urllib.parse import urlparse
+
+import requests
+
+from config.load import jy_proxy, headers
+from utils.log import logger
+
+__all__ = ['Proxy']
+
+
+def decrypt(input_str: str) -> str:
+    """
+    Base64-style decode using the custom alphabet in jy_proxy['socks5']['decrypt'].
+
+    :param input_str: encoded host string returned by the proxy service
+    :return: decoded plain-text host
+    """
+    # map every non-'=' character to its index in the key and render it as a 6-bit binary string
+    key = jy_proxy['socks5']['decrypt']
+    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
+    output_str = ''
+    # number of '=' padding characters
+    equal_num = input_str.count('=')
+    while ascii_list:
+        temp_list = ascii_list[:4]
+        # join into one binary string
+        temp_str = ''.join(temp_list)
+        # drop the bits contributed by the '=' padding when the length is not a multiple of 8
+        if len(temp_str) % 8 != 0:
+            temp_str = temp_str[0:-1 * equal_num * 2]
+        # convert four 6-bit groups into three 8-bit groups
+        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
+        # binary to decimal
+        temp_str_list = [int(x, 2) for x in temp_str_list if x]
+        # append the corresponding characters to the output
+        output_str += ''.join([chr(x) for x in temp_str_list])
+        ascii_list = ascii_list[4:]
+    return output_str
+
+
+class Socks5Proxy:
+
+    __instance = None
+
+    def __new__(cls, *args, **kwargs):
+        if cls.__instance is None:
+            cls.__instance = super().__new__(cls)
+        return cls.__instance
+
+    def __init__(self):
+        self.seconds = 60
+        self._lock = threading.RLock()
+        self._url = jy_proxy['socks5']['url']
+        self._dq = deque([])
+        self._proxies = {}
+        self._pool = []
+        self._counter = {}
+
+    def _init(self):
+        while not self._proxies:
+            if len(self._dq) > 0:
+                # take a proxy from the head of the queue
+                self._proxies = self._dq.popleft()
+                # and put it back at the tail (round-robin rotation)
+                self._dq.append(self._proxies)
+            else:
+                self.__request_service()
+                self.__check_proxies()
+
+    @property
+    def proxies(self):
+        with self._lock:
+            return self._proxies if len(self._proxies) > 0 else None
+
+    def switch(self, reset=False):
+        with self._lock:
+            if reset is True:
+                self.__flush_proxy_pool()
+            elif len(self._counter) > 0:
+                end_time = self._counter[self.get_netloc(self._proxies)]
+                current_time = int(time.time())
+                if end_time - current_time < self.seconds:
+                    logger.info(f"[移除socks5代理]{self.get_netloc(self._proxies)}")
+                    self._dq.remove(self._proxies)
+                    del self._counter[self.get_netloc(self._proxies)]
+                    logger.info(f"[socks5代理]剩余 {len(self._dq)} 个")
+
+            self._proxies = {}  # reset the current proxy before picking the next one
+            while len(self._proxies) == 0:
+                if len(self._dq) > 0:
+                    self._proxies = self._dq.popleft()
+                    self._dq.append(self._proxies)
+                else:
+                    self.__flush_proxy_pool()
+
+    @staticmethod
+    def get_netloc(item: dict):
+        parser = urlparse(item.get('http'))
+        return parser.netloc
+
+    def __request_service(self):
+        try:
+            response = requests.get(self._url, timeout=10)
+            self.__extract_ip(response)
+        except requests.RequestException:
+            pass
+
+    def __extract_ip(self, response):
+        for proxy in response.json():
+            host = decrypt(proxy['host'])
+            port = int(proxy['port'])
+            end_time = proxy['EndTime']
+            items = {
+                'http': 'socks5://{}:{}'.format(host, port),
+                'https': 'socks5://{}:{}'.format(host, port)
+            }
+            self._pool.append(items)
+            self._counter.setdefault(self.get_netloc(items), end_time)
+
+    def __check_proxies(self):
+        check_ip = 'https://myip.ipip.net'
+        logger.info(f"[socks5代理检验]访问地址-{check_ip}")
+        for proxies in self._pool:
+            try:
+                requests_param = {
+                    "headers": headers,
+                    "proxies": proxies,
+                    "timeout": 2
+                }
+                requests.get(check_ip, **requests_param)
+                self._dq.append(proxies)
+            except requests.RequestException:
+                del self._counter[self.get_netloc(proxies)]
+
+    def __flush_proxy_pool(self):
+        logger.info(f"[socks5代理]刷新代理池")
+        self._pool.clear()
+        self._dq.clear()
+        self._counter.clear()
+        self.__request_service()
+        self.__check_proxies()
+
+    def __call__(self, enable_proxy: bool = False, *args, **kwargs):
+        if enable_proxy:
+            logger.info("[加载socks5代理]")
+            self._init()
+        return self
+
+
+Proxy = Socks5Proxy()
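
A minimal sketch of the round-robin proxy usage (the target URL is a placeholder); `Proxy` is a module-level singleton, so every importer shares the same pool, and socks5 proxies additionally require requests to be installed with the `socks` extra (PySocks):

    import requests
    from utils.socks5 import Proxy

    proxy = Proxy(True)          # fetches the pool from the proxy service and health-checks it
    try:
        requests.get('http://example.com', proxies=proxy.proxies, timeout=10)
    except requests.RequestException:
        proxy.switch()           # rotate to the next proxy in the deque
        # proxy.switch(reset=True) discards and rebuilds the whole pool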

+ 10 - 0
zbytb/utils/tools.py

@@ -0,0 +1,10 @@
+import bson
+
+
+def int2long(param: int):
+    """Convert a Python int to a BSON Int64 (NumberLong)."""
+    return bson.int64.Int64(param)
+
+
+def object_id(_id: str):
+    return bson.objectid.ObjectId(_id)
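
Both helpers are thin BSON wrappers; a usage sketch (the hex id is a placeholder):

    import time
    from utils.tools import int2long, object_id

    doc = {'comeintime': int2long(int(time.time()))}    # always stored as NumberLong
    oid = object_id('6181f4d8e3c7a1b2c3d4e5f6')         # str -> bson ObjectId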