
new add project

萤火也是火 3 years ago
parent
commit
317021dae1

+ 0 - 0
zgzb/common/__init__.py


+ 23 - 0
zgzb/common/aliyun.py

@@ -0,0 +1,23 @@
+import oss2
+
+from config.load import oss_conf
+
+
+class AliYunService:
+
+    def __init__(self):
+        self.__acc_key_id = oss_conf['key_id']
+        self.__acc_key_secret = oss_conf['key_secret']
+        self.__endpoint = oss_conf['endpoint']
+        self.__bucket_name = oss_conf['bucket_name']
+
+    def _push_oss_from_local(self, key, filename):
+        """
+        Upload a local file to OSS as a regular object.
+
+        :param str key: object name to store in OSS
+        :param str filename: local file path; must be readable
+        """
+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
+        bucket.put_object_from_file(key, filename)

+ 224 - 0
zgzb/common/attachment.py

@@ -0,0 +1,224 @@
+import hashlib
+import os
+import re
+import traceback
+import uuid
+from urllib.parse import urlparse, unquote
+
+import requests
+import urllib3
+
+from common.aliyun import AliYunService
+from common.execptions import AttachmentNullError
+from common.log import logger
+from common.socks5 import Proxy
+from config.load import headers
+
+urllib3.disable_warnings()
+# document file types
+DOCTYPE = {
+    'txt', 'rtf', 'dps', 'et', 'ett', 'xls',
+    'xlsx', 'xlsb', 'xlsm', 'xlt', 'ods', 'pmd', 'pmdx',
+    'doc', 'docm', 'docx', 'dot', 'dotm', 'dotx',
+    'odt', 'wps', 'csv', 'xml', 'xps'
+}
+# archive file types
+COMPRESSION_TYPE = {
+    'rar', 'zip', 'gzzb', '7z', 'tar', 'gz', 'bz2', 'jar', 'iso', 'cab',
+    'arj', 'lzh', 'ace', 'uue', 'edxz',
+}
+# image file types
+IMAGE_TYPE = {
+    'jpg', 'png', 'jpeg', 'tiff', 'gif', 'psd', 'raw', 'eps', 'svg', 'bmp',
+    'pdf'
+}
+# other file types
+OTHER_TYPE = {
+    'swf', 'nxzf', 'xezf', 'nxcf'
+}
+
+
+def sha1(val):
+    _sha1 = hashlib.sha1()
+    if isinstance(val, bytes):
+        _sha1.update(val)  # hash the raw bytes directly
+    elif isinstance(val, str):
+        _sha1.update(val.encode("utf-8"))
+    return _sha1.hexdigest()
+
+
+def remove(file_path: str):
+    os.remove(file_path)
+
+
+def getsize(file):
+    try:
+        return os.path.getsize(file)
+    except FileNotFoundError:
+        return 0
+
+
+def discern_file_format(text):
+    file_types = {
+        *DOCTYPE,
+        *COMPRESSION_TYPE,
+        *IMAGE_TYPE,
+        *OTHER_TYPE
+    }
+    for file_type in file_types:
+        all_file_format = [file_type, file_type.upper()]
+        for t in all_file_format:
+            result = re.match(f'.*{t}$', text, re.S)
+            if result is not None:
+                return t
+    else:
+        unknown_type = re.findall('[^.\\/:*?"<>|\r\n]+$', text, re.S)
+        logger.warning(f'[附件类型识别]未定义的文件类型{unknown_type}')
+        return None
+
+
+def extract_file_type(text):
+    if text is None:
+        return None
+    return discern_file_format(text)
+
+
+def extract_file_name_by_href(href: str, file_type: str):
+    """从url中抽取文件名称"""
+    # 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
+    # 中文字符:[\u4e00 -\u9fa5]
+    zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
+    parser = urlparse(href)
+    query = (parser.query or parser.path)
+    result = re.search(f'.*\\.{file_type}', query, re.S)
+    if result is not None:
+        encode_str = unquote(result.group())
+        name = re.search(zh_char_pattern, encode_str)
+        if name is not None:
+            return unquote(name.group())
+    return None
+
+
+def extract_file_name(text):
+    file_type = discern_file_format(text)
+    if file_type is not None:
+        repl = '.{}'.format(file_type)
+        text = text.replace(repl, '')
+    return text
+
+
+def verify_file_name(name):
+    if extract_file_type(name) is None:
+        raise ValueError
+
+
+class AttachmentDownloader(AliYunService):
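+    # downloads an attachment to a local temp file, uploads it to OSS via the inherited helper, then removes the temp file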
+
+    def __init__(self):
+        super(AttachmentDownloader, self).__init__()
+        self.dir_name = 'file'
+
+    def _create_file(self, filename, filetype):
+        os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
+        file = "{filename}.{filetype}".format(
+            filename=sha1("{}_{}".format(filename, uuid.uuid4())),
+            filetype=filetype
+        )
+        return "{}/{}".format(self.dir_name, file)
+
+    @staticmethod
+    def _create_fid(file_stream: bytes):
+        return sha1(file_stream)
+
+    @staticmethod
+    def _origin_filename(fid: str, filetype: str):
+        return "{}.{}".format(fid, filetype)
+
+    @staticmethod
+    def _file_size(file: str):
+        _kb = float(getsize(file)) / 1024
+        if _kb >= 1024:
+            _M = _kb / 1024
+            if _M >= 1024:
+                _G = _M / 1024
+                return "{:.1f} G".format(_G)
+            else:
+                return "{:.1f} M".format(_M)
+        else:
+            return "{:.1f} kb".format(_kb)
+
+    @staticmethod
+    def _download(
+            url: str,
+            file: str,
+            enable_proxy=False,
+            allow_show_exception=False,
+            **kwargs
+    ):
+        request_params = {}
+        request_params.setdefault('headers', kwargs.get('headers') or headers)
+        request_params.setdefault('proxies', kwargs.get('proxies'))
+        request_params.setdefault('timeout', kwargs.get('timeout') or 60)
+        request_params.setdefault('stream', kwargs.get('stream') or True)
+        request_params.setdefault('verify', kwargs.get('verify') or False)
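+        # up to 3 attempts; on a request error the socks5 proxy is rotated (when enabled) before the next try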
+        proxy = Proxy(enable_proxy)
+        retries = 0
+        while retries < 3:
+            try:
+                with requests.get(url, **request_params) as req:
+                    if req.status_code == 200:
+                        stream = req.content
+                        with open(file, 'wb') as f:
+                            f.write(stream)
+                        return stream
+                    else:
+                        retries += 1
+            except requests.RequestException:
+                if allow_show_exception:
+                    traceback.print_exc()
+                if enable_proxy:
+                    proxy.switch()
+                    request_params.update({'proxies': proxy.proxies})
+                retries += 1
+        return b''
+
+    def download(
+            self,
+            file_name: str,
+            file_type: str,
+            download_url: str,
+            enable_proxy=False,
+            allow_request_exception=False,
+            **kwargs
+    ):
+        if not file_name or not file_type or not download_url:
+            raise AttachmentNullError
+
+        local_tmp_file = self._create_file(file_name, file_type)
+        file_stream = self._download(
+            download_url,
+            local_tmp_file,
+            enable_proxy,
+            allow_request_exception,
+            **kwargs
+        )
+        result = {
+            'filename': '{}.{}'.format(file_name, file_type),
+            'org_url': download_url
+        }
+        if len(file_stream) > 0:
+            try:
+                fid = self._create_fid(file_stream)
+                key = self._origin_filename(fid, file_type)
+                result.setdefault('fid', key)
+                result.setdefault('ftype', file_type)
+                result.setdefault('size', self._file_size(local_tmp_file))
+                result.setdefault('url', 'oss')
+                super()._push_oss_from_local(key, local_tmp_file)
+            except Exception as e:
+                logger.warning(
+                    "[{}]下载异常,原因:{}".format(file_name, e.__class__.__name__)
+                )
+        remove(local_tmp_file)
+        # the attachment info must be returned whether the download/upload succeeded or failed
+        return result

+ 133 - 0
zgzb/common/clean_html.py

@@ -0,0 +1,133 @@
+import re
+
+__all__ = ['cleaner']
+
+# standalone elements
+INDEPENDENT_TAGS = {
+    '<head>[\s\S]*?</head>': '',
+    '<html>|<html [^>]*>|</html>': '',
+    '<body>|<body [^>]*>|</body>': '',
+    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
+    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
+    '\\xa0|\\u3000': '',  # non-breaking / full-width spaces
+    '<!--[\s\S]*?-->': '',  # comments
+    '<style[^<>]*>[\s\S]*?</style>': '',  # styles
+    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+    '<input>': '',  # input fields
+    '<img[^>]*>': '<br>',  # images
+}
+# inline elements
+INLINE_TAGS = {
+    '<a>|<a [^>]*>|</a>': '',  # hyperlinks
+    '<span>|<span [^>]*>|</span>': '',  # span
+    '<label>|<label [^>]*>|</label>': '<br>',  # label
+    '<font>|<font [^>]*>|</font>': '',  # font
+}
+# block-level elements
+BLOCK_TAGS = {
+    '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # headings
+    # '<h[1-6][^>]*>|</h[1-6]>': '',  # headings
+    '<p>|<p [^>]*>|</p>': '<br>',  # paragraphs
+    '<div>|<div [^>]*>|</div>': '<br>',  # division
+    '<o:p>|<o:p [^>]*>|</o:p>': ''  # MS Office Word paragraphs
+}
+# other
+OTHER = {
+    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
+    '<epointform>': '',
+    '<!doctype html>|<!doctype html [^>]*>': '',
+    '【关闭】|关闭': '',
+    '【打印】|打印本页': '',
+    '【字体:[\s\S]*】': '',
+    '文章来源:[\u4e00-\u9fa5]+': '',
+    '浏览次数:.*[<]+': '',
+    '(责任编辑:.*?)': '',
+    '分享到[:]': '',
+    '阅读数[::]\d+': '',
+}
+# styles
+CSS_STYLE = {
+    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
+    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
+    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
+    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
+    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
+    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
+}
+# whitespace
+BLANKS = {
+    '\n\s*\n': '\n',
+    '\s*\n\s*': '\n',
+    '[^\S\n]': ' ',
+    '\s+': ' ',
+}
+# css tag set
+TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
+# css attribute set
+ATTRS = {'id', 'class', 'style', 'width'}
+
+
+def _repair_tag():
+    """异常的标签组合,用来替换非标准页面的标签"""
+    _repairs = {}
+    for tag in TAGS:
+        for attr in ATTRS:
+            key = '{}{}'.format(tag, attr)
+            val = '{} {}'.format(tag, attr)
+            _repairs[key] = val
+    return _repairs
+
+
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    return html
+
+
+def _lowercase_tag(html):
+    """标签归一化处理(全部小写)"""
+    tags = re.findall("<[^>]+>", html)
+    for tag in tags:
+        html = html.replace(tag, str(tag).lower())
+
+    repair_tags = _repair_tag()
+    for err, right in repair_tags.items():
+        html = html.replace(err, right)
+
+    return html
+
+
+def cleaner(html, special=None, completely=False):
+    """
+    Clean page data.
+
+    :param html: page to clean
+    :param special: extra page-specific cleaning rules
+    :param completely: whether to clean the page completely
+    :return: cleaned page source
+    """
+    if special is None:
+        special = {}
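+    # note: caller-supplied rules are merged into the module-level OTHER table and persist across calls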
+    OTHER.update(special)
+    remove_tags = {
+        **INDEPENDENT_TAGS,
+        **INLINE_TAGS,
+        **BLOCK_TAGS,
+        **OTHER,
+        **CSS_STYLE,
+        **BLANKS,
+    }
+    html = _lowercase_tag(html)
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
+
+    if completely:
+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # inline frames
+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
+
+    html = _escape_character(html)
+    return html

+ 112 - 0
zgzb/common/databases.py

@@ -0,0 +1,112 @@
+import bson
+import pymongo
+import redis
+import requests
+from elasticsearch import Elasticsearch
+
+from config.load import mongo_conf, redis_conf, es_conf, analyze_url
+
+
+# ---------------------------------- mongo ----------------------------------
+def mongo_client(cfg=None, host=None, port=None):
+    if cfg is None:
+        if host is not None and port is not None:
+            cfg = {'host': host, 'port': port}
+        else:
+            cfg = mongo_conf
+    return pymongo.MongoClient(host=cfg['host'], port=cfg['port'])
+
+
+def mongo_database(db: str, **kw):
+    client = mongo_client(**kw)
+    return client[db]
+
+
+def mongo_table(db: str, coll: str, **kw):
+    client = mongo_client(**kw)
+    return client[db][coll]
+
+
+def int2long(param: int):
+    """int 转换成 long """
+    return bson.int64.Int64(param)
+
+
+def object_id(_id: str):
+    return bson.objectid.ObjectId(_id)
+
+
+# ---------------------------------- es ----------------------------------
+def es_client(cfg=None):
+    if cfg is None:
+        cfg = es_conf
+    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}])
+
+
+def es_participles_service(text: str):
+    """
+    Get the token list for a text.
+
+    :param text: text to tokenize
+    :return: token list
+    """
+    result = []
+    params = {"text": text, "analyzer": "ik_smart"}
+    res = requests.get(analyze_url, params=params, timeout=60)
+    if res.status_code == 200:
+        tokens = res.json().get('tokens', [])
+        for x in tokens:
+            if x["token"].encode('utf-8').isalpha():
+                continue
+            result.append(x["token"])
+    return result
+
+
+def es_query(title: str, publish_time: int):
+    """
+    Query es for documents with a similar title.
+
+    :param title: title
+    :param publish_time: publish time (unix timestamp, seconds)
+    :return: number of hits
+    """
+    client = es_client()
+    stime = publish_time - 432000  # 5 days earlier
+    etime = publish_time + 432000
+    conditions = []
+    participles = es_participles_service(title)
+    for word in participles:
+        conditions.append({
+            "multi_match": {
+                "query": word,
+                "type": "phrase",
+                "fields": ["title"]
+            }
+        })
+    conditions.append({
+        "range": {"publishtime": {"from": stime, "to": etime}}
+    })
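+    # require every participle to match the title and the publish time to fall within the +/- 5 day window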
+    query = {
+        "query": {
+            "bool": {
+                "must": conditions,
+                "minimum_should_match": 1
+            }
+        }
+    }
+    result = client.search(index='bidding', body=query, request_timeout=100)
+    count = len(result['hits']['hits'])
+    return count
+
+
+# ---------------------------------- redis ----------------------------------
+def redis_client(cfg=None):
+    if cfg is None:
+        cfg = redis_conf
+    pool = redis.ConnectionPool(
+        host=cfg['host'],
+        port=cfg['port'],
+        password=cfg['pwd'],
+        db=cfg['db'],
+        decode_responses=True  # must be set on the pool; the client-level flag is ignored when a pool is supplied
+    )
+    return redis.Redis(connection_pool=pool)

+ 35 - 0
zgzb/common/execptions.py

@@ -0,0 +1,35 @@
+
+class JyBasicException(Exception):
+
+    def __init__(self, code: int, reason: str, **kwargs):
+        self.code = code
+        self.reason = reason
+        self.err_details = kwargs
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+
+
+class CustomCheckError(JyBasicException):
+
+    def __init__(self, code: int = 10002, reason: str = '特征条件检查异常', **kwargs):
+        super(CustomCheckError, self).__init__(code, reason, **kwargs)
+
+
+class AttachmentNullError(JyBasicException):
+
+    def __init__(self, code: int = 10004, reason: str = '附件下载异常', **kwargs):
+        super(AttachmentNullError, self).__init__(code, reason, **kwargs)
+
+
+class CustomAccountPrivilegeError(JyBasicException):
+
+    def __init__(self, *args, **kwargs):
+        pass

+ 25 - 0
zgzb/common/log.py

@@ -0,0 +1,25 @@
+from pathlib import Path
+
+from loguru import logger
+
+_absolute = Path(__file__).absolute().parent.parent
+_log_path = (_absolute / 'logs/crawl-{time:YYYY-MM-DD}.log').resolve()
+logger.add(
+    _log_path,
+    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
+    level='INFO',
+    rotation='00:00',
+    retention='1 week',
+    encoding='utf-8',
+)
+
+_log_path = (_absolute / 'logs/err-text-{time:YYYY-MM-DD}.log').resolve()
+logger.add(
+    _log_path,
+    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
+    level='ERROR',
+    rotation='00:00',
+    retention='1 week',
+    filter=lambda x: '[文本异常]' in x['message'],
+    encoding='utf-8',
+)

+ 41 - 0
zgzb/common/socks5.py

@@ -0,0 +1,41 @@
+import threading
+
+import requests
+
+from common.log import logger
+from config.load import jy_proxy
+
+__all__ = ['Proxy']
+
+
+class Socks5Proxy:
+
+    def __init__(self):
+        self._lock = threading.RLock()
+        self._enable_proxy = False
+        self._url = jy_proxy['socks5']['url']
+        self._auth = jy_proxy['socks5']['auth']
+        self._proxies = None
+
+    @property
+    def proxies(self):
+        return self._proxies
+
+    def switch(self):
+        with self._lock:
+            if self._enable_proxy:
+                self._proxies = self._fetch_proxies()
+
+    def _fetch_proxies(self):
+        proxy = requests.get(self._url, headers=self._auth).json()
+        return proxy.get("data")
+
+    def __call__(self, enable_proxy: bool = False, *args, **kwargs):
+        self._enable_proxy = enable_proxy
+        if self._enable_proxy:
+            logger.info("[加载socks5代理]")
+            self._proxies = self._fetch_proxies()
+        return self
+
+
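+# module-level singleton; calling Proxy(enable_proxy=True) fetches a socks5 proxy and returns the shared instance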
+Proxy = Socks5Proxy()

+ 97 - 0
zgzb/common/tools.py

@@ -0,0 +1,97 @@
+import datetime
+import hashlib
+import re
+import time
+from collections import namedtuple
+
+from lxml.html import HtmlElement, fromstring, tostring
+
+SearchText = namedtuple('SearchText', ['total'])
+
+
+def element2html(element: HtmlElement) -> str:
+    return tostring(element, encoding="utf-8").decode()
+
+
+def html2element(html: str) -> HtmlElement:
+    return fromstring(html)
+
+
+def valid_element(node: HtmlElement, feature: str):
+    if len(node.xpath(feature)) > 0:
+        return True
+    else:
+        return False
+
+
+def text_search(text: str) -> SearchText:
+    """
+    Count Chinese characters in a text.
+
+    :param text: text
+    :return: number of Chinese characters
+    """
+    if not text:
+        return SearchText(0)
+
+    results = re.findall('[\u4e00-\u9fa5]', text, re.S)
+    # the list length is the number of Chinese characters
+    return SearchText(len(results))
+
+
+def verify_text(val: str):
+    """Check the amount of digits, letters and Chinese characters in a text."""
+    if val is None:
+        return False
+    sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
+    for pattern in sub_pattern:
+        val = re.sub(pattern, '', val)
+    # fewer than 50 characters left means the page has no real detail content
+    if len(val) < 50:
+        # invalid text
+        return False
+    # valid text
+    return True
+
+
+def sha1(text: str):
+    """
+    Hex digest of a string.
+
+    @param text: string text
+    @return: digest value
+    """
+    _sha1 = hashlib.sha1()
+    _sha1.update(text.encode("utf-8"))
+    return _sha1.hexdigest()
+
+
+def get_ms() -> int:
+    return int(round(time.time() * 1000))
+
+
+def get_current_date():
+    return datetime.datetime.now().strftime("%Y-%m-%d")
+
+
+def ms_to_date(ms: int):
+    timestamp = float(ms / 1000)
+    time_array = time.localtime(timestamp)
+    return time.strftime("%Y-%m-%d %H:%M:%S", time_array)
+
+
+def str2ts(ts_str):
+    """字符串转时间戳"""
+    return int(float(ts_str) / 1000)
+
+
+def ts2date(ts_str) -> str:
+    """
+    Convert a timestamp to a date string.
+
+    :param ts_str: millisecond timestamp
+    :return: date string
+    """
+    timestamp = int(float(ts_str) / 1000)
+    time_array = time.localtime(timestamp)
+    return time.strftime("%Y-%m-%d %H:%M:%S", time_array)

+ 418 - 0
zgzb/common/webdriver.py

@@ -0,0 +1,418 @@
+import json
+from collections import namedtuple
+from typing import Optional
+
+from selenium import webdriver
+from selenium.common.exceptions import WebDriverException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+from common.log import logger
+
+DEFAULT_USERAGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0"
+Netloc = namedtuple('Netloc', ['host', 'port'])
+
+
+def until_wait(
+        driver,
+        *,
+        xpath=None,
+        classname=None,
+        text=None,
+        timeout=None
+):
+    """
+    Explicit wait for the page to load; raises TimeoutException otherwise.
+
+    :param driver: browser driver
+    :param xpath: xpath rule, page wait feature
+    :param classname: class attribute name, page wait feature
+    :param text: expected text
+    :param timeout: timeout in seconds
+    :return:
+    """
+    _timeout = (timeout or 60)
+    wait = WebDriverWait(driver, _timeout, 0.2)
+    if xpath is not None:
+        locator = (By.XPATH, xpath)
+        if text is not None:
+            wait.until(EC.text_to_be_present_in_element(locator, text))
+        else:
+            wait.until(EC.presence_of_element_located(locator))
+
+    elif classname is not None:
+        locator = (By.CLASS_NAME, classname)
+        if text is not None:
+            wait.until(EC.text_to_be_present_in_element(locator, text))
+        else:
+            wait.until(EC.presence_of_element_located(locator))
+
+
+def check_navigator(driver):
+    script = "return window.navigator.webdriver"
+    return driver.execute_script(script)
+
+
+def netloc(proxies: dict) -> Netloc:
+    host, port = proxies["https"].replace("socks5://", "").split(":")
+    return Netloc(host, port)
+
+
+class XhrRequest:
+    def __init__(self, url, data, headers):
+        self.url = url
+        self.data = data
+        self.headers = headers
+
+
+class XhrResponse:
+    def __init__(self, request: XhrRequest, url, headers, content, status_code):
+        self.request = request
+        self.url = url
+        self.headers = headers
+        self.content = content
+        self.status_code = status_code
+
+
+class FireFoxWebDriverError(WebDriverException):
+    pass
+
+
+class FireFoxWebDriver:
+
+    def __init__(
+            self,
+            user_agent=None,
+            proxy=None,
+            headless=True,
+            timeout=60,
+            load_images=False,
+            executable_path=None,
+            window_size: tuple = None,
+            xhr_url_regexes: list = None,
+    ):
+        """
+        Firefox only.
+        Args:
+            user_agent: string, or zero-argument callable returning the user agent
+            proxy: {'https': 'socks5://xxx.xxx.xxx.xxx:xxxx'}, or zero-argument callable returning the proxy dict
+            headless: whether to run headless (default: headless)
+            timeout: request timeout
+            load_images: whether to load images
+            executable_path: driver path, defaults to the default path
+            window_size: window size
+            xhr_url_regexes: xhr urls to intercept, regex supported, list type
+        """
+        self._user_agent = user_agent or DEFAULT_USERAGENT
+        self._proxy = proxy
+        self._load_images = load_images
+        self._headless = headless
+        self._timeout = timeout
+        self._xhr_url_regexes = xhr_url_regexes
+        self._window_size = window_size
+        self._executable_path = executable_path
+
+        firefox_profile = webdriver.FirefoxProfile()
+        firefox_options = webdriver.FirefoxOptions()
+        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
+        if self._proxy:
+            proxy = self._proxy() if callable(self._proxy) else self._proxy
+            host, port = netloc(proxy)
+            # 0 = no proxy, 1 = manual proxy configuration
+            firefox_profile.set_preference('network.proxy.type', 1)
+            firefox_profile.set_preference('network.proxy.socks', host)
+            # the port must be an int, otherwise the preference does not take effect
+            firefox_profile.set_preference('network.proxy.socks_port', int(port))
+            firefox_profile.update_preferences()
+
+        if self._user_agent:
+            firefox_profile.set_preference(
+                "general.useragent.override",
+                self._user_agent() if callable(self._user_agent) else self._user_agent,
+            )
+            firefox_profile.update_preferences()
+
+        if not self._load_images:
+            # permissions.default.image:
+            #   1 = allow all images regardless of origin (default)
+            #   2 = block all images
+            #   3 = block third-party images
+            firefox_profile.set_preference("permissions.default.image", 2)
+            firefox_profile.update_preferences()
+
+        if self._headless:
+            firefox_options.add_argument("--headless")
+            firefox_options.add_argument("--disable-gpu")
+
+        if self._executable_path:
+            _driver = webdriver.Firefox(
+                capabilities=firefox_capabilities,
+                options=firefox_options,
+                firefox_profile=firefox_profile,
+                executable_path=self._executable_path,
+            )
+        else:
+            _driver = webdriver.Firefox(
+                capabilities=firefox_capabilities,
+                options=firefox_options,
+                firefox_profile=firefox_profile,
+            )
+
+        if self._window_size:
+            _driver.set_window_size(*self._window_size)
+
+        self.driver = _driver
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_val:
+            logger.error(f'{self.__class__.__name__} >>> {exc_type} <> {exc_val}')
+
+        self.driver.quit()
+        return True
+
+    def set_page_load_timeout(self, timeout=None):
+        """
+        Set the selenium page load and script execution timeout.
+        :param timeout: timeout in seconds, default: 60s
+        :return:
+        """
+        _timeout = (timeout or self._timeout)
+        # driver.get(url) can hang without returning or raising; a page load timeout keeps the program from getting stuck
+        self.driver.set_page_load_timeout(_timeout)
+        # script timeout
+        self.driver.set_script_timeout(_timeout)
+
+    def quit(self):
+        self.driver.quit()
+
+    def xhr_response(self, xhr_url_regex) -> Optional[XhrResponse]:
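+        # reads window.__ajaxData[<regex>]; nothing in this commit populates it, so an injected xhr hook is presumably required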
+        data = self.driver.execute_script(
+            f'return window.__ajaxData["{xhr_url_regex}"];'
+        )
+        if not data:
+            return None
+
+        request = XhrRequest(**data["request"])
+        response = XhrResponse(request, **data["response"])
+        return response
+
+    def xhr_text(self, xhr_url_regex) -> Optional[str]:
+        response = self.xhr_response(xhr_url_regex)
+        if not response:
+            return None
+        return response.content
+
+    def xhr_json(self, xhr_url_regex) -> Optional[dict]:
+        text = self.xhr_text(xhr_url_regex)
+        return json.loads(text)
+
+    def get(self, url):
+        self.driver.get(url)
+
+    @property
+    def user_agent(self):
+        return self.driver.execute_script("return navigator.userAgent;")
+
+    @property
+    def page_title(self):
+        return self.driver.execute_script('return document.title')
+
+    @property
+    def page_source(self):
+        return self.driver.page_source
+
+    def find_element_by_xpath(self, xpath: str):
+        """
+        Find an element by xpath; raises NoSuchElementException if the element does not exist.
+        :param xpath: xpath of the element to find
+        :return:
+        """
+        return self.driver.find_element_by_xpath(xpath)
+
+    def until_wait(
+            self,
+            *,
+            xpath=None,
+            classname=None,
+            text=None,
+            timeout=None
+    ):
+        """
+        Explicit wait for the page to load; raises TimeoutException otherwise.
+
+        :param xpath: xpath rule, page wait feature
+        :param classname: class attribute name, page wait feature
+        :param text: expected text
+        :param timeout: timeout in seconds
+        :return:
+        """
+        _timeout = (timeout or self._timeout)
+        wait = WebDriverWait(self.driver, _timeout, 0.2)
+        if xpath is not None:
+            locator = (By.XPATH, xpath)
+            if text is not None:
+                wait.until(EC.text_to_be_present_in_element(locator, text))
+            else:
+                wait.until(EC.presence_of_element_located(locator))
+
+        elif classname is not None:
+            locator = (By.CLASS_NAME, classname)
+            if text is not None:
+                wait.until(EC.text_to_be_present_in_element(locator, text))
+            else:
+                wait.until(EC.presence_of_element_located(locator))
+
+    def switch_to_window(self):
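+        # open a blank tab, close the current window, then continue in the most recently opened handle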
+        self.driver.execute_script('window.open();')
+        handles = self.driver.window_handles
+        self.driver.close()
+        self.driver.switch_to.window(handles[-1])
+
+
+class WebDriver(RemoteWebDriver):
+
+    FIREFOX = "FIREFOX"
+
+    def __init__(
+        self,
+        load_images=True,
+        user_agent=None,
+        proxy=None,
+        headless=True,
+        driver_type=FIREFOX,
+        timeout=120,
+        window_size=(1024, 800),
+        executable_path=None,
+        custom_argument=None,
+        **kwargs
+    ):
+        """
+        Args:
+            load_images: whether to load images
+            user_agent: string, or zero-argument callable returning the user agent
+            proxy: {'https': 'socks5://xxx.xxx.xxx.xxx:xxxx'}, or zero-argument callable returning the proxy dict
+            headless: whether to run headless
+            driver_type: FIREFOX
+            timeout: request timeout
+            window_size: window size
+            executable_path: driver path, defaults to the default path
+            **kwargs:
+        """
+        self._load_images = load_images
+        self._user_agent = user_agent or DEFAULT_USERAGENT
+        self._proxy = proxy
+        self._headless = headless
+        self._timeout = timeout
+        self._window_size = window_size
+        self._executable_path = executable_path
+        self._custom_argument = custom_argument
+
+        self.proxies = {}
+        self.user_agent = None
+
+        if driver_type == WebDriver.FIREFOX:
+            self.driver = self.firefox_driver()
+
+        self.driver.set_page_load_timeout(self._timeout)
+        self.driver.set_script_timeout(self._timeout)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_val:
+            logger.error(f'{self.__class__.__name__} >>> {exc_type} <> {exc_val}')
+
+        self.quit()
+        return True
+
+    def firefox_driver(self):
+        firefox_profile = webdriver.FirefoxProfile()
+        firefox_options = webdriver.FirefoxOptions()
+        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
+        firefox_profile.set_preference("dom.webdriver.enabled", False)
+        firefox_profile.set_preference('useAutomationExtension', False)
+        # firefox_profile.set_preference('privacy.resistFingerprinting', True)  # enable fingerprint protection
+        if self._proxy:
+            proxy = self._proxy() if callable(self._proxy) else self._proxy
+            host, port = netloc(proxy)
+            # socks5 proxy; 0 = no proxy, 1 = manual proxy configuration
+            firefox_profile.set_preference('network.proxy.type', 1)
+            firefox_profile.set_preference('network.proxy.socks', host)
+            firefox_profile.set_preference('network.proxy.socks_port', int(port))
+
+        if self._user_agent:
+            firefox_profile.set_preference(
+                "general.useragent.override",
+                self._user_agent() if callable(self._user_agent) else self._user_agent,
+            )
+
+        if not self._load_images:
+            # permissions.default.image:
+            #   1 = allow all images regardless of origin (default)
+            #   2 = block all images
+            #   3 = block third-party images
+            firefox_profile.set_preference("permissions.default.image", 2)
+
+        firefox_profile.update_preferences()
+
+        if self._headless:
+            firefox_options.add_argument("--headless")
+            firefox_options.add_argument("--disable-gpu")
+
+        # add custom configuration arguments
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                firefox_options.add_argument(arg)
+
+        if self._executable_path:
+            driver = webdriver.Firefox(
+                capabilities=firefox_capabilities,
+                options=firefox_options,
+                firefox_profile=firefox_profile,
+                executable_path=self._executable_path,
+            )
+        else:
+            driver = webdriver.Firefox(
+                capabilities=firefox_capabilities,
+                options=firefox_options,
+                firefox_profile=firefox_profile,
+            )
+
+        if self._window_size:
+            driver.set_window_size(*self._window_size)
+
+        return driver
+
+    @property
+    def cookies(self):
+        cookies_json = {}
+        for cookie in self.driver.get_cookies():
+            cookies_json[cookie["name"]] = cookie["value"]
+        return cookies_json
+
+    @cookies.setter
+    def cookies(self, val: dict):
+        """
+        Set cookies
+        Args:
+            val: {"key":"value", "key2":"value2"}
+
+        Returns:
+
+        """
+        for key, value in val.items():
+            self.driver.add_cookie({"name": key, "value": value})
+
+    def __getattr__(self, name):
+        if self.driver:
+            return getattr(self.driver, name)
+        else:
+            raise AttributeError

+ 0 - 0
zgzb/config/__init__.py


+ 40 - 0
zgzb/config/conf.yaml

@@ -0,0 +1,40 @@
+# mongo
+mongo:
+  host: 172.17.4.87
+  port: !!int 27080
+#  host: 127.0.0.1
+#  port: !!int 27017
+
+
+# redis
+redis:
+  host: 127.0.0.1
+  port: !!int 6379
+  pwd: ""
+  db: !!int 10
+
+
+# Aliyun oss
+ali_oss:
+  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
+  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
+#  endpoint: oss-cn-beijing.aliyuncs.com    # public network endpoint
+  endpoint: oss-cn-beijing-internal.aliyuncs.com    # internal network endpoint
+  bucket_name: jy-datafile
+
+
+# es
+es:
+  host: 172.17.145.170
+#  host: 192.168.3.206
+#  host: 127.0.0.1
+  port: !!int 9800
+  db: biddingall # es index alias
+
+
+# proxy
+proxy:
+  socks5:
+    url: http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch
+    auth:
+      Authorization: Basic amlhbnl1MDAxOjEyM3F3ZSFB

+ 2 - 0
zgzb/config/constants.yaml

@@ -0,0 +1,2 @@
+headers:
+  User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36

+ 34 - 0
zgzb/config/load.py

@@ -0,0 +1,34 @@
+from pathlib import Path
+
+import yaml
+
+__all__ = [
+    'mongo_conf',
+    'redis_conf',
+    'oss_conf',
+    'es_conf',
+    'jy_proxy',
+    'node_module_path',
+    'headers',
+    'analyze_url'
+]
+
+_base_path = Path(__file__).parent
+_yaml_conf = (_base_path / 'conf.yaml').resolve()
+_yaml_constants = (_base_path / 'constants.yaml').resolve()
+_node_modules = (_base_path.parent / 'node_modules').resolve()
+
+with open(_yaml_conf, encoding="utf-8") as f:
+    _conf = yaml.safe_load(f)
+    mongo_conf = _conf['mongo']
+    redis_conf = _conf['redis']
+    oss_conf: dict = _conf['ali_oss']
+    es_conf: dict = _conf['es']
+    jy_proxy: dict = _conf['proxy']
+    node_module_path = _node_modules
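+    # ES _analyze endpoint; common.databases.es_participles_service calls it with the ik_smart analyzer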
+    analyze_url = f'http://{es_conf["host"]}:{es_conf["port"]}/{es_conf["db"]}/_analyze'
+
+
+with open(_yaml_constants, encoding="utf-8") as fp:
+    _constants = yaml.safe_load(fp)
+    headers: dict = _constants['headers']

+ 0 - 0
zgzb/crawler/__init__.py


+ 434 - 0
zgzb/crawler/crawl_spider.py

@@ -0,0 +1,434 @@
+import time
+from collections import namedtuple
+
+from selenium.webdriver import ActionChains
+from selenium.webdriver.common.by import By
+
+from common.clean_html import cleaner
+from common.databases import mongo_table, int2long, redis_client
+from common.log import logger
+from common.socks5 import Proxy
+from common.tools import html2element, element2html, verify_text, sha1
+from common.webdriver import WebDriver, until_wait
+
+crawl_tab = mongo_table(db='py_spider', coll='zgzb_wagf_list')
+save_tab = mongo_table(db='py_spider', coll='data_bak')
+redis_key = 'zgzb_wagf_2022'
+r = redis_client()
+# crawl records
+CRAWL_RECORDS = {}
+# category tab ids
+CATEGORY_MAPS = {
+    '招标项目': 'tenderProjectTab',
+    '招标公告': 'tenderBulletin',
+    '开标记录': 'openBidRecord',
+    '评标公示': 'bidEvaluation',
+    '中标公告': 'winBidBulletin',
+    # '签约履行': '',
+}
+# setup-time filters
+SETUP_MAPS = {
+    '今天': 'jt',
+    '2天内': '2tq',
+    '3天内': '3tq',
+    '1周内': '1zn',
+}
+SETUP_TIME = {
+    '招标项目': {
+        'jt': 'tenderProject_begin1',
+        '2tq': 'tenderProject_begin2',
+        '3tq': 'tenderProject_begin3',
+        '1zn': 'tenderProject_begin7'
+    },
+    '招标公告': {
+        'jt': 'tenderBulletin_begin1',
+        '2tq': 'tenderBulletin_begin2',
+        '3tq': 'tenderBulletin_begin3',
+        '1zn': 'tenderBulletin_begin7'
+    },
+    '开标记录': {
+        'jt': 'openBidRecord_1',
+        '2tq': 'openBidRecord_2',
+        '3tq': 'openBidRecord_3',
+        '1zn': 'openBidRecord_7'
+    },
+    '评标公示': {
+        'jt': 'bidEvaluation_1',
+        '2tq': 'bidEvaluation_2',
+        '3tq': 'bidEvaluation_3',
+        '1zn': 'bidEvaluation_7'
+    },
+    '中标公告': {
+        'jt': 'winBidBulletin_1',
+        '2tq': 'winBidBulletin_2',
+        '3tq': 'winBidBulletin_3',
+        '1zn': 'winBidBulletin_7'
+    }
+}
+# crawl menu
+CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode'])
+CRAWL_MENU = {
+    '招标项目': CrawlMenu('未按数据规范-招标项目', 'a_zgzbtbggfwpt_wasjgf_zbxm'),
+    '招标公告': CrawlMenu('未按数据规范-招标公告', 'a_zgzbtbggfwpt_wasjgf_zbgg'),
+    '开标记录': CrawlMenu('未按数据规范-开标记录', 'a_zgzbtbggfwpt_wasjgf_kbjl'),
+    '评标公示': CrawlMenu('未按数据规范-评标公示', 'a_zgzbtbggfwpt_wasjgf_pbgs'),
+    '中标公告': CrawlMenu('未按数据规范-中标公告', 'a_zgzbtbggfwpt_wasjgf_zhbgg'),
+    '签约履行': CrawlMenu('未按数据规范-签约履行', 'a_zgzbtbggfwpt_wasjgf_qylx'),
+}
+
+
+def robots_alert(driver):
+    """机器人警告"""
+    wait = 0
+    while wait < 20:
+        # wait for the verification widget to load
+        element = html2element(driver.page_source)
+        robot_alert = element.xpath('//span[@class="nc-lang-cnt"]/@data-nc-lang')
+        click_alert = element.xpath('//div[@id="text"]/text()')
+        if len(robot_alert) == 1 and "".join(robot_alert).strip() == '_Loading':
+            time.sleep(0.5)
+        elif len(robot_alert) > 1 and robot_alert[0] in ['_yesTEXT']:
+            # robot verification passed
+            return False, '0'
+        elif len(robot_alert) > 1 and robot_alert[0] in ['_startTEXT']:
+            # robot verification widget finished loading
+            return True, '1'
+        elif len(robot_alert) > 1 and robot_alert[0] in ['_errorNetwork']:
+            # "network error, please click refresh or submit feedback (00)"
+            return True, '2'
+        elif len(click_alert) > 0 and "".join(click_alert) == '请点击此处完成验证或咨询客服':
+            return True, '3'
+        else:
+            return False, '0'
+
+        wait += 1
+    return True, '999'
+
+
+def check_robots_alert(driver):
+    """检查并处理机器人警告"""
+    while True:
+        alert, alert_type = robots_alert(driver)
+        if not alert:
+            break
+
+        if alert_type == '1':
+            until_wait(driver, xpath='//span[contains(@class, "nc_iconfont btn_slide")]')
+            element = driver.find_element_by_xpath('//span[contains(@class, "nc_iconfont btn_slide")]')
+            if element.is_displayed():
+                # click and hold, drag 258 px to the right, then release
+                ActionChains(driver).click_and_hold(element).move_by_offset(xoffset=258, yoffset=0).release().perform()
+
+        elif alert_type == '2':
+            until_wait(driver, xpath='//span[contains(@class, "nc-lang-cnt")]/a[1]')
+            element = driver.find_element_by_xpath('//span[contains(@class, "nc-lang-cnt")]/a[1]')
+            if element.is_displayed():
+                goto(driver, element, wait_time=2)
+
+        elif alert_type == '3':
+            # until_wait(driver, xpath='//div[@id="container"]')
+            # element = driver.find_element_by_xpath('//div[@id="container"]')
+            # if element.is_displayed():
+            #     goto(driver, element, wait_time=2)
+            #     driver.switch_to.alert.accept()
+            raise ValueError('浏览器指纹被识别,等待重试')
+
+        else:
+            with open('robot.html', 'w') as wp:
+                wp.write(driver.page_source)
+            raise ValueError('未知异常网页,保存在robot.html')
+        time.sleep(2)
+
+
+def refresh_page(driver):
+    """刷新页面"""
+    element = html2element(driver.page_source)
+    node = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
+    if "".join(node) == "暂无详细数据":
+        driver.refresh()
+        time.sleep(1)
+        # confirm the page alert
+        driver.switch_to.alert.accept()
+    time.sleep(1.5)
+
+
+def goto(driver, web_element, wait_time=None, allow_check_page=False):
+    """执行可点击js事件"""
+    driver.execute_script("arguments[0].click();", web_element)
+    _wait_time = (wait_time or 1)
+    time.sleep(_wait_time)
+    if allow_check_page:
+        check_robots_alert(driver)
+
+
+def extract_text(html: str, feature: str):
+    """抽取文本"""
+    element = html2element(html)
+    return element.xpath(feature)
+
+
+def extract_page_html(html: str, feature: str):
+    """抽取页面源码"""
+    element = html2element(html)
+    try:
+        node = element.xpath(feature)[0]
+        return element2html(node)
+    except IndexError:
+        pass
+
+
+def wait_load_detail(driver, check_feature=None):
+    """等待二次加载页面结果并检测元素变化"""
+    if check_feature is not None:
+        check_count = 0
+        while check_count < 10:
+            element = html2element(driver.page_source)
+            check_node = element.xpath(check_feature)
+            if len(check_node) > 0:
+                break
+            time.sleep(0.5)
+            check_count += 1
+    else:
+        check_count = 0
+        while check_count < 10:
+            element = html2element(driver.page_source)
+            root = element.xpath('//div[@id="xxnrList"]')
+            if len(root) > 0:
+                descendant = element.xpath('//div[@id="xxnrList"]/descendant::*')
+                if len(descendant) > 0:
+                    text = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
+                    children = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/child::*')
+                    if "".join(text) != '暂无详细数据' and len(children) > 0:
+                        break
+            time.sleep(0.5)
+            check_count += 1
+    time.sleep(1)
+
+
+def wait_load_list(driver):
+    while True:
+        element = html2element(driver.page_source)
+        node = element.xpath('//div[@id="myTab_Contenta0"]/div[2]/table//tr')
+        if len(node) > 0:
+            break
+        time.sleep(0.5)
+
+
+def next_page(driver, category):
+    """翻页"""
+    _finished_pages = CRAWL_RECORDS[category]['pages']
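+    # click the first page number not crawled yet; returns None once every visible page is done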
+    web_elements = driver.find_elements(by=By.XPATH, value='//div[@id="Pagination"]/div[1]/child::*')
+    for element in web_elements[1:-1]:
+        val = element.text
+        if val not in _finished_pages:
+            goto(driver, element, wait_time=1.2)
+            return int(val)
+    time.sleep(1)
+
+
+def update_crawl_records(category: str, finished: bool):
+    if category in CRAWL_RECORDS:
+        _category = CRAWL_RECORDS[category]
+        _category['finished'] = finished
+        CRAWL_RECORDS[category] = _category
+
+
+def init_crawl_records(driver, web_element, category: str):
+    """"""
+    if category not in CRAWL_RECORDS:
+        goto(driver, web_element)
+        init_config = {'finished': False, 'pages': ['1']}
+        CRAWL_RECORDS.setdefault(category, init_config)
+        return True
+    else:
+        _category = CRAWL_RECORDS[category]
+        if not _category['finished']:
+            goto(driver, web_element)
+            return True
+        else:
+            return False
+
+
+def select_category(driver, custom_category=None):
+    """采集分类"""
+    web_elements = driver.find_elements(by=By.XPATH, value='//ul[@id="myTab3"]/child::li')
+    for element in web_elements:
+        val = element.text
+        if custom_category is None:
+            success = init_crawl_records(driver, element, val)
+            return val if success else None
+        else:
+            if val == custom_category:
+                success = init_crawl_records(driver, element, custom_category)
+                return val if success else None
+
+
+def select_date(driver, category: str, setup_time: str):
+    """选择建立时间"""
+    logger.info(f"[建立时间]{setup_time}")
+    try:
+        attr = SETUP_TIME[category][SETUP_MAPS[setup_time]]
+        element = driver.find_element(by=By.XPATH, value=f'//a[@id="{attr}"]')
+        goto(driver, element)
+    except KeyError:
+        raise KeyError(f'请设置"SETUP_TIME"中{category}对应的建立时间')
+
+
+def crawl_detail(driver, handler, item):
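+    # iterate over every window except the main list handler, scrape the detail page, then close it and switch back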
+    for current_handler in driver.window_handles:
+        if current_handler == handler:
+            continue
+        driver.switch_to.window(current_handler)
+        # wait for load and check the expected page feature
+        wait_load_detail(driver, check_feature='//div[@id="xxnrList"]/div[1]/div[2]/div[2]')
+        # check for and handle the robot alert
+        check_robots_alert(driver)
+        # secondary load
+        refresh_page(driver)
+        # wait for load
+        wait_load_detail(driver)
+        # extract the page source
+        content_html = extract_page_html(driver.page_source, feature='//div[@id="xxnrList"]')
+        if all([content_html is not None, verify_text(content_html)]):
+            item['contenthtml'] = content_html
+            item['detail'] = cleaner(content_html)
+            item['comeintime'] = int2long(int(time.time()))
+            # save the detail
+            save_tab.insert_one(item)
+        else:
+            logger.error(f'[文本异常]{item["channel"]} - {item["title"]}')
+    # close the current detail window
+    driver.close()
+    # switch back to the main window
+    driver.switch_to.window(handler)
+
+
+def crawl_spider(
+        crawl_max_page=1,
+        enable_proxy=False,
+        max_request_times=15,
+        **kw
+):
+    proxy = Proxy(enable_proxy)
+    crawl_category = kw.get('crawl_category')
+    cache_cookies = {}
+    current_request_time = 0
+    headless = kw.get('headless', True)
+    while True:
+        proxies = proxy.proxies
+        logger.info(f"[采集代理]{proxies}")
+        list_page_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do?tabledivIds=searchTabLi2'
+        with WebDriver(load_images=False, proxy=proxies, headless=headless) as browser:
+            browser.get(list_page_url)
+            # restore cached cookies
+            if len(cache_cookies) > 0:
+                browser.cookies = cache_cookies
+                browser.get(list_page_url)
+            # wait for the list page to load
+            wait_load_list(browser)
+            # grab the main window handle
+            main_handler = browser.current_window_handle
+            # select a category
+            category = select_category(browser, crawl_category)
+            crawl_menu = CRAWL_MENU.get(category)
+            if crawl_menu is None:
+                logger.info("任务结束")
+                break
+            logger.info(f"[分类栏目]{category}")
+            # select the setup time
+            select_date(browser, category, '今天')
+            while True:
+                _allow_next_page = True
+                # detail rows
+                web_elements = browser.find_elements(by=By.XPATH, value=f'//*[@id="{CATEGORY_MAPS[category]}"]//tr')
+                for index, element in enumerate(web_elements):
+                    index += 1
+                    item = {
+                        "site": "中国招标投标公共服务平台",
+                        "channel": crawl_menu.channel,
+                        "spidercode": crawl_menu.spidercode,
+                        "T": "bidding",
+                        "sendflag": "false",
+                        "iscompete": "true",
+                        "_d": "comeintime",
+                        "comeintime": '',
+                        "area": '',
+                        "city": '',
+                        "publishdept": "",
+                        "title": "",
+                        "href": "",
+                        "publishtime": "",
+                        "l_np_publishtime": "",
+                    }
+                    html = browser.page_source
+                    detail_js = "".join(extract_text(html, feature=f'//*[@id="{CATEGORY_MAPS[category]}"]//tr[{index}]/td[1]/a/@onclick')).strip()
+                    sign = sha1(detail_js)
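+                    # the onclick js of the detail link is the dedup fingerprint, tracked in a redis hash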
+                    if r.hexists(redis_key, sign):
+                        continue
+                    item['href'] = detail_js
+
+                    node1 = element.find_element_by_xpath('./td[1]/a')
+                    title = node1.text
+                    item['title'] = title
+
+                    node2 = element.find_element_by_xpath('./td[3]/span')
+                    region = str(node2.text).replace('【', '').replace('】', '')
+                    if region.find(" ") > 0:
+                        province, city = region.split(' ')
+                    else:
+                        province = region
+                        city = ''
+                    item['area'] = str(province).replace('省', '').replace('市', '').replace('自治区', '')
+                    item['city'] = city
+
+                    node3 = element.find_element_by_xpath('./td[5]')
+                    publish_time = node3.text
+                    item['publishtime'] = publish_time
+                    item['l_np_publishtime'] = int2long(int(time.mktime(time.strptime(publish_time, "%Y-%m-%d"))))
+                    item['comeintime'] = int2long(int(time.time()))
+                    # save the list record
+                    crawl_tab.insert_one(item)
+                    # open the detail page
+                    goto(browser, node1, wait_time=2)
+                    # crawl the detail page
+                    crawl_detail(browser, main_handler, item)
+                    # record the data fingerprint
+                    r.hset(redis_key, sign, '')
+                    logger.info(f'[采集成功]{title} - {publish_time}')
+                    # request counter, limits how many requests one proxy ip serves
+                    current_request_time += 1
+                    if current_request_time > max_request_times:
+                        # switch the proxy and clear cookies
+                        proxy.switch()
+                        cache_cookies.clear()
+                        current_request_time = 0
+                        _allow_next_page = False
+                        break
+                    else:
+                        cache_cookies.update(browser.cookies)
+
+                if _allow_next_page:
+                    page_num = next_page(browser, category)
+                    logger.info(f"[{category}-第{page_num}页]已完成")
+                    if page_num is None:
+                        update_crawl_records(category, True)
+                        break
+                    elif page_num > crawl_max_page:
+                        update_crawl_records(category, True)
+                        break
+                    else:
+                        # record crawled page numbers; recorded pages are not visited again
+                        finished_pages = CRAWL_RECORDS[category]['pages']
+                        finished_pages.append(str(page_num))
+                        CRAWL_RECORDS[category]['pages'] = finished_pages
+                else:
+                    break
+
+
+# if __name__ == '__main__':
+#     crawl_spider(
+#         crawl_category='',
+#         crawl_max_page=1,
+#         max_request_times=40,
+#         enable_proxy=True,
+#     )

+ 15 - 0
zgzb/kbjl.py

@@ -0,0 +1,15 @@
+from crawler.crawl_spider import crawl_spider
+
+
+def main():
+    crawl_spider(
+        crawl_category='开标记录',
+        crawl_max_page=50,
+        max_request_times=55,
+        enable_proxy=True,
+        headless=True,
+    )
+
+
+if __name__ == '__main__':
+    main()

+ 15 - 0
zgzb/pbjs.py

@@ -0,0 +1,15 @@
+from crawler.crawl_spider import crawl_spider
+
+
+def main():
+    crawl_spider(
+        crawl_category='评标公示',
+        crawl_max_page=85,
+        max_request_times=50,
+        enable_proxy=True,
+        headless=True,
+    )
+
+
+if __name__ == '__main__':
+    main()

+ 15 - 0
zgzb/zbgg.py

@@ -0,0 +1,15 @@
+from crawler.crawl_spider import crawl_spider
+
+
+def main():
+    crawl_spider(
+        crawl_category='招标公告',
+        crawl_max_page=200,
+        max_request_times=45,
+        enable_proxy=True,
+        headless=True,
+    )
+
+
+if __name__ == '__main__':
+    main()

+ 15 - 0
zgzb/zbxm.py

@@ -0,0 +1,15 @@
+from crawler.crawl_spider import crawl_spider
+
+
+def main():
+    crawl_spider(
+        crawl_category='招标项目',
+        crawl_max_page=160,
+        max_request_times=30,
+        enable_proxy=True,
+        headless=True,
+    )
+
+
+if __name__ == '__main__':
+    main()

+ 15 - 0
zgzb/zhbgg.py

@@ -0,0 +1,15 @@
+from crawler.crawl_spider import crawl_spider
+
+
+def main():
+    crawl_spider(
+        crawl_category='中标公告',
+        crawl_max_page=110,
+        max_request_times=60,
+        enable_proxy=True,
+        headless=True,
+    )
+
+
+if __name__ == '__main__':
+    main()