2 сар өмнө · e7a3462dd1
--- a/zgzb/__init__.py
+++ b/zgzb/__init__.py
--- a/zgzb/common/__init__.py
+++ b/zgzb/common/__init__.py
--- a/zgzb/common/aliyun.py
+++ b/zgzb/common/aliyun.py
@@ -1,23 +0,0 @@
 
				-import oss2
			
 
				-
			
 
				-from config.load import oss_conf
			
 
				-
			
 
				-
			
 
				class AliYunService:
				    """Minimal helper that uploads local files to an Aliyun OSS bucket."""

				    def __init__(self):
				        # All connection settings are read from the imported ``oss_conf`` mapping.
				        self.__acc_key_id = oss_conf['key_id']
				        self.__acc_key_secret = oss_conf['key_secret']
				        self.__endpoint = oss_conf['endpoint']
				        self.__bucket_name = oss_conf['bucket_name']

				    def _push_oss_from_local(self, key, filename):
				        """
				        Upload one local file to OSS as a regular object.

				        :param str key: object name to store the file under in OSS
				        :param str filename: local file name; must be readable
				        """
				        credentials = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
				        target = oss2.Bucket(credentials, self.__endpoint, self.__bucket_name)
				        target.put_object_from_file(key, filename)
			
--- a/zgzb/common/attachment.py
+++ b/zgzb/common/attachment.py
@@ -1,224 +0,0 @@
 
				-import hashlib
			
 
				-import os
			
 
				-import re
			
 
				-import traceback
			
 
				-import web_uuid
			
 
				-from urllib.parse import urlparse, unquote
			
 
				-
			
 
				-import requests
			
 
				-import urllib3
			
 
				-
			
 
				-from common.aliyun import AliYunService
			
 
				-from common.execptions import AttachmentNullError
			
 
				-from common.log import logger
			
 
				-from common.socks5 import Proxy
			
 
				-from config.load import headers
			
 
				-
			
 
				-urllib3.disable_warnings()
			
 
				# Document file extensions
				DOCTYPE = set(
				    'txt rtf dps et ett xls '
				    'xlsx xlsb xlsm xlt ods pmd pmdx '
				    'doc docm docx dot dotm dotx '
				    'odt wps csv xml xps'.split()
				)
				# Archive / compressed file extensions
				COMPRESSION_TYPE = set(
				    'rar zip gzzb 7z tar gz bz2 jar iso cab '
				    'arj lzh ace uue edxz'.split()
				)
				# Image file extensions (pdf is grouped here as well)
				IMAGE_TYPE = set(
				    'jpg png jpeg tiff gif psd raw eps svg bmp '
				    'pdf'.split()
				)
				# Other file extensions
				OTHER_TYPE = set('swf nxzf xezf nxcf'.split())
			
 
				-
			
 
				-
			
 
				def sha1(val):
				    """Return the hexadecimal SHA-1 digest for ``val``.

				    ``str`` input is UTF-8 encoded before hashing. ``bytes`` input is hashed
				    through its ``str()`` representation (the text ``b'...'``), not through
				    the raw bytes. Any other type yields the digest of empty input.
				    """
				    digest = hashlib.sha1()
				    if isinstance(val, (bytes, str)):
				        text = str(val) if isinstance(val, bytes) else val
				        digest.update(text.encode("utf-8"))
				    return digest.hexdigest()
			
 
				-
			
 
				-
			
 
				def remove(file_path: str):
				    """Delete the file at ``file_path``; errors from ``os.remove`` propagate."""
				    target = file_path
				    os.remove(target)
			
 
				-
			
 
				-
			
 
				def getsize(file):
				    """Return the size of ``file`` in bytes, or 0 when the file does not exist."""
				    try:
				        size = os.path.getsize(file)
				    except FileNotFoundError:
				        size = 0
				    return size
			
 
				-
			
 
				-
			
 
				def discern_file_format(text):
				    """Return the known file extension that ``text`` ends with, else ``None``.

				    Each known extension is tried in lower case and in upper case; the
				    variant that matched is returned as-is. When nothing matches, a warning
				    naming the trailing fragment of ``text`` is logged.
				    """
				    variants = (
				        variant
				        for extension in {
				            *DOCTYPE,
				            *COMPRESSION_TYPE,
				            *IMAGE_TYPE,
				            *OTHER_TYPE
				        }
				        for variant in (extension, extension.upper())
				    )
				    for variant in variants:
				        if re.match(f'.*{variant}$', text, re.S) is not None:
				            return variant

				    # No known extension matched: report whatever trails the last separator.
				    unknown_type = re.findall('[^.\\/:*?"<>|\r\n]+$', text, re.S)
				    logger.warning(f'[附件类型识别]未定义的文件类型{unknown_type}')
				    return None
			
 
				-
			
 
				-
			
 
				def extract_file_type(text):
				    """Return the file extension recognised in ``text``; ``None`` for ``None`` input."""
				    return None if text is None else discern_file_format(text)
			
 
				-
			
 
				-
			
 
				def extract_file_name_by_href(href: str, file_type: str):
				    """Extract a Chinese file name from a URL.

				    The URL's query string (or its path, when there is no query) is searched
				    for text ending in ``.<file_type>``. That text is percent-decoded and the
				    first run of Chinese characters / Chinese punctuation in it is returned.

				    :param href: URL to inspect
				    :param file_type: extension to look for, without the leading dot
				    :return: the Chinese name fragment, or ``None`` when nothing matches
				    """
				    # Chinese punctuation: \u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b
				    # Chinese characters:  \u4e00-\u9fa5
				    zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
				    parser = urlparse(href)
				    query = (parser.query or parser.path)
				    # Escape the extension so regex metacharacters in it are matched
				    # literally instead of raising re.error or widening the match.
				    suffix = re.escape(f'{file_type}')
				    result = re.search('.*\\.' + suffix, query, re.S)
				    if result is None:
				        return None

				    decoded = unquote(result.group())
				    name = re.search(zh_char_pattern, decoded)
				    if name is None:
				        return None
				    # The match contains only Chinese characters, so no further decoding is needed.
				    return name.group()
			
 
				-
			
 
				-
			
 
				def extract_file_name(text):
				    """Return ``text`` with every ``.<extension>`` occurrence removed.

				    The extension is the one recognised by ``discern_file_format``; when no
				    extension is recognised, ``text`` is returned unchanged.
				    """
				    file_type = discern_file_format(text)
				    if file_type is None:
				        return text
				    return text.replace('.{}'.format(file_type), '')
			
 
				-
			
 
				-
			
 
				def verify_file_name(name):
				    """Raise ``ValueError`` when no file extension can be recognised in ``name``."""
				    recognised = extract_file_type(name)
				    if recognised is None:
				        raise ValueError
			
 
				-
			
 
				-
			
 
				class AttachmentDownloader(AliYunService):
				    """Download an attachment to a temporary local file and push it to OSS."""

				    def __init__(self):
				        super(AttachmentDownloader, self).__init__()
				        # Directory (relative to the working directory) for temporary downloads.
				        self.dir_name = 'file'

				    def _create_file(self, filename, filetype):
				        """Return a unique temporary path ``<dir_name>/<sha1>.<filetype>``.

				        The directory is created on demand.
				        """
				        # Imported here because the module's import block does not bind the
				        # name ``uuid`` that this method needs.
				        import uuid

				        os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
				        file = "{filename}.{filetype}".format(
				            filename=sha1("{}_{}".format(filename, uuid.uuid4())),
				            filetype=filetype
				        )
				        return "{}/{}".format(self.dir_name, file)

				    @staticmethod
				    def _create_fid(file_stream: bytes):
				        """Return the file id: the ``sha1`` helper applied to the downloaded bytes."""
				        return sha1(file_stream)

				    @staticmethod
				    def _origin_filename(fid: str, filetype: str):
				        """Return the storage key ``<fid>.<filetype>``."""
				        return "{}.{}".format(fid, filetype)

				    @staticmethod
				    def _file_size(file: str):
				        """Return the size of ``file`` as text in kb, M or G with one decimal."""
				        _kb = float(getsize(file)) / 1024
				        if _kb < 1024:
				            return "{:.1f} kb".format(_kb)
				        _M = _kb / 1024
				        if _M < 1024:
				            return "{:.1f} M".format(_M)
				        return "{:.1f} G".format(_M / 1024)

				    @staticmethod
				    def _download(
				            url: str,
				            file: str,
				            enable_proxy=False,
				            allow_show_exception=False,
				            **kwargs
				    ):
				        """Fetch ``url`` and write the body to ``file``; up to three attempts.

				        :param url: address to download
				        :param file: local path the body is written to
				        :param enable_proxy: switch to a fresh proxy after a request error
				        :param allow_show_exception: print the traceback of request errors
				        :param kwargs: optional ``headers``, ``proxies``, ``timeout``,
				            ``stream`` and ``verify`` overrides for ``requests.get``
				        :return: the downloaded bytes, or ``b''`` when every attempt failed
				        """
				        stream_flag = kwargs.get('stream')
				        request_params = {
				            'headers': kwargs.get('headers') or headers,
				            'proxies': kwargs.get('proxies'),
				            'timeout': kwargs.get('timeout') or 60,
				            # Stream by default, but honour an explicit ``stream=False``
				            # (``value or True`` could never evaluate to False).
				            'stream': True if stream_flag is None else stream_flag,
				            'verify': kwargs.get('verify') or False,
				        }
				        proxy = Proxy(enable_proxy)
				        for _ in range(3):
				            try:
				                with requests.get(url, **request_params) as req:
				                    if req.status_code == 200:
				                        stream = req.content
				                        with open(file, 'wb') as f:
				                            f.write(stream)
				                        return stream
				            except requests.RequestException:
				                if allow_show_exception:
				                    traceback.print_exc()
				                if enable_proxy:
				                    proxy.switch()
				                    request_params.update({'proxies': proxy.proxies})
				        return b''

				    def download(
				            self,
				            file_name: str,
				            file_type: str,
				            download_url: str,
				            enable_proxy=False,
				            allow_request_exception=False,
				            **kwargs
				    ):
				        """Download an attachment, upload it to OSS and describe the result.

				        :param file_name: attachment name without extension
				        :param file_type: attachment extension
				        :param download_url: address of the attachment
				        :param enable_proxy: forwarded to ``_download``
				        :param allow_request_exception: forwarded to ``_download``
				        :param kwargs: extra request options forwarded to ``_download``
				        :return: dict with ``filename`` and ``org_url``; after a successful
				            download it also carries ``fid``, ``ftype``, ``size`` and ``url``
				        :raises AttachmentNullError: when name, type or URL is empty
				        """
				        if not file_name or not file_type or not download_url:
				            raise AttachmentNullError

				        local_tmp_file = self._create_file(file_name, file_type)
				        file_stream = self._download(
				            download_url,
				            local_tmp_file,
				            enable_proxy,
				            allow_request_exception,
				            **kwargs
				        )
				        result = {
				            'filename': '{}.{}'.format(file_name, file_type),
				            'org_url': download_url
				        }
				        if len(file_stream) > 0:
				            try:
				                fid = self._create_fid(file_stream)
				                key = self._origin_filename(fid, file_type)
				                result.setdefault('fid', key)
				                result.setdefault('ftype', file_type)
				                result.setdefault('size', self._file_size(local_tmp_file))
				                result.setdefault('url', 'oss')
				                super()._push_oss_from_local(key, local_tmp_file)
				            except Exception as e:
				                logger.warning(
				                    "[{}]下载异常,原因:{}".format(file_name, e.__class__.__name__)
				                )
				        # The temporary file only exists when the download succeeded, so a
				        # missing file must not turn into an exception here.
				        try:
				            remove(local_tmp_file)
				        except FileNotFoundError:
				            pass
				        # Whether upload/download failed or succeeded, the attachment info
				        # must always be returned.
				        return result
			
--- a/zgzb/common/clean_html.py
+++ b/zgzb/common/clean_html.py
@@ -1,133 +0,0 @@
 
				-import re
			
 
				-
			
 
				-__all__ = ['cleaner']
			
 
				-
			
 
				# Each table maps a regular expression to its replacement text.
				# Patterns that contain regex escapes are written as raw strings or with
				# explicit backslashes so that no invalid string escape is used.

				# Stand-alone elements
				INDEPENDENT_TAGS = {
				    r'<head>[\s\S]*?</head>': '',
				    '<html>|<html [^>]*>|</html>': '',
				    '<body>|<body [^>]*>|</body>': '',
				    r'<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
				    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # space entities
				    r'\xa0|\u3000': '',  # space characters
				    r'<!--[\s\S]*?-->': '',  # comments
				    r'<style[^<>]*>[\s\S]*?</style>': '',  # styles
				    r'<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
				    '<input>': '',  # input box
				    '<img[^>]*>': '<br>',  # images
				}
				# Inline elements
				INLINE_TAGS = {
				    '<a>|<a [^>]*>|</a>': '',  # hyperlink
				    '<span>|<span [^>]*>|</span>': '',  # span
				    '<label>|<label [^>]*>|</label>': '<br>',  # label
				    '<font>|<font [^>]*>|</font>': '',  # font
				}
				# Block-level elements
				BLOCK_TAGS = {
				    r'<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # headings, including their text
				    '<p>|<p [^>]*>|</p>': '<br>',  # paragraph
				    '<div>|<div [^>]*>|</div>': '<br>',  # division
				    '<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Office (Word) paragraph
				}
				# Miscellaneous
				OTHER = {
				    # XML declarations / xml tags. The "?" is escaped: unescaped it made the
				    # leading "<" optional, so plain text starting at "xml" was removed and
				    # the "<?" of a declaration was left behind.
				    r'</?\??xml[^>]*>': '',
				    '<epointform>': '',
				    '<!doctype html>|<!doctype html [^>]*>': '',
				    '【关闭】|关闭': '',
				    '【打印】|打印本页': '',
				    r'【字体：[\s\S]*】': '',
				    '文章来源：[\u4e00-\u9fa5]+': '',
				    '浏览次数：.*[<]+': '',
				    '（责任编辑：.*?）': '',
				    '分享到[：]': '',
				    r'阅读数[:：]\d+': '',
				}
				# Style attributes
				CSS_STYLE = {
				    r'style="[\s\S]*?"|style ="[\s\S]*?"': '',
				    r'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
				    r'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
				    r'class="[\s\S]*?"|class ="[\s\S]*?"': '',
				    r'align="[\s\S]*?"|align ="[\s\S]*?"': '',
				    r'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
				}
				# Whitespace
				BLANKS = {
				    '\n\\s*\n': '\n',
				    '\\s*\n\\s*': '\n',
				    '[^\\S\n]': ' ',
				    r'\s+': ' ',
				}
				# Tag names used when repairing "tag+attribute" run-ons
				TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
				# Attribute names used when repairing "tag+attribute" run-ons
				ATTRS = {'id', 'class', 'style', 'width'}
			
 
				-
			
 
				-
			
 
				def _repair_tag():
				    """Build the table of malformed "tag+attribute" run-ons and their fixes.

				    Every combination of ``TAGS`` and ``ATTRS`` glued together without a
				    space (e.g. ``tdclass``) maps to the spaced form (``td class``).
				    """
				    return {
				        '{}{}'.format(tag, attr): '{} {}'.format(tag, attr)
				        for tag in TAGS
				        for attr in ATTRS
				    }
			
 
				-
			
 
				-
			
 
				-def _escape_character(html):
			
 
				-    """转义字符"""
			
 
				-    html = html.replace('&lt;', '<')
			
 
				-    html = html.replace('&gt;', '>')
			
 
				-    html = html.replace('&quot;', '"')
			
 
				-    html = html.replace('&amp;', '&')
			
 
				-    return html
			
 
				-
			
 
				-
			
 
				def _lowercase_tag(html):
				    """Normalise tags: lower-case every ``<...>`` span, then repair run-ons.

				    :param html: page source
				    :return: page source with lower-cased tags and with the run-on
				        "tag+attribute" spellings from ``_repair_tag`` separated by a space
				    """
				    # One pass over the document instead of one full-document ``replace``
				    # per tag found; every ``<...>`` span ends up lower-cased either way.
				    html = re.sub("<[^>]+>", lambda found: found.group().lower(), html)

				    # NOTE: these are plain substring replacements over the whole document,
				    # not only inside tags.
				    for err, right in _repair_tag().items():
				        html = html.replace(err, right)

				    return html
			
 
				-
			
 
				-
			
 
				def cleaner(html, special=None, completely=False):
				    """
				    Clean page source.

				    :param html: page source to clean
				    :param special: extra ``{pattern: replacement}`` rules for this call only
				    :param completely: also strip canvas/iframe elements and remaining tags
				    :return: cleaned page source
				    """
				    # Merge the per-call rules into a local table. Updating the module-level
				    # ``OTHER`` dict would make them leak into every later call.
				    remove_tags = {
				        **INDEPENDENT_TAGS,
				        **INLINE_TAGS,
				        **BLOCK_TAGS,
				        **OTHER,
				        **(special or {}),
				        **CSS_STYLE,
				        **BLANKS,
				    }
				    html = _lowercase_tag(html)
				    for tag, repl in remove_tags.items():
				        html = re.sub(tag, repl, html)

				    if completely:
				        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
				        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # inline frame
				        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)

				    html = _escape_character(html)
				    return html
			
--- a/zgzb/common/databases.py
+++ b/zgzb/common/databases.py
@@ -1,112 +0,0 @@
 
				-import bson
			
 
				-import pymongo
			
 
				-import redis
			
 
				-import requests
			
 
				-from elasticsearch import Elasticsearch
			
 
				-
			
 
				-from config.load import mongo_conf, redis_conf, es_conf, analyze_url
			
 
				-
			
 
				-
			
 
				-# ---------------------------------- mongo ----------------------------------
			
 
				def mongo_client(cfg=None, host=None, port=None):
				    """Create a ``pymongo.MongoClient``.

				    An explicit ``cfg`` mapping wins. Otherwise ``host``/``port`` are used
				    when both are given, and ``mongo_conf`` in every other case.
				    """
				    if cfg is None:
				        has_address = host is not None and port is not None
				        cfg = {'host': host, 'port': port} if has_address else mongo_conf
				    return pymongo.MongoClient(host=cfg['host'], port=cfg['port'])
			
 
				-
			
 
				-
			
 
				def mongo_database(db: str, **kw):
				    """Return database ``db`` from a client built with ``mongo_client(**kw)``."""
				    return mongo_client(**kw)[db]
			
 
				-
			
 
				-
			
 
				def mongo_table(db: str, coll: str, **kw):
				    """Return collection ``coll`` of database ``db`` from ``mongo_client(**kw)``."""
				    database = mongo_client(**kw)[db]
				    return database[coll]
			
 
				-
			
 
				-
			
 
				def int2long(param: int):
				    """Wrap ``param`` in a BSON ``Int64`` (int to long)."""
				    long_value = bson.int64.Int64(param)
				    return long_value
			
 
				-
			
 
				-
			
 
				def object_id(_id: str):
				    """Build a BSON ``ObjectId`` from ``_id``."""
				    oid = bson.objectid.ObjectId(_id)
				    return oid
			
 
				-
			
 
				-
			
 
				-# ---------------------------------- es ----------------------------------
			
 
				def es_client(cfg=None):
				    """Create an ``Elasticsearch`` client from ``cfg`` (default: ``es_conf``)."""
				    conf = es_conf if cfg is None else cfg
				    node = {"host": conf['host'], "port": conf['port']}
				    return Elasticsearch([node])
			
 
				-
			
 
				-
			
 
				def es_participles_service(text: str):
				    """
				    Fetch the token list for a text from the analyze endpoint.

				    :param text: text to tokenize
				    :return: list of tokens; tokens made only of ASCII letters are dropped.
				        Empty when the service does not answer with status 200.
				    """
				    res = requests.get(
				        analyze_url,
				        params={"text": text, "analyzer": "ik_smart"},
				        timeout=60,
				    )
				    if res.status_code != 200:
				        return []
				    return [
				        item["token"]
				        for item in res.json().get('tokens', [])
				        if not item["token"].encode('utf-8').isalpha()
				    ]
			
 
				-
			
 
				-
			
 
				def es_query(title: str, publish_time: int):
				    """
				    Query Elasticsearch for documents matching a title around a publish time.

				    :param title: title; each of its tokens must phrase-match the ``title`` field
				    :param publish_time: publish time; the range filter spans 432000 on either side
				    :return: number of hits in the returned page of the ``bidding`` index
				    """
				    client = es_client()
				    window = 432000  # five days (5 * 86400)
				    conditions = [
				        {
				            "multi_match": {
				                "query": word,
				                "type": "phrase",
				                "fields": ["title"]
				            }
				        }
				        for word in es_participles_service(title)
				    ]
				    conditions.append({
				        "range": {
				            "publishtime": {
				                "from": publish_time - window,
				                "to": publish_time + window
				            }
				        }
				    })
				    body = {
				        "query": {
				            "bool": {
				                "must": conditions,
				                "minimum_should_match": 1
				            }
				        }
				    }
				    response = client.search(index='bidding', body=body, request_timeout=100)
				    return len(response['hits']['hits'])
			
 
				-
			
 
				-
			
 
				-# ---------------------------------- redis ----------------------------------
			
 
				def redis_client(cfg=None):
				    """Create a ``redis.Redis`` client from ``cfg`` (default: ``redis_conf``).

				    A new connection pool is built on every call.
				    """
				    conf = redis_conf if cfg is None else cfg
				    pool = redis.ConnectionPool(
				        host=conf['host'],
				        port=conf['port'],
				        password=conf['pwd'],
				        db=conf['db']
				    )
				    # NOTE(review): connection options such as decode_responses are
				    # presumably only applied when redis-py builds its own pool; with an
				    # explicit connection_pool the flag may be ignored and replies returned
				    # as bytes. Confirm, and move the flag to ConnectionPool if so.
				    return redis.Redis(connection_pool=pool, decode_responses=True)
			
--- a/zgzb/common/execptions.py
+++ b/zgzb/common/execptions.py
@@ -1,35 +0,0 @@
 
				-
			
 
				class JyBasicException(Exception):
				    """Base exception carrying an error code, a reason and extra details.

				    :param code: numeric error code
				    :param reason: description of the error
				    :param kwargs: extra details; kept in ``err_details`` and also exposed
				        as attributes of the instance
				    """

				    def __init__(self, code: int, reason: str, **kwargs):
				        # Initialise the Exception base so ``args``/``str()`` carry the code
				        # and reason and the instance can be re-created from ``args``.
				        super().__init__(code, reason)
				        self.code = code
				        self.reason = reason
				        self.err_details = kwargs
				        for key, val in kwargs.items():
				            setattr(self, key, val)
			
 
				-
			
 
				-
			
 
				class CustomCheckError(JyBasicException):
				    """Feature-condition check failure (default code 10002)."""

				    def __init__(self, code: int = 10002, reason: str = '特征条件检查异常', **kwargs):
				        # The base initialiser stores code, reason, err_details and the
				        # per-key attributes.
				        super().__init__(code, reason, **kwargs)
			
 
				-
			
 
				-
			
 
				class AttachmentNullError(JyBasicException):
				    """Attachment download failure (default code 10004)."""

				    def __init__(self, code: int = 10004, reason: str = '附件下载异常', **kwargs):
				        # The base initialiser stores code, reason, err_details and the
				        # per-key attributes.
				        super().__init__(code, reason, **kwargs)
			
 
				-
			
 
				-
			
 
				class CustomAccountPrivilegeError(JyBasicException):
				    """Account privilege error; accepts and ignores any constructor arguments."""

				    def __init__(self, *args, **kwargs):
				        # Intentionally sets nothing: no code, reason or details are stored.
				        return None
			
--- a/zgzb/common/log.py
+++ b/zgzb/common/log.py
@@ -1,14 +0,0 @@
 
				-from pathlib import Path
			
 
				-
			
 
				-from loguru import logger
			
 
				-
			
 
				# Two directory levels above this file.
				_absolute = Path(__file__).absolute().parent.parent
				# One log file per day under logs/.
				_log_path = (_absolute / 'logs/crawl-{time:YYYY-MM-DD}.log').resolve()
				logger.add(
				    _log_path,
				    level='INFO',
				    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
				    encoding='utf-8',
				    rotation='00:00',
				    retention='1 week',
				)
			
--- a/zgzb/common/socks5.py
+++ b/zgzb/common/socks5.py
@@ -1,41 +0,0 @@
 
				-import threading
			
 
				-
			
 
				-import requests
			
 
				-
			
 
				-from common.log import logger
			
 
				-from config.load import jy_proxy
			
 
				-
			
 
				-__all__ = ['Proxy']
			
 
				-
			
 
				-
			
 
				class Socks5Proxy:
				    """Holder for socks5 proxy settings fetched from the proxy service.

				    One shared instance is exported as ``Proxy``. Calling it, e.g.
				    ``Proxy(True)``, configures that same instance and returns it.
				    """

				    def __init__(self):
				        self._lock = threading.RLock()
				        self._enable_proxy = False
				        self._url = jy_proxy['socks5']['url']
				        self._auth = jy_proxy['socks5']['auth']
				        self._proxies = None

				    @property
				    def proxies(self):
				        """The most recently fetched proxy settings, or ``None``."""
				        return self._proxies

				    def switch(self):
				        """Fetch fresh proxy settings; does nothing while the proxy is disabled."""
				        with self._lock:
				            if self._enable_proxy:
				                self._proxies = self._fetch_proxies()

				    def _fetch_proxies(self):
				        """Return the ``data`` field of the proxy service's JSON answer."""
				        # A timeout keeps a stalled proxy service from blocking forever.
				        response = requests.get(self._url, headers=self._auth, timeout=60)
				        return response.json().get("data")

				    def __call__(self, enable_proxy: bool = False, *args, **kwargs):
				        """Enable or disable the proxy and return this instance.

				        When enabling, proxy settings are fetched immediately.
				        """
				        # Same lock as ``switch`` so the shared state is updated consistently.
				        with self._lock:
				            self._enable_proxy = enable_proxy
				            if self._enable_proxy:
				                logger.info("[加载socks5代理]")
				                self._proxies = self._fetch_proxies()
				        return self


				# Shared module-level instance.
				Proxy = Socks5Proxy()
			
--- a/zgzb/common/tools.py
+++ b/zgzb/common/tools.py
@@ -1,117 +0,0 @@
 
				-import datetime
			
 
				-import hashlib
			
 
				-import re
			
 
				-import time
			
 
				-from collections import namedtuple
			
 
				-
			
 
				-from lxml.html import HtmlElement, fromstring, tostring
			
 
				-
			
 
				# Result record of a text search; ``total`` is the only field.
				SearchText = namedtuple('SearchText', 'total')
			
 
				-
			
 
				-
			
 
				def element2html(element: HtmlElement) -> str:
				    """Serialize an element to its HTML source text."""
				    raw = tostring(element, encoding="utf-8")
				    return raw.decode()
			
 
				-
			
 
				-
			
 
				def html2element(html: str) -> HtmlElement:
				    """Parse HTML source text into an element."""
				    element = fromstring(html)
				    return element
			
 
				-
			
 
				-
			
 
				def valid_element(node: HtmlElement, feature: str):
				    """Return ``True`` when the XPath ``feature`` selects anything under ``node``."""
				    return len(node.xpath(feature)) > 0
			
 
				-
			
 
				-
			
 
				def remove_node(node: HtmlElement):
				    """
				    Detach ``node`` from its parent in place; nothing is returned.

				    :param node: element to remove; left untouched when it has no parent
				    :return:
				    """
				    parent = node.getparent()
				    if parent is None:
				        return
				    parent.remove(node)
			
 
				-
			
 
				-
			
 
				def text_search(text: str) -> SearchText:
				    """
				    Count the Chinese characters in a text.

				    :param text: text to inspect
				    :return: ``SearchText`` whose ``total`` is the number of Chinese characters
				    """
				    if not text:
				        return SearchText(0)

				    # One list entry per Chinese character, so the length is the count.
				    chinese_chars = re.findall('[\u4e00-\u9fa5]', text, re.S)
				    return SearchText(len(chinese_chars))
			
 
				-
			
 
				-
			
 
				def verify_text(val: str):
				    """Return ``True`` when ``val`` holds at least 50 digits, letters or Chinese characters.

				    Tags (``<...>``) are removed first, then every other character that is
				    not a digit, an ASCII letter or a Chinese character. Fewer than 50
				    remaining characters means the page has no real detail content.
				    """
				    if val is None:
				        return False
				    without_tags = re.sub('<[^>]+>', '', val)
				    meaningful = re.sub('[^0-9a-zA-Z\u4e00-\u9fa5]+', '', without_tags)
				    return len(meaningful) >= 50
			
 
				-
			
 
				-
			
 
				def sha1(text: str):
				    """
				    Hexadecimal SHA-1 digest of a text.

				    @param text: string; it is UTF-8 encoded before hashing
				    @return: digest as a hexadecimal string
				    """
				    return hashlib.sha1(text.encode("utf-8")).hexdigest()
			
 
				-
			
 
				-
			
 
				def get_ms() -> int:
				    """Return the current time as a millisecond timestamp."""
				    milliseconds = time.time() * 1000
				    return int(round(milliseconds))
			
 
				-
			
 
				-
			
 
				def get_current_date():
				    """Return today's local date as ``YYYY-MM-DD``."""
				    today = datetime.datetime.now()
				    return today.strftime("%Y-%m-%d")
			
 
				-
			
 
				-
			
 
				def ms2date(ms: int, fmt="%Y-%m-%d %H:%M:%S"):
				    """Format a millisecond timestamp as local time using ``fmt``."""
				    seconds = float(ms / 1000)
				    return time.strftime(fmt, time.localtime(seconds))
			
 
				-
			
 
				-
			
 
				def convert2type(ts_str):
				    """Convert a millisecond timestamp (string or number) to whole seconds."""
				    milliseconds = float(ts_str)
				    return int(milliseconds / 1000)
			
 
				-
			
 
				-
			
 
				def ts2date(ts_str, fmt="%Y-%m-%d %H:%M:%S") -> str:
				    """
				    Format a timestamp as a local date string.

				    :param ts_str: millisecond timestamp
				    :param fmt: date format
				    :return: formatted date
				    """
				    seconds = int(float(ts_str) / 1000)
				    return time.strftime(fmt, time.localtime(seconds))
			
 
				-
			
 
				-
			
 
				def date2ts(date_str: str, fmt="%Y-%m-%d"):
				    """Convert a local date string in format ``fmt`` to a second-level timestamp."""
				    parsed = time.strptime(date_str, fmt)
				    return int(time.mktime(parsed))
			
--- a/zgzb/common/webdriver/__init__.py
+++ b/zgzb/common/webdriver/__init__.py
@@ -1,8 +0,0 @@
 
				-from .utils import (
			
 
				-    check_navigator,
			
 
				-    new_window,
			
 
				-    get_user_agent,
			
 
				-    get_title,
			
 
				-    until_wait,
			
 
				-)
			
 
				-from .webdriver import WebDriver, FireFoxWebDriverError
			
--- a/zgzb/common/webdriver/utils.py
+++ b/zgzb/common/webdriver/utils.py
@@ -1,59 +0,0 @@
 
				-from selenium.webdriver.common.by import By
			
 
				-from selenium.webdriver.support import expected_conditions as EC
			
 
				-from selenium.webdriver.support.ui import WebDriverWait
			
 
				-
			
 
				-
			
 
				def check_navigator(driver):
				    """Return the page's ``window.navigator.webdriver`` value."""
				    return driver.execute_script("return window.navigator.webdriver")
			
 
				-
			
 
				-
			
 
				def until_wait(
				        driver,
				        *,
				        xpath=None,
				        classname=None,
				        text=None,
				        timeout=None
				):
				    """
				    Explicitly wait for the page to load; the wait raises on timeout.

				    :param driver: browser driver
				    :param xpath: XPath rule identifying the element to wait for
				    :param classname: class attribute name identifying the element to wait
				        for; only used when ``xpath`` is not given
				    :param text: text expected inside the element; when omitted only the
				        element's presence is awaited
				    :param timeout: timeout, defaults to 60
				    :return:
				    """
				    wait = WebDriverWait(driver, (timeout or 60), 0.2)

				    if xpath is not None:
				        locator = (By.XPATH, xpath)
				    elif classname is not None:
				        locator = (By.CLASS_NAME, classname)
				    else:
				        # Neither feature given: nothing to wait for.
				        return

				    if text is not None:
				        condition = EC.text_to_be_present_in_element(locator, text)
				    else:
				        condition = EC.presence_of_element_located(locator)
				    wait.until(condition)
			
 
				-
			
 
				-
			
 
				def new_window(driver):
				    """Open a new window and switch the driver to the last window handle."""
				    driver.execute_script('window.open();')
				    newest = driver.window_handles[-1]
				    driver.switch_to.window(newest)
			
 
				-
			
 
				-
			
 
				def get_user_agent(driver):
				    """Return the browser's ``navigator.userAgent`` string."""
				    script = "return navigator.userAgent;"
				    return driver.execute_script(script)
			
 
				-
			
 
				-
			
 
				def get_title(driver):
				    """Return the current page's ``document.title``."""
				    script = 'return document.title'
				    return driver.execute_script(script)
			
--- a/zgzb/common/webdriver/webdriver.py
+++ b/zgzb/common/webdriver/webdriver.py
@@ -1,147 +0,0 @@
 
				-import datetime
			
 
				-from collections import namedtuple
			
 
				-from pathlib import Path
			
 
				-
			
 
				-from selenium import webdriver
			
 
				-from selenium.common.exceptions import WebDriverException
			
 
				-from selenium.webdriver import Firefox
			
 
				-
			
 
				-from common.log import logger
			
 
				-
			
 
				-_absolute = Path(__file__).absolute().parent.parent.parent
			
 
				-_date = datetime.datetime.now().strftime('%Y-%m-%d')
			
 
				-SERVICE_LOG_PATH = (_absolute / f'logs/geckodriver-{_date}.log').resolve()
			
 
				-
			
 
				-DEFAULT_USERAGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0"
			
 
				-Netloc = namedtuple('Netloc', ['host', 'port'])
			
 
				-
			
 
				-
			
 
				-def netloc(proxies: dict) -> Netloc:
			
 
				-    host, port = proxies["https"].replace("socks5://", "").split(":")
			
 
				-    return Netloc(host, port)
			
 
				-
			
 
				-
			
 
				-class FireFoxWebDriverError(WebDriverException):
			
 
				-    pass
			
 
				-
			
 
				-
			
 
				-class WebDriver(Firefox):
			
 
				-
			
 
				-    def __init__(self, load_images=True, user_agent=None, proxy=None,
			
 
				-                 headless=True, timeout=60, log_path=None,
			
 
				-                 window_size=(1024, 800), executable_path=None,
			
 
				-                 custom_argument=None, **kwargs):
			
 
				-        """
			
 
				-        Args:
			
 
				-            load_images: 是否加载图片
			
 
				-            user_agent: 字符串 或 无参函数，返回值为user_agent
			
 
				-            proxy: {'https://sockets:xxx.xxx.xxx.xxx:xxxx'} 或 无参函数，返回值为代理地址
			
 
				-            headless: 是否启用无头模式
			
 
				-            driver_type: FIREFOX
			
 
				-            timeout: 请求超时时间
			
 
				-            log_path: Geckodriver服务的日志文件路径
			
 
				-            window_size: 窗口大小
			
 
				-            executable_path: 浏览器路径，默认为默认路径
			
 
				-            custom_argument: 自定义配置参数
			
 
				-            **kwargs: 需要额外配置的Firefox参数
			
 
				-        """
			
 
				-        self._load_images = load_images
			
 
				-        self._user_agent = user_agent or DEFAULT_USERAGENT
			
 
				-        self._proxy = proxy
			
 
				-        self._headless = headless
			
 
				-        self._timeout = timeout
			
 
				-        self._window_size = window_size
			
 
				-        self._executable_path = executable_path
			
 
				-        self._custom_argument = custom_argument
			
 
				-        self._service_log_path = log_path or str(SERVICE_LOG_PATH)
			
 
				-
			
 
				-        _profile = webdriver.FirefoxProfile()
			
 
				-        _options = webdriver.FirefoxOptions()
			
 
				-        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
			
 
				-        _profile.set_preference("dom.webdriver.enabled", False)
			
 
				-        _profile.set_preference('useAutomationExtension', False)
			
 
				-        # _profile.set_preference('privacy.resistFingerprinting', True)  # 启用指纹保护
			
 
				-        if self._proxy:
			
 
				-            proxy = self._proxy() if callable(self._proxy) else self._proxy
			
 
				-            host, port = netloc(proxy)
			
 
				-            # 使用socks5 代理, 不使用代理：0, 使用代理：1
			
 
				-            _profile.set_preference('network.proxy.type', 1)
			
 
				-            _profile.set_preference('network.proxy.socks', host)
			
 
				-            _profile.set_preference('network.proxy.socks_port', int(port))
			
 
				-
			
 
				-        if self._user_agent:
			
 
				-            _profile.set_preference(
			
 
				-                "general.useragent.override",
			
 
				-                self._user_agent() if callable(self._user_agent) else self._user_agent,
			
 
				-            )
			
 
				-
			
 
				-        if not self._load_images:
			
 
				-            '''
			
 
				-            允许加载所有图像，无论来源如何（默认）=1
			
 
				-            阻止所有图像加载=2
			
 
				-            防止加载第三方图像=3
			
 
				-            '''
			
 
				-            _profile.set_preference("permissions.default.image", 2)
			
 
				-
			
 
				-        _profile.update_preferences()
			
 
				-
			
 
				-        if self._headless:
			
 
				-            _options.add_argument("--headless")
			
 
				-            _options.add_argument("--disable-gpu")
			
 
				-
			
 
				-        if self._custom_argument:
			
 
				-            [_options.add_argument(arg) for arg in self._custom_argument]
			
 
				-
			
 
				-        if self._executable_path:
			
 
				-            super(WebDriver, self).__init__(
			
 
				-                service_log_path=self._service_log_path,
			
 
				-                capabilities=firefox_capabilities,
			
 
				-                options=_options,
			
 
				-                firefox_profile=_profile,
			
 
				-                executable_path=self._executable_path,
			
 
				-                **kwargs
			
 
				-            )
			
 
				-        else:
			
 
				-            super(WebDriver, self).__init__(
			
 
				-                service_log_path=self._service_log_path,
			
 
				-                capabilities=firefox_capabilities,
			
 
				-                options=_options,
			
 
				-                firefox_profile=_profile,
			
 
				-                **kwargs
			
 
				-            )
			
 
				-
			
 
				-        if self._window_size:
			
 
				-            self.set_window_size(*self._window_size)
			
 
				-
			
 
				-        self.set_page_load_timeout(self._timeout)
			
 
				-        self.set_script_timeout(self._timeout)
			
 
				-
			
 
				-    def __enter__(self):
			
 
				-        return self
			
 
				-
			
 
				-    def __exit__(self, exc_type, exc_val, exc_tb):
			
 
				-        if exc_val:
			
 
				-            logger.exception(f'{self.__class__.__name__} <> {exc_type.__name__}: {exc_val}')
			
 
				-        self.quit()
			
 
				-        print("关闭浏览器")
			
 
				-        return True
			
 
				-
			
 
				-    @property
			
 
				-    def cookies(self):
			
 
				-        cookies_json = {}
			
 
				-        for cookie in self.get_cookies():
			
 
				-            cookies_json[cookie["name"]] = cookie["value"]
			
 
				-        return cookies_json
			
 
				-
			
 
				-    @cookies.setter
			
 
				-    def cookies(self, val: dict):
			
 
				-        """
			
 
				-        设置cookie
			
 
				-        Args:
			
 
				-            val: {"key":"value", "key2":"value2"}
			
 
				-
			
 
				-        Returns:
			
 
				-
			
 
				-        """
			
 
				-        for key, value in val.items():
			
 
				-            self.add_cookie({"name": key, "value": value})
			
--- a/zgzb/config/__init__.py
+++ b/zgzb/config/__init__.py
--- a/zgzb/config/conf.yaml
+++ b/zgzb/config/conf.yaml
@@ -1,40 +0,0 @@
 
				-# mongo
			
 
				-mongo:
			
 
				-  host: 172.17.4.87
			
 
				-  port: !!int 27080
			
 
				-#  host: 127.0.0.1
			
 
				-#  port: !!int 27017
			
 
				-
			
 
				-
			
 
				-# redis
			
 
				-redis:
			
 
				-  host: 127.0.0.1
			
 
				-  port: !!int 6379
			
 
				-  pwd: ""
			
 
				-  db: !!int 10
			
 
				-
			
 
				-
			
 
				-# 阿里oss
			
 
				-ali_oss:
			
 
				-  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
			
 
				-  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
			
 
				-#  endpoint: oss-cn-beijing.aliyuncs.com    # 公网使用
			
 
				-  endpoint: oss-cn-beijing-internal.aliyuncs.com    # 内网使用
			
 
				-  bucket_name: jy-datafile
			
 
				-
			
 
				-
			
 
				-# es
			
 
				-es:
			
 
				-  host: 172.17.145.170
			
 
				-#  host: 192.168.3.206
			
 
				-#  host: 127.0.0.1
			
 
				-  port: !!int 9800
			
 
				-  db: biddingall # es库别名
			
 
				-
			
 
				-
			
 
				-# 代理
			
 
				-proxy:
			
 
				-  socks5:
			
 
				-    url: http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch
			
 
				-    auth:
			
 
				-      Authorization: Basic amlhbnl1MDAxOjEyM3F3ZSFB
			
--- a/zgzb/config/constants.yaml
+++ b/zgzb/config/constants.yaml
@@ -1,2 +0,0 @@
 
				-headers:
			
 
				-  User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36
			
--- a/zgzb/config/load.py
+++ b/zgzb/config/load.py
@@ -1,34 +0,0 @@
 
				-from pathlib import Path
			
 
				-
			
 
				-import yaml
			
 
				-
			
 
				-__all__ = [
			
 
				-    'mongo_conf',
			
 
				-    'redis_conf',
			
 
				-    'oss_conf',
			
 
				-    'es_conf',
			
 
				-    'jy_proxy',
			
 
				-    'node_module_path',
			
 
				-    'headers',
			
 
				-    'analyze_url'
			
 
				-]
			
 
				-
			
 
				-_base_path = Path(__file__).parent
			
 
				-_yaml_conf = (_base_path / 'conf.yaml').resolve()
			
 
				-_yaml_constants = (_base_path / 'constants.yaml').resolve()
			
 
				-_node_modules = (_base_path.parent / 'node_modules').resolve()
			
 
				-
			
 
				-with open(_yaml_conf, encoding="utf-8") as f:
			
 
				-    _conf = yaml.safe_load(f)
			
 
				-    mongo_conf = _conf['mongo']
			
 
				-    redis_conf = _conf['redis']
			
 
				-    oss_conf: dict = _conf['ali_oss']
			
 
				-    es_conf: dict = _conf['es']
			
 
				-    jy_proxy: dict = _conf['proxy']
			
 
				-    node_module_path = _node_modules
			
 
				-    analyze_url = f'http://{es_conf["host"]}:{es_conf["port"]}/{es_conf["db"]}/_analyze'
			
 
				-
			
 
				-
			
 
				-with open(_yaml_constants, encoding="utf-8") as fp:
			
 
				-    _constants = yaml.safe_load(fp)
			
 
				-    headers: dict = _constants['headers']
			
--- a/zgzb/crawler/__init__.py
+++ b/zgzb/crawler/__init__.py
--- a/zgzb/crawler/crawl_spider.py
+++ b/zgzb/crawler/crawl_spider.py
@@ -1,201 +0,0 @@
 
				-import time
			
 
				-
			
 
				-from selenium.common.exceptions import (
			
 
				-    WebDriverException,
			
 
				-    TimeoutException,
			
 
				-    NoSuchElementException
			
 
				-)
			
 
				-
			
 
				-from common.databases import mongo_table, int2long, redis_client
			
 
				-from common.log import logger
			
 
				-from common.socks5 import Proxy
			
 
				-from common.tools import sha1, date2ts
			
 
				-from common.webdriver import WebDriver
			
 
				-from crawler.defaults import (
			
 
				-    goto,
			
 
				-    crawl_request,
			
 
				-    select_category,
			
 
				-    select_date,
			
 
				-    extract_text,
			
 
				-    crawl_psp_frame,
			
 
				-    crawl_show_details,
			
 
				-    next_page,
			
 
				-    wait_load_list,
			
 
				-    update_crawl_records,
			
 
				-    write_crawl_records,
			
 
				-    parser_list_elements,
			
 
				-    get_crawl_menu,
			
 
				-    get_category_id
			
 
				-)
			
 
				-
			
 
				-crawl_tab = mongo_table(db='py_spider', coll='zgzb_wagf_list')
			
 
				-save_tab = mongo_table(db='py_spider', coll='data_bak')
			
 
				-redis_key = 'zgzb_wagf_2022'
			
 
				-r = redis_client()
			
 
				-
			
 
				-
			
 
				-def crawl_spider(crawl_max_page=1, enable_proxy=False, **kw):
			
 
				-    proxy = Proxy(enable_proxy)
			
 
				-    headless = kw.get('headless', True)
			
 
				-    crawl_category = kw.get('crawl_category')
			
 
				-    crawl_date = kw.get('crawl_date', '今天')
			
 
				-    prev_num = page_num = 1
			
 
				-    while True:
			
 
				-        proxies = proxy.proxies
			
 
				-        logger.info(f"[采集代理]{proxies}")
			
 
				-        list_page_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do?tabledivIds=searchTabLi2'
			
 
				-        with WebDriver(load_images=False, proxy=proxies, headless=headless) as browser:
			
 
				-            ua = browser.execute_script('return navigator.userAgent')
			
 
				-            print('>>> ', ua)
			
 
				-            success_reqeust = crawl_request(browser, list_page_url)
			
 
				-            if not success_reqeust:
			
 
				-                proxy.switch()
			
 
				-                logger.error('[访问超时]请求列表页')
			
 
				-                continue
			
 
				-
			
 
				-            '''等待加载主页'''
			
 
				-            wait_load_list(browser)
			
 
				-            '''获取主页句柄'''
			
 
				-            main_handler = browser.current_window_handle
			
 
				-            '''选择分类'''
			
 
				-            category = select_category(browser, crawl_category)
			
 
				-            '''分类栏目列表'''
			
 
				-            crawl_menu = get_crawl_menu(category)
			
 
				-            if crawl_menu is None:
			
 
				-                browser.quit()
			
 
				-                logger.info("任务结束")
			
 
				-                break
			
 
				-
			
 
				-            logger.info(f"[分类栏目]{category}")
			
 
				-
			
 
				-            '''选择建立时间'''
			
 
				-            success_select_date = select_date(browser, category, crawl_date)
			
 
				-            if not success_select_date:
			
 
				-                proxy.switch()
			
 
				-                continue
			
 
				-
			
 
				-            exit_crawl = False
			
 
				-            allow_next_page = False
			
 
				-            while True:
			
 
				-                if exit_crawl:
			
 
				-                    proxy.switch()
			
 
				-                    break
			
 
				-
			
 
				-                if allow_next_page:
			
 
				-                    allow_next_page = True
			
 
				-                    try:
			
 
				-                        page_num = next_page(browser, category)
			
 
				-                        if page_num is None or (page_num > crawl_max_page):
			
 
				-                            browser.quit()
			
 
				-                            proxy.switch()
			
 
				-                            update_crawl_records(category, True)
			
 
				-                            break
			
 
				-                        elif page_num != prev_num and page_num % 2 == 0:
			
 
				-                            '''每个代理IP仅采集2页，轮询使用代理'''
			
 
				-                            browser.quit()
			
 
				-                            proxy.switch()
			
 
				-                            prev_num = page_num
			
 
				-                            break
			
 
				-                    except TimeoutException:
			
 
				-                        browser.quit()
			
 
				-                        proxy.switch()
			
 
				-                        logger.error('[访问超时]请求翻页')
			
 
				-                        break
			
 
				-                else:
			
 
				-                    allow_next_page = True
			
 
				-
			
 
				-                '''详情页'''
			
 
				-                web_elements = parser_list_elements(browser, category)
			
 
				-                if web_elements is None:
			
 
				-                    proxy.switch()
			
 
				-                    break
			
 
				-
			
 
				-                for index, element in enumerate(web_elements):
			
 
				-                    index += 1
			
 
				-                    item = {
			
 
				-                        "site": "中国招标投标公共服务平台",
			
 
				-                        "channel": crawl_menu.channel,
			
 
				-                        "spidercode": crawl_menu.spidercode,
			
 
				-                        "T": "bidding",
			
 
				-                        "sendflag": "false",
			
 
				-                        "_d": "comeintime",
			
 
				-                        "comeintime": '',
			
 
				-                        "area": '',
			
 
				-                        "city": '',
			
 
				-                        "publishdept": "",
			
 
				-                        "title": "",
			
 
				-                        "href": "",
			
 
				-                        "publishtime": "",
			
 
				-                        "l_np_publishtime": "",
			
 
				-                    }
			
 
				-                    html = browser.page_source
			
 
				-                    category_id = get_category_id(category)
			
 
				-                    click_detail_js = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@onclick')).strip()
			
 
				-                    href_ = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@href')).strip()
			
 
				-                    detail_js = (click_detail_js or href_)
			
 
				-                    sign = sha1(detail_js)
			
 
				-                    print(f'>>> {sign}')
			
 
				-                    if r.hexists(redis_key, sign):
			
 
				-                        continue
			
 
				-                    '''发布标题'''
			
 
				-                    node1 = element.find_element_by_xpath('./td[1]/a')
			
 
				-                    title = node1.text
			
 
				-                    item['title'] = title
			
 
				-                    '''省市'''
			
 
				-                    node2 = element.find_element_by_xpath('./td[3]/span')
			
 
				-                    region = str(node2.text).replace('【', '').replace('】', '')
			
 
				-                    if region.find(" ") > 0:
			
 
				-                        province, city = region.split(' ')
			
 
				-                    else:
			
 
				-                        province = region
			
 
				-                        city = ''
			
 
				-                    item['area'] = str(province).replace('省', '').replace('市', '').replace('自治区', '')
			
 
				-                    item['city'] = city
			
 
				-                    '''发布时间'''
			
 
				-                    node3 = element.find_element_by_xpath('./td[5]')
			
 
				-                    publish_time = node3.text
			
 
				-                    item['publishtime'] = publish_time
			
 
				-                    item['l_np_publishtime'] = int2long(date2ts(publish_time))
			
 
				-                    item['comeintime'] = int2long(int(time.time()))
			
 
				-                    '''访问详情页'''
			
 
				-                    goto(browser, node1, wait_time=2)
			
 
				-                    '''详情页'''
			
 
				-                    item['href'] = '#'
			
 
				-                    detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do'
			
 
				-                    if detail_js.startswith('showDetails') is False:
			
 
				-                        item['competehref'] = detail_url
			
 
				-                        try:
			
 
				-                            item = crawl_psp_frame(browser, main_handler, item)
			
 
				-                        except NoSuchElementException:
			
 
				-                            exit_crawl = True
			
 
				-                            break
			
 
				-                    else:
			
 
				-                        item['competehref'] = '{}/{}'.format(detail_url, sign)
			
 
				-                        try:
			
 
				-                            item = crawl_show_details(browser, main_handler, item)
			
 
				-                        except (ValueError, WebDriverException) as e:
			
 
				-                            browser.quit()
			
 
				-                            exit_crawl = True
			
 
				-                            if e.__class__.__name__ == 'ValueError':
			
 
				-                                logger.error("[机器人验证]验证失败")
			
 
				-                            break
			
 
				-                    '''入库处理'''
			
 
				-                    if 'contenthtml' not in item:
			
 
				-                        item['crawl_status'] = 'detail_err'
			
 
				-                    else:
			
 
				-                        item['crawl_status'] = 'success'
			
 
				-                        '''保存详情'''
			
 
				-                        save_tab.insert_one(item)
			
 
				-                        del item['contenthtml'], item['detail']
			
 
				-                        if '_id' in item:
			
 
				-                            del item['_id']
			
 
				-                        logger.info(f'[采集成功-{item["channel"]}]{title} - {publish_time}')
			
 
				-                    '''备注:详情页访问参数'''
			
 
				-                    item['remark'] = detail_js
			
 
				-                    '''添加数据指纹'''
			
 
				-                    r.hset(redis_key, sign, '')
			
 
				-                    '''保存列表'''
			
 
				-                    crawl_tab.insert_one(item)
			
 
				-
			
 
				-                logger.info(f"[{category}-第{page_num}页]采集完成")
			
 
				-                write_crawl_records(category, page_num)
			
--- a/zgzb/crawler/defaults.py
+++ b/zgzb/crawler/defaults.py
@@ -1,362 +0,0 @@
 
				-import time
			
 
				-
			
 
				-from selenium.common.exceptions import (
			
 
				-    WebDriverException,
			
 
				-    TimeoutException,
			
 
				-    InvalidSessionIdException,
			
 
				-    NoSuchElementException
			
 
				-)
			
 
				-from selenium.webdriver import ActionChains
			
 
				-from selenium.webdriver.common.by import By
			
 
				-
			
 
				-from common.clean_html import cleaner
			
 
				-from common.databases import int2long
			
 
				-from common.log import logger
			
 
				-from common.tools import html2element, element2html, verify_text, remove_node
			
 
				-from common.webdriver import until_wait
			
 
				-from crawler.params import (
			
 
				-    CRAWL_RECORDS,
			
 
				-    SETUP_TIME,
			
 
				-    SETUP_MAPS,
			
 
				-    CATEGORY_MAPS,
			
 
				-    CRAWL_MENU
			
 
				-)
			
 
				-
			
 
				-
			
 
				-def get_crawl_menu(category: str):
			
 
				-    """采集清单"""
			
 
				-    return CRAWL_MENU.get(category)
			
 
				-
			
 
				-
			
 
				-def get_category_id(category: str):
			
 
				-    """分类id"""
			
 
				-    return CATEGORY_MAPS[category]
			
 
				-
			
 
				-
			
 
				-def extract_text(html: str, feature: str):
			
 
				-    """抽取文本"""
			
 
				-    element = html2element(html)
			
 
				-    return element.xpath(feature)
			
 
				-
			
 
				-
			
 
				-def extract_page_html(html: str, feature: str):
			
 
				-    """抽取页面源码"""
			
 
				-    element = html2element(html)
			
 
				-    '''移除空附件信息'''
			
 
				-    remove_target = element.xpath('//div[@id="isshow"]')
			
 
				-    if len(remove_target) > 0:
			
 
				-        remove_node(remove_target[0])
			
 
				-    try:
			
 
				-        node = element.xpath(feature)[0]
			
 
				-        return element2html(node)
			
 
				-    except IndexError:
			
 
				-        pass
			
 
				-
			
 
				-
			
 
				-def init_crawl_records(driver, web_element, category: str):
			
 
				-    """初始记录"""
			
 
				-    if category not in CRAWL_RECORDS:
			
 
				-        goto(driver, web_element)
			
 
				-        # init_config = {'finished': False, 'pages': ['1']}
			
 
				-        init_config = {'finished': False, 'pages': []}
			
 
				-        CRAWL_RECORDS.setdefault(category, init_config)
			
 
				-        return True
			
 
				-    else:
			
 
				-        _record = CRAWL_RECORDS[category]
			
 
				-        if not _record['finished']:
			
 
				-            goto(driver, web_element)
			
 
				-            return True
			
 
				-        else:
			
 
				-            return False
			
 
				-
			
 
				-
			
 
				-def update_crawl_records(category: str, finished: bool):
			
 
				-    """更新记录"""
			
 
				-    if category in CRAWL_RECORDS:
			
 
				-        _record = CRAWL_RECORDS[category]
			
 
				-        _record['finished'] = finished
			
 
				-        CRAWL_RECORDS.update(_record)
			
 
				-
			
 
				-
			
 
				-def write_crawl_records(category: str, page_num: int):
			
 
				-    """写入记录"""
			
 
				-    if category in CRAWL_RECORDS:
			
 
				-        _record = CRAWL_RECORDS[category]
			
 
				-        '''记录采集页码,已记录页码不在访问'''
			
 
				-        finished_pages = _record['pages']
			
 
				-        finished_pages.append(str(page_num))
			
 
				-        _record.update({'pages': finished_pages})
			
 
				-        CRAWL_RECORDS.update({category: _record})
			
 
				-
			
 
				-
			
 
				-def robots_alert(driver):
			
 
				-    """机器人警告"""
			
 
				-    wait = 0
			
 
				-    while wait < 20:
			
 
				-        '''等待验证模块加载'''
			
 
				-        element = html2element(driver.page_source)
			
 
				-        robot_alert = element.xpath('//span[@class="nc-lang-cnt"]/@data-nc-lang')
			
 
				-        click_alert = element.xpath('//div[@id="text"]/text()')
			
 
				-        if len(robot_alert) == 1 and "".join(robot_alert).strip() == '_Loading':
			
 
				-            time.sleep(0.5)
			
 
				-        elif len(robot_alert) > 1 and robot_alert[0] in ['_yesTEXT']:
			
 
				-            '''通过机器人验证'''
			
 
				-            return False, '0'
			
 
				-        elif len(robot_alert) > 1 and robot_alert[0] in ['_startTEXT']:
			
 
				-            '''机器人验证加载完成'''
			
 
				-            return True, '1'
			
 
				-        elif len(robot_alert) > 1 and robot_alert[0] in ['_errorNetwork']:
			
 
				-            '''网络不给力，请点击刷新，或提交反馈 (00)'''
			
 
				-            return True, '2'
			
 
				-        elif len(click_alert) > 0 and "".join(click_alert) == '请点击此处完成验证或咨询客服':
			
 
				-            return True, '3'
			
 
				-        else:
			
 
				-            return False, '0'
			
 
				-
			
 
				-        wait += 1
			
 
				-    return True, '999'
			
 
				-
			
 
				-
			
 
				-def check_robots_alert(driver):
			
 
				-    """检查并处理机器人警告"""
			
 
				-    while True:
			
 
				-        alert, alert_type = robots_alert(driver)
			
 
				-        if not alert:
			
 
				-            break
			
 
				-
			
 
				-        if alert_type == '1':
			
 
				-            until_wait(driver, xpath='//span[contains(@class, "nc_iconfont btn_slide")]')
			
 
				-            element = driver.find_element_by_xpath('//span[contains(@class, "nc_iconfont btn_slide")]')
			
 
				-            if element.is_displayed():
			
 
				-                # 点击并且不松开鼠标,往右边移动258个位置,松开鼠标
			
 
				-                ActionChains(driver).click_and_hold(element).move_by_offset(xoffset=258, yoffset=0).release().perform()
			
 
				-
			
 
				-        elif alert_type == '2':
			
 
				-            until_wait(driver, xpath='//span[contains(@class, "nc-lang-cnt")]/a[1]')
			
 
				-            element = driver.find_element_by_xpath('//span[contains(@class, "nc-lang-cnt")]/a[1]')
			
 
				-            if element.is_displayed():
			
 
				-                goto(driver, element, wait_time=2)
			
 
				-
			
 
				-        elif alert_type == '3':
			
 
				-            # until_wait(driver, xpath='//div[@id="container"]')
			
 
				-            # element = driver.find_element_by_xpath('//div[@id="container"]')
			
 
				-            # if element.is_displayed():
			
 
				-            #     goto(driver, element, wait_time=3)
			
 
				-            #     driver.switch_to.alert.accept()
			
 
				-            logger.error("[机器人验证]触发浏览器指纹检测,无法自动处理.")
			
 
				-            raise ValueError()
			
 
				-
			
 
				-        else:
			
 
				-            with open('robot.html', 'w') as wp:
			
 
				-                wp.write(driver.page_source)
			
 
				-            logger.error("[未知异常网页]页面源码保存在robot.html")
			
 
				-            raise ValueError()
			
 
				-        time.sleep(2)
			
 
				-
			
 
				-
			
 
				-def refresh_page(driver):
			
 
				-    """刷新页面"""
			
 
				-    element = html2element(driver.page_source)
			
 
				-    node = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
			
 
				-    if "".join(node) == "暂无详细数据":
			
 
				-        driver.refresh()
			
 
				-        time.sleep(1)
			
 
				-        '''页面alert元素确定'''
			
 
				-        driver.switch_to.alert.accept()
			
 
				-    time.sleep(1.5)
			
 
				-    wait_load_detail(driver)
			
 
				-    check_robots_alert(driver)
			
 
				-
			
 
				-
			
 
				-def goto(driver, web_element, wait_time=None, allow_check_page=False):
			
 
				-    """执行可点击js事件"""
			
 
				-    driver.execute_script("arguments[0].click();", web_element)
			
 
				-    _wait_time = (wait_time or 1)
			
 
				-    time.sleep(_wait_time)
			
 
				-    if allow_check_page:
			
 
				-        check_robots_alert(driver)
			
 
				-
			
 
				-
			
 
				-def next_page(driver, category):
			
 
				-    """翻页"""
			
 
				-    _finished_pages = CRAWL_RECORDS[category]['pages']
			
 
				-    while True:
			
 
				-        next_element = driver.find_element_by_xpath('//div[@id="Pagination"]/div[1]/child::a[last()]')
			
 
				-        if next_element.text == '下一页':
			
 
				-            goto(driver, next_element, wait_time=1.2)
			
 
				-            current_element = driver.find_element_by_xpath('//div[@id="Pagination"]/div[1]/child::span[@class="current"]')
			
 
				-            val = current_element.text
			
 
				-            if val not in _finished_pages:
			
 
				-                return int(val)
			
 
				-        else:
			
 
				-            break
			
 
				-    time.sleep(1)
			
 
				-
			
 
				-
			
 
				-def _select_category(driver, custom_category=None):
			
 
				-    """采集分类"""
			
 
				-    web_elements = driver.find_elements(by=By.XPATH, value='//ul[@id="myTab3"]/child::li')
			
 
				-    for element in web_elements:
			
 
				-        val = element.text
			
 
				-        if custom_category is None:
			
 
				-            success = init_crawl_records(driver, element, val)
			
 
				-            return val if success else None
			
 
				-        else:
			
 
				-            if val == custom_category:
			
 
				-                success = init_crawl_records(driver, element, custom_category)
			
 
				-                return val if success else None
			
 
				-
			
 
				-
			
 
				-def select_category(driver, category: str):
			
 
				-    """选择分类"""
			
 
				-    try:
			
 
				-        _category = _select_category(driver, category)
			
 
				-        return _category
			
 
				-    except TimeoutException:
			
 
				-        driver.quit()
			
 
				-        logger.error('[访问超时]选择分类')
			
 
				-        return None
			
 
				-
			
 
				-
			
 
				-def _select_date(driver, category: str, setup_time: str):
			
 
				-    """选择建立时间"""
			
 
				-    logger.info(f"[建立时间]{setup_time}")
			
 
				-    try:
			
 
				-        attr = SETUP_TIME[category][SETUP_MAPS[setup_time]]
			
 
				-        element = driver.find_element(by=By.XPATH, value=f'//a[@id="{attr}"]')
			
 
				-        goto(driver, element)
			
 
				-    except KeyError:
			
 
				-        raise KeyError(f'请设置"SETUP_TIME"中{category}对应的建立时间')
			
 
				-
			
 
				-
			
 
				-def select_date(driver, category, setup_time):
			
 
				-    try:
			
 
				-        _select_date(driver, category, setup_time)
			
 
				-        return True
			
 
				-    except TimeoutException:
			
 
				-        driver.quit()
			
 
				-        logger.error('[访问超时]选择建立时间')
			
 
				-        return False
			
 
				-
			
 
				-
			
 
				-def wait_load_detail(driver, check_feature=None, check_timeout=None):
			
 
				-    """等待二次加载页面结果并检测元素变化"""
			
 
				-    _check_timeout = (check_timeout or 10)
			
 
				-    sleep_interval = 0.5
			
 
				-    max_check_count = int(_check_timeout / sleep_interval)
			
 
				-    if check_feature is not None:
			
 
				-        check_count = 0
			
 
				-        while check_count < max_check_count:
			
 
				-            element = html2element(driver.page_source)
			
 
				-            check_node = element.xpath(check_feature)
			
 
				-            if len(check_node) > 0:
			
 
				-                break
			
 
				-            time.sleep(sleep_interval)
			
 
				-            check_count += 1
			
 
				-    else:
			
 
				-        check_count = 0
			
 
				-        while check_count < max_check_count:
			
 
				-            element = html2element(driver.page_source)
			
 
				-            root = element.xpath('//div[@id="xxnrList"]')
			
 
				-            if len(root) > 0:
			
 
				-                descendant = element.xpath('//div[@id="xxnrList"]/descendant::*')
			
 
				-                if len(descendant) > 0:
			
 
				-                    text = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
			
 
				-                    children = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/child::*')
			
 
				-                    if "".join(text) != '暂无详细数据' and len(children) > 0:
			
 
				-                        break
			
 
				-            time.sleep(sleep_interval)
			
 
				-            check_count += 1
			
 
				-    time.sleep(1)
			
 
				-
			
 
				-
			
 
				-def wait_load_list(driver):
			
 
				-    while True:
			
 
				-        element = html2element(driver.page_source)
			
 
				-        node = element.xpath('//div[@id="myTab_Contenta0"]/div[2]/table//tr')
			
 
				-        if len(node) > 0:
			
 
				-            break
			
 
				-        time.sleep(0.5)
			
 
				-
			
 
				-
			
 
				-def _handler_page_html(html, item):
			
 
				-    """页面源码处理"""
			
 
				-    if all([html is not None, verify_text(html)]):
			
 
				-        item['contenthtml'] = html
			
 
				-        item['detail'] = cleaner(html)
			
 
				-        item['comeintime'] = int2long(int(time.time()))
			
 
				-    else:
			
 
				-        logger.error(
			
 
				-            f'[文本异常-{item["channel"]}]{item["title"]} - {item["publishtime"]}')
			
 
				-    return item
			
 
				-
			
 
				-
			
 
				-def crawl_show_details(driver, handler, item):
			
 
				-    for current_handler in driver.window_handles:
			
 
				-        if current_handler == handler:
			
 
				-            continue
			
 
				-        driver.switch_to.window(current_handler)
			
 
				-        '''加载等待并检查指定页面特征'''
			
 
				-        wait_load_detail(driver, check_feature='//div[@id="xxnrList"]/div[1]/div[2]/div[2]')
			
 
				-        '''检查机器人警告并处理'''
			
 
				-        check_robots_alert(driver)
			
 
				-        '''二次加载'''
			
 
				-        refresh_page(driver)
			
 
				-        '''加载等待'''
			
 
				-        wait_load_detail(driver)
			
 
				-        '''抽取源码'''
			
 
				-        content_html = extract_page_html(driver.page_source, feature='//div[@id="xxnrList"]')
			
 
				-        item = _handler_page_html(content_html, item)
			
 
				-    '''关闭当前页'''
			
 
				-    driver.close()
			
 
				-    '''切换主页'''
			
 
				-    driver.switch_to.window(handler)
			
 
				-    return item
			
 
				-
			
 
				-
			
 
				-def crawl_psp_frame(driver, handler, item):
			
 
				-    """Frame页面"""
			
 
				-    for current_handler in driver.window_handles:
			
 
				-        if current_handler == handler:
			
 
				-            continue
			
 
				-        driver.switch_to.window(current_handler)
			
 
				-        wait_load_detail(
			
 
				-            driver,
			
 
				-            check_feature='//div[contains(@id, "mini-1")]',
			
 
				-            check_timeout=15
			
 
				-        )
			
 
				-        '''切换到frame'''
			
 
				-        try:
			
 
				-            driver.switch_to.frame('mini-iframe-6')
			
 
				-        except NoSuchElementException:
			
 
				-            driver.quit()
			
 
				-            logger.error(f'[未检测到iframe-{item["channel"]}]{item["title"]} - {item["competehref"]}')
			
 
				-            raise NoSuchElementException()
			
 
				-        '''等待加载数据'''
			
 
				-        wait_load_detail(driver, check_feature='//div[contains(@role, "accordion")]')
			
 
				-        content_html = extract_page_html(driver.page_source, feature='//div[@class="fui-accordions"]')
			
 
				-        item = _handler_page_html(content_html, item)
			
 
				-    '''关闭当前页'''
			
 
				-    driver.close()
			
 
				-    '''切换主页'''
			
 
				-    driver.switch_to.window(handler)
			
 
				-    return item
			
 
				-
			
 
				-
			
 
				-def crawl_request(driver, url):
			
 
				-    try:
			
 
				-        driver.get(url)
			
 
				-        return True
			
 
				-    except WebDriverException:
			
 
				-        driver.quit()
			
 
				-        return False
			
 
				-
			
 
				-
			
 
				-def parser_list_elements(driver, category):
			
 
				-    try:
			
 
				-        web_elements = driver.find_elements(by=By.XPATH, value=f'//*[@id="{CATEGORY_MAPS[category]}"]//tr')
			
 
				-        return web_elements
			
 
				-    except InvalidSessionIdException:
			
 
				-        driver.quit()
			
 
				-        logger.error('[数据解析]获取列表失败')
			
 
				-        return None
			
--- a/zgzb/crawler/params.py
+++ b/zgzb/crawler/params.py
@@ -1,70 +0,0 @@
 
				-from collections import namedtuple
			
 
				-
			
 
				-__all__ = [
			
 
				-    'CRAWL_RECORDS',
			
 
				-    'CATEGORY_MAPS',
			
 
				-    'SETUP_MAPS',
			
 
				-    'SETUP_TIME',
			
 
				-    'CRAWL_MENU'
			
 
				-]
			
 
				-
			
 
				-'''采集记录'''
			
 
				-CRAWL_RECORDS = {}
			
 
				-'''分类'''
			
 
				-CATEGORY_MAPS = {
			
 
				-    '招标项目': 'tenderProjectTab',
			
 
				-    '招标公告': 'tenderBulletin',
			
 
				-    '开标记录': 'openBidRecord',
			
 
				-    '评标公示': 'bidEvaluation',
			
 
				-    '中标公告': 'winBidBulletin',
			
 
				-    # '签约履行': '',
			
 
				-}
			
 
				-'''建立时间'''
			
 
				-SETUP_MAPS = {
			
 
				-    '今天': 'jt',
			
 
				-    '2天内': '2tq',
			
 
				-    '3天内': '3tq',
			
 
				-    '1周内': '1zn',
			
 
				-}
			
 
				-SETUP_TIME = {
			
 
				-    '招标项目': {
			
 
				-        'jt': 'tenderProject_begin1',
			
 
				-        '2tq': 'tenderProject_begin2',
			
 
				-        '3tq': 'tenderProject_begin3',
			
 
				-        '1zn': 'tenderProject_begin7'
			
 
				-    },
			
 
				-    '招标公告': {
			
 
				-        'jt': 'tenderBulletin_begin1',
			
 
				-        '2tq': 'tenderBulletin_begin2',
			
 
				-        '3tq': 'tenderBulletin_begin3',
			
 
				-        '1zn': 'tenderBulletin_begin7'
			
 
				-    },
			
 
				-    '开标记录': {
			
 
				-        'jt': 'openBidRecord_1',
			
 
				-        '2tq': 'openBidRecord_2',
			
 
				-        '3tq': 'openBidRecord_3',
			
 
				-        '1zn': 'openBidRecord_7'
			
 
				-    },
			
 
				-    '评标公示': {
			
 
				-        'jt': 'bidEvaluation_1',
			
 
				-        '2tq': 'bidEvaluation_2',
			
 
				-        '3tq': 'bidEvaluation_3',
			
 
				-        '1zn': 'bidEvaluation_7'
			
 
				-    },
			
 
				-    '中标公告': {
			
 
				-        'jt': 'winBidBulletin_1',
			
 
				-        '2tq': 'winBidBulletin_2',
			
 
				-        '3tq': 'winBidBulletin_3',
			
 
				-        '1zn': 'winBidBulletin_7'
			
 
				-    }
			
 
				-}
			
 
				-'''爬虫清单'''
			
 
				-CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode'])
			
 
				-CRAWL_MENU = {
			
 
				-    '招标项目': CrawlMenu('未按数据规范-招标项目', 'a_zgzbtbggfwpt_wasjgf_zbxm'),
			
 
				-    '招标公告': CrawlMenu('未按数据规范-招标公告', 'a_zgzbtbggfwpt_wasjgf_zbgg'),
			
 
				-    '开标记录': CrawlMenu('未按数据规范-开标记录', 'a_zgzbtbggfwpt_wasjgf_kbjl'),
			
 
				-    '评标公示': CrawlMenu('未按数据规范-评标公示', 'a_zgzbtbggfwpt_wasjgf_pbgs'),
			
 
				-    '中标公告': CrawlMenu('未按数据规范-中标公告', 'a_zgzbtbggfwpt_wasjgf_zhbgg'),
			
 
				-    '签约履行': CrawlMenu('未按数据规范-签约履行', 'a_zgzbtbggfwpt_wasjgf_qylx'),
			
 
				-}
			
--- a/zgzb/kbjl.py
+++ b/zgzb/kbjl.py
@@ -1,14 +0,0 @@
 
				-from crawler.crawl_spider import crawl_spider
			
 
				-
			
 
				-
			
 
				-def main():
			
 
				-    crawl_spider(
			
 
				-        crawl_category='开标记录',
			
 
				-        crawl_max_page=30,
			
 
				-        enable_proxy=True,
			
 
				-        headless=True,
			
 
				-    )
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    main()
			
--- a/zgzb/pbjs.py
+++ b/zgzb/pbjs.py
@@ -1,14 +0,0 @@
 
				-from crawler.crawl_spider import crawl_spider
			
 
				-
			
 
				-
			
 
				-def main():
			
 
				-    crawl_spider(
			
 
				-        crawl_category='评标公示',
			
 
				-        crawl_max_page=30,
			
 
				-        enable_proxy=True,
			
 
				-        headless=True,
			
 
				-    )
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    main()
			
--- a/zgzb/zbgg.py
+++ b/zgzb/zbgg.py
@@ -1,14 +0,0 @@
 
				-from crawler.crawl_spider import crawl_spider
			
 
				-
			
 
				-
			
 
				-def main():
			
 
				-    crawl_spider(
			
 
				-        crawl_category='招标公告',
			
 
				-        crawl_max_page=30,
			
 
				-        enable_proxy=True,
			
 
				-        headless=True,
			
 
				-    )
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    main()
			
--- a/zgzb/zbxm.py
+++ b/zgzb/zbxm.py
@@ -1,14 +0,0 @@
 
				-from crawler.crawl_spider import crawl_spider
			
 
				-
			
 
				-
			
 
				-def main():
			
 
				-    crawl_spider(
			
 
				-        crawl_category='招标项目',
			
 
				-        crawl_max_page=30,
			
 
				-        enable_proxy=True,
			
 
				-        headless=True,
			
 
				-    )
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    main()
			
--- a/zgzb/zhbgg.py
+++ b/zgzb/zhbgg.py
@@ -1,14 +0,0 @@
 
				-from crawler.crawl_spider import crawl_spider
			
 
				-
			
 
				-
			
 
				-def main():
			
 
				-    crawl_spider(
			
 
				-        crawl_category='中标公告',
			
 
				-        crawl_max_page=30,
			
 
				-        enable_proxy=True,
			
 
				-        headless=True,
			
 
				-    )
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    main()