
Delete defunct legacy code for the China Tendering and Bidding Public Service Platform [non-standard data] crawler

dzr 2 months ago
parent
commit
e7a3462dd1

+ 0 - 0
zgzb/__init__.py


+ 0 - 0
zgzb/common/__init__.py


+ 0 - 23
zgzb/common/aliyun.py

@@ -1,23 +0,0 @@
-import oss2
-
-from config.load import oss_conf
-
-
-class AliYunService:
-
-    def __init__(self):
-        self.__acc_key_id = oss_conf['key_id']
-        self.__acc_key_secret = oss_conf['key_secret']
-        self.__endpoint = oss_conf['endpoint']
-        self.__bucket_name = oss_conf['bucket_name']
-
-    def _push_oss_from_local(self, key, filename):
-        """
-        上传一个本地文件到OSS的普通文件
-
-        :param str key: 上传到OSS的文件名
-        :param str filename: 本地文件名,需要有可读权限
-        """
-        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
-        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
-        bucket.put_object_from_file(key, filename)

+ 0 - 224
zgzb/common/attachment.py

@@ -1,224 +0,0 @@
-import hashlib
-import os
-import re
-import traceback
-import uuid
-from urllib.parse import urlparse, unquote
-
-import requests
-import urllib3
-
-from common.aliyun import AliYunService
-from common.execptions import AttachmentNullError
-from common.log import logger
-from common.socks5 import Proxy
-from config.load import headers
-
-urllib3.disable_warnings()
-# 文件文档类型
-DOCTYPE = {
-    'txt', 'rtf', 'dps', 'et', 'ett', 'xls',
-    'xlsx', 'xlsb', 'xlsm', 'xlt', 'ods', 'pmd', 'pmdx',
-    'doc', 'docm', 'docx', 'dot', 'dotm', 'dotx',
-    'odt', 'wps', 'csv', 'xml', 'xps'
-}
-# 压缩类型
-COMPRESSION_TYPE = {
-    'rar', 'zip', 'gzzb', '7z', 'tar', 'gz', 'bz2', 'jar', 'iso', 'cab',
-    'arj', 'lzh', 'ace', 'uue', 'edxz',
-}
-# 图片类型
-IMAGE_TYPE = {
-    'jpg', 'png', 'jpeg', 'tiff', 'gif', 'psd', 'raw', 'eps', 'svg', 'bmp',
-    'pdf'
-}
-# 其他类型
-OTHER_TYPE = {
-    'swf', 'nxzf', 'xezf', 'nxcf'
-}
-
-
-def sha1(val):
-    _sha1 = hashlib.sha1()
-    if isinstance(val, bytes):
-        _sha1.update(str(val).encode("utf-8"))
-    elif isinstance(val, str):
-        _sha1.update(val.encode("utf-8"))
-    return _sha1.hexdigest()
-
-
-def remove(file_path: str):
-    os.remove(file_path)
-
-
-def getsize(file):
-    try:
-        return os.path.getsize(file)
-    except FileNotFoundError:
-        return 0
-
-
-def discern_file_format(text):
-    file_types = {
-        *DOCTYPE,
-        *COMPRESSION_TYPE,
-        *IMAGE_TYPE,
-        *OTHER_TYPE
-    }
-    for file_type in file_types:
-        all_file_format = [file_type, file_type.upper()]
-        for t in all_file_format:
-            result = re.match(f'.*{t}$', text, re.S)
-            if result is not None:
-                return t
-    else:
-        unknown_type = re.findall('[^.\\/:*?"<>|\r\n]+$', text, re.S)
-        logger.warning(f'[附件类型识别]未定义的文件类型{unknown_type}')
-        return None
-
-
-def extract_file_type(text):
-    if text is None:
-        return None
-    return discern_file_format(text)
-
-
-def extract_file_name_by_href(href: str, file_type: str):
-    """从url中抽取文件名称"""
-    # 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
-    # 中文字符:[\u4e00 -\u9fa5]
-    zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
-    parser = urlparse(href)
-    query = (parser.query or parser.path)
-    result = re.search(f'.*\\.{file_type}', query, re.S)
-    if result is not None:
-        encode_str = unquote(result.group())
-        name = re.search(zh_char_pattern, encode_str)
-        if name is not None:
-            return unquote(name.group())
-    return None
-
-
-def extract_file_name(text):
-    file_type = discern_file_format(text)
-    if file_type is not None:
-        repl = '.{}'.format(file_type)
-        text = text.replace(repl, '')
-    return text
-
-
-def verify_file_name(name):
-    if extract_file_type(name) is None:
-        raise ValueError
-
-
-class AttachmentDownloader(AliYunService):
-
-    def __init__(self):
-        super(AttachmentDownloader, self).__init__()
-        self.dir_name = 'file'
-
-    def _create_file(self, filename, filetype):
-        os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
-        file = "{filename}.{filetype}".format(
-            filename=sha1("{}_{}".format(filename, uuid.uuid4())),
-            filetype=filetype
-        )
-        return "{}/{}".format(self.dir_name, file)
-
-    @staticmethod
-    def _create_fid(file_stream: bytes):
-        return sha1(file_stream)
-
-    @staticmethod
-    def _origin_filename(fid: str, filetype: str):
-        return "{}.{}".format(fid, filetype)
-
-    @staticmethod
-    def _file_size(file: str):
-        _kb = float(getsize(file)) / 1024
-        if _kb >= 1024:
-            _M = _kb / 1024
-            if _M >= 1024:
-                _G = _M / 1024
-                return "{:.1f} G".format(_G)
-            else:
-                return "{:.1f} M".format(_M)
-        else:
-            return "{:.1f} kb".format(_kb)
-
-    @staticmethod
-    def _download(
-            url: str,
-            file: str,
-            enable_proxy=False,
-            allow_show_exception=False,
-            **kwargs
-    ):
-        request_params = {}
-        request_params.setdefault('headers', kwargs.get('headers') or headers)
-        request_params.setdefault('proxies', kwargs.get('proxies'))
-        request_params.setdefault('timeout', kwargs.get('timeout') or 60)
-        request_params.setdefault('stream', kwargs.get('stream') or True)
-        request_params.setdefault('verify', kwargs.get('verify') or False)
-        proxy = Proxy(enable_proxy)
-        retries = 0
-        while retries < 3:
-            try:
-                with requests.get(url, **request_params) as req:
-                    if req.status_code == 200:
-                        stream = req.content
-                        with open(file, 'wb') as f:
-                            f.write(stream)
-                        return stream
-                    else:
-                        retries += 1
-            except requests.RequestException:
-                if allow_show_exception:
-                    traceback.print_exc()
-                if enable_proxy:
-                    proxy.switch()
-                    request_params.update({'proxies': proxy.proxies})
-                retries += 1
-        return b''
-
-    def download(
-            self,
-            file_name: str,
-            file_type: str,
-            download_url: str,
-            enable_proxy=False,
-            allow_request_exception=False,
-            **kwargs
-    ):
-        if not file_name or not file_type or not download_url:
-            raise AttachmentNullError
-
-        local_tmp_file = self._create_file(file_name, file_type)
-        file_stream = self._download(
-            download_url,
-            local_tmp_file,
-            enable_proxy,
-            allow_request_exception,
-            **kwargs
-        )
-        result = {
-            'filename': '{}.{}'.format(file_name, file_type),
-            'org_url': download_url
-        }
-        if len(file_stream) > 0:
-            try:
-                fid = self._create_fid(file_stream)
-                key = self._origin_filename(fid, file_type)
-                result.setdefault('fid', key)
-                result.setdefault('ftype', file_type)
-                result.setdefault('size', self._file_size(local_tmp_file))
-                result.setdefault('url', 'oss')
-                super()._push_oss_from_local(key, local_tmp_file)
-            except Exception as e:
-                logger.warning(
-                    "[{}]下载异常,原因:{}".format(file_name, e.__class__.__name__)
-                )
-        remove(local_tmp_file)
-        '''上传/下载,无论失败/成功必须返回附件信息'''
-        return result
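
For reference, a minimal sketch of how the removed AttachmentDownloader was driven (the import path assumes the pre-deletion zgzb package layout; the file name and URL are placeholders):

    from common.attachment import AttachmentDownloader

    downloader = AttachmentDownloader()
    # download() writes the file to a local temp path, uploads it to OSS and
    # deletes the temp file; attachment metadata is returned even on failure.
    attachment = downloader.download(
        file_name='招标文件',                          # placeholder display name
        file_type='pdf',
        download_url='http://example.com/file.pdf',   # placeholder URL
        enable_proxy=False,
    )
    print(attachment)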

+ 0 - 133
zgzb/common/clean_html.py

@@ -1,133 +0,0 @@
-import re
-
-__all__ = ['cleaner']
-
-# 独立元素
-INDEPENDENT_TAGS = {
-    '<head>[\s\S]*?</head>': '',
-    '<html>|<html [^>]*>|</html>': '',
-    '<body>|<body [^>]*>|</body>': '',
-    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # 元数据
-    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # 空格
-    '\\xa0|\\u3000': '',  # 空格
-    '<!--[\s\S]*?-->': '',  # 注释
-    '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
-    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
-    '<input>': '',  # 输入框
-    '<img[^>]*>': '<br>',  # 图片
-}
-# 行内元素
-INLINE_TAGS = {
-    '<a>|<a [^>]*>|</a>': '',  # 超链接
-    '<span>|<span [^>]*>|</span>': '',  # span
-    '<label>|<label [^>]*>|</label>': '<br>',  # label
-    '<font>|<font [^>]*>|</font>': '',  # font
-}
-# 块级元素
-BLOCK_TAGS = {
-    '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # 标题
-    # '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
-    '<p>|<p [^>]*>|</p>': '<br>',  # 段落
-    '<div>|<div [^>]*>|</div>': '<br>',  # 分割 division
-    '<o:p>|<o:p [^>]*>|</o:p>': ''  # OFFICE微软WORD段落
-}
-# 其他
-OTHER = {
-    '<\?xml[^>]*>|<\?xml [^>]*>|<\?xml:.*?>': '',
-    '<epointform>': '',
-    '<!doctype html>|<!doctype html [^>]*>': '',
-    '【关闭】|关闭': '',
-    '【打印】|打印本页': '',
-    '【字体:[\s\S]*】': '',
-    '文章来源:[\u4e00-\u9fa5]+': '',
-    '浏览次数:.*[<]+': '',
-    '(责任编辑:.*?)': '',
-    '分享到[:]': '',
-    '阅读数[::]\d+': '',
-}
-# 样式
-CSS_STYLE = {
-    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
-    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
-    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
-    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
-    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
-    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
-}
-# 空白符
-BLANKS = {
-    '\n\s*\n': '\n',
-    '\s*\n\s*': '\n',
-    '[^\S\n]': ' ',
-    '\s+': ' ',
-}
-# css标签集合
-TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
-# css属性集合
-ATTRS = {'id', 'class', 'style', 'width'}
-
-
-def _repair_tag():
-    """异常的标签组合,用来替换非标准页面的标签"""
-    _repairs = {}
-    for tag in TAGS:
-        for attr in ATTRS:
-            key = '{}{}'.format(tag, attr)
-            val = '{} {}'.format(tag, attr)
-            _repairs[key] = val
-    return _repairs
-
-
-def _escape_character(html):
-    """转义字符"""
-    html = html.replace('&lt;', '<')
-    html = html.replace('&gt;', '>')
-    html = html.replace('&quot;', '"')
-    html = html.replace('&amp;', '&')
-    return html
-
-
-def _lowercase_tag(html):
-    """标签归一化处理(全部小写)"""
-    tags = re.findall("<[^>]+>", html)
-    for tag in tags:
-        html = html.replace(tag, str(tag).lower())
-
-    repair_tags = _repair_tag()
-    for err, right in repair_tags.items():
-        html = html.replace(err, right)
-
-    return html
-
-
-def cleaner(html, special=None, completely=False):
-    """
-    数据清洗
-
-    :param html: 清洗的页面
-    :param special: 额外指定页面清洗规则
-    :param completely: 是否完全清洗页面
-    :return: 清洗后的页面源码
-    """
-    if special is None:
-        special = {}
-    OTHER.update(special)
-    remove_tags = {
-        **INDEPENDENT_TAGS,
-        **INLINE_TAGS,
-        **BLOCK_TAGS,
-        **OTHER,
-        **CSS_STYLE,
-        **BLANKS,
-    }
-    html = _lowercase_tag(html)
-    for tag, repl in remove_tags.items():
-        html = re.sub(tag, repl, html)
-
-    if completely:
-        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # 画布
-        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
-        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
-
-    html = _escape_character(html)
-    return html
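
A short sketch of invoking the removed cleaner (the HTML snippet is a placeholder):

    from common.clean_html import cleaner

    raw_html = '<div class="detail"><p>公告内容</p><script>alert(1)</script></div>'
    # Strips scripts, styles and inline attributes and collapses whitespace;
    # completely=True additionally removes <canvas>/<iframe> blocks.
    print(cleaner(raw_html, completely=True))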

+ 0 - 112
zgzb/common/databases.py

@@ -1,112 +0,0 @@
-import bson
-import pymongo
-import redis
-import requests
-from elasticsearch import Elasticsearch
-
-from config.load import mongo_conf, redis_conf, es_conf, analyze_url
-
-
-# ---------------------------------- mongo ----------------------------------
-def mongo_client(cfg=None, host=None, port=None):
-    if cfg is None:
-        if host is not None and port is not None:
-            cfg = {'host': host, 'port': port}
-        else:
-            cfg = mongo_conf
-    return pymongo.MongoClient(host=cfg['host'], port=cfg['port'])
-
-
-def mongo_database(db: str, **kw):
-    client = mongo_client(**kw)
-    return client[db]
-
-
-def mongo_table(db: str, coll: str, **kw):
-    client = mongo_client(**kw)
-    return client[db][coll]
-
-
-def int2long(param: int):
-    """int 转换成 long """
-    return bson.int64.Int64(param)
-
-
-def object_id(_id: str):
-    return bson.objectid.ObjectId(_id)
-
-
-# ---------------------------------- es ----------------------------------
-def es_client(cfg=None):
-    if cfg is None:
-        cfg = es_conf
-    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}])
-
-
-def es_participles_service(text: str):
-    """
-    获取文本的分词列表
-
-    :param text: 需要分词的文本
-    :return: 分词列表
-    """
-    result = []
-    params = {"text": text, "analyzer": "ik_smart"}
-    res = requests.get(analyze_url, params=params, timeout=60)
-    if res.status_code == 200:
-        tokens = res.json().get('tokens', [])
-        for x in tokens:
-            if x["token"].encode('utf-8').isalpha():
-                continue
-            result.append(x["token"])
-    return result
-
-
-def es_query(title: str, publish_time: int):
-    """
-    查询es
-
-    :param title: 标题
-    :param publish_time: 发布时间
-    :return:
-    """
-    client = es_client()
-    stime = publish_time - 432000  # 往前推5天
-    etime = publish_time + 432000
-    conditions = []
-    participles = es_participles_service(title)
-    for word in participles:
-        conditions.append({
-            "multi_match": {
-                "query": word,
-                "type": "phrase",
-                "fields": ["title"]
-            }
-        })
-    conditions.append({
-        "range": {"publishtime": {"from": stime, "to": etime}}
-    })
-    query = {
-        "query": {
-            "bool": {
-                "must": conditions,
-                "minimum_should_match": 1
-            }
-        }
-    }
-    result = client.search(index='bidding', body=query, request_timeout=100)
-    count = len(result['hits']['hits'])
-    return count
-
-
-# ---------------------------------- redis ----------------------------------
-def redis_client(cfg=None):
-    if cfg is None:
-        cfg = redis_conf
-    pool = redis.ConnectionPool(
-        host=cfg['host'],
-        port=cfg['port'],
-        password=cfg['pwd'],
-        db=cfg['db']
-    )
-    return redis.Redis(connection_pool=pool, decode_responses=True)
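
A minimal sketch of the removed database helpers (the database/collection names follow those used in crawler/crawl_spider.py; the title and timestamp are placeholders):

    from common.databases import mongo_table, redis_client, es_query, int2long

    crawl_tab = mongo_table(db='py_spider', coll='zgzb_wagf_list')
    r = redis_client()
    # es_query counts announcements with a similar title published within
    # +/- 5 days of the given second-level timestamp.
    count = es_query('某招标公告标题', publish_time=1650000000)
    crawl_tab.insert_one({'title': '示例标题', 'comeintime': int2long(1650000000)})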

+ 0 - 35
zgzb/common/execptions.py

@@ -1,35 +0,0 @@
-
-class JyBasicException(Exception):
-
-    def __init__(self, code: int, reason: str, **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-
-
-class CustomCheckError(JyBasicException):
-
-    def __init__(self, code: int = 10002, reason: str = '特征条件检查异常', **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-
-
-class AttachmentNullError(JyBasicException):
-
-    def __init__(self, code: int = 10004, reason: str = '附件下载异常', **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-
-
-class CustomAccountPrivilegeError(JyBasicException):
-
-    def __init__(self, *args, **kwargs):
-        pass

+ 0 - 14
zgzb/common/log.py

@@ -1,14 +0,0 @@
-from pathlib import Path
-
-from loguru import logger
-
-_absolute = Path(__file__).absolute().parent.parent
-_log_path = (_absolute / 'logs/crawl-{time:YYYY-MM-DD}.log').resolve()
-logger.add(
-    _log_path,
-    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
-    level='INFO',
-    rotation='00:00',
-    retention='1 week',
-    encoding='utf-8',
-)

+ 0 - 41
zgzb/common/socks5.py

@@ -1,41 +0,0 @@
-import threading
-
-import requests
-
-from common.log import logger
-from config.load import jy_proxy
-
-__all__ = ['Proxy']
-
-
-class Socks5Proxy:
-
-    def __init__(self):
-        self._lock = threading.RLock()
-        self._enable_proxy = False
-        self._url = jy_proxy['socks5']['url']
-        self._auth = jy_proxy['socks5']['auth']
-        self._proxies = None
-
-    @property
-    def proxies(self):
-        return self._proxies
-
-    def switch(self):
-        with self._lock:
-            if self._enable_proxy:
-                self._proxies = self._fetch_proxies()
-
-    def _fetch_proxies(self):
-        proxy = requests.get(self._url, headers=self._auth).json()
-        return proxy.get("data")
-
-    def __call__(self, enable_proxy: bool = False, *args, **kwargs):
-        self._enable_proxy = enable_proxy
-        if self._enable_proxy:
-            logger.info("[加载socks5代理]")
-            self._proxies = self._fetch_proxies()
-        return self
-
-
-Proxy = Socks5Proxy()
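
The removed Proxy is a module-level Socks5Proxy singleton; a sketch of the calling convention used by the crawler (the shape of the proxies dict is assumed from how the WebDriver's netloc() consumes it):

    from common.socks5 import Proxy

    proxy = Proxy(enable_proxy=True)   # __call__ fetches an initial socks5 endpoint
    print(proxy.proxies)               # e.g. {'http': 'socks5://...', 'https': 'socks5://...'}
    proxy.switch()                     # rotate to a fresh endpoint after failures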

+ 0 - 117
zgzb/common/tools.py

@@ -1,117 +0,0 @@
-import datetime
-import hashlib
-import re
-import time
-from collections import namedtuple
-
-from lxml.html import HtmlElement, fromstring, tostring
-
-SearchText = namedtuple('SearchText', ['total'])
-
-
-def element2html(element: HtmlElement) -> str:
-    return tostring(element, encoding="utf-8").decode()
-
-
-def html2element(html: str) -> HtmlElement:
-    return fromstring(html)
-
-
-def valid_element(node: HtmlElement, feature: str):
-    if len(node.xpath(feature)) > 0:
-        return True
-    else:
-        return False
-
-
-def remove_node(node: HtmlElement):
-    """
-    this is a in-place operation, not necessary to return
-    :param node:
-    :return:
-    """
-    parent = node.getparent()
-    if parent is not None:
-        parent.remove(node)
-
-
-def text_search(text: str) -> SearchText:
-    """
-    中文检索
-
-    :param text: 文本
-    :return: 中文数量
-    """
-    if not text:
-        return SearchText(0)
-
-    results = re.findall('[\u4e00-\u9fa5]', text, re.S)
-    # 列表长度即是中文的字数
-    return SearchText(len(results))
-
-
-def verify_text(val: str):
-    if val is None:
-        return False
-    """检查数字、字母、中文的个数"""
-    sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
-    for pattern in sub_pattern:
-        val = re.sub(pattern, '', val)
-    # 若文本长度小于50,表示页面内容无详情内容
-    if len(val) < 50:
-        '''无效文本'''
-        return False
-    '''有效文本'''
-    return True
-
-
-def sha1(text: str):
-    """
-    十六进制数字字符串形式摘要值
-
-    @param text: 字符串文本
-    @return: 摘要值
-    """
-    _sha1 = hashlib.sha1()
-    _sha1.update(text.encode("utf-8"))
-    return _sha1.hexdigest()
-
-
-def get_ms() -> int:
-    return int(round(time.time() * 1000))
-
-
-def get_current_date():
-    return datetime.datetime.now().strftime("%Y-%m-%d")
-
-
-def ms2date(ms: int, fmt="%Y-%m-%d %H:%M:%S"):
-    """毫秒转日期"""
-    timestamp = float(ms / 1000)
-    time_array = time.localtime(timestamp)
-    return time.strftime(fmt, time_array)
-
-
-def convert2type(ts_str):
-    """字符串类型时间戳转成整型"""
-    return int(float(ts_str) / 1000)
-
-
-def ts2date(ts_str, fmt="%Y-%m-%d %H:%M:%S") -> str:
-    """
-    时间戳转成日期
-
-    :param ts_str: 毫秒级时间戳
-    :param fmt: 日期格式
-    :return: 日期
-    """
-    timestamp = int(float(ts_str) / 1000)
-    time_array = time.localtime(timestamp)
-    return time.strftime(fmt, time_array)
-
-
-def date2ts(date_str: str, fmt="%Y-%m-%d"):
-    """日期转成时间戳"""
-    time_array = time.strptime(date_str, fmt)
-    timestamp = int(time.mktime(time_array))
-    return timestamp
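
A quick sketch of the removed timestamp/hash helpers (the date value is illustrative):

    from common.tools import date2ts, ts2date, sha1

    ts = date2ts('2022-04-15')     # second-level timestamp (local time)
    print(ts2date(ts * 1000))      # ts2date expects milliseconds -> '2022-04-15 00:00:00'
    print(sha1('中国招标投标公共服务平台'))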

+ 0 - 8
zgzb/common/webdriver/__init__.py

@@ -1,8 +0,0 @@
-from .utils import (
-    check_navigator,
-    new_window,
-    get_user_agent,
-    get_title,
-    until_wait,
-)
-from .webdriver import WebDriver, FireFoxWebDriverError

+ 0 - 59
zgzb/common/webdriver/utils.py

@@ -1,59 +0,0 @@
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
-
-
-def check_navigator(driver):
-    """检查navigator属性"""
-    script = "return window.navigator.webdriver"
-    return driver.execute_script(script)
-
-
-def until_wait(
-        driver,
-        *,
-        xpath=None,
-        classname=None,
-        text=None,
-        timeout=None
-):
-    """
-    显示等待页面加载,否则抛出TimeoutException
-
-    :param driver: 浏览器驱动
-    :param xpath: xpath规则,页面等待特征
-    :param classname: class属性名称,页面等待特征
-    :param text: 期待的文本
-    :param timeout: 超时时间
-    :return:
-    """
-    _timeout = (timeout or 60)
-    wait = WebDriverWait(driver, _timeout, 0.2)
-    if xpath is not None:
-        locator = (By.XPATH, xpath)
-        if text is not None:
-            wait.until(EC.text_to_be_present_in_element(locator, text))
-        else:
-            wait.until(EC.presence_of_element_located(locator))
-
-    elif classname is not None:
-        locator = (By.CLASS_NAME, classname)
-        if text is not None:
-            wait.until(EC.text_to_be_present_in_element(locator, text))
-        else:
-            wait.until(EC.presence_of_element_located(locator))
-
-
-def new_window(driver):
-    """新的窗口"""
-    driver.execute_script('window.open();')
-    handles = driver.window_handles
-    driver.switch_to.window(handles[-1])
-
-
-def get_user_agent(driver):
-    return driver.execute_script("return navigator.userAgent;")
-
-
-def get_title(driver):
-    return driver.execute_script('return document.title')

+ 0 - 147
zgzb/common/webdriver/webdriver.py

@@ -1,147 +0,0 @@
-import datetime
-from collections import namedtuple
-from pathlib import Path
-
-from selenium import webdriver
-from selenium.common.exceptions import WebDriverException
-from selenium.webdriver import Firefox
-
-from common.log import logger
-
-_absolute = Path(__file__).absolute().parent.parent.parent
-_date = datetime.datetime.now().strftime('%Y-%m-%d')
-SERVICE_LOG_PATH = (_absolute / f'logs/geckodriver-{_date}.log').resolve()
-
-DEFAULT_USERAGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0"
-Netloc = namedtuple('Netloc', ['host', 'port'])
-
-
-def netloc(proxies: dict) -> Netloc:
-    host, port = proxies["https"].replace("socks5://", "").split(":")
-    return Netloc(host, port)
-
-
-class FireFoxWebDriverError(WebDriverException):
-    pass
-
-
-class WebDriver(Firefox):
-
-    def __init__(self, load_images=True, user_agent=None, proxy=None,
-                 headless=True, timeout=60, log_path=None,
-                 window_size=(1024, 800), executable_path=None,
-                 custom_argument=None, **kwargs):
-        """
-        Args:
-            load_images: 是否加载图片
-            user_agent: 字符串 或 无参函数,返回值为user_agent
-            proxy: {'https': 'socks5://xxx.xxx.xxx.xxx:xxxx'} 或 无参函数,返回值为代理地址
-            headless: 是否启用无头模式
-            driver_type: FIREFOX
-            timeout: 请求超时时间
-            log_path: Geckodriver服务的日志文件路径
-            window_size: 窗口大小
-            executable_path: 浏览器路径,默认为默认路径
-            custom_argument: 自定义配置参数
-            **kwargs: 需要额外配置的Firefox参数
-        """
-        self._load_images = load_images
-        self._user_agent = user_agent or DEFAULT_USERAGENT
-        self._proxy = proxy
-        self._headless = headless
-        self._timeout = timeout
-        self._window_size = window_size
-        self._executable_path = executable_path
-        self._custom_argument = custom_argument
-        self._service_log_path = log_path or str(SERVICE_LOG_PATH)
-
-        _profile = webdriver.FirefoxProfile()
-        _options = webdriver.FirefoxOptions()
-        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
-        _profile.set_preference("dom.webdriver.enabled", False)
-        _profile.set_preference('useAutomationExtension', False)
-        # _profile.set_preference('privacy.resistFingerprinting', True)  # 启用指纹保护
-        if self._proxy:
-            proxy = self._proxy() if callable(self._proxy) else self._proxy
-            host, port = netloc(proxy)
-            # 使用socks5 代理, 不使用代理:0, 使用代理:1
-            _profile.set_preference('network.proxy.type', 1)
-            _profile.set_preference('network.proxy.socks', host)
-            _profile.set_preference('network.proxy.socks_port', int(port))
-
-        if self._user_agent:
-            _profile.set_preference(
-                "general.useragent.override",
-                self._user_agent() if callable(self._user_agent) else self._user_agent,
-            )
-
-        if not self._load_images:
-            '''
-            允许加载所有图像,无论来源如何(默认)=1
-            阻止所有图像加载=2
-            防止加载第三方图像=3
-            '''
-            _profile.set_preference("permissions.default.image", 2)
-
-        _profile.update_preferences()
-
-        if self._headless:
-            _options.add_argument("--headless")
-            _options.add_argument("--disable-gpu")
-
-        if self._custom_argument:
-            [_options.add_argument(arg) for arg in self._custom_argument]
-
-        if self._executable_path:
-            super(WebDriver, self).__init__(
-                service_log_path=self._service_log_path,
-                capabilities=firefox_capabilities,
-                options=_options,
-                firefox_profile=_profile,
-                executable_path=self._executable_path,
-                **kwargs
-            )
-        else:
-            super(WebDriver, self).__init__(
-                service_log_path=self._service_log_path,
-                capabilities=firefox_capabilities,
-                options=_options,
-                firefox_profile=_profile,
-                **kwargs
-            )
-
-        if self._window_size:
-            self.set_window_size(*self._window_size)
-
-        self.set_page_load_timeout(self._timeout)
-        self.set_script_timeout(self._timeout)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_val:
-            logger.exception(f'{self.__class__.__name__} <> {exc_type.__name__}: {exc_val}')
-        self.quit()
-        print("关闭浏览器")
-        return True
-
-    @property
-    def cookies(self):
-        cookies_json = {}
-        for cookie in self.get_cookies():
-            cookies_json[cookie["name"]] = cookie["value"]
-        return cookies_json
-
-    @cookies.setter
-    def cookies(self, val: dict):
-        """
-        设置cookie
-        Args:
-            val: {"key":"value", "key2":"value2"}
-
-        Returns:
-
-        """
-        for key, value in val.items():
-            self.add_cookie({"name": key, "value": value})
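
A sketch of the removed Firefox wrapper as used by crawl_spider.py (the proxy endpoint is a placeholder; proxy may also be omitted entirely):

    from common.webdriver import WebDriver, until_wait

    proxies = {'https': 'socks5://127.0.0.1:1080'}   # placeholder socks5 endpoint
    with WebDriver(load_images=False, proxy=proxies, headless=True) as browser:
        browser.get('http://www.cebpubservice.com/ctpsp_iiss/'
                    'searchbusinesstypebeforedooraction/getSearch.do?tabledivIds=searchTabLi2')
        until_wait(browser, xpath='//ul[@id="myTab3"]', timeout=30)
        print(browser.title)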

+ 0 - 0
zgzb/config/__init__.py


+ 0 - 40
zgzb/config/conf.yaml

@@ -1,40 +0,0 @@
-# mongo
-mongo:
-  host: 172.17.4.87
-  port: !!int 27080
-#  host: 127.0.0.1
-#  port: !!int 27017
-
-
-# redis
-redis:
-  host: 127.0.0.1
-  port: !!int 6379
-  pwd: ""
-  db: !!int 10
-
-
-# 阿里oss
-ali_oss:
-  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
-  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
-#  endpoint: oss-cn-beijing.aliyuncs.com    # 公网使用
-  endpoint: oss-cn-beijing-internal.aliyuncs.com    # 内网使用
-  bucket_name: jy-datafile
-
-
-# es
-es:
-  host: 172.17.145.170
-#  host: 192.168.3.206
-#  host: 127.0.0.1
-  port: !!int 9800
-  db: biddingall # es库别名
-
-
-# 代理
-proxy:
-  socks5:
-    url: http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch
-    auth:
-      Authorization: Basic amlhbnl1MDAxOjEyM3F3ZSFB

+ 0 - 2
zgzb/config/constants.yaml

@@ -1,2 +0,0 @@
-headers:
-  User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36

+ 0 - 34
zgzb/config/load.py

@@ -1,34 +0,0 @@
-from pathlib import Path
-
-import yaml
-
-__all__ = [
-    'mongo_conf',
-    'redis_conf',
-    'oss_conf',
-    'es_conf',
-    'jy_proxy',
-    'node_module_path',
-    'headers',
-    'analyze_url'
-]
-
-_base_path = Path(__file__).parent
-_yaml_conf = (_base_path / 'conf.yaml').resolve()
-_yaml_constants = (_base_path / 'constants.yaml').resolve()
-_node_modules = (_base_path.parent / 'node_modules').resolve()
-
-with open(_yaml_conf, encoding="utf-8") as f:
-    _conf = yaml.safe_load(f)
-    mongo_conf = _conf['mongo']
-    redis_conf = _conf['redis']
-    oss_conf: dict = _conf['ali_oss']
-    es_conf: dict = _conf['es']
-    jy_proxy: dict = _conf['proxy']
-    node_module_path = _node_modules
-    analyze_url = f'http://{es_conf["host"]}:{es_conf["port"]}/{es_conf["db"]}/_analyze'
-
-
-with open(_yaml_constants, encoding="utf-8") as fp:
-    _constants = yaml.safe_load(fp)
-    headers: dict = _constants['headers']

+ 0 - 0
zgzb/crawler/__init__.py


+ 0 - 201
zgzb/crawler/crawl_spider.py

@@ -1,201 +0,0 @@
-import time
-
-from selenium.common.exceptions import (
-    WebDriverException,
-    TimeoutException,
-    NoSuchElementException
-)
-
-from common.databases import mongo_table, int2long, redis_client
-from common.log import logger
-from common.socks5 import Proxy
-from common.tools import sha1, date2ts
-from common.webdriver import WebDriver
-from crawler.defaults import (
-    goto,
-    crawl_request,
-    select_category,
-    select_date,
-    extract_text,
-    crawl_psp_frame,
-    crawl_show_details,
-    next_page,
-    wait_load_list,
-    update_crawl_records,
-    write_crawl_records,
-    parser_list_elements,
-    get_crawl_menu,
-    get_category_id
-)
-
-crawl_tab = mongo_table(db='py_spider', coll='zgzb_wagf_list')
-save_tab = mongo_table(db='py_spider', coll='data_bak')
-redis_key = 'zgzb_wagf_2022'
-r = redis_client()
-
-
-def crawl_spider(crawl_max_page=1, enable_proxy=False, **kw):
-    proxy = Proxy(enable_proxy)
-    headless = kw.get('headless', True)
-    crawl_category = kw.get('crawl_category')
-    crawl_date = kw.get('crawl_date', '今天')
-    prev_num = page_num = 1
-    while True:
-        proxies = proxy.proxies
-        logger.info(f"[采集代理]{proxies}")
-        list_page_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do?tabledivIds=searchTabLi2'
-        with WebDriver(load_images=False, proxy=proxies, headless=headless) as browser:
-            ua = browser.execute_script('return navigator.userAgent')
-            print('>>> ', ua)
-            success_request = crawl_request(browser, list_page_url)
-            if not success_request:
-                proxy.switch()
-                logger.error('[访问超时]请求列表页')
-                continue
-
-            '''等待加载主页'''
-            wait_load_list(browser)
-            '''获取主页句柄'''
-            main_handler = browser.current_window_handle
-            '''选择分类'''
-            category = select_category(browser, crawl_category)
-            '''分类栏目列表'''
-            crawl_menu = get_crawl_menu(category)
-            if crawl_menu is None:
-                browser.quit()
-                logger.info("任务结束")
-                break
-
-            logger.info(f"[分类栏目]{category}")
-
-            '''选择建立时间'''
-            success_select_date = select_date(browser, category, crawl_date)
-            if not success_select_date:
-                proxy.switch()
-                continue
-
-            exit_crawl = False
-            allow_next_page = False
-            while True:
-                if exit_crawl:
-                    proxy.switch()
-                    break
-
-                if allow_next_page:
-                    allow_next_page = True
-                    try:
-                        page_num = next_page(browser, category)
-                        if page_num is None or (page_num > crawl_max_page):
-                            browser.quit()
-                            proxy.switch()
-                            update_crawl_records(category, True)
-                            break
-                        elif page_num != prev_num and page_num % 2 == 0:
-                            '''每个代理IP仅采集2页,轮询使用代理'''
-                            browser.quit()
-                            proxy.switch()
-                            prev_num = page_num
-                            break
-                    except TimeoutException:
-                        browser.quit()
-                        proxy.switch()
-                        logger.error('[访问超时]请求翻页')
-                        break
-                else:
-                    allow_next_page = True
-
-                '''详情页'''
-                web_elements = parser_list_elements(browser, category)
-                if web_elements is None:
-                    proxy.switch()
-                    break
-
-                for index, element in enumerate(web_elements):
-                    index += 1
-                    item = {
-                        "site": "中国招标投标公共服务平台",
-                        "channel": crawl_menu.channel,
-                        "spidercode": crawl_menu.spidercode,
-                        "T": "bidding",
-                        "sendflag": "false",
-                        "_d": "comeintime",
-                        "comeintime": '',
-                        "area": '',
-                        "city": '',
-                        "publishdept": "",
-                        "title": "",
-                        "href": "",
-                        "publishtime": "",
-                        "l_np_publishtime": "",
-                    }
-                    html = browser.page_source
-                    category_id = get_category_id(category)
-                    click_detail_js = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@onclick')).strip()
-                    href_ = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@href')).strip()
-                    detail_js = (click_detail_js or href_)
-                    sign = sha1(detail_js)
-                    print(f'>>> {sign}')
-                    if r.hexists(redis_key, sign):
-                        continue
-                    '''发布标题'''
-                    node1 = element.find_element_by_xpath('./td[1]/a')
-                    title = node1.text
-                    item['title'] = title
-                    '''省市'''
-                    node2 = element.find_element_by_xpath('./td[3]/span')
-                    region = str(node2.text).replace('【', '').replace('】', '')
-                    if region.find(" ") > 0:
-                        province, city = region.split(' ')
-                    else:
-                        province = region
-                        city = ''
-                    item['area'] = str(province).replace('省', '').replace('市', '').replace('自治区', '')
-                    item['city'] = city
-                    '''发布时间'''
-                    node3 = element.find_element_by_xpath('./td[5]')
-                    publish_time = node3.text
-                    item['publishtime'] = publish_time
-                    item['l_np_publishtime'] = int2long(date2ts(publish_time))
-                    item['comeintime'] = int2long(int(time.time()))
-                    '''访问详情页'''
-                    goto(browser, node1, wait_time=2)
-                    '''详情页'''
-                    item['href'] = '#'
-                    detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do'
-                    if detail_js.startswith('showDetails') is False:
-                        item['competehref'] = detail_url
-                        try:
-                            item = crawl_psp_frame(browser, main_handler, item)
-                        except NoSuchElementException:
-                            exit_crawl = True
-                            break
-                    else:
-                        item['competehref'] = '{}/{}'.format(detail_url, sign)
-                        try:
-                            item = crawl_show_details(browser, main_handler, item)
-                        except (ValueError, WebDriverException) as e:
-                            browser.quit()
-                            exit_crawl = True
-                            if e.__class__.__name__ == 'ValueError':
-                                logger.error("[机器人验证]验证失败")
-                            break
-                    '''入库处理'''
-                    if 'contenthtml' not in item:
-                        item['crawl_status'] = 'detail_err'
-                    else:
-                        item['crawl_status'] = 'success'
-                        '''保存详情'''
-                        save_tab.insert_one(item)
-                        del item['contenthtml'], item['detail']
-                        if '_id' in item:
-                            del item['_id']
-                        logger.info(f'[采集成功-{item["channel"]}]{title} - {publish_time}')
-                    '''备注:详情页访问参数'''
-                    item['remark'] = detail_js
-                    '''添加数据指纹'''
-                    r.hset(redis_key, sign, '')
-                    '''保存列表'''
-                    crawl_tab.insert_one(item)
-
-                logger.info(f"[{category}-第{page_num}页]采集完成")
-                write_crawl_records(category, page_num)

+ 0 - 362
zgzb/crawler/defaults.py

@@ -1,362 +0,0 @@
-import time
-
-from selenium.common.exceptions import (
-    WebDriverException,
-    TimeoutException,
-    InvalidSessionIdException,
-    NoSuchElementException
-)
-from selenium.webdriver import ActionChains
-from selenium.webdriver.common.by import By
-
-from common.clean_html import cleaner
-from common.databases import int2long
-from common.log import logger
-from common.tools import html2element, element2html, verify_text, remove_node
-from common.webdriver import until_wait
-from crawler.params import (
-    CRAWL_RECORDS,
-    SETUP_TIME,
-    SETUP_MAPS,
-    CATEGORY_MAPS,
-    CRAWL_MENU
-)
-
-
-def get_crawl_menu(category: str):
-    """采集清单"""
-    return CRAWL_MENU.get(category)
-
-
-def get_category_id(category: str):
-    """分类id"""
-    return CATEGORY_MAPS[category]
-
-
-def extract_text(html: str, feature: str):
-    """抽取文本"""
-    element = html2element(html)
-    return element.xpath(feature)
-
-
-def extract_page_html(html: str, feature: str):
-    """抽取页面源码"""
-    element = html2element(html)
-    '''移除空附件信息'''
-    remove_target = element.xpath('//div[@id="isshow"]')
-    if len(remove_target) > 0:
-        remove_node(remove_target[0])
-    try:
-        node = element.xpath(feature)[0]
-        return element2html(node)
-    except IndexError:
-        pass
-
-
-def init_crawl_records(driver, web_element, category: str):
-    """初始记录"""
-    if category not in CRAWL_RECORDS:
-        goto(driver, web_element)
-        # init_config = {'finished': False, 'pages': ['1']}
-        init_config = {'finished': False, 'pages': []}
-        CRAWL_RECORDS.setdefault(category, init_config)
-        return True
-    else:
-        _record = CRAWL_RECORDS[category]
-        if not _record['finished']:
-            goto(driver, web_element)
-            return True
-        else:
-            return False
-
-
-def update_crawl_records(category: str, finished: bool):
-    """更新记录"""
-    if category in CRAWL_RECORDS:
-        _record = CRAWL_RECORDS[category]
-        _record['finished'] = finished
-        CRAWL_RECORDS.update({category: _record})
-
-
-def write_crawl_records(category: str, page_num: int):
-    """写入记录"""
-    if category in CRAWL_RECORDS:
-        _record = CRAWL_RECORDS[category]
-        '''记录采集页码,已记录页码不在访问'''
-        finished_pages = _record['pages']
-        finished_pages.append(str(page_num))
-        _record.update({'pages': finished_pages})
-        CRAWL_RECORDS.update({category: _record})
-
-
-def robots_alert(driver):
-    """机器人警告"""
-    wait = 0
-    while wait < 20:
-        '''等待验证模块加载'''
-        element = html2element(driver.page_source)
-        robot_alert = element.xpath('//span[@class="nc-lang-cnt"]/@data-nc-lang')
-        click_alert = element.xpath('//div[@id="text"]/text()')
-        if len(robot_alert) == 1 and "".join(robot_alert).strip() == '_Loading':
-            time.sleep(0.5)
-        elif len(robot_alert) > 1 and robot_alert[0] in ['_yesTEXT']:
-            '''通过机器人验证'''
-            return False, '0'
-        elif len(robot_alert) > 1 and robot_alert[0] in ['_startTEXT']:
-            '''机器人验证加载完成'''
-            return True, '1'
-        elif len(robot_alert) > 1 and robot_alert[0] in ['_errorNetwork']:
-            '''网络不给力,请点击刷新,或提交反馈 (00)'''
-            return True, '2'
-        elif len(click_alert) > 0 and "".join(click_alert) == '请点击此处完成验证或咨询客服':
-            return True, '3'
-        else:
-            return False, '0'
-
-        wait += 1
-    return True, '999'
-
-
-def check_robots_alert(driver):
-    """检查并处理机器人警告"""
-    while True:
-        alert, alert_type = robots_alert(driver)
-        if not alert:
-            break
-
-        if alert_type == '1':
-            until_wait(driver, xpath='//span[contains(@class, "nc_iconfont btn_slide")]')
-            element = driver.find_element_by_xpath('//span[contains(@class, "nc_iconfont btn_slide")]')
-            if element.is_displayed():
-                # 点击并且不松开鼠标,往右边移动258个位置,松开鼠标
-                ActionChains(driver).click_and_hold(element).move_by_offset(xoffset=258, yoffset=0).release().perform()
-
-        elif alert_type == '2':
-            until_wait(driver, xpath='//span[contains(@class, "nc-lang-cnt")]/a[1]')
-            element = driver.find_element_by_xpath('//span[contains(@class, "nc-lang-cnt")]/a[1]')
-            if element.is_displayed():
-                goto(driver, element, wait_time=2)
-
-        elif alert_type == '3':
-            # until_wait(driver, xpath='//div[@id="container"]')
-            # element = driver.find_element_by_xpath('//div[@id="container"]')
-            # if element.is_displayed():
-            #     goto(driver, element, wait_time=3)
-            #     driver.switch_to.alert.accept()
-            logger.error("[机器人验证]触发浏览器指纹检测,无法自动处理.")
-            raise ValueError()
-
-        else:
-            with open('robot.html', 'w') as wp:
-                wp.write(driver.page_source)
-            logger.error("[未知异常网页]页面源码保存在robot.html")
-            raise ValueError()
-        time.sleep(2)
-
-
-def refresh_page(driver):
-    """刷新页面"""
-    element = html2element(driver.page_source)
-    node = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
-    if "".join(node) == "暂无详细数据":
-        driver.refresh()
-        time.sleep(1)
-        '''页面alert元素确定'''
-        driver.switch_to.alert.accept()
-    time.sleep(1.5)
-    wait_load_detail(driver)
-    check_robots_alert(driver)
-
-
-def goto(driver, web_element, wait_time=None, allow_check_page=False):
-    """执行可点击js事件"""
-    driver.execute_script("arguments[0].click();", web_element)
-    _wait_time = (wait_time or 1)
-    time.sleep(_wait_time)
-    if allow_check_page:
-        check_robots_alert(driver)
-
-
-def next_page(driver, category):
-    """翻页"""
-    _finished_pages = CRAWL_RECORDS[category]['pages']
-    while True:
-        next_element = driver.find_element_by_xpath('//div[@id="Pagination"]/div[1]/child::a[last()]')
-        if next_element.text == '下一页':
-            goto(driver, next_element, wait_time=1.2)
-            current_element = driver.find_element_by_xpath('//div[@id="Pagination"]/div[1]/child::span[@class="current"]')
-            val = current_element.text
-            if val not in _finished_pages:
-                return int(val)
-        else:
-            break
-    time.sleep(1)
-
-
-def _select_category(driver, custom_category=None):
-    """采集分类"""
-    web_elements = driver.find_elements(by=By.XPATH, value='//ul[@id="myTab3"]/child::li')
-    for element in web_elements:
-        val = element.text
-        if custom_category is None:
-            success = init_crawl_records(driver, element, val)
-            return val if success else None
-        else:
-            if val == custom_category:
-                success = init_crawl_records(driver, element, custom_category)
-                return val if success else None
-
-
-def select_category(driver, category: str):
-    """选择分类"""
-    try:
-        _category = _select_category(driver, category)
-        return _category
-    except TimeoutException:
-        driver.quit()
-        logger.error('[访问超时]选择分类')
-        return None
-
-
-def _select_date(driver, category: str, setup_time: str):
-    """选择建立时间"""
-    logger.info(f"[建立时间]{setup_time}")
-    try:
-        attr = SETUP_TIME[category][SETUP_MAPS[setup_time]]
-        element = driver.find_element(by=By.XPATH, value=f'//a[@id="{attr}"]')
-        goto(driver, element)
-    except KeyError:
-        raise KeyError(f'请设置"SETUP_TIME"中{category}对应的建立时间')
-
-
-def select_date(driver, category, setup_time):
-    try:
-        _select_date(driver, category, setup_time)
-        return True
-    except TimeoutException:
-        driver.quit()
-        logger.error('[访问超时]选择建立时间')
-        return False
-
-
-def wait_load_detail(driver, check_feature=None, check_timeout=None):
-    """等待二次加载页面结果并检测元素变化"""
-    _check_timeout = (check_timeout or 10)
-    sleep_interval = 0.5
-    max_check_count = int(_check_timeout / sleep_interval)
-    if check_feature is not None:
-        check_count = 0
-        while check_count < max_check_count:
-            element = html2element(driver.page_source)
-            check_node = element.xpath(check_feature)
-            if len(check_node) > 0:
-                break
-            time.sleep(sleep_interval)
-            check_count += 1
-    else:
-        check_count = 0
-        while check_count < max_check_count:
-            element = html2element(driver.page_source)
-            root = element.xpath('//div[@id="xxnrList"]')
-            if len(root) > 0:
-                descendant = element.xpath('//div[@id="xxnrList"]/descendant::*')
-                if len(descendant) > 0:
-                    text = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
-                    children = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/child::*')
-                    if "".join(text) != '暂无详细数据' and len(children) > 0:
-                        break
-            time.sleep(sleep_interval)
-            check_count += 1
-    time.sleep(1)
-
-
-def wait_load_list(driver):
-    while True:
-        element = html2element(driver.page_source)
-        node = element.xpath('//div[@id="myTab_Contenta0"]/div[2]/table//tr')
-        if len(node) > 0:
-            break
-        time.sleep(0.5)
-
-
-def _handler_page_html(html, item):
-    """页面源码处理"""
-    if all([html is not None, verify_text(html)]):
-        item['contenthtml'] = html
-        item['detail'] = cleaner(html)
-        item['comeintime'] = int2long(int(time.time()))
-    else:
-        logger.error(
-            f'[文本异常-{item["channel"]}]{item["title"]} - {item["publishtime"]}')
-    return item
-
-
-def crawl_show_details(driver, handler, item):
-    for current_handler in driver.window_handles:
-        if current_handler == handler:
-            continue
-        driver.switch_to.window(current_handler)
-        '''加载等待并检查指定页面特征'''
-        wait_load_detail(driver, check_feature='//div[@id="xxnrList"]/div[1]/div[2]/div[2]')
-        '''检查机器人警告并处理'''
-        check_robots_alert(driver)
-        '''二次加载'''
-        refresh_page(driver)
-        '''加载等待'''
-        wait_load_detail(driver)
-        '''抽取源码'''
-        content_html = extract_page_html(driver.page_source, feature='//div[@id="xxnrList"]')
-        item = _handler_page_html(content_html, item)
-    '''关闭当前页'''
-    driver.close()
-    '''切换主页'''
-    driver.switch_to.window(handler)
-    return item
-
-
-def crawl_psp_frame(driver, handler, item):
-    """Frame页面"""
-    for current_handler in driver.window_handles:
-        if current_handler == handler:
-            continue
-        driver.switch_to.window(current_handler)
-        wait_load_detail(
-            driver,
-            check_feature='//div[contains(@id, "mini-1")]',
-            check_timeout=15
-        )
-        '''切换到frame'''
-        try:
-            driver.switch_to.frame('mini-iframe-6')
-        except NoSuchElementException:
-            driver.quit()
-            logger.error(f'[未检测到iframe-{item["channel"]}]{item["title"]} - {item["competehref"]}')
-            raise NoSuchElementException()
-        '''等待加载数据'''
-        wait_load_detail(driver, check_feature='//div[contains(@role, "accordion")]')
-        content_html = extract_page_html(driver.page_source, feature='//div[@class="fui-accordions"]')
-        item = _handler_page_html(content_html, item)
-    '''关闭当前页'''
-    driver.close()
-    '''切换主页'''
-    driver.switch_to.window(handler)
-    return item
-
-
-def crawl_request(driver, url):
-    try:
-        driver.get(url)
-        return True
-    except WebDriverException:
-        driver.quit()
-        return False
-
-
-def parser_list_elements(driver, category):
-    try:
-        web_elements = driver.find_elements(by=By.XPATH, value=f'//*[@id="{CATEGORY_MAPS[category]}"]//tr')
-        return web_elements
-    except InvalidSessionIdException:
-        driver.quit()
-        logger.error('[数据解析]获取列表失败')
-        return None

+ 0 - 70
zgzb/crawler/params.py

@@ -1,70 +0,0 @@
-from collections import namedtuple
-
-__all__ = [
-    'CRAWL_RECORDS',
-    'CATEGORY_MAPS',
-    'SETUP_MAPS',
-    'SETUP_TIME',
-    'CRAWL_MENU'
-]
-
-'''采集记录'''
-CRAWL_RECORDS = {}
-'''分类'''
-CATEGORY_MAPS = {
-    '招标项目': 'tenderProjectTab',
-    '招标公告': 'tenderBulletin',
-    '开标记录': 'openBidRecord',
-    '评标公示': 'bidEvaluation',
-    '中标公告': 'winBidBulletin',
-    # '签约履行': '',
-}
-'''建立时间'''
-SETUP_MAPS = {
-    '今天': 'jt',
-    '2天内': '2tq',
-    '3天内': '3tq',
-    '1周内': '1zn',
-}
-SETUP_TIME = {
-    '招标项目': {
-        'jt': 'tenderProject_begin1',
-        '2tq': 'tenderProject_begin2',
-        '3tq': 'tenderProject_begin3',
-        '1zn': 'tenderProject_begin7'
-    },
-    '招标公告': {
-        'jt': 'tenderBulletin_begin1',
-        '2tq': 'tenderBulletin_begin2',
-        '3tq': 'tenderBulletin_begin3',
-        '1zn': 'tenderBulletin_begin7'
-    },
-    '开标记录': {
-        'jt': 'openBidRecord_1',
-        '2tq': 'openBidRecord_2',
-        '3tq': 'openBidRecord_3',
-        '1zn': 'openBidRecord_7'
-    },
-    '评标公示': {
-        'jt': 'bidEvaluation_1',
-        '2tq': 'bidEvaluation_2',
-        '3tq': 'bidEvaluation_3',
-        '1zn': 'bidEvaluation_7'
-    },
-    '中标公告': {
-        'jt': 'winBidBulletin_1',
-        '2tq': 'winBidBulletin_2',
-        '3tq': 'winBidBulletin_3',
-        '1zn': 'winBidBulletin_7'
-    }
-}
-'''爬虫清单'''
-CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode'])
-CRAWL_MENU = {
-    '招标项目': CrawlMenu('未按数据规范-招标项目', 'a_zgzbtbggfwpt_wasjgf_zbxm'),
-    '招标公告': CrawlMenu('未按数据规范-招标公告', 'a_zgzbtbggfwpt_wasjgf_zbgg'),
-    '开标记录': CrawlMenu('未按数据规范-开标记录', 'a_zgzbtbggfwpt_wasjgf_kbjl'),
-    '评标公示': CrawlMenu('未按数据规范-评标公示', 'a_zgzbtbggfwpt_wasjgf_pbgs'),
-    '中标公告': CrawlMenu('未按数据规范-中标公告', 'a_zgzbtbggfwpt_wasjgf_zhbgg'),
-    '签约履行': CrawlMenu('未按数据规范-签约履行', 'a_zgzbtbggfwpt_wasjgf_qylx'),
-}

+ 0 - 14
zgzb/kbjl.py

@@ -1,14 +0,0 @@
-from crawler.crawl_spider import crawl_spider
-
-
-def main():
-    crawl_spider(
-        crawl_category='开标记录',
-        crawl_max_page=30,
-        enable_proxy=True,
-        headless=True,
-    )
-
-
-if __name__ == '__main__':
-    main()

+ 0 - 14
zgzb/pbjs.py

@@ -1,14 +0,0 @@
-from crawler.crawl_spider import crawl_spider
-
-
-def main():
-    crawl_spider(
-        crawl_category='评标公示',
-        crawl_max_page=30,
-        enable_proxy=True,
-        headless=True,
-    )
-
-
-if __name__ == '__main__':
-    main()

+ 0 - 14
zgzb/zbgg.py

@@ -1,14 +0,0 @@
-from crawler.crawl_spider import crawl_spider
-
-
-def main():
-    crawl_spider(
-        crawl_category='招标公告',
-        crawl_max_page=30,
-        enable_proxy=True,
-        headless=True,
-    )
-
-
-if __name__ == '__main__':
-    main()

+ 0 - 14
zgzb/zbxm.py

@@ -1,14 +0,0 @@
-from crawler.crawl_spider import crawl_spider
-
-
-def main():
-    crawl_spider(
-        crawl_category='招标项目',
-        crawl_max_page=30,
-        enable_proxy=True,
-        headless=True,
-    )
-
-
-if __name__ == '__main__':
-    main()

+ 0 - 14
zgzb/zhbgg.py

@@ -1,14 +0,0 @@
-from crawler.crawl_spider import crawl_spider
-
-
-def main():
-    crawl_spider(
-        crawl_category='中标公告',
-        crawl_max_page=30,
-        enable_proxy=True,
-        headless=True,
-    )
-
-
-if __name__ == '__main__':
-    main()