# -*- coding: utf-8 -*- """爬虫配置文件""" import datetime import os import sys # 数据保存失败表 TAB_FAILED_ITEMS = "pyspider:s_failed_items" # 任务失败表 TAB_FAILED_REQUESTS = "pyspider:z_failed_requests" # 采集任务生产表 TASK_REQUEST_PRODUCE = "pyspider_listdata" # 失败任务记录表 TASK_REQUEST_FAILED = "pyspider_listdata_err" # 爬虫心跳与数据采集汇总统计指标表 SPIDER_HEARTBEAT_RECORD = "pyspider_heartbeat" # MONGODB MONGO_IP = "172.17.4.87" MONGO_PORT = 27080 MONGO_DB = "py_spider" # REDIS REDISDB_IP_PORTS = "172.17.162.28:7361" REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ" REDISDB_DB = 10 # rabbitMq RABBITMQ_IP_PORT = "172.17.162.28:5672" RABBITMQ_USER = "root" RABBITMQ_USER_PASS = "V0O0049qBI2rV1554jLZPiBZ8H3Bo4" RABBITMQ_EXCHANGE = "pyspider.data.spider" RABBITMQ_EXCHANGE_TYPE = "direct" RABBITMQ_VIRTUAL_HOST = "/" RABBITMQ_SOCKET_TIMEOUT = 60 RABBITMQ_HEARTBEAT = 600 # 数据入库的pipeline ITEM_PIPELINES = [ # "feapder.pipelines.mongo_pipeline.MongoPipeline", "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline", ] # 导出数据时最大的失败次数,包括保存和更新,超过这个次数报警 EXPORT_DATA_MAX_FAILED_TIMES = 5 # 导出数据时最大的重试次数,包括保存和更新,超过这个次数则放弃重试 EXPORT_DATA_MAX_RETRY_TIMES = 5 COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量 # 爬虫 SPIDER_THREAD_COUNT = 1 # 爬虫并发数,追求速度推荐32 SPIDER_MAX_RETRY_TIMES = 3 # 每个请求最大重试次数 # 浏览器渲染 DRISSIONPAGE = dict( pool_size=1, # 浏览器标签页的数量 browser_path=None, # 浏览器可执行文件路径 scope=None, # 自动浏览器端口范围 port=None, # 浏览器端口 user_data_path=None, # 用户数据目录 headless=False, # 是否为无头浏览器 load_images=False, # 是否加载图片 user_agent=None, # 字符串 proxy=None, # xxx.xxx.xxx.xxx:xxxx window_size=(1024, 800), # 窗口大小 driver_type="chromium", load_mode="normal", # 网页加载策略, 可选值:"normal", "eager", "none" timeout=10, # 请求超时时间 retry=1, # 连接失败浏览器重试次数 interval=0.5, # 连接失败重试间隔(秒) page_load=15, # 页面加载超时时间(秒) render_time=20, # 渲染时长,即打开网页等待加载超时时间 download_path=None, # 下载文件的路径 custom_argument=[ "--no-sandbox", "--ignore-certificate-errors" ] ) # request网络请求超时时间 REQUEST_TIMEOUT = 60 # 设置代理 PROXY_EXTRACT_API = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch" # PROXY_EXTRACT_API = "http://172.17.162.28:16001/sam" PROXY_ENABLE = True PROXY_AUTH = "Basic amlhbnl1MDAxOjEyM3F3ZSFB" PROXY_POOL = "feapder.network.proxy_pool.DirectProxyPool" # PROXY_POOL = "feapder.network.proxy_pool.SpringBoardProxyPool" # 下载器 DOWNLOADER = "feapder.network.downloader.RequestsDownloader" # 请求下载器 SESSION_DOWNLOADER = "feapder.network.downloader.RequestsSessionDownloader" # SESSION_DOWNLOADER = "feapder.network.downloader.RequestsJa3SessionDownloader" RENDER_DOWNLOADER = "feapder.network.downloader.DrissionPageDownloader" # 渲染下载器 MAKE_ABSOLUTE_LINKS = True # 自动转成绝对连接 # item去重 ITEM_FILTER_ENABLE = True ITEM_FILTER_SETTING = dict( filter_type=6, expire_time=1 * 365 * 24 * 3600, # 过期时间 config={ "py": { "fingerprint_pref": "pylist_", "ip_port": "172.17.162.34:8361", "user_pass": "k5ZJR5KV4q7DRZ92DQ", "db": 0 }, "lua": { "fingerprint_pref": "list_", "ip_port": "172.17.4.84:4679", "user_pass": "jytopnet123", "db": 0 } } ) # 日志设置 DTIME = datetime.datetime.now().strftime("%Y-%m-%d") LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0] LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME) # log存储路径 LOG_LEVEL = "DEBUG" LOG_COLOR = True # 是否带有颜色 LOG_IS_WRITE_TO_CONSOLE = True # 是否打印到控制台 LOG_IS_WRITE_TO_FILE = True # 是否写文件 LOG_MODE = "w" # 写文件的模式 LOG_MAX_BYTES = 10 * 1024 * 1024 # 每个日志文件的最大字节数 LOG_BACKUP_COUNT = 20 # 日志文件保留数量 LOG_ENCODING = "utf8" # 日志文件编码 # 详情采集任务领取接口 JY_TASK_URL = "http://pytask.spdata.jianyu360.com" # bucket配置 ALI_BUCKET_CONFIG = { "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi", "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh", "endpoint": "oss-cn-beijing-internal.aliyuncs.com", "bucket_name": "jy-datafile" }