# -*- coding: utf-8 -*-
"""Spider configuration file"""
import datetime
import os
import sys

# Table for items that failed to save
TAB_FAILED_ITEMS = "pyspider:s_failed_items"
# Table for failed requests
TAB_FAILED_REQUESTS = "pyspider:z_failed_requests"
# Table from which collection tasks are produced
TASK_REQUEST_PRODUCE = "pyspider_listdata"
# Table recording failed tasks
TASK_REQUEST_FAILED = "pyspider_listdata_err"
# Table for spider heartbeats and aggregate collection metrics
SPIDER_HEARTBEAT_RECORD = "pyspider_heartbeat"

# MongoDB
MONGO_IP = "172.17.4.87"
MONGO_PORT = 27080
MONGO_DB = "py_spider"
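
# A minimal sketch of how the MongoDB settings above would be consumed with
# pymongo. `get_mongo_db` is a hypothetical helper, not part of feapder's
# settings contract; the import is local so this module still loads when
# pymongo is absent.
def get_mongo_db():
    from pymongo import MongoClient  # local import: optional dependency
    client = MongoClient(MONGO_IP, MONGO_PORT)
    return client[MONGO_DB]  # e.g. get_mongo_db()[TASK_REQUEST_PRODUCE]
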
# Redis
REDISDB_IP_PORTS = "172.17.162.28:7361"
REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
REDISDB_DB = 10
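
# A hedged sketch of consuming the Redis settings with redis-py, assuming
# REDISDB_IP_PORTS holds a single "host:port" pair and REDISDB_USER_PASS is
# a bare password. `get_redis_client` is a hypothetical helper.
def get_redis_client():
    import redis  # local import: optional dependency
    host, port = REDISDB_IP_PORTS.split(":")
    return redis.Redis(host=host, port=int(port),
                       password=REDISDB_USER_PASS, db=REDISDB_DB)
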
# RabbitMQ
RABBITMQ_IP_PORT = "172.17.162.28:5672"
RABBITMQ_USER = "root"
RABBITMQ_USER_PASS = "V0O0049qBI2rV1554jLZPiBZ8H3Bo4"
RABBITMQ_EXCHANGE = "pyspider.data.spider"
RABBITMQ_EXCHANGE_TYPE = "direct"
RABBITMQ_VIRTUAL_HOST = "/"
RABBITMQ_SOCKET_TIMEOUT = 60
RABBITMQ_HEARTBEAT = 600
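
# A minimal sketch of opening a channel from the RabbitMQ settings above,
# assuming the pika client; this is illustrative only, not the connection
# code feapder's RabbitMqPipeline actually runs.
def open_rabbitmq_channel():
    import pika  # local import: optional dependency
    host, port = RABBITMQ_IP_PORT.split(":")
    params = pika.ConnectionParameters(
        host=host,
        port=int(port),
        virtual_host=RABBITMQ_VIRTUAL_HOST,
        credentials=pika.PlainCredentials(RABBITMQ_USER, RABBITMQ_USER_PASS),
        socket_timeout=RABBITMQ_SOCKET_TIMEOUT,
        heartbeat=RABBITMQ_HEARTBEAT,
    )
    return pika.BlockingConnection(params).channel()
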
# Pipelines that export items to storage
ITEM_PIPELINES = [
    # "feapder.pipelines.mongo_pipeline.MongoPipeline",
    "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
]
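
# For reference, a skeleton of a custom pipeline that could be listed in
# ITEM_PIPELINES. As far as we know feapder pipelines subclass BasePipeline
# and implement save_items(table, items), returning True on success; treat
# the exact interface as an assumption and check the installed feapder
# version. Kept commented out so this settings module never imports feapder
# back into itself.
# from feapder.pipelines import BasePipeline
#
# class ConsolePipeline(BasePipeline):  # hypothetical example pipeline
#     def save_items(self, table, items) -> bool:
#         print("saving %d item(s) into %s" % (len(items), table))
#         return True  # False would make feapder retry the batch
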
# Maximum number of export failures (saves and updates); exceeding it raises an alert
EXPORT_DATA_MAX_FAILED_TIMES = 5
# Maximum number of export retries (saves and updates); exceeding it abandons the retry
EXPORT_DATA_MAX_RETRY_TIMES = 5
COLLECTOR_TASK_COUNT = 100  # number of tasks fetched per batch

# Spider
SPIDER_THREAD_COUNT = 1  # spider concurrency; 32 is recommended when speed matters
SPIDER_MAX_RETRY_TIMES = 3  # maximum retries per request

# Browser rendering (only used by requests that opt in; see the sketch below)
DRISSIONPAGE = dict(
    pool_size=1,  # number of browser tabs
    browser_path=None,  # path to the browser executable
    scope=None,  # port range for automatically started browsers
    port=None,  # browser port
    user_data_path=None,  # user data directory
    headless=False,  # run headless or not
    load_images=False,  # load images or not
    user_agent=None,  # user agent string
    proxy=None,  # xxx.xxx.xxx.xxx:xxxx
    window_size=(1024, 800),  # window size
    driver_type="chromium",
    load_mode="normal",  # page load strategy: "normal", "eager" or "none"
    timeout=10,  # request timeout (seconds)
    retry=1,  # retries when connecting to the browser fails
    interval=0.5,  # interval between connection retries (seconds)
    page_load=15,  # page load timeout (seconds)
    render_time=20,  # render time, i.e. how long to wait for the page to finish loading
    download_path=None,  # download directory
    custom_argument=[
        "--no-sandbox",
        "--ignore-certificate-errors",
    ],
)
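
# The DRISSIONPAGE block above only takes effect for requests that opt into
# browser rendering, which is done per request in a spider, e.g. (kept as a
# comment so this settings module never imports feapder back into itself):
# def start_requests(self):
#     # render=True routes the request through RENDER_DOWNLOADER
#     yield feapder.Request("https://example.com", render=True)
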
# Network request timeout (seconds)
REQUEST_TIMEOUT = 60

# Proxy settings
PROXY_EXTRACT_API = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
# PROXY_EXTRACT_API = "http://172.17.162.28:16001/sam"
PROXY_ENABLE = True
PROXY_AUTH = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
PROXY_POOL = "feapder.network.proxy_pool.DirectProxyPool"
# PROXY_POOL = "feapder.network.proxy_pool.SpringBoardProxyPool"
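
# A hedged sketch of what a manual call to the extraction API might look
# like, assuming requests is available, that the endpoint accepts the Basic
# credential as an Authorization header, and that it answers with a plain
# "ip:port" body; feapder's proxy pool classes handle this internally.
def fetch_proxy_once():
    import requests  # local import: optional dependency
    resp = requests.get(PROXY_EXTRACT_API,
                        headers={"Authorization": PROXY_AUTH},
                        timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    return resp.text.strip()  # assumption: plain "ip:port" response
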
# Downloaders
DOWNLOADER = "feapder.network.downloader.RequestsDownloader"  # request downloader
SESSION_DOWNLOADER = "feapder.network.downloader.RequestsSessionDownloader"
# SESSION_DOWNLOADER = "feapder.network.downloader.RequestsJa3SessionDownloader"
RENDER_DOWNLOADER = "feapder.network.downloader.DrissionPageDownloader"  # render downloader
MAKE_ABSOLUTE_LINKS = True  # automatically convert relative links to absolute ones

# Item deduplication
ITEM_FILTER_ENABLE = True
ITEM_FILTER_SETTING = dict(
    filter_type=6,
    expire_time=1 * 365 * 24 * 3600,  # fingerprint expiry (seconds)
    config={
        "py": {
            "fingerprint_pref": "pylist_",
            "ip_port": "172.17.162.34:8361",
            "user_pass": "k5ZJR5KV4q7DRZ92DQ",
            "db": 0,
        },
        "lua": {
            "fingerprint_pref": "list_",
            "ip_port": "172.17.4.84:4679",
            "user_pass": "jytopnet123",
            "db": 0,
        },
    },
)
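
# A minimal sketch of the dedup idea behind ITEM_FILTER_SETTING: an item
# fingerprint (here sha256 of a stable field) gets the configured
# fingerprint_pref prefix and is set-if-absent in Redis. The key layout is
# an assumption for illustration; feapder's filter classes do the real work.
def seen_before(redis_client, item_url):
    import hashlib
    conf = ITEM_FILTER_SETTING["config"]["py"]
    key = conf["fingerprint_pref"] + hashlib.sha256(item_url.encode("utf8")).hexdigest()
    # nx=True: only set when absent; returns None if the key already existed
    added = redis_client.set(key, 1, nx=True, ex=ITEM_FILTER_SETTING["expire_time"])
    return added is None
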
# Log settings
DTIME = datetime.datetime.now().strftime("%Y-%m-%d")
LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0]  # entry script name without extension
LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME)  # e.g. my_spider.py run on 2024-01-01 -> log/2024-01-01/my_spider.log
LOG_LEVEL = "DEBUG"
LOG_COLOR = True  # colored output
LOG_IS_WRITE_TO_CONSOLE = True  # print to the console
LOG_IS_WRITE_TO_FILE = True  # write to a file
LOG_MODE = "w"  # file write mode
LOG_MAX_BYTES = 10 * 1024 * 1024  # maximum size of each log file (bytes)
LOG_BACKUP_COUNT = 20  # number of log files to keep
LOG_ENCODING = "utf8"  # log file encoding

# Endpoint for claiming detail-collection tasks
JY_TASK_URL = "http://pytask.spdata.jianyu360.com"

# OSS bucket configuration
ALI_BUCKET_CONFIG = {
    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
    "endpoint": "oss-cn-beijing-internal.aliyuncs.com",
    "bucket_name": "jy-datafile",
}
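
# A minimal sketch of consuming ALI_BUCKET_CONFIG with the oss2 SDK,
# assuming oss2 is installed; `get_oss_bucket` is a hypothetical helper.
def get_oss_bucket():
    import oss2  # local import: optional dependency
    auth = oss2.Auth(ALI_BUCKET_CONFIG["key_id"], ALI_BUCKET_CONFIG["key_secret"])
    return oss2.Bucket(auth, "https://" + ALI_BUCKET_CONFIG["endpoint"],
                       ALI_BUCKET_CONFIG["bucket_name"])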