@@ -1,124 +0,0 @@
-# -*- coding: utf-8 -*-
-"""Crawler settings"""
-import datetime
-import os
-import sys
-
-# Table for items that failed to save
-TAB_FAILED_ITEMS = "pyspider:s_failed_items"
-# Table for failed requests
-TAB_FAILED_REQUESTS = "pyspider:z_failed_requests"
-# Table where crawl tasks are produced
-TASK_REQUEST_PRODUCE = "pyspider_listdata"
-# Table recording failed tasks
-TASK_REQUEST_FAILED = "pyspider_listdata_err"
-# Summary table for spider data-collection metrics
-SPIDER_HEARTBEAT_RECORD = "pyspider_heartbeat"
-
-# MONGO
-MONGO_IP = "172.20.45.130"
-MONGO_PORT = 27017
-MONGO_DB = "py_spider"
-
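For orientation, a minimal sketch of reaching this MongoDB directly with pymongo (an assumed dependency here; feapder's MongoPipeline builds the connection from these settings itself):

```python
from pymongo import MongoClient

# Values taken from the MONGO block above.
client = MongoClient(host="172.20.45.130", port=27017)
db = client["py_spider"]
print(db.list_collection_names())  # quick connectivity check
```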
-# REDIS
-REDISDB_IP_PORTS = "172.20.45.129:3379"
-REDISDB_USER_PASS = "jianyu@python"
-REDISDB_DB = 3
-
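The same goes for Redis; a hedged sketch with redis-py, splitting the "ip:port" string the way the setting name suggests:

```python
import redis

host, port = "172.20.45.129:3379".split(":")  # from REDISDB_IP_PORTS
r = redis.Redis(host=host, port=int(port), password="jianyu@python", db=3)
r.ping()  # raises on unreachable server or failed auth
```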
-# RabbitMQ
-RABBITMQ_IP_PORT = '172.31.31.204:5672'
-RABBITMQ_USER = 'root'
-RABBITMQ_USER_PASS = '123123'
-RABBITMQ_EXCHANGE = 'py_spider'
-RABBITMQ_EXCHANGE_TYPE = 'direct'
-RABBITMQ_VIRTUAL_HOST = '/'
-RABBITMQ_SOCKET_TIMEOUT = 60
-RABBITMQ_HEARTBEAT = 600
-
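These values map one-to-one onto a pika connection; a sketch assuming pika as the client library (feapder's RabbitMQ pipeline would consume the settings itself):

```python
import pika

params = pika.ConnectionParameters(
    host="172.31.31.204",
    port=5672,
    virtual_host="/",
    credentials=pika.PlainCredentials("root", "123123"),
    heartbeat=600,
    socket_timeout=60,
)
conn = pika.BlockingConnection(params)
channel = conn.channel()
channel.exchange_declare(exchange="py_spider", exchange_type="direct")
conn.close()
```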
-# Item persistence pipelines
-ITEM_PIPELINES = [
-    "feapder.pipelines.mongo_pipeline.MongoPipeline",
-    # "feapder.pipelines.redis_pipeline.RedisPipeline",
-    # "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
-]
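A custom pipeline can sit alongside MongoPipeline in this list; the shape below follows feapder's documented pipeline pattern, though the exact base-class import and signature are assumptions worth verifying:

```python
from typing import Dict, List

from feapder.pipelines import BasePipeline


class ConsolePipeline(BasePipeline):
    """Hypothetical pipeline: prints items instead of persisting them."""

    def save_items(self, table: str, items: List[Dict]) -> bool:
        # Returning False tells feapder the batch failed and should be retried.
        print(f"{table}: {len(items)} item(s)")
        return True
```

Registering it is then a matter of appending its dotted path to ITEM_PIPELINES.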
-# Max export failures (saves and updates); an alert fires once exceeded
-EXPORT_DATA_MAX_FAILED_TIMES = 5
-# Max export retries (saves and updates); retrying stops once exceeded
-EXPORT_DATA_MAX_RETRY_TIMES = 5
-
-COLLECTOR_TASK_COUNT = 100  # number of tasks fetched per batch
-
-# Spider
-SPIDER_THREAD_COUNT = 1  # spider concurrency; 32 is recommended when speed matters
-SPIDER_MAX_RETRY_TIMES = 3  # max retries per request
-
-# Browser rendering
-WEBDRIVER = dict(
-    pool_size=1,  # number of browsers
-    load_images=False,  # whether to load images
-    user_agent=None,  # a string, or a zero-argument function returning the user agent
-    proxy=None,  # "xxx.xxx.xx.xxx:xxxx", or a zero-argument function returning the proxy address
-    headless=True,  # whether to run headless
-    driver_type="FIREFOX",  # CHROME or FIREFOX
-    timeout=3,  # request timeout
-    window_size=(1280, 800),  # window size
-    executable_path='/Users/dongzhaorui/Desktop/dzr/pymain/py-tools/settings/geckodriver',  # driver path; omit to use the default
-    render_time=0,  # seconds to wait after opening the page before grabbing the source
-    custom_argument=["--ignore-certificate-errors"],  # extra browser arguments
-    usages_local_driver=False,  # whether to use the local driver
-    server_addr="http://172.31.31.204:8899/wd/hub",  # Selenium remote server address
-    version="",  # remote browser version
-    service_log_path=os.devnull  # driver log path
-)
-
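Since user_agent and proxy accept either a literal value or a zero-argument callable, per-browser rotation can be sketched like this (the pool contents and helper name are placeholders):

```python
import random

UA_POOL = [
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
]

def random_user_agent() -> str:
    # Called with no arguments; the return value becomes the user agent.
    return random.choice(UA_POOL)

# e.g. WEBDRIVER = dict(..., user_agent=random_user_agent, proxy=lambda: "127.0.0.1:8888")
```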
-SAVE_FAILED_REQUEST = False
-RETRY_FAILED_REQUESTS = False
-
-# network request timeout
-REQUEST_TIMEOUT = 60
-
-# Proxy settings; the extraction API returns proxies separated by \r\n
-PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"
-PROXY_ENABLE = True
-# self-hosted proxy pool
-JY_PROXY_URL = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
-JY_PROXY_AUTHOR = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
-
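A sketch of consuming both proxy sources with requests (an assumed dependency; the \r\n separator comes from the comment above):

```python
import requests

# Extraction API: returns all proxies as one \r\n-separated block.
resp = requests.get("http://proxy.spdata.jianyu360.com/proxy/getallip", timeout=10)
proxies = [line for line in resp.text.split("\r\n") if line]

# Self-hosted pool: authenticated via the JY_PROXY_AUTHOR header value.
resp = requests.get(
    "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch",
    headers={"Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"},
    timeout=10,
)
```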
-# task center
-JY_TASK_URL = "http://pytask.spdata.jianyu360.com"
-
-# Splash rendering service
-SPLASH_API = "http://splash.spdata.jianyu360.com/render.json"
-
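Splash's render.json endpoint takes the target page and rendering options as query parameters; a minimal sketch (the url and wait values are arbitrary examples):

```python
import requests

resp = requests.get(
    "http://splash.spdata.jianyu360.com/render.json",
    params={"url": "https://example.com", "wait": 2, "html": 1},
    timeout=30,
)
html = resp.json().get("html", "")  # rendered page source
```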
-# captcha-solving service
-CAPTCHA_URL = "http://pycaptcha.spdata.jianyu360.com"
-
-# OSS (object storage) config
-ALI_BUCKET_CONFIG = {
-    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
-    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
-    "endpoint": "oss-cn-beijing.aliyuncs.com",
-    "bucket_name": "jy-datafile"
-}
-
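These four keys line up with the oss2 SDK's Auth/Bucket pair; a hedged sketch (oss2 as the client is an assumption, and the uploaded object is purely illustrative):

```python
import oss2

cfg = {
    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
    "endpoint": "oss-cn-beijing.aliyuncs.com",
    "bucket_name": "jy-datafile",
}
auth = oss2.Auth(cfg["key_id"], cfg["key_secret"])
bucket = oss2.Bucket(auth, "https://" + cfg["endpoint"], cfg["bucket_name"])
bucket.put_object("demo/hello.txt", b"hello")  # illustrative upload
```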
-# item deduplication
-ITEM_FILTER_ENABLE = False
-ITEM_FILTER_SETTING = dict(
-    filter_type=5,  # Redis-based dedup
-    expire_time=86400,  # entries expire after one day
-    redis_url="redis://default:jianyu@python@172.20.45.129:3379/2"
-)
-
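One caution on redis_url: the password contains an @, which not every URL parser splits correctly; percent-encoding it is the defensive form (a suggestion, not a reported failure):

```python
from urllib.parse import quote

password = quote("jianyu@python", safe="")  # -> "jianyu%40python"
redis_url = f"redis://default:{password}@172.20.45.129:3379/2"
```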
-# logging
-DTIME = datetime.datetime.now().strftime("%Y-%m-%d")
-LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0]
-LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME)  # where log files are stored
-LOG_LEVEL = "DEBUG"
-LOG_COLOR = True  # colored output
-LOG_IS_WRITE_TO_CONSOLE = True  # print to the console
-LOG_IS_WRITE_TO_FILE = True  # write to a file
-LOG_MODE = "w"  # file write mode
-LOG_MAX_BYTES = 10 * 1024 * 1024  # max bytes per log file
-LOG_BACKUP_COUNT = 1  # number of rotated log files to keep
-LOG_ENCODING = "utf8"  # log file encoding
-OTHERS_LOG_LEVAL = "ERROR"  # log level for third-party libraries; rarely needed
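As a concrete reading of these settings: running `python my_spider.py` (a hypothetical script name) on 2024-05-01 gives LOG_NAME "my_spider" and LOG_PATH "log/2024-05-01/my_spider.log"; the file is truncated on each start because LOG_MODE is "w", and rotated with one backup once it passes 10 MiB.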