setting.py

# -*- coding: utf-8 -*-
"""Spider configuration file"""
import datetime
import os
import sys
# Table for items that failed to save
TAB_FAILED_ITEMS = "pyspider:s_failed_items"
# Table for failed requests
TAB_FAILED_REQUESTS = "pyspider:z_failed_requests"
# Table where crawl tasks are produced
TASK_REQUEST_PRODUCE = "pyspider_listdata"
# Table recording failed tasks
TASK_REQUEST_FAILED = "pyspider_listdata_err"
# Summary table of spider crawl metrics (heartbeat)
SPIDER_HEARTBEAT_RECORD = "pyspider_heartbeat"
# MONGO
MONGO_IP = "172.20.45.130"
MONGO_PORT = 27017
MONGO_DB = "py_spider"
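# A minimal sketch (not part of feapder itself, which builds its own Mongo
# connection from these settings) of querying the task table ad hoc with
# pymongo; assumes pymongo is installed and TASK_REQUEST_PRODUCE is a
# collection in this database:
# from pymongo import MongoClient
# mongo_db = MongoClient(MONGO_IP, MONGO_PORT)[MONGO_DB]
# pending = mongo_db[TASK_REQUEST_PRODUCE].count_documents({})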
# REDIS
REDISDB_IP_PORTS = "172.20.45.129:3379"
REDISDB_USER_PASS = "jianyu@python"
REDISDB_DB = 3
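# Rough usage sketch (feapder parses the REDISDB_* settings itself); assumes
# REDISDB_USER_PASS holds only the password and redis-py is installed:
# import redis
# _host, _port = REDISDB_IP_PORTS.split(":")
# r = redis.Redis(host=_host, port=int(_port), password=REDISDB_USER_PASS, db=REDISDB_DB)
# r.ping()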
# RabbitMQ
RABBITMQ_IP_PORT = '172.31.31.204:5672'
RABBITMQ_USER = 'root'
RABBITMQ_USER_PASS = '123123'
RABBITMQ_EXCHANGE = 'py_spider'
RABBITMQ_EXCHANGE_TYPE = 'direct'
RABBITMQ_VIRTUAL_HOST = '/'
RABBITMQ_SOCKET_TIMEOUT = 60
RABBITMQ_HEARTBEAT = 600
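# Illustrative only: how these values could be fed to a pika connection; this
# is a sketch, not feapder's internal code, and assumes pika is installed:
# import pika
# _host, _port = RABBITMQ_IP_PORT.split(":")
# _conn = pika.BlockingConnection(pika.ConnectionParameters(
#     host=_host, port=int(_port), virtual_host=RABBITMQ_VIRTUAL_HOST,
#     credentials=pika.PlainCredentials(RABBITMQ_USER, RABBITMQ_USER_PASS),
#     socket_timeout=RABBITMQ_SOCKET_TIMEOUT, heartbeat=RABBITMQ_HEARTBEAT))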
# Pipelines that write scraped items to storage
ITEM_PIPELINES = [
    "feapder.pipelines.mongo_pipeline.MongoPipeline",
    # "feapder.pipelines.redis_pipeline.RedisPipeline",
    # "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
]
# Max failures when exporting data (saves and updates); an alert is raised beyond this count
EXPORT_DATA_MAX_FAILED_TIMES = 5
# Max retries when exporting data (saves and updates); retrying is abandoned beyond this count
EXPORT_DATA_MAX_RETRY_TIMES = 5
COLLECTOR_TASK_COUNT = 100  # number of tasks fetched per batch
# Spider
SPIDER_THREAD_COUNT = 1  # spider concurrency; 32 is recommended when speed matters
SPIDER_MAX_RETRY_TIMES = 3  # max retries per request
# Browser rendering
WEBDRIVER = dict(
    pool_size=1,  # number of browsers
    load_images=False,  # whether to load images
    user_agent=None,  # a string, or a zero-argument function that returns the user_agent
    proxy=None,  # xxx.xxx.xx.xxx:xxxx, or a zero-argument function that returns the proxy address
    headless=True,  # whether to run headless
    driver_type="FIREFOX",  # CHROME or FIREFOX
    timeout=3,  # request timeout
    window_size=(1280, 800),  # window size
    executable_path='/Users/dongzhaorui/Desktop/dzr/pymain/py-tools/settings/geckodriver',  # driver path; defaults to the standard location
    render_time=0,  # render time: seconds to wait after the page opens before grabbing the source
    custom_argument=["--ignore-certificate-errors"],  # custom browser arguments
    usages_local_driver=False,  # whether to use a local driver
    server_addr="http://172.31.31.204:8899/wd/hub",  # Selenium remote server address
    version="",  # remote browser version
    service_log_path=os.devnull,  # log path
)
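# Sketch of what usages_local_driver=False plus server_addr imply, i.e. driving
# a remote Firefox over Selenium Grid; not feapder's internal code, shown only
# to document the intent of these fields:
# from selenium import webdriver
# _opts = webdriver.FirefoxOptions()
# if WEBDRIVER["headless"]:
#     _opts.add_argument("--headless")
# for _arg in WEBDRIVER["custom_argument"]:
#     _opts.add_argument(_arg)
# _driver = webdriver.Remote(command_executor=WEBDRIVER["server_addr"], options=_opts)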
SAVE_FAILED_REQUEST = False
RETRY_FAILED_REQUESTS = False
# Network request timeout
REQUEST_TIMEOUT = 60
# Proxy settings: proxy extraction API; returned proxies are separated by \r\n
PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"
PROXY_ENABLE = True
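# The extraction API returns plain text with proxies separated by \r\n, so a
# consumer would look roughly like this (sketch; requests assumed installed):
# import requests
# _resp = requests.get(PROXY_EXTRACT_API, timeout=REQUEST_TIMEOUT)
# proxies = [p for p in _resp.text.split("\r\n") if p]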
# Self-hosted proxy pool
JY_PROXY_URL = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
JY_PROXY_AUTHOR = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
# Task center
JY_TASK_URL = "http://pytask.spdata.jianyu360.com"
# Splash rendering service
SPLASH_API = "http://splash.spdata.jianyu360.com/render.json"
# Captcha service
CAPTCHA_URL = "http://pycaptcha.spdata.jianyu360.com"
# OSS (Aliyun object storage) configuration
ALI_BUCKET_CONFIG = {
    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
    "endpoint": "oss-cn-beijing.aliyuncs.com",
    "bucket_name": "jy-datafile"
}
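# Hedged sketch of using this bucket with the oss2 SDK (the object key below
# is just an example, not a path the project necessarily uses):
# import oss2
# _auth = oss2.Auth(ALI_BUCKET_CONFIG["key_id"], ALI_BUCKET_CONFIG["key_secret"])
# _bucket = oss2.Bucket(_auth, ALI_BUCKET_CONFIG["endpoint"], ALI_BUCKET_CONFIG["bucket_name"])
# _bucket.put_object("example/attachment.pdf", b"file bytes")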
# Item deduplication
ITEM_FILTER_ENABLE = False
ITEM_FILTER_SETTING = dict(
    filter_type=5,  # Redis-based dedup
    expire_time=86400,  # expiry: 1 day
    redis_url="redis://default:jianyu@python@172.20.45.129:3379/2"
)
# Logging settings
DTIME = datetime.datetime.now().strftime("%Y-%m-%d")
LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0]
LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME)  # log file path
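# Example (hypothetical script name and date): running "python my_spider.py"
# on 2024-05-01 yields LOG_NAME = "my_spider" and LOG_PATH = "log/2024-05-01/my_spider.log".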
LOG_LEVEL = "DEBUG"
LOG_COLOR = True  # colored output
LOG_IS_WRITE_TO_CONSOLE = True  # print to console
LOG_IS_WRITE_TO_FILE = True  # write to file
LOG_MODE = "w"  # file write mode
LOG_MAX_BYTES = 10 * 1024 * 1024  # max bytes per log file
LOG_BACKUP_COUNT = 1  # number of log files to keep
LOG_ENCODING = "utf8"  # log file encoding
OTHERS_LOG_LEVAL = "ERROR"  # log level for third-party libraries; rarely needed