setting.py

# -*- coding: utf-8 -*-
"""Spider settings."""
import datetime
import os
import sys

# Table for items that failed to be saved
TAB_FAILED_ITEMS = 'pyspider:s_failed_items'
# Table for failed requests
TAB_FAILED_REQUESTS = 'pyspider:z_failed_requests'
# Table recording the state of pending tasks
TASK_CRAWL_STATE = "pyspider:t_crawl_state"
# Table recording failed tasks
TASK_REQUEST_FAILED = "pyspider_listdata_err"
# Crawl summary table
SPIDER_HEARTBEAT_RECORD = "spider_heartbeat"  # spider heartbeat record table name

# MongoDB
MONGO_IP = "192.168.3.182"
MONGO_PORT = 27017
MONGO_DB = "py_spider"

# Redis
REDISDB_IP_PORTS = "192.168.3.182:6379"
REDISDB_USER_PASS = "jianyu@python"
REDISDB_DB = 2
# Redis key prefix under which crawl information is stored
REDIS_KEY = "py_spider"

# RabbitMQ
RABBITMQ_IP_PORT = '192.168.3.182:5672'
RABBITMQ_USER = 'root'
RABBITMQ_USER_PASS = '123123'
RABBITMQ_EXCHANGE = 'py_spider'
RABBITMQ_EXCHANGE_TYPE = 'direct'
RABBITMQ_VIRTUAL_HOST = '/'
RABBITMQ_SOCKET_TIMEOUT = 60
RABBITMQ_HEARTBEAT = 600

# Pipelines used to persist items
ITEM_PIPELINES = [
    "feapder.pipelines.mongo_pipeline.MongoPipeline",
    # "feapder.pipelines.redis_pipeline.RedisPipeline",
    # "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
]
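
# The entries above are dotted import paths to pipeline classes. A minimal
# sketch of a custom pipeline that could be added to the list (hedged: the
# class and module name below are illustrative, not part of this project;
# the method signature follows feapder's BasePipeline interface):
#
#   from feapder.pipelines import BasePipeline
#
#   class ConsolePipeline(BasePipeline):
#       def save_items(self, table, items) -> bool:
#           # Print instead of persisting; return True to mark the batch as exported.
#           print(table, items)
#           return True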

# Maximum number of export failures (saves and updates); exceeding it triggers an alert
EXPORT_DATA_MAX_FAILED_TIMES = 5
# Maximum number of export retries (saves and updates); exceeding it stops retrying
EXPORT_DATA_MAX_RETRY_TIMES = 5

COLLECTOR_TASK_COUNT = 100  # number of tasks fetched per batch

# Spider
SPIDER_THREAD_COUNT = 1  # spider concurrency; 32 is recommended when speed matters
SPIDER_MAX_RETRY_TIMES = 3  # maximum retries per request

# Browser rendering
WEBDRIVER = dict(
    pool_size=1,  # number of browsers
    load_images=False,  # whether to load images
    user_agent=None,  # a string, or a no-argument function returning the user agent
    proxy=None,  # xxx.xxx.xx.xxx:xxxx, or a no-argument function returning the proxy address
    headless=True,  # whether to run headless
    driver_type="FIREFOX",  # CHROME or FIREFOX
    timeout=3,  # request timeout
    window_size=(1280, 800),  # window size
    executable_path='/Users/dongzhaorui/Documents/dzr/pyscripts/Spiders/settings/geckodriver',  # browser driver path; defaults to the standard lookup
    render_time=0,  # render time, i.e. how long to wait after opening the page before grabbing the source
    custom_argument=["--ignore-certificate-errors"],  # custom browser arguments
    usages_local_driver=False,  # whether to use the local driver
    server_addr="http://192.168.3.182:8899/wd/hub",  # selenium remote server address
    version="",  # remote browser version
    service_log_path=os.devnull  # log path
)
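
# The WEBDRIVER block above is only consulted for requests that opt in to
# browser rendering. A minimal usage sketch (the spider class and URL are
# placeholders; feapder's Request accepts a `render` flag):
#
#   import feapder
#
#   class RenderSpider(feapder.AirSpider):
#       def start_requests(self):
#           yield feapder.Request("https://example.com", render=True)
#
#       def parse(self, request, response):
#           print(response.text)
#
#   # RenderSpider(thread_count=SPIDER_THREAD_COUNT).start()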

# Network request timeout
REQUEST_TIMEOUT = 60

# Proxy settings; the extraction API returns proxies separated by \r\n
PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"
PROXY_ENABLE = True
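
# A minimal sketch of consuming the extraction API above (assumes the endpoint
# returns a plain-text list separated by \r\n, as noted in the comment;
# `requests` is not a dependency of this settings file):
#
#   import requests
#
#   def fetch_proxies():
#       resp = requests.get(PROXY_EXTRACT_API, timeout=10)
#       return [p for p in resp.text.split("\r\n") if p]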

# Item deduplication
ITEM_FILTER_ENABLE = False
ITEM_FILTER_SETTING = dict(
    filter_type=5,  # redis-based dedup
    expire_time=86400,  # expiry time: 1 day
    redis_url='redis://default:top@123@192.168.3.165:8165/2'
)

# Logging
DTIME = datetime.datetime.now().strftime("%Y-%m-%d")
LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0]
LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME)  # log file path
LOG_LEVEL = "DEBUG"
LOG_COLOR = True  # whether log output is colored
LOG_IS_WRITE_TO_CONSOLE = True  # whether to print to the console
LOG_IS_WRITE_TO_FILE = True  # whether to write to a file
LOG_MODE = "w"  # file write mode
LOG_MAX_BYTES = 10 * 1024 * 1024  # maximum size of each log file in bytes
LOG_BACKUP_COUNT = 20  # number of log files to keep
LOG_ENCODING = "utf8"  # log file encoding
OTHERS_LOG_LEVAL = "ERROR"  # log level for third-party libraries; rarely needs changing

# Self-hosted proxy pool
SWORDFISH_PROXY_URL = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
SWORDFISH_PROXY_AUTHOR = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
# Splash rendering service
SWORDFISH_RENDER_URL = "http://splash.spdata.jianyu360.com/render.json"
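
# A hedged sketch of calling the two services above. The proxy endpoint is
# assumed to expect the Authorization header configured in
# SWORDFISH_PROXY_AUTHOR (its response format is not specified here); the
# Splash request uses the standard /render.json parameters (`url`, `wait`):
#
#   import requests
#
#   proxy_resp = requests.get(
#       SWORDFISH_PROXY_URL,
#       headers={"Authorization": SWORDFISH_PROXY_AUTHOR},
#       timeout=10,
#   )
#
#   rendered = requests.get(
#       SWORDFISH_RENDER_URL,
#       params={"url": "https://example.com", "wait": 2},
#       timeout=30,
#   ).json()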

# Remote bucket configuration
ALI_BUCKET_CONFIG = {
    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
    "endpoint": "oss-cn-beijing-internal.aliyuncs.com",
    "bucket_name": "jy-datafile"
}
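
# A sketch of using ALI_BUCKET_CONFIG with the official `oss2` SDK (assumes
# the project uploads files via oss2; the object key and payload below are
# placeholders):
#
#   import oss2
#
#   auth = oss2.Auth(ALI_BUCKET_CONFIG["key_id"], ALI_BUCKET_CONFIG["key_secret"])
#   bucket = oss2.Bucket(auth, ALI_BUCKET_CONFIG["endpoint"], ALI_BUCKET_CONFIG["bucket_name"])
#   bucket.put_object("attachments/example.txt", b"hello")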