setting.py

# -*- coding: utf-8 -*-
"""Spider configuration file"""
import datetime
import os
import sys

# MONGODB
MONGO_IP = "172.17.4.87"
MONGO_PORT = 27080
MONGO_DB = "py_spider"

# REDIS
REDISDB_IP_PORTS = "172.17.4.232:7361"
REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
REDISDB_DB = 12
# Redis key under which the spider's collected task data is stored
REDIS_KEY = "py_spider"

# RabbitMQ
RABBITMQ_IP_PORT = '172.17.4.232:5672'
RABBITMQ_USER = 'root'
RABBITMQ_USER_PASS = 'V0O0049qBI2rV1554jLZPiBZ8H3Bo4'
RABBITMQ_EXCHANGE = 'py_spider'
RABBITMQ_EXCHANGE_TYPE = 'direct'
RABBITMQ_VIRTUAL_HOST = '/'
RABBITMQ_SOCKET_TIMEOUT = 60
RABBITMQ_HEARTBEAT = 600
# Pipelines that write scraped items to storage
ITEM_PIPELINES = [
    # "feapder.pipelines.mongo_pipeline.MongoPipeline",
    # "feapder.pipelines.redis_pipeline.RedisPipeline",
    "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
]
# Maximum number of failures (saves and updates) when exporting data; exceeding it raises an alert
EXPORT_DATA_MAX_FAILED_TIMES = 5
# Maximum number of retries (saves and updates) when exporting data; exceeding it abandons the retry
EXPORT_DATA_MAX_RETRY_TIMES = 5
COLLECTOR_TASK_COUNT = 100  # Number of tasks fetched per batch
# Spider
SPIDER_HEARTBEAT = "spider_heartbeat"  # Spider heartbeat
SPIDER_THREAD_COUNT = 1  # Spider concurrency; 32 is recommended when speed matters
SPIDER_MAX_RETRY_TIMES = 3  # Maximum number of retries per request
# Browser rendering
WEBDRIVER = dict(
    pool_size=1,  # Number of browser instances
    load_images=False,  # Whether to load images
    user_agent=None,  # A string, or a zero-argument function that returns the user agent
    proxy=None,  # xxx.xxx.xx.xxx:xxxx, or a zero-argument function that returns the proxy address
    headless=False,  # Whether to run the browser headless
    driver_type="CHROME",  # CHROME or FIREFOX
    timeout=30,  # Request timeout
    window_size=(1280, 800),  # Window size
    executable_path=None,  # Browser executable path; None uses the default path
    render_time=0,  # Render time: wait this long after opening the page before grabbing the source
    custom_argument=["--ignore-certificate-errors"],  # Custom browser arguments
    usages_local_driver=False,  # Whether to use a local driver
    server_addr="http://172.17.4.232:6666/wd/hub",  # Selenium remote server address
    version="",  # Remote browser version
    service_log_path=os.devnull,  # Driver log path
)
# On spider startup, re-save items that previously failed to be stored
RETRY_FAILED_ITEMS = True
# Whether to save failed requests
SAVE_FAILED_REQUEST = False
# Network request timeout
REQUEST_TIMEOUT = 60

# Proxy settings: proxy extraction API; the returned proxies are separated by \r\n
PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"
PROXY_ENABLE = True
# Item deduplication
ITEM_FILTER_ENABLE = True
ITEM_FILTER_SETTING = dict(
    filter_type=6,
    redis_conf=dict(
        pylist_=dict(
            redisdb_ip_port="172.17.4.240:8361",
            redisdb_user_pass="k5ZJR5KV4q7DRZ92DQ",
            redisdb_db=0
        ),
        list_=dict(
            redisdb_ip_port="172.17.4.84:4679",
            redisdb_user_pass="jytopnet123",
            redisdb_db=0
        )
    ),
    expire_time=63072000,  # Expiration time: 2 years
)
# Log settings
DTIME = datetime.datetime.now().strftime("%Y-%m-%d")
LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0]
LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME)  # Log storage path
LOG_LEVEL = "ERROR"
LOG_COLOR = True  # Whether log output is colored
LOG_IS_WRITE_TO_CONSOLE = True  # Whether to print to the console
LOG_IS_WRITE_TO_FILE = True  # Whether to write to a log file
LOG_MODE = "w"  # File write mode
LOG_MAX_BYTES = 10 * 1024 * 1024  # Maximum size of each log file in bytes
LOG_BACKUP_COUNT = 20  # Number of log files to keep
LOG_ENCODING = "utf8"  # Log file encoding
OTHERS_LOG_LEVAL = "ERROR"  # Log level for third-party libraries; rarely needed
# Self-hosted proxy pool
SWORDFISH_PROXY_URL = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
SWORDFISH_PROXY_AUTHOR = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"

# Splash rendering service
SWORDFISH_RENDER_URL = "http://splash.spdata.jianyu360.com/render.json"

# Spider heartbeat
RECORD_SPIDER_HEARTBEAT = "spider_heartbeat"  # Table name for spider heartbeat records
# Remote bucket configuration
ALI_BUCKET_CONFIG = {
    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
    "endpoint": "oss-cn-beijing-internal.aliyuncs.com",
    "bucket_name": "jy-datafile"
}
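
For context, a minimal sketch of how this configuration is consumed: feapder loads setting.py from the working directory automatically, so a spider started next to this file runs with the values above (thread count, retries, pipelines, logging, and so on). The spider class, file name, and URL below are illustrative assumptions, not part of the original project; AirSpider is feapder's lightweight standalone spider variant.

# run_spider.py - illustrative sketch, assumes setting.py sits in the same directory
import feapder


class DemoSpider(feapder.AirSpider):
    def start_requests(self):
        # Placeholder URL for demonstration only
        yield feapder.Request("https://example.com")

    def parse(self, request, response):
        # Items yielded from parse would be exported through ITEM_PIPELINES;
        # here we only print part of the page source
        print(response.text[:100])


if __name__ == "__main__":
    DemoSpider().start()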