# -*- coding: utf-8 -*-
"""Spider configuration file"""
import datetime
import os
import sys

# Table for items that failed to save
TAB_FAILED_ITEMS = "pyspider:s_failed_items"
# Table for failed requests
TAB_FAILED_REQUESTS = "pyspider:z_failed_requests"
# Table where crawl tasks are produced
TASK_REQUEST_PRODUCE = "pyspider_listdata"
# Table recording failed tasks
TASK_REQUEST_FAILED = "pyspider_listdata_err"
# Table for spider heartbeats and aggregate collection metrics
SPIDER_HEARTBEAT_RECORD = "pyspider_heartbeat"

# MongoDB
MONGO_IP = "172.17.4.87"
MONGO_PORT = 27080
MONGO_DB = "py_spider"

# Redis
REDISDB_IP_PORTS = "172.17.162.28:7361"
REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
REDISDB_DB = 10

# RabbitMQ
RABBITMQ_IP_PORT = "172.17.162.28:5672"
RABBITMQ_USER = "root"
RABBITMQ_USER_PASS = "V0O0049qBI2rV1554jLZPiBZ8H3Bo4"
RABBITMQ_EXCHANGE = "pyspider.data.spider"
RABBITMQ_EXCHANGE_TYPE = "direct"
RABBITMQ_VIRTUAL_HOST = "/"
RABBITMQ_SOCKET_TIMEOUT = 60
RABBITMQ_HEARTBEAT = 600
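# Hedged consumer sketch (pika is an assumption; the project's own consumer is not
# part of this file): a downstream worker could bind to the exchange above along
# the lines of
#   creds = pika.PlainCredentials(RABBITMQ_USER, RABBITMQ_USER_PASS)
#   params = pika.ConnectionParameters(host="172.17.162.28", port=5672,
#                                      virtual_host=RABBITMQ_VIRTUAL_HOST,
#                                      credentials=creds)
#   channel = pika.BlockingConnection(params).channel()
#   channel.exchange_declare(exchange=RABBITMQ_EXCHANGE,
#                            exchange_type=RABBITMQ_EXCHANGE_TYPE)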
# Pipelines that persist scraped items
ITEM_PIPELINES = [
    # "feapder.pipelines.mongo_pipeline.MongoPipeline",
    "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
]
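
# Illustrative sketch, not one of this project's pipelines: each entry above is a
# dotted path to a pipeline class. Stock feapder pipelines subclass
# feapder.pipelines.BasePipeline; a duck-typed stand-in is used here so this settings
# module stays free of feapder imports. Class name and print output are assumptions.
class _ExampleConsolePipeline:
    def save_items(self, table, items) -> bool:
        # `items` is the batch of item dicts to be exported to `table`
        print("would save %d items to %s" % (len(items), table))
        return True  # True marks the batch exported; False counts toward the limits below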

# Max export failures (saves and updates) before an alarm is raised
EXPORT_DATA_MAX_FAILED_TIMES = 5
# Max export retries (saves and updates) before giving up
EXPORT_DATA_MAX_RETRY_TIMES = 5

COLLECTOR_TASK_COUNT = 100  # Number of tasks fetched per batch

# Spider
SPIDER_THREAD_COUNT = 1  # Spider concurrency; 32 is recommended when speed matters
SPIDER_MAX_RETRY_TIMES = 3  # Max retries per request

# Browser rendering
WEBDRIVER = dict(
    pool_size=1,  # Number of browsers in the pool
    load_images=False,  # Whether to load images
    user_agent=None,  # String, or zero-argument function returning a user agent
    proxy=None,  # "xxx.xxx.xx.xxx:xxxx", or zero-argument function returning a proxy address
    headless=False,  # Whether to run headless
    driver_type="CHROME",  # CHROME or FIREFOX
    timeout=30,  # Request timeout in seconds
    window_size=(1280, 800),  # Window size
    executable_path=None,  # Browser driver path; None uses the default
    render_time=0,  # Seconds to wait after the page opens before grabbing the source
    custom_argument=["--ignore-certificate-errors"],  # Extra browser arguments
    usages_local_driver=False,  # Whether to use a local driver
    server_addr="http://172.17.162.28:6666/wd/hub",  # Selenium remote (grid) address
    version="",  # Remote browser version
    service_log_path=os.devnull,  # Driver log path
)
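
# Hedged usage note: a feapder spider opts into this browser pool per request,
# e.g. `yield feapder.Request(url, render=True)` inside start_requests/parse;
# requests without render=True go through plain HTTP.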

# Splash rendering
SPLASH_API = "http://splash.spdata.jianyu360.com/render.json"

# Network request timeout in seconds
REQUEST_TIMEOUT = 60

# Proxy settings: the extraction API returns proxies separated by \r\n
PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"
PROXY_ENABLE = True
JY_PROXY_URL = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
JY_PROXY_AUTHOR = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
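# Hedged sketch (the header name is an assumption, not confirmed by this file):
# JY_PROXY_AUTHOR looks like a ready-made HTTP Authorization header value, e.g.
#   requests.get(JY_PROXY_URL, headers={"Authorization": JY_PROXY_AUTHOR}, timeout=10)
# would fetch a SOCKS5 proxy from the endpoint above.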

# Item deduplication
ITEM_FILTER_ENABLE = True
ITEM_FILTER_SETTING = dict(
    filter_type=6,
    redisdb_conf=[
        dict(
            fingerprint_pref="pylist_",
            ip_port="172.17.4.240:8361",
            user_pass="k5ZJR5KV4q7DRZ92DQ",
            db=0,
        ),
        dict(
            fingerprint_pref="list_",
            ip_port="172.17.4.84:4679",
            user_pass="jytopnet123",
            db=0,
        ),
    ],
    expire_time=63072000,  # Fingerprints expire after 2 years (63072000 s = 730 days)
)

# Logging
DTIME = datetime.datetime.now().strftime("%Y-%m-%d")
LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0]
LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME)  # Log file path
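# Worked example (file name is hypothetical): running `python my_spider.py` on
# 2024-05-01 yields LOG_NAME == "my_spider" and
# LOG_PATH == "log/2024-05-01/my_spider.log", relative to the working directory.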
LOG_LEVEL = "ERROR"
LOG_COLOR = True  # Whether to colorize output
LOG_IS_WRITE_TO_CONSOLE = True  # Whether to print to the console
LOG_IS_WRITE_TO_FILE = True  # Whether to write to a file
LOG_MODE = "w"  # File write mode
LOG_MAX_BYTES = 10 * 1024 * 1024  # Max bytes per log file
LOG_BACKUP_COUNT = 20  # Number of rotated log files to keep
LOG_ENCODING = "utf8"  # Log file encoding
OTHERS_LOG_LEVAL = "ERROR"  # Log level for third-party libraries; rarely needed ("LEVAL" spelling follows feapder's setting name)

# Remote bucket (Aliyun OSS) configuration
ALI_BUCKET_CONFIG = {
    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
    "endpoint": "oss-cn-beijing-internal.aliyuncs.com",
    "bucket_name": "jy-datafile",
}