setting.py

# -*- coding: utf-8 -*-
"""Spider configuration file"""
import datetime
import os
import sys
# Failed item saves table
TAB_FAILED_ITEMS = "pyspider:s_failed_items"
# Failed requests table
TAB_FAILED_REQUESTS = "pyspider:z_failed_requests"
# Crawl task production table
TASK_REQUEST_PRODUCE = "pyspider_listdata"
# Failed task record table
TASK_REQUEST_FAILED = "pyspider_listdata_err"
# Spider heartbeat and collection summary statistics table
SPIDER_HEARTBEAT_RECORD = "pyspider_heartbeat"
# MongoDB
MONGO_IP = "172.17.4.87"
MONGO_PORT = 27080
MONGO_DB = "py_spider"
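# Example (illustrative only, not part of this project's code): a minimal sketch of
# how the MongoDB settings above could be used with pymongo. The import is kept
# inside the function so this settings module stays import-safe; authentication
# options, if any, are omitted.
def _example_mongo_db():
    from pymongo import MongoClient
    client = MongoClient(MONGO_IP, MONGO_PORT)
    return client[MONGO_DB]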
# Redis
REDISDB_IP_PORTS = "172.17.162.28:7361"
REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
REDISDB_DB = 10
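# Example (illustrative only): connecting to the Redis instance above with redis-py.
# Splitting REDISDB_IP_PORTS on ":" assumes a single "host:port" string, as
# configured here.
def _example_redis_client():
    import redis
    host, port = REDISDB_IP_PORTS.split(":")
    return redis.StrictRedis(host=host, port=int(port),
                             password=REDISDB_USER_PASS, db=REDISDB_DB)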
# RabbitMQ
RABBITMQ_IP_PORT = "172.17.162.28:5672"
RABBITMQ_USER = "root"
RABBITMQ_USER_PASS = "V0O0049qBI2rV1554jLZPiBZ8H3Bo4"
RABBITMQ_EXCHANGE = "pyspider.data.spider"
RABBITMQ_EXCHANGE_TYPE = "direct"
RABBITMQ_VIRTUAL_HOST = "/"
RABBITMQ_SOCKET_TIMEOUT = 60
RABBITMQ_HEARTBEAT = 600
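# Example (illustrative only): opening a RabbitMQ connection with pika from the
# settings above and declaring the exchange that scraped data is published to. This
# is a sketch, not the RabbitMqPipeline's actual implementation; durable=True is an
# assumption.
def _example_rabbitmq_channel():
    import pika
    host, port = RABBITMQ_IP_PORT.split(":")
    credentials = pika.PlainCredentials(RABBITMQ_USER, RABBITMQ_USER_PASS)
    params = pika.ConnectionParameters(
        host=host, port=int(port),
        virtual_host=RABBITMQ_VIRTUAL_HOST,
        credentials=credentials,
        socket_timeout=RABBITMQ_SOCKET_TIMEOUT,
        heartbeat=RABBITMQ_HEARTBEAT,
    )
    connection = pika.BlockingConnection(params)
    channel = connection.channel()
    channel.exchange_declare(exchange=RABBITMQ_EXCHANGE,
                             exchange_type=RABBITMQ_EXCHANGE_TYPE, durable=True)
    return channel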
# Pipelines used to persist scraped items
ITEM_PIPELINES = [
    # "feapder.pipelines.mongo_pipeline.MongoPipeline",
    "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
]
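# Example (commented out; assumes feapder's BasePipeline interface): a custom
# pipeline would implement save_items and be referenced above by its import path.
#
# from feapder.pipelines import BasePipeline
#
# class ConsolePipeline(BasePipeline):
#     def save_items(self, table, items) -> bool:
#         print(table, items)
#         return True  # returning True marks the batch as exported successfully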
# Maximum number of export failures (saves and updates); an alarm is raised once exceeded
EXPORT_DATA_MAX_FAILED_TIMES = 5
# Maximum number of export retries (saves and updates); retrying is abandoned once exceeded
EXPORT_DATA_MAX_RETRY_TIMES = 5
COLLECTOR_TASK_COUNT = 100  # Number of tasks fetched per batch
# Spider
SPIDER_THREAD_COUNT = 1  # Spider concurrency; 32 is recommended when speed matters
SPIDER_MAX_RETRY_TIMES = 3  # Maximum retries per request
# Browser rendering
DRISSIONPAGE = dict(
    pool_size=1,  # Number of browser tabs
    browser_path=None,  # Path to the browser executable
    scope=None,  # Port range for automatically started browsers
    port=None,  # Browser port
    user_data_path=None,  # User data directory
    headless=True,  # Run the browser headless
    load_images=False,  # Load images
    user_agent=None,  # User-Agent string
    proxy=None,  # xxx.xxx.xxx.xxx:xxxx
    window_size=(1024, 800),  # Window size
    driver_type="chromium",
    load_mode="normal",  # Page load strategy: "normal", "eager" or "none"
    timeout=10,  # Request timeout
    retry=1,  # Number of retries when connecting to the browser fails
    interval=0.5,  # Interval between connection retries (seconds)
    page_load=15,  # Page load timeout (seconds)
    render_time=20,  # Render time, i.e. how long to wait for the opened page to finish loading
    download_path=None,  # Directory for downloaded files
    custom_argument=[
        "--no-sandbox",
        "--ignore-certificate-errors"
    ]
)
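# Example (commented out; spider name and URL are placeholders): in a feapder spider,
# a request opts in to browser rendering with render=True, which routes it through
# RENDER_DOWNLOADER and the DRISSIONPAGE options above.
#
# import feapder
#
# class RenderDemo(feapder.AirSpider):
#     def start_requests(self):
#         yield feapder.Request("https://example.com", render=True)
#
#     def parse(self, request, response):
#         print(response.xpath("//title/text()").extract_first())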
# Network request timeout (seconds)
REQUEST_TIMEOUT = 60
# Proxy settings
PROXY_EXTRACT_API = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
# PROXY_EXTRACT_API = "http://172.17.162.28:16001/sam"
PROXY_ENABLE = True
PROXY_AUTH = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
PROXY_POOL = "feapder.network.proxy_pool.DirectProxyPool"
# PROXY_POOL = "feapder.network.proxy_pool.SpringBoardProxyPool"
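# Example (illustrative only): fetching a proxy from PROXY_EXTRACT_API. Sending
# PROXY_AUTH as an Authorization header is an assumption about how the extract API
# expects it, and the response format is not documented here, so the raw body is
# returned as-is.
def _example_fetch_proxy():
    import requests
    resp = requests.get(PROXY_EXTRACT_API,
                        headers={"Authorization": PROXY_AUTH},
                        timeout=REQUEST_TIMEOUT)
    return resp.text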
# Downloaders
DOWNLOADER = "feapder.network.downloader.RequestsDownloader"  # Request downloader
SESSION_DOWNLOADER = "feapder.network.downloader.RequestsSessionDownloader"
# SESSION_DOWNLOADER = "feapder.network.downloader.RequestsJa3SessionDownloader"
RENDER_DOWNLOADER = "feapder.network.downloader.DrissionPageDownloader"  # Render downloader
MAKE_ABSOLUTE_LINKS = True  # Automatically convert relative links to absolute links
# Item deduplication
ITEM_FILTER_ENABLE = True
ITEM_FILTER_SETTING = dict(
    filter_type=6,
    expire_time=1 * 365 * 24 * 3600,  # Expiration time (seconds)
    config={
        "py": {
            "fingerprint_pref": "pylist_",
            "ip_port": "172.17.162.34:8361",
            "user_pass": "k5ZJR5KV4q7DRZ92DQ",
            "db": 0
        },
        "lua": {
            "fingerprint_pref": "list_",
            "ip_port": "172.17.4.84:4679",
            "user_pass": "jytopnet123",
            "db": 0
        }
    }
)
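# Conceptual sketch (not feapder's internal implementation): deduplication of this
# kind typically hashes an item's key fields into a fingerprint, prefixes it with
# fingerprint_pref, and records it in Redis; an item counts as a duplicate when the
# fingerprint already existed. expire_time bounds how long fingerprints are kept.
def _example_is_duplicate(redis_client, item_key):
    import hashlib
    fingerprint = "pylist_" + hashlib.md5(item_key.encode("utf8")).hexdigest()
    # set(..., nx=True) returns True only when the fingerprint was newly written
    is_new = redis_client.set(fingerprint, 1, nx=True,
                              ex=ITEM_FILTER_SETTING["expire_time"])
    return not is_new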
# Logging
DTIME = datetime.datetime.now().strftime("%Y-%m-%d")
LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0]
LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME)  # Log file path
LOG_LEVEL = "ERROR"
LOG_COLOR = True  # Colored output
LOG_IS_WRITE_TO_CONSOLE = True  # Print to the console
LOG_IS_WRITE_TO_FILE = True  # Write to file
LOG_MODE = "w"  # File write mode
LOG_MAX_BYTES = 10 * 1024 * 1024  # Maximum size of each log file in bytes
LOG_BACKUP_COUNT = 20  # Number of log files to keep
LOG_ENCODING = "utf8"  # Log file encoding
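# Worked example (script name and date are placeholders): running
# "python list_spider.py" on 2024-01-01 with the settings above writes ERROR-level
# logs to "log/2024-01-01/list_spider.log", rotating at 10 MB with 20 backups kept.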
# Jianyu crawl task service address [py]
JY_TASK_URL = "http://pytask.spdata.jianyu360.com"
# Jianyu attachment management service address
JY_OSS_URL = "http://172.17.162.27:18011"
# OSS bucket configuration
ALI_BUCKET_CONFIG = {
    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
    "endpoint": "oss-cn-beijing-internal.aliyuncs.com",
    "bucket_name": "jy-datafile"
}
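# Example (illustrative only): opening the bucket above with the official oss2 SDK,
# e.g. for uploading crawled attachments. How this project actually talks to OSS
# (directly or via JY_OSS_URL) is not shown here.
def _example_oss_bucket():
    import oss2
    auth = oss2.Auth(ALI_BUCKET_CONFIG["key_id"], ALI_BUCKET_CONFIG["key_secret"])
    return oss2.Bucket(auth, ALI_BUCKET_CONFIG["endpoint"],
                       ALI_BUCKET_CONFIG["bucket_name"])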