download.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. import threading
  2. import requests
  3. import urllib3
  4. from loguru import logger
  5. from requests.adapters import HTTPAdapter
  6. from requests.models import Response, REDIRECT_STATI
  7. from requests.utils import get_encodings_from_content
  8. from urllib3.util.retry import Retry
  9. from config.load import headers
  10. try:
  11. import chardet
  12. except ImportError:
  13. import charset_normalizer as chardet
  14. urllib3.disable_warnings()
  15. '''特殊编码需要解码'''
  16. SPECIAL_ENCODINGS = [
  17. 'Windows-1254',
  18. 'ISO-8859-1'
  19. ]
  20. '''每个Session连接池大小'''
  21. DEFAULT_POOLSIZE = 10
  22. class Downloader:
  23. def __init__(self, max_retries=3, retry_interval=0, **kwargs):
  24. self._max_retries = max_retries # 请求错误时的最大重试次数
  25. self._backoff_factor = retry_interval # 重试间隔补偿系数
  26. self.disable_debug_log = kwargs.pop('disable_debug_log', False)
  27. self.session = requests.Session()
  28. # 适配器 - 重试对象
  29. retry = Retry(
  30. total=self._max_retries,
  31. backoff_factor=self._backoff_factor
  32. )
  33. # 适配器
  34. adapter = HTTPAdapter(
  35. pool_connections=DEFAULT_POOLSIZE,
  36. pool_maxsize=DEFAULT_POOLSIZE,
  37. max_retries=retry
  38. )
  39. self.session.mount('http://', adapter)
  40. self.session.mount('https://', adapter)
  41. @staticmethod
  42. def prepare_params(**kw):
  43. request_params = {}
  44. request_params.setdefault('allow_redirects', False)
  45. request_params.setdefault('timeout', (kw.pop('timeout', None) or 10))
  46. for key, val in kw.items():
  47. if key != 'headers' and key in request_params:
  48. request_params.update({key: val})
  49. else:
  50. request_params.setdefault(key, val)
  51. request_headers = (kw.pop('headers', None) or headers)
  52. for key, val in request_headers.items():
  53. if key in request_headers:
  54. request_headers.update({key: val})
  55. else:
  56. request_headers.setdefault(key, val)
  57. request_params.setdefault('headers', request_headers)
  58. return request_params
  59. @staticmethod
  60. def apparent_encoding(response):
  61. encoding = response.encoding
  62. if encoding in SPECIAL_ENCODINGS:
  63. # 根据真正的编码格式对内容进行解码
  64. true_encoding = get_encodings_from_content(response.text)
  65. if true_encoding:
  66. encoding = true_encoding[0]
  67. else:
  68. encoding = chardet.detect(response.content)['encoding']
  69. return encoding
  70. def _requests_by_get(self, url, **kw):
  71. request_params = self.prepare_params(**kw)
  72. response = None # 请求响应
  73. reason = "" # 错误原因
  74. ssl_retries = 2 # ssl证书验证,错误重试次数
  75. while True:
  76. try:
  77. response = self.session.get(url, **request_params)
  78. # 解决重定向的网站
  79. if response.status_code in REDIRECT_STATI:
  80. request_params.update({'allow_redirects': True})
  81. continue
  82. response.encoding = self.apparent_encoding(response)
  83. break
  84. except requests.exceptions.SSLError as e:
  85. reason = e.__class__.__name__
  86. if 'verify' not in request_params:
  87. request_params.setdefault('verify', False)
  88. else:
  89. if 'verify' in request_params:
  90. del request_params['verify']
  91. url = url.replace('https', 'http')
  92. if ssl_retries <= 0:
  93. break
  94. ssl_retries -= 1
  95. except requests.RequestException as e:
  96. reason = e.__class__.__name__
  97. break
  98. if response is None:
  99. response = Response()
  100. response.status_code = 10001
  101. response.encoding = 'utf-8' # 设置默认编码
  102. response._content = b"" # 设置默认响应文本流
  103. response.reason = reason
  104. if self.disable_debug_log:
  105. t_name = threading.currentThread().getName()
  106. logger.debug(f'<{t_name}-Response> {response.status_code} - {url}')
  107. return response
  108. def get(self, url, **kw):
  109. """
  110. 网络请求
  111. :param url: 访问地址
  112. :param kw: requests.GET请求参数
  113. :return: 响应对象
  114. """
  115. return self._requests_by_get(url, **kw)
  116. class RenderDownloader(Downloader):
  117. def get(self, url, **kw):
  118. splash_url = 'http://splash.spdata.jianyu360.com/render.json'
  119. args = {
  120. 'url': url,
  121. 'html': 1,
  122. 'iframes': 1,
  123. 'headers': headers,
  124. 'timeout': kw.pop('timeout', 2),
  125. 'wait': kw.pop('wait', 0.5),
  126. 'viewport': kw.pop('viewport', 'full'),
  127. }
  128. resp = requests.post(splash_url,
  129. json=args,
  130. headers={'content-type': 'application/json'})
  131. return resp