download.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. import threading
  2. try:
  3. import chardet
  4. except ImportError:
  5. import charset_normalizer as chardet
  6. import requests
  7. import urllib3
  8. from loguru import logger
  9. from requests.adapters import HTTPAdapter
  10. from requests.models import Response, REDIRECT_STATI
  11. from requests.utils import get_encodings_from_content
  12. from urllib3.util.retry import Retry
  13. from config.load import headers
  14. urllib3.disable_warnings()
  # Encodings that need special handling: a response reported with one of
  # these is re-decoded after looking for the real charset in its body.
  SPECIAL_ENCODINGS = ['Windows-1254', 'ISO-8859-1']
  class Downloader:
      """HTTP GET helper built on ``requests``.

      Retries happen at two levels: the mounted ``HTTPAdapter`` retries
      according to ``max_retries`` / ``retry_interval`` given to the
      constructor, and ``_requests_by_get`` additionally loops up to its own
      ``max_retries`` keyword (default 3) to follow redirects and to recover
      from SSL errors.
      """

      def __init__(self, max_retries=3, retry_interval=0.1):
          """Store the adapter-level retry settings.

          :param max_retries: value passed as ``total`` to urllib3 ``Retry``
          :param retry_interval: value passed as ``backoff_factor`` to ``Retry``
          """
          self._max_retries = max_retries
          self._backoff_factor = retry_interval

      @staticmethod
      def prepare_params(**kw):
          """Build the keyword arguments handed to ``Session.get``.

          Defaults are ``allow_redirects=False`` and ``timeout=10``; a falsy
          ``timeout`` also falls back to 10. Any other keyword is passed
          through unchanged. Caller-supplied ``headers`` replace the project
          default headers; when ``headers`` is missing or falsy the project
          defaults are used instead.

          :param kw: requests keyword arguments supplied by the caller
          :return: dict of keyword arguments for ``Session.get``
          """
          # Pop headers first so the fallback to the defaults really applies
          # when the caller passes ``headers=None`` or an empty mapping.
          custom_headers = kw.pop('headers', None)
          request_params = {
              'allow_redirects': False,
              'timeout': kw.pop('timeout', None) or 10,
          }
          request_params.update(kw)
          # Copy so neither the module-level default dict nor the caller's
          # mapping is shared with (and mutable through) the returned params.
          request_params['headers'] = dict(custom_headers or headers)
          return request_params

      @staticmethod
      def apparent_encoding(response):
          """Return the encoding to use for decoding ``response``.

          When the reported encoding is one of ``SPECIAL_ENCODINGS`` the real
          one is taken from the charset declared inside the content, falling
          back to detection on the raw bytes. Otherwise the reported encoding
          is returned as is.

          :param response: a requests ``Response``
          :return: encoding name, or whatever the detector returns
          """
          encoding = response.encoding
          if encoding not in SPECIAL_ENCODINGS:
              return encoding
          # Decode according to the charset the document itself declares.
          declared = get_encodings_from_content(response.text)
          if declared:
              return declared[0]
          return chardet.detect(response.content)['encoding']

      @staticmethod
      def _downgrade_to_http(url):
          """Return ``url`` with a leading ``https://`` replaced by ``http://``.

          Only the scheme is touched; occurrences of ``https`` later in the
          URL (path, query string) are left alone.
          """
          secure_prefix = 'https://'
          if url.startswith(secure_prefix):
              return 'http://' + url[len(secure_prefix):]
          return url

      def _requests_by_get(self, url, **kw):
          """Perform the GET request with redirect and SSL fallbacks.

          Recognised extra keywords: ``max_retries`` (loop attempts, default 3)
          and ``disable_debug_log`` (default True). Everything else goes
          through ``prepare_params``.

          :param url: address to fetch
          :param kw: requests keyword arguments plus the extras above
          :return: the last response obtained, or a placeholder ``Response``
              with ``status_code`` 10001 when no attempt produced one
          """
          max_retries = kw.pop('max_retries', 3)
          disable_debug_log = kw.pop('disable_debug_log', True)
          request_params = self.prepare_params(**kw)

          # Placeholder returned when every attempt fails before a real
          # response arrives; ``reason`` is filled with the exception name.
          response = Response()
          response.encoding = 'utf-8'
          response.status_code = 10001

          # One session for the whole call instead of a fresh, never-closed
          # session per attempt.
          session = self._session
          retries = 0
          try:
              while retries < max_retries:
                  try:
                      response = session.get(url, **request_params)
                      # Site answered with a redirect: retry and follow it.
                      if response.status_code in REDIRECT_STATI:
                          request_params.update({'allow_redirects': True})
                          continue
                      response.encoding = self.apparent_encoding(response)
                      break
                  except requests.exceptions.SSLError as e:
                      response.reason = e.__class__.__name__
                      if 'verify' not in request_params:
                          # First SSL failure: retry without verification.
                          request_params['verify'] = False
                      else:
                          # Still failing: drop the flag and fall back to
                          # plain HTTP.
                          del request_params['verify']
                          url = self._downgrade_to_http(url)
                  except requests.RequestException as e:
                      response.reason = e.__class__.__name__
                  finally:
                      retries += 1
          finally:
              # A streamed body is read by the caller after we return, so the
              # connection must stay open in that case.
              if not request_params.get('stream'):
                  session.close()

          if not disable_debug_log:
              t_name = threading.current_thread().name
              logger.debug(f'<{t_name}> - {url} - 响应 - {response}')
          return response

      def get(self, url, **kw):
          """Send a network request.

          :param url: address to fetch
          :param kw: keyword arguments for the requests GET call
          :return: response object
          """
          return self._requests_by_get(url, **kw)

      @property
      def _session(self):
          """Create a new ``requests.Session`` with the retry adapter mounted.

          Every access builds a fresh session; the code that obtains it is
          responsible for closing it.
          """
          retry = Retry(
              total=self._max_retries,
              backoff_factor=self._backoff_factor
          )
          adapter = HTTPAdapter(max_retries=retry)
          session = requests.Session()
          session.mount('http://', adapter)
          session.mount('https://', adapter)
          return session
  class RenderDownloader(Downloader):
      """Downloader variant that fetches pages through a remote render service.

      Instead of requesting the page directly, ``get`` posts the target URL
      and rendering options as JSON to ``splash_url`` and returns that
      service's response.
      """

      # Render endpoint; override on a subclass or instance to use another host.
      splash_url = 'http://8.131.72.226:8998/render.json'

      # Client-side bound, in seconds, on the HTTP call to the render service
      # so a stalled service cannot block the caller forever.
      # NOTE(review): assumes the service answers within this bound; raise it
      # if renders are configured to run longer.
      request_timeout = 120

      def get(self, url, **kw):
          """Ask the render service to load ``url`` and return its response.

          :param url: address of the page to render
          :param kw: optional ``timeout`` (default 2), ``wait`` (default 0.5)
              and ``viewport`` (default ``'full'``), forwarded in the JSON
              payload; other keywords are ignored
          :return: the ``requests`` response of the render service
          """
          args = {
              'url': url,
              'html': 1,
              'iframes': 1,
              'headers': headers,
              'timeout': kw.pop('timeout', 2),
              'wait': kw.pop('wait', 0.5),
              'viewport': kw.pop('viewport', 'full'),
          }
          return requests.post(
              self.splash_url,
              json=args,
              headers={'content-type': 'application/json'},
              timeout=self.request_timeout,
          )