download.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. import threading
  2. import chardet
  3. import requests
  4. import urllib3
  5. from loguru import logger
  6. from requests.adapters import HTTPAdapter
  7. from requests.models import Response, REDIRECT_STATI
  8. from urllib3.util.retry import Retry
  9. from config.load import headers
  # Turn off urllib3's warnings for the whole process.
  urllib3.disable_warnings()

  # Encodings that need special decoding.
  SPECIAL_ENCODINGS = ['Windows-1254']
  class Downloader:
      """HTTP GET helper built on ``requests`` with two layers of retrying.

      Transport-level retries are delegated to urllib3's ``Retry`` (configured
      from the constructor arguments); on top of that, ``get`` repeats the
      whole request a limited number of times to follow redirects and to work
      around SSL failures.
      """

      def __init__(self, max_retries=3, retry_interval=0.1):
          """
          :param max_retries: ``total`` passed to urllib3's ``Retry``
          :param retry_interval: ``backoff_factor`` passed to urllib3's ``Retry``
          """
          self._max_retries = max_retries
          self._backoff_factor = retry_interval

      @staticmethod
      def prepare_params(**kw):
          """
          Build the keyword arguments for ``Session.get``.

          Defaults are ``allow_redirects=False`` and ``timeout=10``; every
          other keyword supplied by the caller is passed through and overrides
          the defaults. A missing or falsy ``timeout`` falls back to 10, and a
          missing or falsy ``headers`` falls back to the project-wide default
          headers.

          :param kw: caller supplied request keyword arguments
          :return: a new dict of request keyword arguments
          """
          caller_kwargs = dict(kw)
          timeout = caller_kwargs.pop('timeout', None) or 10
          # Copy so that neither the caller's dict nor the shared module-level
          # default headers can be mutated through the returned params.
          request_headers = dict(caller_kwargs.pop('headers', None) or headers)

          request_params = {'allow_redirects': False, 'timeout': timeout}
          request_params.update(caller_kwargs)
          request_params['headers'] = request_headers
          return request_params

      @staticmethod
      def _downgrade_scheme(url):
          """Return *url* with a leading ``https://`` replaced by ``http://``.

          Only the scheme is touched; any other ``https`` text inside the URL
          (host, path, query string) is left alone.
          """
          prefix = 'https://'
          if url[:len(prefix)].lower() == prefix:
              return 'http://' + url[len(prefix):]
          return url

      def _requests_by_get(self, url, **kw):
          """
          Issue a GET request, retrying on redirects and request errors.

          :param url: address to fetch
          :param kw: ``max_retries`` (attempt limit, default 3),
              ``disable_debug_log`` (default True) and any keyword accepted by
              ``prepare_params``
          :return: the last response received; if no attempt ever produced a
              response, a placeholder ``Response`` with status code 10001 whose
              ``reason`` is the name of the last exception class
          """
          max_retries = kw.pop('max_retries', 3)
          disable_debug_log = kw.pop('disable_debug_log', True)
          request_params = self.prepare_params(**kw)

          # Placeholder returned when every attempt fails before a real
          # response is available.
          response = Response()
          response.encoding = 'utf-8'
          response.status_code = 10001

          # One session for all attempts of this call, closed afterwards, so
          # that sessions are not created per attempt and then abandoned.
          session = self._session
          try:
              for _ in range(max_retries):
                  try:
                      response = session.get(url, **request_params)
                      # Redirecting site: ask requests to follow redirects on
                      # the next attempt.
                      if response.status_code in REDIRECT_STATI:
                          request_params['allow_redirects'] = True
                          continue
                      response.encoding = chardet.detect(response.content)['encoding']
                      break
                  except requests.exceptions.SSLError as e:
                      response.reason = e.__class__.__name__
                      if 'verify' not in request_params:
                          # First fallback: retry without certificate verification.
                          request_params['verify'] = False
                      else:
                          # Second fallback: drop the verify override and retry
                          # over plain HTTP.
                          del request_params['verify']
                          url = self._downgrade_scheme(url)
                  except requests.RequestException as e:
                      response.reason = e.__class__.__name__
          finally:
              session.close()

          if not disable_debug_log:
              t_name = threading.current_thread().name
              logger.debug(f'<{t_name}> - {url} - 响应 - {response}')
          return response

      def get(self, url, **kw):
          """
          Perform a network request.

          :param url: address to fetch
          :param kw: parameters for the requests GET call
          :return: response object
          """
          return self._requests_by_get(url, **kw)

      @property
      def _session(self):
          """Build a new ``requests.Session`` whose HTTP and HTTPS adapters use
          the configured urllib3 ``Retry`` policy.

          Every access creates a fresh session; the caller is responsible for
          closing it.
          """
          retry = Retry(
              total=self._max_retries,
              backoff_factor=self._backoff_factor
          )
          adapter = HTTPAdapter(max_retries=retry)
          session = requests.Session()
          session.mount('http://', adapter)
          session.mount('https://', adapter)
          return session
  class RenderDownloader(Downloader):
      """Downloader that fetches a page through a remote rendering service.

      The request goes to the service's ``render.json`` endpoint, asking for
      the page HTML (``html=1``) and its iframes (``iframes=1``).

      NOTE: an earlier variant used the ``render.html`` endpoint of the same
      host with the arguments ``url``, ``timeout=60`` and ``wait=0.5``.
      """

      # Rendering endpoint; override on a subclass or instance to target
      # another deployment.
      splash_url = 'http://8.131.72.226:8998/render.json'
      # Client-side timeout in seconds. Without one, requests would wait
      # indefinitely for an unresponsive service.
      request_timeout = 60

      def get(self, url, **kw):
          """
          Fetch *url* through the rendering service.

          :param url: address of the page to render
          :param kw: accepted for signature compatibility with
              ``Downloader.get``; not used
          :return: the rendering service's response object
          """
          args = {
              'url': url,
              'html': 1,
              'iframes': 1,
          }
          return requests.get(
              self.splash_url,
              params=args,
              headers=headers,
              timeout=self.request_timeout,
          )
  if __name__ == '__main__':
      # Manual smoke test: render one page, print the JSON payload, then dump
      # the main document and each child frame to HTML files in the current
      # directory.
      render = RenderDownloader()
      href = 'http://113.230.236.116:5002/mvvm/src/ebid/gcjs/combine/jypt.html?type=%e6%8b%9b%e6%a0%87%e5%85%ac%e5%91%8a&tpid=62c2943104c74c0e34cacef9&tpTitle=%e5%bb%ba%e5%b9%b3%e5%8e%bf%e7%ac%ac%e5%9b%9b%e5%b0%8f%e5%ad%a6%e8%bf%90%e5%8a%a8%e5%9c%ba%e5%8d%87%e7%ba%a7%e6%94%b9%e9%80%a0%e9%a1%b9%e7%9b%ae'
      resp = render.get(href)
      resp_json = resp.json()
      for k, val in resp_json.items():
          print(f">> {k}", val)

      childFrames = resp_json['childFrames']
      index = 0
      for child in childFrames:
          print(child)
          title = child['title']
          if not title:
              # Untitled frames are named by a running counter.
              # NOTE(review): the source indentation was lost; the counter is
              # assumed to advance only for untitled frames - confirm.
              title = index
              index += 1
          # Explicit UTF-8 so that non-ASCII page content is written the same
          # way regardless of the platform's default encoding.
          with open(f'{title}.html', 'w', encoding='utf-8') as fp:
              fp.write(child['html'])

      html = resp_json['html']
      with open('p1.html', 'w', encoding='utf-8') as fp:
          fp.write(html)