import threading

import chardet
import requests
import urllib3
from loguru import logger
from requests.adapters import HTTPAdapter
from requests.models import Response, REDIRECT_STATI
from urllib3.util.retry import Retry

from config.load import headers

urllib3.disable_warnings()

# Encodings that need special handling when decoding.
SPECIAL_ENCODINGS = [
    'Windows-1254'
]


class Downloader:
    """HTTP GET downloader built on ``requests`` with two layers of retry.

    The transport layer retries through a urllib3 ``Retry`` policy mounted on
    the session adapters; ``_requests_by_get`` adds its own attempt loop on
    top of that to cope with redirects and SSL failures.
    """

    def __init__(self, max_retries=3, retry_interval=0.1):
        """Store the adapter-level retry settings.

        :param max_retries: ``total`` passed to the urllib3 ``Retry`` policy
        :param retry_interval: ``backoff_factor`` passed to the ``Retry`` policy
        """
        self._max_retries = max_retries
        self._backoff_factor = retry_interval

    @staticmethod
    def prepare_params(**kw):
        """Build the keyword arguments handed to ``Session.get``.

        Defaults are ``allow_redirects=False`` and ``timeout=10``; every other
        keyword supplied by the caller is passed through and overrides the
        defaults. A missing or falsy ``timeout`` falls back to 10, and a
        missing or falsy ``headers`` falls back to the configured default
        headers.

        :param kw: request options supplied by the caller
        :return: dict of keyword arguments for ``Session.get``
        """
        timeout = kw.pop('timeout', None) or 10
        request_headers = kw.pop('headers', None) or headers

        request_params = {'allow_redirects': False, 'timeout': timeout}
        request_params.update(kw)
        # Copy so that callers can never mutate the shared default headers.
        request_params['headers'] = dict(request_headers)
        return request_params

    def _requests_by_get(self, url, **kw):
        """Fetch ``url`` with GET, retrying on redirects and request errors.

        ``max_retries`` (default 3) and ``disable_debug_log`` (default True)
        are consumed here; the remaining keywords go through
        ``prepare_params``. If every attempt fails, a placeholder ``Response``
        with status code 10001 is returned, with ``reason`` set to the name of
        the last exception class.

        :param url: address to request
        :param kw: request options plus ``max_retries`` / ``disable_debug_log``
        :return: the last ``Response`` obtained, or the placeholder response
        """
        max_retries = kw.pop('max_retries', 3)
        disable_debug_log = kw.pop('disable_debug_log', True)
        request_params = self.prepare_params(**kw)

        # Placeholder returned when no attempt produces a real response.
        response = Response()
        response.encoding = 'utf-8'
        response.status_code = 10001

        # ``_session`` builds a new session on every access, so create it once
        # per call and close it afterwards instead of leaking one per attempt.
        with self._session as session:
            retries = 0
            while retries < max_retries:
                try:
                    response = session.get(url, **request_params)
                    # Handle sites that redirect: ask again, this time
                    # following redirects. The ``finally`` clause still runs,
                    # so this consumes one attempt.
                    if response.status_code in REDIRECT_STATI:
                        request_params.update({'allow_redirects': True})
                        continue

                    response.encoding = chardet.detect(response.content)['encoding']
                    break
                except requests.exceptions.SSLError as e:
                    response.reason = e.__class__.__name__
                    if 'verify' not in request_params:
                        # First fallback: skip certificate verification.
                        request_params['verify'] = False
                    else:
                        # Second fallback: drop ``verify`` and downgrade the
                        # scheme only (not every "https" found in the URL).
                        del request_params['verify']
                        if url.startswith('https://'):
                            url = 'http://' + url[len('https://'):]
                except requests.RequestException as e:
                    response.reason = e.__class__.__name__
                finally:
                    retries += 1

        if not disable_debug_log:
            t_name = threading.current_thread().name
            logger.debug(f'<{t_name}> - {url} - 响应 - {response}')

        return response

    def get(self, url, **kw):
        """
        Perform a network request.

        :param url: address to request
        :param kw: parameters for the requests GET call
        :return: response object
        """
        return self._requests_by_get(url, **kw)

    @property
    def _session(self):
        """Return a new ``requests.Session`` with the retry adapter mounted.

        A fresh session is built on every access; the caller is responsible
        for closing it.
        """
        retry = Retry(
            total=self._max_retries,
            backoff_factor=self._backoff_factor
        )
        adapter = HTTPAdapter(max_retries=retry)
        session = requests.Session()
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        return session


class RenderDownloader(Downloader):
    """Downloader that fetches pages through a remote rendering endpoint."""

    # Rendering endpoint (called ``splash_url`` in the original code). An
    # alternative used previously was '.../render.html' with the arguments
    # {'url': url, 'timeout': 60, 'wait': 0.5}.
    SPLASH_RENDER_URL = 'http://8.131.72.226:8998/render.json'

    def get(self, url, **kw):
        """Request the rendered page and its iframes for ``url``.

        :param url: address of the page to render
        :param kw: accepted for interface compatibility; currently unused
        :return: response of the rendering endpoint
        """
        args = {
            'url': url,
            'html': 1,
            'iframes': 1,
        }
        # NOTE(review): no timeout is passed here, so this call can block
        # indefinitely if the rendering service does not answer.
        resp = requests.get(self.SPLASH_RENDER_URL, params=args, headers=headers)
        return resp


if __name__ == '__main__':
    render = RenderDownloader()
    href = 'http://113.230.236.116:5002/mvvm/src/ebid/gcjs/combine/jypt.html?type=%e6%8b%9b%e6%a0%87%e5%85%ac%e5%91%8a&tpid=62c2943104c74c0e34cacef9&tpTitle=%e5%bb%ba%e5%b9%b3%e5%8e%bf%e7%ac%ac%e5%9b%9b%e5%b0%8f%e5%ad%a6%e8%bf%90%e5%8a%a8%e5%9c%ba%e5%8d%87%e7%ba%a7%e6%94%b9%e9%80%a0%e9%a1%b9%e7%9b%ae'
    resp = render.get(href)
    resp_json = resp.json()
    for k, val in resp_json.items():
        print(f">> {k}", val)

    child_frames = resp_json['childFrames']
    index = 0
    for child in child_frames:
        print(child)
        title = child['title']
        # Frames without a title are saved under a running number instead.
        if not title:
            title = index
            index += 1
        # Explicit encoding so non-ASCII HTML does not depend on the locale.
        with open(f'{title}.html', 'w', encoding='utf-8') as fp:
            fp.write(child['html'])

    html = resp_json['html']
    with open('p1.html', 'w', encoding='utf-8') as fp:
        fp.write(html)