123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
- import threading
- import chardet
- import requests
- import urllib3
- from loguru import logger
- from requests.adapters import HTTPAdapter
- from requests.models import Response, REDIRECT_STATI
- from urllib3.util.retry import Retry
- from config.load import headers
# Turn off urllib3's warning output for the whole process.
urllib3.disable_warnings()

# Encodings that need special decoding (English rendering of the note below).
'''特殊编码需要解码'''
SPECIAL_ENCODINGS = ['Windows-1254']
class Downloader:
    """Fetch pages over HTTP GET with retries, redirect and SSL fallbacks.

    Two independent retry layers are involved: the urllib3 ``Retry`` policy
    mounted on every session (configured by the constructor) and the attempt
    loop in :meth:`_requests_by_get` (configured per call through the
    ``max_retries`` keyword).
    """

    def __init__(self, max_retries=3, retry_interval=0.1):
        """
        :param max_retries: ``total`` handed to the urllib3 ``Retry`` policy
        :param retry_interval: ``backoff_factor`` handed to the same policy
        """
        self._max_retries = max_retries
        self._backoff_factor = retry_interval

    @staticmethod
    def prepare_params(**kw):
        """
        Build the keyword arguments passed to ``Session.get``.

        Defaults are ``allow_redirects=False`` and ``timeout=10``; any other
        keyword is forwarded as given and overrides a default of the same
        name. A missing or falsy ``timeout`` falls back to 10, and missing or
        falsy ``headers`` fall back to the project-wide default headers.

        :param kw: caller-supplied request options
        :return: dict of request options, always containing ``headers``,
                 ``timeout`` and ``allow_redirects``
        """
        timeout = kw.pop('timeout', None) or 10
        # Copy so neither the caller's dict nor the shared default headers
        # object is handed out by reference.
        request_headers = dict(kw.pop('headers', None) or headers)

        request_params = {'allow_redirects': False, 'timeout': timeout}
        request_params.update(kw)
        request_params['headers'] = request_headers
        return request_params

    @staticmethod
    def _to_plain_http(url):
        """Return ``url`` with a leading ``https://`` swapped for ``http://``."""
        prefix = 'https://'
        if url[:len(prefix)].lower() == prefix:
            return 'http://' + url[len(prefix):]
        return url

    def _requests_by_get(self, url, **kw):
        """
        Issue a GET request, retrying on redirects and request errors.

        :param url: address to fetch
        :param kw: request options; ``max_retries`` (attempt count, default 3)
                   and ``disable_debug_log`` (default True) are consumed here,
                   the rest goes through :meth:`prepare_params`
        :return: the last response received; if no attempt produced one, a
                 placeholder ``Response`` with ``status_code`` 10001 whose
                 ``reason`` is the name of the last exception class
        """
        max_retries = kw.pop('max_retries', 3)
        disable_debug_log = kw.pop('disable_debug_log', True)
        request_params = self.prepare_params(**kw)

        # Placeholder handed back when every attempt fails.
        response = Response()
        response.encoding = 'utf-8'
        response.status_code = 10001

        # One session per call, closed on exit so its connection pools are
        # released instead of leaking a fresh session per attempt.
        with self._session as session:
            for _ in range(max_retries):
                try:
                    response = session.get(url, **request_params)
                    # Redirecting sites: retry once with redirects enabled.
                    # If redirects are already on, retrying cannot change the
                    # outcome, so the response is accepted as is.
                    if (response.status_code in REDIRECT_STATI
                            and not request_params.get('allow_redirects')):
                        request_params['allow_redirects'] = True
                        continue
                    response.encoding = chardet.detect(response.content)['encoding']
                    break
                except requests.exceptions.SSLError as e:
                    response.reason = type(e).__name__
                    if 'verify' not in request_params:
                        # First fallback: skip certificate verification.
                        request_params['verify'] = False
                    else:
                        # Second fallback: drop TLS altogether. Only the
                        # scheme is rewritten; "https" elsewhere in the URL
                        # (path, query) is left untouched.
                        del request_params['verify']
                        url = self._to_plain_http(url)
                except requests.RequestException as e:
                    response.reason = type(e).__name__

        if not disable_debug_log:
            t_name = threading.current_thread().name
            logger.debug(f'<{t_name}> - {url} - 响应 - {response}')
        return response

    def get(self, url, **kw):
        """
        Perform a network request.

        :param url: address to fetch
        :param kw: requests GET parameters
        :return: response object
        """
        return self._requests_by_get(url, **kw)

    @property
    def _session(self):
        """
        Build a new ``requests.Session`` with the retry policy mounted.

        Every access creates a fresh session; the caller is responsible for
        closing it.
        """
        retry = Retry(
            total=self._max_retries,
            backoff_factor=self._backoff_factor
        )
        adapter = HTTPAdapter(max_retries=retry)
        session = requests.Session()
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        return session
class RenderDownloader(Downloader):
    """Downloader that fetches a page through a remote rendering service.

    The service is addressed via ``splash_url`` and is asked for the page
    HTML together with the HTML of its iframes.
    """

    # ``render.json`` endpoint of the rendering service.
    splash_url = 'http://8.131.72.226:8998/render.json'
    # Client-side timeout in seconds, used when the caller supplies none, so
    # an unresponsive service cannot block the caller forever.
    render_timeout = 60

    def get(self, url, **kw):
        """
        Request the rendered page for ``url`` from the rendering service.

        :param url: address of the page to render
        :param kw: optional ``timeout`` and ``headers`` overrides; other
                   keywords are ignored
        :return: response object of the rendering service (JSON body)
        """
        args = {
            'url': url,
            'html': 1,
            'iframes': 1,
        }
        return requests.get(
            self.splash_url,
            params=args,
            headers=kw.get('headers') or headers,
            timeout=kw.get('timeout') or self.render_timeout,
        )
if __name__ == '__main__':
    # Manual smoke test: render one page, print the JSON payload and dump the
    # main document plus every child frame to HTML files in the working
    # directory.
    render = RenderDownloader()
    href = 'http://113.230.236.116:5002/mvvm/src/ebid/gcjs/combine/jypt.html?type=%e6%8b%9b%e6%a0%87%e5%85%ac%e5%91%8a&tpid=62c2943104c74c0e34cacef9&tpTitle=%e5%bb%ba%e5%b9%b3%e5%8e%bf%e7%ac%ac%e5%9b%9b%e5%b0%8f%e5%ad%a6%e8%bf%90%e5%8a%a8%e5%9c%ba%e5%8d%87%e7%ba%a7%e6%94%b9%e9%80%a0%e9%a1%b9%e7%9b%ae'
    resp = render.get(href)
    resp_json = resp.json()
    for k, val in resp_json.items():
        print(f">> {k}", val)

    index = 0
    for child in resp_json['childFrames']:
        print(child)
        title = child['title']
        if not title:
            # Untitled frames are named by a running counter.
            # NOTE(review): the source lost its indentation; the counter is
            # assumed to advance only for untitled frames - confirm.
            title = index
            index += 1
        # Explicit UTF-8 so non-ASCII page content does not depend on the
        # platform's default encoding.
        with open(f'{title}.html', 'w', encoding='utf-8') as fp:
            fp.write(child['html'])

    with open('p1.html', 'w', encoding='utf-8') as fp:
        fp.write(resp_json['html'])
|