|
@@ -6,6 +6,7 @@ import urllib3
|
|
|
from loguru import logger
|
|
|
from requests.adapters import HTTPAdapter
|
|
|
from requests.models import Response, REDIRECT_STATI
|
|
|
+from requests.utils import get_encodings_from_content
|
|
|
from urllib3.util.retry import Retry
|
|
|
|
|
|
from config.load import headers
|
|
@@ -14,7 +15,8 @@ urllib3.disable_warnings()
|
|
|
|
|
|
'''特殊编码需要解码'''
|
|
|
SPECIAL_ENCODINGS = [
|
|
|
- 'Windows-1254'
|
|
|
+ 'Windows-1254',
|
|
|
+ 'ISO-8859-1'
|
|
|
]
|
|
|
|
|
|
|
|
@@ -44,6 +46,18 @@ class Downloader:
|
|
|
request_params.setdefault('headers', request_headers)
|
|
|
return request_params
|
|
|
|
|
|
+    @staticmethod
+    def apparent_encoding(response):
+        """Return the encoding for ``response``, re-detecting it when the declared one is in SPECIAL_ENCODINGS."""
+        encoding = response.encoding
+        if encoding and encoding.lower() in {name.lower() for name in SPECIAL_ENCODINGS}:
+            # Charset names are case-insensitive; prefer the charset declared in the body, else chardet's guess.
+            declared = get_encodings_from_content(response.text)
+            if declared:
+                return declared[0]
+            return chardet.detect(response.content)['encoding']
+        return encoding
|
|
|
+
|
|
|
def _requests_by_get(self, url, **kw):
|
|
|
max_retries = (kw.pop('max_retries', 3))
|
|
|
disable_debug_log = kw.pop('disable_debug_log', True)
|
|
@@ -59,10 +73,7 @@ class Downloader:
|
|
|
if response.status_code in REDIRECT_STATI:
|
|
|
request_params.update({'allow_redirects': True})
|
|
|
continue
|
|
|
- response.encoding = chardet.detect(response.content)['encoding']
|
|
|
- # response.encoding = response.apparent_encoding
|
|
|
- # if response.encoding in SPECIAL_ENCODINGS:
|
|
|
- # response.encoding = 'utf-8'
|
|
|
+ response.encoding = self.apparent_encoding(response)
|
|
|
break
|
|
|
except requests.exceptions.SSLError as e:
|
|
|
response.reason = e.__class__.__name__
|
|
@@ -131,29 +142,3 @@ class RenderDownloader(Downloader):
|
|
|
# }
|
|
|
resp = requests.get(splash_url, params=args, headers=headers)
|
|
|
return resp
|
|
|
-
|
|
|
-
|
|
|
-if __name__ == '__main__':
|
|
|
- render = RenderDownloader()
|
|
|
- href = 'http://113.230.236.116:5002/mvvm/src/ebid/gcjs/combine/jypt.html?type=%e6%8b%9b%e6%a0%87%e5%85%ac%e5%91%8a&tpid=62c2943104c74c0e34cacef9&tpTitle=%e5%bb%ba%e5%b9%b3%e5%8e%bf%e7%ac%ac%e5%9b%9b%e5%b0%8f%e5%ad%a6%e8%bf%90%e5%8a%a8%e5%9c%ba%e5%8d%87%e7%ba%a7%e6%94%b9%e9%80%a0%e9%a1%b9%e7%9b%ae'
|
|
|
- resp = render.get(href)
|
|
|
- resp_json = resp.json()
|
|
|
-
|
|
|
- for k, val in resp_json.items():
|
|
|
- print(f">> {k}", val)
|
|
|
-
|
|
|
- childFrames = resp_json['childFrames']
|
|
|
- index = 0
|
|
|
- for child in childFrames:
|
|
|
- print(child)
|
|
|
- title = child['title']
|
|
|
- if len(title) == 0:
|
|
|
- title = index
|
|
|
- index += 1
|
|
|
-
|
|
|
- with open(f'{title}.html', 'w') as fp:
|
|
|
- fp.write(child['html'])
|
|
|
-
|
|
|
- html = resp_json['html']
|
|
|
- with open('p1.html', 'w') as fp:
|
|
|
- fp.write(html)
|