dongzhaorui 3 years ago
parent
commit
2b16289cb8
1 changed file with 16 additions and 31 deletions

+ 16 - 31
find_source/crawler/download.py

@@ -6,6 +6,7 @@ import urllib3
 from loguru import logger
 from requests.adapters import HTTPAdapter
 from requests.models import Response, REDIRECT_STATI
+from requests.utils import get_encodings_from_content
 from urllib3.util.retry import Retry
 
 from config.load import headers
@@ -14,7 +15,8 @@ urllib3.disable_warnings()
 
 '''Special encodings that need to be re-decoded'''
 SPECIAL_ENCODINGS = [
-    'Windows-1254'
+    'Windows-1254',
+    'ISO-8859-1'
 ]
 
 
@@ -44,6 +46,18 @@ class Downloader:
         request_params.setdefault('headers', request_headers)
         return request_params
 
+    @staticmethod
+    def apparent_encoding(response):
+        encoding = response.encoding
+        if encoding in SPECIAL_ENCODINGS:
+            # decode the content using its true encoding
+            true_encoding = get_encodings_from_content(response.text)
+            if true_encoding:
+                encoding = true_encoding[0]
+            else:
+                encoding = chardet.detect(response.content)['encoding']
+        return encoding
+
     def _requests_by_get(self, url, **kw):
         max_retries = (kw.pop('max_retries', 3))
         disable_debug_log = kw.pop('disable_debug_log', True)
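
A hedged sketch of the new fallback chain, written as a standalone helper that mirrors Downloader.apparent_encoding above (the GBK sample page is made up for illustration): keep the declared encoding unless it is one of the suspect defaults, then try a charset declared inside the body via requests.utils.get_encodings_from_content, and only then fall back to a chardet byte-level guess:

    import chardet
    from requests.utils import get_encodings_from_content

    SPECIAL_ENCODINGS = ['Windows-1254', 'ISO-8859-1']

    def detect_encoding(declared, body_bytes, body_text):
        # mirrors Downloader.apparent_encoding: trust a plausible declared encoding
        if declared not in SPECIAL_ENCODINGS:
            return declared
        # prefer a charset declared inside the document (<meta charset=...> / XML decl)
        meta = get_encodings_from_content(body_text)
        if meta:
            return meta[0]
        # last resort: statistical detection over the raw bytes
        return chardet.detect(body_bytes)['encoding']

    # hypothetical GBK page served without a charset header
    raw = '<html><head><meta charset="gbk"></head><body>招标公告</body></html>'.encode('gbk')
    print(detect_encoding('ISO-8859-1', raw, raw.decode('ISO-8859-1')))   # -> gbk

In the class itself the same chain runs inside apparent_encoding, so _requests_by_get below only has to assign its result to response.encoding before response.text is read.
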
@@ -59,10 +73,7 @@ class Downloader:
                 if response.status_code in REDIRECT_STATI:
                     request_params.update({'allow_redirects': True})
                     continue
-                response.encoding = chardet.detect(response.content)['encoding']
-                # response.encoding = response.apparent_encoding
-                # if response.encoding in SPECIAL_ENCODINGS:
-                #     response.encoding = 'utf-8'
+                response.encoding = self.apparent_encoding(response)
                 break
             except requests.exceptions.SSLError as e:
                 response.reason = e.__class__.__name__
@@ -131,29 +142,3 @@ class RenderDownloader(Downloader):
         # }
         resp = requests.get(splash_url, params=args, headers=headers)
         return resp
-
-
-if __name__ == '__main__':
-    render = RenderDownloader()
-    href = 'http://113.230.236.116:5002/mvvm/src/ebid/gcjs/combine/jypt.html?type=%e6%8b%9b%e6%a0%87%e5%85%ac%e5%91%8a&tpid=62c2943104c74c0e34cacef9&tpTitle=%e5%bb%ba%e5%b9%b3%e5%8e%bf%e7%ac%ac%e5%9b%9b%e5%b0%8f%e5%ad%a6%e8%bf%90%e5%8a%a8%e5%9c%ba%e5%8d%87%e7%ba%a7%e6%94%b9%e9%80%a0%e9%a1%b9%e7%9b%ae'
-    resp = render.get(href)
-    resp_json = resp.json()
-
-    for k, val in resp_json.items():
-        print(f">> {k}", val)
-
-    childFrames = resp_json['childFrames']
-    index = 0
-    for child in childFrames:
-        print(child)
-        title = child['title']
-        if len(title) == 0:
-            title = index
-            index += 1
-
-        with open(f'{title}.html', 'w') as fp:
-            fp.write(child['html'])
-
-    html = resp_json['html']
-    with open('p1.html', 'w') as fp:
-        fp.write(html)