dongzhaorui 3 years ago
parent
commit
5df85cacfc
1 changed file with 10 additions and 9 deletions
  1. 10 9
      find_source/crawler/download.py

+ 10 - 9
find_source/crawler/download.py

@@ -1,5 +1,6 @@
 import threading
 
+import chardet
 import requests
 import urllib3
 from loguru import logger
@@ -19,10 +20,9 @@ SPECIAL_ENCODINGS = [
 
 class Downloader:
 
-    def __init__(self, connect=5, backoff_factor=0.1):
-        self._connect = connect
-        self._backoff_factor = backoff_factor
-        self._max_retries = 3
+    def __init__(self, max_retries=3, retry_interval=0.1):
+        self._max_retries = max_retries
+        self._backoff_factor = retry_interval
 
     @staticmethod
     def prepare_params(**kw):
@@ -45,7 +45,7 @@ class Downloader:
         return request_params
 
     def _requests_by_get(self, url, **kw):
-        max_retries = (kw.pop('max_retries', None) or self._max_retries)
+        max_retries = (kw.pop('max_retries', 3))
         disable_debug_log = kw.pop('disable_debug_log', True)
         request_params = self.prepare_params(**kw)
         response = Response()
@@ -59,9 +59,10 @@ class Downloader:
                 if response.status_code in REDIRECT_STATI:
                     request_params.update({'allow_redirects': True})
                     continue
-                response.encoding = response.apparent_encoding
-                if response.encoding in SPECIAL_ENCODINGS:
-                    response.encoding = 'utf-8'
+                response.encoding = chardet.detect(response.content)['encoding']
+                # response.encoding = response.apparent_encoding
+                # if response.encoding in SPECIAL_ENCODINGS:
+                #     response.encoding = 'utf-8'
                 break
             except requests.exceptions.SSLError as e:
                 response.reason = e.__class__.__name__
@@ -93,7 +94,7 @@ class Downloader:
     @property
     def _session(self):
         retry = Retry(
-            connect=self._connect,
+            total=self._max_retries,
             backoff_factor=self._backoff_factor
         )
         adapter = HTTPAdapter(max_retries=retry)