Преглед изворни кода

优化 - 网络状态码3xx(重定向)的跳转处理

dongzhaorui пре 3 година
родитељ
комит
b5afbf0823
2 измењених фајлова са 17 додато и 10 уклоњено
  1. 11 4
      find_source/crawler/download.py
  2. 6 6
      find_source/crawler/validate.py

+ 11 - 4
find_source/crawler/download.py

@@ -1,23 +1,26 @@
 import requests
 import requests
 import urllib3
 import urllib3
-from requests.models import Response
+from requests.models import Response, REDIRECT_STATI
 
 
 from config.load import headers
 from config.load import headers
-from settings import SPECIAL_ENCODINGS
 
 
 urllib3.disable_warnings()
 urllib3.disable_warnings()
 
 
+'''特殊编码需要解码'''
+SPECIAL_ENCODINGS = [
+    'Windows-1254'
+]
+
 
 
 class Downloader:
 class Downloader:
 
 
     def __init__(self):
     def __init__(self):
         self.timeout = 15
         self.timeout = 15
-        self.allow_redirects = False
         self.max_retries = 3
         self.max_retries = 3
 
 
     def prepare_params(self, **kw):
     def prepare_params(self, **kw):
         request_params = {}
         request_params = {}
-        request_params.setdefault('allow_redirects', self.allow_redirects)
+        request_params.setdefault('allow_redirects', False)
         request_params.setdefault('timeout', self.timeout)
         request_params.setdefault('timeout', self.timeout)
         for key, val in kw.items():
         for key, val in kw.items():
             if key != 'headers' and key in request_params:
             if key != 'headers' and key in request_params:
@@ -42,6 +45,10 @@ class Downloader:
         while retries < self.max_retries:
         while retries < self.max_retries:
             try:
             try:
                 response = requests.get(url, **request_params)
                 response = requests.get(url, **request_params)
+                # 解决重定向的网站
+                if response.status_code in REDIRECT_STATI:
+                    request_params.pop('allow_redirects')
+                    continue
                 response.encoding = response.apparent_encoding
                 response.encoding = response.apparent_encoding
                 if response.encoding in SPECIAL_ENCODINGS:
                 if response.encoding in SPECIAL_ENCODINGS:
                     response.encoding = 'utf-8'
                     response.encoding = 'utf-8'

+ 6 - 6
find_source/crawler/validate.py

@@ -1,5 +1,5 @@
 from crawler.bloom_filter.RedisBloomFilter import RedisFilter
 from crawler.bloom_filter.RedisBloomFilter import RedisFilter
-from settings import REQUIREMENT_PHRASE
+from settings import FILTER_WORDS
 
 
 
 
 class Validator:
 class Validator:
@@ -10,9 +10,9 @@ class Validator:
         self._rbf.start(1000000000, 0.00001)
         self._rbf.start(1000000000, 0.00001)
 
 
     @staticmethod
     @staticmethod
-    def _requirement_phrase(val: str):
-        """关键词"""
-        for word in REQUIREMENT_PHRASE:
+    def _filter_words(val: str):
+        """过滤词"""
+        for word in FILTER_WORDS:
             if val.find(word) != -1:
             if val.find(word) != -1:
                 return True
                 return True
         return False
         return False
@@ -23,8 +23,8 @@ class Validator:
     def data(self, val: str):
     def data(self, val: str):
         return self._rbf.is_exists(val)
         return self._rbf.is_exists(val)
 
 
-    def phrase(self, val: str):
-        return self._requirement_phrase(val)
+    def words(self, val: str):
+        return self._filter_words(val)
 
 
     @property
     @property
     def length(self):
     def length(self):