Pārlūkot izejas kodu

添加 ja3_session 机制

dongzhaorui 1 gadu atpakaļ
vecāks
revīzija
4910476e5f

+ 47 - 4
FworkSpider/feapder/network/request.py

@@ -14,6 +14,7 @@ import requests
 from requests.adapters import HTTPAdapter
 from requests.adapters import HTTPAdapter
 from requests.cookies import RequestsCookieJar
 from requests.cookies import RequestsCookieJar
 from requests.packages.urllib3.exceptions import InsecureRequestWarning
 from requests.packages.urllib3.exceptions import InsecureRequestWarning
+from requests.packages.urllib3.util.ssl_ import create_urllib3_context
 
 
 import feapder.setting as setting
 import feapder.setting as setting
 import feapder.utils.tools as tools
 import feapder.utils.tools as tools
@@ -27,6 +28,29 @@ from feapder.utils.webdriver import WebDriverPool
 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
 
 
 
 
+class DESAdapter(HTTPAdapter):
+
+    def __init__(self, *args, **kwargs):
+        """
+        A TransportAdapter that re-enables 3DES support in Requests.
+        """
+        ciphers = ":".join(setting.JA3_REQUEST_CIPHERS).split(':')
+        tools.random.shuffle(ciphers)
+        ciphers = ':'.join(ciphers)
+        self.ciphers = ciphers + ':!aNULL:!eNULL:!MD5'
+        super().__init__(*args, **kwargs)
+
+    def init_poolmanager(self, *args, **kwargs):
+        context = create_urllib3_context(ciphers=self.ciphers)
+        kwargs['ssl_context'] = context
+        return super(DESAdapter, self).init_poolmanager(*args, **kwargs)
+
+    def proxy_manager_for(self, *args, **kwargs):
+        context = create_urllib3_context(ciphers=self.ciphers)
+        kwargs['ssl_context'] = context
+        return super(DESAdapter, self).proxy_manager_for(*args, **kwargs)
+
+
 class Request(object):
 class Request(object):
     session = None
     session = None
     webdriver_pool: WebDriverPool = None
     webdriver_pool: WebDriverPool = None
@@ -67,6 +91,7 @@ class Request(object):
         auto_request=True,
         auto_request=True,
         request_sync=False,
         request_sync=False,
         use_session=None,
         use_session=None,
+        use_ja3_session=None,
         random_user_agent=True,
         random_user_agent=True,
         download_midware=None,
         download_midware=None,
         is_abandoned=False,
         is_abandoned=False,
@@ -85,6 +110,7 @@ class Request(object):
         auto_request=True,
         auto_request=True,
         request_sync=False,
         request_sync=False,
         use_session=None,
         use_session=None,
+        use_ja3_session=None,
         random_user_agent=True,
         random_user_agent=True,
         download_midware=None,
         download_midware=None,
         is_abandoned=False,
         is_abandoned=False,
@@ -108,6 +134,7 @@ class Request(object):
         @param auto_request: 是否需要自动请求下载网页 默认是。设置为False时返回的response为空,需要自己去请求网页
         @param auto_request: 是否需要自动请求下载网页 默认是。设置为False时返回的response为空,需要自己去请求网页
         @param request_sync: 是否同步请求下载网页,默认异步。如果该请求url过期时间快,可设置为True,相当于yield的reqeust会立即响应,而不是去排队
         @param request_sync: 是否同步请求下载网页,默认异步。如果该请求url过期时间快,可设置为True,相当于yield的reqeust会立即响应,而不是去排队
         @param use_session: 是否使用session方式
         @param use_session: 是否使用session方式
+        @param use_ja3_session: 是否使用ja3_session方式
         @param random_user_agent: 是否随机User-Agent (True/False) 当setting中的RANDOM_HEADERS设置为True时该参数生效 默认True
         @param random_user_agent: 是否随机User-Agent (True/False) 当setting中的RANDOM_HEADERS设置为True时该参数生效 默认True
         @param download_midware: 下载中间件。默认为parser中的download_midware
         @param download_midware: 下载中间件。默认为parser中的download_midware
         @param is_abandoned: 当发生异常时是否放弃重试 True/False. 默认False
         @param is_abandoned: 当发生异常时是否放弃重试 True/False. 默认False
@@ -149,6 +176,7 @@ class Request(object):
         self.auto_request = auto_request
         self.auto_request = auto_request
         self.request_sync = request_sync
         self.request_sync = request_sync
         self.use_session = use_session
         self.use_session = use_session
+        self.use_ja3_session = use_ja3_session
         self.random_user_agent = random_user_agent
         self.random_user_agent = random_user_agent
         self.download_midware = download_midware
         self.download_midware = download_midware
         self.is_abandoned = is_abandoned
         self.is_abandoned = is_abandoned
@@ -192,12 +220,23 @@ class Request(object):
         use_session = (
         use_session = (
             setting.USE_SESSION if self.use_session is None else self.use_session
             setting.USE_SESSION if self.use_session is None else self.use_session
         )  # self.use_session 优先级高
         )  # self.use_session 优先级高
+        use_ja3_session = (
+            setting.USE_JA3_SESSION if self.use_ja3_session is None else self.use_ja3_session
+        )  # self.use_ja3_session 优先级高
+        use_session = use_session or use_ja3_session
         if use_session and not self.__class__.session:
         if use_session and not self.__class__.session:
             self.__class__.session = requests.Session()
             self.__class__.session = requests.Session()
-            # pool_connections – 缓存的 urllib3 连接池个数  pool_maxsize – 连接池中保存的最大连接数
-            http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
-            # 任何使用该session会话的 HTTP 请求,只要其 URL 是以给定的前缀开头,该传输适配器就会被使用到。
-            self.__class__.session.mount("http", http_adapter)
+            if use_ja3_session:
+                # pool_connections – 缓存的 urllib3 连接池个数  pool_maxsize – 连接池中保存的最大连接数
+                des_adapter = DESAdapter(pool_connections=1000, pool_maxsize=1000)
+                # 任何使用该session会话的 HTTP/HTTPS 请求,只要其 URL 是以给定的前缀开头,该传输适配器就会被使用到。
+                self.__class__.session.mount("https://", des_adapter)
+                self.__class__.session.mount("http://", des_adapter)
+            else:
+                # pool_connections – 缓存的 urllib3 连接池个数  pool_maxsize – 连接池中保存的最大连接数
+                http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
+                # 任何使用该session会话的 HTTP 请求,只要其 URL 是以给定的前缀开头,该传输适配器就会被使用到。
+                self.__class__.session.mount("http", http_adapter)
 
 
         return self.__class__.session
         return self.__class__.session
 
 
@@ -339,6 +378,10 @@ class Request(object):
         use_session = (
         use_session = (
             setting.USE_SESSION if self.use_session is None else self.use_session
             setting.USE_SESSION if self.use_session is None else self.use_session
         )  # self.use_session 优先级高
         )  # self.use_session 优先级高
+        use_ja3_session = (
+            setting.USE_JA3_SESSION if self.use_ja3_session is None else self.use_ja3_session
+        )  # self.use_ja3_session 优先级高
+        use_session = use_session or use_ja3_session
 
 
         if self.render:
         if self.render:
             # 使用request的user_agent、cookies、proxy
             # 使用request的user_agent、cookies、proxy

+ 3 - 1
FworkSpider/feapder/setting.py

@@ -121,7 +121,9 @@ USER_AGENT_TYPE = "chrome"
 DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
 DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
 # requests 使用session
 # requests 使用session
 USE_SESSION = False
 USE_SESSION = False
-
+USE_JA3_SESSION = False
+# For sites that use JA3 (TLS fingerprint) anti-crawling: the cipher suites requests offers during the TLS handshake
+JA3_REQUEST_CIPHERS = ["DH+AES", "RSA+AES"]
 # 去重
 # 去重
 ITEM_FILTER_ENABLE = False  # item 去重
 ITEM_FILTER_ENABLE = False  # item 去重
 ITEM_FILTER_SETTING = dict(
 ITEM_FILTER_SETTING = dict(