Pārlūkot izejas kodu

新增drissionpage下载器

dongzhaorui 2 mēneši atpakaļ
vecāks
revīzija
f54eb1ce25

+ 18 - 0
FworkSpider/feapder/network/downloader/__init__.py

@@ -0,0 +1,18 @@
# Core downloaders: hard dependencies, always importable.
from ._requests import RequestsDownloader
from ._requests import RequestsSessionDownloader
from ._requests import RequestsJa3SessionDownloader


# Optional render downloaders below — each needs an extra third-party
# package (selenium / playwright / DrissionPage).  Catch ImportError rather
# than only ModuleNotFoundError so that partially-installed or
# version-incompatible backends (which may raise a plain ImportError)
# are also skipped gracefully instead of breaking the whole package import.
try:
    from ._selenium import SeleniumDownloader
except ImportError:
    pass
try:
    from ._playwright import PlaywrightDownloader
except ImportError:
    pass
try:
    from ._drissionpage import DrissionPageDownloader
except ImportError:
    pass

+ 107 - 0
FworkSpider/feapder/network/downloader/_drissionpage.py

@@ -0,0 +1,107 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-05-14 
+---------
+@summary:
+---------
+@author: Dzr
+"""
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.network.downloader.base import RenderDownloader
+from feapder.network.response import Response
+from feapder.utils.webdriver import WebDriverPool, DrissionPageDriver
+
+
class DrissionPageDownloader(RenderDownloader):
    """Render downloader that fetches pages through pooled DrissionPage drivers."""

    # Class-level pool: one shared set of browsers per process.
    webdriver_pool: WebDriverPool = None

    @property
    def _webdriver_pool(self):
        # Lazily build the shared pool from the DRISSIONPAGE settings.
        if not self.__class__.webdriver_pool:
            self.__class__.webdriver_pool = WebDriverPool(
                **setting.DRISSIONPAGE, driver_cls=DrissionPageDriver
            )

        return self.__class__.webdriver_pool

    def download(self, request) -> Response:
        """
        Render ``request.url`` in a pooled DrissionPage driver.

        Args:
            request: feapder.Request

        Returns:
            feapder.Response with ``driver``/``browser`` attached so the
            caller can keep operating the page and later ``put_back`` or
            ``close`` it.

        Raises:
            Exception: any navigation failure; the broken driver is removed
                from the pool and the request's proxy discarded first.
        """
        # Proxy priority: per-request custom > config file > random
        if request.custom_proxies:
            proxy = request.get_proxy()
        elif setting.DRISSIONPAGE.get("proxy"):
            proxy = setting.DRISSIONPAGE.get("proxy")
        else:
            proxy = request.get_proxy()

        # User-Agent priority: per-request custom > config file > random
        if request.custom_ua:
            user_agent = request.get_user_agent()
        elif setting.DRISSIONPAGE.get("user_agent"):
            user_agent = setting.DRISSIONPAGE.get("user_agent")
        else:
            user_agent = request.get_user_agent()

        cookies = request.get_cookies()
        url = request.url
        render_time = request.render_time or setting.DRISSIONPAGE.get("render_time")
        if request.get_params():
            url = tools.joint_url(url, request.get_params())

        driver: DrissionPageDriver = self._webdriver_pool.get(
            user_agent=user_agent, proxy=proxy
        )

        try:
            if cookies:
                driver.url = url
                # BUG FIX: ``tab.set.cookies`` is a method in the DrissionPage
                # API; the previous ``driver.tab.set.cookies = cookies``
                # assignment shadowed the setter instead of installing the
                # cookies.  Call it instead.
                driver.tab.set.cookies(cookies)

            retry = setting.DRISSIONPAGE.get("retry")
            interval = setting.DRISSIONPAGE.get("interval")
            timeout = setting.DRISSIONPAGE.get("page_load")
            driver.tab.get(url, retry=retry, interval=interval, timeout=timeout)
            driver.wait_for_dom_load(timeout=render_time)

            if driver.tab.mode == "s":
                # Session (requests) mode already carries a real HTTP response.
                response = Response(driver.tab.response)
            else:
                # Browser mode: synthesize a response from the rendered DOM.
                cookies = driver.tab.cookies().as_dict()
                html = driver.tab.html
                response = Response.from_dict(
                    {
                        "url": driver.tab.url,
                        "cookies": cookies,
                        "_content": html.encode(),
                        "status_code": 200,
                        "elapsed": 666,  # placeholder: browser exposes no timing here
                        "headers": {
                            "User-Agent": driver.tab.user_agent,
                            "Cookie": tools.cookies2str(cookies),
                        },
                    }
                )

            response.driver = driver
            response.browser = driver
            return response
        except Exception as e:
            self._webdriver_pool.remove(driver)
            request.del_proxy()
            raise e

    def close(self, driver):
        # Remove one browser from the pool (it will not be reused).
        if driver:
            self._webdriver_pool.remove(driver)

    def put_back(self, driver):
        """
        Release the browser object back to the pool for reuse.
        """
        self._webdriver_pool.put(driver)

    def close_all(self):
        """
        Close every pooled browser.
        """
        self._webdriver_pool.close()

+ 105 - 0
FworkSpider/feapder/network/downloader/_playwright.py

@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2022/9/7 4:05 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.network.downloader.base import RenderDownloader
+from feapder.network.response import Response
+from feapder.utils.webdriver import WebDriverPool, PlaywrightDriver
+
+
class PlaywrightDownloader(RenderDownloader):
    """Render downloader backed by a shared, thread-safe pool of Playwright drivers."""

    webdriver_pool: WebDriverPool = None

    @property
    def _webdriver_pool(self):
        # One pool shared by every instance, created on first access.
        cls = self.__class__
        if not cls.webdriver_pool:
            cls.webdriver_pool = WebDriverPool(
                **setting.PLAYWRIGHT, driver_cls=PlaywrightDriver, thread_safe=True
            )

        return cls.webdriver_pool

    def download(self, request) -> Response:
        """
        Navigate to ``request.url`` in a pooled Playwright page and wrap the
        rendered document as a feapder Response (driver attached for reuse).
        """
        conf = setting.PLAYWRIGHT

        # Proxy priority: per-request custom > config file > random.
        proxy = (
            request.get_proxy()
            if request.custom_proxies
            else conf.get("proxy") or request.get_proxy()
        )
        # User-Agent priority mirrors the proxy priority.
        user_agent = (
            request.get_user_agent()
            if request.custom_ua
            else conf.get("user_agent") or request.get_user_agent()
        )

        cookies = request.get_cookies()
        render_time = request.render_time or conf.get("render_time")
        wait_until = conf.get("wait_until") or "domcontentloaded"

        target_url = request.url
        params = request.get_params()
        if params:
            target_url = tools.joint_url(target_url, params)

        driver: PlaywrightDriver = self._webdriver_pool.get(
            user_agent=user_agent, proxy=proxy
        )
        try:
            if cookies:
                driver.url = target_url
                driver.cookies = cookies

            http_response = driver.page.goto(target_url, wait_until=wait_until)
            status_code = http_response.status

            if render_time:
                tools.delay_time(render_time)

            html = driver.page.content()
            response = Response.from_dict(
                {
                    "url": driver.page.url,
                    "cookies": driver.cookies,
                    "_content": html.encode(),
                    "status_code": status_code,
                    "elapsed": 666,
                    "headers": {
                        "User-Agent": driver.user_agent,
                        "Cookie": tools.cookies2str(driver.cookies),
                    },
                }
            )

            response.driver = driver
            response.browser = driver
            return response
        except Exception as e:
            self._webdriver_pool.remove(driver)
            raise e

    def close(self, driver):
        # Drop one browser from the pool.
        if driver:
            self._webdriver_pool.remove(driver)

    def put_back(self, driver):
        """
        Release the browser object back to the pool for reuse.
        """
        self._webdriver_pool.put(driver)

    def close_all(self):
        """
        Close every pooled browser.
        """
        # Not supported for this pool implementation.
        # self._webdriver_pool.close()
        pass

+ 98 - 0
FworkSpider/feapder/network/downloader/_requests.py

@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2022/4/10 5:57 下午
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import random
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.ssl_ import create_urllib3_context
+
+import feapder.setting as setting
+from feapder.network.downloader.base import Downloader
+from feapder.network.response import Response
+
+
class RequestsDownloader(Downloader):
    """Stateless downloader: one independent ``requests`` call per request."""

    def download(self, request) -> Response:
        """Issue the HTTP request and wrap the result in a feapder Response."""
        raw = requests.request(
            request.method, request.url, **request.requests_kwargs
        )
        return Response(raw)
+
+
class RequestsSessionDownloader(Downloader):
    """Downloader that reuses one shared ``requests.Session`` (keep-alive, pooling)."""

    # Shared, lazily created session — one per process (class attribute).
    session = None

    @property
    def _session(self):
        if not self.__class__.session:
            self.__class__.session = requests.Session()
            # pool_connections - number of cached urllib3 connection pools
            # pool_maxsize - maximum connections kept alive per pool
            http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
            # BUG FIX: requests resolves adapters longest-prefix-first, and a
            # fresh Session already has default adapters mounted on "http://"
            # and "https://".  Mounting on the shorter prefix "http" meant
            # this adapter was never selected, so the enlarged pools had no
            # effect.  Mount on both full scheme prefixes instead.
            self.__class__.session.mount("http://", http_adapter)
            self.__class__.session.mount("https://", http_adapter)

        return self.__class__.session

    def download(self, request) -> Response:
        """Send *request* through the shared session and wrap the result."""
        response = self._session.request(
            request.method, request.url, **request.requests_kwargs
        )
        return Response(response)
+
+
class _DESAdapter(HTTPAdapter):
    """Transport adapter that re-enables 3DES support in Requests and
    presents a randomized TLS cipher order (varies the JA3 fingerprint)."""

    def __init__(self, *args, **kwargs):
        """Shuffle the configured cipher suite once per adapter instance."""
        # Flatten the configured cipher strings into individual ciphers,
        # randomize their order, then append the standard exclusions.
        cipher_list = ":".join(setting.SESSION_REQUEST_CIPHERS).split(":")
        random.shuffle(cipher_list)
        self.ciphers = ":".join(cipher_list) + ":!aNULL:!eNULL:!MD5"
        super().__init__(*args, **kwargs)

    def init_poolmanager(self, *args, **kwargs):
        # Direct connections use an SSL context built from the shuffled suite.
        kwargs["ssl_context"] = create_urllib3_context(ciphers=self.ciphers)
        return super().init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        # Proxied connections get the same shuffled-cipher SSL context.
        kwargs["ssl_context"] = create_urllib3_context(ciphers=self.ciphers)
        return super().proxy_manager_for(*args, **kwargs)
+
+
class RequestsJa3SessionDownloader(Downloader):
    """Session downloader whose TLS handshake uses a shuffled cipher suite,
    giving the process a randomized JA3 fingerprint."""

    session = None

    @property
    def _session(self):
        cls = self.__class__
        if cls.session is None:
            sess = requests.Session()
            # pool_connections: cached urllib3 connection pools;
            # pool_maxsize: maximum connections kept alive per pool.
            adapter = _DESAdapter(pool_connections=1000, pool_maxsize=1000)
            # Route every http/https URL through the cipher-shuffling adapter.
            for prefix in ("https://", "http://"):
                sess.mount(prefix, adapter)
            cls.session = sess

        return cls.session

    def download(self, request) -> Response:
        """Issue the HTTP request through the shared JA3 session."""
        raw = self._session.request(
            request.method, request.url, **request.requests_kwargs
        )
        return Response(raw)

+ 102 - 0
FworkSpider/feapder/network/downloader/_selenium.py

@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2022/7/26 4:28 下午
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.network.downloader.base import RenderDownloader
+from feapder.network.response import Response
+from feapder.utils.webdriver import WebDriverPool, SeleniumDriver
+
+
class SeleniumDownloader(RenderDownloader):
    """Render downloader driving real browsers through a shared Selenium pool."""

    webdriver_pool: WebDriverPool = None

    @property
    def _webdriver_pool(self):
        cls = self.__class__
        if not cls.webdriver_pool:
            # NOTE(review): the sibling downloaders pass ``driver_cls=`` while
            # this one passes ``driver=`` — verify WebDriverPool accepts both
            # keyword spellings before unifying.
            cls.webdriver_pool = WebDriverPool(
                **setting.WEBDRIVER, driver=SeleniumDriver
            )

        return cls.webdriver_pool

    def download(self, request) -> Response:
        """
        Load ``request.url`` in a pooled browser and wrap the rendered DOM
        as a feapder Response (browser attached for reuse).
        """
        conf = setting.WEBDRIVER

        # Proxy priority: per-request custom > config file > random.
        proxy = (
            request.get_proxy()
            if request.custom_proxies
            else conf.get("proxy") or request.get_proxy()
        )
        # User-Agent priority matches the proxy priority.
        user_agent = (
            request.get_user_agent()
            if request.custom_ua
            else conf.get("user_agent") or request.get_user_agent()
        )

        cookies = request.get_cookies()
        render_time = request.render_time or conf.get("render_time")

        target_url = request.url
        params = request.get_params()
        if params:
            target_url = tools.joint_url(target_url, params)

        browser: SeleniumDriver = self._webdriver_pool.get(
            user_agent=user_agent, proxy=proxy
        )
        try:
            browser.get(target_url)
            if cookies:
                browser.cookies = cookies
                # Reload so the freshly installed cookies take effect.
                browser.get(target_url)

            if render_time:
                tools.delay_time(render_time)

            html = browser.page_source
            response = Response.from_dict(
                {
                    "url": browser.current_url,
                    "cookies": browser.cookies,
                    "_content": html.encode(),
                    "status_code": 200,
                    "elapsed": 666,
                    "headers": {
                        "User-Agent": browser.user_agent,
                        "Cookie": tools.cookies2str(browser.cookies),
                    },
                }
            )

            response.driver = browser
            response.browser = browser
            return response
        except Exception as e:
            self._webdriver_pool.remove(browser)
            raise e

    def close(self, driver):
        # Drop one browser from the pool.
        if driver:
            self._webdriver_pool.remove(driver)

    def put_back(self, driver):
        """
        Release the browser object back to the pool for reuse.
        """
        self._webdriver_pool.put(driver)

    def close_all(self):
        """
        Close every pooled browser.
        """
        self._webdriver_pool.close()

+ 41 - 0
FworkSpider/feapder/network/downloader/base.py

@@ -0,0 +1,41 @@
+import abc
+from abc import ABC
+
+from feapder.network.response import Response
+
+
class Downloader(abc.ABC):
    """Abstract base for all downloaders: turn a feapder Request into a Response.

    Inherits ``abc.ABC`` so that ``@abc.abstractmethod`` is actually
    enforced — without an ABCMeta-based class the decorator has no effect
    and incomplete subclasses could be instantiated silently.
    """

    @abc.abstractmethod
    def download(self, request) -> Response:
        """

        Args:
            request: feapder.Request

        Returns: feapder.Response

        """
        raise NotImplementedError

    def close(self, response: Response):
        # Hook for releasing per-response resources; no-op by default.
        pass
+
+
class RenderDownloader(Downloader, ABC):
    """Base class for browser-rendering downloaders (selenium / playwright /
    DrissionPage).  Adds driver-pool lifecycle hooks on top of Downloader."""

    def put_back(self, driver):
        """
        Release the browser object back to the pool for reuse.
        """
        pass

    def close(self, driver):
        """
        Close this browser.
        """
        pass

    def close_all(self):
        """
        Close every pooled browser.
        """
        pass