浏览代码

新增drissionpage驱动

dongzhaorui 2 月之前
父节点
当前提交
deae530b7b

+ 17 - 0
FworkSpider/feapder/utils/webdriver/__init__.py

@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2022/9/7 4:39 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+from .drissionpage_driver import DrissionPageDriver
+from .playwright_driver import PlaywrightDriver
+from .selenium_driver import SeleniumDriver
+from .webdirver import InterceptRequest, InterceptResponse
+from .webdriver_pool import WebDriverPool
+
+# Alias kept for backward compatibility with older code
+WebDriver = SeleniumDriver

+ 293 - 0
FworkSpider/feapder/utils/webdriver/drissionpage_driver.py

@@ -0,0 +1,293 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-05-14 
+---------
+@summary:
+---------
+@author: Dzr
+"""
+
+from time import perf_counter
+
+from DrissionPage import Chromium, ChromiumOptions
+from DrissionPage.common import Settings
+
+from feapder.utils import tools
+from feapder.utils.log import log
+from feapder.utils.webdriver.webdirver import *
+
+
+class SingletonMeta(type):
+    """单例元类"""
+    _instances = {}
+
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            cls._instances[cls] = super().__call__(*args, **kwargs)
+        return cls._instances[cls]
+
+    def clear_instance(cls):
+        """清除元类中保存的实例引用"""
+        if cls in cls._instances:
+            del cls._instances[cls]
+
+
+class Browser(metaclass=SingletonMeta):
+    _browser: Chromium = None
+
+    def __init__(
+        self,
+        load_images=True,
+        user_agent=None,
+        port=None,
+        user_data_path=None,
+        proxy=None,
+        headless=False,
+        singleton_tab=True,
+        driver_type="Chromium",
+        timeout=30,
+        custom_argument=None,
+        download_path=None,
+        browser_path=None,
+        **kwargs
+    ):
+        """
+        webdriver 封装,仅支持Chromium
+        Args:
+            load_images: 是否加载图片
+            port: 浏览器端口
+            user_data_path: 用户数据目录
+            scope: 自动端口范围,与port 同时只能生效一个
+            user_agent: 字符串 或 无参函数,返回值为user_agent
+            proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
+            headless: 是否启用无头模式
+            driver_type: Chromium
+            singleton_tab: 标签页是否开启多例支持,True=单例 False=多例
+            timeout: 请求超时时间
+            custom_argument: 自定义参数,浏览器启动配置参数
+            download_path: 文件下载保存路径;
+            browser_path: 浏览器可执行文件路径;
+            **kwargs:
+        """
+
+        # 如果实例已存在,则不再重新初始化
+        if self._browser is not None:
+            return
+
+        self._singleton_tab = singleton_tab
+        self._driver_type = driver_type
+        self._headless = headless
+        self._user_agent = user_agent or setting.DEFAULT_USERAGENT
+        self._proxy = proxy
+        self._timeout = timeout
+        self._load_images = load_images
+        self._download_path = download_path
+        self._browser_path = browser_path
+        self._custom_argument = custom_argument
+        self._kwargs = kwargs
+
+        Settings.set_language("zh_cn")  # DrissionPage 的报错信息及提示设置为中文
+        Settings.set_singleton_tab_obj(self._singleton_tab)
+
+        co = ChromiumOptions()
+        if self._browser_path is not None:
+            co.set_browser_path(self._browser_path)
+
+        port = port or setting.DRISSIONPAGE.get("port")
+        user_data_path = user_data_path or setting.DRISSIONPAGE.get("user_data_path")
+        if port is not None:
+            co.set_local_port(int(port))
+            if user_data_path is not None:
+                co.set_user_data_path(user_data_path)
+        else:
+            # 设置自动端口范围
+            co.auto_port(scope=setting.DRISSIONPAGE.get("scope"))
+
+        # 设置默认超时时间,用于元素等待、alert 等待、WebPage的 s 模式连接等等
+        if self._timeout is not None:
+            co.set_timeouts(base=self._timeout)
+
+        # 设置是否以无界面模式启动浏览器
+        co.headless(on_off=self._headless)
+
+        # 设置初始窗口大小
+        window_size = setting.DRISSIONPAGE.get("window_size")
+        if window_size is not None:
+            window_size = ",".join((str(n) for n in window_size))
+            co.set_argument("--window-size", window_size)
+
+        # 设置 useragent
+        co.set_user_agent(self._user_agent)
+
+        # 设置浏览器代理
+        if self._proxy is not None:
+            co.set_argument("--proxy-server", value=self._proxy)
+
+        # 设置是否禁止加载图片
+        co.no_imgs(on_off=not self._load_images)
+
+        # 设置下载路径
+        if self._download_path is not None:
+            co.set_download_path(self._download_path)
+
+        # 添加自定义的配置参数
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                co.set_argument(arg)
+
+        self._browser = Chromium(addr_or_opts=co)
+
+    @property
+    def is_running(self):
+        return self._browser.states.is_alive if self._browser is not None else False
+
+    def new_tab(self):
+        if self.is_running:
+            return self._browser.new_tab()
+
+    def tabs_count(self):
+        if self.is_running:
+            return self._browser.tabs_count
+        else:
+            return 0
+
+    def get_browser(self):
+        return self._browser
+
+    def quit(self):
+        if self._browser is not None:
+            self._browser.quit(del_data=True)
+            self._browser = None
+            SingletonMeta.clear_instance(self.__class__)  # 释放资源
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_val:
+            log.error(exc_val)
+
+        self.quit()
+        return True
+
+
+class DrissionPageDriver(WebDriver):
+
+    def __init__(self, **kwargs):
+        super(DrissionPageDriver, self).__init__(**kwargs)
+        # 创建全局浏览器实例(单例)
+        self._browser = Browser(**kwargs)
+
+        # 创建新标签页
+        tab = self._browser.new_tab()
+
+        # 设置自动确认弹窗
+        tab.set.auto_handle_alert()
+
+        # 设置网页加载策略
+        tab.set.load_mode(setting.DRISSIONPAGE.get("load_mode", "normal"))
+
+        # 设置浏览器标识
+        ua = kwargs.get("user_agent")
+        if ua is not None:
+            tab.set.user_agent(ua)
+
+        self.url = None
+        self.tab = tab
+
+    @property
+    def browser(self):
+        return self._browser.get_browser()
+
+    def get_tab(self):
+        """获取当前标签页,启用多例-可支持多个实例控制同一个标签页"""
+        return self.browser.get_tab(id_or_num=self.tab.tab_id)
+
+    def get_dom_hash(self):
+        """获取当前DOM的哈希值"""
+        return tools.get_md5(self.tab.html)
+
+    def wait_for_dom_stable(self, duration=None):
+        """
+        计算指定时间内页面的DOM变化次数,可忽略特定元素
+
+        :param duration: 监听时长(秒)
+        :return: DOM变化次数
+        """
+        duration = self.tab.timeouts.base if duration is None else duration
+        script = f'''
+            return new Promise((resolve) => {{
+                let mutationCount = 0;
+
+                // 创建MutationObserver监听DOM变化
+                const observer = new MutationObserver((mutations) => {{
+                    if (mutations.length > 0) {{
+                        mutationCount += mutations.length;
+                    }}
+                }});
+
+                // 开始监听所有DOM变化
+                observer.observe(document, {{
+                    childList: true,
+                    attributes: true,
+                    subtree: true,
+                    characterData: true
+                }});
+
+                // 指定时间后停止监听并返回变化次数
+                setTimeout(() => {{
+                    observer.disconnect();
+                    resolve(mutationCount);
+                }}, {duration * 1000});
+            }});
+        '''
+        count = self.tab.run_js(script)
+        return True if count == 0 else False
+
+    def reload(self):
+        # 尝试触发重绘(通过重绘修复图片加载完成后可能导致布局错乱问题)
+        self.tab.run_js('document.body.style.display="none";document.body.style.display="block";')
+
+    def wait_for_dom_load(self, timeout=None, raise_err=False):
+        """等待页面加载"""
+
+        assert "ERR_CONNECTION_CLOSED" not in self.tab.raw_data
+        assert self.tab.url_available is True
+
+        timeout = self.tab.timeouts.page_load if timeout is None else timeout
+        end_time = perf_counter() + timeout
+        while perf_counter() < end_time:
+            init_hash = self.get_dom_hash()
+            render_time = (end_time - perf_counter()) / 10
+            timeout = render_time if render_time > 0 else 1
+            self.reload()
+            self.tab.wait(timeout)
+            current_hash = self.get_dom_hash()
+            if current_hash == init_hash:
+                return True
+
+        if raise_err is True:
+            raise TimeoutError("等待页面加载超时")
+
+        return False
+
+    @property
+    def domain(self):
+        return tools.get_domain(self.url or self.tab.url)
+
+    def quit(self):
+        if self.tab:
+            self.tab.close()
+
+        if self._browser.is_running and self._browser.tabs_count() <= 1:
+            self._browser.quit()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_val:
+            log.error(exc_val)
+
+        self.quit()
+        return True

+ 298 - 0
FworkSpider/feapder/utils/webdriver/playwright_driver.py

@@ -0,0 +1,298 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2022/9/7 4:11 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import json
+import os
+import re
+from collections import defaultdict
+from typing import Union, List
+
+try:
+    from typing import Literal  # python >= 3.8
+except ImportError:  # python <3.8
+    from typing_extensions import Literal
+
+
+from playwright.sync_api import Page, BrowserContext, ViewportSize, ProxySettings
+from playwright.sync_api import Playwright, Browser
+from playwright.sync_api import Response
+from playwright.sync_api import sync_playwright
+
+from feapder.utils import tools
+from feapder.utils.log import log
+from feapder.utils.webdriver.webdirver import *
+
+
+class PlaywrightDriver(WebDriver):
+    def __init__(
+        self,
+        *,
+        page_on_event_callback: dict = None,
+        storage_state_path: str = None,
+        driver_type: Literal["chromium", "firefox", "webkit"] = "chromium",
+        url_regexes: list = None,
+        save_all: bool = False,
+        **kwargs
+    ):
+        """
+
+        Args:
+            page_on_event_callback: page.on() 事件的回调 如 page_on_event_callback={"dialog": lambda dialog: dialog.accept()}
+            storage_state_path: 保存浏览器状态的路径
+            driver_type: 浏览器类型 chromium, firefox, webkit
+            url_regexes: 拦截接口,支持正则,数组类型
+            save_all: 是否保存所有拦截的接口, 默认只保存最后一个
+            **kwargs:
+        """
+        super(PlaywrightDriver, self).__init__(**kwargs)
+        self.driver: Playwright = None
+        self.browser: Browser = None
+        self.context: BrowserContext = None
+        self.page: Page = None
+        self.url = None
+        self.storage_state_path = storage_state_path
+
+        self._driver_type = driver_type or "chromium"
+        self._page_on_event_callback = page_on_event_callback
+        self._url_regexes = url_regexes
+        self._save_all = save_all
+
+        if self._save_all and self._url_regexes:
+            log.warning(
+                "获取完拦截的数据后, 请主动调用PlaywrightDriver的clear_cache()方法清空拦截的数据,否则数据会一直累加,导致内存溢出"
+            )
+            self._cache_data = defaultdict(list)
+        else:
+            self._cache_data = {}
+
+        self._setup()
+
+    def _setup(self):
+        # 处理参数
+        if self._proxy:
+            proxy = self._proxy() if callable(self._proxy) else self._proxy
+            proxy = self.format_context_proxy(proxy)
+        else:
+            proxy = None
+
+        user_agent = (
+            self._user_agent() if callable(self._user_agent) else self._user_agent
+        )
+
+        view_size = ViewportSize(
+            width=self._window_size[0], height=self._window_size[1]
+        )
+
+        # 初始化浏览器对象
+        self.driver = sync_playwright().start()
+        self.browser = getattr(self.driver, self._driver_type).launch(
+            headless=self._headless,
+            args=["--no-sandbox"],
+            proxy=proxy,
+            executable_path=self._executable_path,
+            downloads_path=self._download_path,
+        )
+
+        if self.storage_state_path and os.path.exists(self.storage_state_path):
+            self.context = self.browser.new_context(
+                user_agent=user_agent,
+                screen=view_size,
+                viewport=view_size,
+                proxy=proxy,
+                storage_state=self.storage_state_path,
+            )
+        else:
+            self.context = self.browser.new_context(
+                user_agent=user_agent,
+                screen=view_size,
+                viewport=view_size,
+                proxy=proxy,
+            )
+
+        if self._use_stealth_js:
+            path = os.path.join(os.path.dirname(__file__), "../js/stealth.min.js")
+            self.context.add_init_script(path=path)
+
+        self.page = self.context.new_page()
+        self.page.set_default_timeout(self._timeout * 1000)
+
+        if self._page_on_event_callback:
+            for event, callback in self._page_on_event_callback.items():
+                self.page.on(event, callback)
+
+        if self._url_regexes:
+            self.page.on("response", self.on_response)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_val:
+            log.error(exc_val)
+
+        self.quit()
+        return True
+
+    def format_context_proxy(self, proxy) -> ProxySettings:
+        """
+        Args:
+            proxy: username:password@ip:port / ip:port
+        Returns:
+            {
+                "server": "ip:port"
+                "username": username,
+                "password": password,
+            }
+            server: http://ip:port or socks5://ip:port. Short form ip:port is considered an HTTP proxy.
+        """
+
+        if "@" in proxy:
+            certification, _proxy = proxy.split("@")
+            username, password = certification.split(":")
+
+            context_proxy = ProxySettings(
+                server=_proxy,
+                username=username,
+                password=password,
+            )
+        else:
+            context_proxy = ProxySettings(server=proxy)
+
+        return context_proxy
+
+    def save_storage_stage(self):
+        if self.storage_state_path:
+            os.makedirs(os.path.dirname(self.storage_state_path), exist_ok=True)
+            self.context.storage_state(path=self.storage_state_path)
+
+    def quit(self):
+        self.page.close()
+        self.context.close()
+        self.browser.close()
+        self.driver.stop()
+
+    @property
+    def domain(self):
+        return tools.get_domain(self.url or self.page.url)
+
+    @property
+    def cookies(self):
+        cookies_json = {}
+        for cookie in self.page.context.cookies():
+            cookies_json[cookie["name"]] = cookie["value"]
+
+        return cookies_json
+
+    @cookies.setter
+    def cookies(self, val: Union[dict, List[dict]]):
+        """
+        设置cookie
+        Args:
+            val: List[{name: str, value: str, url: Union[str, NoneType], domain: Union[str, NoneType], path: Union[str, NoneType], expires: Union[float, NoneType], httpOnly: Union[bool, NoneType], secure: Union[bool, NoneType], sameSite: Union["Lax", "None", "Strict", NoneType]}]
+
+        Returns:
+
+        """
+        if isinstance(val, list):
+            self.page.context.add_cookies(val)
+        else:
+            cookies = []
+            for key, value in val.items():
+                cookies.append(
+                    {"name": key, "value": value, "url": self.url or self.page.url}
+                )
+            self.page.context.add_cookies(cookies)
+
+    @property
+    def user_agent(self):
+        return self.page.evaluate("() => navigator.userAgent")
+
+    def on_response(self, response: Response):
+        for regex in self._url_regexes:
+            if re.search(regex, response.request.url):
+                intercept_request = InterceptRequest(
+                    url=response.request.url,
+                    headers=response.request.headers,
+                    data=response.request.post_data,
+                )
+
+                intercept_response = InterceptResponse(
+                    request=intercept_request,
+                    url=response.url,
+                    headers=response.headers,
+                    content=response.body(),
+                    status_code=response.status,
+                )
+                if self._save_all:
+                    self._cache_data[regex].append(intercept_response)
+                else:
+                    self._cache_data[regex] = intercept_response
+
+    def get_response(self, url_regex) -> InterceptResponse:
+        if self._save_all:
+            response_list = self._cache_data.get(url_regex)
+            if response_list:
+                return response_list[-1]
+        return self._cache_data.get(url_regex)
+
+    def get_all_response(self, url_regex) -> List[InterceptResponse]:
+        """
+        获取所有匹配的响应, 仅在save_all=True时有效
+        Args:
+            url_regex:
+
+        Returns:
+
+        """
+        response_list = self._cache_data.get(url_regex, [])
+        if not isinstance(response_list, list):
+            return [response_list]
+        return response_list
+
+    def get_text(self, url_regex):
+        return (
+            self.get_response(url_regex).content.decode()
+            if self.get_response(url_regex)
+            else None
+        )
+
+    def get_all_text(self, url_regex):
+        """
+        获取所有匹配的响应文本, 仅在save_all=True时有效
+        Args:
+            url_regex:
+
+        Returns:
+
+        """
+        return [
+            response.content.decode() for response in self.get_all_response(url_regex)
+        ]
+
+    def get_json(self, url_regex):
+        return (
+            json.loads(self.get_text(url_regex))
+            if self.get_response(url_regex)
+            else None
+        )
+
+    def get_all_json(self, url_regex):
+        """
+        获取所有匹配的响应json, 仅在save_all=True时有效
+        Args:
+            url_regex:
+
+        Returns:
+
+        """
+        return [json.loads(text) for text in self.get_all_text(url_regex)]
+
+    def clear_cache(self):
+        self._cache_data = defaultdict(list)

+ 530 - 0
FworkSpider/feapder/utils/webdriver/selenium_driver.py

@@ -0,0 +1,530 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021/3/18 4:59 下午
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import json
+import logging
+import os
+from typing import Optional, Union, List
+
+from selenium import webdriver
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
+from webdriver_manager.chrome import ChromeDriverManager
+from webdriver_manager.firefox import GeckoDriverManager
+
+from feapder.utils import tools
+from feapder.utils.log import log, OTHERS_LOG_LEVAL
+from feapder.utils.webdriver.webdirver import *
+
+# Silence webdriver_manager ("WDM") logging
+logging.getLogger("WDM").setLevel(OTHERS_LOG_LEVAL)
+
+
+class SeleniumDriver(WebDriver, RemoteWebDriver):
+    # Supported values of driver_type (compared against self._driver_type in __init__).
+    CHROME = "CHROME"
+    EDGE = "EDGE"
+    PHANTOMJS = "PHANTOMJS"
+    FIREFOX = "FIREFOX"
+
+    # Keyword names that filter_kwargs() lets through to the Chrome driver
+    # constructor; edge_driver() filters with this same set.
+    __CHROME_ATTRS__ = {
+        "executable_path",
+        "port",
+        "options",
+        "service_args",
+        "desired_capabilities",
+        "service_log_path",
+        "chrome_options",
+        "keep_alive",
+    }
+
+    # Edge shares the Chrome whitelist.
+    __EDGE_ATTRS__ = __CHROME_ATTRS__
+
+    # Keyword names let through to the Firefox driver constructor.
+    __FIREFOX_ATTRS__ = {
+        "firefox_profile",
+        "firefox_binary",
+        "timeout",
+        "capabilities",
+        "proxy",
+        "executable_path",
+        "options",
+        "service_log_path",
+        "firefox_options",
+        "service_args",
+        "desired_capabilities",
+        "log_path",
+        "keep_alive",
+    }
+    # Keyword names let through for the PhantomJS driver.
+    __PHANTOMJS_ATTRS__ = {
+        "executable_path",
+        "port",
+        "desired_capabilities",
+        "service_args",
+        "service_log_path",
+    }
+
+    def __init__(self, xhr_url_regexes: list = None, **kwargs):
+        """
+
+        Args:
+            xhr_url_regexes: 拦截xhr接口,支持正则,数组类型
+            **kwargs:
+        """
+        super(SeleniumDriver, self).__init__(**kwargs)
+        self._xhr_url_regexes = xhr_url_regexes
+        self._driver_type = self._driver_type or SeleniumDriver.CHROME
+
+        if self._xhr_url_regexes and self._driver_type != SeleniumDriver.CHROME:
+            raise Exception(
+                "xhr_url_regexes only support by chrome now! eg: driver_type=SeleniumDriver.CHROME"
+            )
+
+        if self._driver_type == SeleniumDriver.CHROME:
+            self.driver = self.chrome_driver()
+
+        elif self._driver_type == SeleniumDriver.EDGE:
+            self.driver = self.edge_driver()
+
+        elif self._driver_type == SeleniumDriver.PHANTOMJS:
+            self.driver = self.phantomjs_driver()
+
+        elif self._driver_type == SeleniumDriver.FIREFOX:
+            self.driver = self.firefox_driver()
+
+        else:
+            raise TypeError(
+                "dirver_type must be one of CHROME or PHANTOMJS or FIREFOX, but received {}".format(
+                    type(self._driver_type)
+                )
+            )
+
+        # driver.get(url)一直不返回,但也不报错的问题,这时程序会卡住,设置超时选项能解决这个问题。
+        self.driver.set_page_load_timeout(self._timeout)
+        # 设置10秒脚本超时时间
+        self.driver.set_script_timeout(self._timeout)
+        self.url = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_val:
+            log.error(exc_val)
+
+        self.quit()
+        return True
+
+    def filter_kwargs(self, kwargs: dict, driver_attrs: set):
+        if not kwargs:
+            return {}
+
+        data = {}
+        for key, value in kwargs.items():
+            if key in driver_attrs:
+                data[key] = value
+
+        return data
+
+    def get_driver(self):
+        return self.driver
+
+    def firefox_driver(self):
+        if webdriver.__version__ >= "4.0.0":
+            raise Exception(
+                f"暂未适配selenium=={webdriver.__version__}版本的firefox API,建议安装selenium==3.141.0版本或使用CHROME浏览器"
+            )
+
+        firefox_profile = webdriver.FirefoxProfile()
+        firefox_options = webdriver.FirefoxOptions()
+        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
+        try:
+            from selenium.webdriver.firefox.service import Service
+        except (ImportError, ModuleNotFoundError):
+            Service = None
+
+        if self._proxy:
+            proxy = self._proxy() if callable(self._proxy) else self._proxy
+            firefox_capabilities["marionette"] = True
+            firefox_capabilities["proxy"] = {
+                "proxyType": "MANUAL",
+                "httpProxy": proxy,
+                "ftpProxy": proxy,
+                "sslProxy": proxy,
+            }
+
+        if self._user_agent:
+            firefox_profile.set_preference(
+                "general.useragent.override",
+                self._user_agent() if callable(self._user_agent) else self._user_agent,
+            )
+
+        if not self._load_images:
+            firefox_profile.set_preference("permissions.default.image", 2)
+
+        if self._headless:
+            firefox_options.add_argument("--headless")
+            firefox_options.add_argument("--disable-gpu")
+
+        # 添加自定义的配置参数
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                firefox_options.add_argument(arg)
+
+        kwargs = self.filter_kwargs(self._kwargs, self.__FIREFOX_ATTRS__)
+
+        if Service is None:
+            if self._executable_path:
+                kwargs.update(executable_path=self._executable_path)
+            elif self._auto_install_driver:
+                kwargs.update(executable_path=GeckoDriverManager().install())
+        else:
+            if self._executable_path:
+                kwargs.update(service=Service(self._executable_path))
+            elif self._auto_install_driver:
+                kwargs.update(service=Service(GeckoDriverManager().install()))
+
+        driver = webdriver.Firefox(
+            capabilities=firefox_capabilities,
+            options=firefox_options,
+            firefox_profile=firefox_profile,
+            **kwargs,
+        )
+
+        if self._window_size:
+            driver.set_window_size(*self._window_size)
+
+        return driver
+
+    def chrome_driver(self):
+        chrome_options = webdriver.ChromeOptions()
+        # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
+        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        chrome_options.add_experimental_option("useAutomationExtension", False)
+        # docker 里运行需要
+        chrome_options.add_argument("--no-sandbox")
+        try:
+            from selenium.webdriver.chrome.service import Service
+        except (ImportError, ModuleNotFoundError):
+            Service = None
+
+        if self._proxy:
+            chrome_options.add_argument(
+                "--proxy-server={}".format(
+                    self._proxy() if callable(self._proxy) else self._proxy
+                )
+            )
+        if self._user_agent:
+            chrome_options.add_argument(
+                "user-agent={}".format(
+                    self._user_agent()
+                    if callable(self._user_agent)
+                    else self._user_agent
+                )
+            )
+        if not self._load_images:
+            chrome_options.add_experimental_option(
+                "prefs", {"profile.managed_default_content_settings.images": 2}
+            )
+
+        if self._headless:
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--disable-gpu")
+
+        if self._window_size:
+            chrome_options.add_argument(
+                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
+            )
+
+        if self._download_path:
+            os.makedirs(self._download_path, exist_ok=True)
+            prefs = {
+                "download.prompt_for_download": False,
+                "download.default_directory": self._download_path,
+            }
+            chrome_options.add_experimental_option("prefs", prefs)
+
+        # 添加自定义的配置参数
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                chrome_options.add_argument(arg)
+
+        kwargs = self.filter_kwargs(self._kwargs, self.__CHROME_ATTRS__)
+        if Service is None:
+            if self._executable_path:
+                kwargs.update(executable_path=self._executable_path)
+            elif self._auto_install_driver:
+                kwargs.update(executable_path=ChromeDriverManager().install())
+        else:
+            if self._executable_path:
+                kwargs.update(service=Service(self._executable_path))
+            elif self._auto_install_driver:
+                kwargs.update(service=Service(ChromeDriverManager().install()))
+
+        driver = webdriver.Chrome(options=chrome_options, **kwargs)
+
+        # 隐藏浏览器特征
+        if self._use_stealth_js:
+            with open(
+                os.path.join(os.path.dirname(__file__), "../js/stealth.min.js")
+            ) as f:
+                js = f.read()
+                driver.execute_cdp_cmd(
+                    "Page.addScriptToEvaluateOnNewDocument", {"source": js}
+                )
+
+        if self._xhr_url_regexes:
+            assert isinstance(self._xhr_url_regexes, list)
+            with open(
+                os.path.join(os.path.dirname(__file__), "../js/intercept.js")
+            ) as f:
+                js = f.read()
+            driver.execute_cdp_cmd(
+                "Page.addScriptToEvaluateOnNewDocument", {"source": js}
+            )
+            js = f"window.__urlRegexes = {self._xhr_url_regexes}"
+            driver.execute_cdp_cmd(
+                "Page.addScriptToEvaluateOnNewDocument", {"source": js}
+            )
+
+        if self._download_path:
+            driver.command_executor._commands["send_command"] = (
+                "POST",
+                "/session/$sessionId/chromium/send_command",
+            )
+            params = {
+                "cmd": "Page.setDownloadBehavior",
+                "params": {"behavior": "allow", "downloadPath": self._download_path},
+            }
+            driver.execute("send_command", params)
+
+        return driver
+
+    def edge_driver(self):
+        edge_options = webdriver.EdgeOptions()
+        # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
+        edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        edge_options.add_experimental_option("useAutomationExtension", False)
+        # docker 里运行需要
+        edge_options.add_argument("--no-sandbox")
+        try:
+            from selenium.webdriver.edge.service import Service
+        except (ImportError, ModuleNotFoundError):
+            Service = None
+
+        if self._proxy:
+            edge_options.add_argument(
+                "--proxy-server={}".format(
+                    self._proxy() if callable(self._proxy) else self._proxy
+                )
+            )
+        if self._user_agent:
+            edge_options.add_argument(
+                "user-agent={}".format(
+                    self._user_agent()
+                    if callable(self._user_agent)
+                    else self._user_agent
+                )
+            )
+        if not self._load_images:
+            edge_options.add_experimental_option(
+                "prefs", {"profile.managed_default_content_settings.images": 2}
+            )
+
+        if self._headless:
+            edge_options.add_argument("--headless")
+            edge_options.add_argument("--disable-gpu")
+
+        if self._window_size:
+            edge_options.add_argument(
+                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
+            )
+
+        if self._download_path:
+            os.makedirs(self._download_path, exist_ok=True)
+            prefs = {
+                "download.prompt_for_download": False,
+                "download.default_directory": self._download_path,
+            }
+            edge_options.add_experimental_option("prefs", prefs)
+
+        # 添加自定义的配置参数
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                edge_options.add_argument(arg)
+
+        kwargs = self.filter_kwargs(self._kwargs, self.__CHROME_ATTRS__)
+        if Service is None:
+            if self._executable_path:
+                kwargs.update(executable_path=self._executable_path)
+            elif self._auto_install_driver:
+                raise NotImplementedError("edge not support auto install driver")
+        else:
+            if self._executable_path:
+                kwargs.update(service=Service(self._executable_path))
+            elif self._auto_install_driver:
+                raise NotImplementedError("edge not support auto install driver")
+
+        driver = webdriver.Edge(options=edge_options, **kwargs)
+
+        # 隐藏浏览器特征
+        if self._use_stealth_js:
+            with open(
+                os.path.join(os.path.dirname(__file__), "../js/stealth.min.js")
+            ) as f:
+                js = f.read()
+                driver.execute_cdp_cmd(
+                    "Page.addScriptToEvaluateOnNewDocument", {"source": js}
+                )
+
+        if self._xhr_url_regexes:
+            assert isinstance(self._xhr_url_regexes, list)
+            with open(
+                os.path.join(os.path.dirname(__file__), "../js/intercept.js")
+            ) as f:
+                js = f.read()
+            driver.execute_cdp_cmd(
+                "Page.addScriptToEvaluateOnNewDocument", {"source": js}
+            )
+            js = f"window.__urlRegexes = {self._xhr_url_regexes}"
+            driver.execute_cdp_cmd(
+                "Page.addScriptToEvaluateOnNewDocument", {"source": js}
+            )
+
+        if self._download_path:
+            driver.command_executor._commands["send_command"] = (
+                "POST",
+                "/session/$sessionId/chromium/send_command",
+            )
+            params = {
+                "cmd": "Page.setDownloadBehavior",
+                "params": {"behavior": "allow", "downloadPath": self._download_path},
+            }
+            driver.execute("send_command", params)
+
+        return driver
+
+    def phantomjs_driver(self):
+        """
+        Create a PhantomJS driver configured from this instance's settings.
+
+        The proxy, the image-loading switch and the custom arguments are passed
+        to PhantomJS as service arguments, the user agent is set through the
+        desired capabilities, and the window size is applied after start-up.
+
+        Returns:
+            The driver object returned by ``webdriver.PhantomJS``.
+        """
+        import warnings
+
+        # NOTE: this installs an "ignore" filter for the whole process, it is
+        # not scoped to this call.
+        warnings.filterwarnings("ignore")
+
+        service_args = []
+        # Work on a copy so the user agent set below does not leak into the
+        # shared DesiredCapabilities.PHANTOMJS dict.
+        dcap = DesiredCapabilities.PHANTOMJS.copy()
+
+        if self._proxy:
+            # Resolve the proxy before formatting: "%" binds tighter than a
+            # conditional expression, so the value must be computed first for
+            # the "--proxy=" prefix to apply to plain strings and callables alike.
+            proxy = self._proxy() if callable(self._proxy) else self._proxy
+            service_args.append(f"--proxy={proxy}")
+        if self._user_agent:
+            dcap["phantomjs.page.settings.userAgent"] = (
+                self._user_agent() if callable(self._user_agent) else self._user_agent
+            )
+        if not self._load_images:
+            service_args.append("--load-images=no")
+
+        # Append the user-supplied arguments
+        if self._custom_argument:
+            service_args.extend(self._custom_argument)
+
+        kwargs = self.filter_kwargs(self._kwargs, self.__PHANTOMJS_ATTRS__)
+
+        if self._executable_path:
+            kwargs.update(executable_path=self._executable_path)
+
+        driver = webdriver.PhantomJS(
+            service_args=service_args, desired_capabilities=dcap, **kwargs
+        )
+
+        if self._window_size:
+            driver.set_window_size(self._window_size[0], self._window_size[1])
+
+        return driver
+
+    @property
+    def domain(self):
+        """Domain of ``self.url``, or of the driver's current URL when ``self.url`` is falsy."""
+        page_url = self.url or self.driver.current_url
+        return tools.get_domain(page_url)
+
+    @property
+    def cookies(self):
+        """Cookies of the current session as a ``{name: value}`` dict."""
+        return {
+            entry["name"]: entry["value"] for entry in self.driver.get_cookies()
+        }
+
+    @cookies.setter
+    def cookies(self, val: Union[dict, List[dict]]):
+        """
+        Add cookies to the current browser session.
+
+        Args:
+            val: either a mapping ``{"key": "value", "key2": "value2"}`` or a
+                list of cookie dicts holding ``name`` and ``value`` plus the
+                optional ``domain``, ``path``, ``secure`` and ``expiry``
+                fields (``expires`` is accepted as an alias of ``expiry``).
+
+        Returns:
+            None
+        """
+        if not isinstance(val, list):
+            for key, value in val.items():
+                self.driver.add_cookie({"name": key, "value": value})
+            return
+
+        for cookie in val:
+            _cookie = {"name": cookie.get("name"), "value": cookie.get("value")}
+
+            # Optional fields are sent only when the caller supplied them, so
+            # the driver never receives a None placeholder for them.
+            for field in ("domain", "path", "secure"):
+                if cookie.get(field) is not None:
+                    _cookie[field] = cookie[field]
+
+            # WebDriver calls the expiration field "expiry" and expects an
+            # integer; non-positive values are treated as "no expiration".
+            expiry = cookie.get("expiry")
+            if expiry is None:
+                expiry = cookie.get("expires")
+            if isinstance(expiry, (int, float)) and expiry > 0:
+                _cookie["expiry"] = int(expiry)
+
+            self.driver.add_cookie(_cookie)
+
+    @property
+    def user_agent(self):
+        """User agent string reported by the page's ``navigator.userAgent``."""
+        script = "return navigator.userAgent;"
+        return self.driver.execute_script(script)
+
+    def xhr_response(self, xhr_url_regex) -> Optional[InterceptResponse]:
+        """
+        Return the intercepted XHR exchange stored under ``xhr_url_regex``.
+
+        Args:
+            xhr_url_regex: key looked up in the page's ``window.__ajaxData``
+                (presumably one of the regexes registered for interception;
+                verify against intercept.js)
+
+        Returns:
+            An InterceptResponse carrying its InterceptRequest, or None when
+            the page holds no data for that key.
+        """
+        # Hand the key over as a script argument instead of splicing it into
+        # the JavaScript source: inside a JS string literal, backslashes and
+        # quotes in the regex would be re-interpreted and the lookup would miss.
+        data = self.driver.execute_script(
+            "return window.__ajaxData[arguments[0]];", xhr_url_regex
+        )
+        if not data:
+            return None
+
+        request = InterceptRequest(**data["request"])
+        response = InterceptResponse(request, **data["response"])
+        return response
+
+    def xhr_data(self, xhr_url_regex) -> Union[str, dict, None]:
+        """Return the content of the intercepted response, or None when there is none."""
+        intercepted = self.xhr_response(xhr_url_regex)
+        return intercepted.content if intercepted else None
+
+    def xhr_text(self, xhr_url_regex) -> Optional[str]:
+        """
+        Return the intercepted response content as text.
+
+        A dict content is serialised to JSON (non-ASCII characters kept
+        as-is); any other content is returned unchanged. Returns None when no
+        response was intercepted.
+        """
+        intercepted = self.xhr_response(xhr_url_regex)
+        if not intercepted:
+            return None
+
+        body = intercepted.content
+        if not isinstance(body, dict):
+            return body
+        return json.dumps(body, ensure_ascii=False)
+
+    def xhr_json(self, xhr_url_regex) -> Optional[dict]:
+        """
+        Return the intercepted response content parsed as JSON.
+
+        Returns:
+            The decoded JSON value, or None when no response was intercepted.
+
+        Raises:
+            json.JSONDecodeError: if the intercepted text is not valid JSON.
+        """
+        text = self.xhr_text(xhr_url_regex)
+        # Nothing intercepted: honour the Optional return type instead of
+        # handing None to json.loads, which raises TypeError.
+        if text is None:
+            return None
+        return json.loads(text)
+
+    def __getattr__(self, name):
+        """
+        Delegate attributes missing on this wrapper to the underlying driver.
+
+        Raises:
+            AttributeError: if no driver is available to delegate to.
+        """
+        # Python only calls __getattr__ after normal lookup failed, so being
+        # asked for "driver" here means it was never assigned. Looking it up
+        # again would re-enter this method without end.
+        if name == "driver":
+            raise AttributeError(
+                f"{type(self).__name__!r} object has no attribute {name!r}"
+            )
+
+        driver = getattr(self, "driver", None)
+        if driver:
+            return getattr(driver, name)
+        raise AttributeError(
+            f"{type(self).__name__!r} object has no attribute {name!r}"
+        )
+
+    # def __del__(self):
+    #     self.quit()

+ 81 - 0
FworkSpider/feapder/utils/webdriver/webdirver.py

@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2022/9/7 4:27 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+import abc
+
+from feapder import setting
+
+
+class InterceptRequest:
+    """Plain record of an intercepted request: its url, data and headers."""
+
+    def __init__(self, url, data, headers):
+        self.url, self.data, self.headers = url, data, headers
+
+
+class InterceptResponse:
+    """Plain record of an intercepted response together with its request."""
+
+    def __init__(self, request: InterceptRequest, url, headers, content, status_code):
+        # The request that produced this response
+        self.request = request
+        self.url, self.headers = url, headers
+        self.content, self.status_code = content, status_code
+
+
+class WebDriver:
+    """Base class that stores the options shared by the concrete drivers."""
+
+    def __init__(
+        self,
+        load_images=True,
+        user_agent=None,
+        proxy=None,
+        headless=False,
+        driver_type=None,
+        timeout=16,
+        window_size=(1024, 800),
+        executable_path=None,
+        custom_argument=None,
+        download_path=None,
+        auto_install_driver=True,
+        use_stealth_js=True,
+        **kwargs,
+    ):
+        """
+        Webdriver wrapper; supports chrome, phantomjs and firefox.
+
+        This constructor only records the options on the instance.
+
+        Args:
+            load_images: whether to load images
+            user_agent: a string, or a zero-argument function returning the
+                user agent; falls back to setting.DEFAULT_USERAGENT when falsy
+            proxy: xxx.xxx.xxx.xxx:xxxx, or a zero-argument function returning
+                the proxy address
+            headless: whether to run in headless mode
+            driver_type: CHROME, EDGE, PHANTOMJS or FIREFOX
+            timeout: request timeout
+            window_size: window size
+            executable_path: browser path; the default path is used when unset
+            custom_argument: custom arguments
+            download_path: directory downloaded files are saved to; when set,
+                the "keep"/"discard" prompt no longer appears (Chrome only)
+            auto_install_driver: download the browser driver automatically;
+                supported for chrome and firefox
+            use_stealth_js: use stealth.min.js to hide browser fingerprints
+            **kwargs: extra keyword arguments, stored as-is
+        """
+        # What to launch and how it identifies itself
+        self._driver_type = driver_type
+        self._executable_path = executable_path
+        self._auto_install_driver = auto_install_driver
+        self._user_agent = user_agent or setting.DEFAULT_USERAGENT
+        self._proxy = proxy
+
+        # Browser behaviour
+        self._headless = headless
+        self._load_images = load_images
+        self._window_size = window_size
+        self._timeout = timeout
+        self._download_path = download_path
+        self._use_stealth_js = use_stealth_js
+
+        # Pass-through options
+        self._custom_argument = custom_argument
+        self._kwargs = kwargs
+
+    # NOTE: this class does not use an ABC metaclass, so the decorator marks
+    # the method as abstract without preventing instantiation.
+    @abc.abstractmethod
+    def quit(self):
+        pass

+ 115 - 0
FworkSpider/feapder/utils/webdriver/webdriver_pool.py

@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021/3/18 4:59 下午
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import queue
+import threading
+
+from feapder.utils.log import log
+from feapder.utils.tools import Singleton
+from feapder.utils.webdriver.selenium_driver import SeleniumDriver
+
+
+@Singleton
+class WebDriverPool:
+    """
+    Pool of browser drivers handed out to worker threads.
+
+    In the default mode drivers are created lazily, up to ``pool_size``, and
+    exchanged through a queue. With ``thread_safe=True`` every thread keeps
+    its own driver in thread-local storage and the queue is not used.
+    """
+
+    def __init__(
+        self, pool_size=5, driver_cls=SeleniumDriver, thread_safe=False, **kwargs
+    ):
+        """
+        Args:
+            pool_size: size of the driver pool
+            driver_cls: driver class instantiated for every new driver
+            thread_safe: whether every thread gets its own driver.
+                If true, each thread owns one driver, pool_size has no effect
+                and the number of drivers equals the number of threads;
+                otherwise threads take drivers from the pool
+            **kwargs: keyword arguments passed to driver_cls
+        """
+        self.pool_size = pool_size
+        self.driver_cls = driver_cls
+        self.thread_safe = thread_safe
+        self.kwargs = kwargs
+
+        self.queue = queue.Queue(maxsize=pool_size)
+        self.lock = threading.RLock()
+        self.driver_count = 0
+        self.ctx = threading.local()
+
+    @property
+    def driver(self):
+        """Driver owned by the calling thread, or None if it has none."""
+        if not hasattr(self.ctx, "driver"):
+            self.ctx.driver = None
+        return self.ctx.driver
+
+    @driver.setter
+    def driver(self, driver):
+        self.ctx.driver = driver
+
+    @property
+    def is_full(self):
+        """Whether the number of live drivers has reached pool_size."""
+        return self.driver_count >= self.pool_size
+
+    def create_driver(self, user_agent: str = None, proxy: str = None):
+        """Instantiate driver_cls with the pool kwargs, overridden by the given values."""
+        kwargs = self.kwargs.copy()
+        if user_agent:
+            kwargs["user_agent"] = user_agent
+        if proxy:
+            kwargs["proxy"] = proxy
+        return self.driver_cls(**kwargs)
+
+    def get(self, user_agent: str = None, proxy: str = None):
+        """
+        Get a webdriver.
+
+        user_agent and proxy are only used when a new driver instance has to
+        be created.
+
+        Args:
+            user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36
+            proxy: xxx.xxx.xxx.xxx
+
+        Returns:
+            The calling thread's driver in thread-safe mode; otherwise a
+            driver taken from the queue (blocks until one is available).
+        """
+        if self.thread_safe:
+            if not self.driver:
+                self.driver = self.create_driver(user_agent, proxy)
+                # The counter is shared by all threads, so update it under the lock.
+                with self.lock:
+                    self.driver_count += 1
+            return self.driver
+
+        if not self.is_full:
+            with self.lock:
+                # Re-check under the lock: another thread may have filled the
+                # pool between the first check and acquiring the lock.
+                if not self.is_full:
+                    driver = self.create_driver(user_agent, proxy)
+                    self.queue.put(driver)
+                    self.driver_count += 1
+
+        return self.queue.get()
+
+    def put(self, driver):
+        """Return a driver to the queue; a no-op in thread-safe mode."""
+        if not self.thread_safe:
+            self.queue.put(driver)
+
+    def remove(self, driver):
+        """
+        Quit a driver and release its slot in the pool.
+
+        In thread-safe mode the calling thread's own driver is removed and the
+        ``driver`` argument is ignored; nothing happens if the thread has none.
+        An exception raised by ``quit()`` propagates to the caller.
+        """
+        if self.thread_safe:
+            driver = self.driver
+            if not driver:
+                return
+            # Forget the driver before quitting so a failed quit() does not
+            # leave it registered for reuse by this thread.
+            self.driver = None
+
+        try:
+            driver.quit()
+        finally:
+            # Release the slot even if quit() raised; otherwise the pool would
+            # keep counting the dead driver and get() could block forever.
+            with self.lock:
+                self.driver_count -= 1
+
+    def close(self):
+        """
+        Quit every driver currently waiting in the queue.
+
+        Per-thread drivers of thread-safe mode are not closed; this is only logged.
+        """
+        if self.thread_safe:
+            log.info("暂不支持关闭需线程安全的driver")
+
+        while True:
+            # get_nowait() cannot block if another thread empties the queue
+            # between an emptiness check and the get.
+            try:
+                driver = self.queue.get_nowait()
+            except queue.Empty:
+                break
+            try:
+                driver.quit()
+            finally:
+                with self.lock:
+                    self.driver_count -= 1