Преглед на файлове

update:更新webdriver 驱动

dongzhaorui преди 2 години
родител
ревизия
415ac503ef

+ 32 - 31
zgztb_cookie/FworkSpider/feapder/network/request.py

@@ -291,15 +291,14 @@ class Request(object):
 
         # 代理
         proxies = self.requests_kwargs.get("proxies", -1)
-        if not self.render:
-            if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
-                while True:
-                    proxies = self.get_proxy()
-                    if proxies:
-                        self.requests_kwargs.update(proxies=proxies)
-                        break
-                    else:
-                        log.debug("暂无可用代理 ...")
+        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
+            while True:
+                proxies = self.get_proxy()
+                if proxies:
+                    self.requests_kwargs.update(proxies=proxies)
+                    break
+                else:
+                    log.debug("暂无可用代理 ...")
 
         log.debug(
             """
@@ -344,10 +343,14 @@ class Request(object):
                 if cookie_str:
                     cookies = tools.get_cookies_from_str(cookie_str)
 
-            browser = self._webdriver_pool.get(user_agent=user_agent)
+            proxy = None
+            if proxies and proxies != -1:
+                proxy = proxies.get("http", "").strip("http://") or proxies.get(
+                    "https", ""
+                ).strip("https://")
+
+            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)
             try:
-                if proxies:
-                    self.chage_ip(browser)
                 browser.get(self.url)
                 if cookies:
                     browser.cookies = cookies
@@ -380,14 +383,27 @@ class Request(object):
             response = self._session.request(method, self.url, **self.requests_kwargs)
             response = Response(response)
         elif self.splash:
-            proxies = self.requests_kwargs.pop('proxies') or self.get_proxy()
-            resp = requests.get(setting.JIANYU_SPLASH_URL, params={
+            headers = self.requests_kwargs.get('headers')
+            if not headers:
+                headers = {'User-Agent': self.user_agent()}
+            headers = [(key, val) for key, val in headers.items()]
+
+            proxy = None
+            if proxies and proxies != -1:
+                proxy = proxies.get("http", "").strip("http://") or proxies.get(
+                    "https", ""
+                ).strip("https://")
+
+            params = {
                 'iframes': self.iframes,
                 'wait': self.render_time,
                 'html': 1,
-                'proxy': proxies.get("http"),
+                'proxy': proxy,
                 'url': self.url,
-            })
+            }
+            data = {'headers': headers}
+            splash_url = setting.JIANYU_SPLASH_URL
+            resp = requests.get(splash_url, params=params, json=data)
             response = Response(resp)
         else:
             response = requests.request(method, self.url, **self.requests_kwargs)
@@ -424,21 +440,6 @@ class Request(object):
         print(f"切换代理:{proxy.get('data')}")
         return proxy.get("data")
 
-    def chage_ip(self,browser):
-        ip = self.get_proxy().get("http")  # ip格式"127.0.0.1:80"
-        ip = ip.split("//")[-1]
-        browser.get("about:config")
-        browser.find_element_by_id("warningButton").click()
-        # js代码
-        setupScript = '''var prefs = Components.classes["@mozilla.org/preferences-service;1"].getService(Components.interfaces.nsIPrefBranch);
-        prefs.setIntPref("network.proxy.type", 1);
-        prefs.setCharPref("network.proxy.socks", "%s");
-        prefs.setIntPref("network.proxy.socks_port", "%s");
-        ''' % (
-        ip.split(':')[0], ip.split(':')[1])
-        # 执行js
-        browser.execute_script(setupScript)
-
     def user_agent(self):
         headers = self.requests_kwargs.get("headers")
         if headers:

+ 170 - 43
zgztb_cookie/FworkSpider/feapder/utils/webdriver.py

@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 """
-Created on 2021/3/18 4:59 下午
+Created on 2023-03-01
 ---------
-@summary:
+@summary: 远程selenium服务
 ---------
-@author: Boris
-@email: boris_liu@foxmail.com
+@author: dzr
+@email: dongzhaorui@topnet.net.cn
 """
 
 import os
@@ -17,17 +17,16 @@ from selenium.webdriver.chrome.remote_connection import ChromeRemoteConnection
 from selenium.webdriver.firefox.remote_connection import FirefoxRemoteConnection
 from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
 
+from feapder.setting import WEBDRIVER
 from feapder.utils.log import log
 from feapder.utils.tools import Singleton
-from feapder.setting import WEBDRIVER
 
 DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
 
 
 class WebDriver(RemoteWebDriver):
-    '''浏览器采集 - selenium'''
+    """浏览器采集 - selenium"""
     CHROME = "CHROME"
-    EDGE = "EDGE"
     FIREFOX = "FIREFOX"
 
     def __init__(
@@ -38,12 +37,17 @@ class WebDriver(RemoteWebDriver):
         driver_type=CHROME,
         timeout=10,
         window_size=(1024, 800),
-        command_executor=None,
+        server_addr=None,
         custom_argument=None,
+        version=None,
+        usages_local_driver=False,
+        headless=False,
+        executable_path=None,
+        service_log_path=None,
         **kwargs
     ):
         """
-        webdirver 封装,支持chrome、edge 和 firefox
+        webdriver 封装,支持 chrome 和 firefox
         Args:
             load_images: 是否加载图片
             user_agent: 字符串 或 无参函数,返回值为user_agent
@@ -52,23 +56,29 @@ class WebDriver(RemoteWebDriver):
             driver_type: CHROME 或 FIREFOX...
             timeout: 请求超时时间
             window_size: # 窗口大小
-            command_executor: 远程服务地址
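+            executable_path: value forwarded as executable_path to the local webdriver.Chrome / webdriver.Firefox constructor; not passed when empty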
+            server_addr: 远程服务地址
+            usages_local_driver: 使用本地驱动
+            service_log_path: selenium service 日志路径
+            version: 浏览器版本
             **kwargs:
         """
         self._load_images = load_images
         self._user_agent = user_agent or DEFAULT_USERAGENT
         self._proxy = proxy
+        self._headless = headless
         self._timeout = timeout
         self._window_size = window_size
-        self._command_executor = command_executor or WEBDRIVER['command_executor']
+        self._server_addr = server_addr or WEBDRIVER["server_addr"]
         self._custom_argument = custom_argument
+        self._version = version or WEBDRIVER["version"]
+        self._executable_path = executable_path
+        self._usages_local_driver = usages_local_driver
+        self._service_log_path = service_log_path or WEBDRIVER["service_log_path"]
 
         if driver_type == WebDriver.CHROME:
             self.driver = self.chrome_driver()
 
-        # elif driver_type == WebDriver.EDGE:
-        #     self.driver = self.edge_driver()
-
         elif driver_type == WebDriver.FIREFOX:
             self.driver = self.firefox_driver()
 
@@ -97,16 +107,71 @@ class WebDriver(RemoteWebDriver):
     def get_driver(self):
         return self.driver
 
-    def firefox_driver(self):
+    def local_firefox_driver(self):
+        firefox_profile = webdriver.FirefoxProfile()
+        firefox_options = webdriver.FirefoxOptions()
+        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
+        firefox_profile.set_preference("dom.webdriver.enabled", False)
+        if self._proxy:
+            proxy = self._proxy() if callable(self._proxy) else self._proxy
+            proxy = proxy.replace("socks5://", "")
+            # 使用socks5 代理
+            ip, port = proxy.split(":")
+            firefox_profile.set_preference('network.proxy.type', 1)  # 不使用代理:0, 使用代理:1
+            firefox_profile.set_preference('network.proxy.socks', ip)
+            firefox_profile.set_preference('network.proxy.socks_port', int(port))
+
+        if self._user_agent:
+            firefox_profile.set_preference(
+                "general.useragent.override",
+                self._user_agent() if callable(
+                    self._user_agent) else self._user_agent,
+            )
+
+        if not self._load_images:
+            firefox_profile.set_preference("permissions.default.image", 2)
+
+        if self._headless:
+            firefox_options.add_argument("--headless")
+            firefox_options.add_argument("--disable-gpu")
+
+        # 添加自定义的配置参数
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                firefox_options.add_argument(arg)
+
+        if self._executable_path:
+            driver = webdriver.Firefox(
+                capabilities=firefox_capabilities,
+                options=firefox_options,
+                firefox_profile=firefox_profile,
+                executable_path=self._executable_path,
+                service_log_path=self._service_log_path
+            )
+        else:
+            driver = webdriver.Firefox(
+                capabilities=firefox_capabilities,
+                options=firefox_options,
+                firefox_profile=firefox_profile,
+                service_log_path=self._service_log_path
+            )
+
+        if self._window_size:
+            driver.set_window_size(*self._window_size)
+
+        return driver
+
+    def remote_firefox_driver(self):
         firefox_capabilities = {
-            # "browserName": "firefox",
+            "browserName": "firefox",
             "platform": "ANY",
-            "version": "",
+            "version": self._version,
             "javascriptEnabled": True,
             "marionette": False,
         }
         firefox_options = webdriver.FirefoxOptions()
         firefox_options.add_argument("--disable-gpu")
+        firefox_options.set_preference("dom.webdriver.enabled", False)
         if self._proxy:
             proxy = self._proxy() if callable(self._proxy) else self._proxy
             proxy = proxy.replace("socks5://", "")
@@ -130,9 +195,7 @@ class WebDriver(RemoteWebDriver):
             for arg in self._custom_argument:
                 firefox_options.add_argument(arg)
 
-        executor = FirefoxRemoteConnection(
-            remote_server_addr=self._command_executor)
-
+        executor = FirefoxRemoteConnection(remote_server_addr=self._server_addr)
         browser = webdriver.Remote(
             command_executor=executor,
             desired_capabilities=firefox_capabilities,
@@ -144,11 +207,16 @@ class WebDriver(RemoteWebDriver):
 
         return browser
 
-    def chrome_driver(self):
+    def firefox_driver(self):
+        if self._usages_local_driver:
+            return self.local_firefox_driver()
+        return self.remote_firefox_driver()
+
+    def remote_chrome_driver(self):
         chrome_capabilities = {
-            # "browserName": "chrome",
+            "browserName": "chrome",
             "platform": "ANY",
-            "version": "",
+            "version": self._version,
             "javascriptEnabled": True,
         }
         chrome_options = webdriver.ChromeOptions()
@@ -159,6 +227,15 @@ class WebDriver(RemoteWebDriver):
         # docker 里运行需要
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--disable-gpu")
+        chrome_options.add_argument('--disable-extensions')
+        chrome_options.add_argument('--disable-dev-shm-usage')
+
+        if self._proxy:
+            chrome_options.add_argument(
+                "--proxy-server={}".format(
+                    self._proxy() if callable(self._proxy) else self._proxy
+                )
+            )
 
         if self._user_agent:
             chrome_options.add_argument(
@@ -168,13 +245,6 @@ class WebDriver(RemoteWebDriver):
                     else self._user_agent
                 )
             )
-        # 不支持socks5协议
-        # if self._proxy:
-        #     chrome_options.add_argument(
-        #         "--proxy-server={}".format(
-        #             self._proxy() if callable(self._proxy) else self._proxy
-        #         )
-        #     )
 
         if not self._load_images:
             chrome_options.add_experimental_option(
@@ -193,7 +263,7 @@ class WebDriver(RemoteWebDriver):
 
         browser = webdriver.Remote(
             command_executor=ChromeRemoteConnection(
-                remote_server_addr=self._command_executor,
+                remote_server_addr=self._server_addr,
                 keep_alive=True),
             desired_capabilities=chrome_capabilities,
             options=chrome_options
@@ -210,18 +280,75 @@ class WebDriver(RemoteWebDriver):
 
         return browser
 
-    def edge_driver(self):
-        edge_capabilities = {
-            "browserName": "MicrosoftEdge",
-            "platform": "ANY",  # WINDOWS
-            "version": "",
-            "javascriptEnabled": True,
-        }
-        browser = webdriver.Remote(
-            command_executor=self._command_executor,
-            desired_capabilities=edge_capabilities,
-        )
-        return browser
+    def local_chrome_driver(self):
+        chrome_options = webdriver.ChromeOptions()
+        # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
+        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        chrome_options.add_experimental_option("useAutomationExtension", False)
+        # docker 里运行需要
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-gpu")
+        chrome_options.add_argument('--disable-extensions')
+        chrome_options.add_argument('--disable-dev-shm-usage')
+
+        if self._proxy:
+            chrome_options.add_argument(
+                "--proxy-server={}".format(
+                    self._proxy() if callable(self._proxy) else self._proxy
+                )
+            )
+
+        if self._user_agent:
+            chrome_options.add_argument(
+                "user-agent={}".format(
+                    self._user_agent()
+                    if callable(self._user_agent)
+                    else self._user_agent
+                )
+            )
+
+        if not self._load_images:
+            chrome_options.add_experimental_option(
+                "prefs", {"profile.managed_default_content_settings.images": 2}
+            )
+
+        if self._headless:
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--disable-gpu")
+
+        if self._window_size:
+            chrome_options.add_argument(
+                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
+            )
+
+        # 添加自定义的配置参数
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                chrome_options.add_argument(arg)
+
+        if self._executable_path:
+            driver = webdriver.Chrome(
+                chrome_options=chrome_options,
+                executable_path=self._executable_path,
+                service_log_path=self._service_log_path
+            )
+        else:
+            driver = webdriver.Chrome(
+                chrome_options=chrome_options,
+                service_log_path=self._service_log_path
+            )
+
+        # 隐藏浏览器特征
+        with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
+            js = f.read()
+        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
+
+        return driver
+
+    def chrome_driver(self):
+        if self._usages_local_driver:
+            return self.local_chrome_driver()
+        return self.remote_chrome_driver()
 
     @property
     def cookies(self):

+ 10 - 5
zgztb_cookie/FworkSpider/setting.py

@@ -14,6 +14,7 @@ MONGO_USER_PASS = ""
 REDISDB_IP_PORTS = "172.17.4.232:7361"
 REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
 REDISDB_DB = 2
+
 # # 适用于redis哨兵模式
 REDISDB_SERVICE_NAME = "quchoong"
 
@@ -32,17 +33,21 @@ REQUEST_TIMEOUT = 10
 
 # 浏览器渲染
 WEBDRIVER = dict(
+    server_addr="http://172.17.4.232:6666/wd/hub",  # selenium 远程服务地址
+    version="",  # 浏览器版本。不指定版本时,随机分发,版本详见群公告
     pool_size=1,  # 浏览器的数量
-    command_executor="http://172.17.4.232:6666/wd/hub",  # selenium 远程服务地址
-    # command_executor="http://192.168.20.248:4444/wd/hub",  # selenium 远程服务地址
-    load_images=True,  # 是否加载图片
+    load_images=False,  # 是否加载图片
     user_agent=None,  # 字符串 或 无参函数,返回值为user_agent
+    headless=False,  # 是否为无头浏览器
     proxy=None,  # xxx.xxx.xx.xxx:xxxx 或 无参函数,返回值为代理地址
+    driver_type="CHROME",  # CHROME、FIREFOX
     timeout=30,  # 请求超时时间
-    driver_type="FIREFOX",  # CHROME、FIREFOX、EDGE
+    executable_path=None,  # 浏览器路径,默认为默认路径
+    usages_local_driver=False,  # whether to use a local driver; defaults to False, in which case the remote selenium service is used
     window_size=(1280, 800),  # 窗口大小
-    render_time=5,  # 渲染时长,即打开网页等待指定时间后再获取源码
+    render_time=0,  # 渲染时长,即打开网页等待指定时间后再获取源码
     custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数
+    service_log_path=os.devnull  # 日志路径,默认置空
 )
 
 # 设置代理

+ 4 - 4
zgztb_cookie/cookie_pool.py

@@ -16,11 +16,11 @@ class WebCookiePool(PageCookiePool):
         self.cookie_key = cookie_key
         self._kwargs = kwargs
         self._kwargs.setdefault("load_images", False)
-        self._kwargs.setdefault("headless", True)
-        self._kwargs.setdefault("driver_type", "FIREFOX")
+        self._kwargs.setdefault("headless", False)
+        self._kwargs.setdefault("driver_type", "CHROME")
 
-    def create_cookies(self, proxies):
-        self._kwargs.setdefault("proxy", proxies)
+    def create_cookies(self, proxy=None):
+        self._kwargs.setdefault("proxy", proxy)
         with WebDriver(**self._kwargs) as driver_pool:
             import time
             try:

+ 3 - 3
zgztb_cookie/detail_normol.py

@@ -109,8 +109,8 @@ class Details(feapder.AirSpider):
                     count=0)
         elif '滑动验证页面' in response.text:
             log.info('开始过滑块验证')
-            '''尝试代理池获取通过滑块验证的cookies会话信息,进行采集'''
-            cookies = self.cookie_pool.create_cookies(proxies=self.proxy.get("http"))
+            '''尝试通过滑块验证的cookies会话信息,进行采集'''
+            cookies = self.cookie_pool.create_cookies(proxy=self.proxy.get("http"))
             # print("cookies >>> ", cookies)
             count = request.count
             if count > 4:
@@ -130,7 +130,7 @@ class Details(feapder.AirSpider):
             except Exception as e:
                 log.warning(f"状态码:{response.status_code} {e.__class__.__name__}:{e.args[0]}")
                 self.proxy = swordfish_proxy()
-                cookies = self.cookie_pool.create_cookies(proxies=self.proxy.get("http"))
+                cookies = self.cookie_pool.create_cookies(proxy=self.proxy.get("http"))
                 request.session.cookies.update(cookies)
                 yield feapder.Request(url=request.url, item=request.item,
                                       method="POST", data=request.data,