|
@@ -1,11 +1,11 @@
|
|
|
# -*- coding: utf-8 -*-
|
|
|
"""
|
|
|
-Created on 2021/3/18 4:59 下午
|
|
|
+Created on 2023-03-01
|
|
|
---------
|
|
|
-@summary:
|
|
|
+@summary: 远程selenium服务
|
|
|
---------
|
|
|
-@author: Boris
|
|
|
-@email: boris_liu@foxmail.com
|
|
|
+@author: dzr
|
|
|
+@email: dongzhaorui@topnet.net.cn
|
|
|
"""
|
|
|
|
|
|
import os
|
|
@@ -17,17 +17,16 @@ from selenium.webdriver.chrome.remote_connection import ChromeRemoteConnection
|
|
|
from selenium.webdriver.firefox.remote_connection import FirefoxRemoteConnection
|
|
|
from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
|
|
|
|
|
|
+from feapder.setting import WEBDRIVER
|
|
|
from feapder.utils.log import log
|
|
|
from feapder.utils.tools import Singleton
|
|
|
-from feapder.setting import WEBDRIVER
|
|
|
|
|
|
DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
|
|
|
|
|
|
|
|
|
class WebDriver(RemoteWebDriver):
|
|
|
- '''浏览器采集 - selenium'''
|
|
|
+ """浏览器采集 - selenium"""
|
|
|
CHROME = "CHROME"
|
|
|
- EDGE = "EDGE"
|
|
|
FIREFOX = "FIREFOX"
|
|
|
|
|
|
def __init__(
|
|
@@ -38,12 +37,17 @@ class WebDriver(RemoteWebDriver):
|
|
|
driver_type=CHROME,
|
|
|
timeout=10,
|
|
|
window_size=(1024, 800),
|
|
|
- command_executor=None,
|
|
|
+ server_addr=None,
|
|
|
custom_argument=None,
|
|
|
+ version=None,
|
|
|
+ usages_local_driver=False,
|
|
|
+ headless=False,
|
|
|
+ executable_path=None,
|
|
|
+ service_log_path=None,
|
|
|
**kwargs
|
|
|
):
|
|
|
"""
|
|
|
- webdirver 封装,支持chrome、edge 和 firefox
|
|
|
+ webdriver 封装,支持 chrome 和 firefox
|
|
|
Args:
|
|
|
load_images: 是否加载图片
|
|
|
user_agent: 字符串 或 无参函数,返回值为user_agent
|
|
@@ -52,23 +56,29 @@ class WebDriver(RemoteWebDriver):
|
|
|
driver_type: CHROME 或 FIREFOX...
|
|
|
timeout: 请求超时时间
|
|
|
window_size: # 窗口大小
|
|
|
- command_executor: 远程服务地址
|
|
|
+ executable_path: 浏览器路径,默认为默认路径
|
|
|
+ server_addr: 远程服务地址
|
|
|
+ usages_local_driver: 使用本地驱动
|
|
|
+ service_log_path: selenium service 日志路径
|
|
|
+ version: 浏览器版本
|
|
|
**kwargs:
|
|
|
"""
|
|
|
self._load_images = load_images
|
|
|
self._user_agent = user_agent or DEFAULT_USERAGENT
|
|
|
self._proxy = proxy
|
|
|
+ self._headless = headless
|
|
|
self._timeout = timeout
|
|
|
self._window_size = window_size
|
|
|
- self._command_executor = command_executor or WEBDRIVER['command_executor']
|
|
|
+ self._server_addr = server_addr or WEBDRIVER["server_addr"]
|
|
|
self._custom_argument = custom_argument
|
|
|
+ self._version = version or WEBDRIVER["version"]
|
|
|
+ self._executable_path = executable_path
|
|
|
+ self._usages_local_driver = usages_local_driver
|
|
|
+ self._service_log_path = service_log_path or WEBDRIVER["service_log_path"]
|
|
|
|
|
|
if driver_type == WebDriver.CHROME:
|
|
|
self.driver = self.chrome_driver()
|
|
|
|
|
|
- # elif driver_type == WebDriver.EDGE:
|
|
|
- # self.driver = self.edge_driver()
|
|
|
-
|
|
|
elif driver_type == WebDriver.FIREFOX:
|
|
|
self.driver = self.firefox_driver()
|
|
|
|
|
@@ -97,16 +107,71 @@ class WebDriver(RemoteWebDriver):
|
|
|
def get_driver(self):
    """Return the browser driver object stored in ``self.driver``."""
    return self.driver
|
|
|
|
|
|
- def firefox_driver(self):
|
|
|
def local_firefox_driver(self):
    """
    Start a Firefox browser through a locally installed driver.

    Configuration is read from the instance: ``_proxy``, ``_user_agent``,
    ``_load_images``, ``_headless``, ``_custom_argument``,
    ``_executable_path``, ``_service_log_path`` and ``_window_size``.
    ``_proxy`` and ``_user_agent`` may each be a plain string or a
    zero-argument callable returning one.

    Returns:
        The ``webdriver.Firefox`` instance that was created.

    Raises:
        ValueError: if the proxy, once a ``socks5://`` prefix is removed,
            is not exactly ``host:port`` with an integer port.
    """
    profile = webdriver.FirefoxProfile()
    options = webdriver.FirefoxOptions()
    # DesiredCapabilities.FIREFOX is a class-level dict shared by the whole
    # process; work on a copy so nothing done with it here can leak into
    # other driver instances.
    capabilities = webdriver.DesiredCapabilities.FIREFOX.copy()

    # NOTE(review): presumably meant to hide the automation flag from pages.
    profile.set_preference("dom.webdriver.enabled", False)

    if self._proxy:
        proxy = self._proxy() if callable(self._proxy) else self._proxy
        # The proxy is configured as a SOCKS proxy; only "host:port" remains
        # after the scheme prefix is stripped.
        proxy = proxy.replace("socks5://", "")
        host, port = proxy.split(":")
        profile.set_preference("network.proxy.type", 1)  # 0 = no proxy, 1 = use proxy
        profile.set_preference("network.proxy.socks", host)
        profile.set_preference("network.proxy.socks_port", int(port))

    if self._user_agent:
        user_agent = (
            self._user_agent() if callable(self._user_agent) else self._user_agent
        )
        profile.set_preference("general.useragent.override", user_agent)

    if not self._load_images:
        profile.set_preference("permissions.default.image", 2)

    if self._headless:
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")

    # Append caller-supplied command-line arguments.
    for arg in self._custom_argument or ():
        options.add_argument(arg)

    # Build the constructor arguments once; the driver path is only passed
    # when one was configured, otherwise selenium's own default applies.
    launch_kwargs = {
        "capabilities": capabilities,
        "options": options,
        "firefox_profile": profile,
        "service_log_path": self._service_log_path,
    }
    if self._executable_path:
        launch_kwargs["executable_path"] = self._executable_path
    driver = webdriver.Firefox(**launch_kwargs)

    if self._window_size:
        driver.set_window_size(*self._window_size)

    return driver
|
|
|
+
|
|
|
+ def remote_firefox_driver(self):
|
|
|
firefox_capabilities = {
|
|
|
- # "browserName": "firefox",
|
|
|
+ "browserName": "firefox",
|
|
|
"platform": "ANY",
|
|
|
- "version": "",
|
|
|
+ "version": self._version,
|
|
|
"javascriptEnabled": True,
|
|
|
"marionette": False,
|
|
|
}
|
|
|
firefox_options = webdriver.FirefoxOptions()
|
|
|
firefox_options.add_argument("--disable-gpu")
|
|
|
+ firefox_options.set_preference("dom.webdriver.enabled", False)
|
|
|
if self._proxy:
|
|
|
proxy = self._proxy() if callable(self._proxy) else self._proxy
|
|
|
proxy = proxy.replace("socks5://", "")
|
|
@@ -130,9 +195,7 @@ class WebDriver(RemoteWebDriver):
|
|
|
for arg in self._custom_argument:
|
|
|
firefox_options.add_argument(arg)
|
|
|
|
|
|
- executor = FirefoxRemoteConnection(
|
|
|
- remote_server_addr=self._command_executor)
|
|
|
-
|
|
|
+ executor = FirefoxRemoteConnection(remote_server_addr=self._server_addr)
|
|
|
browser = webdriver.Remote(
|
|
|
command_executor=executor,
|
|
|
desired_capabilities=firefox_capabilities,
|
|
@@ -144,11 +207,16 @@ class WebDriver(RemoteWebDriver):
|
|
|
|
|
|
return browser
|
|
|
|
|
|
- def chrome_driver(self):
|
|
|
def firefox_driver(self):
    """
    Create the Firefox driver.

    Delegates to ``local_firefox_driver`` when ``self._usages_local_driver``
    is truthy, and to ``remote_firefox_driver`` otherwise.
    """
    build = (
        self.local_firefox_driver
        if self._usages_local_driver
        else self.remote_firefox_driver
    )
    return build()
|
|
|
+
|
|
|
+ def remote_chrome_driver(self):
|
|
|
chrome_capabilities = {
|
|
|
- # "browserName": "chrome",
|
|
|
+ "browserName": "chrome",
|
|
|
"platform": "ANY",
|
|
|
- "version": "",
|
|
|
+ "version": self._version,
|
|
|
"javascriptEnabled": True,
|
|
|
}
|
|
|
chrome_options = webdriver.ChromeOptions()
|
|
@@ -159,6 +227,15 @@ class WebDriver(RemoteWebDriver):
|
|
|
# docker 里运行需要
|
|
|
chrome_options.add_argument("--no-sandbox")
|
|
|
chrome_options.add_argument("--disable-gpu")
|
|
|
+ chrome_options.add_argument('--disable-extensions')
|
|
|
+ chrome_options.add_argument('--disable-dev-shm-usage')
|
|
|
+
|
|
|
+ if self._proxy:
|
|
|
+ chrome_options.add_argument(
|
|
|
+ "--proxy-server={}".format(
|
|
|
+ self._proxy() if callable(self._proxy) else self._proxy
|
|
|
+ )
|
|
|
+ )
|
|
|
|
|
|
if self._user_agent:
|
|
|
chrome_options.add_argument(
|
|
@@ -168,13 +245,6 @@ class WebDriver(RemoteWebDriver):
|
|
|
else self._user_agent
|
|
|
)
|
|
|
)
|
|
|
- # 不支持socks5协议
|
|
|
- # if self._proxy:
|
|
|
- # chrome_options.add_argument(
|
|
|
- # "--proxy-server={}".format(
|
|
|
- # self._proxy() if callable(self._proxy) else self._proxy
|
|
|
- # )
|
|
|
- # )
|
|
|
|
|
|
if not self._load_images:
|
|
|
chrome_options.add_experimental_option(
|
|
@@ -193,7 +263,7 @@ class WebDriver(RemoteWebDriver):
|
|
|
|
|
|
browser = webdriver.Remote(
|
|
|
command_executor=ChromeRemoteConnection(
|
|
|
- remote_server_addr=self._command_executor,
|
|
|
+ remote_server_addr=self._server_addr,
|
|
|
keep_alive=True),
|
|
|
desired_capabilities=chrome_capabilities,
|
|
|
options=chrome_options
|
|
@@ -210,18 +280,75 @@ class WebDriver(RemoteWebDriver):
|
|
|
|
|
|
return browser
|
|
|
|
|
|
- def edge_driver(self):
|
|
|
- edge_capabilities = {
|
|
|
- "browserName": "MicrosoftEdge",
|
|
|
- "platform": "ANY", # WINDOWS
|
|
|
- "version": "",
|
|
|
- "javascriptEnabled": True,
|
|
|
- }
|
|
|
- browser = webdriver.Remote(
|
|
|
- command_executor=self._command_executor,
|
|
|
- desired_capabilities=edge_capabilities,
|
|
|
- )
|
|
|
- return browser
|
|
|
def local_chrome_driver(self):
    """
    Start a Chrome browser through a locally installed driver.

    Configuration is read from the instance: ``_proxy``, ``_user_agent``,
    ``_load_images``, ``_headless``, ``_window_size``, ``_custom_argument``,
    ``_executable_path`` and ``_service_log_path``. ``_proxy`` and
    ``_user_agent`` may each be a plain string or a zero-argument callable
    returning one.

    After launch, the bundled ``js/stealth.min.js`` (next to this module) is
    registered through the CDP command
    ``Page.addScriptToEvaluateOnNewDocument``.

    Returns:
        The ``webdriver.Chrome`` instance that was created.

    Raises:
        OSError: if ``js/stealth.min.js`` cannot be read. The file is read
            before the browser starts, so no browser process is left behind.
    """
    # Load the stealth script first: failing here must not leave an already
    # started browser process orphaned. The encoding is explicit so the read
    # does not depend on the platform default.
    # NOTE(review): assumes the bundled script is UTF-8 - confirm.
    script_path = os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")
    with open(script_path, encoding="utf-8") as f:
        stealth_js = f.read()

    options = webdriver.ChromeOptions()
    # Important: drop the automation switch/extension so sites are less
    # likely to recognise that Selenium is driving the browser.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    # Needed when running inside docker.
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-dev-shm-usage')

    if self._proxy:
        proxy = self._proxy() if callable(self._proxy) else self._proxy
        options.add_argument("--proxy-server={}".format(proxy))

    if self._user_agent:
        user_agent = (
            self._user_agent() if callable(self._user_agent) else self._user_agent
        )
        options.add_argument("user-agent={}".format(user_agent))

    if not self._load_images:
        options.add_experimental_option(
            "prefs", {"profile.managed_default_content_settings.images": 2}
        )

    if self._headless:
        # "--disable-gpu" is already added unconditionally above.
        options.add_argument("--headless")

    if self._window_size:
        options.add_argument(
            "--window-size={},{}".format(self._window_size[0], self._window_size[1])
        )

    # Append caller-supplied command-line arguments.
    for arg in self._custom_argument or ():
        options.add_argument(arg)

    # ``options=`` replaces the deprecated ``chrome_options=`` keyword and
    # matches how the other driver builders in this file pass options.
    launch_kwargs = {
        "options": options,
        "service_log_path": self._service_log_path,
    }
    if self._executable_path:
        launch_kwargs["executable_path"] = self._executable_path
    driver = webdriver.Chrome(**launch_kwargs)

    # Hide browser automation fingerprints on every new document.
    try:
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument", {"source": stealth_js}
        )
    except Exception:
        # Do not leak the browser process if the injection fails.
        driver.quit()
        raise

    return driver
|
|
|
+
|
|
|
def chrome_driver(self):
    """
    Create the Chrome driver.

    Delegates to ``local_chrome_driver`` when ``self._usages_local_driver``
    is truthy, and to ``remote_chrome_driver`` otherwise.
    """
    build = (
        self.local_chrome_driver
        if self._usages_local_driver
        else self.remote_chrome_driver
    )
    return build()
|
|
|
|
|
|
@property
|
|
|
def cookies(self):
|