webdriver_old.py 15 KB


  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2023-03-01
  4. ---------
  5. @summary: 远程selenium服务
  6. ---------
  7. @author: dzr
  8. @email: dongzhaorui@topnet.net.cn
  9. """
  10. import os
  11. import queue
  12. import threading
  13. from selenium import webdriver
  14. from selenium.webdriver.chrome.remote_connection import ChromeRemoteConnection
  15. from selenium.webdriver.firefox.remote_connection import FirefoxRemoteConnection
  16. from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
  17. from feapder.setting import WEBDRIVER
  18. from feapder.utils.log import log
  19. from feapder.utils.tools import Singleton
# Fallback User-Agent string, used when the caller does not supply `user_agent`.
DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
  21. class WebDriver(RemoteWebDriver):
  22. """浏览器采集 - selenium"""
  23. CHROME = "CHROME"
  24. FIREFOX = "FIREFOX"
  25. def __init__(
  26. self,
  27. load_images=True,
  28. user_agent=None,
  29. proxy=None,
  30. driver_type=CHROME,
  31. timeout=20,
  32. headless=False,
  33. usages_local_driver=False,
  34. window_size=(1024, 800),
  35. server_addr=None,
  36. version=None,
  37. custom_argument=None,
  38. executable_path=None,
  39. service_log_path=None,
  40. **kwargs
  41. ):
  42. """
  43. webdirver 封装,支持 chrome 和 firefox
  44. Args:
  45. load_images: 是否加载图片
  46. user_agent: 字符串 或 无参函数,返回值为user_agent
  47. proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
  48. headless: 是否启用无头模式
  49. driver_type: CHROME 或 FIREFOX...
  50. timeout: 请求超时时间
  51. window_size: # 窗口大小
  52. executable_path: 浏览器路径,默认为默认路径
  53. server_addr: 远程服务地址
  54. usages_local_driver: 是否使用本地驱动
  55. service_log_path: selenium service 日志路径
  56. version: 浏览器版本
  57. **kwargs:
  58. """
  59. self._load_images = load_images
  60. self._user_agent = user_agent or DEFAULT_USERAGENT
  61. self._proxy = proxy
  62. self._headless = headless
  63. self._usages_local_driver = usages_local_driver
  64. self._timeout = timeout
  65. self._window_size = window_size
  66. self._executable_path = executable_path
  67. self._custom_argument = custom_argument
  68. self._server_addr = server_addr or WEBDRIVER["server_addr"]
  69. self._version = version or WEBDRIVER["version"]
  70. self._service_log_path = service_log_path or WEBDRIVER["service_log_path"]
  71. if driver_type == WebDriver.CHROME:
  72. self.driver = self.chrome_driver()
  73. elif driver_type == WebDriver.FIREFOX:
  74. self.driver = self.firefox_driver()
  75. else:
  76. raise TypeError(
  77. "dirver_type must be one of CHROME or FIREFOX, but received {}".format(
  78. type(driver_type)
  79. )
  80. )
  81. # driver.get(url)一直不返回,但也不报错的问题,这时程序会卡住,设置超时选项能解决这个问题。
  82. self.driver.set_page_load_timeout(self._timeout)
  83. # 设置10秒脚本超时时间
  84. self.driver.set_script_timeout(self._timeout)
  85. self._is_remote = not self._usages_local_driver
  86. def __enter__(self):
  87. return self
  88. def __exit__(self, exc_type, exc_val, exc_tb):
  89. if exc_val:
  90. log.error(exc_val)
  91. self.quit()
  92. return False
  93. def __getattr__(self, name):
  94. if self.driver:
  95. return getattr(self.driver, name)
  96. else:
  97. raise AttributeError
  98. def get_driver(self):
  99. return self.driver
  100. def local_firefox_driver(self):
  101. firefox_profile = webdriver.FirefoxProfile()
  102. firefox_options = webdriver.FirefoxOptions()
  103. firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
  104. firefox_profile.set_preference("dom.webdriver.enabled", False)
  105. if self._proxy:
  106. proxy = self._proxy() if callable(self._proxy) else self._proxy
  107. proxy = proxy.replace("socks5://", "")
  108. # 使用socks5 代理
  109. ip, port = proxy.split(":")
  110. firefox_profile.set_preference('network.proxy.type', 1) # 不使用代理:0, 使用代理:1
  111. firefox_profile.set_preference('network.proxy.socks', ip)
  112. firefox_profile.set_preference('network.proxy.socks_port', int(port))
  113. if self._user_agent:
  114. firefox_profile.set_preference(
  115. "general.useragent.override",
  116. self._user_agent() if callable(
  117. self._user_agent) else self._user_agent,
  118. )
  119. if not self._load_images:
  120. firefox_profile.set_preference("permissions.default.image", 2)
  121. if self._headless:
  122. firefox_options.add_argument("--headless")
  123. firefox_options.add_argument("--disable-gpu")
  124. # 添加自定义的配置参数
  125. if self._custom_argument:
  126. for arg in self._custom_argument:
  127. firefox_options.add_argument(arg)
  128. if self._executable_path:
  129. driver = webdriver.Firefox(
  130. capabilities=firefox_capabilities,
  131. options=firefox_options,
  132. firefox_profile=firefox_profile,
  133. executable_path=self._executable_path,
  134. service_log_path=self._service_log_path
  135. )
  136. else:
  137. driver = webdriver.Firefox(
  138. capabilities=firefox_capabilities,
  139. options=firefox_options,
  140. firefox_profile=firefox_profile,
  141. service_log_path=self._service_log_path
  142. )
  143. if self._window_size:
  144. driver.set_window_size(*self._window_size)
  145. return driver
  146. def remote_firefox_driver(self):
  147. firefox_options = webdriver.FirefoxOptions()
  148. desired_capabilities = firefox_options.to_capabilities()
  149. firefox_options.set_preference("dom.webdriver.enabled", False)
  150. if self._version:
  151. desired_capabilities['version'] = self._version
  152. if self._proxy:
  153. proxy = self._proxy() if callable(self._proxy) else self._proxy
  154. proxy = proxy.replace("socks5://", "")
  155. # 使用socks5 代理
  156. ip, port = proxy.split(":")
  157. firefox_options.set_preference('network.proxy.type', 1) # 不使用代理:0, 使用代理:1
  158. firefox_options.set_preference('network.proxy.socks', ip)
  159. firefox_options.set_preference('network.proxy.socks_port', int(port))
  160. if self._user_agent:
  161. firefox_options.set_preference(
  162. "general.useragent.override",
  163. self._user_agent() if callable(self._user_agent) else self._user_agent,
  164. )
  165. if not self._load_images:
  166. firefox_options.set_preference("permissions.default.image", 2)
  167. if self._headless:
  168. firefox_options.add_argument("--headless")
  169. firefox_options.add_argument("--disable-gpu")
  170. if self._custom_argument:
  171. for arg in self._custom_argument:
  172. firefox_options.add_argument(arg)
  173. executor = FirefoxRemoteConnection(remote_server_addr=self._server_addr)
  174. browser = webdriver.Remote(
  175. command_executor=executor,
  176. desired_capabilities=desired_capabilities,
  177. options=firefox_options
  178. )
  179. if self._window_size:
  180. browser.set_window_size(*self._window_size)
  181. return browser
  182. def firefox_driver(self):
  183. if self._usages_local_driver:
  184. return self.local_firefox_driver()
  185. return self.remote_firefox_driver()
  186. def remote_chrome_driver(self):
  187. chrome_options = webdriver.ChromeOptions()
  188. desired_capabilities = chrome_options.to_capabilities()
  189. # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
  190. chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
  191. chrome_options.add_experimental_option("useAutomationExtension", False)
  192. chrome_options.add_argument('--disable-blink-features=AutomationControlled')
  193. # docker 里运行需要
  194. chrome_options.add_argument('--no-sandbox')
  195. chrome_options.add_argument('--disable-extensions')
  196. chrome_options.add_argument('--disable-dev-shm-usage')
  197. if self._version:
  198. desired_capabilities['version'] = self._version
  199. if self._proxy:
  200. chrome_options.add_argument(
  201. "--proxy-server={}".format(
  202. self._proxy() if callable(self._proxy) else self._proxy
  203. )
  204. )
  205. if self._user_agent:
  206. chrome_options.add_argument(
  207. "user-agent={}".format(
  208. self._user_agent()
  209. if callable(self._user_agent)
  210. else self._user_agent
  211. )
  212. )
  213. if not self._load_images:
  214. chrome_options.add_experimental_option(
  215. "prefs", {"profile.managed_default_content_settings.images": 2}
  216. )
  217. if self._headless:
  218. chrome_options.add_argument("--headless")
  219. chrome_options.add_argument("--disable-gpu")
  220. if self._window_size:
  221. chrome_options.add_argument(
  222. "--window-size={},{}".format(self._window_size[0], self._window_size[1])
  223. )
  224. # 添加自定义的配置参数
  225. if self._custom_argument:
  226. for arg in self._custom_argument:
  227. chrome_options.add_argument(arg)
  228. browser = webdriver.Remote(
  229. command_executor=ChromeRemoteConnection(
  230. remote_server_addr=self._server_addr,
  231. keep_alive=True),
  232. desired_capabilities=desired_capabilities,
  233. options=chrome_options
  234. )
  235. # 隐藏浏览器特征
  236. with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
  237. js = f.read()
  238. params = {
  239. 'cmd': 'Page.addScriptToEvaluateOnNewDocument',
  240. 'params': {'source': js}
  241. }
  242. response = browser.execute("executeCdpCommand", params)['value']
  243. return browser
  244. def local_chrome_driver(self):
  245. chrome_options = webdriver.ChromeOptions()
  246. # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
  247. chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
  248. chrome_options.add_experimental_option("useAutomationExtension", False)
  249. chrome_options.add_argument('--disable-blink-features=AutomationControlled')
  250. # docker 里运行需要
  251. chrome_options.add_argument("--no-sandbox")
  252. chrome_options.add_argument('--disable-extensions')
  253. chrome_options.add_argument('--disable-dev-shm-usage')
  254. if self._proxy:
  255. chrome_options.add_argument(
  256. "--proxy-server={}".format(
  257. self._proxy() if callable(self._proxy) else self._proxy
  258. )
  259. )
  260. if self._user_agent:
  261. chrome_options.add_argument(
  262. "user-agent={}".format(
  263. self._user_agent()
  264. if callable(self._user_agent)
  265. else self._user_agent
  266. )
  267. )
  268. if not self._load_images:
  269. chrome_options.add_experimental_option(
  270. "prefs", {"profile.managed_default_content_settings.images": 2}
  271. )
  272. if self._headless:
  273. chrome_options.add_argument("--headless")
  274. chrome_options.add_argument("--disable-gpu")
  275. if self._window_size:
  276. chrome_options.add_argument(
  277. "--window-size={},{}".format(self._window_size[0], self._window_size[1])
  278. )
  279. # 添加自定义的配置参数
  280. if self._custom_argument:
  281. for arg in self._custom_argument:
  282. chrome_options.add_argument(arg)
  283. if self._executable_path:
  284. driver = webdriver.Chrome(
  285. chrome_options=chrome_options,
  286. executable_path=self._executable_path,
  287. service_log_path=self._service_log_path
  288. )
  289. else:
  290. driver = webdriver.Chrome(
  291. chrome_options=chrome_options,
  292. service_log_path=self._service_log_path
  293. )
  294. # 隐藏浏览器特征
  295. with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
  296. js = f.read()
  297. driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
  298. return driver
  299. def chrome_driver(self):
  300. if self._usages_local_driver:
  301. return self.local_chrome_driver()
  302. return self.remote_chrome_driver()
  303. @property
  304. def cookies(self):
  305. cookies_json = {}
  306. for cookie in self.driver.get_cookies():
  307. cookies_json[cookie["name"]] = cookie["value"]
  308. return cookies_json
  309. @cookies.setter
  310. def cookies(self, val: dict):
  311. """
  312. 设置cookie
  313. Args:
  314. val: {"key":"value", "key2":"value2"}
  315. Returns:
  316. """
  317. for key, value in val.items():
  318. self.driver.add_cookie({"name": key, "value": value})
  319. def quit(self):
  320. try:
  321. self.get_driver().quit()
  322. except Exception:
  323. # We don't care about the message because something probably has gone wrong
  324. pass
  325. # def __del__(self):
  326. # if self.driver:
  327. # self.driver.quit()
  328. @Singleton
  329. class WebDriverPool:
  330. def __init__(self, pool_size=5, **kwargs):
  331. self.queue = queue.Queue(maxsize=pool_size)
  332. self.kwargs = kwargs
  333. self.lock = threading.RLock()
  334. self.driver_count = 0
  335. @property
  336. def is_full(self):
  337. return self.driver_count >= self.queue.maxsize
  338. def get(self, user_agent: str = None, proxy: str = None) -> WebDriver:
  339. """
  340. 获取webdriver
  341. 当webdriver为新实例时会使用 user_agen, proxy, cookie参数来创建
  342. Args:
  343. user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36
  344. proxy: xxx.xxx.xxx.xxx
  345. Returns:
  346. """
  347. if not self.is_full:
  348. with self.lock:
  349. if not self.is_full:
  350. kwargs = self.kwargs.copy()
  351. if user_agent:
  352. kwargs["user_agent"] = user_agent
  353. if proxy:
  354. kwargs["proxy"] = proxy
  355. driver = WebDriver(**kwargs)
  356. self.queue.put(driver)
  357. self.driver_count += 1
  358. driver = self.queue.get()
  359. return driver
  360. def put(self, driver):
  361. self.queue.put(driver)
  362. def remove(self, driver):
  363. driver.quit()
  364. self.driver_count -= 1
  365. def close(self):
  366. while not self.queue.empty():
  367. driver = self.queue.get()
  368. driver.quit()
  369. self.driver_count -= 1