webdriver.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424
# -*- coding: utf-8 -*-
"""
Created on 2023-03-01
---------
@summary: Remote selenium service
---------
@author: dzr
@email: dongzhaorui@topnet.net.cn
"""
  10. import os
  11. import queue
  12. import threading
  13. from selenium import webdriver
  14. from selenium.webdriver.chrome.remote_connection import ChromeRemoteConnection
  15. from selenium.webdriver.firefox.remote_connection import FirefoxRemoteConnection
  16. from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
  17. from feapder.setting import WEBDRIVER
  18. from feapder.utils.log import log
  19. from feapder.utils.tools import Singleton
  20. DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
  21. class WebDriver(RemoteWebDriver):
  22. """浏览器采集 - selenium"""
  23. CHROME = "CHROME"
  24. FIREFOX = "FIREFOX"
  25. def __init__(
  26. self,
  27. load_images=True,
  28. user_agent=None,
  29. proxy=None,
  30. driver_type=FIREFOX,
  31. timeout=10,
  32. window_size=(1024, 800),
  33. server_addr=None,
  34. custom_argument=None,
  35. version=None,
  36. usages_local_driver=True,
  37. headless=False,
  38. executable_path=None,
  39. service_log_path=None,
  40. **kwargs
  41. ):
  42. """
  43. webdirver 封装,支持 chrome 和 firefox
  44. Args:
  45. load_images: 是否加载图片
  46. user_agent: 字符串 或 无参函数,返回值为user_agent
  47. proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
  48. headless: 是否启用无头模式
  49. driver_type: CHROME 或 FIREFOX...
  50. timeout: 请求超时时间
  51. window_size: # 窗口大小
  52. executable_path: 浏览器路径,默认为默认路径
  53. server_addr: 远程服务地址
  54. usages_local_driver: 使用本地驱动
  55. service_log_path: selenium service 日志路径
  56. version: 浏览器版本
  57. **kwargs:
  58. """
  59. self._load_images = load_images
  60. self._user_agent = user_agent or DEFAULT_USERAGENT
  61. self._proxy = proxy
  62. self._headless = headless
  63. self._timeout = timeout
  64. self._window_size = window_size
  65. self._server_addr = server_addr or WEBDRIVER["server_addr"]
  66. self._custom_argument = custom_argument
  67. self._version = version or WEBDRIVER["version"]
  68. self._executable_path = executable_path
  69. self._usages_local_driver = usages_local_driver
  70. self._service_log_path = service_log_path
  71. if driver_type == WebDriver.CHROME:
  72. self.driver = self.chrome_driver()
  73. elif driver_type == WebDriver.FIREFOX:
  74. self.driver = self.firefox_driver()
  75. else:
  76. raise TypeError(
  77. "dirver_type must be one of CHROME or PHANTOMJS or FIREFOX, but received {}".format(
  78. type(driver_type)
  79. )
  80. )
  81. # driver.get(url)一直不返回,但也不报错的问题,这时程序会卡住,设置超时选项能解决这个问题。
  82. self.driver.set_page_load_timeout(self._timeout)
  83. # 设置10秒脚本超时时间
  84. self.driver.set_script_timeout(self._timeout)
  85. def __enter__(self):
  86. return self
  87. def __exit__(self, exc_type, exc_val, exc_tb):
  88. if exc_val:
  89. log.error(exc_val)
  90. self.get_driver().quit()
  91. return False
  92. def get_driver(self):
  93. return self.driver
  94. def local_firefox_driver(self):
  95. firefox_profile = webdriver.FirefoxProfile()
  96. firefox_options = webdriver.FirefoxOptions()
  97. firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
  98. firefox_profile.set_preference("dom.webdriver.enabled", False)
  99. if self._proxy:
  100. proxy = self._proxy() if callable(self._proxy) else self._proxy
  101. proxy = proxy.replace("socks5://", "")
  102. # 使用socks5 代理
  103. firefox_profile.set_preference('network.proxy.type', 1) # 不使用代理:0, 使用代理:1
  104. firefox_profile.set_preference('network.proxy.socks', proxy.split(":")[0])
  105. firefox_profile.set_preference('network.proxy.socks_port', int(proxy.split(":")[-1]))
  106. if self._user_agent:
  107. firefox_profile.set_preference(
  108. "general.useragent.override",
  109. self._user_agent() if callable(
  110. self._user_agent) else self._user_agent,
  111. )
  112. if not self._load_images:
  113. firefox_profile.set_preference("permissions.default.image", 2)
  114. if self._headless:
  115. firefox_options.add_argument("--headless")
  116. firefox_options.add_argument("--disable-gpu")
  117. # 添加自定义的配置参数
  118. if self._custom_argument:
  119. for arg in self._custom_argument:
  120. firefox_options.add_argument(arg)
  121. if self._executable_path:
  122. driver = webdriver.Firefox(
  123. capabilities=firefox_capabilities,
  124. options=firefox_options,
  125. firefox_profile=firefox_profile,
  126. executable_path=self._executable_path,
  127. service_log_path=self._service_log_path
  128. )
  129. else:
  130. driver = webdriver.Firefox(
  131. capabilities=firefox_capabilities,
  132. options=firefox_options,
  133. firefox_profile=firefox_profile,
  134. service_log_path=self._service_log_path
  135. )
  136. if self._window_size:
  137. driver.set_window_size(*self._window_size)
  138. return driver
  139. def remote_firefox_driver(self):
  140. firefox_capabilities = {
  141. "browserName": "firefox",
  142. "platform": "ANY",
  143. "version": self._version,
  144. "javascriptEnabled": True,
  145. "marionette": False,
  146. }
  147. firefox_options = webdriver.FirefoxOptions()
  148. firefox_options.add_argument("--disable-gpu")
  149. firefox_options.set_preference("dom.webdriver.enabled", False)
  150. if self._proxy:
  151. proxy = self._proxy() if callable(self._proxy) else self._proxy
  152. proxy = proxy.replace("socks5://", "")
  153. # 使用socks5 代理
  154. ip, port = proxy.split(":")
  155. firefox_options.set_preference('network.proxy.type', 1) # 不使用代理:0, 使用代理:1
  156. firefox_options.set_preference('network.proxy.socks', ip)
  157. firefox_options.set_preference('network.proxy.socks_port', int(port))
  158. # firefox_capabilities["marionette"] = True # http代理的使用
  159. if self._user_agent:
  160. firefox_options.set_preference(
  161. "general.useragent.override",
  162. self._user_agent() if callable(self._user_agent) else self._user_agent,
  163. )
  164. if not self._load_images:
  165. firefox_options.set_preference("permissions.default.image", 2)
  166. if self._custom_argument:
  167. for arg in self._custom_argument:
  168. firefox_options.add_argument(arg)
  169. executor = FirefoxRemoteConnection(remote_server_addr=self._server_addr)
  170. browser = webdriver.Remote(
  171. command_executor=executor,
  172. desired_capabilities=firefox_capabilities,
  173. options=firefox_options
  174. )
  175. if self._window_size:
  176. browser.set_window_size(*self._window_size)
  177. return browser
  178. def firefox_driver(self):
  179. if self._usages_local_driver:
  180. return self.local_firefox_driver()
  181. return self.remote_firefox_driver()
  182. def remote_chrome_driver(self):
  183. chrome_capabilities = {
  184. "browserName": "chrome",
  185. "platform": "ANY",
  186. "version": self._version,
  187. "javascriptEnabled": True,
  188. }
  189. chrome_options = webdriver.ChromeOptions()
  190. # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
  191. chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
  192. chrome_options.add_experimental_option("useAutomationExtension", False)
  193. # docker 里运行需要
  194. chrome_options.add_argument("--no-sandbox")
  195. chrome_options.add_argument("--disable-gpu")
  196. if self._user_agent:
  197. chrome_options.add_argument(
  198. "user-agent={}".format(
  199. self._user_agent()
  200. if callable(self._user_agent)
  201. else self._user_agent
  202. )
  203. )
  204. # 不支持socks5协议
  205. # if self._proxy:
  206. # chrome_options.add_argument(
  207. # "--proxy-server={}".format(
  208. # self._proxy() if callable(self._proxy) else self._proxy
  209. # )
  210. # )
  211. if not self._load_images:
  212. chrome_options.add_experimental_option(
  213. "prefs", {"profile.managed_default_content_settings.images": 2}
  214. )
  215. if self._window_size:
  216. chrome_options.add_argument(
  217. "--window-size={},{}".format(self._window_size[0], self._window_size[1])
  218. )
  219. # 添加自定义的配置参数
  220. if self._custom_argument:
  221. for arg in self._custom_argument:
  222. chrome_options.add_argument(arg)
  223. browser = webdriver.Remote(
  224. command_executor=ChromeRemoteConnection(
  225. remote_server_addr=self._server_addr,
  226. keep_alive=True),
  227. desired_capabilities=chrome_capabilities,
  228. options=chrome_options
  229. )
  230. # 隐藏浏览器特征
  231. with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
  232. js = f.read()
  233. params = {
  234. 'cmd': 'Page.addScriptToEvaluateOnNewDocument',
  235. 'params': {'source': js}
  236. }
  237. res = browser.execute("executeCdpCommand", params)['value']
  238. return browser
  239. def local_chrome_driver(self):
  240. chrome_options = webdriver.ChromeOptions()
  241. # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
  242. chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
  243. chrome_options.add_experimental_option("useAutomationExtension", False)
  244. # docker 里运行需要
  245. chrome_options.add_argument("--no-sandbox")
  246. if self._proxy:
  247. chrome_options.add_argument(
  248. "--proxy-server={}".format(
  249. self._proxy() if callable(self._proxy) else self._proxy
  250. )
  251. )
  252. if self._user_agent:
  253. chrome_options.add_argument(
  254. "user-agent={}".format(
  255. self._user_agent()
  256. if callable(self._user_agent)
  257. else self._user_agent
  258. )
  259. )
  260. if not self._load_images:
  261. chrome_options.add_experimental_option(
  262. "prefs", {"profile.managed_default_content_settings.images": 2}
  263. )
  264. if self._headless:
  265. chrome_options.add_argument("--headless")
  266. chrome_options.add_argument("--disable-gpu")
  267. if self._window_size:
  268. chrome_options.add_argument(
  269. "--window-size={},{}".format(self._window_size[0], self._window_size[1])
  270. )
  271. # 添加自定义的配置参数
  272. if self._custom_argument:
  273. for arg in self._custom_argument:
  274. chrome_options.add_argument(arg)
  275. if self._executable_path:
  276. driver = webdriver.Chrome(
  277. chrome_options=chrome_options,
  278. executable_path=self._executable_path,
  279. service_log_path=self._service_log_path
  280. )
  281. else:
  282. driver = webdriver.Chrome(
  283. chrome_options=chrome_options,
  284. service_log_path=self._service_log_path
  285. )
  286. # 隐藏浏览器特征
  287. with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
  288. js = f.read()
  289. driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
  290. return driver
  291. def chrome_driver(self):
  292. if self._usages_local_driver:
  293. return self.local_chrome_driver()
  294. return self.remote_chrome_driver()
  295. @property
  296. def cookies(self):
  297. cookies_json = {}
  298. for cookie in self.driver.get_cookies():
  299. cookies_json[cookie["name"]] = cookie["value"]
  300. return cookies_json
  301. @cookies.setter
  302. def cookies(self, val: dict):
  303. """
  304. 设置cookie
  305. Args:
  306. val: {"key":"value", "key2":"value2"}
  307. Returns:
  308. """
  309. for key, value in val.items():
  310. self.driver.add_cookie({"name": key, "value": value})
  311. def __getattr__(self, name):
  312. if self.driver:
  313. return getattr(self.driver, name)
  314. else:
  315. raise AttributeError
  316. # def __del__(self):
  317. # if self.driver:
  318. # self.driver.quit()
  319. @Singleton
  320. class WebDriverPool:
  321. def __init__(self, pool_size=5, **kwargs):
  322. self.queue = queue.Queue(maxsize=pool_size)
  323. self.kwargs = kwargs
  324. self.lock = threading.RLock()
  325. self.driver_count = 0
  326. @property
  327. def is_full(self):
  328. return self.driver_count >= self.queue.maxsize
  329. def get(self, user_agent: str = None, proxy: str = None) -> WebDriver:
  330. """
  331. 获取webdriver
  332. 当webdriver为新实例时会使用 user_agen, proxy, cookie参数来创建
  333. Args:
  334. user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36
  335. proxy: xxx.xxx.xxx.xxx
  336. Returns:
  337. """
  338. if not self.is_full:
  339. with self.lock:
  340. if not self.is_full:
  341. kwargs = self.kwargs.copy()
  342. if user_agent:
  343. kwargs["user_agent"] = user_agent
  344. if proxy:
  345. kwargs["proxy"] = proxy
  346. driver = WebDriver(**kwargs)
  347. self.queue.put(driver)
  348. self.driver_count += 1
  349. driver = self.queue.get()
  350. return driver
  351. def put(self, driver):
  352. self.queue.put(driver)
  353. def remove(self, driver):
  354. driver.quit()
  355. self.driver_count -= 1
  356. def close(self):
  357. while not self.queue.empty():
  358. driver = self.queue.get()
  359. driver.quit()
  360. self.driver_count -= 1