webdriver.py 11 KB


  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2021/3/18 4:59 下午
  4. ---------
  5. @summary:
  6. ---------
  7. @author: Boris
  8. @email: boris_liu@foxmail.com
  9. """
  10. import queue
  11. import threading
  12. import os
  13. from selenium import webdriver
  14. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  15. from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
  16. from feapder.utils.log import log
  17. from feapder.utils.tools import Singleton
# Fallback User-Agent string; WebDriver.__init__ uses it whenever the caller
# passes no (or a falsy) ``user_agent``.
DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
  19. class WebDriver(RemoteWebDriver):
  20. CHROME = "CHROME"
  21. PHANTOMJS = "PHANTOMJS"
  22. FIREFOX = "FIREFOX"
  23. def __init__(
  24. self,
  25. load_images=True,
  26. user_agent=None,
  27. proxy=None,
  28. headless=False,
  29. driver_type=CHROME,
  30. timeout=16,
  31. window_size=(1024, 800),
  32. executable_path=None,
  33. custom_argument=None,
  34. **kwargs
  35. ):
  36. """
  37. webdirver 封装,支持chrome、phantomjs 和 firefox
  38. Args:
  39. load_images: 是否加载图片
  40. user_agent: 字符串 或 无参函数,返回值为user_agent
  41. proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
  42. headless: 是否启用无头模式
  43. driver_type: CHROME 或 PHANTOMJS,FIREFOX
  44. timeout: 请求超时时间
  45. window_size: # 窗口大小
  46. executable_path: 浏览器路径,默认为默认路径
  47. **kwargs:
  48. """
  49. self._load_images = load_images
  50. self._user_agent = user_agent or DEFAULT_USERAGENT
  51. self._proxy = proxy
  52. self._headless = headless
  53. self._timeout = timeout
  54. self._window_size = window_size
  55. self._executable_path = executable_path
  56. self._custom_argument = custom_argument
  57. self.proxies = {}
  58. self.user_agent = None
  59. if driver_type == WebDriver.CHROME:
  60. self.driver = self.chrome_driver()
  61. elif driver_type == WebDriver.PHANTOMJS:
  62. self.driver = self.phantomjs_driver()
  63. elif driver_type == WebDriver.FIREFOX:
  64. self.driver = self.firefox_driver()
  65. else:
  66. raise TypeError(
  67. "dirver_type must be one of CHROME or PHANTOMJS or FIREFOX, but received {}".format(
  68. type(driver_type)
  69. )
  70. )
  71. # driver.get(url)一直不返回,但也不报错的问题,这时程序会卡住,设置超时选项能解决这个问题。
  72. self.driver.set_page_load_timeout(self._timeout)
  73. # 设置10秒脚本超时时间
  74. self.driver.set_script_timeout(self._timeout)
  75. def __enter__(self):
  76. return self
  77. def __exit__(self, exc_type, exc_val, exc_tb):
  78. if exc_val:
  79. log.error(exc_val)
  80. self.quit()
  81. return True
  82. def get_driver(self):
  83. return self.driver
  84. def firefox_driver(self):
  85. firefox_profile = webdriver.FirefoxProfile()
  86. firefox_options = webdriver.FirefoxOptions()
  87. firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
  88. firefox_profile.set_preference("dom.webdriver.enabled",False)
  89. if self._proxy:
  90. proxy = self._proxy() if callable(self._proxy) else self._proxy
  91. proxy = proxy.replace("socks5://","")
  92. # 使用socks5 代理
  93. firefox_profile.set_preference('network.proxy.type', 1) # 不使用代理:0, 使用代理:1
  94. firefox_profile.set_preference('network.proxy.socks', proxy.split(":")[0])
  95. firefox_profile.set_preference('network.proxy.socks_port', int(proxy.split(":")[-1]))
  96. # firefox_capabilities["marionette"] = True # http代理的使用
  97. # firefox_capabilities["proxy"] = {
  98. # "proxyType": "MANUAL",
  99. # "httpProxy": proxy,
  100. # "ftpProxy": proxy,
  101. # "sslProxy": proxy,
  102. # }
  103. if self._user_agent:
  104. firefox_profile.set_preference(
  105. "general.useragent.override",
  106. self._user_agent() if callable(self._user_agent) else self._user_agent,
  107. )
  108. if not self._load_images:
  109. firefox_profile.set_preference("permissions.default.image", 2)
  110. if self._headless:
  111. firefox_options.add_argument("--headless")
  112. firefox_options.add_argument("--disable-gpu")
  113. # 添加自定义的配置参数
  114. if self._custom_argument:
  115. for arg in self._custom_argument:
  116. firefox_options.add_argument(arg)
  117. if self._executable_path:
  118. driver = webdriver.Firefox(
  119. capabilities=firefox_capabilities,
  120. options=firefox_options,
  121. firefox_profile=firefox_profile,
  122. executable_path=self._executable_path,
  123. )
  124. else:
  125. driver = webdriver.Firefox(
  126. capabilities=firefox_capabilities,
  127. options=firefox_options,
  128. firefox_profile=firefox_profile,
  129. )
  130. if self._window_size:
  131. driver.set_window_size(*self._window_size)
  132. return driver
  133. def chrome_driver(self):
  134. chrome_options = webdriver.ChromeOptions()
  135. # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
  136. chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
  137. chrome_options.add_experimental_option("useAutomationExtension", False)
  138. # docker 里运行需要
  139. chrome_options.add_argument("--no-sandbox")
  140. if self._proxy:
  141. chrome_options.add_argument(
  142. "--proxy-server={}".format(
  143. self._proxy() if callable(self._proxy) else self._proxy
  144. )
  145. )
  146. if self._user_agent:
  147. chrome_options.add_argument(
  148. "user-agent={}".format(
  149. self._user_agent()
  150. if callable(self._user_agent)
  151. else self._user_agent
  152. )
  153. )
  154. if not self._load_images:
  155. chrome_options.add_experimental_option(
  156. "prefs", {"profile.managed_default_content_settings.images": 2}
  157. )
  158. if self._headless:
  159. chrome_options.add_argument("--headless")
  160. chrome_options.add_argument("--disable-gpu")
  161. if self._window_size:
  162. chrome_options.add_argument(
  163. "--window-size={},{}".format(self._window_size[0], self._window_size[1])
  164. )
  165. # 添加自定义的配置参数
  166. if self._custom_argument:
  167. for arg in self._custom_argument:
  168. chrome_options.add_argument(arg)
  169. if self._executable_path:
  170. driver = webdriver.Chrome(
  171. chrome_options=chrome_options, executable_path=self._executable_path
  172. )
  173. else:
  174. driver = webdriver.Chrome(chrome_options=chrome_options)
  175. # 隐藏浏览器特征
  176. with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
  177. js = f.read()
  178. driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
  179. return driver
  180. def phantomjs_driver(self):
  181. import warnings
  182. warnings.filterwarnings("ignore")
  183. service_args = []
  184. dcap = DesiredCapabilities.PHANTOMJS
  185. if self._proxy:
  186. service_args.append(
  187. "--proxy=%s" % self._proxy() if callable(self._proxy) else self._proxy
  188. )
  189. if self._user_agent:
  190. dcap["phantomjs.page.settings.userAgent"] = (
  191. self._user_agent() if callable(self._user_agent) else self._user_agent
  192. )
  193. if not self._load_images:
  194. service_args.append("--load-images=no")
  195. # 添加自定义的配置参数
  196. if self._custom_argument:
  197. for arg in self._custom_argument:
  198. service_args.append(arg)
  199. if self._executable_path:
  200. driver = webdriver.PhantomJS(
  201. service_args=service_args,
  202. desired_capabilities=dcap,
  203. executable_path=self._executable_path,
  204. )
  205. else:
  206. driver = webdriver.PhantomJS(
  207. service_args=service_args, desired_capabilities=dcap
  208. )
  209. if self._window_size:
  210. driver.set_window_size(self._window_size[0], self._window_size[1])
  211. del warnings
  212. return driver
  213. @property
  214. def cookies(self):
  215. cookies_json = {}
  216. for cookie in self.driver.get_cookies():
  217. cookies_json[cookie["name"]] = cookie["value"]
  218. return cookies_json
  219. @cookies.setter
  220. def cookies(self, val: dict):
  221. """
  222. 设置cookie
  223. Args:
  224. val: {"key":"value", "key2":"value2"}
  225. Returns:
  226. """
  227. for key, value in val.items():
  228. self.driver.add_cookie({"name": key, "value": value})
  229. def __getattr__(self, name):
  230. if self.driver:
  231. return getattr(self.driver, name)
  232. else:
  233. raise AttributeError
  234. # def __del__(self):
  235. # self.quit()
  236. @Singleton
  237. class WebDriverPool:
  238. def __init__(self, pool_size=5, **kwargs):
  239. self.queue = queue.Queue(maxsize=pool_size)
  240. self.kwargs = kwargs
  241. self.lock = threading.RLock()
  242. self.driver_count = 0
  243. @property
  244. def is_full(self):
  245. return self.driver_count >= self.queue.maxsize
  246. def get(self, user_agent: str = None, proxy: str = None) -> WebDriver:
  247. """
  248. 获取webdriver
  249. 当webdriver为新实例时会使用 user_agen, proxy, cookie参数来创建
  250. Args:
  251. user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36
  252. proxy: xxx.xxx.xxx.xxx
  253. Returns:
  254. """
  255. if not self.is_full:
  256. with self.lock:
  257. if not self.is_full:
  258. kwargs = self.kwargs.copy()
  259. if user_agent:
  260. kwargs["user_agent"] = user_agent
  261. if proxy:
  262. kwargs["proxy"] = proxy
  263. driver = WebDriver(**kwargs)
  264. self.queue.put(driver)
  265. self.driver_count += 1
  266. driver = self.queue.get()
  267. return driver
  268. def put(self, driver):
  269. self.queue.put(driver)
  270. def remove(self, driver):
  271. driver.quit()
  272. self.driver_count -= 1
  273. def close(self):
  274. while not self.queue.empty():
  275. driver = self.queue.get()
  276. driver.quit()
  277. self.driver_count -= 1