webdriver.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2021/3/18 4:59 下午
  4. ---------
  5. @summary:
  6. ---------
  7. @author: Boris
  8. @email: boris_liu@foxmail.com
  9. """
  10. import queue
  11. import threading
  12. import os
  13. from selenium import webdriver
  14. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  15. from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
  16. from feapder.utils.log import log
  17. from feapder.utils.tools import Singleton
# Fallback User-Agent string, used by WebDriver when the caller passes no user_agent.
DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
  19. class WebDriver(RemoteWebDriver):
  20. '''浏览器采集 - selenium'''
  21. CHROME = "CHROME"
  22. PHANTOMJS = "PHANTOMJS"
  23. FIREFOX = "FIREFOX"
  24. def __init__(
  25. self,
  26. load_images=True,
  27. user_agent=None,
  28. proxy=None,
  29. headless=False,
  30. driver_type=CHROME,
  31. timeout=16,
  32. window_size=(1024, 800),
  33. executable_path=None,
  34. custom_argument=None,
  35. **kwargs
  36. ):
  37. """
  38. webdirver 封装,支持chrome、phantomjs 和 firefox
  39. Args:
  40. load_images: 是否加载图片
  41. user_agent: 字符串 或 无参函数,返回值为user_agent
  42. proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
  43. headless: 是否启用无头模式
  44. driver_type: CHROME 或 PHANTOMJS,FIREFOX
  45. timeout: 请求超时时间
  46. window_size: # 窗口大小
  47. executable_path: 浏览器路径,默认为默认路径
  48. **kwargs:
  49. """
  50. self._load_images = load_images
  51. self._user_agent = user_agent or DEFAULT_USERAGENT
  52. self._proxy = proxy
  53. self._headless = headless
  54. self._timeout = timeout
  55. self._window_size = window_size
  56. self._executable_path = executable_path
  57. self._custom_argument = custom_argument
  58. self.proxies = {}
  59. self.user_agent = None
  60. if driver_type == WebDriver.CHROME:
  61. self.driver = self.chrome_driver()
  62. elif driver_type == WebDriver.PHANTOMJS:
  63. self.driver = self.phantomjs_driver()
  64. elif driver_type == WebDriver.FIREFOX:
  65. self.driver = self.firefox_driver()
  66. else:
  67. raise TypeError(
  68. "dirver_type must be one of CHROME or PHANTOMJS or FIREFOX, but received {}".format(
  69. type(driver_type)
  70. )
  71. )
  72. # driver.get(url)一直不返回,但也不报错的问题,这时程序会卡住,设置超时选项能解决这个问题。
  73. self.driver.set_page_load_timeout(self._timeout)
  74. # 设置10秒脚本超时时间
  75. self.driver.set_script_timeout(self._timeout)
  76. def __enter__(self):
  77. return self
  78. def __exit__(self, exc_type, exc_val, exc_tb):
  79. if exc_val:
  80. log.error(exc_val)
  81. self.quit()
  82. return True
  83. def get_driver(self):
  84. return self.driver
  85. def firefox_driver(self):
  86. firefox_profile = webdriver.FirefoxProfile()
  87. firefox_options = webdriver.FirefoxOptions()
  88. firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
  89. firefox_profile.set_preference("dom.webdriver.enabled",False)
  90. if self._proxy:
  91. proxy = self._proxy() if callable(self._proxy) else self._proxy
  92. proxy = proxy.replace("socks5://","")
  93. # 使用socks5 代理
  94. firefox_profile.set_preference('network.proxy.type', 1) # 不使用代理:0, 使用代理:1
  95. firefox_profile.set_preference('network.proxy.socks', proxy.split(":")[0])
  96. firefox_profile.set_preference('network.proxy.socks_port', int(proxy.split(":")[-1]))
  97. # firefox_capabilities["marionette"] = True # http代理的使用
  98. if self._user_agent:
  99. firefox_profile.set_preference(
  100. "general.useragent.override",
  101. self._user_agent() if callable(self._user_agent) else self._user_agent,
  102. )
  103. if not self._load_images:
  104. firefox_profile.set_preference("permissions.default.image", 2)
  105. if self._headless:
  106. firefox_options.add_argument("--headless")
  107. firefox_options.add_argument("--disable-gpu")
  108. # 添加自定义的配置参数
  109. if self._custom_argument:
  110. for arg in self._custom_argument:
  111. firefox_options.add_argument(arg)
  112. if self._executable_path:
  113. driver = webdriver.Firefox(
  114. capabilities=firefox_capabilities,
  115. options=firefox_options,
  116. firefox_profile=firefox_profile,
  117. executable_path=self._executable_path,
  118. )
  119. else:
  120. driver = webdriver.Firefox(
  121. capabilities=firefox_capabilities,
  122. options=firefox_options,
  123. firefox_profile=firefox_profile,
  124. )
  125. if self._window_size:
  126. driver.set_window_size(*self._window_size)
  127. return driver
  128. def chrome_driver(self):
  129. chrome_options = webdriver.ChromeOptions()
  130. # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
  131. chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
  132. chrome_options.add_experimental_option("useAutomationExtension", False)
  133. # docker 里运行需要
  134. chrome_options.add_argument("--no-sandbox")
  135. if self._proxy:
  136. chrome_options.add_argument(
  137. "--proxy-server={}".format(
  138. self._proxy() if callable(self._proxy) else self._proxy
  139. )
  140. )
  141. if self._user_agent:
  142. chrome_options.add_argument(
  143. "user-agent={}".format(
  144. self._user_agent()
  145. if callable(self._user_agent)
  146. else self._user_agent
  147. )
  148. )
  149. if not self._load_images:
  150. chrome_options.add_experimental_option(
  151. "prefs", {"profile.managed_default_content_settings.images": 2}
  152. )
  153. if self._headless:
  154. chrome_options.add_argument("--headless")
  155. chrome_options.add_argument("--disable-gpu")
  156. if self._window_size:
  157. chrome_options.add_argument(
  158. "--window-size={},{}".format(self._window_size[0], self._window_size[1])
  159. )
  160. # 添加自定义的配置参数
  161. if self._custom_argument:
  162. for arg in self._custom_argument:
  163. chrome_options.add_argument(arg)
  164. if self._executable_path:
  165. driver = webdriver.Chrome(
  166. chrome_options=chrome_options, executable_path=self._executable_path
  167. )
  168. else:
  169. driver = webdriver.Chrome(chrome_options=chrome_options)
  170. # 隐藏浏览器特征
  171. with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
  172. js = f.read()
  173. driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
  174. return driver
  175. def phantomjs_driver(self):
  176. import warnings
  177. warnings.filterwarnings("ignore")
  178. service_args = []
  179. dcap = DesiredCapabilities.PHANTOMJS
  180. if self._proxy:
  181. service_args.append(
  182. "--proxy=%s" % self._proxy() if callable(self._proxy) else self._proxy
  183. )
  184. if self._user_agent:
  185. dcap["phantomjs.page.settings.userAgent"] = (
  186. self._user_agent() if callable(self._user_agent) else self._user_agent
  187. )
  188. if not self._load_images:
  189. service_args.append("--load-images=no")
  190. # 添加自定义的配置参数
  191. if self._custom_argument:
  192. for arg in self._custom_argument:
  193. service_args.append(arg)
  194. if self._executable_path:
  195. driver = webdriver.PhantomJS(
  196. service_args=service_args,
  197. desired_capabilities=dcap,
  198. executable_path=self._executable_path,
  199. )
  200. else:
  201. driver = webdriver.PhantomJS(
  202. service_args=service_args, desired_capabilities=dcap
  203. )
  204. if self._window_size:
  205. driver.set_window_size(self._window_size[0], self._window_size[1])
  206. del warnings
  207. return driver
  208. @property
  209. def cookies(self):
  210. cookies_json = {}
  211. for cookie in self.driver.get_cookies():
  212. cookies_json[cookie["name"]] = cookie["value"]
  213. return cookies_json
  214. @cookies.setter
  215. def cookies(self, val: dict):
  216. """
  217. 设置cookie
  218. Args:
  219. val: {"key":"value", "key2":"value2"}
  220. Returns:
  221. """
  222. for key, value in val.items():
  223. self.driver.add_cookie({"name": key, "value": value})
  224. def __getattr__(self, name):
  225. if self.driver:
  226. return getattr(self.driver, name)
  227. else:
  228. raise AttributeError
  229. def __del__(self):
  230. self.quit()
  231. @Singleton
  232. class WebDriverPool:
  233. def __init__(self, pool_size=5, **kwargs):
  234. self.queue = queue.Queue(maxsize=pool_size)
  235. self.kwargs = kwargs
  236. self.lock = threading.RLock()
  237. self.driver_count = 0
  238. @property
  239. def is_full(self):
  240. return self.driver_count >= self.queue.maxsize
  241. def get(self, user_agent: str = None, proxy: str = None) -> WebDriver:
  242. """
  243. 获取webdriver
  244. 当webdriver为新实例时会使用 user_agen, proxy, cookie参数来创建
  245. Args:
  246. user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36
  247. proxy: xxx.xxx.xxx.xxx
  248. Returns:
  249. """
  250. if not self.is_full:
  251. with self.lock:
  252. if not self.is_full:
  253. kwargs = self.kwargs.copy()
  254. if user_agent:
  255. kwargs["user_agent"] = user_agent
  256. if proxy:
  257. kwargs["proxy"] = proxy
  258. driver = WebDriver(**kwargs)
  259. self.queue.put(driver)
  260. self.driver_count += 1
  261. driver = self.queue.get()
  262. return driver
  263. def put(self, driver):
  264. self.queue.put(driver)
  265. def remove(self, driver):
  266. driver.quit()
  267. self.driver_count -= 1
  268. def close(self):
  269. while not self.queue.empty():
  270. driver = self.queue.get()
  271. driver.quit()
  272. self.driver_count -= 1