webdriver.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. import datetime
  2. from collections import namedtuple
  3. from pathlib import Path
  4. from selenium import webdriver
  5. from selenium.common.exceptions import WebDriverException
  6. from selenium.webdriver.common.by import By
  7. from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
  8. from selenium.webdriver.support import expected_conditions as EC
  9. from selenium.webdriver.support.ui import WebDriverWait
  10. from common.log import logger
  11. _absolute = Path(__file__).absolute().parent.parent
  12. _date = datetime.datetime.now().strftime('%Y-%m-%d')
  13. _service_log_path = (_absolute / f'logs/geckodriver-{_date}.log').resolve()
  14. DEFAULT_USERAGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0"
  15. Netloc = namedtuple('Netloc', ['host', 'port'])
  16. def check_navigator(driver):
  17. script = "return window.navigator.webdriver"
  18. return driver.execute_script(script)
  19. def netloc(proxies: dict) -> Netloc:
  20. host, port = proxies["https"].replace("socks5://", "").split(":")
  21. return Netloc(host, port)
  22. class FireFoxWebDriverError(WebDriverException):
  23. pass
  24. class WebDriver(RemoteWebDriver):
  25. FIREFOX = "FIREFOX"
  26. def __init__(
  27. self,
  28. load_images=True,
  29. user_agent=None,
  30. proxy=None,
  31. headless=True,
  32. driver_type=FIREFOX,
  33. timeout=120,
  34. window_size=(1024, 800),
  35. executable_path=None,
  36. custom_argument=None,
  37. **kwargs
  38. ):
  39. """
  40. Args:
  41. load_images: 是否加载图片
  42. user_agent: 字符串 或 无参函数,返回值为user_agent
  43. proxy: {'https://sockets:xxx.xxx.xxx.xxx:xxxx'} 或 无参函数,返回值为代理地址
  44. headless: 是否启用无头模式
  45. driver_type: FIREFOX
  46. timeout: 请求超时时间
  47. window_size: # 窗口大小
  48. executable_path: 浏览器路径,默认为默认路径
  49. **kwargs:
  50. """
  51. self._load_images = load_images
  52. self._user_agent = user_agent or DEFAULT_USERAGENT
  53. self._proxy = proxy
  54. self._headless = headless
  55. self._timeout = timeout
  56. self._window_size = window_size
  57. self._executable_path = executable_path
  58. self._custom_argument = custom_argument
  59. self.proxies = {}
  60. self.user_agent = None
  61. if driver_type == WebDriver.FIREFOX:
  62. self.driver = self.firefox_driver()
  63. self.driver.set_page_load_timeout(self._timeout)
  64. self.driver.set_script_timeout(self._timeout)
  65. def __enter__(self):
  66. return self
  67. def __exit__(self, exc_type, exc_val, exc_tb):
  68. if exc_val:
  69. logger.error(f'{self.__class__.__name__} <> {exc_type.__name__}: {exc_val}')
  70. self.quit()
  71. return False
  72. def firefox_driver(self):
  73. firefox_profile = webdriver.FirefoxProfile()
  74. firefox_options = webdriver.FirefoxOptions()
  75. firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
  76. firefox_profile.set_preference("dom.webdriver.enabled", False)
  77. firefox_profile.set_preference('useAutomationExtension', False)
  78. # firefox_profile.set_preference('privacy.resistFingerprinting', True) # 启用指纹保护
  79. if self._proxy:
  80. proxy = self._proxy() if callable(self._proxy) else self._proxy
  81. host, port = netloc(proxy)
  82. # 使用socks5 代理, 不使用代理:0, 使用代理:1
  83. firefox_profile.set_preference('network.proxy.type', 1)
  84. firefox_profile.set_preference('network.proxy.socks', host)
  85. firefox_profile.set_preference('network.proxy.socks_port', int(port))
  86. if self._user_agent:
  87. firefox_profile.set_preference(
  88. "general.useragent.override",
  89. self._user_agent() if callable(self._user_agent) else self._user_agent,
  90. )
  91. if not self._load_images:
  92. '''
  93. 允许加载所有图像,无论来源如何(默认)=1
  94. 阻止所有图像加载=2
  95. 防止加载第三方图像=3
  96. '''
  97. firefox_profile.set_preference("permissions.default.image", 2)
  98. firefox_profile.update_preferences()
  99. if self._headless:
  100. firefox_options.add_argument("--headless")
  101. firefox_options.add_argument("--disable-gpu")
  102. # 添加自定义的配置参数
  103. if self._custom_argument:
  104. for arg in self._custom_argument:
  105. firefox_options.add_argument(arg)
  106. if self._executable_path:
  107. driver = webdriver.Firefox(
  108. service_log_path=str(_service_log_path),
  109. capabilities=firefox_capabilities,
  110. options=firefox_options,
  111. firefox_profile=firefox_profile,
  112. executable_path=self._executable_path,
  113. )
  114. else:
  115. driver = webdriver.Firefox(
  116. service_log_path=str(_service_log_path),
  117. capabilities=firefox_capabilities,
  118. options=firefox_options,
  119. firefox_profile=firefox_profile,
  120. )
  121. if self._window_size:
  122. driver.set_window_size(*self._window_size)
  123. return driver
  124. @property
  125. def cookies(self):
  126. cookies_json = {}
  127. for cookie in self.driver.get_cookies():
  128. cookies_json[cookie["name"]] = cookie["value"]
  129. return cookies_json
  130. @cookies.setter
  131. def cookies(self, val: dict):
  132. """
  133. 设置cookie
  134. Args:
  135. val: {"key":"value", "key2":"value2"}
  136. Returns:
  137. """
  138. for key, value in val.items():
  139. self.driver.add_cookie({"name": key, "value": value})
  140. def __getattr__(self, name):
  141. if self.driver:
  142. return getattr(self.driver, name)
  143. else:
  144. raise AttributeError
  145. def get_user_agent(driver):
  146. return driver.execute_script("return navigator.userAgent;")
  147. def get_title(driver):
  148. return driver.execute_script('return document.title')
  149. def until_wait(
  150. driver,
  151. *,
  152. xpath=None,
  153. classname=None,
  154. text=None,
  155. timeout=None
  156. ):
  157. """
  158. 显示等待页面加载,否则抛出TimeoutException
  159. :param driver: 浏览器驱动
  160. :param xpath: xpath规则,页面等待特征
  161. :param classname: class属性名称,页面等待特征
  162. :param text: 期待的文本
  163. :param timeout: 超时时间
  164. :return:
  165. """
  166. _timeout = (timeout or 60)
  167. wait = WebDriverWait(driver, _timeout, 0.2)
  168. if xpath is not None:
  169. locator = (By.XPATH, xpath)
  170. if text is not None:
  171. wait.until(EC.text_to_be_present_in_element(locator, text))
  172. else:
  173. wait.until(EC.presence_of_element_located(locator))
  174. elif classname is not None:
  175. locator = (By.CLASS_NAME, classname)
  176. if text is not None:
  177. wait.until(EC.text_to_be_present_in_element(locator, text))
  178. else:
  179. wait.until(EC.presence_of_element_located(locator))
  180. def new_window(driver):
  181. """新的窗口"""
  182. driver.execute_script('window.open();')
  183. handles = driver.window_handles
  184. driver.switch_to.window(handles[-1])