Explorar o código

fixbug:修正chrome无法切换代理;splash添加代理和请求头

dongzhaorui hai 2 anos
pai
achega
e14cf43432

+ 49 - 49
FworkSpider/feapder/network/request.py

@@ -294,15 +294,14 @@ class Request(object):
 
         # 代理
         proxies = self.requests_kwargs.get("proxies", -1)
-        if not self.render:
-            if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
-                while True:
-                    proxies = self.get_proxy()
-                    if proxies:
-                        self.requests_kwargs.update(proxies=proxies)
-                        break
-                    else:
-                        log.debug("暂无可用代理 ...")
+        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
+            while True:
+                proxies = self.get_proxy()
+                if proxies:
+                    self.requests_kwargs.update(proxies=proxies)
+                    break
+                else:
+                    log.debug("暂无可用代理 ...")
 
         log.debug(
             """
@@ -339,7 +338,6 @@ class Request(object):
             # 使用request的user_agent、cookies、proxy
             user_agent = headers.get("User-Agent") or headers.get("user-agent")
             cookies = self.requests_kwargs.get("cookies")
-            print(f'cookies >>>  {cookies}')
             if cookies and isinstance(cookies, RequestsCookieJar):
                 cookies = cookies.get_dict()
 
@@ -348,10 +346,15 @@ class Request(object):
                 if cookie_str:
                     cookies = tools.get_cookies_from_str(cookie_str)
 
-            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=False)
+            proxy = None
+            if proxies and proxies != -1:
+                proxy = proxies.get("http", "").strip("http://") or proxies.get(
+                    "https", ""
+                ).strip("https://")
+
+            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)
+
             try:
-                if proxies:
-                    self.chage_ip(browser)
                 browser.get(self.url)
                 if cookies:
                     browser.cookies = cookies
@@ -359,36 +362,49 @@ class Request(object):
                     tools.delay_time(self.render_time)
 
                 html = browser.page_source
-                response = Response.from_dict(
-                    {
-                        "url": browser.current_url,
-                        "cookies": browser.cookies,
-                        "_content": html.encode(),
-                        "status_code": 200,
-                        "elapsed": 666,
-                        "headers": {
-                            "User-Agent": browser.execute_script(
-                                "return navigator.userAgent"
-                            ),
-                            "Cookie": tools.cookies2str(browser.cookies),
-                        },
-                    }
-                )
+                response = Response.from_dict({
+                    "url": browser.current_url,
+                    "cookies": browser.cookies,
+                    "_content": html.encode(),
+                    "status_code": 200,
+                    "elapsed": 666,
+                    "headers": {
+                        "User-Agent": browser.execute_script(
+                            "return navigator.userAgent"
+                        ),
+                        "Cookie": tools.cookies2str(browser.cookies),
+                    },
+                })
                 response.browser = browser
             except Exception as e:
                 self._webdriver_pool.remove(browser)
                 raise e
+
         elif use_session:
             response = self._session.request(method, self.url, **self.requests_kwargs)
             response = Response(response)
         elif self.splash:
-            resp = requests.get(setting.SWORDFISH_RENDER_URL, params={
+            headers = self.requests_kwargs.get('headers')
+            if not headers:
+                headers = {'User-Agent': self.user_agent()}
+            headers = [(key, val) for key, val in headers.items()]
+
+            proxy = None
+            if proxies and proxies != -1:
+                proxy = proxies.get("http", "").strip("http://") or proxies.get(
+                    "https", ""
+                ).strip("https://")
+
+            params = {
                 'iframes': self.iframes,
                 'wait': self.render_time,
                 'html': 1,
-                'proxy': {} if self.proxies == False else self.get_proxy().get("http"),
-                'url': self.url
-            })
+                'proxy': proxy,
+                'url': self.url,
+            }
+            data = {'headers': headers}
+            splash_url = setting.SWORDFISH_RENDER_URL
+            resp = requests.get(splash_url, params=params, json=data)
             response = Response(resp)
 
             # if self.iframes:
@@ -429,6 +445,7 @@ class Request(object):
 
         if save_cached:
             self.save_cached(response, expire_time=self.__class__.cached_expire_time)
+
         return response
 
     def proxies(self):
@@ -452,25 +469,8 @@ class Request(object):
     def get_proxy(self):
         headers = {"Authorization": setting.SWORDFISH_PROXY_AUTHOR}
         proxy = requests.get(setting.SWORDFISH_PROXY_URL, headers=headers).json()
-        print(f"切换代理:{proxy.get('data')}")
         return proxy.get("data")
 
-    def chage_ip(self, browser):
-        ip = self.get_proxy().get("http")  # ip格式"127.0.0.1:80"
-        ip = ip.split("//")[-1]
-        browser.get("about:config")
-        tools.delay_time(0.5)
-        browser.find_element_by_id("warningButton").click()
-        # js代码
-        setupScript = '''
-            var prefs = Components.classes["@mozilla.org/preferences-service;1"].getService(Components.interfaces.nsIPrefBranch);
-            prefs.setIntPref("network.proxy.type", 1);
-            prefs.setCharPref("network.proxy.socks", "%s");
-            prefs.setIntPref("network.proxy.socks_port", "%s");
-        ''' % (ip.split(':')[0], ip.split(':')[1])
-        # 执行js
-        browser.execute_script(setupScript)
-
     def user_agent(self):
         headers = self.requests_kwargs.get("headers")
         if headers:

+ 17 - 9
FworkSpider/feapder/utils/webdriver.py

@@ -116,9 +116,10 @@ class WebDriver(RemoteWebDriver):
             proxy = self._proxy() if callable(self._proxy) else self._proxy
             proxy = proxy.replace("socks5://", "")
             # 使用socks5 代理
+            ip, port = proxy.split(":")
             firefox_profile.set_preference('network.proxy.type', 1)  # 不使用代理:0, 使用代理:1
-            firefox_profile.set_preference('network.proxy.socks', proxy.split(":")[0])
-            firefox_profile.set_preference('network.proxy.socks_port', int(proxy.split(":")[-1]))
+            firefox_profile.set_preference('network.proxy.socks', ip)
+            firefox_profile.set_preference('network.proxy.socks_port', int(port))
 
         if self._user_agent:
             firefox_profile.set_preference(
@@ -226,6 +227,15 @@ class WebDriver(RemoteWebDriver):
         # docker 里运行需要
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--disable-gpu")
+        chrome_options.add_argument('--disable-extensions')
+        chrome_options.add_argument('--disable-dev-shm-usage')
+
+        if self._proxy:
+            chrome_options.add_argument(
+                "--proxy-server={}".format(
+                    self._proxy() if callable(self._proxy) else self._proxy
+                )
+            )
 
         if self._user_agent:
             chrome_options.add_argument(
@@ -235,13 +245,6 @@ class WebDriver(RemoteWebDriver):
                     else self._user_agent
                 )
             )
-        # 不支持socks5协议
-        # if self._proxy:
-        #     chrome_options.add_argument(
-        #         "--proxy-server={}".format(
-        #             self._proxy() if callable(self._proxy) else self._proxy
-        #         )
-        #     )
 
         if not self._load_images:
             chrome_options.add_experimental_option(
@@ -284,6 +287,9 @@ class WebDriver(RemoteWebDriver):
         chrome_options.add_experimental_option("useAutomationExtension", False)
         # docker 里运行需要
         chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-gpu")
+        chrome_options.add_argument('--disable-extensions')
+        chrome_options.add_argument('--disable-dev-shm-usage')
 
         if self._proxy:
             chrome_options.add_argument(
@@ -291,6 +297,7 @@ class WebDriver(RemoteWebDriver):
                     self._proxy() if callable(self._proxy) else self._proxy
                 )
             )
+
         if self._user_agent:
             chrome_options.add_argument(
                 "user-agent={}".format(
@@ -299,6 +306,7 @@ class WebDriver(RemoteWebDriver):
                     else self._user_agent
                 )
             )
+
         if not self._load_images:
             chrome_options.add_experimental_option(
                 "prefs", {"profile.managed_default_content_settings.images": 2}

+ 2 - 2
FworkSpider/setting.py

@@ -40,10 +40,10 @@ WEBDRIVER = dict(
     user_agent=None,  # 字符串 或 无参函数,返回值为user_agent
     headless=True,  # 是否为无头浏览器
     proxy=None,  # xxx.xxx.xx.xxx:xxxx 或 无参函数,返回值为代理地址
-    driver_type="FIREFOX",  # CHROME、FIREFOX、EDGE
+    driver_type="CHROME",  # CHROME、FIREFOX
     timeout=30,  # 请求超时时间
     executable_path=None,  # 浏览器路径,默认为默认路径
-    usages_local_driver=True,  # 是否使用本地驱动,默认启动本地驱动
+    usages_local_driver=False,  # 是否使用本地驱动,默认启动本地驱动
     window_size=(1280, 800),  # 窗口大小
     render_time=0,  # 渲染时长,即打开网页等待指定时间后再获取源码
     custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数

+ 3 - 2
FworkSpider/untils/WebCookiePool.py

@@ -16,9 +16,10 @@ class WebCookiePool(PageCookiePool):
         self._kwargs = kwargs
         self._kwargs.setdefault("load_images", False)
         self._kwargs.setdefault("headless", True)
-        self._kwargs.setdefault("driver_type", "FIREFOX")
+        self._kwargs.setdefault("driver_type", "CHROME")
 
-    def create_cookie(self):
+    def create_cookie(self, proxy=None):
+        self._kwargs.setdefault("proxy", proxy)
         with WebDriver(**self._kwargs) as browser:
             try:
                 browser.get(self.page_url)