3
0
Эх сурвалжийг харах

修复服务器运行爬虫关闭无头浏览器错误

dongzhaorui 3 долоо хоног өмнө
parent
commit
6e9747873b

+ 19 - 11
FworkSpider/untils/WebCookiePool.py

@@ -15,7 +15,7 @@ from untils.cookie_pool import PageCookiePool
 
 DRISSIONPAGE = dict(
     singleton_tab=True,  # 一个标签页是否支持多例操作,True=单例;False=多例
-    headless=False,  # 是否为无头浏览器
+    headless=True,  # 是否为无头浏览器
     load_images=False,  # 是否加载图片
     user_agent=None,  # 字符串
     proxy=None,  # xxx.xxx.xxx.xxx:xxxx
@@ -38,29 +38,41 @@ DRISSIONPAGE = dict(
 class BrowserCookiePool(PageCookiePool):
 
     def __init__(self, redis_key, page_url, cookie_key, **kwargs):
-        proxy_api = kwargs.pop("proxy_api", None)
         self._retry = kwargs.pop("retry", 3)
         self._interval = kwargs.pop("interval", 1.5)
         self._render_time = kwargs.pop("render_time", 3)
-        self._proxies = kwargs.pop("proxies", None)  # 仅支持字符串
+
         self._enable_proxy = kwargs.pop("enable_proxy", False)
+        self._proxies = kwargs.pop("proxies", None)  # 仅支持字符串
         self._proxy = None
-        if self._proxies is None and self._enable_proxy:
+        proxy_api = kwargs.pop("proxy_api", None)
+        if self._enable_proxy and self._proxies is None:
             self._proxy = SpringBoardProxyPool(proxy_api=proxy_api)
             DRISSIONPAGE["proxy"] = self._proxy.get_proxy()["http"]
         else:
             DRISSIONPAGE["proxy"] = self._proxies
 
-        DRISSIONPAGE["user_agent"] = user_agent_pool.get("chrome")
+        DRISSIONPAGE["user_agent"] = kwargs.pop("user_agent", None) or user_agent_pool.get("chrome")
         DRISSIONPAGE["load_images"] = kwargs.pop("load_images", False)
         super(BrowserCookiePool, self).__init__(redis_key, **kwargs)
         self.page_url = page_url
         self.cookie_key = cookie_key
 
+    def proxies(self, proxy):
+        DRISSIONPAGE["proxy"] = proxy
+        return self
+
+    def user_agent(self, ua):
+        DRISSIONPAGE["user_agent"] = ua
+        return self
+
+    def headless(self, on_off=True):
+        DRISSIONPAGE["headless"] = on_off
+        return self
+
     def create_cookie(self):
-        nums = 0
         with DrissionPageDriver(**DRISSIONPAGE) as driver:
-            while True:
+            for _ in range(self._retry):
                 try:
                     driver.tab.get(self.page_url,
                                    retry=DRISSIONPAGE["retry"],
@@ -72,10 +84,6 @@ class BrowserCookiePool(PageCookiePool):
                         return cookies
 
                     driver.tab.wait(self._interval)
-                    nums += 1
-                    if nums >= self._retry:
-                        return
-
                 except Exception as e:
                     log.error(f"获取cookie失败,{e}")
                     driver.tab.clear_cache()

+ 0 - 35
FworkSpider/untils/aliyun.py

@@ -1,35 +0,0 @@
-import oss2
-
-from feapder.setting import ALI_BUCKET_CONFIG as oss_conf
-
-
-class AliYunService:
-
-    def __init__(self):
-        self.__acc_key_id = oss_conf['key_id']
-        self.__acc_key_secret = oss_conf['key_secret']
-        self.__endpoint = oss_conf['endpoint']
-        self.__bucket_name = oss_conf['bucket_name']
-
-    def push_oss_from_local(self, key, filename):
-        """
-        上传一个本地文件到OSS的普通文件
-
-        :param str key: 上传到OSS的文件名
-        :param str filename: 本地文件名,需要有可读权限
-        """
-        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
-        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
-        bucket.put_object_from_file(key, filename)
-
-    def push_oss_from_stream(self, key, data):
-        """
-        流式上传oss
-
-        :param str key: 上传到OSS的文件名
-        :param data: 待上传的内容。
-        :type data: bytes,str或file-like object
-        """
-        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
-        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
-        bucket.put_object(key, data)

+ 0 - 15
FworkSpider/untils/execptions.py

@@ -1,15 +0,0 @@
-class PySpiderError(Exception):
-
-    def __init__(self, *args, **kwargs):
-        if 'code' not in kwargs and 'reason' not in kwargs:
-            kwargs['code'] = 10000
-            kwargs['reason'] = '未知爬虫错误,请手动处理'
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-        super(PySpiderError, self).__init__(*args, kwargs)
-
-
-class AttachmentNullError(PySpiderError):
-
-    def __init__(self, code: int = 10004, reason: str = '附件下载异常'):
-        super(AttachmentNullError, self).__init__(code=code, reason=reason)