
Fix several issues

dongzhaorui 3 weeks ago
parent
commit
c9ca1ff88c

+ 39 - 38
FworkSpider/untils/attachment.py

@@ -16,8 +16,7 @@ import urllib3
 
 import feapder.utils.tools as tools
 from feapder.utils.log import log as logger
-from untils.aliyun import AliYunService
-from untils.execptions import AttachmentNullError
+from feapder.utils.oss import JyOssClient, OssBucketClient
 
 urllib3.disable_warnings()
 
@@ -34,10 +33,12 @@ def clear_file_type_suffix(filename: str, filetype: str):
     return filename
 
 
-class AttachmentDownloader:
+class Downloader:
 
     def __init__(self):
         self.dir_name = "file"
+        # self._oss = JyOssClient()
+        self._bucket = OssBucketClient()
 
     def create_file(self, filename, filetype):
         os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
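
The renamed Downloader now holds a single OssBucketClient for its whole lifetime instead of constructing an AliYunService on every upload. feapder.utils.oss itself is not part of this diff, so the following is only a minimal sketch of what such a wrapper could look like on top of the official oss2 SDK; the constructor arguments and placeholder credentials are assumptions, and only the put_object / put_object_from_file method names are taken from the calls further down in this change.

# Minimal sketch only, NOT the real feapder.utils.oss implementation.
# Assumes the official `oss2` SDK; all credential values are placeholders.
import oss2


class OssBucketClient:
    """Thin wrapper that reuses one oss2.Bucket for every upload."""

    def __init__(self,
                 access_key_id="<access-key-id>",
                 access_key_secret="<access-key-secret>",
                 endpoint="<oss-endpoint>",
                 bucket_name="<bucket-name>"):
        auth = oss2.Auth(access_key_id, access_key_secret)
        self._bucket = oss2.Bucket(auth, endpoint, bucket_name)

    def put_object(self, key, data):
        # Upload a bytes/stream payload under the object key (the attachment "fid")
        return self._bucket.put_object(key, data)

    def put_object_from_file(self, key, local_path):
        # Upload a file that was first written to the local "file" directory
        return self._bucket.put_object_from_file(key, local_path)
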
@@ -104,6 +105,22 @@ class AttachmentDownloader:
                 result = "{:.1f} M".format(_M)
         return result
 
+    @staticmethod
+    def read_pdf_by_chunks(f, chunk_size=1024):
+        try:
+            with open(f, 'rb') as file:
+                chunk = file.read(chunk_size)
+                if "<</Names <</Dests 4 0 R>>" in str(chunk) and "SourceModified" in str(chunk):
+                    return False
+                elif "doctypehtml" not in str(chunk):
+                    return True
+                elif "%PDF" in str(chunk):
+                    return True
+                else:
+                    return False
+        except Exception as e:
+            return False
+    
     def fetch_data(self, url, proxies=None, file=None, show_error_log=False, **kwargs):
         """
         Download data
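
The new static method above inspects only the first kilobyte of the saved file: the known placeholder-PDF signature ("<</Names <</Dests 4 0 R>>" together with "SourceModified") is rejected, anything without a "doctypehtml" marker is accepted, and an HTML payload is only kept if its first chunk still carries the "%PDF" magic bytes. A hedged usage sketch; the file paths are hypothetical examples:

# Hedged usage sketch; the file paths are hypothetical examples.
from untils.attachment import Downloader

if Downloader.read_pdf_by_chunks("file/notice.pdf"):
    print("keep: first chunk looks like a real PDF")
else:
    # HTML error pages saved with a .pdf extension ("doctypehtml" but no
    # "%PDF") and the known empty-PDF signature both return False, so the
    # caller removes the temporary file instead of uploading it.
    print("discard: placeholder or HTML error page")
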
@@ -198,6 +215,7 @@ class AttachmentDownloader:
         :param str filetype: file type
         :param str url: file download URL
         """
+        gzip = kwargs.pop("gzip", False)
         stream = self.fetch_data(url, file=None, **kwargs)
         attachment = {
             "filename": "{}.{}".format(filename, filetype),
@@ -210,7 +228,8 @@ class AttachmentDownloader:
                 attachment["fid"] = "{}.{}".format(fid, filetype)
                 attachment["size"] = self.getsize(stream)
                 attachment["url"] = "oss"
-                AliYunService().push_oss_from_stream(attachment["fid"], stream)
+                # self._oss.upload("file", attachment["fid"], stream, gzip=gzip)
+                self._bucket.put_object(attachment["fid"], stream)
             except Exception as e:
                 logger.error(
                     "[{}]上传失败,原因:{}".format(filename, e.__class__.__name__)
@@ -218,21 +237,6 @@ class AttachmentDownloader:
 
         return attachment
 
-    def read_pdf_in_chunks(self, pdf_path, chunk_size=1024):
-        try:
-            with open(pdf_path, 'rb') as file:
-                chunk = file.read(chunk_size)
-                if "<</Names <</Dests 4 0 R>>" in str(chunk) and "SourceModified" in str(chunk):
-                    return False
-                elif "doctypehtml" not in str(chunk):
-                    return True
-                elif "%PDF" in str(chunk):
-                    return True
-                else:
-                    return False
-        except Exception as e:
-            return False
-
     def _push_oss_from_local(self, filename, filetype, url, **kwargs):
         """
         上传本地文件到oss
@@ -241,6 +245,7 @@ class AttachmentDownloader:
         :param str filetype: 文件类型
         :param str url: 文件下载地址
         """
+        gzip = kwargs.pop("gzip", False)
         file = self.create_file(filename, filetype)
         stream = self.fetch_data(url, file=file, **kwargs)
         '''Upload/download: basic file info must be returned whether it succeeds or fails'''
@@ -250,7 +255,7 @@ class AttachmentDownloader:
         }
 
         if kwargs.get('is_check', None):
-            if not self.read_pdf_in_chunks(file):
+            if not self.read_pdf_by_chunks(file):
                 self.remove(file)
                 return attachment
 
@@ -261,7 +266,8 @@ class AttachmentDownloader:
                 attachment["size"] = self.getsize(file)
                 attachment["ftype"] = filetype
                 attachment["url"] = "oss"
-                AliYunService().push_oss_from_local(attachment["fid"], file)
+                # self._oss.upload("file", attachment["fid"], stream, gzip=gzip)
+                self._bucket.put_object_from_file(attachment["fid"], file)
             except Exception as e:
                 logger.error(
                     "[{}]上传失败,原因:{}".format(filename, e.__class__.__name__)
@@ -270,27 +276,18 @@ class AttachmentDownloader:
         self.remove(file)  # remove the local temporary file
         return attachment
 
-    def fetch_attachment(
-        self,
-        file_name: str,
-        file_type: str,
-        download_url: str,
-        mode="local",
-        proxies=None,
-        **kwargs
-    ):
+    def fetch_attachment(self, file_name, file_type, download_url, mode="local", proxies=None, gzip=False, **kwargs):
         """
         Download attachment
 
-        @param file_name: file name
-        @param file_type: file type
-        @param download_url: file download URL
-        @param mode: attachment upload mode "local" = local file or "stream" = data stream
-        @param proxies: proxies {"http":"http://xxx", "https":"https://xxx"}
-        @return:
+        @param str file_name: file name
+        @param str file_type: file type
+        @param str download_url: file download URL
+        @param str mode: attachment upload mode; "local" = local file; "stream" = data stream
+        @param bool gzip: whether to compress
+        @param dict proxies: proxies {"http":"http://xxx", "https":"https://xxx"}
+        @return: {"fid":"", "filename":"", "url":"oss", "size":"", "ftype":"", "org_url":""}
         """
-        if not file_name or not file_type or not download_url:
-            raise AttachmentNullError
 
         file_name = clear_file_type_suffix(file_name, file_type)  # avoid duplicating the file extension
         file_kwargs = dict(
@@ -298,6 +295,7 @@ class AttachmentDownloader:
             filetype=file_type,
             url=download_url,
             proxies=proxies,
+            gzip=gzip,
             **kwargs
         )
         if mode == "stream":
@@ -305,3 +303,6 @@ class AttachmentDownloader:
         else:
             attachment = self._push_oss_from_local(**file_kwargs)
         return attachment
+
+
+AttachmentDownloader = Downloader
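
The flattened fetch_attachment signature takes the same arguments as before plus the new gzip flag, and the module-level alias above keeps old import sites working. A hedged usage sketch; the URL and file names are hypothetical, and the returned keys follow the docstring shown in this hunk:

# Hedged usage sketch; the URL and file names are hypothetical.
from untils.attachment import AttachmentDownloader  # alias of Downloader

downloader = AttachmentDownloader()
attachment = downloader.fetch_attachment(
    file_name="tender_notice",
    file_type="pdf",
    download_url="http://example.com/files/notice.pdf",
    mode="local",   # "local" saves to disk first; "stream" uploads the response body directly
    proxies=None,
    gzip=False,     # popped by the upload helpers; only the commented-out JyOssClient path would use it
    # is_check=True,  # optional: in "local" mode, run read_pdf_by_chunks before uploading
)
# On success the dict contains roughly:
# {"fid": "<md5>.pdf", "filename": "tender_notice.pdf", "url": "oss",
#  "size": "...", "ftype": "pdf", "org_url": "http://example.com/files/notice.pdf"}
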

+ 9 - 8
FworkSpider/untils/attachment_res.py

@@ -16,8 +16,7 @@ import urllib3
 
 import feapder.utils.tools as tools
 from feapder.utils.log import log as logger
-from untils.aliyun import AliYunService
-from untils.execptions import AttachmentNullError
+from feapder.utils.oss import JyOssClient, OssBucketClient
 
 urllib3.disable_warnings()
 # document file types
@@ -154,11 +153,12 @@ def judge_file_url(file_url: str):
     return file_url
 
 
-class AttachmentDownloader(AliYunService):
+class Downloader:
 
     def __init__(self):
-        super(AttachmentDownloader, self).__init__()
         self.dir_name = "file"
+        # self._oss = JyOssClient()
+        self._bucket = OssBucketClient()
 
     def _create_file(self, filename, filetype):
         os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
@@ -240,9 +240,6 @@ class AttachmentDownloader(AliYunService):
         callback=None,
         **kwargs
     ):
-        if not file_name or not download_url:
-            raise AttachmentNullError
-
         file_kwargs = dict(callback=callback, url=download_url, **kwargs)
         filestream, filetype = self.fetch_data(**file_kwargs)
         filetype = file_type or filetype
@@ -266,7 +263,8 @@ class AttachmentDownloader(AliYunService):
                 attachment["size"] = self._file_size(local_temp_file)
                 attachment["ftype"] = filetype
                 attachment["url"] = "oss"
-                super().push_oss_from_local(attachment["fid"], local_temp_file)
+                # self._oss.upload("file", attachment["fid"], filestream)
+                self._bucket.put_object_from_file(attachment["fid"], local_temp_file)
             except Exception as e:
                 logger.error(
                     "[{}]上传失败,原因:{}".format(file_name, e.__class__.__name__)
@@ -277,3 +275,6 @@ class AttachmentDownloader(AliYunService):
             return attachment
         else:
             return {}
+
+
+AttachmentDownloader = Downloader
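
As in attachment.py, a module-level alias keeps the old class name importable, so existing spiders need no changes:

# Old call sites keep working: the legacy name resolves to the renamed class.
from untils.attachment_res import AttachmentDownloader, Downloader

assert AttachmentDownloader is Downloader
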

+ 1 - 1
FworkSpider/untils/tools.py

@@ -9,7 +9,7 @@ import bson
 from bs4 import BeautifulSoup
 
 from feapder.network.proxy_pool import DirectProxyPool
-from untils.clean_html import cleaner
+from feapder.utils.clean_html import cleaner
 
 SearchText = namedtuple('SearchText', ['total'])
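
The only change in tools.py is the import path of cleaner, which now lives inside feapder (feapder.utils.clean_html) rather than the project-level untils package. A hedged sketch of a call site after the move, assuming cleaner accepts a raw HTML string and returns the cleaned markup; its exact signature is not shown in this diff:

# Hedged sketch; assumes cleaner(html) -> cleaned HTML string.
from feapder.utils.clean_html import cleaner

raw_html = "<div style='color:red'><script>alert(1)</script><p>Notice body</p></div>"
print(cleaner(raw_html))  # expected: markup with scripts/styles and other noise stripped
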