
Fix several issues

dongzhaorui 3 weeks ago
parent
commit
c9ca1ff88c

+ 39 - 38
FworkSpider/untils/attachment.py

@@ -16,8 +16,7 @@ import urllib3
 
 import feapder.utils.tools as tools
 from feapder.utils.log import log as logger
-from untils.aliyun import AliYunService
-from untils.execptions import AttachmentNullError
+from feapder.utils.oss import JyOssClient, OssBucketClient
 
 urllib3.disable_warnings()
 
@@ -34,10 +33,12 @@ def clear_file_type_suffix(filename: str, filetype: str):
     return filename
 
 
-class AttachmentDownloader:
+class Downloader:
 
     def __init__(self):
         self.dir_name = "file"
+        # self._oss = JyOssClient()
+        self._bucket = OssBucketClient()
 
     def create_file(self, filename, filetype):
         os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
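
The renamed Downloader now holds a single OssBucketClient for its whole lifetime instead of constructing an AliYunService on every upload. feapder.utils.oss itself is not part of this diff, so the following is only a minimal sketch of what such a wrapper could look like on top of the official oss2 SDK; the constructor arguments and placeholder credentials are assumptions, and only the put_object / put_object_from_file method names are taken from the calls further down in this change.

# Minimal sketch only, NOT the real feapder.utils.oss implementation.
# Assumes the official `oss2` SDK; all credential values are placeholders.
import oss2


class OssBucketClient:
    """Thin wrapper that reuses one oss2.Bucket for every upload."""

    def __init__(self,
                 access_key_id="<access-key-id>",
                 access_key_secret="<access-key-secret>",
                 endpoint="<oss-endpoint>",
                 bucket_name="<bucket-name>"):
        auth = oss2.Auth(access_key_id, access_key_secret)
        self._bucket = oss2.Bucket(auth, endpoint, bucket_name)

    def put_object(self, key, data):
        # Upload a bytes/stream payload under the object key (the attachment "fid")
        return self._bucket.put_object(key, data)

    def put_object_from_file(self, key, local_path):
        # Upload a file that was first written to the local "file" directory
        return self._bucket.put_object_from_file(key, local_path)
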
@@ -104,6 +105,22 @@ class AttachmentDownloader:
                 result = "{:.1f} M".format(_M)
         return result
 
+    @staticmethod
+    def read_pdf_by_chunks(f, chunk_size=1024):
+        try:
+            with open(f, 'rb') as file:
+                chunk = file.read(chunk_size)
+                if "<</Names <</Dests 4 0 R>>" in str(chunk) and "SourceModified" in str(chunk):
+                    return False
+                elif "doctypehtml" not in str(chunk):
+                    return True
+                elif "%PDF" in str(chunk):
+                    return True
+                else:
+                    return False
+        except Exception as e:
+            return False
+    
     def fetch_data(self, url, proxies=None, file=None, show_error_log=False, **kwargs):
         """
         Download data
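
The new static method above inspects only the first kilobyte of the saved file: the known placeholder-PDF signature ("<</Names <</Dests 4 0 R>>" together with "SourceModified") is rejected, anything without a "doctypehtml" marker is accepted, and an HTML payload is only kept if its first chunk still carries the "%PDF" magic bytes. A hedged usage sketch; the file paths are hypothetical examples:

# Hedged usage sketch; the file paths are hypothetical examples.
from untils.attachment import Downloader

if Downloader.read_pdf_by_chunks("file/notice.pdf"):
    print("keep: first chunk looks like a real PDF")
else:
    # HTML error pages saved with a .pdf extension ("doctypehtml" but no
    # "%PDF") and the known empty-PDF signature both return False, so the
    # caller removes the temporary file instead of uploading it.
    print("discard: placeholder or HTML error page")
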
@@ -198,6 +215,7 @@ class AttachmentDownloader:
         :param str filetype: file type
         :param str url: file download URL
         """
+        gzip = kwargs.pop("gzip", False)
         stream = self.fetch_data(url, file=None, **kwargs)
         attachment = {
             "filename": "{}.{}".format(filename, filetype),
@@ -210,7 +228,8 @@ class AttachmentDownloader:
                 attachment["fid"] = "{}.{}".format(fid, filetype)
                 attachment["size"] = self.getsize(stream)
                 attachment["url"] = "oss"
-                AliYunService().push_oss_from_stream(attachment["fid"], stream)
+                # self._oss.upload("file", attachment["fid"], stream, gzip=gzip)
+                self._bucket.put_object(attachment["fid"], stream)
             except Exception as e:
                 logger.error(
                     "[{}]上传失败,原因:{}".format(filename, e.__class__.__name__)
@@ -218,21 +237,6 @@ class AttachmentDownloader:
 
         return attachment
 
-    def read_pdf_in_chunks(self, pdf_path, chunk_size=1024):
-        try:
-            with open(pdf_path, 'rb') as file:
-                chunk = file.read(chunk_size)
-                if "<</Names <</Dests 4 0 R>>" in str(chunk) and "SourceModified" in str(chunk):
-                    return False
-                elif "doctypehtml" not in str(chunk):
-                    return True
-                elif "%PDF" in str(chunk):
-                    return True
-                else:
-                    return False
-        except Exception as e:
-            return False
-
     def _push_oss_from_local(self, filename, filetype, url, **kwargs):
         """
         上传本地文件到oss
@@ -241,6 +245,7 @@ class AttachmentDownloader:
         :param str filetype: 文件类型
         :param str url: 文件下载地址
         """
+        gzip = kwargs.pop("gzip", False)
         file = self.create_file(filename, filetype)
         stream = self.fetch_data(url, file=file, **kwargs)
         '''Upload/download: basic file info must be returned whether it succeeds or fails'''
@@ -250,7 +255,7 @@ class AttachmentDownloader:
         }
 
         if kwargs.get('is_check', None):
-            if not self.read_pdf_in_chunks(file):
+            if not self.read_pdf_by_chunks(file):
                 self.remove(file)
                 return attachment
 
@@ -261,7 +266,8 @@ class AttachmentDownloader:
                 attachment["size"] = self.getsize(file)
                 attachment["ftype"] = filetype
                 attachment["url"] = "oss"
-                AliYunService().push_oss_from_local(attachment["fid"], file)
+                # self._oss.upload("file", attachment["fid"], stream, gzip=gzip)
+                self._bucket.put_object_from_file(attachment["fid"], file)
             except Exception as e:
                 logger.error(
                     "[{}]上传失败,原因:{}".format(filename, e.__class__.__name__)
@@ -270,27 +276,18 @@ class AttachmentDownloader:
         self.remove(file)  # remove the local temporary file
         return attachment
 
-    def fetch_attachment(
-        self,
-        file_name: str,
-        file_type: str,
-        download_url: str,
-        mode="local",
-        proxies=None,
-        **kwargs
-    ):
+    def fetch_attachment(self, file_name, file_type, download_url, mode="local", proxies=None, gzip=False, **kwargs):
         """
         Download attachment
 
-        @param file_name: file name
-        @param file_type: file type
-        @param download_url: file download URL
-        @param mode: attachment upload mode "local" = local file or "stream" = data stream
-        @param proxies: proxies {"http":"http://xxx", "https":"https://xxx"}
-        @return:
+        @param str file_name: file name
+        @param str file_type: file type
+        @param str download_url: file download URL
+        @param str mode: attachment upload mode; "local" = local file; "stream" = data stream
+        @param bool gzip: whether to compress
+        @param dict proxies: proxies {"http":"http://xxx", "https":"https://xxx"}
+        @return: {"fid":"", "filename":"", "url":"oss", "size":"", "ftype":"", "org_url":""}
         """
-        if not file_name or not file_type or not download_url:
-            raise AttachmentNullError
 
         file_name = clear_file_type_suffix(file_name, file_type)  # avoid duplicating the file extension
         file_kwargs = dict(
@@ -298,6 +295,7 @@ class AttachmentDownloader:
             filetype=file_type,
             url=download_url,
             proxies=proxies,
+            gzip=gzip,
             **kwargs
         )
         if mode == "stream":
@@ -305,3 +303,6 @@ class AttachmentDownloader:
         else:
             attachment = self._push_oss_from_local(**file_kwargs)
         return attachment
+
+
+AttachmentDownloader = Downloader
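
The flattened fetch_attachment signature takes the same arguments as before plus the new gzip flag, and the module-level alias above keeps old import sites working. A hedged usage sketch; the URL and file names are hypothetical, and the returned keys follow the docstring shown in this hunk:

# Hedged usage sketch; the URL and file names are hypothetical.
from untils.attachment import AttachmentDownloader  # alias of Downloader

downloader = AttachmentDownloader()
attachment = downloader.fetch_attachment(
    file_name="tender_notice",
    file_type="pdf",
    download_url="http://example.com/files/notice.pdf",
    mode="local",   # "local" saves to disk first; "stream" uploads the response body directly
    proxies=None,
    gzip=False,     # popped by the upload helpers; only the commented-out JyOssClient path would use it
    # is_check=True,  # optional: in "local" mode, run read_pdf_by_chunks before uploading
)
# On success the dict contains roughly:
# {"fid": "<md5>.pdf", "filename": "tender_notice.pdf", "url": "oss",
#  "size": "...", "ftype": "pdf", "org_url": "http://example.com/files/notice.pdf"}
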

+ 9 - 8
FworkSpider/untils/attachment_res.py

@@ -16,8 +16,7 @@ import urllib3
 
 import feapder.utils.tools as tools
 from feapder.utils.log import log as logger
-from untils.aliyun import AliYunService
-from untils.execptions import AttachmentNullError
+from feapder.utils.oss import JyOssClient, OssBucketClient
 
 urllib3.disable_warnings()
 # document file types
@@ -154,11 +153,12 @@ def judge_file_url(file_url: str):
     return file_url
 
 
-class AttachmentDownloader(AliYunService):
+class Downloader:
 
     def __init__(self):
-        super(AttachmentDownloader, self).__init__()
         self.dir_name = "file"
+        # self._oss = JyOssClient()
+        self._bucket = OssBucketClient()
 
     def _create_file(self, filename, filetype):
         os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
@@ -240,9 +240,6 @@ class AttachmentDownloader(AliYunService):
         callback=None,
         **kwargs
     ):
-        if not file_name or not download_url:
-            raise AttachmentNullError
-
         file_kwargs = dict(callback=callback, url=download_url, **kwargs)
         filestream, filetype = self.fetch_data(**file_kwargs)
         filetype = file_type or filetype
@@ -266,7 +263,8 @@ class AttachmentDownloader(AliYunService):
                 attachment["size"] = self._file_size(local_temp_file)
                 attachment["ftype"] = filetype
                 attachment["url"] = "oss"
-                super().push_oss_from_local(attachment["fid"], local_temp_file)
+                # self._oss.upload("file", attachment["fid"], filestream)
+                self._bucket.put_object_from_file(attachment["fid"], local_temp_file)
             except Exception as e:
                 logger.error(
                     "[{}]上传失败,原因:{}".format(file_name, e.__class__.__name__)
@@ -277,3 +275,6 @@ class AttachmentDownloader(AliYunService):
             return attachment
         else:
             return {}
+
+
+AttachmentDownloader = Downloader
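
As in attachment.py, a module-level alias keeps the old class name importable, so existing spiders need no changes:

# Old call sites keep working: the legacy name resolves to the renamed class.
from untils.attachment_res import AttachmentDownloader, Downloader

assert AttachmentDownloader is Downloader
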

+ 1 - 1
FworkSpider/untils/tools.py

@@ -9,7 +9,7 @@ import bson
 from bs4 import BeautifulSoup
 
 from feapder.network.proxy_pool import DirectProxyPool
-from untils.clean_html import cleaner
+from feapder.utils.clean_html import cleaner
 
 SearchText = namedtuple('SearchText', ['total'])
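
The only change in tools.py is the import path of cleaner, which now lives inside feapder (feapder.utils.clean_html) rather than the project-level untils package. A hedged sketch of a call site after the move, assuming cleaner accepts a raw HTML string and returns the cleaned markup; its exact signature is not shown in this diff:

# Hedged sketch; assumes cleaner(html) -> cleaned HTML string.
from feapder.utils.clean_html import cleaner

raw_html = "<div style='color:red'><script>alert(1)</script><p>Notice body</p></div>"
print(cleaner(raw_html))  # expected: markup with scripts/styles and other noise stripped
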