Pārlūkot izejas kodu

附件下载模块优化

dongzhaorui 1 gadu atpakaļ
vecāks
revīzija
ce0e4cd4ae
2 mainītis faili ar 73 papildinājumiem un 70 dzēšanām
  1. 56 51
      FworkSpider/untils/attachment.py
  2. 17 19
      FworkSpider/untils/attachment_res.py

+ 56 - 51
FworkSpider/untils/attachment.py

@@ -27,30 +27,35 @@ headers = {
 }
 
 
+def clear_file_type_suffix(filename: str, filetype: str):
+    """
+    Strip a trailing ".<filetype>" extension from a file name.
+
+    Used so that appending the file type later does not duplicate the
+    suffix (e.g. "report.pdf" + ".pdf").
+
+    :param str filename: file name, possibly already ending in the suffix
+    :param str filetype: file type without the leading dot, e.g. "pdf"
+    :return: the whitespace-stripped name without trailing type suffixes
+    """
+    filename = filename.strip()
+    if not filetype:
+        # An empty type would make the suffix "." and eat every dot in the name.
+        return filename
+
+    suffix = f".{filetype}"
+    # Only remove the suffix at the END of the name (repeatedly, to collapse
+    # "a.pdf.pdf"); occurrences in the middle are part of the real name.
+    while filename.endswith(suffix):
+        filename = filename[:-len(suffix)]
+    return filename
+
+
 class AttachmentDownloader:
 
     def __init__(self):
         self.dir_name = "file"
 
-    def create_dir(self):
-        if not os.path.exists(self.dir_name):
-            os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
-
     def create_file(self, filename, filetype):
-        self.create_dir()
-        sign = tools.get_sha1("{}_{}".format(filename, uuid.uuid4()))
-        file_name = "{}.{}".format(sign, filetype)
-        return "{}/{}".format(self.dir_name, file_name)
+        os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
+        file = "{filename}.{filetype}".format(
+            filename=tools.get_sha1("{}_{}".format(filename, uuid.uuid4())),
+            filetype=filetype
+        )
+        return "{}/{}".format(self.dir_name, file)
 
     @staticmethod
-    def clean_attachment(file_path):
+    def clean_attachment(filepath):
         """
         删除文件
 
-        :param str file_path: 文件路径
+        :param str filepath: 文件路径
         """
         try:
-            os.remove(file_path)
+            os.remove(filepath)
         except FileNotFoundError:
             pass
 
@@ -58,9 +63,9 @@ class AttachmentDownloader:
         self.clean_attachment(file)
 
     @staticmethod
-    def get_mb(data):
+    def calculate_size(data):
         """
-        获取数据的Mb
+        计算数据大小
 
         :param int data: 准备计算大小的内容
         :return: float
@@ -101,12 +106,11 @@ class AttachmentDownloader:
 
     def fetch_data(self, url, proxies=None, file=None, show_error_log=False, **kwargs):
         """
-        数据下载
+        下载数据
 
-        :param str url: 下载地址
+        :param str url: 文件下载地址
+        :param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
         :param file: 本地文件
-        :param dict kwargs: requests请求参数
-        :param dict proxies: 代理ip
         :param show_error_log: 展示错误堆栈信息日志
         """
         method = kwargs.pop("method", "get")
@@ -127,12 +131,11 @@ class AttachmentDownloader:
                 with requests.request(method, url, **request_kwargs) as req:
                     stream = io.BytesIO()
                     lh = {k.lower(): v for k, v in req.headers.items()}
-                    '''内容长度'''
-                    cl = lh.get("content-length") or len(req.content)
+                    cl = lh.get("content-length") or len(req.content)  # 内容长度
                     icl = int(cl)
-                    content_length = self.get_mb(icl)
+                    content_length = self.calculate_size(icl)
                     if content_length > 50:
-                        '''丢弃超过50Mb内容长度的文件'''
+                        # 丢弃超过50Mb内容长度的文件
                         return stream.getvalue()
 
                     if req.status_code != 200:
@@ -168,11 +171,11 @@ class AttachmentDownloader:
 
     def _push_oss_from_stream(self, filename, filetype, url, **kwargs):
         """
-        将数据流推送oss
+        推送数据流到oss
 
         :param str filename: 文件名称
         :param str filetype: 文件类型
-        :param str url: 下载地址
+        :param str url: 文件下载地址
         """
         stream = self.fetch_data(url, file=None, **kwargs)
         attachment = {
@@ -187,18 +190,20 @@ class AttachmentDownloader:
                 attachment["size"] = self.getsize(stream)
                 attachment["url"] = "oss"
                 AliYunService().push_oss_from_stream(attachment["fid"], stream)
-            except Exception:
-                pass
+            except Exception as e:
+                logger.error(
+                    "[{}]上传失败,原因:{}".format(filename, e.__class__.__name__)
+                )
 
         return attachment
 
     def _push_oss_from_local(self, filename, filetype, url, **kwargs):
         """
-        将本地文件推送oss
+        上传本地文件到oss
 
         :param str filename: 文件名称
         :param str filetype: 文件类型
-        :param str url: 下载地址
+        :param str url: 文件下载地址
         """
         file = self.create_file(filename, filetype)
         stream = self.fetch_data(url, file=file, **kwargs)
@@ -223,38 +228,38 @@ class AttachmentDownloader:
         self.remove(file)  # 删除本地临时文件
         return attachment
 
-    def _fetch_attachment(self, filename, filetype, download_url, mode, **kwargs):
-        """
-        下载附件
-
-        :param str filename: 文件名称
-        :param str filetype: 文件类型
-        :param str download_url: 下载地址
-        :param str mode: 附件上传模式 "local" or "stream"
-        """
-        file_kwargs = dict(
-            filename=filename,
-            filetype=filetype,
-            url=download_url,
-            **kwargs
-        )
-        if mode == "stream":
-            res = self._push_oss_from_stream(**file_kwargs)
-        else:
-            res = self._push_oss_from_local(**file_kwargs)
-        return res
-
     def fetch_attachment(
         self,
         file_name: str,
         file_type: str,
         download_url: str,
-        proxies=None,
         mode="local",
+        proxies=None,
         **kwargs
     ):
+        """
+        下载附件
+
+        @param file_name: 文件名称
+        @param file_type: 文件类型
+        @param download_url: 文件下载地址
+        @param mode: 附件上传模式 "local" = 本地文件 or "stream" = 数据流
+        @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
+        @return:
+        """
         if not file_name or not file_type or not download_url:
             raise AttachmentNullError
 
-        file_kwargs = dict(proxies=proxies, **kwargs)
-        return self._fetch_attachment(file_name, file_type, download_url, mode, **file_kwargs)
+        file_name = clear_file_type_suffix(file_name, file_type)  # 防止文件后缀重复
+        file_kwargs = dict(
+            filename=file_name,
+            filetype=file_type,
+            url=download_url,
+            proxies=proxies,
+            **kwargs
+        )
+        if mode == "stream":
+            attachment = self._push_oss_from_stream(**file_kwargs)
+        else:
+            attachment = self._push_oss_from_local(**file_kwargs)
+        return attachment

+ 17 - 19
FworkSpider/untils/attachment_res.py

@@ -6,7 +6,6 @@ Created on 2022-03-06
 ---------
 @author: Lzz
 """
-import hashlib
 import os
 import re
 import uuid
@@ -50,15 +49,6 @@ headers = {
 }
 
 
-def sha1(val):
-    _sha1 = hashlib.sha1()
-    if isinstance(val, bytes):
-        _sha1.update(str(val).encode("utf-8"))
-    elif isinstance(val, str):
-        _sha1.update(val.encode("utf-8"))
-    return _sha1.hexdigest()
-
-
 def remove(file_path: str):
     try:
         os.remove(file_path)
@@ -173,7 +163,7 @@ class AttachmentDownloader(AliYunService):
     def _create_file(self, filename, filetype):
         os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
         file = "{filename}.{filetype}".format(
-            filename=sha1("{}_{}".format(filename, uuid.uuid4())),
+            filename=tools.get_sha1("{}_{}".format(filename, uuid.uuid4())),
             filetype=filetype
         )
         return "{}/{}".format(self.dir_name, file)
@@ -192,13 +182,21 @@ class AttachmentDownloader(AliYunService):
             return "{:.1f} kb".format(_kb)
 
     @staticmethod
-    def _fetch_attachment(
+    def fetch_data(
         callback,
         url: str,
         proxies=None,
         show_error_log=False,
         **kwargs
     ):
+        """
+        下载数据
+
+        @param callback: 回调函数 可以是函数 也可是函数名
+        @param url: 文件下载地址
+        @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
+        @param show_error_log: 展示错误堆栈信息日志
+        """
         request_params = {}
         request_params.setdefault("proxies", proxies)
         request_params.setdefault("headers", kwargs.get("headers") or headers)
@@ -214,7 +212,7 @@ class AttachmentDownloader(AliYunService):
                         stream = response.content
                         filetype_lst = []  # 文件类型列表
                         if callable(callback):
-                            # 通过自定义的解析函数获取响应体头部属性中的文件类型
+                            # 通过自定义的回调函数 获取 response.headers 文件类型添加到 filetype_lst
                             callback(response, filetype_lst)
 
                         filetype = filetype_lst[0] if filetype_lst else ""
@@ -238,28 +236,28 @@ class AttachmentDownloader(AliYunService):
         if not file_name or not download_url:
             raise AttachmentNullError
 
-        results = self._fetch_attachment(callback, download_url, **kwargs)
+        results = self.fetch_data(callback, download_url, **kwargs)
         if len(results) == 2:
             filetype = results[-1]
         else:
             filetype = ""
 
+        filestream = results[0]  # 文件数据流
         filename = clear_file_type_suffix(file_name, filetype)
         download_url = judge_file_url(download_url)
 
         # 保存本地临时文件
-        file_stream = results[0]
         local_temp_file = self._create_file(filename, filetype)
         with open(local_temp_file, "wb") as f:
-            f.write(file_stream)
+            f.write(filestream)
 
         '''上传/下载,无论失败/成功必须返回附件信息'''
         attachment = {
             "filename": "{}.{}".format(filename, filetype),
             "org_url": download_url
         }
-        if len(file_stream) > 0:
-            content_hash = tools.get_sha1(file_stream)
+        if len(filestream) > 0:
+            content_hash = tools.get_sha1(filestream)
             try:
                 attachment["fid"] = "{}.{}".format(content_hash, filetype)
                 attachment["size"] = self._file_size(local_temp_file)
@@ -271,7 +269,7 @@ class AttachmentDownloader(AliYunService):
                     "[{}]上传失败,原因:{}".format(file_name, e.__class__.__name__)
                 )
 
-        remove(local_temp_file)
+        remove(local_temp_file)  # 删除本地临时文件
         if "size" not in attachment or limit_file_size(attachment.get("size")):
             return attachment
         else: