瀏覽代碼

附件下载修改

maguopeng 3 年之前
父節點
當前提交
967a646705
共有 1 個文件被更改,包括 79 次插入和 59 次删除
  1. 79 59
      FworkSpider/untils/attachment.py

+ 79 - 59
FworkSpider/untils/attachment.py

@@ -7,6 +7,8 @@ from urllib.parse import urlparse, unquote
 
 import requests
 import urllib3
+import sys
+sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
 
 from feapder.setting import headers
 from untils.execptions import AttachmentNullError
@@ -16,26 +18,33 @@ from untils.proxy_pool import ProxyPool
 urllib3.disable_warnings()
 
 
-def hex_sha1(val):
-    sha1 = hashlib.sha1()
+def sha1(val):
+    _sha1 = hashlib.sha1()
     if isinstance(val, bytes):
-        sha1.update(str(val).encode("utf-8"))
+        _sha1.update(str(val).encode("utf-8"))
     elif isinstance(val, str):
-        sha1.update(val.encode("utf-8"))
-    res = sha1.hexdigest()
-    return res
+        _sha1.update(val.encode("utf-8"))
+    return _sha1.hexdigest()
 
 
-def extract_file_type(text):
-    if text is None:
-        return None
+def remove(file_path: str):
+    os.remove(file_path)
+
+
+def getsize(file_path: str):
+    try:
+        return os.path.getsize(file_path)
+    except FileNotFoundError:
+        return 0
+
 
+def discern_file_format(text):
     file_types = {
-        'pdf', 'doc', 'docx', 'rar', 'zip', 'gzzb', 'jpg', 'png'
+        'pdf', 'doc', 'docx', 'rar', 'zip', 'gzzb', 'jpg', 'png', 'swf'
     }
     for file_type in file_types:
-        tmp = [file_type, file_type.upper()]
-        for t in tmp:
+        all_file_format = [file_type, file_type.upper()]
+        for t in all_file_format:
             result = re.match(f'.*{t}$', text, re.S)
             if result is not None:
                 return t
@@ -43,7 +52,14 @@ def extract_file_type(text):
         return None
 
 
-def extract_file_name(href: str, file_type: str):
+def extract_file_type(text):
+    if text is None:
+        return None
+    return discern_file_format(text)
+
+
+def extract_file_name_by_href(href: str, file_type: str):
+    """从url中抽取文件名称"""
     # 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
     # 中文字符:[\u4e00 -\u9fa5]
     zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
@@ -58,29 +74,56 @@ def extract_file_name(href: str, file_type: str):
     return None
 
 
+def extract_file_name(text):
+    file_type = discern_file_format(text)
+    if file_type is not None:
+        repl = '.{}'.format(file_type)
+        text = text.replace(repl, '')
+    return text
+
+
 def verify_file_name(name):
     if extract_file_type(name) is None:
         raise ValueError
 
 
+class AttachmentNullError(Exception):
+
+    def __init__(self, code: int = 10004, reason: str = '附件下载异常', **kwargs):
+        self.code = code
+        self.reason = reason
+        self.err_details = kwargs
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+
+
 class AttachmentDownloader:
 
     def __init__(self):
         self.dir_name = 'file'
 
-    def create_dir(self):
-        if not os.path.exists(self.dir_name):
-            os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
-
-    def create_file_path(self, filename, file_type):
-        self.create_dir()
-        sign = hex_sha1("{}_{}".format(filename, uuid.uuid4()))
-        tmp_name = "{}.{}".format(sign, file_type)
+    def get_file_path(self, filename, file_type):
+        os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
+        sha1_name = sha1("{}_{}".format(filename, uuid.uuid4()))
+        tmp_name = "{}.{}".format(sha1_name, file_type)
         return "{}/{}".format(self.dir_name, tmp_name)
 
     @staticmethod
     def create_fid(file_stream: bytes):
-        return hex_sha1(file_stream)
+        return sha1(file_stream)
+
+    @staticmethod
+    def file_size(file_path: str):
+        _kb = float(getsize(file_path)) / 1024
+        if _kb >= 1024:
+            _M = _kb / 1024
+            if _M >= 1024:
+                _G = _M / 1024
+                return "{:.1f} G".format(_G)
+            else:
+                return "{:.1f} M".format(_M)
+        else:
+            return "{:.1f} kb".format(_kb)
 
     @staticmethod
     def _fetch_attachment(
@@ -119,29 +162,6 @@ class AttachmentDownloader:
                 retries += 1
         return b''
 
-    @staticmethod
-    def clean_attachment(file_path):
-        os.remove(file_path)
-
-    @staticmethod
-    def getsize(file_path: str):
-        def _getsize(filename):
-            try:
-                return os.path.getsize(filename)
-            except:
-                return 0
-
-        _kb = float(_getsize(file_path)) / 1024
-        if _kb >= 1024:
-            _M = _kb / 1024
-            if _M >= 1024:
-                _G = _M / 1024
-                return "{:.1f} G".format(_G)
-            else:
-                return "{:.1f} M".format(_M)
-        else:
-            return "{:.1f} kb".format(_kb)
-
     def fetch_attachment(
             self,
             file_name: str,
@@ -154,7 +174,7 @@ class AttachmentDownloader:
         if not file_name or not file_type or not download_url:
             raise AttachmentNullError
 
-        file_path = self.create_file_path(file_name, file_type)
+        file_path = self.get_file_path(file_name, file_type)
         file_stream = self._fetch_attachment(
             download_url,
             file_path,
@@ -164,35 +184,35 @@ class AttachmentDownloader:
         )
         if len(file_stream) > 0:
             fid = self.create_fid(file_stream)
-            '''上传/下载,无论失败成功都需要给出文件基础信息'''
+            '''上传/下载,无论失败/成功最终返回附件信息'''
             try:
                 result = {
-                    'filename': file_name,
+                    'filename': '{}.{}'.format(file_name, file_type),
                     'ftype': file_type,
                     'fid': "{}.{}".format(fid, file_type),
                     'org_url': download_url,
-                    'size': self.getsize(file_path),
+                    'size': self.file_size(file_path),
                     'url': 'oss',
                 }
                 AliYunService().push_oss_from_local(result['fid'], file_path)
             except Exception:
                 result = {
-                    'filename': file_name,
+                    'filename': '{}.{}'.format(file_name, file_type),
                     'org_url': download_url,
                 }
-            self.clean_attachment(file_path)
         else:
             result = {
-                'filename': file_name,
+                'filename': '{}.{}'.format(file_name, file_type),
                 'org_url': download_url,
             }
+        remove(file_path)
         return result
 
 
-# if __name__ == '__main__':
-    # a = AttachmentDownloader().fetch_attachment(
-    #     file_name='成建制移民村(五标段)合同',
-    #     file_type='pdf',
-    #     download_url='http://222.75.70.90/NXGPPSP_MG/downloadFileServlet?req=F&num=8b80b23f7e729b88017e758a1b03422c'
-    # )
-    # print(a)
+if __name__ == '__main__':
+    a = AttachmentDownloader().fetch_attachment(
+        file_name='成建制移民村(五标段)合同',
+        file_type='pdf',
+        download_url='http://222.75.70.90/NXGPPSP_MG/downloadFileServlet?req=F&num=8b80b23f7e729b88017e758a1b03422c'
+    )
+    print(a)