# -*- coding: utf-8 -*-
"""
Created on 2022-03-06
---------
@summary: Attachment download module (res)
---------
@author: Lzz
"""
import hashlib
import os
import re
import uuid
from urllib.parse import urlparse, unquote

import requests
import urllib3

import feapder.utils.tools as tools
from feapder.utils.log import log as logger
from untils.aliyun import AliYunService
from untils.execptions import AttachmentNullError

urllib3.disable_warnings()

# Document file types
DOCTYPE = {
    "txt", "rtf", "dps", "et", "ett", "xls", "xlsx", "xlsb", "xlsm", "xlt",
    "ods", "pmd", "pmdx", "doc", "docm", "docx", "dot", "dotm", "dotx",
    "odt", "wps", "csv", "xml", "xps"
}
# Archive / compression types
COMPRESSION_TYPE = {
    "rar", "zip", "gzzb", "7z", "tar", "gz", "bz2", "jar", "iso", "cab",
    "arj", "lzh", "ace", "uue", "edxz",
}
# Image types (pdf is grouped here as well)
IMAGE_TYPE = {
    "jpg", "png", "jpeg", "tiff", "gif", "psd", "raw", "eps", "svg", "bmp",
    "pdf"
}
# Other types
OTHER_TYPE = {
    "swf", "nxzf", "xezf", "nxcf"
}

# Default request headers, used when the caller does not supply any.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    "Accept": "*/*"
}


def sha1(val):
    """Return the hexadecimal SHA-1 digest of ``val``.

    ``str`` input is hashed as its UTF-8 encoding and ``bytes`` input is
    hashed as-is. Any other type is ignored, which yields the digest of
    empty input.

    @param val: text or bytes to hash
    @return: 40-character hex digest
    """
    _sha1 = hashlib.sha1()
    if isinstance(val, bytes):
        # Hash the raw bytes; hashing str(val) would digest the "b'...'"
        # repr instead of the actual content.
        _sha1.update(val)
    elif isinstance(val, str):
        _sha1.update(val.encode("utf-8"))
    return _sha1.hexdigest()


def remove(file_path: str):
    """Delete ``file_path``; a missing file is silently ignored."""
    try:
        os.remove(file_path)
    except FileNotFoundError:
        pass


def getsize(file):
    """Return the size of ``file`` in bytes, or 0 when it does not exist."""
    try:
        return os.path.getsize(file)
    except FileNotFoundError:
        return 0


def discern_file_format(text, show_warn_log=False):
    """
    Identify the file format that ``text`` ends with.

    Every known type is tried in its lowercase and uppercase spelling; the
    text only has to END with the type (no leading dot is required).

    @param text: text to inspect (file name, url, header value ...)
    @param show_warn_log: whether to log a warning for unrecognised text
    @return: the matched type exactly as spelled in ``text`` (lower or
             upper case), or None when nothing matches
    """
    file_types = {
        *DOCTYPE,
        *COMPRESSION_TYPE,
        *IMAGE_TYPE,
        *OTHER_TYPE
    }

    for file_type in file_types:
        for t in (file_type, file_type.upper()):
            if re.match(f".*{t}$", text, re.S) is not None:
                return t

    if show_warn_log:
        # Trailing run of characters after the last path/extension separator.
        unknown_type = re.findall('[^.\\/:*?"<>|\r\n]+$', text, re.S)
        logger.warning(f"[未识别文件类型]{unknown_type}")
    return None


def extract_file_type(text):
    """Return the file type found at the end of ``text``, or None.

    None-safe wrapper around ``discern_file_format``.
    """
    if text is None:
        return None
    return discern_file_format(text)


def extract_file_name_by_href(href: str, file_type: str):
    """Extract a Chinese file name from a url.

    The query string (or the path when there is no query) is searched for
    ``<anything>.<file_type>``; the match is url-decoded and the first run
    of Chinese characters / punctuation inside it is returned.

    @param href: attachment url
    @param file_type: expected file extension, without the dot
    @return: the Chinese name fragment, or None when not found
    """
    # Chinese punctuation: [\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
    # Chinese characters: [\u4e00-\u9fa5]
    zh_char_pattern = "[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+"
    parser = urlparse(href)
    query = (parser.query or parser.path)
    result = re.search(f".*\\.{file_type}", query, re.S)
    if result is not None:
        encode_str = unquote(result.group())
        name = re.search(zh_char_pattern, encode_str)
        if name is not None:
            return unquote(name.group())
    return None


def extract_file_name(text):
    """Return ``text`` with every ``.<type>`` occurrence removed.

    The type is the one detected by ``discern_file_format``; when no type
    is detected the text is returned unchanged.
    """
    file_type = discern_file_format(text)
    if file_type is not None:
        repl = ".{}".format(file_type)
        text = text.replace(repl, '')
    return text


def verify_file_name(name):
    """Raise ValueError when ``name`` does not end with a known file type."""
    if extract_file_type(name) is None:
        raise ValueError


def clear_file_type_suffix(file_name: str, file_type: str):
    """Strip surrounding whitespace and ``.<file_type>`` suffixes from a name.

    Every occurrence of ``.<file_type>`` is removed, which also collapses a
    doubled suffix such as ``report.pdf.pdf``.

    @param file_name: attachment name
    @param file_type: file extension without the dot; may be empty
    @return: cleaned attachment name
    """
    file_name = file_name.strip()
    # Guard against an empty type: "" is "in" every string and replacing "."
    # would strip every dot from the name.
    if file_type and file_type in file_name:
        file_name = file_name.replace(f".{file_type}", '')
    return file_name


def limit_file_size(file_size: str):
    """Tell whether an attachment is large enough to be stored.

    Attachments smaller than 5 kb are not stored in the database.

    @param file_size: human readable size such as "12.3 kb", "1.5 M" or
                      "2.0 G" (the format produced by
                      ``AttachmentDownloader._file_size``)
    @return: True when the size is at least 5 kb, otherwise False
    @raise ValueError: when ``file_size`` contains no number
    """
    _pattern = r"^[0-9]\d*\.\d*|[1-9]\d*"
    size_kb = float("".join(re.findall(_pattern, file_size)))

    unit = file_size.lower()
    if "g" in unit:
        # Without this branch a "1.5 G" file would be read as 1.5 kb.
        size_kb *= 1000 * 1000
    elif "m" in unit:
        size_kb *= 1000

    return size_kb >= 5


def judge_file_url(file_url: str):
    """Normalise an attachment url.

    Surrounding whitespace is stripped and, if the url still contains a
    space, everything from the first space on is dropped.
    """
    file_url = file_url.strip()
    if " " in file_url:
        file_url = file_url.split(" ")[0]
    return file_url


class AttachmentDownloader(AliYunService):
    """Download an attachment, stage it locally and push it to storage.

    The upload itself is delegated to ``AliYunService.push_oss_from_local``,
    which is defined outside this module.
    """

    def __init__(self):
        super(AttachmentDownloader, self).__init__()
        # Directory (relative to the working directory) for temporary files.
        self.dir_name = "file"

    def _create_file(self, filename, filetype):
        """Build a unique temporary file path inside ``self.dir_name``.

        The directory is created when missing. The file name is the SHA-1 of
        ``filename`` combined with a random uuid, so repeated downloads of
        the same attachment never collide.

        @return: path of the form ``<dir_name>/<sha1>.<filetype>``
        """
        os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
        file = "{filename}.{filetype}".format(
            filename=sha1("{}_{}".format(filename, uuid.uuid4())),
            filetype=filetype
        )
        return "{}/{}".format(self.dir_name, file)

    @staticmethod
    def _file_size(file: str):
        """Return the size of ``file`` as text: "x.x kb", "x.x M" or "x.x G".

        A missing file is reported as "0.0 kb".
        """
        _kb = float(getsize(file)) / 1024
        if _kb >= 1024:
            _M = _kb / 1024
            if _M >= 1024:
                _G = _M / 1024
                return "{:.1f} G".format(_G)
            else:
                return "{:.1f} M".format(_M)
        else:
            return "{:.1f} kb".format(_kb)

    @staticmethod
    def _fetch_attachment(
            callback,
            url: str,
            proxies=None,
            show_error_log=False,
            **kwargs
    ):
        """Download ``url`` with up to three attempts.

        @param callback: optional callable invoked as
                         ``callback(response, filetype_lst)``; it is expected
                         to append the detected file type to ``filetype_lst``
        @param url: download url
        @param proxies: proxies passed to ``requests.get``
        @param show_error_log: log request exceptions when True
        @param kwargs: optional ``headers``, ``timeout``, ``stream`` and
                       ``verify`` overrides for ``requests.get``
        @return: ``(content_bytes, filetype)`` on a 200 response, or ``b''``
                 when all attempts failed
        """
        stream_flag = kwargs.get("stream")
        request_params = {
            "proxies": proxies,
            "headers": kwargs.get("headers") or headers,
            "timeout": kwargs.get("timeout") or 60,
            # Default to streaming, but honour an explicit stream=False
            # ("x or True" would always be True).
            "stream": True if stream_flag is None else stream_flag,
            "verify": kwargs.get("verify") or False,
        }

        retries = 0
        while retries < 3:
            try:
                with requests.get(url, **request_params) as response:
                    if response.status_code == 200:
                        stream = response.content
                        filetype_lst = []  # filled in by the callback
                        if callable(callback):
                            # Let the custom parser read the file type from
                            # the response (e.g. from its headers).
                            callback(response, filetype_lst)
                        filetype = filetype_lst[0] if filetype_lst else ""
                        return stream, filetype
                    else:
                        retries += 1
            except requests.RequestException as why:
                retries += 1
                if show_error_log:
                    logger.exception(why)

        return b''

    def fetch_attachment(
            self,
            file_name: str,
            download_url: str,
            callback,
            **kwargs
    ):
        """Download an attachment, upload it and describe the result.

        @param file_name: attachment name
        @param download_url: attachment url
        @param callback: file-type parser forwarded to ``_fetch_attachment``
        @param kwargs: forwarded to ``_fetch_attachment`` (``proxies``,
                       ``show_error_log``, ``headers``, ``timeout`` ...)
        @return: dict with ``filename`` and ``org_url`` and, when content was
                 downloaded, ``fid``, ``size``, ``ftype`` and ``url``; an
                 empty dict when the downloaded file is smaller than 5 kb
        @raise AttachmentNullError: when the name or the url is empty
        """
        if not file_name or not download_url:
            raise AttachmentNullError

        results = self._fetch_attachment(callback, download_url, **kwargs)
        if isinstance(results, tuple) and len(results) == 2:
            file_stream, filetype = results
        else:
            # Download failed: _fetch_attachment returned b''. Indexing it
            # would raise IndexError, so fall back to "no content".
            file_stream, filetype = b"", ""

        filename = clear_file_type_suffix(file_name, filetype)
        download_url = judge_file_url(download_url)

        # Attachment info must be returned whether the upload/download
        # succeeded or failed.
        attachment = {
            "filename": "{}.{}".format(filename, filetype),
            "org_url": download_url
        }

        # Stage the content in a local temporary file.
        local_temp_file = self._create_file(filename, filetype)
        try:
            with open(local_temp_file, "wb") as f:
                f.write(file_stream)

            if len(file_stream) > 0:
                content_hash = tools.get_sha1(file_stream)
                try:
                    attachment["fid"] = "{}.{}".format(content_hash, filetype)
                    attachment["size"] = self._file_size(local_temp_file)
                    attachment["ftype"] = filetype
                    attachment["url"] = "oss"
                    super().push_oss_from_local(attachment["fid"], local_temp_file)
                except Exception as e:
                    # Best effort: an upload failure is logged, not raised.
                    logger.error(
                        "[{}]上传失败,原因:{}".format(file_name, e.__class__.__name__)
                    )
        finally:
            # Always clean up the temporary file, even on unexpected errors.
            remove(local_temp_file)

        if "size" not in attachment or limit_file_size(attachment.get("size")):
            return attachment
        else:
            return {}