attachment_res.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2022-03-06
  4. ---------
  5. @summary: 附件下载模块res
  6. ---------
  7. @author: Lzz
  8. """
  9. import os
  10. import re
  11. import uuid
  12. from urllib.parse import urlparse, unquote
  13. import requests
  14. import urllib3
  15. import feapder.utils.tools as tools
  16. from feapder.utils.log import log as logger
  17. from feapder.utils.oss import JyOssClient, OssBucketClient
# Silence urllib3's warnings process-wide at import time.
# NOTE(review): presumably needed because Downloader.fetch_data defaults to
# verify=False, which would otherwise warn on unverified HTTPS requests.
urllib3.disable_warnings()
  19. # 文件文档类型
  20. DOCTYPE = {
  21. "txt", "rtf", "dps", "et", "ett", "xls",
  22. "xlsx", "xlsb", "xlsm", "xlt", "ods", "pmd", "pmdx",
  23. "doc", "docm", "docx", "dot", "dotm", "dotx",
  24. "odt", "wps", "csv", "xml", "xps"
  25. }
  26. # 压缩类型
  27. COMPRESSION_TYPE = {
  28. "rar", "zip", "gzzb", "7z", "tar", "gz", "bz2", "jar", "iso", "cab",
  29. "arj", "lzh", "ace", "uue", "edxz",
  30. }
  31. # 图片类型
  32. IMAGE_TYPE = {
  33. "jpg", "png", "jpeg", "tiff", "gif", "psd", "raw", "eps", "svg", "bmp",
  34. "pdf"
  35. }
  36. # 其他类型
  37. OTHER_TYPE = {
  38. "swf", "nxzf", "xezf", "nxcf"
  39. }
  40. headers = {
  41. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
  42. "Accept": "*/*"
  43. }
  44. def remove(file_path: str):
  45. try:
  46. os.remove(file_path)
  47. except FileNotFoundError:
  48. pass
  49. def getsize(file):
  50. try:
  51. return os.path.getsize(file)
  52. except FileNotFoundError:
  53. return 0
  54. def discern_file_format(text, show_warn_log=False):
  55. """
  56. 识别文件格式
  57. @param text: 识别文本
  58. @param show_warn_log: 是否打印警告信息
  59. @return: 文件格式
  60. """
  61. file_types = {
  62. *DOCTYPE,
  63. *COMPRESSION_TYPE,
  64. *IMAGE_TYPE,
  65. *OTHER_TYPE
  66. }
  67. for file_type in file_types:
  68. all_file_format = [file_type, file_type.upper()]
  69. for t in all_file_format:
  70. result = re.match(f".*{t}$", text, re.S)
  71. if result is not None:
  72. return t
  73. else:
  74. unknown_type = re.findall('[^.\\/:*?"<>|\r\n]+$', text, re.S)
  75. if show_warn_log:
  76. logger.warning(f"[未识别文件类型]{unknown_type}")
  77. return None
  78. def extract_file_type(text):
  79. if text is None:
  80. return None
  81. return discern_file_format(text)
  82. def extract_file_name_by_href(href: str, file_type: str):
  83. """从url中抽取文件名称"""
  84. # 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
  85. # 中文字符:[\u4e00 -\u9fa5]
  86. zh_char_pattern = "[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+"
  87. parser = urlparse(href)
  88. query = (parser.query or parser.path)
  89. result = re.search(f".*\\.{file_type}", query, re.S)
  90. if result is not None:
  91. encode_str = unquote(result.group())
  92. name = re.search(zh_char_pattern, encode_str)
  93. if name is not None:
  94. return unquote(name.group())
  95. return None
  96. def extract_file_name(text):
  97. file_type = discern_file_format(text)
  98. if file_type is not None:
  99. repl = ".{}".format(file_type)
  100. text = text.replace(repl, '')
  101. return text
  102. def verify_file_name(name):
  103. if extract_file_type(name) is None:
  104. raise ValueError
  105. # 去除附件名空格、两个后缀
  106. def clear_file_type_suffix(file_name: str, file_type: str):
  107. file_name = file_name.strip()
  108. if file_type in file_name:
  109. file_name = file_name.replace(f".{file_type}", '')
  110. return file_name
  111. # 限制附件大小:size < 5 kb 不存入数据库
  112. def limit_file_size(file_size: str):
  113. _pattern = "^[0-9]\d*\.\d*|[1-9]\d*"
  114. if "M" in file_size or "m" in file_size:
  115. file_size = float("".join(re.findall(_pattern, file_size))) * 1000
  116. else:
  117. file_size = "".join(re.findall(_pattern, file_size))
  118. if float(file_size) < 5:
  119. return False
  120. else:
  121. return True
  122. # 判断附件地址是否正确
  123. def judge_file_url(file_url: str):
  124. file_url = file_url.strip()
  125. if " " in file_url:
  126. file_url = file_url.split(" ")[0]
  127. return file_url
  128. class Downloader:
  129. def __init__(self):
  130. self.dir_name = "file"
  131. # self._oss = JyOssClient()
  132. self._bucket = OssBucketClient()
  133. def _create_file(self, filename, filetype):
  134. os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
  135. file = "{filename}.{filetype}".format(
  136. filename=tools.get_sha1("{}_{}".format(filename, uuid.uuid4())),
  137. filetype=filetype
  138. )
  139. return "{}/{}".format(self.dir_name, file)
  140. @staticmethod
  141. def _file_size(file: str):
  142. _kb = float(getsize(file)) / 1024
  143. if _kb >= 1024:
  144. _M = _kb / 1024
  145. if _M >= 1024:
  146. _G = _M / 1024
  147. return "{:.1f} G".format(_G)
  148. else:
  149. return "{:.1f} M".format(_M)
  150. else:
  151. return "{:.1f} kb".format(_kb)
  152. @staticmethod
  153. def fetch_data(
  154. url: str,
  155. callback=None,
  156. proxies=None,
  157. show_error_log=False,
  158. **kwargs
  159. ):
  160. """
  161. 下载数据
  162. @param url: 文件下载地址
  163. @param callback: 回调函数 可以是函数 也可是函数名
  164. @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
  165. @param show_error_log: 展示错误堆栈信息日志
  166. """
  167. method = kwargs.pop("method", "get")
  168. request_params = {}
  169. request_params.setdefault("proxies", proxies)
  170. request_params.setdefault("headers", kwargs.get("headers") or headers)
  171. request_params.setdefault("timeout", kwargs.pop("timeout", 60))
  172. request_params.setdefault("params", kwargs.pop("params", None))
  173. request_params.setdefault("data", kwargs.pop("data", None))
  174. request_params.setdefault("json", kwargs.pop("json", None))
  175. request_params.setdefault("cookies", kwargs.pop("cookies", None))
  176. request_params.setdefault("verify", kwargs.pop("verify", False))
  177. request_params.setdefault("stream", kwargs.pop("stream", True))
  178. retries = 0
  179. while retries < 3:
  180. try:
  181. with requests.request(method, url, **request_params) as response:
  182. if response.status_code == 200:
  183. stream = response.content
  184. filetype_lst = [] # 文件类型列表
  185. if callable(callback):
  186. # 通过自定义的回调函数 获取 response.headers 文件类型添加到 filetype_lst
  187. callback(response, filetype_lst)
  188. filetype = filetype_lst[0] if filetype_lst else ""
  189. return stream, filetype
  190. else:
  191. retries += 1
  192. except requests.RequestException as why:
  193. retries += 1
  194. if show_error_log:
  195. logger.exception(why)
  196. return b"", ""
  197. def fetch_attachment(
  198. self,
  199. file_name: str,
  200. download_url: str,
  201. file_type=None,
  202. callback=None,
  203. **kwargs
  204. ):
  205. file_kwargs = dict(callback=callback, url=download_url, **kwargs)
  206. filestream, filetype = self.fetch_data(**file_kwargs)
  207. filetype = file_type or filetype
  208. filename = clear_file_type_suffix(file_name, filetype)
  209. download_url = judge_file_url(download_url)
  210. # 保存本地临时文件
  211. local_temp_file = self._create_file(filename, filetype)
  212. with open(local_temp_file, "wb") as f:
  213. f.write(filestream)
  214. '''上传/下载,无论失败/成功必须返回附件信息'''
  215. attachment = {
  216. "filename": "{}.{}".format(filename, filetype),
  217. "org_url": download_url
  218. }
  219. if len(filestream) > 0:
  220. content_hash = tools.get_sha1(filestream)
  221. try:
  222. attachment["fid"] = "{}.{}".format(content_hash, filetype)
  223. attachment["size"] = self._file_size(local_temp_file)
  224. attachment["ftype"] = filetype
  225. attachment["url"] = "oss"
  226. # self._oss.upload("file", attachment["fid"], filestream)
  227. self._bucket.put_object_from_file(attachment["fid"], local_temp_file)
  228. except Exception as e:
  229. logger.error(
  230. "[{}]上传失败,原因:{}".format(file_name, e.__class__.__name__)
  231. )
  232. remove(local_temp_file) # 删除本地临时文件
  233. if "size" not in attachment or limit_file_size(attachment.get("size")):
  234. return attachment
  235. else:
  236. return {}
# Second module-level name bound to the same Downloader class.
AttachmentDownloader = Downloader