attachment.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. import hashlib
  2. import io
  3. import os
  4. import traceback
  5. import uuid
  6. import requests
  7. import tqdm
  8. import urllib3
  9. from untils.aliyun import AliYunService
  10. from untils.execptions import AttachmentNullError
  11. from untils.proxy_pool import ProxyPool
  12. urllib3.disable_warnings()
  13. headers = {
  14. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
  15. 'Accept': '*/*'
  16. }
  17. class AttachmentDownloader:
  18. """附件下载模块"""
  19. def __init__(self):
  20. self.dir_name = 'file'
  21. def create_dir(self):
  22. if not os.path.exists(self.dir_name):
  23. os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
  24. def create_file(self, filename, file_type):
  25. self.create_dir()
  26. sign = self._hash("{}_{}".format(filename, uuid.uuid4()))
  27. local_file_name = "{}.{}".format(sign, file_type)
  28. return "{}/{}".format(self.dir_name, local_file_name)
  29. def create_fid(self, data: bytes):
  30. return self._hash(data)
  31. @staticmethod
  32. def _hash(val):
  33. _sha1 = hashlib.sha1()
  34. if isinstance(val, bytes):
  35. _sha1.update(str(val).encode("utf-8"))
  36. elif isinstance(val, str):
  37. _sha1.update(val.encode("utf-8"))
  38. return _sha1.hexdigest()
  39. @staticmethod
  40. def clean_attachment(file_path):
  41. """
  42. 删除文件
  43. :param str file_path: 文件路径
  44. """
  45. try:
  46. os.remove(file_path)
  47. except FileNotFoundError:
  48. pass
  49. def remove(self, file):
  50. self.clean_attachment(file)
  51. @staticmethod
  52. def get_mb(data):
  53. """
  54. 获取数据的Mb
  55. :param int data: 准备计算大小的内容
  56. :return: float
  57. """
  58. _kb = float(data / 1024.0)
  59. return float(_kb / 1024.0)
  60. @staticmethod
  61. def getsize(data):
  62. """
  63. 计算数据大小
  64. :param data: 待上传的内容。
  65. :type data: bytes,str或file-like object
  66. :return str
  67. """
  68. size = 0
  69. if isinstance(data, str):
  70. try:
  71. size = os.path.getsize(data)
  72. except FileNotFoundError:
  73. pass
  74. elif isinstance(data, bytes):
  75. size = len(data)
  76. else:
  77. pass
  78. _kb = float(size) / 1024
  79. result = "{:.1f} kb".format(_kb)
  80. if _kb >= 1024:
  81. _M = _kb / 1024
  82. if _M >= 1024:
  83. _G = _M / 1024
  84. result = "{:.1f} G".format(_G)
  85. else:
  86. result = "{:.1f} M".format(_M)
  87. return result
  88. def fetch_data(self, url, file=None, **kwargs):
  89. """
  90. 数据下载
  91. :param str url: 下载地址
  92. :param file: 本地文件
  93. :param dict kwargs: requests请求参数
  94. :return:
  95. """
  96. enable_proxy = kwargs.pop('enable_proxy', False)
  97. allow_show_exception = kwargs.pop('allow_show_exception', False)
  98. method = kwargs.pop('method', 'get')
  99. request_params = {}
  100. request_params.setdefault('data', kwargs.pop('data', None))
  101. request_params.setdefault('cookies', kwargs.pop('cookies', None))
  102. request_params.setdefault('headers', kwargs.get('headers') or headers)
  103. request_params.setdefault('proxies', kwargs.get('proxies'))
  104. request_params.setdefault('timeout', kwargs.pop('timeout', 60))
  105. request_params.setdefault('stream', kwargs.pop('stream', True))
  106. request_params.setdefault('verify', kwargs.pop('verify', False))
  107. request_params.setdefault('allow_redirects', kwargs.pop('allow_redirects', True))
  108. retries = 0
  109. while retries < 3:
  110. try:
  111. with requests.request(method, url, **request_params) as req:
  112. stream = io.BytesIO()
  113. lh = {k.lower(): v for k, v in req.headers.items()}
  114. '''内容长度'''
  115. cl = lh.get('content-length') or len(req.content)
  116. icl = int(cl)
  117. content_length = self.get_mb(icl)
  118. if content_length > 50:
  119. '''丢弃超过50Mb内容长度的文件'''
  120. return stream.getvalue()
  121. if req.status_code != 200:
  122. retries += 1
  123. continue
  124. iter_content = req.iter_content(chunk_size=1024 * 20)
  125. with tqdm.tqdm(
  126. total=icl,
  127. unit='B',
  128. initial=0,
  129. unit_scale=True,
  130. unit_divisor=1024, # 1M=1024Kb,单位换算
  131. ascii=True,
  132. desc=file) as bar:
  133. if file is not None:
  134. with open(file, 'wb') as f:
  135. for chunk in iter_content:
  136. stream.write(chunk)
  137. size = f.write(chunk)
  138. bar.update(size)
  139. else:
  140. for chunk in iter_content:
  141. size = stream.write(chunk)
  142. bar.update(size)
  143. return stream.getvalue()
  144. except requests.RequestException:
  145. if allow_show_exception:
  146. traceback.print_exc()
  147. if enable_proxy:
  148. request_params.update({'proxies': ProxyPool().get()})
  149. retries += 1
  150. return b''
  151. def _push_oss_from_stream(self, file_name, file_type, url, **kw):
  152. """
  153. 将数据流推送oss
  154. :param str file_name: 文件名称
  155. :param str file_type: 文件类型
  156. :param str url: 下载地址
  157. :param dict kw: 额外下载信息
  158. :return: dict: 附件信息
  159. """
  160. stream = self.fetch_data(url, None, **kw)
  161. if len(stream) > 0:
  162. fid = self.create_fid(stream)
  163. try:
  164. result = {
  165. 'filename': file_name,
  166. 'ftype': file_type,
  167. 'fid': "{}.{}".format(fid, file_type),
  168. 'org_url': url,
  169. 'size': self.getsize(stream),
  170. 'url': 'oss',
  171. }
  172. AliYunService().push_oss_from_stream(result['fid'], stream)
  173. except Exception:
  174. result = {
  175. 'filename': file_name,
  176. 'org_url': url,
  177. }
  178. else:
  179. result = {
  180. 'filename': file_name,
  181. 'org_url': url,
  182. }
  183. return result
  184. def _push_oss_from_file(self, file_name, file_type, url, **kw):
  185. """
  186. 将本地文件推送oss
  187. :param str file_name: 文件名称
  188. :param str file_type: 文件类型
  189. :param str url: 下载地址
  190. :param dict kw: 额外下载信息
  191. :return: dict: 附件信息
  192. """
  193. file = self.create_file(file_name, file_type)
  194. stream = self.fetch_data(url, file, **kw)
  195. '''上传/下载,无论失败成功都需要返回文件基础信息'''
  196. if len(stream) > 0:
  197. fid = self.create_fid(stream)
  198. try:
  199. result = {
  200. 'filename': file_name,
  201. 'ftype': file_type,
  202. 'fid': "{}.{}".format(fid, file_type),
  203. 'org_url': url,
  204. 'size': self.getsize(file),
  205. 'url': 'oss',
  206. }
  207. AliYunService().push_oss_from_local(result['fid'], file)
  208. except Exception:
  209. result = {
  210. 'filename': file_name,
  211. 'org_url': url,
  212. }
  213. else:
  214. result = {
  215. 'filename': file_name,
  216. 'org_url': url,
  217. }
  218. '''删除本地临时文件'''
  219. self.remove(file)
  220. return result
  221. def _fetch_attachment(self, file_name, file_type, download_url, **kwargs):
  222. """
  223. 下载附件
  224. :param str file_name: 文件名称
  225. :param str file_type: 文件类型
  226. :param str download_url: 下载地址
  227. :param dict kwargs: 额外的附件下载配置
  228. :return: dict: 附件
  229. """
  230. mode = kwargs.pop('mode', 'local')
  231. if mode == "stream":
  232. res = self._push_oss_from_stream(
  233. file_name,
  234. file_type,
  235. download_url,
  236. **kwargs
  237. )
  238. else:
  239. res = self._push_oss_from_file(
  240. file_name,
  241. file_type,
  242. download_url,
  243. **kwargs
  244. )
  245. return res
  246. def fetch_attachment(
  247. self,
  248. file_name: str,
  249. file_type: str,
  250. download_url: str,
  251. **kw
  252. ):
  253. if not file_name or not file_type or not download_url:
  254. raise AttachmentNullError
  255. return self._fetch_attachment(file_name, file_type, download_url, **kw)