attachment.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. import hashlib
  2. import os
  3. import re
  4. import traceback
  5. import uuid
  6. import warnings
  7. from urllib.parse import urlparse, unquote
  8. import requests
  9. import urllib3
  10. from config.load import headers
  11. from utils.aliyun import AliYunService
  12. from utils.execptions import AttachmentNullError
  13. from utils.socks5 import Proxy
  # Silence urllib3 warnings process-wide; downloads in this module default to verify=False.
  14. urllib3.disable_warnings()
  def sha1(val):
      """Return the hexadecimal SHA-1 digest of ``val``.

      ``str`` input is hashed as its UTF-8 encoding.  ``bytes`` input is first
      converted with ``str()`` (yielding the ``b'...'`` representation) and that
      text is hashed, not the raw bytes.  Any other type feeds nothing into the
      hash, so the digest of empty input is returned.
      """
      hasher = hashlib.sha1()
      if isinstance(val, (str, bytes)):
          # bytes are converted with str() before encoding; see the docstring.
          text = val if isinstance(val, str) else str(val)
          hasher.update(text.encode("utf-8"))
      return hasher.hexdigest()
  def remove(file_path: str):
      """Delete the file at ``file_path``.

      Operating-system errors (for example a missing file) propagate to the
      caller unchanged.
      """
      # os.unlink is documented as semantically identical to os.remove.
      os.unlink(file_path)
  def getsize(file_path: str):
      """Return the size of ``file_path`` in bytes, or 0 if the file is missing."""
      try:
          size = os.stat(file_path).st_size
      except FileNotFoundError:
          size = 0
      return size
  def discern_file_format(text):
      """Return the known attachment extension that ``text`` ends with.

      The check is a plain suffix test (no leading dot is required) against a
      fixed list of file types, each tried in all-lowercase and all-uppercase
      form.  The matched spelling is returned as-is, e.g. ``'pdf'`` or
      ``'PDF'``; mixed-case spellings are not recognised.

      :param text: a file name, link text or URL; may be ``None`` or empty.
      :return: the matching extension, or ``None`` when nothing matches.
      """
      if not text:
          return None

      known_types = (
          'pdf', 'doc', 'docx', 'rar', 'zip', 'gzzb', 'jpg', 'png', 'swf'
      )
      # Like a regex ``$`` anchor, tolerate exactly one trailing newline.
      candidate = text[:-1] if text.endswith('\n') else text
      for file_type in known_types:
          for suffix in (file_type, file_type.upper()):
              if candidate.endswith(suffix):
                  return suffix
      return None
  def extract_file_type(text):
      """Return the file type found by ``discern_file_format``, or ``None``.

      ``None`` input is passed through as ``None`` without being inspected.
      """
      return None if text is None else discern_file_format(text)
  def extract_file_name_by_href(href: str, file_type: str):
      """Extract a Chinese file name from a download URL.

      The query string of ``href`` (or its path when the query is empty) is
      searched for text ending in ``.<file_type>``.  That text is
      percent-decoded and the first run of Chinese characters / Chinese
      punctuation found in it is returned.

      :param href: the download URL.
      :param file_type: the extension to look for, without the leading dot.
      :return: the extracted name, or ``None`` when either argument is empty,
          the extension does not occur, or no Chinese text precedes it.
      """
      if not href or not file_type:
          return None

      # Chinese punctuation: U+3002 U+FF1B U+FF0C U+FF1A U+201C U+201D U+FF08
      #                      U+FF09 U+3001 U+FF1F U+300A U+300B
      # Chinese characters:  U+4E00 - U+9FA5
      zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'

      parsed = urlparse(href)
      target = parsed.query or parsed.path
      # Escape the extension so regex metacharacters in it are matched literally.
      matched = re.search(f'.*\\.{re.escape(file_type)}', target, re.S)
      if matched is None:
          return None

      name = re.search(zh_char_pattern, unquote(matched.group()))
      # The match consists solely of Chinese characters, so it needs no
      # further percent-decoding.
      return name.group() if name is not None else None
  def extract_file_name(text):
      """Return ``text`` with its recognised file extension removed.

      The extension is whatever ``discern_file_format`` reports for ``text``.
      Only the last occurrence of ``.<ext>`` is dropped, so identical text
      earlier in the name (``a.pdf.b.pdf``) is left intact.  When no known
      extension is found, ``text`` is returned unchanged.

      :param text: a file name or link text; ``None`` yields ``None``.
      :return: the name without its extension.
      """
      if text is None:
          return None

      file_type = discern_file_format(text)
      if file_type is None:
          return text

      # rpartition splits on the LAST ".<ext>"; if it is absent the whole
      # text comes back in ``tail`` and the result equals the input.
      head, _, tail = text.rpartition('.{}'.format(file_type))
      return head + tail
  def verify_file_name(name):
      """Raise ``ValueError`` when ``extract_file_type`` finds no file type in ``name``."""
      file_type = extract_file_type(name)
      if file_type is None:
          raise ValueError
  class AttachmentDownloader(AliYunService):
      """Download an attachment to a temporary local file and push it to OSS.

      The upload itself is delegated to ``_push_oss_from_local`` inherited from
      ``AliYunService`` (defined outside this file).
      """

      def __init__(self):
          super().__init__()
          # Directory (relative to the working directory) for temporary files.
          self.dir_name = 'file'

      def _create_file(self, filename, filetype):
          """Return a unique temporary path ``<dir_name>/<sha1>.<filetype>``.

          The directory is created on demand.  A random UUID is mixed into the
          hashed name, so two downloads of the same ``filename`` get different
          paths.
          """
          os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
          unique_name = sha1("{}_{}".format(filename, uuid.uuid4()))
          return "{}/{}.{}".format(self.dir_name, unique_name, filetype)

      @staticmethod
      def _create_fid(file_stream: bytes):
          """Return the content-derived id (module-level ``sha1``) of the payload."""
          return sha1(file_stream)

      @staticmethod
      def _origin_filename(fid: str, filetype: str):
          """Return the storage key ``<fid>.<filetype>``."""
          return "{}.{}".format(fid, filetype)

      @staticmethod
      def _file_size(file: str):
          """Return the size of ``file`` as text: ``'x.x kb'``, ``'x.x M'`` or ``'x.x G'``.

          Sizes are scaled by 1024 per step; a missing file reports ``'0.0 kb'``
          because ``getsize`` returns 0 for it.
          """
          size_kb = getsize(file) / 1024
          if size_kb < 1024:
              return "{:.1f} kb".format(size_kb)
          size_mb = size_kb / 1024
          if size_mb < 1024:
              return "{:.1f} M".format(size_mb)
          return "{:.1f} G".format(size_mb / 1024)

      @staticmethod
      def _download(
              url: str,
              file: str,
              enable_proxy=False,
              allow_show_exception=False,
              **kwargs
      ):
          """GET ``url`` and write the body to ``file``; up to three attempts.

          :param url: address to download.
          :param file: local path the body is written to on a 200 response.
          :param enable_proxy: passed to ``Proxy``; when true, the proxy is
              switched after every failed request.
          :param allow_show_exception: print the traceback of request errors.
          :param kwargs: optional ``headers``, ``proxies``, ``timeout``,
              ``stream`` and ``verify`` forwarded to ``requests.get``.
          :return: the response body, or ``b''`` when no attempt returned 200.
          """
          stream = kwargs.get('stream')
          verify = kwargs.get('verify')
          request_params = {
              'headers': kwargs.get('headers') or headers,
              'proxies': kwargs.get('proxies'),
              'timeout': kwargs.get('timeout') or 60,
              # Default only when the caller gave nothing: ``x or True`` would
              # silently turn an explicit stream=False back into True.
              'stream': True if stream is None else stream,
              'verify': False if verify is None else verify,
          }
          proxy = Proxy(enable_proxy)
          # NOTE(review): the first attempt uses the caller's ``proxies`` only;
          # ``proxy.proxies`` is applied after a failure - confirm intended.
          for _ in range(3):
              try:
                  with requests.get(url, **request_params) as response:
                      if response.status_code == 200:
                          content = response.content
                          with open(file, 'wb') as f:
                              f.write(content)
                          return content
                      # Any other status just consumes one attempt.
              except requests.RequestException:
                  if allow_show_exception:
                      traceback.print_exc()
                  if enable_proxy:
                      proxy.switch()
                      request_params.update({'proxies': proxy.proxies})
          return b''

      def download(
              self,
              file_name: str,
              file_type: str,
              download_url: str,
              enable_proxy=False,
              allow_request_exception=False,
              **kwargs
      ):
          """Download one attachment, upload it, and describe the outcome.

          :param file_name: display name of the attachment, without extension.
          :param file_type: extension, without the leading dot.
          :param download_url: address to fetch the attachment from.
          :param enable_proxy: forwarded to ``_download``.
          :param allow_request_exception: forwarded to ``_download`` as
              ``allow_show_exception``.
          :param kwargs: extra request options forwarded to ``_download``.
          :return: a dict that always holds ``filename`` and ``org_url``; when
              a non-empty body was downloaded it also holds ``fid``, ``ftype``,
              ``size`` and ``url``.
          :raises AttachmentNullError: if any of the three required arguments
              is empty.
          """
          if not file_name or not file_type or not download_url:
              raise AttachmentNullError

          local_tmp_file = self._create_file(file_name, file_type)
          result = {
              'filename': '{}.{}'.format(file_name, file_type),
              'org_url': download_url
          }
          try:
              file_stream = self._download(
                  download_url,
                  local_tmp_file,
                  enable_proxy,
                  allow_request_exception,
                  **kwargs
              )
              if file_stream:
                  try:
                      fid = self._create_fid(file_stream)
                      key = self._origin_filename(fid, file_type)
                      # NOTE(review): these fields are recorded before the
                      # upload runs, so they remain in the result even if the
                      # upload below fails - confirm callers expect that.
                      result['fid'] = key
                      result['ftype'] = file_type
                      result['size'] = self._file_size(local_tmp_file)
                      result['url'] = 'oss'
                      super()._push_oss_from_local(key, local_tmp_file)
                  except Exception as e:
                      # Best effort: a failed upload is reported as a warning so
                      # the attachment info is still returned.
                      warnings.warn(
                          "[{}]下载异常,原因:{}".format(file_name, e.__class__.__name__)
                      )
          finally:
              # The temp file exists only if a 200 response was written; a
              # failed download must not turn into FileNotFoundError here.
              try:
                  remove(local_tmp_file)
              except FileNotFoundError:
                  pass
          # Whether upload/download failed or succeeded, the attachment info
          # must be returned.
          return result