3
0

attachment_res.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. import hashlib
  2. import os
  3. import re
  4. import traceback
  5. import uuid
  6. from urllib.parse import urlparse, unquote
  7. import requests
  8. import urllib3
  9. from feapder.utils.log import log as logger
  10. from untils.aliyun import AliYunService
  11. from untils.execptions import AttachmentNullError
  12. from untils.proxy_pool import ProxyPool
  13. urllib3.disable_warnings()
  14. # 文件文档类型
  15. DOCTYPE = {
  16. 'txt', 'rtf', 'dps', 'et', 'ett', 'xls',
  17. 'xlsx', 'xlsb', 'xlsm', 'xlt', 'ods', 'pmd', 'pmdx',
  18. 'doc', 'docm', 'docx', 'dot', 'dotm', 'dotx',
  19. 'odt', 'wps', 'csv', 'xml', 'xps'
  20. }
  21. # 压缩类型
  22. COMPRESSION_TYPE = {
  23. 'rar', 'zip', 'gzzb', '7z', 'tar', 'gz', 'bz2', 'jar', 'iso', 'cab',
  24. 'arj', 'lzh', 'ace', 'uue', 'edxz',
  25. }
  26. # 图片类型
  27. IMAGE_TYPE = {
  28. 'jpg', 'png', 'jpeg', 'tiff', 'gif', 'psd', 'raw', 'eps', 'svg', 'bmp',
  29. 'pdf'
  30. }
  31. # 其他类型
  32. OTHER_TYPE = {
  33. 'swf', 'nxzf', 'xezf', 'nxcf'
  34. }
  35. headers = {
  36. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
  37. 'Accept': '*/*'
  38. }
  39. def sha1(val):
  40. _sha1 = hashlib.sha1()
  41. if isinstance(val, bytes):
  42. _sha1.update(str(val).encode("utf-8"))
  43. elif isinstance(val, str):
  44. _sha1.update(val.encode("utf-8"))
  45. return _sha1.hexdigest()
  46. def remove(file_path: str):
  47. try:
  48. os.remove(file_path)
  49. except FileNotFoundError:
  50. pass
  51. def getsize(file):
  52. try:
  53. return os.path.getsize(file)
  54. except FileNotFoundError:
  55. return 0
  56. def discern_file_format(text, allow_show_waring=False):
  57. """
  58. 识别文件格式
  59. @param text: 识别文本
  60. @param allow_show_waring: 是否打印警告信息
  61. @return: 文件格式
  62. """
  63. file_types = {
  64. *DOCTYPE,
  65. *COMPRESSION_TYPE,
  66. *IMAGE_TYPE,
  67. *OTHER_TYPE
  68. }
  69. for file_type in file_types:
  70. all_file_format = [file_type, file_type.upper()]
  71. for t in all_file_format:
  72. result = re.match(f'.*{t}$', text, re.S)
  73. if result is not None:
  74. return t
  75. else:
  76. unknown_type = re.findall('[^.\\/:*?"<>|\r\n]+$', text, re.S)
  77. if allow_show_waring:
  78. logger.warning(f'[未识别文件类型]{unknown_type}')
  79. return None
  80. def extract_file_type(text):
  81. if text is None:
  82. return None
  83. return discern_file_format(text)
  84. def extract_file_name_by_href(href: str, file_type: str):
  85. """从url中抽取文件名称"""
  86. # 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
  87. # 中文字符:[\u4e00 -\u9fa5]
  88. zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
  89. parser = urlparse(href)
  90. query = (parser.query or parser.path)
  91. result = re.search(f'.*\\.{file_type}', query, re.S)
  92. if result is not None:
  93. encode_str = unquote(result.group())
  94. name = re.search(zh_char_pattern, encode_str)
  95. if name is not None:
  96. return unquote(name.group())
  97. return None
  98. def extract_file_name(text):
  99. file_type = discern_file_format(text)
  100. if file_type is not None:
  101. repl = '.{}'.format(file_type)
  102. text = text.replace(repl, '')
  103. return text
  104. def verify_file_name(name):
  105. if extract_file_type(name) is None:
  106. raise ValueError
  107. # 去除附件名空格、两个后缀
  108. def clean_file_name(file_name: str, file_type: str):
  109. file_name = file_name.strip()
  110. if file_type in file_name:
  111. file_name = file_name.replace(f'.{file_type}', '')
  112. return file_name
  113. # 限制附件大小:size < 5 kb 不存入数据库
  114. def limit_file_size(file_size: str):
  115. _pattern = '^[0-9]\d*\.\d*|[1-9]\d*'
  116. if "M" in file_size or "m" in file_size:
  117. file_size = float("".join(re.findall(_pattern, file_size))) * 1000
  118. else:
  119. file_size = "".join(re.findall(_pattern, file_size))
  120. if float(file_size) < 5:
  121. return False
  122. else:
  123. return True
  124. # 判断附件地址是否正确
  125. def judge_file_url(file_url: str):
  126. file_url = file_url.strip()
  127. if " " in file_url:
  128. file_url = file_url.split(" ")[0]
  129. return file_url
  130. class AttachmentDownloader(AliYunService):
  131. def __init__(self):
  132. super(AttachmentDownloader, self).__init__()
  133. self.dir_name = 'file'
  134. def _create_file(self, filename, filetype):
  135. os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
  136. file = "{filename}.{filetype}".format(
  137. filename=sha1("{}_{}".format(filename, uuid.uuid4())),
  138. filetype=filetype
  139. )
  140. return "{}/{}".format(self.dir_name, file)
  141. @staticmethod
  142. def _create_fid(file_stream: bytes):
  143. return sha1(file_stream)
  144. @staticmethod
  145. def _origin_filename(fid: str, filetype: str):
  146. return "{}.{}".format(fid, filetype)
  147. @staticmethod
  148. def _file_size(file: str):
  149. _kb = float(getsize(file)) / 1024
  150. if _kb >= 1024:
  151. _M = _kb / 1024
  152. if _M >= 1024:
  153. _G = _M / 1024
  154. return "{:.1f} G".format(_G)
  155. else:
  156. return "{:.1f} M".format(_M)
  157. else:
  158. return "{:.1f} kb".format(_kb)
  159. @staticmethod
  160. def _fetch_attachment(
  161. get_file_type:str,
  162. file_type_name:str,
  163. url: str,
  164. enable_proxy=False,
  165. proxy={},
  166. allow_show_exception=False,
  167. **kwargs
  168. ):
  169. request_params = {}
  170. request_params.setdefault('headers', kwargs.get('headers') or headers)
  171. request_params.setdefault('proxies', kwargs.get('proxies'))
  172. request_params.setdefault('timeout', kwargs.get('timeout') or 60)
  173. request_params.setdefault('stream', kwargs.get('stream') or True)
  174. request_params.setdefault('verify', kwargs.get('verify') or False)
  175. if enable_proxy:
  176. proxy = ProxyPool()
  177. else:
  178. proxy = proxy
  179. retries = 0
  180. while retries < 3:
  181. try:
  182. with requests.get(url, **request_params) as req:
  183. if req.status_code == 200:
  184. stream = req.content
  185. '''
  186. file_type_name 响应头中附件后缀所对应的键
  187. get_file_type 取附件后缀的规则
  188. file_type_txt 附件响应头
  189. '''
  190. if len(get_file_type) > 10:
  191. file_types = []
  192. file_type_txt = req.headers.get(file_type_name)
  193. exec(get_file_type)
  194. if file_types:
  195. file_type = file_types[0]
  196. else:
  197. file_type = ''
  198. return stream,file_type
  199. else:
  200. return stream, get_file_type
  201. else:
  202. retries += 1
  203. except requests.RequestException:
  204. if allow_show_exception:
  205. traceback.print_exc()
  206. if enable_proxy:
  207. request_params.update({'proxies': proxy.get()})
  208. retries += 1
  209. return b''
  210. def fetch_attachment(
  211. self,
  212. get_file_type:str,
  213. file_name: str,
  214. file_type_name: str,
  215. download_url: str,
  216. enable_proxy=False,
  217. allow_show_exception=False,
  218. **kwargs
  219. ):
  220. if not file_name or not download_url:
  221. raise AttachmentNullError
  222. file_stream = self._fetch_attachment(
  223. get_file_type,
  224. file_type_name,
  225. download_url,
  226. enable_proxy,
  227. allow_show_exception=allow_show_exception,
  228. **kwargs
  229. )
  230. if len(file_stream) == 2:
  231. file_type = file_stream[-1]
  232. else:
  233. file_type = ''
  234. file_name = clean_file_name(file_name,file_type)
  235. download_url = judge_file_url(download_url)
  236. local_tmp_file = self._create_file(file_name, file_type)
  237. with open(local_tmp_file, 'wb') as f:
  238. f.write(file_stream[0])
  239. result = {
  240. 'filename': '{}.{}'.format(file_name, file_type),
  241. 'org_url': download_url
  242. }
  243. if len(file_stream[0]) > 0:
  244. try:
  245. fid = self._create_fid(file_stream[0])
  246. key = self._origin_filename(fid, file_type)
  247. result.setdefault('fid', key)
  248. result.setdefault('ftype', file_type)
  249. result.setdefault('size', self._file_size(local_tmp_file))
  250. result.setdefault('url', 'oss')
  251. super().push_oss_from_local(key, local_tmp_file)
  252. except Exception as e:
  253. logger.warning(
  254. "[{}]下载异常,原因:{}".format(file_name, e.__class__.__name__)
  255. )
  256. remove(local_tmp_file)
  257. '''上传/下载,无论失败/成功必须返回附件信息'''
  258. if "size" not in result or limit_file_size(result.get('size')):
  259. return result
  260. else:
  261. return {}