attachment.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. import hashlib
  2. import os
  3. import re
  4. import traceback
  5. import uuid
  6. from urllib.parse import urlparse, unquote
  7. import requests
  8. import urllib3
  9. from feapder.setting import headers
  10. from untils.execptions import AttachmentNullError
  11. from untils.aliyun import AliYunService
  12. from untils.proxy_pool import ProxyPool
  13. urllib3.disable_warnings()
  14. def hex_sha1(val):
  15. sha1 = hashlib.sha1()
  16. if isinstance(val, bytes):
  17. sha1.update(str(val).encode("utf-8"))
  18. elif isinstance(val, str):
  19. sha1.update(val.encode("utf-8"))
  20. res = sha1.hexdigest()
  21. return res
  22. def extract_file_type(text):
  23. if text is None:
  24. return None
  25. file_types = {
  26. 'pdf', 'doc', 'docx', 'rar', 'zip', 'gzzb', 'jpg', 'png'
  27. }
  28. for file_type in file_types:
  29. tmp = [file_type, file_type.upper()]
  30. for t in tmp:
  31. result = re.match(f'.*{t}$', text, re.S)
  32. if result is not None:
  33. return t
  34. else:
  35. return None
  36. def extract_file_name(href: str, file_type: str):
  37. # 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
  38. # 中文字符:[\u4e00 -\u9fa5]
  39. zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
  40. parser = urlparse(href)
  41. query = (parser.query or parser.path)
  42. result = re.search(f'.*\\.{file_type}', query, re.S)
  43. if result is not None:
  44. encode_str = unquote(result.group())
  45. name = re.search(zh_char_pattern, encode_str)
  46. if name is not None:
  47. return unquote(name.group())
  48. return None
  49. def verify_file_name(name):
  50. if extract_file_type(name) is None:
  51. raise ValueError
  52. class AttachmentDownloader:
  53. def __init__(self):
  54. self.dir_name = '/file'
  55. def create_dir(self):
  56. if not os.path.exists(self.dir_name):
  57. os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
  58. def create_file_path(self, filename, file_type):
  59. self.create_dir()
  60. sign = hex_sha1("{}_{}".format(filename, uuid.uuid4()))
  61. tmp_name = "{}.{}".format(sign, file_type)
  62. return "{}/{}".format(self.dir_name, tmp_name)
  63. @staticmethod
  64. def create_fid(file_stream: bytes):
  65. return hex_sha1(file_stream)
  66. @staticmethod
  67. def _fetch_attachment(
  68. url: str,
  69. file_path: str,
  70. enable_proxy=False,
  71. allow_show_exception=False,
  72. **kwargs
  73. ):
  74. request_params = {}
  75. request_params.setdefault('headers', kwargs.get('headers') or headers)
  76. request_params.setdefault('proxies', kwargs.get('proxies'))
  77. request_params.setdefault('timeout', kwargs.get('timeout') or 60)
  78. request_params.setdefault('stream', kwargs.get('stream') or True)
  79. request_params.setdefault('verify', kwargs.get('verify') or False)
  80. if enable_proxy:
  81. proxy = ProxyPool()
  82. else:
  83. proxy = {}
  84. retries = 0
  85. while retries < 3:
  86. try:
  87. with requests.get(url, **request_params) as req:
  88. if req.status_code == 200:
  89. stream = req.content
  90. with open(file_path, 'wb') as f:
  91. f.write(stream)
  92. return stream
  93. else:
  94. retries += 1
  95. except requests.RequestException:
  96. if allow_show_exception:
  97. traceback.print_exc()
  98. if enable_proxy:
  99. request_params.update({'proxies': proxy.get()})
  100. retries += 1
  101. return b''
  102. @staticmethod
  103. def clean_attachment(file_path):
  104. os.remove(file_path)
  105. @staticmethod
  106. def getsize(file_path: str):
  107. def _getsize(filename):
  108. try:
  109. return os.path.getsize(filename)
  110. except:
  111. return 0
  112. _kb = float(_getsize(file_path)) / 1024
  113. if _kb >= 1024:
  114. _M = _kb / 1024
  115. if _M >= 1024:
  116. _G = _M / 1024
  117. return "{:.1f} G".format(_G)
  118. else:
  119. return "{:.1f} M".format(_M)
  120. else:
  121. return "{:.1f} kb".format(_kb)
  122. def fetch_attachment(
  123. self,
  124. file_name: str,
  125. file_type: str,
  126. download_url: str,
  127. enable_proxy=False,
  128. allow_request_exception=False,
  129. **kwargs
  130. ):
  131. if not file_name or not file_type or not download_url:
  132. raise AttachmentNullError
  133. file_path = self.create_file_path(file_name, file_type)
  134. file_stream = self._fetch_attachment(
  135. download_url,
  136. file_path,
  137. enable_proxy,
  138. allow_request_exception,
  139. **kwargs
  140. )
  141. if len(file_stream) > 0:
  142. fid = self.create_fid(file_stream)
  143. '''上传/下载,无论失败成功都需要给出文件基础信息'''
  144. try:
  145. result = {
  146. 'filename': file_name,
  147. 'ftype': file_type,
  148. 'fid': "{}.{}".format(fid, file_type),
  149. 'org_url': download_url,
  150. 'size': self.getsize(file_path),
  151. 'url': 'oss',
  152. }
  153. AliYunService().push_oss_from_local(result['fid'], file_path)
  154. except Exception:
  155. result = {
  156. 'filename': file_name,
  157. 'org_url': download_url,
  158. }
  159. self.clean_attachment(file_path)
  160. else:
  161. result = {
  162. 'filename': file_name,
  163. 'org_url': download_url,
  164. }
  165. return result
  166. # if __name__ == '__main__':
  167. # a = AttachmentDownloader().fetch_attachment(
  168. # file_name='成建制移民村(五标段)合同',
  169. # file_type='pdf',
  170. # download_url='http://222.75.70.90/NXGPPSP_MG/downloadFileServlet?req=F&num=8b80b23f7e729b88017e758a1b03422c'
  171. # )
  172. # print(a)