attachment.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. import hashlib
  2. import os
  3. import sys
  4. import traceback
  5. import uuid
  6. from urllib import request
  7. import requests
  8. import urllib3
  9. from feapder.setting import headers
  10. from untils.execptions import AttachmentNullError
  11. from untils.aliyun import AliYunService
  12. from untils.proxy_pool import ProxyPool
  13. import time
  14. import tqdm
# Silence urllib3's warnings for the whole process. `_fetch_attachment` below
# defaults to verify=False; presumably this is here to hide the resulting
# insecure-request warnings -- confirm before removing.
urllib3.disable_warnings()
  16. class AttachmentDownloader:
  17. '''附件下载模块'''
  18. def __init__(self):
  19. self.dir_name = 'file'
  20. def create_dir(self):
  21. if not os.path.exists(self.dir_name):
  22. os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
  23. def create_file_path(self, filename, file_type):
  24. self.create_dir()
  25. sign = self.hex_sha1("{}_{}".format(filename, uuid.uuid4()))
  26. tmp_name = "{}.{}".format(sign, file_type)
  27. return "{}/{}".format(self.dir_name, tmp_name)
  28. def hex_sha1(self,val):
  29. sha1 = hashlib.sha1()
  30. if isinstance(val, bytes):
  31. sha1.update(str(val).encode("utf-8"))
  32. elif isinstance(val, str):
  33. sha1.update(val.encode("utf-8"))
  34. res = sha1.hexdigest()
  35. return res
  36. @staticmethod
  37. def create_fid(file_stream: bytes):
  38. sha1 = hashlib.sha1()
  39. if isinstance(file_stream, bytes):
  40. sha1.update(str(file_stream).encode("utf-8"))
  41. elif isinstance(file_stream, str):
  42. sha1.update(file_stream.encode("utf-8"))
  43. res = sha1.hexdigest()
  44. return res
  45. @staticmethod
  46. def clean_attachment(file_path):
  47. os.remove(file_path)
  48. @staticmethod
  49. def getsize(file_path: str):
  50. def _getsize(filename):
  51. try:
  52. return os.path.getsize(filename)
  53. except:
  54. return 0
  55. _kb = float(_getsize(file_path)) / 1024
  56. if _kb >= 1024:
  57. _M = _kb / 1024
  58. if _M >= 1024:
  59. _G = _M / 1024
  60. return "{:.1f} G".format(_G)
  61. else:
  62. return "{:.1f} M".format(_M)
  63. else:
  64. return "{:.1f} kb".format(_kb)
  65. @staticmethod
  66. def _fetch_attachment(
  67. url: str,
  68. file_path: str,
  69. enable_proxy=False,
  70. allow_show_exception=False,
  71. **kwargs
  72. ):
  73. request_params = {}
  74. request_params.setdefault('headers', kwargs.get('headers') or headers)
  75. request_params.setdefault('proxies', kwargs.get('proxies'))
  76. request_params.setdefault('timeout', kwargs.get('timeout') or 60)
  77. # request_params.setdefault('stream', kwargs.get('stream') or True)
  78. request_params.setdefault('verify', kwargs.get('verify') or False)
  79. if enable_proxy:
  80. proxy = ProxyPool().get()
  81. else:
  82. proxy = {}
  83. retries = 0
  84. while retries < 3:
  85. try:
  86. with requests.get(url,stream=True, **request_params) as req:
  87. content_size = req.headers.get('Content-Length') or 0
  88. content_size = int(content_size)
  89. stream = b''
  90. if req.status_code == 200:
  91. with open(file_path, 'wb') as f:
  92. with tqdm.tqdm(total=content_size, unit='B', initial=0, unit_scale=True, unit_divisor=1024,
  93. ascii=True,desc=file_path) as bar:
  94. for chunk in req.iter_content(chunk_size=1024*20):
  95. if chunk:
  96. f.write(chunk)
  97. stream += chunk
  98. bar.update(len(chunk))
  99. return stream
  100. else:
  101. retries += 1
  102. except requests.RequestException:
  103. if allow_show_exception:
  104. traceback.print_exc()
  105. if enable_proxy:
  106. request_params.update({'proxies': ProxyPool().get()})
  107. retries += 1
  108. return b''
  109. def fetch_attachment(
  110. self,
  111. file_name: str,
  112. file_type: str,
  113. download_url: str,
  114. enable_proxy=False,
  115. allow_request_exception=False,
  116. **kwargs
  117. ):
  118. if not file_name or not file_type or not download_url:
  119. raise AttachmentNullError
  120. file_path = self.create_file_path(file_name, file_type)
  121. file_stream = self._fetch_attachment(
  122. download_url,
  123. file_path,
  124. enable_proxy,
  125. allow_request_exception,
  126. **kwargs
  127. )
  128. # file_stream = self.download_file(download_url,file_path,enable_proxy,allow_request_exception)
  129. if len(file_stream) > 0:
  130. fid = self.create_fid(file_stream)
  131. '''上传/下载,无论失败成功都需要给出文件基础信息'''
  132. try:
  133. result = {
  134. 'filename': file_name,
  135. 'ftype': file_type,
  136. 'fid': "{}.{}".format(fid, file_type),
  137. 'org_url': download_url,
  138. 'size': self.getsize(file_path),
  139. 'url': 'oss',
  140. }
  141. AliYunService().push_oss_from_local(result['fid'], file_path)
  142. except Exception:
  143. result = {
  144. 'filename': file_name,
  145. 'org_url': download_url,
  146. }
  147. self.clean_attachment(file_path)
  148. else:
  149. result = {
  150. 'filename': file_name,
  151. 'org_url': download_url,
  152. }
  153. return result
  154. def download_file(self, url, file_path, call_func=None,enable_proxy=False,data=None):
  155. """
  156. Args:
  157. url: 地址
  158. file_path: 文件存储地址
  159. call_func: 下载成功的回调
  160. Returns:
  161. """
  162. # proxies = kwargs.get('proxies') or None
  163. # data = kwargs.get('data') or None
  164. start_time = time.time()
  165. def progress_callfunc(blocknum, blocksize, totalsize):
  166. """回调函数
  167. @blocknum : 已经下载的数据块
  168. @blocksize : 数据块的大小
  169. @totalsize: 远程文件的大小
  170. """
  171. speed = (blocknum * blocksize) / (time.time() - start_time)
  172. # speed_str = " Speed: %.2f" % speed
  173. speed_str = " Speed: %s" % format_size(speed)
  174. recv_size = blocknum * blocksize
  175. # 设置下载进度条
  176. f = sys.stdout
  177. pervent = recv_size / totalsize
  178. percent_str = "%.2f%%" % (pervent * 100)
  179. n = round(pervent * 50)
  180. s = ('#' * n).ljust(50, '-')
  181. f.write(percent_str.ljust(8, ' ') + '[' + s + ']' + speed_str)
  182. f.flush()
  183. f.write('\r')
  184. def format_size(bytes):
  185. try:
  186. bytes = float(bytes)
  187. kb = bytes / 1024
  188. except:
  189. print("传入的字节格式不对")
  190. return "Error"
  191. if kb >= 1024:
  192. M = kb / 1024
  193. if M >= 1024:
  194. G = M / 1024
  195. return "%.3fG" % (G)
  196. else:
  197. return "%.3fM" % (M)
  198. else:
  199. return "%.3fK" % (kb)
  200. if url:
  201. try:
  202. if enable_proxy:
  203. proxies = ProxyPool().get()
  204. # create the object, assign it to a variable
  205. proxy = request.ProxyHandler(proxies)
  206. # construct a new opener using your proxy settings
  207. opener = request.build_opener(proxy)
  208. # install the openen on the module-level
  209. request.install_opener(opener)
  210. # 测试可以打开进度条,生产环境禁用进度条
  211. filename, headers = request.urlretrieve(url, file_path, progress_callfunc, data)
  212. # filename, headers = request.urlretrieve(url, file_path, data)
  213. print(filename,headers)
  214. if callable(call_func):
  215. call_func()
  216. return filename
  217. except Exception as e:
  218. print(e)
  219. return ''
  220. else:
  221. return ''
  222. if __name__ == '__main__':
  223. url = 'https://gdgpo.czt.gd.gov.cn/gpx-bid-file/440606/gpx-tender/2022/5/9/8a7e15d780a438400180a6be91e90cb2.zip?accessCode=0cf1d12a48345bcb7e64ac9583e30207'
  224. attachment = AttachmentDownloader().fetch_attachment(
  225. file_name="file_name", file_type="pdf", download_url=url,
  226. enable_proxy=False)
  227. print(attachment)