import hashlib
import os
import sys
import time
import traceback
import uuid
from urllib import request

import requests
import tqdm
import urllib3
from feapder.setting import headers

from untils.aliyun import AliYunService
from untils.execptions import AttachmentNullError
from untils.proxy_pool import ProxyPool

urllib3.disable_warnings()


class AttachmentDownloader:
    """Attachment download module.

    Downloads a remote file into a local scratch directory, derives a
    content id from the downloaded bytes, pushes the local file to OSS via
    ``AliYunService`` and removes the local copy afterwards.
    """

    def __init__(self):
        # Scratch directory (relative to the working directory) for downloads.
        self.dir_name = 'file'

    def create_dir(self):
        """Create the scratch directory if it does not exist yet."""
        os.makedirs(self.dir_name, mode=0o777, exist_ok=True)

    def create_file_path(self, filename, file_type):
        """Return a unique local path ``<dir_name>/<sha1>.<file_type>``.

        The SHA-1 is taken over ``filename`` plus a random UUID, so two
        downloads of the same file name never collide on disk.
        """
        self.create_dir()
        sign = self.hex_sha1("{}_{}".format(filename, uuid.uuid4()))
        tmp_name = "{}.{}".format(sign, file_type)
        return "{}/{}".format(self.dir_name, tmp_name)

    @staticmethod
    def _sha1_hex(val):
        """Return the SHA-1 hex digest used by ``hex_sha1`` and ``create_fid``.

        NOTE: for ``bytes`` input the digest is computed over the UTF-8
        encoding of ``str(val)`` (the ``b'...'`` repr), not over the raw
        bytes. This is kept on purpose: changing it would change every
        generated id. Input that is neither ``bytes`` nor ``str`` yields
        the digest of empty input.
        """
        sha1 = hashlib.sha1()
        if isinstance(val, bytes):
            sha1.update(str(val).encode("utf-8"))
        elif isinstance(val, str):
            sha1.update(val.encode("utf-8"))
        return sha1.hexdigest()

    def hex_sha1(self, val):
        """Return the SHA-1 hex digest of ``val`` (``str`` or ``bytes``)."""
        return AttachmentDownloader._sha1_hex(val)

    @staticmethod
    def create_fid(file_stream: bytes):
        """Return the file id (SHA-1 hex digest) for downloaded content."""
        return AttachmentDownloader._sha1_hex(file_stream)

    @staticmethod
    def clean_attachment(file_path):
        """Delete the local file; a file that is already gone is not an error."""
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass

    @staticmethod
    def getsize(file_path: str):
        """Return the size of ``file_path`` as text, e.g. ``"1.5 M"``.

        Units are ``kb``, ``M`` and ``G`` (powers of 1024). A path whose
        size cannot be read is reported as ``"0.0 kb"``.
        """
        try:
            size_in_bytes = os.path.getsize(file_path)
        except (OSError, TypeError, ValueError):
            size_in_bytes = 0

        kb = float(size_in_bytes) / 1024
        if kb < 1024:
            return "{:.1f} kb".format(kb)
        mb = kb / 1024
        if mb < 1024:
            return "{:.1f} M".format(mb)
        return "{:.1f} G".format(mb / 1024)

    @staticmethod
    def _fetch_attachment(
            url: str,
            file_path: str,
            enable_proxy=False,
            allow_show_exception=False,
            **kwargs
    ):
        """Stream ``url`` into ``file_path`` and return the downloaded bytes.

        Args:
            url: address to download.
            file_path: local path the response body is written to.
            enable_proxy: when true, a proxy from ``ProxyPool`` is used
                (unless the caller supplied ``proxies``) and a fresh one is
                fetched after every failed request.
            allow_show_exception: print the traceback of request errors.
            **kwargs: optional ``headers``, ``proxies``, ``timeout`` and
                ``verify`` overrides for ``requests.get``.

        Returns:
            The response body as ``bytes``; ``b''`` when all three attempts
            fail or return a non-200 status.
        """
        request_params = {
            'headers': kwargs.get('headers') or headers,
            'proxies': kwargs.get('proxies'),
            'timeout': kwargs.get('timeout') or 60,
            'verify': kwargs.get('verify') or False,
        }
        # Use a pool proxy from the very first attempt; explicitly passed
        # proxies take precedence.
        if enable_proxy and not request_params['proxies']:
            request_params['proxies'] = ProxyPool().get()

        for _ in range(3):
            try:
                with requests.get(url, stream=True, **request_params) as resp:
                    if resp.status_code != 200:
                        continue

                    try:
                        content_size = int(resp.headers.get('Content-Length') or 0)
                    except (TypeError, ValueError):
                        content_size = 0  # malformed header: size unknown

                    # Collect chunks and join once instead of growing a
                    # bytes object with ``+=`` on every chunk.
                    chunks = []
                    with open(file_path, 'wb') as f:
                        with tqdm.tqdm(total=content_size or None,
                                       unit='B',
                                       initial=0,
                                       unit_scale=True,
                                       unit_divisor=1024,
                                       ascii=True,
                                       desc=file_path) as bar:
                            for chunk in resp.iter_content(chunk_size=1024 * 20):
                                if chunk:
                                    f.write(chunk)
                                    chunks.append(chunk)
                                    bar.update(len(chunk))
                    return b''.join(chunks)
            except requests.RequestException:
                if allow_show_exception:
                    traceback.print_exc()
                if enable_proxy:
                    request_params.update({'proxies': ProxyPool().get()})
        return b''

    def fetch_attachment(
            self,
            file_name: str,
            file_type: str,
            download_url: str,
            enable_proxy=False,
            allow_request_exception=False,
            **kwargs
    ):
        """Download an attachment, upload it to OSS and describe the result.

        Args:
            file_name: display name of the attachment.
            file_type: file extension without the dot.
            download_url: address to download from.
            enable_proxy: route the download through ``ProxyPool`` proxies.
            allow_request_exception: print tracebacks of request errors
                (and of a failed upload).
            **kwargs: forwarded to ``_fetch_attachment``.

        Returns:
            On success a dict with ``filename``, ``ftype``, ``fid``,
            ``org_url``, ``size`` and ``url``. When the download or the
            upload fails, only ``filename`` and ``org_url`` are present.

        Raises:
            AttachmentNullError: if any of ``file_name``, ``file_type`` or
                ``download_url`` is empty.
        """
        if not file_name or not file_type or not download_url:
            raise AttachmentNullError

        file_path = self.create_file_path(file_name, file_type)
        # Basic file info is reported whether the upload/download fails or not.
        result = {
            'filename': file_name,
            'org_url': download_url,
        }
        try:
            file_stream = self._fetch_attachment(
                download_url,
                file_path,
                enable_proxy,
                allow_request_exception,
                **kwargs
            )
            if len(file_stream) > 0:
                fid = self.create_fid(file_stream)
                try:
                    uploaded = {
                        'filename': file_name,
                        'ftype': file_type,
                        'fid': "{}.{}".format(fid, file_type),
                        'org_url': download_url,
                        'size': self.getsize(file_path),
                        'url': 'oss',
                    }
                    AliYunService().push_oss_from_local(uploaded['fid'], file_path)
                    result = uploaded
                except Exception:
                    # Best effort: an upload failure degrades to the basic
                    # info instead of raising.
                    if allow_request_exception:
                        traceback.print_exc()
        finally:
            # Always drop the scratch file, including partial or empty
            # downloads that never reach the upload step.
            self.clean_attachment(file_path)
        return result

    def download_file(self, url, file_path, call_func=None, enable_proxy=False, data=None):
        """Download ``url`` to ``file_path`` with ``urllib``, showing progress.

        Args:
            url: address to download.
            file_path: local path the file is stored at.
            call_func: optional callback invoked after a successful download.
            enable_proxy: install a ``ProxyPool`` proxy before downloading.
                NOTE: ``install_opener`` is module-global, so the proxy also
                affects later ``urllib.request`` calls in this process.
            data: optional request body passed to ``urlretrieve``.

        Returns:
            The local file name on success, ``''`` when ``url`` is empty or
            the download fails.
        """
        if not url:
            return ''

        start_time = time.time()

        def format_size(num_bytes):
            """Format a byte count as ``K``/``M``/``G`` with three decimals."""
            try:
                kb = float(num_bytes) / 1024
            except (TypeError, ValueError):
                print("传入的字节格式不对")
                return "Error"

            if kb < 1024:
                return "%.3fK" % (kb)
            mb = kb / 1024
            if mb < 1024:
                return "%.3fM" % (mb)
            return "%.3fG" % (mb / 1024)

        def progress_callfunc(blocknum, blocksize, totalsize):
            """Report hook for ``urlretrieve``: draw a one-line progress bar.

            @blocknum : number of blocks transferred so far
            @blocksize : size of one block in bytes
            @totalsize: size of the remote file (<= 0 when unknown)
            """
            recv_size = blocknum * blocksize
            elapsed = time.time() - start_time
            # Guard both divisions: an error raised here would abort the
            # whole download through the outer ``except``.
            speed = recv_size / elapsed if elapsed > 0 else 0
            speed_str = " Speed: %s" % format_size(speed)

            if totalsize > 0:
                # The last block may overshoot the real size; cap at 100%.
                fraction = min(recv_size / totalsize, 1.0)
            else:
                fraction = 0.0
            percent_str = "%.2f%%" % (fraction * 100)
            bar = ('#' * round(fraction * 50)).ljust(50, '-')

            out = sys.stdout
            out.write(percent_str.ljust(8, ' ') + '[' + bar + ']' + speed_str)
            out.flush()
            out.write('\r')

        try:
            if enable_proxy:
                proxies = ProxyPool().get()
                # Build an opener with the proxy settings and install it at
                # module level so ``urlretrieve`` uses it.
                proxy = request.ProxyHandler(proxies)
                opener = request.build_opener(proxy)
                request.install_opener(opener)
            # The progress bar is meant for testing; disable it in production.
            filename, response_headers = request.urlretrieve(
                url, file_path, progress_callfunc, data)
            print(filename, response_headers)
            if callable(call_func):
                call_func()
            return filename
        except Exception as e:
            print(e)
            return ''


if __name__ == '__main__':
    url = 'https://gdgpo.czt.gd.gov.cn/gpx-bid-file/440606/gpx-tender/2022/5/9/8a7e15d780a438400180a6be91e90cb2.zip?accessCode=0cf1d12a48345bcb7e64ac9583e30207'
    attachment = AttachmentDownloader().fetch_attachment(
        file_name="file_name", file_type="pdf", download_url=url,
        enable_proxy=False)
    print(attachment)