123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244 |
- import hashlib
- import os
- import sys
- import traceback
- import uuid
- from urllib import request
- import requests
- import urllib3
- from feapder.setting import headers
- from untils.execptions import AttachmentNullError
- from untils.aliyun import AliYunService
- from untils.proxy_pool import ProxyPool
- import time
- import tqdm
- urllib3.disable_warnings()
class AttachmentDownloader:
    """Attachment download helper.

    Fetches a remote file into a local staging directory, uploads the staged
    file through ``AliYunService`` and reports basic metadata about it.
    """

    def __init__(self):
        # Staging directory for downloads; relative, so it is resolved
        # against the current working directory.
        self.dir_name = 'file'

    def create_dir(self):
        """Create the staging directory if it does not exist yet."""
        # exist_ok=True already makes this a no-op for an existing directory,
        # so no separate (race-prone) existence check is needed.
        os.makedirs(self.dir_name, mode=0o777, exist_ok=True)

    def create_file_path(self, filename, file_type):
        """Return a new unique path inside the staging directory.

        The basename is the SHA-1 of ``"<filename>_<uuid4>"`` followed by
        ``.<file_type>``; the random UUID keeps repeated downloads of the
        same attachment from colliding on disk.

        Args:
            filename: Human-readable attachment name (only used as hash input).
            file_type: File extension without the leading dot.

        Returns:
            ``"<dir_name>/<sha1>.<file_type>"``.
        """
        self.create_dir()
        sign = self.hex_sha1("{}_{}".format(filename, uuid.uuid4()))
        return "{}/{}.{}".format(self.dir_name, sign, file_type)

    def hex_sha1(self, val):
        """Return the SHA-1 hex digest of ``val`` (same rules as ``create_fid``)."""
        return self.create_fid(val)

    @staticmethod
    def create_fid(file_stream: bytes):
        """Return the SHA-1 hex digest used as the attachment id.

        ``str`` input is hashed as UTF-8. ``bytes`` input is hashed through
        its ``str()`` form (``"b'...'"``), not the raw bytes; this is kept
        as is because changing it would change every id this method
        produces. Any other type yields the digest of empty input.
        """
        sha1 = hashlib.sha1()
        if isinstance(file_stream, bytes):
            sha1.update(str(file_stream).encode("utf-8"))
        elif isinstance(file_stream, str):
            sha1.update(file_stream.encode("utf-8"))
        return sha1.hexdigest()

    @staticmethod
    def clean_attachment(file_path):
        """Delete the staged file at ``file_path``.

        Raises:
            OSError: If the file cannot be removed (e.g. it does not exist).
        """
        os.remove(file_path)

    @staticmethod
    def getsize(file_path: str):
        """Return the size of ``file_path`` as a human-readable string.

        The result uses one decimal place and the unit suffixes ``kb``,
        ``M`` or ``G`` (1024-based). A path that cannot be inspected is
        reported as ``"0.0 kb"``.
        """
        try:
            size = os.path.getsize(file_path)
        except (OSError, TypeError, ValueError):
            # Missing file, wrong argument type, or an invalid path string.
            size = 0

        kb = float(size) / 1024
        if kb >= 1024:
            mb = kb / 1024
            if mb >= 1024:
                return "{:.1f} G".format(mb / 1024)
            return "{:.1f} M".format(mb)
        return "{:.1f} kb".format(kb)

    @staticmethod
    def _fetch_attachment(
            url: str,
            file_path: str,
            enable_proxy=False,
            allow_show_exception=False,
            **kwargs
    ):
        """Download ``url`` into ``file_path`` and return the downloaded bytes.

        Up to three attempts are made. A non-200 response or a
        ``requests.RequestException`` counts as a failed attempt.

        Args:
            url: Address of the attachment.
            file_path: Local path the content is written to.
            enable_proxy: When true, requests go through a proxy taken from
                ``ProxyPool`` (assumed to return a requests-style proxies
                mapping) and a new proxy is taken after each request error.
            allow_show_exception: When true, request errors are printed with
                their traceback.
            **kwargs: Optional ``headers``, ``proxies``, ``timeout`` and
                ``verify`` overrides for ``requests.get``.

        Returns:
            The full response body, or ``b''`` if every attempt failed.
        """
        request_params = {
            'headers': kwargs.get('headers') or headers,
            'proxies': kwargs.get('proxies'),
            'timeout': kwargs.get('timeout') or 60,
            'verify': kwargs.get('verify') or False,
        }
        # Use the pool proxy from the first attempt on, unless the caller
        # passed explicit proxies.
        if enable_proxy and not request_params['proxies']:
            request_params['proxies'] = ProxyPool().get()

        for _ in range(3):
            try:
                with requests.get(url, stream=True, **request_params) as resp:
                    if resp.status_code != 200:
                        continue

                    try:
                        total = int(resp.headers.get('Content-Length') or 0)
                    except ValueError:
                        # Malformed header: the size only feeds the progress bar.
                        total = 0

                    # Collect chunks and join once; repeated ``bytes +=``
                    # re-copies everything received so far on every chunk.
                    chunks = []
                    with open(file_path, 'wb') as f, tqdm.tqdm(
                            total=total, unit='B', initial=0, unit_scale=True,
                            unit_divisor=1024, ascii=True, desc=file_path
                    ) as bar:
                        for chunk in resp.iter_content(chunk_size=1024 * 20):
                            if chunk:
                                f.write(chunk)
                                chunks.append(chunk)
                                bar.update(len(chunk))
                    return b''.join(chunks)
            except requests.RequestException:
                if allow_show_exception:
                    traceback.print_exc()
                if enable_proxy:
                    request_params['proxies'] = ProxyPool().get()
        return b''

    def fetch_attachment(
            self,
            file_name: str,
            file_type: str,
            download_url: str,
            enable_proxy=False,
            allow_request_exception=False,
            **kwargs
    ):
        """Download an attachment, upload it and return its metadata.

        Args:
            file_name: Display name of the attachment.
            file_type: File extension without the leading dot.
            download_url: Address to download from.
            enable_proxy: Forwarded to ``_fetch_attachment``.
            allow_request_exception: Forwarded to ``_fetch_attachment`` as
                ``allow_show_exception``.
            **kwargs: Forwarded to ``_fetch_attachment``.

        Returns:
            On success a dict with ``filename``, ``ftype``, ``fid``,
            ``org_url``, ``size`` and ``url``. If the download or the upload
            fails, a dict with only ``filename`` and ``org_url``.

        Raises:
            AttachmentNullError: If any of the three required arguments is
                empty.
        """
        if not file_name or not file_type or not download_url:
            raise AttachmentNullError

        file_path = self.create_file_path(file_name, file_type)
        file_stream = self._fetch_attachment(
            download_url,
            file_path,
            enable_proxy,
            allow_request_exception,
            **kwargs
        )

        # Basic file information is returned whether or not the
        # download/upload succeeds.
        fallback = {
            'filename': file_name,
            'org_url': download_url,
        }

        if not file_stream:
            # A failed attempt may have left a partial or empty file behind;
            # remove it best-effort so the staging directory does not fill up.
            try:
                self.clean_attachment(file_path)
            except OSError:
                pass
            return fallback

        fid = self.create_fid(file_stream)
        try:
            result = {
                'filename': file_name,
                'ftype': file_type,
                'fid': "{}.{}".format(fid, file_type),
                'org_url': download_url,
                'size': self.getsize(file_path),
                'url': 'oss',
            }
            AliYunService().push_oss_from_local(result['fid'], file_path)
        except Exception:
            # Upload is best-effort: the caller sees the failure as a result
            # without 'fid'/'url' instead of an exception.
            result = fallback
        self.clean_attachment(file_path)
        return result

    @staticmethod
    def _format_size(num_bytes):
        """Format a byte count as ``K``/``M``/``G`` with three decimals."""
        try:
            kb = float(num_bytes) / 1024
        except (TypeError, ValueError, OverflowError):
            print("传入的字节格式不对")
            return "Error"
        if kb >= 1024:
            mb = kb / 1024
            if mb >= 1024:
                return "%.3fG" % (mb / 1024)
            return "%.3fM" % mb
        return "%.3fK" % kb

    @staticmethod
    def _progress_line(received, totalsize, elapsed):
        """Render one progress-bar line: percentage, 50-cell bar and speed.

        ``totalsize`` may be unknown (``<= 0``), in which case the bar stays
        empty, and ``received`` may overshoot on the last block, in which
        case the bar is capped at 100%. A zero ``elapsed`` reports speed 0
        instead of dividing by zero.
        """
        speed = received / elapsed if elapsed > 0 else 0
        if totalsize > 0:
            fraction = min(received / totalsize, 1.0)
        else:
            fraction = 0.0
        bar = ('#' * round(fraction * 50)).ljust(50, '-')
        percent = "%.2f%%" % (fraction * 100)
        speed_str = " Speed: %s" % AttachmentDownloader._format_size(speed)
        return percent.ljust(8, ' ') + '[' + bar + ']' + speed_str

    def download_file(self, url, file_path, call_func=None, enable_proxy=False, data=None):
        """Download ``url`` to ``file_path`` with ``urllib``, showing progress.

        Args:
            url: Address to download from.
            file_path: Local path the file is stored at.
            call_func: Optional callback invoked after a successful download.
            enable_proxy: When true, a proxy from ``ProxyPool`` is installed
                for the request.
            data: Optional request body passed to ``urlretrieve``.

        Returns:
            The local filename on success, ``''`` if ``url`` is empty or the
            download failed (the error is printed).
        """
        if not url:
            return ''

        start_time = time.time()

        def progress_callfunc(blocknum, blocksize, totalsize):
            """urlretrieve report hook.

            blocknum: number of blocks transferred so far
            blocksize: size of one block
            totalsize: size of the remote file
            """
            line = self._progress_line(
                blocknum * blocksize, totalsize, time.time() - start_time
            )
            out = sys.stdout
            out.write(line)
            out.flush()
            out.write('\r')

        try:
            if enable_proxy:
                proxies = ProxyPool().get()
                opener = request.build_opener(request.ProxyHandler(proxies))
                # NOTE(review): install_opener replaces the process-wide
                # default opener, so later urllib calls (including calls with
                # enable_proxy=False) keep using this proxy.
                request.install_opener(opener)
            # The progress bar is meant for testing; disable it in production.
            filename, resp_headers = request.urlretrieve(
                url, file_path, progress_callfunc, data
            )
            print(filename, resp_headers)
            if callable(call_func):
                call_func()
            return filename
        except Exception as e:
            print(e)
            return ''
if __name__ == '__main__':
    # Manual smoke test: download one sample attachment without a proxy and
    # print the metadata dict returned by fetch_attachment.
    downloader = AttachmentDownloader()
    url = 'https://gdgpo.czt.gd.gov.cn/gpx-bid-file/440606/gpx-tender/2022/5/9/8a7e15d780a438400180a6be91e90cb2.zip?accessCode=0cf1d12a48345bcb7e64ac9583e30207'
    attachment = downloader.fetch_attachment(
        file_name="file_name",
        file_type="pdf",
        download_url=url,
        enable_proxy=False,
    )
    print(attachment)
|