|
@@ -1,6 +1,6 @@
|
|
# -*- coding: utf-8 -*-
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
"""
|
|
-Created on 2020-09-06
|
|
|
|
|
|
+Created on 2024-02-26
|
|
---------
|
|
---------
|
|
@summary: 附件下载模块
|
|
@summary: 附件下载模块
|
|
---------
|
|
---------
|
|
@@ -122,54 +122,73 @@ class AttachmentDownloader:
|
|
request_kwargs.setdefault("data", kwargs.pop("data", None))
|
|
request_kwargs.setdefault("data", kwargs.pop("data", None))
|
|
request_kwargs.setdefault("json", kwargs.pop("json", None))
|
|
request_kwargs.setdefault("json", kwargs.pop("json", None))
|
|
request_kwargs.setdefault("cookies", kwargs.pop("cookies", None))
|
|
request_kwargs.setdefault("cookies", kwargs.pop("cookies", None))
|
|
- request_kwargs.setdefault("timeout", kwargs.pop("timeout", 60))
|
|
|
|
|
|
+ request_kwargs.setdefault("timeout", kwargs.pop("timeout", (60,120)))
|
|
request_kwargs.setdefault("stream", kwargs.pop("stream", True))
|
|
request_kwargs.setdefault("stream", kwargs.pop("stream", True))
|
|
request_kwargs.setdefault("verify", kwargs.pop("verify", False))
|
|
request_kwargs.setdefault("verify", kwargs.pop("verify", False))
|
|
request_kwargs.setdefault("allow_redirects", kwargs.pop("allow_redirects", True))
|
|
request_kwargs.setdefault("allow_redirects", kwargs.pop("allow_redirects", True))
|
|
|
|
|
|
|
|
+ stream = io.BytesIO()
|
|
retries = 0
|
|
retries = 0
|
|
while retries < 3:
|
|
while retries < 3:
|
|
try:
|
|
try:
|
|
with requests.request(method, url, **request_kwargs) as req:
|
|
with requests.request(method, url, **request_kwargs) as req:
|
|
- stream = io.BytesIO()
|
|
|
|
- lh = {k.lower(): v for k, v in req.headers.items()}
|
|
|
|
- cl = lh.get("content-length") or len(req.content) # 内容长度
|
|
|
|
- icl = int(cl)
|
|
|
|
- content_length = self.calculate_size(icl)
|
|
|
|
- if content_length > 50:
|
|
|
|
- # 丢弃超过50Mb内容长度的文件
|
|
|
|
- return stream.getvalue()
|
|
|
|
-
|
|
|
|
- if req.status_code != 200:
|
|
|
|
- retries += 1
|
|
|
|
- continue
|
|
|
|
-
|
|
|
|
- iter_content = req.iter_content(chunk_size=1024 * 20)
|
|
|
|
|
|
+ req.raise_for_status()
|
|
|
|
+
|
|
|
|
+ lower_headers = {k.lower(): v for k, v in req.headers.items()}
|
|
|
|
+ content_length = lower_headers.get('content-length')
|
|
|
|
+ if content_length is not None:
|
|
|
|
+ content_length = self.calculate_size(int(content_length))
|
|
|
|
+ if content_length > 50:
|
|
|
|
+ # 丢弃超过50Mb内容长度的文件
|
|
|
|
+ return stream.getvalue()
|
|
|
|
+ else:
|
|
|
|
+ content_length = None
|
|
|
|
+
|
|
|
|
+ chunk_size = 1024 * 20 # 20KB chunks
|
|
|
|
+ downloaded_size = 0
|
|
with tqdm.tqdm(
|
|
with tqdm.tqdm(
|
|
- total=icl,
|
|
|
|
|
|
+ total=content_length,
|
|
unit="B",
|
|
unit="B",
|
|
initial=0,
|
|
initial=0,
|
|
unit_scale=True,
|
|
unit_scale=True,
|
|
unit_divisor=1024, # 1M=1024Kb,单位换算
|
|
unit_divisor=1024, # 1M=1024Kb,单位换算
|
|
ascii=True,
|
|
ascii=True,
|
|
desc=file) as bar:
|
|
desc=file) as bar:
|
|
|
|
+
|
|
|
|
+ iter_content = req.iter_content(chunk_size=chunk_size)
|
|
if file is not None:
|
|
if file is not None:
|
|
with open(file, "wb") as f:
|
|
with open(file, "wb") as f:
|
|
for chunk in iter_content:
|
|
for chunk in iter_content:
|
|
- stream.write(chunk)
|
|
|
|
- size = f.write(chunk)
|
|
|
|
|
|
+ size = stream.write(chunk)
|
|
|
|
+ f.write(chunk)
|
|
bar.update(size)
|
|
bar.update(size)
|
|
|
|
+ downloaded_size += size
|
|
|
|
+ content_length = self.calculate_size(downloaded_size)
|
|
|
|
+ if content_length > 50:
|
|
|
|
+ stream.truncate(0) # 截断流,保留前0个字节,即清空流
|
|
|
|
+ stream.seek(0) # 将位置指针移回流的开始处
|
|
|
|
+ break
|
|
else:
|
|
else:
|
|
for chunk in iter_content:
|
|
for chunk in iter_content:
|
|
size = stream.write(chunk)
|
|
size = stream.write(chunk)
|
|
bar.update(size)
|
|
bar.update(size)
|
|
|
|
+ downloaded_size += size
|
|
|
|
+ content_length = self.calculate_size(downloaded_size)
|
|
|
|
+ if content_length > 50:
|
|
|
|
+ stream.truncate(0) # 截断流,保留前0个字节,即清空流
|
|
|
|
+ stream.seek(0) # 将位置指针移回流的开始处
|
|
|
|
+ break
|
|
|
|
+
|
|
return stream.getvalue()
|
|
return stream.getvalue()
|
|
|
|
+
|
|
except requests.RequestException as why:
|
|
except requests.RequestException as why:
|
|
|
|
+ stream.truncate(0) # 截断流,保留前0个字节,即清空流
|
|
|
|
+ stream.seek(0) # 将位置指针移回流的开始处
|
|
retries += 1
|
|
retries += 1
|
|
if show_error_log:
|
|
if show_error_log:
|
|
logger.exception(why)
|
|
logger.exception(why)
|
|
|
|
|
|
- return b''
|
|
|
|
|
|
+ return stream.getvalue()
|
|
|
|
|
|
def _push_oss_from_stream(self, filename, filetype, url, **kwargs):
|
|
def _push_oss_from_stream(self, filename, filetype, url, **kwargs):
|
|
"""
|
|
"""
|
|
@@ -199,6 +218,21 @@ class AttachmentDownloader:
|
|
|
|
|
|
return attachment
|
|
return attachment
|
|
|
|
|
|
|
|
+ def read_pdf_in_chunks(self, pdf_path, chunk_size=1024):
|
|
|
|
+ try:
|
|
|
|
+ with open(pdf_path, 'rb') as file:
|
|
|
|
+ chunk = file.read(chunk_size)
|
|
|
|
+ if "<</Names <</Dests 4 0 R>>" in str(chunk) and "SourceModified" in str(chunk):
|
|
|
|
+ return False
|
|
|
|
+ elif "doctypehtml" not in str(chunk):
|
|
|
|
+ return True
|
|
|
|
+ elif "%PDF" in str(chunk):
|
|
|
|
+ return True
|
|
|
|
+ else:
|
|
|
|
+ return False
|
|
|
|
+ except Exception as e:
|
|
|
|
+ return False
|
|
|
|
+
|
|
def _push_oss_from_local(self, filename, filetype, url, **kwargs):
|
|
def _push_oss_from_local(self, filename, filetype, url, **kwargs):
|
|
"""
|
|
"""
|
|
上传本地文件到oss
|
|
上传本地文件到oss
|
|
@@ -214,6 +248,12 @@ class AttachmentDownloader:
|
|
"filename": "{}.{}".format(filename, filetype),
|
|
"filename": "{}.{}".format(filename, filetype),
|
|
"org_url": url
|
|
"org_url": url
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+ if kwargs.get('is_check', None):
|
|
|
|
+ if not self.read_pdf_in_chunks(file):
|
|
|
|
+ self.remove(file)
|
|
|
|
+ return attachment
|
|
|
|
+
|
|
if len(stream) > 0:
|
|
if len(stream) > 0:
|
|
content_hash = tools.get_sha1(stream)
|
|
content_hash = tools.get_sha1(stream)
|
|
try:
|
|
try:
|