7 months ago · 32b1f486b6
--- a/FworkSpider/untils/attachment.py
+++ b/FworkSpider/untils/attachment.py
@@ -1,6 +1,6 @@
 
															 # -*- coding: utf-8 -*-
														
 
															 """
														
 
															-Created on 2020-09-06
														
 
															+Created on 2024-02-26
														
 
															 ---------
														
 
															 @summary: 附件下载模块
														
 
															 ---------
														
@@ -122,54 +122,73 @@ class AttachmentDownloader:
 
															         request_kwargs.setdefault("data", kwargs.pop("data", None))
														
 
															         request_kwargs.setdefault("json", kwargs.pop("json", None))
														
 
															         request_kwargs.setdefault("cookies", kwargs.pop("cookies", None))
														
 
															-        request_kwargs.setdefault("timeout", kwargs.pop("timeout", 60))
														
 
															+        request_kwargs.setdefault("timeout", kwargs.pop("timeout", (60,120)))
														
 
															         request_kwargs.setdefault("stream", kwargs.pop("stream", True))
														
 
															         request_kwargs.setdefault("verify", kwargs.pop("verify", False))
														
 
															         request_kwargs.setdefault("allow_redirects", kwargs.pop("allow_redirects", True))
														
 
															+        stream = io.BytesIO()
														
 
															         retries = 0
														
 
															         while retries < 3:
														
 
															             try:
														
 
															                 with requests.request(method, url, **request_kwargs) as req:
														
 
															-                    stream = io.BytesIO()
														
 
															-                    lh = {k.lower(): v for k, v in req.headers.items()}
														
 
															-                    cl = lh.get("content-length") or len(req.content)  # 内容长度
														
 
															-                    icl = int(cl)
														
 
															-                    content_length = self.calculate_size(icl)
														
 
															-                    if content_length > 50:
														
 
															-                        # 丢弃超过50Mb内容长度的文件
														
 
															-                        return stream.getvalue()
														
 
															-
														
 
															-                    if req.status_code != 200:
														
 
															-                        retries += 1
														
 
															-                        continue
														
 
															-
														
 
															-                    iter_content = req.iter_content(chunk_size=1024 * 20)
														
 
															+                    req.raise_for_status()
														
 
															+
														
 
															+                    lower_headers = {k.lower(): v for k, v in req.headers.items()}
														
 
															+                    content_length = lower_headers.get('content-length')
														
 
															+                    if content_length is not None:
														
 
															+                        content_length = self.calculate_size(int(content_length))
														
 
															+                        if content_length > 50:
														
 
															+                            # 丢弃超过50Mb内容长度的文件
														
 
															+                            return stream.getvalue()
														
 
															+                    else:
														
 
															+                        content_length = None
														
 
															+
														
 
															+                    chunk_size = 1024 * 20  # 20KB chunks
														
 
															+                    downloaded_size = 0
														
 
															                     with tqdm.tqdm(
														
 
															-                            total=icl,
														
 
															+                            total=content_length,
														
 
															                             unit="B",
														
 
															                             initial=0,
														
 
															                             unit_scale=True,
														
 
															                             unit_divisor=1024,  # 1M=1024Kb,单位换算
														
 
															                             ascii=True,
														
 
															                             desc=file) as bar:
														
 
															+
														
 
															+                        iter_content = req.iter_content(chunk_size=chunk_size)
														
 
															                         if file is not None:
														
 
															                             with open(file, "wb") as f:
														
 
															                                 for chunk in iter_content:
														
 
															-                                    stream.write(chunk)
														
 
															-                                    size = f.write(chunk)
														
 
															+                                    size = stream.write(chunk)
														
 
															+                                    f.write(chunk)
														
 
															                                     bar.update(size)
														
 
															+                                    downloaded_size += size
														
 
															+                                    content_length = self.calculate_size(downloaded_size)
														
 
															+                                    if content_length > 50:
														
 
															+                                        stream.truncate(0)  # 截断流，保留前0个字节，即清空流
														
 
															+                                        stream.seek(0)  # 将位置指针移回流的开始处
														
 
															+                                        break
														
 
															                         else:
														
 
															                             for chunk in iter_content:
														
 
															                                 size = stream.write(chunk)
														
 
															                                 bar.update(size)
														
 
															+                                downloaded_size += size
														
 
															+                                content_length = self.calculate_size(downloaded_size)
														
 
															+                                if content_length > 50:
														
 
															+                                    stream.truncate(0)  # 截断流，保留前0个字节，即清空流
														
 
															+                                    stream.seek(0)  # 将位置指针移回流的开始处
														
 
															+                                    break
														
 
															+
														
 
															                     return stream.getvalue()
														
 
															+
														
 
															             except requests.RequestException as why:
														
 
															+                stream.truncate(0)  # 截断流，保留前0个字节，即清空流
														
 
															+                stream.seek(0)  # 将位置指针移回流的开始处
														
 
															                 retries += 1
														
 
															                 if show_error_log:
														
 
															                     logger.exception(why)
														
 
															-        return b''
														
 
															+        return stream.getvalue()
														
 
															     def _push_oss_from_stream(self, filename, filetype, url, **kwargs):
														
 
															         """
														
@@ -199,6 +218,21 @@ class AttachmentDownloader:
 
															         return attachment
														
 
															+    def read_pdf_in_chunks(self, pdf_path, chunk_size=1024):
														
 
															+        try:
														
 
															+            with open(pdf_path, 'rb') as file:
														
 
															+                chunk = file.read(chunk_size)
														
 
															+                if "<</Names <</Dests 4 0 R>>" in str(chunk) and "SourceModified" in str(chunk):
														
 
															+                    return False
														
 
															+                elif "doctypehtml" not in str(chunk):
														
 
															+                    return True
														
 
															+                elif "%PDF" in str(chunk):
														
 
															+                    return True
														
 
															+                else:
														
 
															+                    return False
														
 
															+        except Exception as e:
														
 
															+            return False
														
 
															+
														
 
															     def _push_oss_from_local(self, filename, filetype, url, **kwargs):
														
 
															         """
														
 
															         上传本地文件到oss
														
@@ -214,6 +248,12 @@ class AttachmentDownloader:
 
															             "filename": "{}.{}".format(filename, filetype),
														
 
															             "org_url": url
														
 
															         }
														
 
															+
														
 
															+        if kwargs.get('is_check', None):
														
 
															+            if not self.read_pdf_in_chunks(file):
														
 
															+                self.remove(file)
														
 
															+                return attachment
														
 
															+
														
 
															         if len(stream) > 0:
														
 
															             content_hash = tools.get_sha1(stream)
														
 
															             try: