Browse Source

添加附件完整性检查

dongzhaorui 7 months ago
parent
commit
32b1f486b6
1 changed file with 60 additions and 20 deletions
  1. 60 20
      FworkSpider/untils/attachment.py

+ 60 - 20
FworkSpider/untils/attachment.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-Created on 2020-09-06
+Created on 2024-02-26
 ---------
 @summary: 附件下载模块
 ---------
@@ -122,54 +122,73 @@ class AttachmentDownloader:
         request_kwargs.setdefault("data", kwargs.pop("data", None))
         request_kwargs.setdefault("json", kwargs.pop("json", None))
         request_kwargs.setdefault("cookies", kwargs.pop("cookies", None))
-        request_kwargs.setdefault("timeout", kwargs.pop("timeout", 60))
+        request_kwargs.setdefault("timeout", kwargs.pop("timeout", (60,120)))
         request_kwargs.setdefault("stream", kwargs.pop("stream", True))
         request_kwargs.setdefault("verify", kwargs.pop("verify", False))
         request_kwargs.setdefault("allow_redirects", kwargs.pop("allow_redirects", True))
 
+        stream = io.BytesIO()
         retries = 0
         while retries < 3:
             try:
                 with requests.request(method, url, **request_kwargs) as req:
-                    stream = io.BytesIO()
-                    lh = {k.lower(): v for k, v in req.headers.items()}
-                    cl = lh.get("content-length") or len(req.content)  # 内容长度
-                    icl = int(cl)
-                    content_length = self.calculate_size(icl)
-                    if content_length > 50:
-                        # 丢弃超过50Mb内容长度的文件
-                        return stream.getvalue()
-
-                    if req.status_code != 200:
-                        retries += 1
-                        continue
-
-                    iter_content = req.iter_content(chunk_size=1024 * 20)
+                    req.raise_for_status()
+
+                    lower_headers = {k.lower(): v for k, v in req.headers.items()}
+                    content_length = lower_headers.get('content-length')
+                    if content_length is not None:
+                        content_length = self.calculate_size(int(content_length))
+                        if content_length > 50:
+                            # 丢弃超过50Mb内容长度的文件
+                            return stream.getvalue()
+                    else:
+                        content_length = None
+
+                    chunk_size = 1024 * 20  # 20KB chunks
+                    downloaded_size = 0
                     with tqdm.tqdm(
-                            total=icl,
+                            total=content_length,
                             unit="B",
                             initial=0,
                             unit_scale=True,
                             unit_divisor=1024,  # 1M=1024Kb,单位换算
                             ascii=True,
                             desc=file) as bar:
+
+                        iter_content = req.iter_content(chunk_size=chunk_size)
                         if file is not None:
                             with open(file, "wb") as f:
                                 for chunk in iter_content:
-                                    stream.write(chunk)
-                                    size = f.write(chunk)
+                                    size = stream.write(chunk)
+                                    f.write(chunk)
                                     bar.update(size)
+                                    downloaded_size += size
+                                    content_length = self.calculate_size(downloaded_size)
+                                    if content_length > 50:
+                                        stream.truncate(0)  # 截断流,保留前0个字节,即清空流
+                                        stream.seek(0)  # 将位置指针移回流的开始处
+                                        break
                         else:
                             for chunk in iter_content:
                                 size = stream.write(chunk)
                                 bar.update(size)
+                                downloaded_size += size
+                                content_length = self.calculate_size(downloaded_size)
+                                if content_length > 50:
+                                    stream.truncate(0)  # 截断流,保留前0个字节,即清空流
+                                    stream.seek(0)  # 将位置指针移回流的开始处
+                                    break
+
                     return stream.getvalue()
+
             except requests.RequestException as why:
+                stream.truncate(0)  # 截断流,保留前0个字节,即清空流
+                stream.seek(0)  # 将位置指针移回流的开始处
                 retries += 1
                 if show_error_log:
                     logger.exception(why)
 
-        return b''
+        return stream.getvalue()
 
     def _push_oss_from_stream(self, filename, filetype, url, **kwargs):
         """
@@ -199,6 +218,21 @@ class AttachmentDownloader:
 
         return attachment
 
def read_pdf_in_chunks(self, pdf_path, chunk_size=1024):
    """Heuristically check that a downloaded attachment is a usable PDF.

    Only the first ``chunk_size`` bytes are read and matched against
    cheap ASCII markers:

    * both ``"<</Names <</Dests 4 0 R>>"`` and ``"SourceModified"``
      present -> known-bad template file -> ``False``
    * ``"%PDF"`` magic marker present -> genuine PDF -> ``True``
    * ``"doctypehtml"`` present -> an HTML error page was saved in
      place of the attachment -> ``False``
    * anything else is assumed valid -> ``True``

    :param pdf_path: path of the file to inspect
    :param chunk_size: number of leading bytes to examine (default 1024)
    :return: ``True`` if the file looks like a real PDF, otherwise
             ``False`` (including when the file cannot be opened)
    """
    try:
        with open(pdf_path, "rb") as fh:
            head = fh.read(chunk_size)
    except OSError:
        # A missing or unreadable file fails the integrity check;
        # only I/O errors are swallowed, not arbitrary exceptions.
        return False

    # latin-1 maps every byte to exactly one character, so decoding can
    # never raise; all markers searched for below are plain ASCII.
    text = head.decode("latin-1")

    if "<</Names <</Dests 4 0 R>>" in text and "SourceModified" in text:
        return False
    if "%PDF" in text:
        # The PDF magic number beats any other heuristic.
        return True
    # "doctypehtml" marks an HTML page masquerading as the attachment.
    return "doctypehtml" not in text
+
     def _push_oss_from_local(self, filename, filetype, url, **kwargs):
         """
         上传本地文件到oss
@@ -214,6 +248,12 @@ class AttachmentDownloader:
             "filename": "{}.{}".format(filename, filetype),
             "org_url": url
         }
+
+        if kwargs.get('is_check', None):
+            if not self.read_pdf_in_chunks(file):
+                self.remove(file)
+                return attachment
+
         if len(stream) > 0:
             content_hash = tools.get_sha1(stream)
             try: