Browse Source

添加附件完整性检查

dongzhaorui 7 months ago
parent
commit
32b1f486b6
1 changed file with 60 additions and 20 deletions
  1. 60 20
      FworkSpider/untils/attachment.py

+ 60 - 20
FworkSpider/untils/attachment.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-Created on 2020-09-06
+Created on 2024-02-26
 ---------
 @summary: 附件下载模块
 ---------
@@ -122,54 +122,73 @@ class AttachmentDownloader:
         request_kwargs.setdefault("data", kwargs.pop("data", None))
         request_kwargs.setdefault("json", kwargs.pop("json", None))
         request_kwargs.setdefault("cookies", kwargs.pop("cookies", None))
-        request_kwargs.setdefault("timeout", kwargs.pop("timeout", 60))
+        request_kwargs.setdefault("timeout", kwargs.pop("timeout", (60,120)))
         request_kwargs.setdefault("stream", kwargs.pop("stream", True))
         request_kwargs.setdefault("verify", kwargs.pop("verify", False))
         request_kwargs.setdefault("allow_redirects", kwargs.pop("allow_redirects", True))
 
+        stream = io.BytesIO()
         retries = 0
         while retries < 3:
             try:
                 with requests.request(method, url, **request_kwargs) as req:
-                    stream = io.BytesIO()
-                    lh = {k.lower(): v for k, v in req.headers.items()}
-                    cl = lh.get("content-length") or len(req.content)  # 内容长度
-                    icl = int(cl)
-                    content_length = self.calculate_size(icl)
-                    if content_length > 50:
-                        # 丢弃超过50Mb内容长度的文件
-                        return stream.getvalue()
-
-                    if req.status_code != 200:
-                        retries += 1
-                        continue
-
-                    iter_content = req.iter_content(chunk_size=1024 * 20)
+                    req.raise_for_status()
+
+                    lower_headers = {k.lower(): v for k, v in req.headers.items()}
+                    content_length = lower_headers.get('content-length')
+                    if content_length is not None:
+                        content_length = self.calculate_size(int(content_length))
+                        if content_length > 50:
+                            # 丢弃超过50Mb内容长度的文件
+                            return stream.getvalue()
+                    else:
+                        content_length = None
+
+                    chunk_size = 1024 * 20  # 20KB chunks
+                    downloaded_size = 0
                     with tqdm.tqdm(
-                            total=icl,
+                            total=content_length,
                             unit="B",
                             initial=0,
                             unit_scale=True,
                             unit_divisor=1024,  # 1M=1024Kb,单位换算
                             ascii=True,
                             desc=file) as bar:
+
+                        iter_content = req.iter_content(chunk_size=chunk_size)
                         if file is not None:
                             with open(file, "wb") as f:
                                 for chunk in iter_content:
-                                    stream.write(chunk)
-                                    size = f.write(chunk)
+                                    size = stream.write(chunk)
+                                    f.write(chunk)
                                     bar.update(size)
+                                    downloaded_size += size
+                                    content_length = self.calculate_size(downloaded_size)
+                                    if content_length > 50:
+                                        stream.truncate(0)  # 截断流,保留前0个字节,即清空流
+                                        stream.seek(0)  # 将位置指针移回流的开始处
+                                        break
                         else:
                             for chunk in iter_content:
                                 size = stream.write(chunk)
                                 bar.update(size)
+                                downloaded_size += size
+                                content_length = self.calculate_size(downloaded_size)
+                                if content_length > 50:
+                                    stream.truncate(0)  # 截断流,保留前0个字节,即清空流
+                                    stream.seek(0)  # 将位置指针移回流的开始处
+                                    break
+
                     return stream.getvalue()
+
             except requests.RequestException as why:
+                stream.truncate(0)  # 截断流,保留前0个字节,即清空流
+                stream.seek(0)  # 将位置指针移回流的开始处
                 retries += 1
                 if show_error_log:
                     logger.exception(why)
 
-        return b''
+        return stream.getvalue()
 
     def _push_oss_from_stream(self, filename, filetype, url, **kwargs):
         """
@@ -199,6 +218,21 @@ class AttachmentDownloader:
 
         return attachment
 
def read_pdf_in_chunks(self, pdf_path, chunk_size=1024):
    """Heuristically check that a downloaded attachment is a usable PDF.

    Only the first ``chunk_size`` bytes are read and matched against
    cheap ASCII markers:

    * both ``"<</Names <</Dests 4 0 R>>"`` and ``"SourceModified"``
      present -> known-bad template file -> ``False``
    * ``"%PDF"`` magic marker present -> genuine PDF -> ``True``
    * ``"doctypehtml"`` present -> an HTML error page was saved in
      place of the attachment -> ``False``
    * anything else is assumed valid -> ``True``

    :param pdf_path: path of the file to inspect
    :param chunk_size: number of leading bytes to examine (default 1024)
    :return: ``True`` if the file looks like a real PDF, otherwise
             ``False`` (including when the file cannot be opened)
    """
    try:
        with open(pdf_path, "rb") as fh:
            head = fh.read(chunk_size)
    except OSError:
        # A missing or unreadable file fails the integrity check;
        # only I/O errors are swallowed, not arbitrary exceptions.
        return False

    # latin-1 maps every byte to exactly one character, so decoding can
    # never raise; all markers searched for below are plain ASCII.
    text = head.decode("latin-1")

    if "<</Names <</Dests 4 0 R>>" in text and "SourceModified" in text:
        return False
    if "%PDF" in text:
        # The PDF magic number beats any other heuristic.
        return True
    # "doctypehtml" marks an HTML page masquerading as the attachment.
    return "doctypehtml" not in text
+
     def _push_oss_from_local(self, filename, filetype, url, **kwargs):
         """
         上传本地文件到oss
@@ -214,6 +248,12 @@ class AttachmentDownloader:
             "filename": "{}.{}".format(filename, filetype),
             "org_url": url
         }
+
+        if kwargs.get('is_check', None):
+            if not self.read_pdf_in_chunks(file):
+                self.remove(file)
+                return attachment
+
         if len(stream) > 0:
             content_hash = tools.get_sha1(stream)
             try: