中国招标与采购网 (China Bidding & Procurement Network) - update streaming download

dongzhaorui committed 3 years ago
parent commit 879aefc505
3 changed files with 40 additions and 39 deletions
  1. +5 -6   zbytb/crawler/spiders/DetailPageSpider.py
  2. +0 -1   zbytb/requirements.txt
  3. +35 -32 zbytb/utils/attachment.py

+ 5 - 6
zbytb/crawler/spiders/DetailPageSpider.py

@@ -111,7 +111,6 @@ class CrawlDetailPageSpider:
         logger.error(err_msg)
 
     def download_attachment(self, content: str, rows: dict):
-        logger.info('>>> 下载附件')
         index = 0
         attachments = {}
         soup = BeautifulSoup(content, "lxml")
@@ -155,7 +154,6 @@ class CrawlDetailPageSpider:
             rows["projectinfo"] = {"attachments": attachments}
 
     def save_data(self, content, rows: dict):
-        logger.info('>>> 保存数据')
         rows["contenthtml"] = clean_js(content)
         special = {
             '<iframe[^<>]*>[\s\S]*?</iframe>': ''
@@ -175,7 +173,6 @@ class CrawlDetailPageSpider:
         logger.info("[采集成功]{}-{}".format(rows['title'], rows['publishtime']))
 
     def crawl_response(self, response, rows: dict):
-        logger.info('>>> 采集响应')
         source = re.findall(r'Inner(.*?);Inner', response.text)
         if len(source) > 0:
             content = source[0][13:-1]
@@ -208,7 +205,6 @@ class CrawlDetailPageSpider:
         return counter
 
     def crawl_request(self, url: str, referer: str, user: User):
-        logger.info('>>> 采集请求')
         headers = {
             'Host': 'www.zbytb.com',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
@@ -251,7 +247,7 @@ class CrawlDetailPageSpider:
             item = sc.crawl_task
             if len(item) == 0:
                 return False
-
+            logger.info(f">>> {item['title']} - {item['competehref']}")
             self._lock_task(item)
             sc.spider_code = self.spider_code = item['spidercode']
             sc.crawl_url = item['competehref']
@@ -273,7 +269,10 @@ class CrawlDetailPageSpider:
                     sc.crawl_counter(num)
                 next_task_interval = 10
             except (ZbYTbCrawlError, Exception) as e:
-                if getattr(e, 'code', None) == 10105:
+                if getattr(e, 'code', None) is None:
+                    err = ZbYTbCrawlError(unknown_err=e)
+                    sc.err_record(err)
+                elif e.code == 10105:
                     # When an exception is raised, refresh the task with the ES query count
                     self._update_crawl_task(item["_id"], count=item['count'])
                 else:

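The notable change in this file is the exception handler: exceptions that carry no `code` attribute (i.e. anything that is not a deliberate ZbYTbCrawlError) are now wrapped and recorded instead of falling into the crawl-error branches. A minimal sketch of that dispatch, with a stub standing in for the project's ZbYTbCrawlError (its constructor signature and default code are assumptions of this sketch):

class ZbYTbCrawlError(Exception):
    # Stub mirroring the project's error type; the real signature is assumed.
    def __init__(self, code=10000, unknown_err=None):
        self.code = code
        self.unknown_err = unknown_err
        super().__init__(f"code={code}, unknown_err={unknown_err!r}")

def handle_crawl_error(e, record, update_count):
    if getattr(e, 'code', None) is None:
        # Raised outside the crawler: wrap it so err_record sees one type.
        record(ZbYTbCrawlError(unknown_err=e))
    elif e.code == 10105:
        # 10105: refresh the task with the latest ES query count.
        update_count()
    else:
        record(e)
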
+ 0 - 1
zbytb/requirements.txt

@@ -15,7 +15,6 @@ crcmod==1.7
 cryptography==35.0.0
 cssselect==1.1.0
 DBUtils==2.0.2
-ddddocr==1.1.0
 decorator==5.1.0
 EditorConfig==0.12.3
 elasticsearch==7.10.1

+ 35 - 32
zbytb/utils/attachment.py

@@ -36,34 +36,49 @@ class AttachmentDownloader(AliYunService):
         return "{}.{}".format(fid, filetype)
 
     @staticmethod
-    def _file_size(file: str):
-        _kb = float(getsize(file)) / 1024
-        if _kb >= 1024:
-            _M = _kb / 1024
+    def _calc_size(size: float):
+        if size >= 1024:
+            _M = size / 1024
             if _M >= 1024:
                 _G = _M / 1024
                 return "{:.1f} G".format(_G)
             else:
                 return "{:.1f} M".format(_M)
         else:
-            return "{:.1f} kb".format(_kb)
+            return "{:.1f} kb".format(size)
 
-    @staticmethod
-    def get_stream(file, response):
+    def _file_size(self, file):
+        _kb = float(getsize(file)) / 1024
+        return self._calc_size(_kb)
+
+    def stream_download(self, file_path, response):
         stream = io.BytesIO()
-        chunk_size = 1024  # max size per single request
-        with open(file, 'wb') as f:
+        if 'content-length' in response.headers:
+            content_size = int(response.headers['content-length'])  # total body size
+        else:
+            content_size = len(response.content)
+        size = self._calc_size(float(content_size) / 1024)
+        logger.info(f'>>> 下载附件:{file_path} - 大小:{size}')
+
+        chunk_size = 1024  # max chunk size per read
+        data_chunk_size = 0  # bytes downloaded so far
+        with open(file_path, 'wb') as f:
             for data in response.iter_content(chunk_size=chunk_size):
                 stream.write(data)
                 f.write(data)
+                data_chunk_size += len(data)
+                percent = (data_chunk_size / content_size) * 100
+                print("\r文件下载进度:%d%%(%d/%d) - %s" % (
+                    percent, data_chunk_size, content_size, file_path), end=" ")
         return stream.getvalue()
 
-    def _download(
+    def _fetch_file(
             self,
+            method: str,
             url: str,
             file: str,
-            enable_proxy=False,
-            allow_show_exception=False,
+            enable_proxy: bool,
+            allow_show_exception: bool,
             **kwargs
     ):
         request_params = {}
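
This hunk splits the old _file_size into _calc_size (pure KB-to-human-readable formatting) plus a thin wrapper, and replaces get_stream with stream_download, which tees every chunk to disk and to an in-memory buffer while printing progress. Two caveats: iter_content only truly streams when the request was made with stream=True, and the percent calculation divides by content_size, which can be zero. A self-contained sketch of the same pattern with a zero-total guard added (the guard and the stream_to_file name belong to this sketch, not the commit):

import io
import requests

def stream_to_file(url: str, file_path: str, chunk_size: int = 1024) -> bytes:
    with requests.get(url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        # content-length is absent on chunked responses; default to 0.
        total = int(resp.headers.get('content-length', 0))
        stream = io.BytesIO()
        done = 0
        with open(file_path, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                stream.write(chunk)  # in-memory copy for the caller
                f.write(chunk)       # persistent copy on disk
                done += len(chunk)
                if total:  # skip progress output when the size is unknown
                    print(f"\rprogress: {done * 100 // total}% ({done}/{total})", end=" ")
    return stream.getvalue()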
@@ -76,20 +91,11 @@ class AttachmentDownloader(AliYunService):
         retries = 0
         while retries < 3:
             try:
-                with requests.get(url, **request_params) as req:
-                    for req_kw in req_keywords:
-                        if req_kw in req.text:
-                            with requests.post(url, **request_params) as req:
-                                if req.status_code == 200:
-                                    return self.get_stream(file, req)
-                                else:
-                                    retries += 1
+                with requests.request(method, url, **request_params) as req:
+                    if req.status_code == 200:
+                        return self.stream_download(file, req)
                     else:
-                        if req.status_code == 200:
-                            return self.get_stream(file, req)
-                        else:
-                            retries += 1
-
+                        retries += 1
             except requests.RequestException:
                 if allow_show_exception:
                     traceback.print_exc()
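
The retry hunk collapses the old probe logic (issue a GET, scan the body for keywords, conditionally re-request via POST) into one requests.request(method, ...) call inside the existing three-attempt loop. A minimal sketch of the resulting shape; fetch_with_retries and the empty-bytes fallback are assumptions, since the hunk elides what _fetch_file returns after exhausting retries:

import traceback
import requests

def fetch_with_retries(method: str, url: str, max_retries: int = 3, **request_params) -> bytes:
    retries = 0
    while retries < max_retries:
        try:
            # requests.request dispatches on the verb string, so the caller
            # picks GET or POST up front instead of probing with GET first.
            with requests.request(method, url, **request_params) as resp:
                if resp.status_code == 200:
                    return resp.content  # the real code streams via stream_download
                retries += 1  # non-200: count the attempt and retry
        except requests.RequestException:
            traceback.print_exc()
            retries += 1
    return b''  # assumed fallback; not shown in the hunk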
@@ -110,19 +116,17 @@ class AttachmentDownloader(AliYunService):
     ):
         if not file_name or not file_type or not download_url:
             raise AttachmentError
-
+        request_method = kwargs.pop('method', 'get')
         file_type = file_type.strip()
-
-        file_name = clean_file_name(file_name,file_type)
-
+        file_name = clean_file_name(file_name, file_type)
         download_url = judge_file_url(download_url)
 
         for app_param in modify_file_url_list:
             download_url = app_param(download_url)
 
         local_tmp_file = self._create_file(file_name, file_type)
-
-        file_stream = self._download(
+        file_stream = self._fetch_file(
+            request_method,
             download_url,
             local_tmp_file,
             enable_proxy,
@@ -154,4 +158,3 @@ class AttachmentDownloader(AliYunService):
             return result
         else:
             return {}
-
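
With the method keyword popped in download and threaded through _fetch_file, callers can now opt into POST per attachment. A hedged usage sketch; the full download() signature is elided from this diff, so the keyword names below are taken from the visible validation checks and the values are illustrative:

downloader = AttachmentDownloader()
attachment = downloader.download(
    file_name='tender_notice',     # hypothetical example values
    file_type='pdf',
    download_url='https://www.zbytb.com/example.pdf',
    method='post',                 # popped from kwargs; defaults to 'get'
)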