中国招标与采购网 (China Bidding & Procurement Network) - update streaming download

dongzhaorui committed 3 years ago
parent commit 879aefc505
3 changed files with 40 additions and 39 deletions
  1. +5 -6   zbytb/crawler/spiders/DetailPageSpider.py
  2. +0 -1   zbytb/requirements.txt
  3. +35 -32 zbytb/utils/attachment.py

+ 5 - 6
zbytb/crawler/spiders/DetailPageSpider.py

@@ -111,7 +111,6 @@ class CrawlDetailPageSpider:
         logger.error(err_msg)
 
     def download_attachment(self, content: str, rows: dict):
-        logger.info('>>> 下载附件')
         index = 0
         attachments = {}
         soup = BeautifulSoup(content, "lxml")
@@ -155,7 +154,6 @@ class CrawlDetailPageSpider:
             rows["projectinfo"] = {"attachments": attachments}
 
     def save_data(self, content, rows: dict):
-        logger.info('>>> 保存数据')
         rows["contenthtml"] = clean_js(content)
         special = {
             '<iframe[^<>]*>[\s\S]*?</iframe>': ''
@@ -175,7 +173,6 @@ class CrawlDetailPageSpider:
         logger.info("[采集成功]{}-{}".format(rows['title'], rows['publishtime']))
 
     def crawl_response(self, response, rows: dict):
-        logger.info('>>> 采集响应')
         source = re.findall(r'Inner(.*?);Inner', response.text)
         if len(source) > 0:
             content = source[0][13:-1]
@@ -208,7 +205,6 @@ class CrawlDetailPageSpider:
         return counter
 
     def crawl_request(self, url: str, referer: str, user: User):
-        logger.info('>>> 采集请求')
         headers = {
             'Host': 'www.zbytb.com',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
@@ -251,7 +247,7 @@ class CrawlDetailPageSpider:
             item = sc.crawl_task
             if len(item) == 0:
                 return False
-
+            logger.info(f">>> {item['title']} - {item['competehref']}")
             self._lock_task(item)
             sc.spider_code = self.spider_code = item['spidercode']
             sc.crawl_url = item['competehref']
@@ -273,7 +269,10 @@ class CrawlDetailPageSpider:
                     sc.crawl_counter(num)
                 next_task_interval = 10
             except (ZbYTbCrawlError, Exception) as e:
-                if getattr(e, 'code', None) == 10105:
+                if getattr(e, 'code', None) is None:
+                    err = ZbYTbCrawlError(unknown_err=e)
+                    sc.err_record(err)
+                elif e.code == 10105:
                     # When an exception is raised, refresh the task with the ES query count
                     self._update_crawl_task(item["_id"], count=item['count'])
                 else:

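The notable change in this file is the exception handler: exceptions that carry no `code` attribute (i.e. anything that is not a deliberate ZbYTbCrawlError) are now wrapped and recorded instead of falling into the crawl-error branches. A minimal sketch of that dispatch, with a stub standing in for the project's ZbYTbCrawlError (its constructor signature and default code are assumptions of this sketch):

class ZbYTbCrawlError(Exception):
    # Stub mirroring the project's error type; the real signature is assumed.
    def __init__(self, code=10000, unknown_err=None):
        self.code = code
        self.unknown_err = unknown_err
        super().__init__(f"code={code}, unknown_err={unknown_err!r}")

def handle_crawl_error(e, record, update_count):
    if getattr(e, 'code', None) is None:
        # Raised outside the crawler: wrap it so err_record sees one type.
        record(ZbYTbCrawlError(unknown_err=e))
    elif e.code == 10105:
        # 10105: refresh the task with the latest ES query count.
        update_count()
    else:
        record(e)
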
+ 0 - 1
zbytb/requirements.txt

@@ -15,7 +15,6 @@ crcmod==1.7
 cryptography==35.0.0
 cssselect==1.1.0
 DBUtils==2.0.2
-ddddocr==1.1.0
 decorator==5.1.0
 EditorConfig==0.12.3
 elasticsearch==7.10.1

+ 35 - 32
zbytb/utils/attachment.py

@@ -36,34 +36,49 @@ class AttachmentDownloader(AliYunService):
         return "{}.{}".format(fid, filetype)
 
     @staticmethod
-    def _file_size(file: str):
-        _kb = float(getsize(file)) / 1024
-        if _kb >= 1024:
-            _M = _kb / 1024
+    def _calc_size(size: float):
+        if size >= 1024:
+            _M = size / 1024
             if _M >= 1024:
                 _G = _M / 1024
                 return "{:.1f} G".format(_G)
             else:
                 return "{:.1f} M".format(_M)
         else:
-            return "{:.1f} kb".format(_kb)
+            return "{:.1f} kb".format(size)
 
-    @staticmethod
-    def get_stream(file, response):
+    def _file_size(self, file):
+        _kb = float(getsize(file)) / 1024
+        return self._calc_size(_kb)
+
+    def stream_download(self, file_path, response):
         stream = io.BytesIO()
-        chunk_size = 1024  # max size per single request
-        with open(file, 'wb') as f:
+        if 'content-length' in response.headers:
+            content_size = int(response.headers['content-length'])  # total body size
+        else:
+            content_size = len(response.content)
+        size = self._calc_size(float(content_size) / 1024)
+        logger.info(f'>>> 下载附件:{file_path} - 大小:{size}')
+
+        chunk_size = 1024  # max chunk size per read
+        data_chunk_size = 0  # bytes downloaded so far
+        with open(file_path, 'wb') as f:
             for data in response.iter_content(chunk_size=chunk_size):
                 stream.write(data)
                 f.write(data)
+                data_chunk_size += len(data)
+                percent = (data_chunk_size / content_size) * 100
+                print("\r文件下载进度:%d%%(%d/%d) - %s" % (
+                    percent, data_chunk_size, content_size, file_path), end=" ")
         return stream.getvalue()
 
-    def _download(
+    def _fetch_file(
             self,
+            method: str,
             url: str,
             file: str,
-            enable_proxy=False,
-            allow_show_exception=False,
+            enable_proxy: bool,
+            allow_show_exception: bool,
             **kwargs
     ):
         request_params = {}
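
This hunk splits the old _file_size into _calc_size (pure KB-to-human-readable formatting) plus a thin wrapper, and replaces get_stream with stream_download, which tees every chunk to disk and to an in-memory buffer while printing progress. Two caveats: iter_content only truly streams when the request was made with stream=True, and the percent calculation divides by content_size, which can be zero. A self-contained sketch of the same pattern with a zero-total guard added (the guard and the stream_to_file name belong to this sketch, not the commit):

import io
import requests

def stream_to_file(url: str, file_path: str, chunk_size: int = 1024) -> bytes:
    with requests.get(url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        # content-length is absent on chunked responses; default to 0.
        total = int(resp.headers.get('content-length', 0))
        stream = io.BytesIO()
        done = 0
        with open(file_path, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                stream.write(chunk)  # in-memory copy for the caller
                f.write(chunk)       # persistent copy on disk
                done += len(chunk)
                if total:  # skip progress output when the size is unknown
                    print(f"\rprogress: {done * 100 // total}% ({done}/{total})", end=" ")
    return stream.getvalue()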
@@ -76,20 +91,11 @@ class AttachmentDownloader(AliYunService):
         retries = 0
         while retries < 3:
             try:
-                with requests.get(url, **request_params) as req:
-                    for req_kw in req_keywords:
-                        if req_kw in req.text:
-                            with requests.post(url, **request_params) as req:
-                                if req.status_code == 200:
-                                    return self.get_stream(file, req)
-                                else:
-                                    retries += 1
+                with requests.request(method, url, **request_params) as req:
+                    if req.status_code == 200:
+                        return self.stream_download(file, req)
                     else:
-                        if req.status_code == 200:
-                            return self.get_stream(file, req)
-                        else:
-                            retries += 1
-
+                        retries += 1
             except requests.RequestException:
                 if allow_show_exception:
                     traceback.print_exc()
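
The retry hunk collapses the old probe logic (issue a GET, scan the body for keywords, conditionally re-request via POST) into one requests.request(method, ...) call inside the existing three-attempt loop. A minimal sketch of the resulting shape; fetch_with_retries and the empty-bytes fallback are assumptions, since the hunk elides what _fetch_file returns after exhausting retries:

import traceback
import requests

def fetch_with_retries(method: str, url: str, max_retries: int = 3, **request_params) -> bytes:
    retries = 0
    while retries < max_retries:
        try:
            # requests.request dispatches on the verb string, so the caller
            # picks GET or POST up front instead of probing with GET first.
            with requests.request(method, url, **request_params) as resp:
                if resp.status_code == 200:
                    return resp.content  # the real code streams via stream_download
                retries += 1  # non-200: count the attempt and retry
        except requests.RequestException:
            traceback.print_exc()
            retries += 1
    return b''  # assumed fallback; not shown in the hunk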
@@ -110,19 +116,17 @@ class AttachmentDownloader(AliYunService):
     ):
         if not file_name or not file_type or not download_url:
             raise AttachmentError
-
+        request_method = kwargs.pop('method', 'get')
         file_type = file_type.strip()
-
-        file_name = clean_file_name(file_name,file_type)
-
+        file_name = clean_file_name(file_name, file_type)
         download_url = judge_file_url(download_url)
 
         for app_param in modify_file_url_list:
             download_url = app_param(download_url)
 
         local_tmp_file = self._create_file(file_name, file_type)
-
-        file_stream = self._download(
+        file_stream = self._fetch_file(
+            request_method,
             download_url,
             local_tmp_file,
             enable_proxy,
@@ -154,4 +158,3 @@ class AttachmentDownloader(AliYunService):
             return result
         else:
             return {}
-
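
With the method keyword popped in download and threaded through _fetch_file, callers can now opt into POST per attachment. A hedged usage sketch; the full download() signature is elided from this diff, so the keyword names below are taken from the visible validation checks and the values are illustrative:

downloader = AttachmentDownloader()
attachment = downloader.download(
    file_name='tender_notice',     # hypothetical example values
    file_type='pdf',
    download_url='https://www.zbytb.com/example.pdf',
    method='post',                 # popped from kwargs; defaults to 'get'
)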