
中国招标与采购网 (zbytb): handle attachment URLs embedded in the announcement body

lizongze · 2 years ago
commit d40824f3a7
1 file changed, 53 insertions and 39 deletions

zbytb/crawler/spiders/DetailPageSpider.py (+53, -39)

@@ -150,6 +150,20 @@ class CrawlDetailPageSpider:
                         attachments[str(index + 1)] = attachment
                         index += 1
 
+        file_url = soup.findAll('pdfpath')
+        if file_url:
+            file_url = list(file_url[0].stripped_strings)[0]
+            file_type = extract_file_type(file_url)
+            file_name = rows['title']
+            if file_type:
+                attachment2 = self.attachment_downloader.download(
+                    file_name=file_name,
+                    file_type=file_type,
+                    download_url=file_url,
+                )
+                if len(attachment2) > 0:
+                    attachments[str(len(attachments) + 1)] = attachment2
+
         if len(attachments) > 0:
             rows["projectinfo"] = {"attachments": attachments}
 
@@ -254,47 +268,47 @@ class CrawlDetailPageSpider:
     def crawl_spider(self, sc: Scheduler):
         while True:
             next_task_interval = None
-            # item = sc.crawl_task
-            # if len(item) == 0:
-            #     return False
-            item_list = mongo_table('py_spider', 'zbytb_list').find({"count": 0, "crawl": False, "crawl_status": {"$exists": False}}).limit(1).sort("_id",1)
-            for item in item_list:
-                logger.info(f">>> {item['title']} - {item['competehref']}")
-                self._lock_task(item)
-                sc.spider_code = self.spider_code = item['spidercode']
-                sc.crawl_url = item['competehref']
+            item = sc.crawl_task
+            if len(item) == 0:
+                return False
+            # item_list = mongo_table('py_spider', 'zbytb_list').find({"count": 0, "crawl": False, "crawl_status": {"$exists": False}}).limit(1).sort("_id",1)
+            # for item in item_list:
+            logger.info(f">>> {item['title']} - {item['competehref']}")
+            self._lock_task(item)
+            sc.spider_code = self.spider_code = item['spidercode']
+            sc.crawl_url = item['competehref']
 
-                # assign the account and its cookies
-                self.account = item.get('account', sc.user.username)
-                self.cookies = load_login_cookies(self.account)
-                user = sc.query_user(self.account)
-                if user is None:
-                    return False
+            # assign the account and its cookies
+            self.account = item.get('account', sc.user.username)
+            self.cookies = load_login_cookies(self.account)
+            user = sc.query_user(self.account)
+            if user is None:
+                return False
 
-                try:
-                    # CheckTask(item)
-                    url = self.prepare_url(item)
-                    referer = item['competehref']
-                    response = self.crawl_request(url, referer, user)
-                    if response is not None:
-                        num = self.crawl_response(response, item)
-                        sc.crawl_counter(num)
-                    next_task_interval = 10
-                except (ZbYTbCrawlError, Exception) as e:
-                    if getattr(e, 'code', None) is None:
-                        err = ZbYTbCrawlError(unknown_err=e)
-                        sc.err_record(err)
-                    elif e.code == 10105:
-                        # on this exception, update the task with the ES query statistics
-                        self._update_crawl_task(item["_id"], count=item['count'])
-                    else:
-                        sc.err_record(e)
-                        self._update_crawl_task(item["_id"], crawl_status='error')
-                    sc.crawl_counter(0)
-                    next_task_interval = 0.1
-                finally:
-                    self._release_task(item)
-                    sc.wait_for_next_task(next_task_interval)
+            try:
+                CheckTask(item)
+                url = self.prepare_url(item)
+                referer = item['competehref']
+                response = self.crawl_request(url, referer, user)
+                if response is not None:
+                    num = self.crawl_response(response, item)
+                    sc.crawl_counter(num)
+                next_task_interval = 10
+            except (ZbYTbCrawlError, Exception) as e:
+                if getattr(e, 'code', None) is None:
+                    err = ZbYTbCrawlError(unknown_err=e)
+                    sc.err_record(err)
+                elif e.code == 10105:
+                    # on this exception, update the task with the ES query statistics
+                    self._update_crawl_task(item["_id"], count=item['count'])
+                else:
+                    sc.err_record(e)
+                    self._update_crawl_task(item["_id"], crawl_status='error')
+                sc.crawl_counter(0)
+                next_task_interval = 0.1
+            finally:
+                self._release_task(item)
+                sc.wait_for_next_task(next_task_interval)
 
     def start(self):
         while True:
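
Note on the restored dispatch in crawl_spider's except branch: exceptions
without a code attribute are wrapped before recording, code 10105 appears to
mean "write the ES-derived count back to the task", and every other coded
failure is recorded and the task flagged as an error. A minimal sketch of that
pattern, assuming a ZbYTbCrawlError with an optional code attribute (the class
body here is a guessed shape, not the repo's definition):

    class ZbYTbCrawlError(Exception):
        # illustrative stand-in: carries an optional numeric error code
        def __init__(self, code=None, unknown_err=None):
            super().__init__(f'code={code} err={unknown_err!r}')
            self.code = code
            self.unknown_err = unknown_err

    def dispatch(e, item, err_record, update_task):
        if getattr(e, 'code', None) is None:
            # plain exceptions are wrapped so they always reach err_record
            err_record(ZbYTbCrawlError(unknown_err=e))
        elif e.code == 10105:
            # assumed meaning from the diff: refresh the task's count
            # with the ES query statistics, nothing else
            update_task(item['_id'], count=item['count'])
        else:
            # any other coded failure is recorded and the task marked failed
            err_record(e)
            update_task(item['_id'], crawl_status='error')

    # usage with throwaway callbacks:
    dispatch(ValueError('boom'), {'_id': 1, 'count': 0},
             err_record=print, update_task=lambda _id, **kw: print(_id, kw))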