dongzhaorui@topnet.net.cn cách đây 3 năm
mục cha
commit
62fe0176c6
1 tập tin đã thay đổi với 12 bổ sung, 12 xóa
  1. 12 12
      zbytb/crawler/spiders/DetailPageSpider.py

+ 12 - 12
zbytb/crawler/spiders/DetailPageSpider.py

@@ -41,15 +41,15 @@ class CrawlDetailPageSpider:
         self.senior_account = 'runhekeji'
 
     @staticmethod
-    def select_user(rows: dict, sc: Scheduler):
+    def select_user(rows: dict, username):
         """
         选择用户账号,并在采集内容中添加用户账号
 
         :param rows: 采集内容
-        :param sc: 采集账号任务分配调度器
+        :param username: 采集账号
         :return: 用户账号和账号cookie
         """
-        account = rows.get('account', sc.user.username)
+        account = rows.get('account', username)
         rows.update({'account': account})
         return account, load_login_cookies(account)
 
@@ -244,7 +244,7 @@ class CrawlDetailPageSpider:
             {'$set': {'crawl': status}}
         )
 
-    def crawl_spider(self, rows: dict, sc: Scheduler):
+    def crawl_spider(self, rows: dict, user, account, cookies):
         headers = {
             'Host': 'www.zbytb.com',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
@@ -254,20 +254,16 @@ class CrawlDetailPageSpider:
         }
         headers.update({'Referer': rows['competehref']})
         url = self.prepare_url(rows)
-        account, cookies = self.select_user(rows, sc)
-        user = sc.query_user(account)
         success, response = self.crawl_request(user, url, headers, cookies)
         print(rows['competehref'])
         if success:
             self.crawl_success(response, rows)
-            sc.update_count(1)
         else:
             self.crawl_error(
                 spider_code=rows['spidercode'],
                 account=account,
                 response=response
             )
-            sc.update_count(0)
 
     def _spider(self, sc: Scheduler):
         while True:
@@ -279,8 +275,13 @@ class CrawlDetailPageSpider:
             sc.crawl_url = item['competehref']
             try:
                 CheckTask(item)
-                self.crawl_spider(item, sc)
+                account, cookies = self.select_user(item, sc.user.username)
+                user = sc.query_user(account)
+                if user is None:
+                    return False
+                self.crawl_spider(item, user, account, cookies)
                 self.update_crawl_status(item, False)
+                sc.crawl_counter(1)
                 sc.wait_for_next_task(10)
             except JyBasicException as e:
                 if e.code == 10105:
@@ -296,12 +297,11 @@ class CrawlDetailPageSpider:
                         {'$set': {'crawl_status': 'error'}}
                     )
                 self.update_crawl_status(item, False)
+                sc.crawl_counter(0)
 
     def start(self):
-        query = {'used': False, 'site': '中国招标与采购网'}
         while True:
-            with Scheduler(query) as scheduler:
-                scheduler.crawl_type = 'detail'
+            with Scheduler(site='中国招标与采购网', crawl_type='detail') as scheduler:
                 if scheduler.crawl_start:
                     finished = self._spider(scheduler)
                     if not finished: