Bladeren bron

元博网 - 详情页采集爬虫异常下载错误修复

dongzhaorui 3 jaren geleden
bovenliggende
commit
39283b07da
1 gewijzigde bestanden met toevoegingen van 16 en 8 verwijderingen
  1. ybw/detail_spider.py (+16 −8)

+ 16 - 8
ybw/detail_spider.py

@@ -10,7 +10,7 @@ from crawler.clean_html import cleaner
 from crawler.crawl_scheduler import Scheduler
 from crawler.login import login, load_login_cookies, login_check
 from utils.databases import mongo_table, int2long
-from utils.execptions import CrawlError, YbwCrawlError
+from utils.execptions import YbwCrawlError
 from utils.log import logger
 from utils.socks5 import Proxy
 
@@ -89,7 +89,8 @@ class DetailSpider:
         update = {'crawl': False}
         self._update_crawl_task(task['_id'], **update)
 
-    def crawl_request(self, url):
+    def crawl_request(self, item: dict):
+        url = item['competehref']
         headers = {
             'Host': 'www.chinabidding.cn',
             'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
@@ -109,8 +110,14 @@ class DetailSpider:
         request_params.setdefault('timeout', 60)
 
         retries = 0
+        retries_502, max_retries_502 = 0, 15
         proxy, proxies = None, None
         while retries < 3:
+            if retries_502 > max_retries_502:
+                # 网站已移除该数据
+                self._update_crawl_task(item['_id'], crawl_status='remove')
+                break
+
             login_cookies = load_login_cookies(self.user.phone)
             if login_cookies is None:
                 login(*self.user)
@@ -122,7 +129,7 @@ class DetailSpider:
 
             try:
                 r = requests.get(url, **request_params)
-                '''账号登录状态检查'''
+                # 账号登录状态检查
                 retry_login = login_check(self.user.phone, url, False)
                 if retry_login:
                     logger.info(f"[重新登录]{self.user.phone}")
@@ -140,7 +147,9 @@ class DetailSpider:
                 element = fromstring(r.text)
                 nodes = element.xpath('//*[@id="main_dom"]/div[1]')
                 if len(nodes) != 1:
-                    raise CrawlError(code=10021, reason=f'"main_dom"属性匹配个数:{len(nodes)}')
+                    retries_502 += 1
+                    logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
+                    continue
                 else:
                     node = nodes[0]
                     logger.info(f'[采集正文] id={node.attrib.get("id")}')
@@ -204,8 +213,7 @@ class DetailSpider:
             try:
                 # 检查请求采集任务
                 CheckTask(item)
-                url = item['competehref']
-                response = self.crawl_request(url)
+                response = self.crawl_request(item)
                 if response is not None:
                     self.crawl_response(response, item)
                     self._update_crawl_task(item["_id"], crawl_status='finished')
@@ -232,10 +240,10 @@ class DetailSpider:
                     self.user = scheduler.user
                     finished = self.crawl_spider(scheduler)
                     if finished:
-                        '''完成采集任务'''
+                        # 完成采集任务
                         scheduler.finished()
                     else:
-                        '''暂无采集任务'''
+                        # 暂无采集任务
                         scheduler.wait_for_next_task()