
元博网 - fix abnormal download errors in the detail-page crawler

dongzhaorui · 3 years ago · parent commit ea2150c4c2
1 changed file with 23 additions and 25 deletions

ybw/detail_spider.py (+23, -25)
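Note: the diff below raises CrawlError(code=10021, reason=...) and later branches on e.code == 10105, so the project's exception types evidently carry a numeric code plus a human-readable reason. The classes themselves are not part of this commit; a minimal sketch of what they might look like, with the class bodies and the meaning of 10105 assumed rather than taken from the repository:

    # Hypothetical sketch; not the repository's actual definitions.
    class YbwCrawlError(Exception):
        """Base error for the ybw spiders, carrying a code and a reason."""

        def __init__(self, code=10000, reason=''):  # default code is an assumption
            self.code = code      # e.g. 10105 = duplicate found via es lookup (assumed)
            self.reason = reason  # human-readable description for logs
            super().__init__(f'[{code}] {reason}')

    class CrawlError(YbwCrawlError):
        """Raised while downloading/parsing a detail page, e.g. code=10021."""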

@@ -78,6 +78,17 @@ class DetailSpider:
         self.save_tab = mongo_table(db, save_tab)
         self.user = None
 
+    def _update_crawl_task(self, tid, **kwargs):
+        self.crawl_tab.update_one({'_id': tid}, {'$set': kwargs})
+
+    def _lock_task(self, task: dict):
+        update = {'crawl': True}
+        self._update_crawl_task(task['_id'], **update)
+
+    def _release_task(self, task: dict):
+        update = {'crawl': False}
+        self._update_crawl_task(task['_id'], **update)
+
     def crawl_request(self, url):
         headers = {
             'Host': 'www.chinabidding.cn',
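The _update_crawl_task/_lock_task/_release_task helpers added above consolidate the hand-built update_one({'_id': ...}, {'$set': ...}) calls that the hunks below remove. A quick before/after sketch of the equivalence (the item values are illustrative):

    # Before: each call site inside DetailSpider assembled the $set document by hand.
    self.crawl_tab.update_one({'_id': item['_id']}, {'$set': {'crawl_status': 'finished'}})

    # After: keyword arguments are forwarded into a single $set document,
    # so locking, releasing and status updates share one code path.
    self._update_crawl_task(item['_id'], crawl_status='finished')
    self._update_crawl_task(item['_id'], crawl=True, count=3)  # several fields at once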
@@ -129,7 +140,7 @@ class DetailSpider:
                 element = fromstring(r.text)
                 nodes = element.xpath('//*[@id="main_dom"]/div[1]')
                 if len(nodes) != 1:
-                    raise CrawlError
+                    raise CrawlError(code=10021, reason=f'"main_dom" node match count: {len(nodes)}')
                 else:
                     node = nodes[0]
                     logger.info(f'[crawl body] id={node.attrib.get("id")}')
@@ -178,54 +189,41 @@ class DetailSpider:
             pass
         logger.info('[crawl success]{}-{}'.format(item['title'], item['publishtime']))
 
-    def update_crawl_status(self, item: dict, status: bool):
-        self.crawl_tab.update_one(
-            {'_id': item['_id']},
-            {'$set': {'crawl': status}}
-        )
-
     def crawl_spider(self, sc: Scheduler):
         while True:
+            next_task_interval = None
             if sc.count >= sc.total:
                 return True
             item = sc.crawl_task
             if len(item) == 0:
                 return False
-            self.update_crawl_status(item, True)
-            '''use the scheduler to record crawl context; on error, write the error to the database'''
+            self._lock_task(item)
+            # record the spider code and source url of crawl errors
             sc.spider_code = item['spidercode']
             sc.crawl_url = item['competehref']
             try:
-                '''check the crawl request task'''
+                # validate the crawl task before requesting
                 CheckTask(item)
                 url = item['competehref']
                 response = self.crawl_request(url)
                 if response is not None:
                     self.crawl_response(response, item)
-                    self.crawl_tab.update_one(
-                        {"_id": item["_id"]},
-                        {'$set': {'crawl_status': 'finished'}}
-                    )
+                    self._update_crawl_task(item["_id"], crawl_status='finished')
                     sc.crawl_counter(1)
             except YbwCrawlError as e:
                 if e.code == 10105:
-                    '''when this exception is detected, the program updates the crawl table with the es query result'''
-                    self.crawl_tab.update_one(
-                        {"_id": item["_id"]},
-                        {'$set': {'count': item['count']}}
-                    )
+                    # on this exception, update the crawl table with the es query count
+                    self._update_crawl_task(item["_id"], count=item['count'])
                     logger.info('[duplicate data]{}-{}'.format(item['title'], item['publishtime']))
                 else:
                     sc.err_record(e)
-                    self.crawl_tab.update_one(
-                        {"_id": item["_id"]},
-                        {'$set': {'crawl_status': 'error'}}
-                    )
+                    self._update_crawl_task(item["_id"], crawl_status='error')
                     logger.info('[problem data]{}-{}'.format(item['title'], item['publishtime']))
                 sc.crawl_counter(0)
+                next_task_interval = 0.1
             finally:
-                self.update_crawl_status(item, False)
-                sc.wait_for_next_task()
+                self._release_task(item)
+                sc.wait_for_next_task(next_task_interval)
 
     def start(self):
         while True:
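The final hunk also threads a next_task_interval through to sc.wait_for_next_task(...): it stays None on success and becomes 0.1 after a handled YbwCrawlError, so failed tasks are recycled quickly instead of waiting out the normal pause. The Scheduler class is outside this diff; a minimal sketch of the updated method under that assumption:

    import time
    from typing import Optional

    class Scheduler:
        DEFAULT_INTERVAL = 2.0  # assumed default pause between tasks

        def wait_for_next_task(self, interval: Optional[float] = None):
            # Sleep for the caller-supplied interval when given (e.g. 0.1
            # after an error); otherwise fall back to the default pacing.
            time.sleep(interval if interval is not None else self.DEFAULT_INTERVAL)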