dongzhaorui@topnet.net.cn 3 years ago
commit 4deed6c298
1 changed file with 17 additions and 18 deletions

+ 17 - 18
ybw/list_spider.py

@@ -72,16 +72,15 @@ class ListSpider:
                             proxy.switch()
                         proxies = proxy.proxies
                         retries += 1
-                else:
-                    login_cookies = load_login_cookies(self.user.phone)
-                    request_params.update({'cookies': login_cookies})
+                login_cookies = load_login_cookies(self.user.phone)
+                request_params.update({'cookies': login_cookies})
             elif element.xpath('//*[@id="pages"]') and len(element.xpath(feature)) > 0:
                 return response
             else:
                 '''Page where the search returned no results'''
                 return None
 
-        raise VoidCrawlError(code=100020, reason='list page crawl exception')
+        raise VoidCrawlError(code=100020, reason='failed to access the list page')
 
     def crawl_response(self, response, menu: CrawlMenu):
         results = []
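
This hunk moves the cookie refresh out of the else branch, so the login cookies are reloaded on every retry instead of only when the proxy was left unchanged. A minimal stand-alone sketch of that retry pattern, with fetch and refresh_cookies as hypothetical stand-ins for the project's request call and load_login_cookies:

import random
import time

def fetch_with_retries(fetch, refresh_cookies, max_retries=3):
    """Retry loop that reloads credentials before every attempt."""
    for _ in range(max_retries):
        cookies = refresh_cookies()        # cookies are refreshed on each try
        response = fetch(cookies=cookies)
        if response is not None:
            return response
        time.sleep(random.uniform(1, 3))   # brief pause before the next attempt
    raise RuntimeError('failed to access the list page')
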
@@ -153,12 +152,14 @@ class ListSpider:
                     )
                     refer = previous_url
                     previous_url = url
+                print(">>> ", url)
                 sc.crawl_url = url
                 sc.spider_code = menu.spidercode
-                print(">>> ", url)
+                '''Attach identity (login) cookies'''
                 if crawl_total >= 4:
                     '''From page 4 onward, list data is only returned when logged in with a regular account'''
                     cookies = load_login_cookies(self.user.phone)
+                '''Data collection'''
                 try:
                     response = self.crawl_request(url, refer, cookies=cookies)
                     if response is None:
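
The comment in this hunk says list data from page 4 onward is only served to a logged-in regular account, which is why login cookies are attached only once crawl_total reaches 4. A tiny illustrative helper capturing that rule (hypothetical name, not part of the project):

def cookies_for_page(crawl_total, load_cookies):
    """Return login cookies only once the crawl has reached the fourth page."""
    if crawl_total >= 4:
        return load_cookies()
    return None
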
@@ -171,10 +172,16 @@ class ListSpider:
                         break
                     else:
                         crawl_total += 1
-                except JyBasicException as e:
+                except (JyBasicException, Exception) as e:
+                    logger.error('[crawl failed]{}-{}-page {}, error type: {}'.format(
+                        menu.channel,
+                        region_name,
+                        page,
+                        e.__class__.__name__,
+                    ))
                     sc.err_record(e)
-                    logger.info(f'[crawl failed]{menu.channel}-{region_name}-page {page}-0 items')
-                sc.wait_for_next_task(random.choice(range(2, 8)))
+                finally:
+                    sc.wait_for_next_task(random.choice(range(2, 6)))
             self.session.close()
 
     def start(self):
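
This hunk widens the except clause, logs the failing channel, region and page together with the exception class name, and moves the inter-task wait into a finally block so the crawler pauses after every page whether it succeeded or not. A condensed, self-contained sketch of the pattern, with crawl and on_error as placeholder callables:

import logging
import random
import time

logger = logging.getLogger(__name__)

def crawl_one_page(crawl, on_error):
    """Log failures by exception class, record them, and always pause afterwards."""
    try:
        return crawl()
    except Exception as e:
        logger.error('[crawl failed] error type: %s', e.__class__.__name__)
        on_error(e)
    finally:
        time.sleep(random.choice(range(2, 6)))  # 2-5 second wait, success or failure
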
@@ -183,16 +190,8 @@ class ListSpider:
                 scheduler.crawl_type = 'list'
                 if scheduler.crawl_start:
                     self.user = scheduler.user
-                    while True:
-                        try:
-                            self.crawl_spider(scheduler, menu)
-                            break
-                        except Exception as e:
-                            logger.error('crawl category: {} error type: {} '.format(
-                                menu.channel,
-                                e.__class__.__name__,
-                            ))
-                    scheduler.finished()
+                    self.crawl_spider(scheduler, menu)
+                scheduler.finished()
 
 
 if __name__ == '__main__':
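
The last hunk removes the retry-forever loop around crawl_spider and dedents scheduler.finished(), so a menu is marked finished even when crawl_start is False, and any exception escaping crawl_spider now propagates instead of being retried. A simplified outline of the resulting flow, with run_schedulers and pairs as hypothetical names:

def run_schedulers(pairs, crawl_spider):
    """pairs yields (scheduler, menu); crawl_spider runs once per menu with no
    retry loop, and finished() is called even when crawl_start is False."""
    for scheduler, menu in pairs:
        scheduler.crawl_type = 'list'
        if scheduler.crawl_start:
            crawl_spider(scheduler, menu)
        scheduler.finished()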