@@ -72,16 +72,15 @@ class ListSpider:
                     proxy.switch()
                     proxies = proxy.proxies
                     retries += 1
-                else:
-                    login_cookies = load_login_cookies(self.user.phone)
-                    request_params.update({'cookies': login_cookies})
+                login_cookies = load_login_cookies(self.user.phone)
+                request_params.update({'cookies': login_cookies})
             elif element.xpath('//*[@id="pages"]') and len(element.xpath(feature)) > 0:
                 return response
             else:
                 '''page with no search results'''
                 return None
-        raise VoidCrawlError(code=100020, reason='list page crawl error')
+        raise VoidCrawlError(code=100020, reason='list page request failed')

     def crawl_response(self, response, menu: CrawlMenu):
         results = []
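Note on the hunk above: with the `else:` removed, login cookies are refreshed on the same pass that switches the proxy, instead of only when the block check fails. A minimal, self-contained sketch of that retry shape — `ProxyPool`, `fetch`, and the `blocked`/`has_pagination` flags are hypothetical stand-ins for objects the diff only references:

class VoidCrawlError(Exception):
    """Stand-in for the project's VoidCrawlError."""
    def __init__(self, code, reason):
        super().__init__('{}: {}'.format(code, reason))

class ProxyPool:
    """Hypothetical stand-in for the diff's `proxy` object."""
    def __init__(self, addrs):
        self._addrs, self._i = list(addrs), 0
    @property
    def proxies(self):
        addr = self._addrs[self._i]
        return {'http': addr, 'https': addr}
    def switch(self):
        self._i = (self._i + 1) % len(self._addrs)

def load_login_cookies(phone):
    # stand-in: the real helper loads stored cookies for this account
    return {'sid': 'session-for-' + phone}

def crawl_request(fetch, url, proxy, phone, max_retries=3):
    request_params = {}
    retries = 0
    while retries < max_retries:
        page = fetch(url, proxies=proxy.proxies, **request_params)
        if page.blocked:
            proxy.switch()
            retries += 1
            # post-hunk behaviour: cookies are refreshed on every blocked
            # attempt, not only in the removed `else:` branch
            request_params.update({'cookies': load_login_cookies(phone)})
        elif page.has_pagination:  # the diff checks '//*[@id="pages"]'
            return page
        else:
            return None  # page with no search results
    raise VoidCrawlError(code=100020, reason='list page request failed')

if __name__ == '__main__':
    from types import SimpleNamespace
    calls = {'n': 0}
    def fetch(url, **kwargs):
        calls['n'] += 1
        # first two attempts look blocked, the third succeeds
        return SimpleNamespace(blocked=calls['n'] < 3, has_pagination=True)
    pool = ProxyPool(['http://127.0.0.1:8001', 'http://127.0.0.1:8002'])
    print(crawl_request(fetch, 'http://example.com/list', pool, '13800000000'))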
@@ -153,12 +152,14 @@ class ListSpider:
             )
             refer = previous_url
             previous_url = url
+            print(">>> ", url)
             sc.crawl_url = url
             sc.spider_code = menu.spidercode
-            print(">>> ", url)
+            '''attach identity cookies'''
             if crawl_total >= 4:
                 '''from page 4 on, list data is only served to a logged-in regular account'''
                 cookies = load_login_cookies(self.user.phone)
+            '''data collection'''
             try:
                 response = self.crawl_request(url, refer, cookies=cookies)
                 if response is None:
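The `crawl_total >= 4` gate above exists because the site only serves list pages past the third one to a logged-in account. A compact sketch of that gate, assuming `load_login_cookies` returns a cookie dict (the harness names are hypothetical):

def cookies_for_page(page, phone, load_login_cookies):
    """From page 4 on, list data requires a logged-in regular account,
    so identity cookies are loaded only once that threshold is reached."""
    if page >= 4:
        return load_login_cookies(phone)
    return None  # pages 1-3 are served to anonymous requests

# example: only the later pages trigger a cookie load
fake_loader = lambda phone: {'sid': 'session-for-' + phone}
assert cookies_for_page(2, '13800000000', fake_loader) is None
assert cookies_for_page(4, '13800000000', fake_loader) == {'sid': 'session-for-13800000000'}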
@@ -171,10 +172,16 @@ class ListSpider:
                         break
                     else:
                         crawl_total += 1
-            except JyBasicException as e:
+            except (JyBasicException, Exception) as e:
+                logger.error('[crawl failed]{}-{}-page {}, error type: {}'.format(
+                    menu.channel,
+                    region_name,
+                    page,
+                    e.__class__.__name__,
+                ))
                 sc.err_record(e)
-                logger.info(f'[crawl failed]{menu.channel}-{region_name}-page {page}-0 items')
-            sc.wait_for_next_task(random.choice(range(2, 8)))
+            finally:
+                sc.wait_for_next_task(random.choice(range(2, 6)))
         self.session.close()

     def start(self):
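The rewritten error path above widens the catch, logs the exception class, and moves the throttling wait into `finally` so it runs whether the page succeeded or failed. A standalone sketch of that pattern, with the worker and its failure simulated:

import random
import time

def process(page):
    """Hypothetical worker standing in for self.crawl_request(...)."""
    if page % 3 == 0:
        raise ValueError('simulated failure')
    return '{} rows'.format(page)

def crawl_pages(pages):
    for page in pages:
        try:
            print('page', page, '->', process(page))
        except Exception as e:  # the diff widens JyBasicException to this
            print('[crawl failed] page {}, error type: {}'.format(
                page, e.__class__.__name__))
        finally:
            # always pause 2-5 seconds before the next task, mirroring
            # sc.wait_for_next_task(random.choice(range(2, 6)))
            time.sleep(random.choice(range(2, 6)))

if __name__ == '__main__':
    crawl_pages(range(1, 7))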
@@ -183,16 +190,8 @@ class ListSpider:
             scheduler.crawl_type = 'list'
             if scheduler.crawl_start:
                 self.user = scheduler.user
-                while True:
-                    try:
-                        self.crawl_spider(scheduler, menu)
-                        break
-                    except Exception as e:
-                        logger.error('crawl channel name: {} error type: {}'.format(
-                            menu.channel,
-                            e.__class__.__name__,
-                        ))
-                    scheduler.finished()
+                self.crawl_spider(scheduler, menu)
+                scheduler.finished()


 if __name__ == '__main__':