# -*- coding: utf-8 -*-
"""
Created on 2017-01-09 10:38
---------
@summary: assemble parser, parser_control and collector
---------
@author: Boris
@email: boris_liu@foxmail.com
"""

import threading
import time
from collections.abc import Iterable

import feapder.setting as setting
import feapder.utils.tools as tools
from feapder.buffer.heartbeat_buffer import HeartBeatBuffer
from feapder.buffer.item_buffer import ItemBuffer
from feapder.buffer.request_buffer import RequestBuffer
from feapder.core.base_parser import BaseParser
from feapder.core.collector import Collector
from feapder.core.handle_failed_items import HandleFailedItems
from feapder.core.handle_failed_requests import HandleFailedRequests
from feapder.core.parser_control import PaserControl
from feapder.db.rabbitMq import RabbitMQ
from feapder.network.item import Item
from feapder.network.request import Request
from feapder.utils import metrics
from feapder.utils.log import log


class Scheduler(threading.Thread):
    __custom_setting__ = {}

    def __init__(
        self,
        redis_key=None,
        user=None,
        thread_count=None,
        begin_callback=None,
        end_callback=None,
        keep_alive=None,
        auto_start_requests=None,
        **kwargs
    ):
        """
        @summary: scheduler
        ---------
        @param redis_key: namespace under which the spider's requests and items are stored
        @param user: consumer identity used by the message queue
        @param thread_count: number of threads, defaults to the value in the setting file
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider ends
        @param keep_alive: whether the spider keeps running when idle, defaults to False
        @param auto_start_requests: whether the spider issues its start requests automatically
        ---------
        @result:
        """
        super(Scheduler, self).__init__()

        for key, value in self.__class__.__custom_setting__.items():
            if key == "AUTO_STOP_WHEN_SPIDER_DONE":  # compatibility with the old setting name
                setattr(setting, "KEEP_ALIVE", not value)
            else:
                setattr(setting, key, value)

        self._redis_key = redis_key or setting.REDIS_KEY
        if not self._redis_key:
            raise Exception(
                """
                redis_key is the namespace where requests and items are stored and must not be empty.
                Configure it in setting, e.g. REDIS_KEY = 'test',
                or pass it when creating the spider, e.g. TestSpider(redis_key='test')
                """
            )
        self._rabbitmq = RabbitMQ()

        # use the resolved key so the setting fallback also applies to the components below
        self._request_buffer = RequestBuffer(self._redis_key, user=user)
        self._item_buffer = ItemBuffer(self._redis_key, user=user)
        self._collector = Collector(self._redis_key, user=user)
        self._heartbeat_buffer = HeartBeatBuffer(self._redis_key)
        self._parsers = []
        self._parser_controls = []
        self._parser_control_obj = PaserControl

        # compatibility with the old parameter name
        if "auto_stop_when_spider_done" in kwargs:
            self._keep_alive = not kwargs.get("auto_stop_when_spider_done")
        else:
            self._keep_alive = (
                keep_alive if keep_alive is not None else setting.KEEP_ALIVE
            )
        self._auto_start_requests = (
            auto_start_requests
            if auto_start_requests is not None
            else setting.SPIDER_AUTO_START_REQUESTS
        )

        self._begin_callback = (
            begin_callback
            if begin_callback
            else lambda: log.info("\n********** feapder begin **********")
        )
        self._end_callback = (
            end_callback
            if end_callback
            else lambda: log.info("\n********** feapder end **********")
        )

        self._thread_count = (
            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
        )

        self._spider_id = tools.get_uuid(self._redis_key, tools.get_current_date())
        self._spider_name = self._redis_key

        self._is_notify_end = False  # whether the spider end has already been notified

        # Request cache settings
        Request.cached_redis_key = self._redis_key
        Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME

        self._last_check_task_status_time = 0
        self._user = user

        self.init_metrics()

    def init_metrics(self):
        """
        initialize the metrics system
        """
        metrics.init(**setting.METRICS_OTHER_ARGS)

    def add_parser(self, parser):
        parser = parser()  # instantiate the parser
        if isinstance(parser, BaseParser):
            self._parsers.append(parser)
        else:
            raise ValueError("Type error: the spider must inherit from feapder.BaseParser")

    def _start(self):
        self.spider_begin()  # start the spider -- start_callback
        # re-push previously failed items into the database
        if setting.RETRY_FAILED_ITEMS:
            handle_failed_items = HandleFailedItems(
                redis_key=self._redis_key,
                item_buffer=self._item_buffer,
                rabbitmq=self._rabbitmq,
                user=self._user,
            )
            handle_failed_items.reput_failed_items_to_db()

        self._heartbeat_buffer.start()  # heartbeat manager

        # STEP 3.1 start request_buffer -- task manager, buffers requests before they are written to the database
        self._request_buffer.start()
        # STEP 3.2 start item_buffer -- pipeline manager, buffers scraped data before it is written to the database
        self._item_buffer.start()
        # STEP 3.3 start collector -- task dispatcher, hands tasks out to the workers
        self._collector.start()

        # start the parser control thread pool
        for i in range(self._thread_count):
            # STEP 3.4 create a worker thread
            parser_control = self._parser_control_obj(
                self._collector,
                self._redis_key,
                self._request_buffer,
                self._item_buffer,
                self._heartbeat_buffer,
            )
            for parser in self._parsers:
                # STEP 3.5 register every parser with the worker
                parser_control.add_parser(parser)
            parser_control.start()  # STEP 3.6 start the worker thread
            self._parser_controls.append(parser_control)

        # STEP 3.7 dispatch tasks: start reading tasks once the consumer threads exist
        if setting.RETRY_FAILED_REQUESTS:
            # re-queue previously failed requests
            handle_failed_requests = HandleFailedRequests(
                redis_key=self._redis_key,
                rabbitmq=self._rabbitmq,
                user=self._user,
            )
            handle_failed_requests.reput_failed_requests_to_requests()

        # STEP 3.8 issue new tasks (produce the start requests)
        if self._auto_start_requests:
            self.__add_task()

    def run(self):
        self._start()

        while True:
            try:
                if self.all_thread_is_done():
                    if not self._is_notify_end:
                        self.spider_end()  # the spider has finished
                        self._is_notify_end = True

                    if not self._keep_alive:
                        # not a resident spider: shut down all threads
                        self._stop_all_thread()
                        break
                else:
                    self._is_notify_end = False

                self.check_task_status()
            except (Exception, BaseException) as e:
                log.exception(e)

            tools.delay_time(1)

    def __add_task(self):
        # check whether the task queue still holds tasks; if so resume them, otherwise produce new ones
        todo_task_count = self._collector.get_requests_count()
        if todo_task_count:
            log.info(
                "Found %s pending tasks; no new tasks will be issued, resuming from where the last run stopped"
                % todo_task_count
            )
        else:
            for parser in self._parsers:
                results = parser.start_requests()
                # requests are put onto the request queue, which writes them to the database in batches
                if results and not isinstance(results, Iterable):
                    raise Exception(
                        "The return value of %s.%s must be iterable"
                        % (parser.name, "start_requests")
                    )

                result_type = 1
                for result in results or []:
                    # dispatch each yielded result by type
                    if isinstance(result, Request):
                        # a Request goes onto the task queue
                        result.parser_name = result.parser_name or parser.name
                        self._request_buffer.put_request(result)
                        result_type = 1
                    elif isinstance(result, Item):
                        # an Item goes onto the pipeline queue and waits to be stored
                        self._item_buffer.put_item(result)
                        result_type = 2
                    elif callable(result):
                        # a callable, e.g. a function that updates the database;
                        # route it to the queue matching the previous result type
                        if result_type == 1:
                            self._request_buffer.put_request(result)
                        else:
                            self._item_buffer.put_item(result)
                    else:
                        raise TypeError(
                            "start_requests yielded a result of the wrong type, expected Request, Item or a callable, but got: {}".format(
                                type(result)
                            )
                        )

                self._request_buffer.flush()
                self._item_buffer.flush()

    def all_thread_is_done(self):
        # check several times to reduce false positives: the stages do not run in lockstep,
        # so a state that looks idle now may turn busy a moment later
        for i in range(5):
            # STEP 5.1 check the collector
            if (
                self._collector.is_collector_task()
                or self._collector.get_requests_count() > 0
            ):
                return False

            # STEP 5.2 check the parser_controls
            for parser_control in self._parser_controls:
                if not parser_control.is_not_task():
                    return False

            # STEP 5.3 check the item_buffer
            if (
                self._item_buffer.get_items_count() > 0
                or self._item_buffer.is_adding_to_db()
            ):
                return False

            # STEP 5.4 check the request_buffer
            if (
                self._request_buffer.get_requests_count() > 0
                or self._request_buffer.is_adding_to_db()
            ):
                return False

            # check the heartbeat_buffer
            if (
                self._heartbeat_buffer.get_items_count() > 0
                or self._heartbeat_buffer.is_adding_to_db()
            ):
                return False

            tools.delay_time(1)  # sleep for one second

        return True
    @tools.run_safe_model("check_task_status")
    def check_task_status(self):
        """
        check the task status and raise warnings
        """
        # check at most once per minute
        now_time = time.time()
        if now_time - self._last_check_task_status_time > 60:
            self._last_check_task_status_time = now_time
        else:
            return

        # warn when the number of failed tasks exceeds setting.WARNING_FAILED_COUNT
        failed_count = self._request_buffer.get_failed_requests_count()
        if failed_count > setting.WARNING_FAILED_COUNT:
            # send a warning
            msg = "Spider <%s> currently has %s failed tasks, please check whether it is working properly" % (
                self._spider_name,
                failed_count,
            )
            log.error(msg)
            self.send_msg(
                msg,
                level="error",
                message_prefix="Spider <%s> failed task count warning" % (self._spider_name),
            )

        # parser_control keeps live counts of finished and failed tasks; warn when the success rate drops below 0.5
        failed_task_count, success_task_count = PaserControl.get_task_status_count()
        total_count = success_task_count + failed_task_count
        if total_count > 0:
            task_success_rate = success_task_count / total_count
            if task_success_rate < 0.5:
                # send a warning
                msg = (
                    "Spider <%s> succeeded %s tasks, failed %s tasks, success rate %.2f, please check whether it is working properly"
                    % (
                        self._spider_name,
                        success_task_count,
                        failed_task_count,
                        task_success_rate,
                    )
                )
                log.error(msg)
                self.send_msg(
                    msg,
                    level="error",
                    message_prefix="Spider <%s> task success rate warning" % (self._spider_name),
                )

        # check how many times exporting data has failed
        if self._item_buffer.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
            msg = "Spider <{}> failed to export data {} times, please check whether it is working properly".format(
                self._spider_name, self._item_buffer.export_falied_times
            )
            log.error(msg)
            self.send_msg(
                msg,
                level="error",
                message_prefix="Spider <%s> data export failure" % (self._spider_name),
            )

    def _stop_all_thread(self):
        # stop the request buffer
        self._request_buffer.stop()
        # stop the item pipeline buffer
        self._item_buffer.stop()
        # stop the task dispatcher
        self._collector.stop()

        # stop the parser_controls
        for parser_control in self._parser_controls:
            parser_control.stop()

        # stop the heartbeat buffer
        self._heartbeat_buffer.stop()

        # mark the thread as stopped so join() can return
        self._started.clear()

    def send_msg(self, msg, level="debug", message_prefix=""):
        # log.debug("send warning level:{} msg:{}".format(level, msg))
        tools.send_msg(msg=msg, level=level, message_prefix=message_prefix)

    def spider_begin(self):
        """
        @summary: when started via start_monitor_task, this function and spider_end run in
                  different processes, so variables cannot be shared between them
        ---------
        ---------
        @result:
        """
        if self._begin_callback:
            self._begin_callback()

        for parser in self._parsers:
            parser.start_callback()  # per-parser start callback

    def spider_end(self):
        if self._end_callback:  # spider end callback
            self._end_callback()

        for parser in self._parsers:
            if not self._keep_alive:
                parser.close()  # parser-defined close
            parser.end_callback()  # per-parser end callback

        if not self._keep_alive:
            # close the webdriver
            if Request.render_downloader:
                Request.render_downloader.close_all()
            metrics.close()  # shut down the metrics system
        else:
            metrics.flush()

        if self._keep_alive:
            log.info("The spider does not stop automatically, waiting for the next round of tasks...")
        else:
            log.info("Spider <%s> finished" % (self._spider_name,))

    def join(self, timeout=None):
        """
        override Thread.join
        """
        if not self._started.is_set():
            return

        super().join()
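
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the library API).
# It shows how a Scheduler is typically wired together: a parser class (not an
# instance) is registered via add_parser(), and start() launches the worker
# threads. `DemoParser`, the redis_key "demo_spider" and the seed URL are
# placeholder assumptions; running it also requires the message queue and
# databases configured in feapder's setting module to be reachable.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class DemoParser(BaseParser):
        def start_requests(self):
            # seed request; Scheduler.__add_task() pushes it onto the request queue
            yield Request("https://www.example.com")

        def parse(self, request, response):
            # default callback for the seed request
            log.info(response.text[:100])

    scheduler = Scheduler(redis_key="demo_spider", thread_count=1, keep_alive=False)
    scheduler.add_parser(DemoParser)  # pass the class; Scheduler instantiates it
    scheduler.start()
    scheduler.join()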