- # -*- coding: utf-8 -*-
- """
- Created on 2017-01-09 10:38
- ---------
- @summary: assembles the parser, parser_control and collector
- ---------
- @author: Boris
- @email: boris_liu@foxmail.com
- """
- import json
- import sys
- import threading
- import time
- from collections.abc import Iterable  # collections.Iterable was removed in Python 3.10
- import feapder.setting as setting
- import feapder.utils.tools as tools
- from feapder.buffer.item_buffer import ItemBuffer
- from feapder.buffer.request_buffer import RequestBuffer
- from feapder.core.base_parser import BaseParser
- from feapder.core.collector import Collector
- from feapder.core.handle_failed_items import HandleFailedItems
- from feapder.core.handle_failed_requests import HandleFailedRequests
- from feapder.core.parser_control import PaserControl
- from feapder.db.redisdb import RedisDB
- from feapder.network.item import Item
- from feapder.network.request import Request
- from feapder.utils import metrics
- from feapder.utils.log import log
- from feapder.utils.redis_lock import RedisLock
- SPIDER_UUID = tools.get_uuid()
- SPIDER_START_TIME = "spider_start_time"
- SPIDER_START_TIME_KEY = SPIDER_START_TIME + "#" + SPIDER_UUID
- SPIDER_END_TIME_KEY = "spider_end_time"
- SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY = "last_task_count_record_time"
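- # Key layout sketch (assumption drawn from the constants above): each scheduler process owns a
- # start-time field such as "spider_start_time#6f2c..." inside the spider-time hash, so several
- # processes sharing one redis_key do not overwrite each other's start time.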
- class Obj(object):
- def __init__(self, dict_):
- self.__dict__.update(dict_)
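- # Illustrative use of Obj (see get_argvs below): it exposes dict keys as attributes, e.g.
- #   Obj({"next_page": False, "max_page": 10}).max_page  ->  10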
- class Scheduler(threading.Thread):
- __custom_setting__ = {}
- def __init__(
- self,
- redis_key=None,
- thread_count=None,
- begin_callback=None,
- end_callback=None,
- delete_keys=(),
- keep_alive=None,
- auto_start_requests=None,
- batch_interval=0,
- wait_lock=True,
- task_table=None,
- **kwargs
- ):
- """
- @summary: Scheduler
- ---------
- @param redis_key: redis namespace (folder) where the spider's requests and items are stored
- @param thread_count: number of threads; defaults to the value in the settings file
- @param begin_callback: callback invoked when the spider starts
- @param end_callback: callback invoked when the spider ends
- @param delete_keys: keys to delete on startup; tuple / bool / string, regex patterns supported
- @param keep_alive: whether the spider keeps running as a resident process; defaults to False
- @param auto_start_requests: whether the spider automatically seeds its start requests
- @param batch_interval: crawl interval in days, default 0; on repeated launches the spider only starts when the time since the previous batch finished exceeds this interval
- @param wait_lock: whether to wait for the lock when dispatching tasks; without the lock several processes may dispatch duplicate tasks, so set this to True in distributed deployments
- @param task_table: task table, passed in by batch spiders
- ---------
- @result:
- """
- super(Scheduler, self).__init__()
- for key, value in self.__class__.__custom_setting__.items():
- if key == "AUTO_STOP_WHEN_SPIDER_DONE": # backward compatibility with old-style settings
- setattr(setting, "KEEP_ALIVE", not value)
- else:
- setattr(setting, key, value)
-
- # History spiders: derive a dedicated redis_key from the --purpose argument
- for item in sys.argv[1:]:
- if item.startswith("--purpose"):
- val = item.split('=')[-1]
- if not redis_key.endswith(val):
- # A history spider needs its own redis_key; otherwise it would share one with the
- # incremental spider, which could then wrongly resume from the history run's checkpoint
- redis_key += f'_{val}'
-
- self._redis_key = redis_key or setting.REDIS_KEY
- if not self._redis_key:
- raise Exception(
- """
- redis_key is the redis directory where requests and items are stored and must not be empty.
- Configure it in setting, e.g. REDIS_KEY = 'test',
- or pass it when initialising the spider, e.g. TestSpider(redis_key='test')
- """
- )
- self._request_buffer = RequestBuffer(redis_key)
- self._item_buffer = ItemBuffer(redis_key, task_table)
- self._collector = Collector(redis_key)
- self._parsers = []
- self._parser_controls = []
- self._parser_control_obj = PaserControl
- # backward compatibility with the old parameter name
- if "auto_stop_when_spider_done" in kwargs:
- self._keep_alive = not kwargs.get("auto_stop_when_spider_done")
- else:
- self._keep_alive = (
- keep_alive if keep_alive is not None else setting.KEEP_ALIVE
- )
- self._auto_start_requests = (
- auto_start_requests
- if auto_start_requests is not None
- else setting.SPIDER_AUTO_START_REQUESTS
- )
- self._batch_interval = batch_interval
- self._begin_callback = (
- begin_callback
- if begin_callback
- else lambda: log.info("\n********** feapder begin **********")
- )
- self._end_callback = (
- end_callback
- if end_callback
- else lambda: log.info("\n********** feapder end **********")
- )
- self._thread_count = (
- setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
- )
- self._spider_name = redis_key
- self._project_name = redis_key.split(":")[0]
- self._task_table = task_table
- self._tab_spider_time = setting.TAB_SPIDER_TIME.format(redis_key=redis_key)
- self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
- self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
- self._tab_failed_requests = setting.TAB_FAILED_REQUESTS.format(
- redis_key=redis_key
- )
- self._is_notify_end = False # whether the end notification has been sent
- self._last_task_count = 0 # task count at the last check
- self._redisdb = RedisDB()
- self._project_total_state_table = "{}_total_state".format(self._project_name)
- self._is_exist_project_total_state_table = False
- # Request response-cache settings
- Request.cached_redis_key = redis_key
- Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME
- delete_keys = delete_keys or setting.DELETE_KEYS
- if delete_keys:
- self.delete_tables(delete_keys)
- self._last_check_task_status_time = 0
- self.wait_lock = wait_lock
- self.init_metrics()
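- # Construction sketch (illustrative values, not from the source): a spider subclass typically
- # builds the scheduler roughly like
- #   Scheduler(redis_key="test:spider", thread_count=10, keep_alive=False, batch_interval=0)
- # and every argument left out falls back to the corresponding value in feapder.setting.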
- def init_metrics(self):
- """
- Initialise the metrics (monitoring) system
- """
- metrics.init(**setting.METRICS_OTHER_ARGS)
- def add_parser(self, parser):
- parser = parser() # instantiate the parser
- if isinstance(parser, BaseParser):
- self._parsers.append(parser)
- else:
- raise ValueError("Type error: a spider must inherit from feapder.BaseParser or feapder.BatchParser")
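- # Usage note (illustrative): add_parser expects the parser *class*, not an instance, because it
- # instantiates it itself, e.g.
- #   scheduler.add_parser(MyParser)   # MyParser is a hypothetical BaseParser subclass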
- def run(self): # STEP 1: framework entry point
- if not self.is_reach_next_spider_time(): # STEP 2: check whether it is time to run again
- return
- self._start() # STEP 3: start the spider
- while True: # STEP 4: monitor the spider's state
- try:
- if self.all_thread_is_done(): # STEP 5: check whether every component has finished
- if not self._is_notify_end:
- self.spider_end() # one round completed
- self._is_notify_end = True
- if not self._keep_alive: # STEP 7: stop all threads unless the spider is resident
- self._stop_all_thread()
- break
- else:
- self._is_notify_end = False
- self.check_task_status() # STEP 8: check task status and send alerts
- except Exception as e:
- log.exception(e)
- tools.delay_time(1) # check the spider's state once per second
- def __add_task(self):
- # run each parser's start_requests
- self.spider_begin() # for spiders that do not stop automatically this runs only once
- # if the task pool still holds pending tasks, resume them; otherwise produce new ones
- todo_task_count = self._collector.get_requests_count()
- if todo_task_count:
- log.info("检查到有待做任务 %s 条,不重下发新任务,将接着上回异常终止处继续抓取" % todo_task_count)
- else:
- for parser in self._parsers:
- results = parser.start_requests()
- # push requests to the request queue, which writes them to the database in one place
- if results and not isinstance(results, Iterable):
- raise Exception("%s.%s must return an iterable" % (parser.name, "start_requests"))
- result_type = 1
- for result in results or []: # dispatch on the type of each yielded result
- if isinstance(result, Request): # Request: add to the task queue
- result.parser_name = result.parser_name or parser.name
- self._request_buffer.put_request(result)
- result_type = 1
- elif isinstance(result, Item): # Item: push to the item pipeline queue to be stored
- self._item_buffer.put_item(result)
- result_type = 2
- elif callable(result): # a callable is usually a deferred database-update function
- if result_type == 1:
- self._request_buffer.put_request(result)
- else:
- self._item_buffer.put_item(result)
- else:
- raise TypeError(
- "start_requests yield result type error, expect Request、Item、callback func, bug get type: {}".format(
- type(result)
- )
- )
- self._request_buffer.flush()
- self._item_buffer.flush()
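- # Illustrative start_requests covering the three result types dispatched above
- # (MyItem and update_db are hypothetical names):
- #   def start_requests(self):
- #       yield feapder.Request("https://example.com", callback=self.parse)  # -> request queue
- #       yield MyItem(title="demo")                                         # -> item queue
- #       yield lambda: update_db()   # callable, queued behind the preceding request/item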
- def _start(self):
- # re-store items that previously failed to be written to the database
- if setting.RETRY_FAILED_ITEMS:
- handle_failed_items = HandleFailedItems(
- redis_key=self._redis_key,
- task_table=self._task_table,
- item_buffer=self._item_buffer,
- )
- handle_failed_items.reput_failed_items_to_db()
- # STEP 3.1 start request_buffer -- buffers requests before they are written to the database
- self._request_buffer.start()
- # STEP 3.2 start item_buffer -- pipeline manager that buffers items and writes them centrally, so multiple threads never hit the database at once
- self._item_buffer.start()
- # STEP 3.3 start collector -- task manager that shares tasks evenly among the nodes
- self._collector.start()
- # start the parser controls
- for i in range(self._thread_count):
- # STEP 3.4 create a parser-control worker from the collector, redis_key, request buffer and item buffer
- parser_control = self._parser_control_obj(
- self._collector,
- self._redis_key,
- self._request_buffer,
- self._item_buffer,
- )
- for parser in self._parsers: # STEP 3.5 register every parser with this parser control
- parser_control.add_parser(parser)
- parser_control.start() # STEP 3.6 start the parser-control thread
- self._parser_controls.append(parser_control)
- # STEP 3.7 dispatch tasks; now that consumer threads exist, start feeding them
- if setting.RETRY_FAILED_REQUESTS:
- # requeue previously failed requests; atomic, so no lock is needed
- handle_failed_requests = HandleFailedRequests(self._redis_key)
- handle_failed_requests.reput_failed_requests_to_requests()
- # STEP 3.8 dispatch new tasks (produce fresh tasks)
- if self._auto_start_requests: # automatic dispatch
- if self.wait_lock:
- # NOTE: lock around task seeding so multiple processes do not add duplicate tasks
- with RedisLock(key=self._spider_name) as lock:
- if lock.locked:
- self.__add_task()
- else:
- self.__add_task()
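- # Start-up order as implemented above: request_buffer -> item_buffer -> collector ->
- # thread_count parser_control threads -> requeue failed requests -> seed new tasks
- # (under a RedisLock when wait_lock is True, so only one process seeds them).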
- def all_thread_is_done(self):
- # NOTE: check three times to reduce flukes; the stages are not checked atomically, so a state that looks idle now may be busy on the next check
- for i in range(3):
- # STEP 5.1 check the collector
- if (
- self._collector.is_collector_task()
- or self._collector.get_requests_count() > 0
- ):
- return False
- # STEP 5.2 check each parser_control
- for parser_control in self._parser_controls:
- if not parser_control.is_not_task():
- return False
- # STEP 5.3 check the item_buffer
- if (
- self._item_buffer.get_items_count() > 0
- or self._item_buffer.is_adding_to_db()
- ):
- return False
- # STEP 5.4 check the request_buffer
- if (
- self._request_buffer.get_requests_count() > 0
- or self._request_buffer.is_adding_to_db()
- ):
- return False
- tools.delay_time(1) # sleep for one second
- return True
- @tools.run_safe_model("check_task_status")
- def check_task_status(self):
- """
- Check task status and send alerts
- """
- # run at most once per minute
- now_time = time.time()
- if now_time - self._last_check_task_status_time > 60:
- self._last_check_task_status_time = now_time
- else:
- return
- # check the failed-task count and alert when it exceeds setting.WARNING_FAILED_COUNT
- failed_count = self._redisdb.zget_count(self._tab_failed_requests)
- print('<<<<<<<<<<<<<<<<<<<<<<<<<<<< failed count:', failed_count)
- if failed_count > setting.WARNING_FAILED_COUNT:
- # send alert
- msg = "Spider %s currently has %s failed tasks, please check whether it is running normally" % (self._spider_name, failed_count)
- log.error(msg)
- self.send_msg(
- msg,
- level="error",
- message_prefix="《%s》爬虫当前失败任务数报警" % (self._spider_name),
- )
- # parser_control tracks completed and failed task counts in real time; alert if the success rate drops below 0.5
- failed_task_count, success_task_count = PaserControl.get_task_status_count()
- total_count = success_task_count + failed_task_count
- if total_count > 0:
- task_success_rate = success_task_count / total_count
- if task_success_rate < 0.5:
- # send alert
- msg = "Spider %s: %s tasks succeeded, %s failed, success rate %.2f, please check whether it is running normally" % (
- self._spider_name,
- success_task_count,
- failed_task_count,
- task_success_rate,
- )
- log.error(msg)
- self.send_msg(
- msg,
- level="error",
- message_prefix="《%s》爬虫当前任务成功率报警" % (self._spider_name),
- )
- # check whether the task count has changed
- # if it has stayed the same for 20 minutes straight (the parser may be stuck), send an alert
- task_count = self._redisdb.zget_count(self._tab_requests)
- if task_count:
- if task_count != self._last_task_count:
- self._last_task_count = task_count
- self._redisdb.hset(
- self._tab_spider_time,
- SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
- tools.get_current_timestamp(),
- ) # multiple processes would send duplicate alerts, so the time of the last count change is kept in redis
- else:
- # check whether the gap exceeds 20 minutes
- lua = """
- -- local key = KEYS[1]
- local field = ARGV[1]
- local current_timestamp = ARGV[2]
- -- read the stored timestamp
- local last_timestamp = redis.call('hget', KEYS[1], field)
- if last_timestamp and current_timestamp - last_timestamp >= 1200 then
- -- return how long the task count has been stalled, in seconds
- return current_timestamp - last_timestamp
- end
- if not last_timestamp then
- redis.call('hset', KEYS[1], field, current_timestamp)
- end
- return 0
- """
- redis_obj = self._redisdb.get_redis_obj()
- cmd = redis_obj.register_script(lua)
- overtime = cmd(
- keys=[self._tab_spider_time],
- args=[
- SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
- tools.get_current_timestamp(),
- ],
- )
- if overtime:
- # log the event and send an alert
- msg = "Spider {} has been stalled for {}, please check whether it is running normally".format(
- self._spider_name, tools.format_seconds(overtime)
- )
- log.error(msg) # TODO: adding a print here would also surface the message in the platform's log panel
- self.send_msg(
- msg,
- level="error",
- message_prefix="《{}》爬虫任务停滞".format(self._spider_name),
- )
- else:
- self._last_task_count = 0
- # check how many export (database write) attempts have failed
- if self._item_buffer.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
- msg = "Spider {} failed to export data, failure count: {}, please check whether it is running normally".format(
- self._spider_name, self._item_buffer.export_falied_times
- )
- log.error(msg)
- self.send_msg(
- msg, level="error", message_prefix="《%s》爬虫导出数据失败" % (self._spider_name)
- )
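- # Thresholds used above: WARNING_FAILED_COUNT and EXPORT_DATA_MAX_FAILED_TIMES come from
- # feapder.setting; the 0.5 success-rate floor, the 60 s check interval and the 1200 s (20 min)
- # stall window are hard-coded in this method.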
- def delete_tables(self, delete_tables_list):
- if isinstance(delete_tables_list, bool):
- delete_tables_list = [self._redis_key + "*"]
- elif not isinstance(delete_tables_list, (list, tuple)):
- delete_tables_list = [delete_tables_list]
- redis = RedisDB()
- for delete_tab in delete_tables_list:
- if not delete_tab.startswith(self._redis_key):
- delete_tab = self._redis_key + delete_tab
- tables = redis.getkeys(delete_tab)
- for table in tables:
- if table != self._tab_spider_time:
- log.info("正在删除key %s" % table)
- redis.clear(table)
- else:
- keys = redis.hgetall(table)
- for key in keys:
- if key.startswith(SPIDER_START_TIME):
- redis.hdel(table, key)
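- # delete_keys examples (illustrative; redis_key assumed to be "test:spider"):
- #   delete_keys=True           -> clears every key matching "test:spider*"
- #   delete_keys="*z_requests"  -> pattern becomes "test:spider*z_requests"; matching keys are cleared
- # The spider-time table itself is spared; only its spider_start_time# fields are removed.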
- def _stop_all_thread(self):
- self._request_buffer.stop()
- self._item_buffer.stop()
- # stop the collector
- self._collector.stop()
- # stop the parser_controls
- for parser_control in self._parser_controls:
- parser_control.stop()
- self._started.clear()
- def send_msg(self, msg, level="debug", message_prefix=""):
- # TODO: this method is the alert hook, but sending on every call would flood the channel, so alerting is not enabled in the framework itself.
- # A possible improvement: push alert contents to an API that stores them and grades their urgency; highly urgent messages could go straight to a WeChat group. Avoid storing directly here; the feapder
- # framework does not write to mongo directly, it only performs queries.
- # log.debug("sending alert level:{} msg:{}".format(level, msg))
- tools.send_msg(msg=msg, level=level, message_prefix=message_prefix)
- def get_argvs(self):
- argvs = {"next_page": False, "max_page": 10}
- for item in sys.argv[1:]:
- # print(item)
- if item.startswith("--"):
- key = item.replace("--", "").split('=')[0]
- val = item.split('=')[-1]
- if key != 'purpose':
- argvs[key] = eval(val) # eval is used here to convert the string value to bool or int
- return json.loads(json.dumps(argvs), object_hook=Obj)
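- # Illustrative invocation of get_argvs: running
- #   python spider.py --purpose=history --next_page=True --max_page=5
- # yields an object with params.next_page == True and params.max_page == 5
- # (--purpose is skipped here, and eval() converts the raw strings to bool/int).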
- def spider_begin(self):
- """
- @summary: when launched via start_monitor_task this function runs in a different process from spider_end, so variables cannot be shared between them
- ---------
- ---------
- @result:
- """
- if self._begin_callback:
- self._begin_callback()
- for parser in self._parsers:
- parameter = self.get_argvs()
- parser.platform_next_page = parameter.next_page
- parser.platform_max_page = parameter.max_page
- parser.start_callback()
- # record the start time
- if not self._redisdb.hexists(self._tab_spider_time, SPIDER_START_TIME_KEY):
- current_timestamp = tools.get_current_timestamp()
- self._redisdb.hset(
- self._tab_spider_time, SPIDER_START_TIME_KEY, current_timestamp
- )
- # send notification
- # self.send_msg("Spider %s started" % self._spider_name)
- def spider_end(self): # STEP end: housekeeping when the spider finishes
- self.record_end_time()
- if self._end_callback: # built-in callback; it is replaced when a custom end_callback was supplied
- self._end_callback()
- for parser in self._parsers:
- if not self._keep_alive:
- parser.close() # spiders may define their own close()
- parser.end_callback() # call the end callback, which spiders may override
- if not self._keep_alive:
- # close the webdriver pool
- if Request.webdriver_pool:
- Request.webdriver_pool.close()
- # shut down metrics
- metrics.close()
- else:
- metrics.flush()
- # compute the total crawl duration
- data = self._redisdb.hget(
- self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True
- )
- if data:
- begin_timestamp = int(data)
- elapsed_time = tools.get_current_timestamp() - begin_timestamp
- msg = "《%s》爬虫结束,耗时 %s" % (
- self._spider_name,
- tools.format_seconds(elapsed_time),
- )
- log.info(msg)
- # self.send_msg(msg)
- if self._keep_alive:
- log.info("爬虫不自动结束,等待下一轮任务...")
- else:
- if self._collector.get_spider_count() <= 1:
- self.delete_tables(self._tab_spider_time)
- self.delete_tables(self._tab_spider_status)
- else:
- # only remove this node's heartbeat record; deleting the shared task tables here could leave other spiders hanging
- self._collector.delete_spider_node()
- def record_end_time(self):
- # record the end time
- if self._batch_interval:
- current_timestamp = tools.get_current_timestamp()
- self._redisdb.hset(
- self._tab_spider_time, SPIDER_END_TIME_KEY, current_timestamp
- )
- def is_reach_next_spider_time(self): # nothing to check when no batch interval is configured
- if not self._batch_interval:
- return True
- # compare the time the last batch finished with the current time; if the configured interval has not yet elapsed, the spider does not start this round
- last_spider_end_time = self._redisdb.hget(
- self._tab_spider_time, SPIDER_END_TIME_KEY
- )
- if last_spider_end_time:
- last_spider_end_time = int(last_spider_end_time)
- current_timestamp = tools.get_current_timestamp()
- time_interval = current_timestamp - last_spider_end_time
- if time_interval < self._batch_interval * 86400:
- log.info(
- "上次运行结束时间为 {} 与当前时间间隔 为 {}, 小于规定的抓取时间间隔 {}。爬虫不执行,退出~".format(
- tools.timestamp_to_date(last_spider_end_time),
- tools.format_seconds(time_interval),
- tools.format_seconds(self._batch_interval * 86400),
- )
- )
- return False
- return True
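- # Worked example of the check above (values assumed): with batch_interval=1 (day) the threshold
- # is 86400 s; if the previous batch ended 72000 s (20 h) ago the spider skips this round,
- # whereas after 90000 s (25 h) it would start again.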
- def join(self, timeout=None):
- """
- Override of threading.Thread.join
- """
- if not self._started.is_set():
- return
- super().join()