@@ -12,6 +12,7 @@ import sys
 import threading
 import time
 from collections import Iterable
+from types import SimpleNamespace

 import feapder.setting as setting
 import feapder.utils.tools as tools
@@ -22,23 +23,11 @@ from feapder.core.collector import Collector
 from feapder.core.handle_failed_items import HandleFailedItems
 from feapder.core.handle_failed_requests import HandleFailedRequests
 from feapder.core.parser_control import PaserControl
-from feapder.db.redisdb import RedisDB
+from feapder.db.rabbitMq import RabbitMQ
 from feapder.network.item import Item
 from feapder.network.request import Request
 from feapder.utils import metrics
 from feapder.utils.log import log
-from feapder.utils.redis_lock import RedisLock
-
-SPIDER_UUID = tools.get_uuid()
-SPIDER_START_TIME = "spider_start_time"
-SPIDER_START_TIME_KEY = SPIDER_START_TIME + "#" + SPIDER_UUID
-SPIDER_END_TIME_KEY = "spider_end_time"
-SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY = "last_task_count_record_time"
-
-
-class Obj(object):
-    def __init__(self, dict_):
-        self.__dict__.update(dict_)


 class Scheduler(threading.Thread):
@@ -50,12 +39,8 @@ class Scheduler(threading.Thread):
         thread_count=None,
         begin_callback=None,
         end_callback=None,
-        delete_keys=(),
         keep_alive=None,
         auto_start_requests=None,
-        batch_interval=0,
-        wait_lock=True,
-        task_table=None,
         **kwargs
     ):
         """
@@ -65,12 +50,8 @@ class Scheduler(threading.Thread):
         @param thread_count: number of threads; defaults to the thread count in the settings file
         @param begin_callback: callback invoked when the spider starts
         @param end_callback: callback invoked when the spider finishes
-        @param delete_keys: keys to delete when the spider starts; tuple/bool/string, regex supported
         @param keep_alive: whether the spider stays resident; defaults to False
         @param auto_start_requests: whether the spider automatically adds its start tasks
-        @param batch_interval: crawl interval in days, default 0; on repeated launches the spider only starts once the time since the previous crawl finished exceeds this interval
-        @param wait_lock: whether to wait for a lock when dispatching tasks; without the lock multiple processes may dispatch identical tasks, so set it to True in distributed deployments
-        @param task_table: task table, passed in by batch spiders
         ---------
         @result:
         """
@@ -82,16 +63,7 @@ class Scheduler(threading.Thread):
                 setattr(setting, "KEEP_ALIVE", not value)
             else:
                 setattr(setting, key, value)
-
-        # historical spider [redis_key]
-        for item in sys.argv[1:]:
-            if item.startswith("--purpose"):
-                val = item.split('=')[-1]
-                if not redis_key.endswith(val):
-                    # a historical spider needs its own redis_key; if it shared one with an
-                    # incremental spider, the incremental spider would resume from the wrong breakpoint
-                    redis_key += f'_{val}'
-
+
         self._redis_key = redis_key or setting.REDIS_KEY
         if not self._redis_key:
             raise Exception(
@@ -102,10 +74,12 @@ class Scheduler(threading.Thread):
                 """
             )

-        self._request_buffer = RequestBuffer(redis_key)
-        self._item_buffer = ItemBuffer(redis_key, task_table)
+        self._rabbitmq = RabbitMQ()
+
+        self._request_buffer = RequestBuffer(redis_key)
+        self._item_buffer = ItemBuffer(redis_key)
         self._collector = Collector(redis_key)
+
         self._parsers = []
         self._parser_controls = []
         self._parser_control_obj = PaserControl
@@ -114,16 +88,15 @@ class Scheduler(threading.Thread):
         if "auto_stop_when_spider_done" in kwargs:
             self._keep_alive = not kwargs.get("auto_stop_when_spider_done")
         else:
-
             self._keep_alive = (
                 keep_alive if keep_alive is not None else setting.KEEP_ALIVE
             )
+
         self._auto_start_requests = (
             auto_start_requests
             if auto_start_requests is not None
             else setting.SPIDER_AUTO_START_REQUESTS
         )
-        self._batch_interval = batch_interval

         self._begin_callback = (
             begin_callback
@@ -140,34 +113,20 @@ class Scheduler(threading.Thread):
             setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
         )

+        self._spider_id = tools.get_uuid(redis_key, tools.get_current_date())
         self._spider_name = redis_key
-        self._project_name = redis_key.split(":")[0]
-        self._task_table = task_table
-
-        self._tab_spider_time = setting.TAB_SPIDER_TIME.format(redis_key=redis_key)
-        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
-        self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
-        self._tab_failed_requests = setting.TAB_FAILED_REQUESTS.format(
-            redis_key=redis_key
-        )

-        self._is_notify_end = False  # whether the end notification has been sent
-        self._last_task_count = 0  # task count at the last check
-        self._redisdb = RedisDB()
+        # declare the spider heartbeat queue
+        self._tab_spider_heartbeat = setting.SPIDER_HEARTBEAT
+        self._rabbitmq.declare(queue=self._tab_spider_heartbeat)

-        self._project_total_state_table = "{}_total_state".format(self._project_name)
-        self._is_exist_project_total_state_table = False
+        self._is_notify_end = False  # whether the end notification has been sent

         # Request cache settings
         Request.cached_redis_key = redis_key
         Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME

-        delete_keys = delete_keys or setting.DELETE_KEYS
-        if delete_keys:
-            self.delete_tables(delete_keys)
-
         self._last_check_task_status_time = 0
-        self.wait_lock = wait_lock

         self.init_metrics()

@@ -184,36 +143,32 @@ class Scheduler(threading.Thread):
         else:
             raise ValueError("类型错误,爬虫需继承feapder.BaseParser或feapder.BatchParser")

-    def run(self):  # STEP 1 framework entry point
-        if not self.is_reach_next_spider_time():  # STEP 2 check whether it is time for the spider to run
-            return
-
-        self._start()  # STEP 3 start the spider
+    def run(self):
+        self._start()

-        while True:  # step 4 monitor the spider's status
+        while True:
+            self.__report_node_heartbeat('running')
             try:
-                if self.all_thread_is_done():  # Step 5 check whether the spider has finished
+                if self.all_thread_is_done():
                     if not self._is_notify_end:
-                        self.spider_end()  # one round finished
+                        self.spider_end()  # the spider run has finished
                         self._is_notify_end = True

-                    if not self._keep_alive:  # step 7 if it is not a resident spider, stop all threads
+                    if not self._keep_alive:  # if it is not a resident spider, shut down all threads
                         self._stop_all_thread()
                         break

                 else:
                     self._is_notify_end = False

-                self.check_task_status()  # step 8 check task status and send alerts
-
+                self.check_task_status()
             except Exception as e:
                 log.exception(e)

-            tools.delay_time(1)  # check the spider's status once per second
+            tools.delay_time(1)

     def __add_task(self):
-        # kick off each parser's start_requests
-        self.spider_begin()  # a spider that does not auto-stop runs this only once
+        self.spider_begin()  # start the spider's start_requests

         # if the task pool still has tasks, keep crawling them; otherwise produce new tasks
         todo_task_count = self._collector.get_requests_count()
@@ -257,21 +212,21 @@ class Scheduler(threading.Thread):
         if setting.RETRY_FAILED_ITEMS:
             handle_failed_items = HandleFailedItems(
                 redis_key=self._redis_key,
-                task_table=self._task_table,
                 item_buffer=self._item_buffer,
+                rabbitmq=self._rabbitmq,
             )
             handle_failed_items.reput_failed_items_to_db()

-        # STEP 3.1 start request_buffer -- the task manager that buffers requests to be written to the database
+        # STEP 3.1 start request_buffer -- the task manager that buffers requests added to the database
         self._request_buffer.start()
-        # STEP 3.2 start item_buffer -- the pipeline manager that buffers items written to the database; all inserts go through this manager so multiple threads do not hit the database at once
+        # STEP 3.2 start item_buffer -- the pipeline manager that buffers scraped data before writing it to the database
         self._item_buffer.start()
-        # STEP 3.3 start collector -- task management; distributes tasks evenly across the nodes
+        # STEP 3.3 start collector -- task management, dispatches tasks
         self._collector.start()

         # start the parser controls
         for i in range(self._thread_count):
-            # STEP 3.4 build a thread pool from the task manager, redis_key, downloader and data pipeline
+            # STEP 3.4 create the worker thread pool
             parser_control = self._parser_control_obj(
                 self._collector,
                 self._redis_key,
@@ -279,27 +234,24 @@ class Scheduler(threading.Thread):
                 self._item_buffer,
             )

-            for parser in self._parsers:  # step 3.5 put every task into the thread pool
+            for parser in self._parsers:  # step 3.5 add every pending task to the thread pool
                 parser_control.add_parser(parser)

-            parser_control.start()  # STEP 3.6 spawn a thread from the pool
+            parser_control.start()  # STEP 3.6 start the crawl thread
             self._parser_controls.append(parser_control)

         # STEP 3.7 dispatch tasks; start reading tasks once the consumer threads exist
         if setting.RETRY_FAILED_REQUESTS:
-            # requeue failed tasks; no lock needed, the operation is atomic
-            handle_failed_requests = HandleFailedRequests(self._redis_key)
+            # requeue failed tasks
+            handle_failed_requests = HandleFailedRequests(
+                redis_key=self._redis_key,
+                rabbitmq=self._rabbitmq
+            )
             handle_failed_requests.reput_failed_requests_to_requests()

         # STEP 3.8 dispatch new tasks (produce new tasks)
-        if self._auto_start_requests:  # auto dispatch
-            if self.wait_lock:
-                # Stress: lock around task dispatch so multiple processes do not add duplicate tasks
-                with RedisLock(key=self._spider_name) as lock:
-                    if lock.locked:
-                        self.__add_task()
-            else:
-                self.__add_task()
+        if self._auto_start_requests:
+            self.__add_task()

     def all_thread_is_done(self):
         # Stress: reduce flukes; the stages are not concurrent, so a state may be False at the moment of one check yet True at the next. A single check can easily hit such a coincidence
@@ -347,19 +299,19 @@ class Scheduler(threading.Thread):
             return

         # check the failed task count; alert when it exceeds 1000
-        failed_count = self._redisdb.zget_count(self._tab_failed_requests)
-        print('<<<<<<<<<<<<<<<<<<<<<<<<<<<< 失败次数:', failed_count)
+        failed_count = self._request_buffer.get_failed_requests_count()
+        log.debug(f'《{self._spider_name}》爬虫失败任务数量:{failed_count}')
         if failed_count > setting.WARNING_FAILED_COUNT:
             # send an alert
             msg = "《%s》爬虫当前失败任务 %s, 请检查爬虫是否正常" % (self._spider_name, failed_count)
             log.error(msg)
-            self.send_msg(
-                msg,
+            tools.send_msg(**dict(
+                msg=msg,
                 level="error",
                 message_prefix="《%s》爬虫当前失败任务数报警" % (self._spider_name),
-            )
+            ))

-        # parser_control keeps a running count of finished and failed tasks; alert when the success rate is <0.5
+        # parser_control keeps a running count of finished and failed tasks; alert when the success rate is < 0.5
         failed_task_count, success_task_count = PaserControl.get_task_status_count()
         total_count = success_task_count + failed_task_count
         if total_count > 0:
@@ -373,68 +325,11 @@ class Scheduler(threading.Thread):
                     task_success_rate,
                 )
                 log.error(msg)
-                self.send_msg(
-                    msg,
+                tools.send_msg(**dict(
+                    msg=msg,
                     level="error",
                     message_prefix="《%s》爬虫当前任务成功率报警" % (self._spider_name),
-                )
-
-        # check whether the task count has changed
-        # step: check the task status in redis; if the task count has not changed for 20 consecutive minutes (the parser may be stuck), send an alert
-        task_count = self._redisdb.zget_count(self._tab_requests)
-
-        if task_count:
-            if task_count != self._last_task_count:
-                self._last_task_count = task_count
-                self._redisdb.hset(
-                    self._tab_spider_time,
-                    SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
-                    tools.get_current_timestamp(),
-                )  # multiple processes would send duplicate alerts, so the time of the last count is recorded in redis
-            else:
-                # step: check whether the interval exceeds 20 minutes
-                lua = """
-                    -- local key = KEYS[1]
-                    local field = ARGV[1]
-                    local current_timestamp = ARGV[2]
-
-                    -- fetch the stored value
-                    local last_timestamp = redis.call('hget', KEYS[1], field)
-                    if last_timestamp and current_timestamp - last_timestamp >= 1200 then
-                        -- return the stall time in seconds
-                        return current_timestamp - last_timestamp
-                    end
-
-                    if not last_timestamp then
-                        redis.call('hset', KEYS[1], field, current_timestamp)
-                    end
-
-                    return 0
-
-                """
-                redis_obj = self._redisdb.get_redis_obj()
-                cmd = redis_obj.register_script(lua)
-                overtime = cmd(
-                    keys=[self._tab_spider_time],
-                    args=[
-                        SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
-                        tools.get_current_timestamp(),
-                    ],
-                )
-
-                if overtime:
-                    # step: log it and send an alert
-                    msg = "{} 爬虫任务停滞 {},请检查爬虫是否正常".format(
-                        self._spider_name, tools.format_seconds(overtime)
-                    )
-                    log.error(msg)  # TODO: a print could be added here so it also shows up in the platform's log panel
-                    self.send_msg(
-                        msg,
-                        level="error",
-                        message_prefix="《{}》爬虫任务停滞".format(self._spider_name),
-                    )
-        else:
-            self._last_task_count = 0
+                ))

         # check the number of failed exports to the database
         if self._item_buffer.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
@@ -442,49 +337,27 @@ class Scheduler(threading.Thread):
                 self._spider_name, self._item_buffer.export_falied_times
             )
             log.error(msg)
-            self.send_msg(
-                msg, level="error", message_prefix="《%s》爬虫导出数据失败" % (self._spider_name)
-            )
-
-    def delete_tables(self, delete_tables_list):
-        if isinstance(delete_tables_list, bool):
-            delete_tables_list = [self._redis_key + "*"]
-        elif not isinstance(delete_tables_list, (list, tuple)):
-            delete_tables_list = [delete_tables_list]
-
-        redis = RedisDB()
-        for delete_tab in delete_tables_list:
-            if not delete_tab.startswith(self._redis_key):
-                delete_tab = self._redis_key + delete_tab
-            tables = redis.getkeys(delete_tab)
-            for table in tables:
-                if table != self._tab_spider_time:
-                    log.info("正在删除key %s" % table)
-                    redis.clear(table)
-                else:
-                    keys = redis.hgetall(table)
-                    for key in keys:
-                        if key.startswith(SPIDER_START_TIME):
-                            redis.hdel(table, key)
+            tools.send_msg(**dict(
+                msg=msg,
+                level="error",
+                message_prefix="《%s》爬虫导出数据失败" % (self._spider_name)
+            ))

     def _stop_all_thread(self):
+        # shut down the task manager
         self._request_buffer.stop()
+        # shut down the data pipeline
         self._item_buffer.stop()
-        # stop the collector
+        # shut down task management
         self._collector.stop()
         # stop the parser_controls
         for parser_control in self._parser_controls:
             parser_control.stop()

+        # record the spider stop time
+        self.__report_node_heartbeat('close')
         self._started.clear()

-    def send_msg(self, msg, level="debug", message_prefix=""):
-        # TODO: this is the alerting hook, but sending a message on every event would flood the channel, so the framework's alerting is not enabled.
-        # A future improvement: alert contents could be submitted via an API, stored, and triaged by urgency; high-urgency messages could be pushed straight to a WeChat group. Avoid storing directly here;
-        # the feapder framework should not write to mongo directly, it should only query it.
-        # log.debug("发送报警 level:{} msg{}".format(level, msg))
-        tools.send_msg(msg=msg, level=level, message_prefix=message_prefix)
-
     def get_argvs(self):
         argvs = {"next_page": False, "max_page": 10}
         for item in sys.argv[1:]:
@@ -494,7 +367,7 @@ class Scheduler(threading.Thread):
                 val = item.split('=')[-1]
                 if key != 'purpose':
                     argvs[key] = eval(val)  # eval is used here to turn the string into a bool or an int
-        return json.loads(json.dumps(argvs), object_hook=Obj)
+        return json.loads(json.dumps(argvs), object_hook=lambda d: SimpleNamespace(**d))

     def spider_begin(self):
         """
@@ -503,7 +376,6 @@ class Scheduler(threading.Thread):
         ---------
         @result:
         """
-
         if self._begin_callback:
             self._begin_callback()

@@ -513,29 +385,23 @@ class Scheduler(threading.Thread):
             parser.platform_max_page = parameter.max_page
             parser.start_callback()

-        # record the start time
-        if not self._redisdb.hexists(self._tab_spider_time, SPIDER_START_TIME_KEY):
-            current_timestamp = tools.get_current_timestamp()
-            self._redisdb.hset(
-                self._tab_spider_time, SPIDER_START_TIME_KEY, current_timestamp
-            )
-
-        # send a message
-        # self.send_msg("《%s》爬虫开始" % self._spider_name)
+        # record the spider start time
+        self.__report_node_heartbeat('start')

-    def spider_end(self):  # step end: housekeeping when the spider finishes
-        self.record_end_time()
+    def spider_end(self):
+        # record the spider end time
+        self.__report_node_heartbeat('end')

-        if self._end_callback:  # built-in callback; not executed when a custom callback is defined
+        if self._end_callback:  # end-of-run callback
             self._end_callback()

         for parser in self._parsers:
             if not self._keep_alive:
                 parser.close()  # spiders may define their own close
-            parser.end_callback()  # invoke the end callback, which spiders may customize
+            parser.end_callback()  # invoke the end callback

         if not self._keep_alive:
-            # close the webdriver
+            # close the webdriver pool
             if Request.webdriver_pool:
                 Request.webdriver_pool.close()

@@ -544,62 +410,23 @@ class Scheduler(threading.Thread):
         else:
             metrics.flush()

-        # compute the crawl duration
-        data = self._redisdb.hget(
-            self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True
-        )
-        if data:
-            begin_timestamp = int(data)
-            elapsed_time = tools.get_current_timestamp() - begin_timestamp
-            msg = "《%s》爬虫结束,耗时 %s" % (
-                self._spider_name,
-                tools.format_seconds(elapsed_time),
-            )
-            log.info(msg)
-
-            # self.send_msg(msg)
-
         if self._keep_alive:
             log.info("爬虫不自动结束,等待下一轮任务...")
         else:
-            if self._collector.get_spider_count() <= 1:
-                self.delete_tables(self._tab_spider_time)
-                self.delete_tables(self._tab_spider_status)
-            else:
-                # clear this spider's heartbeat record instead of deleting the shared task tables, which would leave the remaining spiders hung
-                self._collector.delete_spider_node()
-
-    def record_end_time(self):
-        # record the end time
-        if self._batch_interval:
-            current_timestamp = tools.get_current_timestamp()
-            self._redisdb.hset(
-                self._tab_spider_time, SPIDER_END_TIME_KEY, current_timestamp
-            )
+            log.info("《%s》爬虫结束" % (self._spider_name))

-    def is_reach_next_spider_time(self):  # can be ignored when no batch interval is configured
-        if not self._batch_interval:
-            return True
-        # compare the previous finish time with the current time; if the configured interval has not elapsed, the spider does not start and waits until the time arrives
-        last_spider_end_time = self._redisdb.hget(
-            self._tab_spider_time, SPIDER_END_TIME_KEY
-        )
-        if last_spider_end_time:
-            last_spider_end_time = int(last_spider_end_time)
-            current_timestamp = tools.get_current_timestamp()
-            time_interval = current_timestamp - last_spider_end_time
-
-            if time_interval < self._batch_interval * 86400:
-                log.info(
-                    "上次运行结束时间为 {} 与当前时间间隔 为 {}, 小于规定的抓取时间间隔 {}。爬虫不执行,退出~".format(
-                        tools.timestamp_to_date(last_spider_end_time),
-                        tools.format_seconds(time_interval),
-                        tools.format_seconds(self._batch_interval * 86400),
-                    )
-                )
-                return False
-
-        return True
+    def __report_node_heartbeat(self, status):
+        """
+        Spider heartbeat
+        """
+        message = {
+            'ip': tools.get_localhost_ip(),
+            'spider_id': self._spider_id,
+            'spider_name': self._spider_name,
+            'ts': tools.get_current_timestamp(),
+            'status': status
+        }
+        self._rabbitmq.add(self._tab_spider_heartbeat, message)

     def join(self, timeout=None):
         """
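
The new `__report_node_heartbeat` publishes lifecycle events ('start', 'running', 'end', 'close') to the queue declared from `setting.SPIDER_HEARTBEAT`, replacing the Redis start/end-time bookkeeping removed above. Below is a minimal, illustrative consumer sketch, not part of the patch: it assumes the `feapder.db.rabbitMq.RabbitMQ` wrapper JSON-encodes the dict passed to `add()`, that the queue is named "spider_heartbeat", and it uses `pika` directly because the wrapper's consumer API is not shown in this diff.

    import json
    import pika

    def consume_heartbeats(queue="spider_heartbeat", host="localhost"):
        # Connection parameters are placeholders; point them at the broker feapder is configured to use.
        connection = pika.BlockingConnection(pika.ConnectionParameters(host=host))
        channel = connection.channel()

        def on_message(ch, method, properties, body):
            # Expected payload keys: 'ip', 'spider_id', 'spider_name', 'ts', 'status'
            beat = json.loads(body)
            print(f"{beat['spider_name']}@{beat['ip']} -> {beat['status']} ({beat['ts']})")

        channel.basic_consume(queue=queue, on_message_callback=on_message, auto_ack=True)
        channel.start_consuming()

    if __name__ == "__main__":
        consume_heartbeats()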