scheduler.py

# -*- coding: utf-8 -*-
"""
Created on 2017-01-09 10:38
---------
@summary: Wires together the parser, parser_control and collector
---------
@author: Boris
@email: boris_liu@foxmail.com
"""
import threading
import time

# collections.Iterable was removed in Python 3.10; the ABC lives in collections.abc
from collections.abc import Iterable

import feapder.setting as setting
import feapder.utils.tools as tools
from feapder.buffer.item_buffer import ItemBuffer
from feapder.buffer.request_buffer import RequestBuffer
from feapder.core.base_parser import BaseParser
from feapder.core.collector import Collector
from feapder.core.handle_failed_requests import HandleFailedRequests
from feapder.core.parser_control import PaserControl  # sic: spelling as in feapder
from feapder.db.redisdb import RedisDB
from feapder.network.item import Item
from feapder.network.request import Request
from feapder.utils.log import log
from feapder.utils.redis_lock import RedisLock
from feapder.utils import metrics

SPIDER_START_TIME_KEY = "spider_start_time"
SPIDER_END_TIME_KEY = "spider_end_time"
SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY = "last_task_count_record_time"


class Scheduler(threading.Thread):
    __custom_setting__ = {}

    def __init__(
        self,
        redis_key=None,
        thread_count=None,
        begin_callback=None,
        end_callback=None,
        delete_keys=(),
        keep_alive=None,
        auto_start_requests=None,
        batch_interval=0,
        wait_lock=True,
        task_table=None,
        **kwargs
    ):
  46. """
  47. @summary: 调度器
  48. ---------
  49. @param redis_key: 爬虫request及item存放redis中的文件夹
  50. @param thread_count: 线程数,默认为配置文件中的线程数
  51. @param begin_callback: 爬虫开始回调函数
  52. @param end_callback: 爬虫结束回调函数
  53. @param delete_keys: 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则
  54. @param keep_alive: 爬虫是否常驻,默认否
  55. @param auto_start_requests: 爬虫是否自动添加任务
  56. @param batch_interval: 抓取时间间隔 默认为0 天为单位 多次启动时,只有当前时间与第一次抓取结束的时间间隔大于指定的时间间隔时,爬虫才启动
  57. @param wait_lock: 下发任务时否等待锁,若不等待锁,可能会存在多进程同时在下发一样的任务,因此分布式环境下请将该值设置True
  58. @param task_table: 任务表, 批次爬虫传递
  59. ---------
  60. @result:
  61. """
        super(Scheduler, self).__init__()

        for key, value in self.__class__.__custom_setting__.items():
            if key == "AUTO_STOP_WHEN_SPIDER_DONE":  # compatibility with old-version settings
                setattr(setting, "KEEP_ALIVE", not value)
            else:
                setattr(setting, key, value)

        self._redis_key = redis_key or setting.REDIS_KEY
        if not self._redis_key:
            raise Exception(
                """
                redis_key is the redis folder that holds requests and items. It must not be empty.
                Configure it in setting, e.g. REDIS_KEY = 'test',
                or pass it when initializing the spider, e.g. TestSpider(redis_key='test')
                """
            )

        # use the resolved key, so the setting fallback also reaches these components
        self._request_buffer = RequestBuffer(self._redis_key)
        self._item_buffer = ItemBuffer(self._redis_key, task_table)
        self._collector = Collector(self._redis_key)
        self._parsers = []
        self._parser_controls = []
        self._parser_control_obj = PaserControl

        # compatibility with old-version arguments
        if "auto_stop_when_spider_done" in kwargs:
            self._keep_alive = not kwargs.get("auto_stop_when_spider_done")
        else:
            self._keep_alive = (
                keep_alive if keep_alive is not None else setting.KEEP_ALIVE
            )
        self._auto_start_requests = (
            auto_start_requests
            if auto_start_requests is not None
            else setting.SPIDER_AUTO_START_REQUESTS
        )
        self._batch_interval = batch_interval

        self._begin_callback = (
            begin_callback
            if begin_callback
            else lambda: log.info("\n********** feapder begin **********")
        )
        self._end_callback = (
            end_callback
            if end_callback
            else lambda: log.info("\n********** feapder end **********")
        )

        self._thread_count = (
            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
        )

        self._spider_name = self._redis_key
        self._project_name = self._redis_key.split(":")[0]

        self._tab_spider_time = setting.TAB_SPIDER_TIME.format(redis_key=self._redis_key)
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=self._redis_key)
        self._tab_requests = setting.TAB_REQUSETS.format(redis_key=self._redis_key)
        self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format(
            redis_key=self._redis_key
        )

        self._is_notify_end = False  # whether the end has already been notified
        self._last_task_count = 0  # task count seen on the last check
        self._redisdb = RedisDB()

        self._project_total_state_table = "{}_total_state".format(self._project_name)
        self._is_exist_project_total_state_table = False

        # Request cache settings
        Request.cached_redis_key = self._redis_key
        Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME

        delete_keys = delete_keys or setting.DELETE_KEYS
        if delete_keys:
            self.delete_tables(delete_keys)

        self._last_check_task_status_time = 0
        self.wait_lock = wait_lock

        self.init_metrics()

    def init_metrics(self):
        """
        Initialize the metrics system
        """
        metrics.init(**setting.METRICS_OTHER_ARGS)

    def add_parser(self, parser):
        parser = parser()  # instantiate the parser
        if isinstance(parser, BaseParser):
            self._parsers.append(parser)
        else:
            raise ValueError(
                "Type error: a spider must inherit from feapder.BaseParser or feapder.BatchParser"
            )
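
    # Usage sketch: pass the parser class itself, not an instance, e.g.
    # scheduler.add_parser(MyParser) where MyParser is a hypothetical
    # feapder.BaseParser subclass; add_parser instantiates it.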

    def run(self):
        if not self.is_reach_next_spider_time():
            return

        self._start()

        while True:
            try:
                if self.all_thread_is_done():
                    if not self._is_notify_end:
                        self.spider_end()  # one full round finished
                        self.record_spider_state(
                            spider_type=1,
                            state=1,
                            spider_end_time=tools.get_current_date(),
                            batch_interval=self._batch_interval,
                        )
                        self._is_notify_end = True

                    if not self._keep_alive:
                        self._stop_all_thread()
                        break
                else:
                    self._is_notify_end = False

                self.check_task_status()
            except Exception as e:
                log.exception(e)

            tools.delay_time(1)  # check the spider status once per second
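
    # Lifecycle sketch: run() starts the buffers, collector and parser threads via
    # _start(), then polls once per second. When every stage reports idle the round
    # is finished; with keep_alive the loop keeps waiting for new tasks instead of
    # stopping the worker threads.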

    def __add_task(self):
        # kick off the parsers' start_requests
        self.spider_begin()  # for a spider that never stops on its own, this can only run once
        self.record_spider_state(
            spider_type=1,
            state=0,
            batch_date=tools.get_current_date(),
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )

        # check whether the task pool still holds tasks; if so, resume crawling them
        todo_task_count = self._collector.get_requests_count()
        if todo_task_count:
            log.info(
                "Found %s pending tasks; no new tasks dispatched, resuming from the last abnormal stop"
                % todo_task_count
            )
        else:
            for parser in self._parsers:
                results = parser.start_requests()
                # push requests onto the request queue; the queue persists them in one place
                if results and not isinstance(results, Iterable):
                    raise Exception(
                        "%s.%s must return an iterable" % (parser.name, "start_requests")
                    )

                result_type = 1
                for result in results or []:
                    if isinstance(result, Request):
                        result.parser_name = result.parser_name or parser.name
                        self._request_buffer.put_request(result)
                        result_type = 1
                    elif isinstance(result, Item):
                        self._item_buffer.put_item(result)
                        result_type = 2
                    elif callable(result):  # a callable may be a function that updates the database
                        # route it to whichever buffer the previous result went to
                        if result_type == 1:
                            self._request_buffer.put_request(result)
                        else:
                            self._item_buffer.put_item(result)
                    else:
                        raise TypeError(
                            "start_requests yield result type error, expect Request, Item or a callback func, but got type: {}".format(
                                type(result)
                            )
                        )

                self._request_buffer.flush()
                self._item_buffer.flush()

    def _start(self):
        # start the request_buffer
        self._request_buffer.start()
        # start the item_buffer
        self._item_buffer.start()
        # start the collector
        self._collector.start()

        # start the parser controls
        for i in range(self._thread_count):
            parser_control = self._parser_control_obj(
                self._collector,
                self._redis_key,
                self._request_buffer,
                self._item_buffer,
            )
            for parser in self._parsers:
                parser_control.add_parser(parser)

            parser_control.start()
            self._parser_controls.append(parser_control)

        # dispatch tasks last, since this can take a long time
        if setting.RETRY_FAILED_REQUESTS:
            # requeue failed tasks; no lock needed, this is an atomic operation
            handle_failed_requests = HandleFailedRequests(self._redis_key)
            handle_failed_requests.reput_failed_requests_to_requests()

        # dispatch new tasks
        if self._auto_start_requests:  # automatic dispatch
            if self.wait_lock:
                # lock the task-adding section to keep multiple processes from adding duplicates
                with RedisLock(key=self._spider_name) as lock:
                    if lock.locked:
                        self.__add_task()
            else:
                self.__add_task()

    def all_thread_is_done(self):
        # Check three times to reduce flukes: the stages are not synchronized, so one
        # may look idle at the instant it is checked yet become busy again while the
        # next stage is inspected. A single pass can easily hit this race.
        for i in range(3):
            # check the collector status
            if (
                self._collector.is_collector_task()
                or self._collector.get_requests_count() > 0
            ):
                return False

            # check the parser_control status
            for parser_control in self._parser_controls:
                if not parser_control.is_not_task():
                    return False

            # check the item_buffer status
            if (
                self._item_buffer.get_items_count() > 0
                or self._item_buffer.is_adding_to_db()
            ):
                return False

            # check the request_buffer status
            if (
                self._request_buffer.get_requests_count() > 0
                or self._request_buffer.is_adding_to_db()
            ):
                return False

            tools.delay_time(1)

        return True
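
    # Note: the checks above appear to mirror the pipeline order
    # (collector -> parser_control -> buffers), so work that is merely moving
    # between stages is still caught by one of the three passes.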

    @tools.run_safe_model("check_task_status")
    def check_task_status(self):
        """
        Check the task status and raise alarms
        """
        # check every 30 seconds
        now_time = time.time()
        if now_time - self._last_check_task_status_time > 30:
            self._last_check_task_status_time = now_time
        else:
            return

        # Check the task status in redis. If the task count has not changed for 10
        # consecutive minutes (the parser may be stuck), send an alarm.
        task_count = self._redisdb.zget_count(self._tab_requests)

        if task_count:
            if task_count != self._last_task_count:
                self._last_task_count = task_count
                self._redisdb.hset(
                    self._tab_spider_time,
                    SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                    tools.get_current_timestamp(),
                )  # multiple processes would send duplicate alarms, so redis records the last check time
            else:
                # check whether the stall exceeds 10 minutes (600 seconds)
                lua = """
                    -- local key = KEYS[1]
                    local field = ARGV[1]
                    local current_timestamp = ARGV[2]

                    -- read the recorded value
                    local last_timestamp = redis.call('hget', KEYS[1], field)
                    if last_timestamp and current_timestamp - last_timestamp >= 600 then
                        return current_timestamp - last_timestamp -- return the stall time in seconds
                    end

                    if not last_timestamp then
                        redis.call('hset', KEYS[1], field, current_timestamp)
                    end

                    return 0
                """
                redis_obj = self._redisdb.get_redis_obj()
                cmd = redis_obj.register_script(lua)
                overtime = cmd(
                    keys=[self._tab_spider_time],
                    args=[
                        SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                        tools.get_current_timestamp(),
                    ],
                )

                if overtime:
                    # send the alarm before exiting, otherwise it would never go out
                    msg = "{} spider tasks stalled for {}, please check whether the spider is healthy".format(
                        self._spider_name, tools.format_seconds(overtime)
                    )
                    log.error(msg)
                    self.send_msg(
                        msg,
                        level="error",
                        message_prefix="Spider {} tasks stalled".format(self._spider_name),
                    )
                    log.error("Spider tasks stalled abnormally; forcing the spider to exit")
                    exit()
        else:
            self._last_task_count = 0

        # alarm when the failed-task count exceeds the threshold
        failed_count = self._redisdb.zget_count(self._tab_failed_requests)
        if failed_count > setting.WARNING_FAILED_COUNT:
            # send the alarm
            msg = "Spider %s currently has %s failed tasks, please check whether the spider is healthy" % (
                self._spider_name,
                failed_count,
            )
            log.error(msg)
            self.send_msg(
                msg,
                level="error",
                message_prefix="Spider %s failed-task count alarm" % (self._spider_name),
            )

        # parser_control keeps live counts of done and failed tasks; alarm when the success rate < 0.5
        failed_task_count, success_task_count = PaserControl.get_task_status_count()
        total_count = success_task_count + failed_task_count
        if total_count > 0:
            task_success_rate = success_task_count / total_count
            if task_success_rate < 0.5:
                # send the alarm
                msg = "Spider %s: %s tasks succeeded, %s failed, success rate %.2f, please check whether the spider is healthy" % (
                    self._spider_name,
                    success_task_count,
                    failed_task_count,
                    task_success_rate,
                )
                log.error(msg)
                self.send_msg(
                    msg,
                    level="error",
                    message_prefix="Spider %s success-rate alarm" % (self._spider_name),
                )

        # check the export-failure count
        # (export_falied_times is spelled this way by feapder's ItemBuffer)
        if self._item_buffer.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
            msg = "Spider {} failed to export data {} times, please check whether the spider is healthy".format(
                self._spider_name, self._item_buffer.export_falied_times
            )
            log.error(msg)
            self.send_msg(
                msg,
                level="error",
                message_prefix="Spider %s data-export failure" % (self._spider_name),
            )
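
    # The alarm thresholds above come from feapder's settings; a project's
    # setting.py might set, for instance (the original comment suggests 1000 as
    # the typical failed-task threshold, the second value is hypothetical):
    #   WARNING_FAILED_COUNT = 1000
    #   EXPORT_DATA_MAX_FAILED_TIMES = 10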

    def delete_tables(self, delete_tables_list):
        if isinstance(delete_tables_list, bool):
            delete_tables_list = [self._redis_key + "*"]
        elif not isinstance(delete_tables_list, (list, tuple)):
            delete_tables_list = [delete_tables_list]

        redis = RedisDB()
        for delete_tab in delete_tables_list:
            if not delete_tab.startswith(self._redis_key):
                delete_tab = self._redis_key + delete_tab
            tables = redis.getkeys(delete_tab)
            for table in tables:
                if table != self._tab_spider_time:
                    log.info("Deleting key %s" % table)
                    redis.clear(table)
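
    # Usage sketch: delete_tables(True) expands to ["<redis_key>*"] and clears the
    # whole namespace; a plain string such as "_z_requests" (hypothetical suffix) is
    # prefixed with redis_key first, and per the __init__ docstring regex patterns
    # are supported. The spider-time table is always preserved.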

    def _stop_all_thread(self):
        self._request_buffer.stop()
        self._item_buffer.stop()
        # stop the collector
        self._collector.stop()
        # stop the parser_controls
        for parser_control in self._parser_controls:
            parser_control.stop()

        self._started.clear()

    def send_msg(self, msg, level="debug", message_prefix=""):
        # log.debug("Sending alarm, level: {}, msg: {}".format(level, msg))
        tools.send_msg(msg=msg, level=level, message_prefix=message_prefix)

    def spider_begin(self):
        """
        @summary: When launched via start_monitor_task, this function and spider_end
        run in different processes, so variables cannot be shared between them
        ---------
        ---------
        @result:
        """
        if self._begin_callback:
            self._begin_callback()

        for parser in self._parsers:
            parser.start_callback()

        # record the start time
        if not self._redisdb.hexists(self._tab_spider_time, SPIDER_START_TIME_KEY):
            current_timestamp = tools.get_current_timestamp()
            self._redisdb.hset(
                self._tab_spider_time, SPIDER_START_TIME_KEY, current_timestamp
            )

        # send a message
        # self.send_msg("Spider %s started" % self._spider_name)

    def spider_end(self):
        self.record_end_time()

        if self._end_callback:
            self._end_callback()

        for parser in self._parsers:
            if not self._keep_alive:
                parser.close()
            parser.end_callback()

        if not self._keep_alive:
            # close the webdriver
            if Request.webdriver_pool:
                Request.webdriver_pool.close()

            # close the metrics reporter
            metrics.close()
        else:
            metrics.flush()

        # compute the crawl duration
        data = self._redisdb.hget(
            self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True
        )
        if data:
            begin_timestamp = int(data)
            spend_time = tools.get_current_timestamp() - begin_timestamp

            msg = "Spider %s finished, took %s" % (
                self._spider_name,
                tools.format_seconds(spend_time),
            )
            log.info(msg)

            # self.send_msg(msg)

        if self._keep_alive:
            log.info("Spider does not stop automatically; waiting for the next round of tasks...")
        else:
            self.delete_tables(self._tab_spider_status)

    def record_end_time(self):
        # record the end time
        if self._batch_interval:
            current_timestamp = tools.get_current_timestamp()
            self._redisdb.hset(
                self._tab_spider_time, SPIDER_END_TIME_KEY, current_timestamp
            )

    def is_reach_next_spider_time(self):
        if not self._batch_interval:
            return True

        # batch_interval is in days; compare against the end time of the previous run
        last_spider_end_time = self._redisdb.hget(
            self._tab_spider_time, SPIDER_END_TIME_KEY
        )
        if last_spider_end_time:
            last_spider_end_time = int(last_spider_end_time)
            current_timestamp = tools.get_current_timestamp()
            time_interval = current_timestamp - last_spider_end_time

            if time_interval < self._batch_interval * 86400:
                log.info(
                    "Last run ended at {}; only {} has passed, less than the configured crawl interval {}. The spider will not run and exits~".format(
                        tools.timestamp_to_date(last_spider_end_time),
                        tools.format_seconds(time_interval),
                        tools.format_seconds(self._batch_interval * 86400),
                    )
                )
                return False

        return True
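
    # Worked example (hypothetical numbers): with batch_interval=1 and a previous
    # run that ended 20 hours ago, time_interval = 72000 < 1 * 86400, so the method
    # returns False and run() exits without starting a new round.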

    def record_spider_state(
        self,
        spider_type,
        state,
        batch_date=None,
        spider_start_time=None,
        spider_end_time=None,
        batch_interval=None,
    ):
        pass

    def join(self, timeout=None):
        """
        Override Thread.join
        """
        if not self._started.is_set():
            return

        super().join()