@@ -1,252 +1,182 @@
 # -*- coding: utf-8 -*-
 """
-Created on 2020/4/22 12:05 AM
+Created on 2024-08-19
 ---------
-@summary:
+@summary:
 ---------
-@author: Boris
-@email: boris_liu@foxmail.com
+@author: Dzr
 """
 
-import warnings
-from collections import Iterable
-
-import amqpstorm
+from threading import Thread
 
 import feapder.setting as setting
 import feapder.utils.tools as tools
+from feapder.buffer.heartbeat_buffer import HeartBeatBuffer
+from feapder.buffer.item_buffer import JyItemBuffer
 from feapder.core.base_parser import BaseParser
-from feapder.core.scheduler import Scheduler
-from feapder.network.item import Item, FailedTaskItem
+from feapder.core.parser_control import JySpiderParserControl
+from feapder.db.memory_db import MemoryDB
+from feapder.network.item import FailedTaskItem
 from feapder.network.request import Request
 from feapder.utils.log import log
 
-CONSOLE_PIPELINE_PATH = "feapder.pipelines.console_pipeline.ConsolePipeline"
-
-
-class Spider(
-    BaseParser, Scheduler
-):  # threading has a name attribute; BaseParser must be inherited first, otherwise its name is overridden by threading.Thread's name through Scheduler's base class
-    """
-    @summary: simplifies building a spider
-    ---------
-    """
-
-    def __init__(
-        self,
-        redis_key=None,
-        user=None,
-        check_task_interval=5,
-        thread_count=None,
-        begin_callback=None,
-        end_callback=None,
-        keep_alive=None,
-        auto_start_requests=None,
-        **kwargs
-    ):
-        """
-        @summary: spider
-        ---------
-        @param redis_key: key prefix under which tasks and related data are stored in redis
-        @param user: consumer identity for a specific MQ program; takes effect when multiple producers feed a single consumer
-        @param check_task_interval: interval for checking whether tasks remain; defaults to 5 seconds
-        @param thread_count: number of threads; defaults to the value in the settings file
-        @param begin_callback: callback invoked when the spider starts
-        @param end_callback: callback invoked when the spider ends
-        @param keep_alive: whether the spider stays resident
-        @param auto_start_requests: whether the spider adds start tasks automatically
-        ---------
-        @result:
-        """
-        super(Spider, self).__init__(
-            redis_key=redis_key,
-            user=user,
-            thread_count=thread_count,
-            begin_callback=begin_callback,
-            end_callback=end_callback,
-            keep_alive=keep_alive,
-            auto_start_requests=auto_start_requests,
-            **kwargs
-        )
-
-        self._check_task_interval = check_task_interval
-        self._is_distributed_task = False
-        self._is_show_not_task = False
-
-    def run(self):  # entry point of the scheduling control flow
-        if not self._parsers:
-            self._parsers.append(self)
-
-        self._start()
-
-        while True:
-            try:
-                if self.all_thread_is_done():
-                    if not self._is_notify_end:
-                        self.spider_end()  # one round finished
-                        self._is_notify_end = True
-
-                    if not self._keep_alive:
-                        self._stop_all_thread()
-                        break
-                else:
-                    self._is_notify_end = False
-
-                self.check_task_status()
-            except (Exception, BaseException) as e:
-                log.exception(e)
 
-        tools.delay_time(1)  # check the spider status once per second
+class Spider(BaseParser, Thread):
+    __custom_setting__ = {}
 
-    @classmethod
-    def to_DebugSpider(cls, *args, **kwargs):
-        # make DebugSpider inherit from cls
-        DebugSpider.__bases__ = (cls,)
-        DebugSpider.__name__ = cls.__name__
-        return DebugSpider(*args, **kwargs)
-
-
-class DebugSpider(Spider):
-    """
-    Debug spider
-    """
-
-    __debug_custom_setting__ = dict(
-        COLLECTOR_SLEEP_TIME=1,
-        COLLECTOR_TASK_COUNT=1,
-        SPIDER_THREAD_COUNT=1,  # SPIDER
-        SPIDER_SLEEP_TIME=0,
-        SPIDER_TASK_COUNT=1,
-        SPIDER_MAX_RETRY_TIMES=10,
-        REQUEST_LOST_TIMEOUT=600,  # 10 minutes
-        PROXY_ENABLE=False,
-        RETRY_FAILED_REQUESTS=False,
-        SAVE_FAILED_REQUEST=False,  # save failed requests
-        ITEM_FILTER_ENABLE=False,  # filtering
-        REQUEST_FILTER_ENABLE=False,
-        OSS_UPLOAD_TABLES=(),
-        DELETE_KEYS=True,
-        ITEM_PIPELINES=[CONSOLE_PIPELINE_PATH],
-    )
+    __business_type__ = ""
 
-    def __init__(self, request=None, request_dict=None, *args, **kwargs):
+    def __init__(self, redis_key, thread_count=None, **kwargs):
         """
-        @param request: a Request instance
-        @param request_dict: request as a dict; provide either request or request_dict
-        @param kwargs:
+
+        ---------
+        @param redis_key:
+        @param thread_count: number of threads; defaults to the value in the settings file
+        ---------
         """
-        warnings.warn(
-            "您正处于debug模式下,该模式下不会更新任务状态及数据入库,仅用于调试。正式发布前请更改为正常模式", category=Warning
-        )
+        super(Spider, self).__init__()
 
-        if not request and not request_dict:
-            raise Exception("request 与 request_dict 不能同时为null")
+        for key, value in self.__class__.__custom_setting__.items():
+            setattr(setting, key, value)
 
-        kwargs["redis_key"] = kwargs["redis_key"] + "_debug"
-        self.__class__.__custom_setting__.update(
-            self.__class__.__debug_custom_setting__
+        self._thread_count = (
+            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
         )
 
-        super(DebugSpider, self).__init__(*args, **kwargs)
-
-        self._request = request or Request.from_dict(request_dict)
-
-    def __start_requests(self):
-        yield self._request
-
-    def _start(self):
-        # start the parsers' start_requests
-        self.spider_begin()  # for spiders that do not end automatically, this runs only once
+        self._heartbeat_buffer = HeartBeatBuffer()
+        self._item_buffer = JyItemBuffer(redis_key=redis_key)
+
+        self._memory_db = MemoryDB()
+        self._parser_controls = []  # list of spider instances
+
+        self.tasks_dict = {}
+        self.task_api_auth_token = None
+
+    def distribute_task(self):
+        for request in self.start_requests():
+            if not isinstance(request, Request):
+                raise ValueError("仅支持 yield Request")
+
+            request.parser_name = request.parser_name or self.name
+            self._memory_db.add(request)
+
+    def all_thread_is_done(self):
+        for i in range(3):
+            # check heartbeat_buffer state
+            if (
+                self._heartbeat_buffer.get_items_count() > 0
+                or self._heartbeat_buffer.is_adding_to_db()
+            ):
+                return False
+
+            # check parser_control state
+            for parser_control in self._parser_controls:
+                if not parser_control.is_not_task():
+                    return False
+
+            # check task queue state
+            if not self._memory_db.empty():
+                return False
+
+            # check item_buffer state
+            if (
+                self._item_buffer.get_items_count() > 0
+                or self._item_buffer.is_adding_to_db()
+            ):
+                return False
+
+            tools.delay_time(1)
+
+        return True
+
+    def get_task_api_token(self):
+        # fetch the token
+        if self.task_api_auth_token is None:
+            token_url = f"{setting.JY_TASK_URL}/tasks/token"
+            data = {"username": "spider@py", "password": "123@qweA!"}
+            auth_params = dict(url=token_url, timeout=10, data=data, proxies=False)
+            response = Request(method="GET", **auth_params).get_response()
+            token = response.json["token"]
+            self.task_api_auth_token = token
+            log.debug(f"Apply Task api Token:{self.task_api_auth_token}")
 
-        for parser in self._parsers:
-            results = parser.__start_requests()
-            # push requests to the request queue, which persists them centrally
-            if results and not isinstance(results, Iterable):
-                raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
-
-            result_type = 1
-            for result in results or []:
-                if isinstance(result, Request):
-                    result.parser_name = result.parser_name or parser.name
-                    self._request_buffer.put_request(result)
-                    result_type = 1
-
-                elif isinstance(result, Item):
-                    self._item_buffer.put_item(result)
-                    result_type = 2
-
-                elif callable(result):  # a callable result may be a function that updates the database
-                    if result_type == 1:
-                        self._request_buffer.put_request(result)
-                    else:
-                        self._item_buffer.put_item(result)
-
-            self._request_buffer.flush()
-            self._item_buffer.flush()
+    def run(self):  # entry point of the scheduling control flow
+        self.start_callback()
 
-        # start the collector
-        self._collector.start()
+        self._heartbeat_buffer.start()  # start heartbeat_buffer
 
-        # start the parser controls
         for i in range(self._thread_count):
-            parser_control = self._parser_control_obj(
-                self._collector,
-                self._redis_key,
-                self._request_buffer,
-                self._item_buffer,
+            parser_control = JySpiderParserControl(
+                memory_db=self._memory_db,
+                item_buffer=self._item_buffer,
+                heartbeat_buffer=self._heartbeat_buffer
             )
-
-            for parser in self._parsers:
-                parser_control.add_parser(parser)
-
+            parser_control.add_parser(self)
             parser_control.start()
            self._parser_controls.append(parser_control)
 
-        # start the request_buffer
-        self._request_buffer.start()
-
-        # start the item_buffer
         self._item_buffer.start()
-
-    def run(self):
-        if not self._parsers:  # not in add_parser mode
-            self._parsers.append(self)
-
-        self._start()
-
+        if self.__class__.__business_type__.endswith("Detail"):
+            self._item_buffer.release_task_enable = True  # enable the spider to release collection tasks
+            self.get_task_api_token()  # request a token
+
+        # distribute tasks
+        self.distribute_task()
+        # add distributed tasks to the item_buffer cache
+        self._item_buffer.tasks_dict.update(self.tasks_dict)
         while True:
             try:
                 if self.all_thread_is_done():
-                    self._stop_all_thread()
+                    # stop parser_controls
+                    for parser_control in self._parser_controls:
+                        parser_control.stop()
+
+                    self._item_buffer.stop()  # shut down item_buffer
+                    self._heartbeat_buffer.stop()  # shut down heartbeat_buffer
+
+                    # close the webdriver
+                    if Request.webdriver_pool:
+                        Request.webdriver_pool.close()
+
+                    log.info("无任务,爬虫结束")
                     break
+
             except Exception as e:
                 log.exception(e)
 
             tools.delay_time(1)  # check the spider status once per second
 
+        self._item_buffer.release_tasks(self.tasks_dict, finished=False)  # release remaining unfinished tasks
+        self.end_callback()
+        self._started.clear()  # allow the thread to be started again
+
+    def join(self, timeout=None):
+        """
+        Override Thread.join
+        """
+        if not self._started.is_set():
+            return
+
+        super().join()
+
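Spider is now a plain `threading.Thread` subclass driven by an in-memory queue instead of the old Scheduler, so a crawl is launched with `start()`/`join()`. Below is a minimal usage sketch, assuming the class is importable as `feapder.Spider` and `feapder.Request` the way upstream feapder exports them; `MySpider`, the example URL, and the `parse` body are illustrative and not part of this change.

```python
import feapder  # assumed package-level exports, as in upstream feapder
from feapder import Request


class MySpider(feapder.Spider):  # hypothetical subclass for illustration
    def start_requests(self):
        # distribute_task() only accepts Request objects, per the isinstance check above
        yield Request("https://example.com")

    def parse(self, request, response):
        print(response.text)  # placeholder; a real spider would yield items here


if __name__ == "__main__":
    spider = MySpider(redis_key="demo:spider")  # redis_key is now a required argument
    spider.start()  # Thread.start() -> run(): starts buffers, parser controls, idle checks
    spider.join()   # the overridden join() returns immediately if the thread never started
```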
 
 class BaseBusinessListSpider(Spider):
     """Base spider for list-page collection"""
 
     __business_type__ = "List"
 
-    def __auto_increment_page_number(self, request):
-        """Paging - auto-increment the page number"""
-        if request.page is None:
-            raise ValueError('请设置 request.page 起始页码数')
+    def infinite_pages(self, request, response):
+        """Infinite paging"""
+
+        def _page_increment():
+            if request.page is None:
+                raise ValueError("请设置 request.page 起始页码数")
 
-        if request.page < int(request.item["crawl_page"]):
-            request.page += 1  # increment the page number
-            yield request
+            if request.page < int(request.item["crawl_page"]):
+                request.page += 1  # increment the page number
+                yield request
 
-    def infinite_pages(self, request, response):
-        """Paging"""
-        generator = self.__auto_increment_page_number(request)
-        try:
-            request = next(generator)
-            return request
-        except StopIteration:
-            pass
+        return next(_page_increment(), None)
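The new `infinite_pages` returns either the next page's request or `None` (via `next(_page_increment(), None)`) rather than forcing callers to handle `StopIteration`. A hedged sketch of how a list-page parse callback might drive it; the XPath selector and item handling are placeholders, while `request.page` and `request.item["crawl_page"]` are the fields the code above actually checks, so the seed request must carry both.

```python
    # Illustrative parse method for a BaseBusinessListSpider subclass (not part of this diff)
    def parse(self, request, response):
        for row in response.xpath('//ul[@class="list"]/li'):  # placeholder selector
            ...  # build and yield data items here

        # infinite_pages() increments request.page and returns the request,
        # or returns None once the crawl_page budget is exhausted
        next_request = self.infinite_pages(request, response)
        if next_request is not None:
            yield next_request
```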
 
 
 class BaseBusinessDetailSpider(Spider):
@@ -257,32 +187,18 @@ class BaseBusinessDetailSpider(Spider):
         ITEM_FILTER_ENABLE=False
     )
 
-    def __init__(
-        self,
-        redis_key=None,
-        thread_count=None,
-        begin_callback=None,
-        end_callback=None,
-        delete_keys=(),
-        keep_alive=None,
-        auto_start_requests=None,
-        **kwargs
-    ):
+    def __init__(self, redis_key, thread_count=None, **kwargs):
         self.__class__.__custom_setting__.update(
             self.__class__.__business_setting__
         )
-        redis_key = f'{redis_key}_detailc'
         super(BaseBusinessDetailSpider, self).__init__(
             redis_key=redis_key,
             thread_count=thread_count,
-            begin_callback=begin_callback,
-            end_callback=end_callback,
-            delete_keys=delete_keys,
-            keep_alive=keep_alive,
-            auto_start_requests=auto_start_requests,
             **kwargs
         )
 
+        self._redis_key = redis_key
+
     def failed_request(self, request, response):
         """Record error details after request/parse failures exceed the retry limit"""
         failed_item = FailedTaskItem(
@@ -293,73 +209,23 @@ class BaseBusinessDetailSpider(Spider):
         failed_item.table_name = setting.TASK_REQUEST_FAILED
         yield failed_item
 
-    def get_tasks_by_rabbitmq(self, limit=None, auto_ack=True):
-        """
-
-        @param limit: number of messages to fetch
-        @param auto_ack: automatically acknowledge messages
-        """
-        queue_name = setting.TAB_ITEMS.format(
-            redis_key=self._redis_key.replace("_detailc", "")
-        )
-        limit = limit or setting.COLLECTOR_TASK_COUNT
-        correlation_id = tools.get_uuid().replace("-", "")
-        if self._rabbitmq.get_message_count(queue_name) == 0:
-            # step 1: push a message asking for tasks to be produced
-            produce_queue = "pyspider.report.produce"
-            produce_task = {
-                "ip": tools.get_localhost_ip(),
-                "queue_name": queue_name,
-                "coll_name": setting.TASK_REQUEST_PRODUCE,
-                "limit": limit,
-            }
-            properties = dict(correlation_id=correlation_id)
-            self._rabbitmq.add(produce_task, produce_queue, properties=properties)
-
-            # step 2: wait for the reply confirming task production has finished
-            receipt_queue = f"receipt_{correlation_id}"
-            with self._rabbitmq.get_mq_obj().channel() as channel:
-                try:
-                    channel.basic.consume(queue=receipt_queue, no_ack=True)
-                    tools.delay_time(0.8)  # interval between subscribing and collecting messages
-                    inbound = channel.build_inbound_messages(break_on_empty=True)
-                    message_dict = {msg.correlation_id: msg for msg in inbound}
-                    # log.debug(f"采集任务推送 {message_dict}")
-                    message = message_dict.get(correlation_id)
-                    if message:
-                        body = tools.loads_obj(message.body)
-                        log.debug(f"推送任务到采集队列《{body['queue_name']}》完成")
-                except amqpstorm.exception.AMQPChannelError:
-                    pass
-
-        # step 3: start pulling tasks
-        task_lst = []
-        messages = self._rabbitmq.get(queue_name, limit, auto_ack, to_str=False)
-        for message in messages:
-            body = message.body
-            if isinstance(body, Item):
-                task_lst.append(body.to_dict)
-            else:
-                task_lst.append(body)
-        return task_lst
-
-    def get_tasks_by_mongodb(self, table=None, query=None, limit=None):
-        pipeline_path = "feapder.pipelines.mongo_pipeline.TaskPipeline"
-        pipeline = tools.import_cls(pipeline_path)()
-        table = table or setting.TASK_REQUEST_PRODUCE
-        queue_name = setting.TAB_ITEMS.format(
-            redis_key=self._redis_key.replace('_detailc', '')
-        )
-        conditions = query or {
-            'state': {'$in': [1, 3, 5]},
-            'queue_name': queue_name,
-            'update_at': {'$lt': tools.get_current_timestamp()}
+    def get_tasks(self, limit=None, **kwargs):
+        queue = setting.TAB_ITEMS.format(redis_key=self._redis_key.replace("_detailc", ""))
+
+        # fetch tasks
+        url = f"{setting.JY_TASK_URL}/tasks/fd?qn={queue}&limit={limit}"
+        headers = {"Authorization": self.task_api_auth_token}
+        params = dict(headers=headers, timeout=10, proxies=False)
+        response = Request(method="GET", url=url, **params).get_response()
+        ret = response.json["task"]
+        self.tasks_dict = {
+            "token": self.task_api_auth_token,
+            "data": {t["pyuuid"]: {"tid": t["tid"], "queue": queue} for t in ret}
         }
-        limit = limit or setting.COLLECTOR_TASK_COUNT
-        results = pipeline.find_items(table, conditions, limit)
-        ignore = {'_id', 'state', 'update_at', 'queue_name'}
-        task_lst = [{k: v for k, v in items.items() if k not in ignore} for items in results]
-        return task_lst
+        return ret
+
+    get_tasks_by_rabbitmq = get_tasks
+    get_tasks_by_mongodb = get_tasks
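`get_tasks` replaces the RabbitMQ and MongoDB pulls with a single call to the task API, and the old method names are kept as aliases for backward compatibility. A rough sketch of how a detail spider might turn the returned tasks into requests; only `pyuuid` and `tid` are guaranteed by the code above, so the `href` field and the `detail_get` callback are illustrative assumptions.

```python
    # Illustrative start_requests for a BaseBusinessDetailSpider subclass (not part of this diff)
    def start_requests(self):
        for task in self.get_tasks(limit=setting.COLLECTOR_TASK_COUNT):
            # "href" is an assumed field on the task document; adjust to the real schema
            yield Request(url=task["href"], item=task, callback=self.detail_get)

    def detail_get(self, request, response):
        ...  # parse the detail page and yield data items
```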
 
 
 class BiddingListSpider(BaseBusinessListSpider):