|
@@ -24,7 +24,7 @@ CONSOLE_PIPELINE_PATH = "feapder.pipelines.console_pipeline.ConsolePipeline"
|
|
|
|
|
|
class Spider(
|
|
|
BaseParser, Scheduler
|
|
|
-): # threading 中有name函数, 必须先继承BaseParser 否则其内部的name会被Schedule的基类threading.Thread的name覆盖
|
|
|
+): # threading 中有 name 函数, 必须先继承 BaseParser, 否则其内部的 name 会被 Scheduler 基类 threading.Thread 的 name 覆盖
|
|
|
"""
|
|
|
@summary: 为了简化搭建爬虫
|
|
|
---------
|
|
@@ -33,32 +33,24 @@ class Spider(
|
|
|
def __init__(
|
|
|
self,
|
|
|
redis_key=None,
|
|
|
- min_task_count=1,
|
|
|
check_task_interval=5,
|
|
|
thread_count=None,
|
|
|
begin_callback=None,
|
|
|
end_callback=None,
|
|
|
- delete_keys=(),
|
|
|
keep_alive=None,
|
|
|
auto_start_requests=None,
|
|
|
- batch_interval=0,
|
|
|
- wait_lock=True,
|
|
|
**kwargs
|
|
|
):
|
|
|
"""
|
|
|
@summary: 爬虫
|
|
|
---------
|
|
|
@param redis_key: 任务等数据存放在redis中的key前缀
|
|
|
- @param min_task_count: 任务队列中最少任务数, 少于这个数量才会添加任务,默认1。start_monitor_task 模式下生效
|
|
|
@param check_task_interval: 检查是否还有任务的时间间隔;默认5秒
|
|
|
@param thread_count: 线程数,默认为配置文件中的线程数
|
|
|
@param begin_callback: 爬虫开始回调函数
|
|
|
@param end_callback: 爬虫结束回调函数
|
|
|
- @param delete_keys: 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则; 常用于清空任务队列,否则重启时会断点续爬
|
|
|
@param keep_alive: 爬虫是否常驻
|
|
|
@param auto_start_requests: 爬虫是否自动添加任务
|
|
|
- @param batch_interval: 抓取时间间隔 默认为0 天为单位 多次启动时,只有当前时间与第一次抓取结束的时间间隔大于指定的时间间隔时,爬虫才启动
|
|
|
- @param wait_lock: 下发任务时否等待锁,若不等待锁,可能会存在多进程同时在下发一样的任务,因此分布式环境下请将该值设置True
|
|
|
---------
|
|
|
@result:
|
|
|
"""
|
|
@@ -67,17 +59,12 @@ class Spider(
|
|
|
thread_count=thread_count,
|
|
|
begin_callback=begin_callback,
|
|
|
end_callback=end_callback,
|
|
|
- delete_keys=delete_keys,
|
|
|
keep_alive=keep_alive,
|
|
|
auto_start_requests=auto_start_requests,
|
|
|
- batch_interval=batch_interval,
|
|
|
- wait_lock=wait_lock,
|
|
|
**kwargs
|
|
|
)
|
|
|
|
|
|
- self._min_task_count = min_task_count
|
|
|
self._check_task_interval = check_task_interval
|
|
|
-
|
|
|
self._is_distributed_task = False
|
|
|
self._is_show_not_task = False
|
|
|
|
|
@@ -308,7 +295,7 @@ class BusinessBaseDetailSpider(Spider):
|
|
|
ITEM_FILTER_ENABLE=False
|
|
|
)
|
|
|
|
|
|
- err_coll_name = "listdata_err"
|
|
|
+ err_coll_name = "listdata_err" # 详情采集失败时存放的详情任务数据的表
|
|
|
_to_db = None
|
|
|
|
|
|
def __init__(
|
|
@@ -320,8 +307,6 @@ class BusinessBaseDetailSpider(Spider):
|
|
|
delete_keys=(),
|
|
|
keep_alive=None,
|
|
|
auto_start_requests=None,
|
|
|
- batch_interval=0,
|
|
|
- wait_lock=True,
|
|
|
**kwargs
|
|
|
):
|
|
|
self.__class__.__custom_setting__.update(
|
|
@@ -335,8 +320,6 @@ class BusinessBaseDetailSpider(Spider):
|
|
|
delete_keys=delete_keys,
|
|
|
keep_alive=keep_alive,
|
|
|
auto_start_requests=auto_start_requests,
|
|
|
- batch_interval=batch_interval,
|
|
|
- wait_lock=wait_lock,
|
|
|
**kwargs
|
|
|
)
|
|
|
|