@@ -391,7 +391,7 @@ class PaserControl(threading.Thread):
                 request._webdriver_pool.put(response.browser)

             # publish the heartbeat
-            self.publish_heartbeat(parser, request, **counter)
+            self.publish_heartbeat(parser, request, response, **counter)
             break

         if setting.SPIDER_SLEEP_TIME:
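The call site now threads the `response` through to `publish_heartbeat`, so the heartbeat can report the HTTP status of the request it describes. For reference, a minimal sketch of the `counter` dict that gets unpacked into the call; the key names come from the `kwargs` lookups in the second hunk, the values are made up:

counter = {
    "now_page": 1,        # current list-page number
    "extract_count": 20,  # list items extracted from this page
    "rel_count": 18,      # items actually written to storage
}
self.publish_heartbeat(parser, request, response, **counter)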
@@ -428,38 +428,41 @@ class PaserControl(threading.Thread):
             return True
         return False

-    def publish_heartbeat(self, parser, request, **kwargs):
+    def publish_heartbeat(self, parser, request, response, **kwargs):
         request_item = getattr(request, "item")
         business_type: str = parser.__business_type__  # spider business type
         if business_type.endswith("List"):
             site = getattr(parser, "site")
             spidercode = request_item["code"]
-            count = kwargs["extract_count"]
+            count = kwargs["extract_count"]  # number of list items extracted
         else:
             site = request_item["site"]
             spidercode = request_item["spidercode"]
-            count = 0
+            count = 0  # total detail-page tasks; the aggregate is computed in the heartbeat manager

         run_time = tools.get_current_date(date_format="%Y-%m-%d")  # run date, unit: days
         heartbeat_item = HeartBeatItem(
-            node_ip=tools.os.environ.get("CRAWLAB_SERVER_REGISTER_IP"),  # crawlab node name
-            crawlab_taskid=tools.os.environ.get("CRAWLAB_TASK_ID"),  # the spider's task id on the crawlab platform
+            batch_no=tools.get_md5(spidercode + business_type + run_time),
+            node_ip=tools.os.environ.get("CRAWLAB_SERVER_REGISTER_IP"),  # crawlab node name
+            crawlab_taskid=tools.os.environ.get("CRAWLAB_TASK_ID"),  # id of the crawl task executed by crawlab
+            filepath=str(pathlib.Path(setting.sys.argv[0])),  # script file path
             site=site,
             channel=request_item["channel"],
             spidercode=spidercode,
             business_type=business_type,
-            spider_id=tools.get_md5(spidercode + business_type + run_time),
-            filepath=str(pathlib.Path(setting.sys.argv[0])),  # script file path
             runtime=run_time,
+            url=request.url,
+            status_code=getattr(response, "status_code", -1),
             nowpage=kwargs["now_page"],  # current list-page number
-            count=count,  # number of list items extracted
+            count=count,
+            failed_retry_times=request.retry_times,  # failure retry count
             rel_count=kwargs["rel_count"],  # total actually written to storage
             failed_task_count=self._failed_task_count,
             success_task_count=self._success_task_count,
             create_at=tools.ensure_int64(tools.get_current_timestamp()),  # creation time, unit: seconds
             expire_at=tools.get_utcnow(),  # UTC time, for periodic deletion (5 days)
         )
-        # total crawl tasks (total requests issued by this spider run): failed_task_count + success_task_count
+        # total crawl tasks (total requests issued by this run of the spider): failed_task_count + success_task_count
         heartbeat_item.table_name = setting.SPIDER_HEARTBEAT_RECORD  # set the table name
         return self._heartbeat_buffer.put_item(heartbeat_item)

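The new `batch_no` field replaces the removed `spider_id` but keeps the same derivation: an MD5 over the spider code, the business type, and the run date. Every heartbeat one spider publishes on a given day therefore carries the same batch number, which is what lets the heartbeat manager aggregate the detail-page totals hinted at by the `count = 0` comment. A minimal sketch of that property, assuming `tools.get_md5` is a plain MD5 hexdigest (the spider names here are invented):

import hashlib

def get_md5(text: str) -> str:
    # stand-in for tools.get_md5, assumed to be a plain MD5 hexdigest
    return hashlib.md5(text.encode("utf-8")).hexdigest()

spidercode, business_type = "demo_spider", "DemoList"
run_time = "2024-01-01"  # tools.get_current_date(date_format="%Y-%m-%d")

# All heartbeats from this spider on this date share one batch number;
# a new day, spider code, or business type starts a new batch.
batch_no = get_md5(spidercode + business_type + run_time)
print(batch_no)

Note also the defensive `status_code=getattr(response, "status_code", -1)`: when a task fails before any response exists and `response` is `None`, the heartbeat records `-1` instead of raising an AttributeError.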