@@ -391,7 +391,7 @@ class PaserControl(threading.Thread):
                 request._webdriver_pool.put(response.browser)

             # publish the heartbeat
-            self.publish_heartbeat(parser, request, **counter)
+            self.publish_heartbeat(parser, request, response, **counter)
             break

         if setting.SPIDER_SLEEP_TIME:
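The call site now threads the `response` through to `publish_heartbeat`, so the heartbeat can report the HTTP status of the request it describes. For reference, a minimal sketch of the `counter` dict that gets unpacked into the call; the key names come from the `kwargs` lookups in the second hunk, the values are made up:

counter = {
    "now_page": 1,        # current list-page number
    "extract_count": 20,  # list items extracted from this page
    "rel_count": 18,      # items actually written to storage
}
self.publish_heartbeat(parser, request, response, **counter)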
@@ -428,38 +428,41 @@ class PaserControl(threading.Thread):
             return True
         return False

-    def publish_heartbeat(self, parser, request, **kwargs):
+    def publish_heartbeat(self, parser, request, response, **kwargs):
         request_item = getattr(request, "item")
         business_type: str = parser.__business_type__  # spider business type
         if business_type.endswith("List"):
             site = getattr(parser, "site")
             spidercode = request_item["code"]
-            count = kwargs["extract_count"]
+            count = kwargs["extract_count"]  # number of list items extracted
         else:
             site = request_item["site"]
             spidercode = request_item["spidercode"]
-            count = 0
+            count = 0  # total detail-page tasks; the aggregate is computed in the heartbeat manager

         run_time = tools.get_current_date(date_format="%Y-%m-%d")  # run date, unit: days
         heartbeat_item = HeartBeatItem(
-            node_ip=tools.os.environ.get("CRAWLAB_SERVER_REGISTER_IP"),  # crawlab node name
-            crawlab_taskid=tools.os.environ.get("CRAWLAB_TASK_ID"),  # the spider's task id on the crawlab platform
+            batch_no=tools.get_md5(spidercode + business_type + run_time),
+            node_ip=tools.os.environ.get("CRAWLAB_SERVER_REGISTER_IP"),  # crawlab node name
+            crawlab_taskid=tools.os.environ.get("CRAWLAB_TASK_ID"),  # id of the crawl task executed by crawlab
+            filepath=str(pathlib.Path(setting.sys.argv[0])),  # script file path
             site=site,
             channel=request_item["channel"],
             spidercode=spidercode,
             business_type=business_type,
-            spider_id=tools.get_md5(spidercode + business_type + run_time),
-            filepath=str(pathlib.Path(setting.sys.argv[0])),  # script file path
             runtime=run_time,
+            url=request.url,
+            status_code=getattr(response, "status_code", -1),
             nowpage=kwargs["now_page"],  # current list-page number
-            count=count,  # number of list items extracted
+            count=count,
+            failed_retry_times=request.retry_times,  # failure retry count
             rel_count=kwargs["rel_count"],  # total actually written to storage
             failed_task_count=self._failed_task_count,
             success_task_count=self._success_task_count,
             create_at=tools.ensure_int64(tools.get_current_timestamp()),  # creation time, unit: seconds
             expire_at=tools.get_utcnow(),  # UTC time, for periodic deletion (5 days)
         )
-        # total crawl tasks (total requests issued by this spider run): failed_task_count + success_task_count
+        # total crawl tasks (total requests issued by this run of the spider): failed_task_count + success_task_count
         heartbeat_item.table_name = setting.SPIDER_HEARTBEAT_RECORD  # set the table name
         return self._heartbeat_buffer.put_item(heartbeat_item)

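The new `batch_no` field replaces the removed `spider_id` but keeps the same derivation: an MD5 over the spider code, the business type, and the run date. Every heartbeat one spider publishes on a given day therefore carries the same batch number, which is what lets the heartbeat manager aggregate the detail-page totals hinted at by the `count = 0` comment. A minimal sketch of that property, assuming `tools.get_md5` is a plain MD5 hexdigest (the spider names here are invented):

import hashlib

def get_md5(text: str) -> str:
    # stand-in for tools.get_md5, assumed to be a plain MD5 hexdigest
    return hashlib.md5(text.encode("utf-8")).hexdigest()

spidercode, business_type = "demo_spider", "DemoList"
run_time = "2024-01-01"  # tools.get_current_date(date_format="%Y-%m-%d")

# All heartbeats from this spider on this date share one batch number;
# a new day, spider code, or business type starts a new batch.
batch_no = get_md5(spidercode + business_type + run_time)
print(batch_no)

Note also the defensive `status_code=getattr(response, "status_code", -1)`: when a task fails before any response exists and `response` is `None`, the heartbeat records `-1` instead of raising an AttributeError.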