
Update heartbeat rollup statistics metrics

dongzhaorui, 1 year ago
parent commit 32593093e8
1 file changed, 13 additions and 10 deletions

+ 13 - 10
FworkSpider/feapder/core/parser_control.py

@@ -391,7 +391,7 @@ class PaserControl(threading.Thread):
                        request._webdriver_pool.put(response.browser)

                    # publish heartbeat
-                    self.publish_heartbeat(parser, request, **counter)
+                    self.publish_heartbeat(parser, request, response, **counter)
                break

        if setting.SPIDER_SLEEP_TIME:
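
Why the extra argument: the heartbeat now records the HTTP status of the request it reports on, so the live response object has to be threaded through from the parser loop. A one-line sketch of the defensive read introduced in the second hunk (response may be None or a browser-rendered object with no status_code, hence the -1 fallback):

    status_code = getattr(response, "status_code", -1)  # -1 when no HTTP status is available
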
@@ -428,38 +428,41 @@ class PaserControl(threading.Thread):
                return True
        return False

-    def publish_heartbeat(self, parser, request, **kwargs):
+    def publish_heartbeat(self, parser, request, response, **kwargs):
        request_item = getattr(request, "item")
        business_type: str = parser.__business_type__  # spider business type
        if business_type.endswith("List"):
            site = getattr(parser, "site")
            spidercode = request_item["code"]
-            count = kwargs["extract_count"]
+            count = kwargs["extract_count"]  # number of items extracted from the list page
        else:
            site = request_item["site"]
            spidercode = request_item["spidercode"]
-            count = 0
+            count = 0  # total detail-page collection tasks; the rollup is computed in the heartbeat manager

        run_time = tools.get_current_date(date_format="%Y-%m-%d")  # run date, granularity: day
        heartbeat_item = HeartBeatItem(
-            node_ip=tools.os.environ.get("CRAWLAB_SERVER_REGISTER_IP"),  # crawlab node name
-            crawlab_taskid=tools.os.environ.get("CRAWLAB_TASK_ID"),  # crawlab platform spider task id
+            batch_no=tools.get_md5(spidercode + business_type + run_time),
+            node_ip=tools.os.environ.get("CRAWLAB_SERVER_REGISTER_IP"),  # crawlab node name
+            crawlab_taskid=tools.os.environ.get("CRAWLAB_TASK_ID"),  # id of the crawlab task running this crawl
+            filepath=str(pathlib.Path(setting.sys.argv[0])),  # file path
            site=site,
            channel=request_item["channel"],
            spidercode=spidercode,
            business_type=business_type,
-            spider_id=tools.get_md5(spidercode + business_type + run_time),
-            filepath=str(pathlib.Path(setting.sys.argv[0])),  # file path
            runtime=run_time,
+            url=request.url,
+            status_code=getattr(response, "status_code", -1),
            nowpage=kwargs["now_page"],  # current list page number
-            count=count,  # number of items extracted from the list page
+            count=count,
+            failed_retry_times=request.retry_times,  # failure retry count
            rel_count=kwargs["rel_count"],  # actual number of records stored
            failed_task_count=self._failed_task_count,
            success_task_count=self._success_task_count,
            create_at=tools.ensure_int64(tools.get_current_timestamp()),  # creation time, unit: seconds
            expire_at=tools.get_utcnow(),  # UTC time for scheduled deletion (after 5 days)
        )
-        # total collection tasks (total requests issued during this crawler run): failed_task_count + success_task_count
+        # total collection tasks (total requests the spider issued this run): failed_task_count + success_task_count
        heartbeat_item.table_name = setting.SPIDER_HEARTBEAT_RECORD  # set the table name
        return self._heartbeat_buffer.put_item(heartbeat_item)
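
For reference outside the diff, below is a minimal, self-contained sketch of what the updated publish_heartbeat now assembles. build_heartbeat and its arguments are simplified stand-ins, and a plain dict replaces HeartBeatItem so the sketch stays dependency-free; only the field set and the batch_no/status_code logic mirror this commit. Note that batch_no reuses the exact md5 recipe of the removed spider_id field (spidercode + business_type + run date), so the daily grouping key is renamed, not changed.

    import hashlib
    import time

    def get_md5(text):
        # stand-in for tools.get_md5
        return hashlib.md5(text.encode("utf-8")).hexdigest()

    def build_heartbeat(parser, request, response, **counter):
        """Sketch of the heartbeat assembly after this commit (simplified)."""
        item = request.item
        business_type = parser.__business_type__  # e.g. ends with "List" for list spiders
        if business_type.endswith("List"):
            site, spidercode = parser.site, item["code"]
            count = counter["extract_count"]  # items extracted from the list page
        else:
            site, spidercode = item["site"], item["spidercode"]
            count = 0  # detail totals are rolled up by the heartbeat manager
        run_time = time.strftime("%Y-%m-%d")  # day-granularity run date
        return {
            # one batch per spider + business type + day: heartbeats from the
            # same daily run aggregate under the same key
            "batch_no": get_md5(spidercode + business_type + run_time),
            "site": site,
            "spidercode": spidercode,
            "business_type": business_type,
            "runtime": run_time,
            "url": request.url,
            # -1 when the request produced no usable HTTP response
            "status_code": getattr(response, "status_code", -1),
            "nowpage": counter["now_page"],  # current list page number
            "count": count,
            "failed_retry_times": request.retry_times,
            "rel_count": counter["rel_count"],  # records actually stored
        }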