@@ -2,13 +2,12 @@
 """
 Created on 2023-04-05
 ---------
-@summary: Spider run monitoring (feapder + py_spiders)
+@summary: Spider run monitoring (feapder) and daily crawl statistics
 ---------
 @author: Dzr
 """
 import hashlib
 from datetime import datetime, timedelta
-from operator import itemgetter

 from bson.int64 import Int64
 from bson.son import SON
@@ -152,6 +151,7 @@ def aggregate_query_count(runtime):
     @param runtime: run time
     @return:
     """
+    spider_dict = {}  # audit record of spidercode-to-site pairings
     aggregate_items = {}
     pipeline = [
         {"$match": {"runtime": runtime}},
@@ -160,97 +160,69 @@ def aggregate_query_count(runtime):
                 "_id": "$batch_no",
                 "rel_count": {"$sum": "$rel_count"},  # number saved to DB (deduplicated)
                 "count": {"$sum": "$count"},  # download count
-                "spider_item": {
-                    "$addToSet": {
-                        "site": "$site",
-                        "channel": "$channel",
-                        "spidercode": "$spidercode",
-                        "business_type": "$business_type"
-                    }
-                }
+                "site": {"$first": "$site"},
+                "channel": {"$first": "$channel"},
+                "spidercode": {"$first": "$spidercode"},
+                "business_type": {"$first": "$business_type"},
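+                # NOTE (assumption): $first keeps a single value per batch_no, which relies on
+                # every heartbeat in a batch sharing one site/channel/spidercode; the removed
+                # $addToSet/spider_item handling did not make that assumption.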
             }
         },
         {"$sort": SON([("rel_count", -1)])}
     ]
     results = aggregate_query(spider_heartbeat, pipeline)
-    for doc in results:
-        spider_dict = {}
-        spider_item = doc["spider_item"]
-        for items in spider_item:
-            site = items["site"]
-            channel = items["channel"]
-            spidercode = items["spidercode"]
-            business_type = items["business_type"]
-
-            if len(spider_item) > 1 and site not in special_sites:
-                logger.warning(f"{spidercode} -> {site} -- at risk, {len(spider_item)}")
-
-            hash_key = get_md5(**items)  # keeps sites sharing one spidercode from overlapping
-            if not hash_key:
-                logger.error(f"abnormal batch number {doc['_id']}")
-                continue
-
-            is_list = str(business_type).endswith("List")
-            if not aggregate_items.get(hash_key):
-                data = {
-                    "batch_no": doc["_id"],
-                    "site": site,
-                    "channel": channel,
-                    "spidercode": spidercode,
-                    "business_type": business_type,
-                    "runtime": runtime,
-                    "spidercode_at_site_num": 0,  # number of sites mapped to this spidercode
-                    "frame": "feapder"  # crawl framework
-                }
-                if is_list:
-                    data["list_count"] = doc["count"]
-                    data["list_rel_count"] = doc["rel_count"]
-                    data["list_runtimes"] = 1
-                    data["detail_count"] = 0
-                    data["detail_rel_count"] = 0
-                    data["detail_runtimes"] = 0
-                else:
-                    data["detail_count"] = doc["count"]
-                    data["detail_rel_count"] = doc["rel_count"]
-                    data["detail_runtimes"] = 1
-                    data["list_count"] = 0
-                    data["list_rel_count"] = 0
-                    data["list_runtimes"] = 0
-
-                aggregate_items.setdefault(hash_key, data)
+    for items in results:
+        site = items["site"]
+        channel = items["channel"]
+        spidercode = items["spidercode"]
+        business_type = items["business_type"]
+        business_type = "list" if business_type.endswith("List") else "detail"
+
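+        # NOTE (assumption): get_md5 presumably keys on site/channel/spidercode, so spiders
+        # sharing a spidercode across different sites get distinct keys; it has to produce
+        # the same key as the get_md5(site, channel, spidercode) lookup in start_monitor().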
+        hash_key = get_md5(**items)  # spider lookup key
+        if not hash_key:
+            logger.error(f"abnormal batch number {items['_id']}")
+            continue
+
+        # sort the data by spider business type
+        data_dict = {
+            "site": site,
+            "channel": channel,
+            "spidercode": spidercode,
+            "runtime": runtime,
+            "spidercode_at_site_num": 0,  # number of sites mapped to this spidercode
+            "frame": "feapder",  # crawl framework
+            business_type: {
+                "batch_no": items["_id"],
+                "count": items["count"],
+                "rel_count": items["rel_count"],
+                "runtimes": 1
+            }
+        }
+        if not aggregate_items.get(hash_key):
+            aggregate_items.setdefault(hash_key, data_dict)
+        else:
+            aggregate_items[hash_key][business_type] = data_dict[business_type]
+
+        # monitor spider tasks: spidercode_at_site_num > 1 means the spider task was
+        # created incorrectly and must be reported to the data-sourcing staff
+        if site not in special_sites:
+            spider = aggregate_items[hash_key]
+            if spidercode not in spider_dict:
+                spider["spidercode_at_site_num"] = 1
+                values = {"keys": [hash_key], "sites": [site]}
+                spider_dict.setdefault(spidercode, values)
             else:
-                data = aggregate_items.get(hash_key)
-                if is_list:
-                    data["list_count"] += doc["count"]
-                    data["list_rel_count"] += doc["rel_count"]
-                    data["list_runtimes"] += 1
-                else:
-                    data["detail_count"] += doc["count"]
-                    data["detail_rel_count"] += doc["rel_count"]
-                    data["detail_runtimes"] += 1
-
-                aggregate_items.update({hash_key: data})
-
-            # monitor spider tasks: spidercode_at_site_num > 1 means the spider task was
-            # created incorrectly and must be reported to the data-sourcing staff
-            if site not in special_sites:
-                spider = aggregate_items[hash_key]
-                if spidercode not in spider_dict:
-                    spider["spidercode_at_site_num"] = 1
-                    values = {"keys": [hash_key], "sites": [site]}
-                    spider_dict.setdefault(spidercode, values)
-                else:
-                    # same spidercode but a different site: increment the counter
-                    sites = spider_dict[spidercode]["sites"]
-                    if site not in sites:
-                        keys = spider_dict[spidercode]["keys"]
-                        for key in keys:
-                            # update spidercode_at_site_num for every spider sharing this spidercode
-                            aggregate_items[key]["spidercode_at_site_num"] += 1
-
-                        keys.append(hash_key)  # record the new spider
-                        sites.append(site)  # add the new site
-                        spider["spidercode_at_site_num"] = len(site)
+                # same spidercode but a different site: increment the counter
+                sites = spider_dict[spidercode]["sites"]
+                if site not in sites:
+                    keys = spider_dict[spidercode]["keys"]
+                    for key in keys:
+                        # update spidercode_at_site_num for every spider sharing this spidercode
+                        aggregate_items[key]["spidercode_at_site_num"] += 1
+
+                    keys.append(hash_key)  # record the new spider
+                    sites.append(site)  # add the new site
+                    spider["spidercode_at_site_num"] = len(sites)
+        else:
+            aggregate_items[hash_key]["spidercode_at_site_num"] = 1

     return aggregate_items

@@ -297,15 +269,17 @@ def aggregate_query_crawl_list(runtime):
     return aggregate_items


-def aggregate_query_crawlab_information(runtime):
-    """Aggregated query of crawlab run info for feapder spider crawls (statistics)"""
-    aggregate_items = {}
+def aggregate_count_crawlab_update_runtimes(runtime):
+    """Aggregated query of crawlab run info for feapder spider crawls"""
     pipeline = [
         {
             "$project": {
                 "_id": 0,
+                "site": 1,
+                "channel": 1,
                 "batch_no": 1,
                 "spidercode": 1,
+                "business_type": 1,
                 "runtime": 1,
                 "node_ip": 1,
                 "crawlab_taskid": 1,
@@ -316,46 +290,43 @@ def aggregate_query_crawlab_information(runtime):
         {
             "$group": {
                 "_id": "$batch_no",
+                "business_type": {"$first": "$business_type"},
+                "site": {"$first": "$site"},
+                "channel": {"$first": "$channel"},
+                "spidercode": {"$first": "$spidercode"},
                 "crawlab_item": {
                     "$addToSet": {
-                        "spidercode": "$spidercode",
-                        "create_at": "$create_at",
                         "node_ip": "$node_ip",
-                        "crawlab_taskid": "$crawlab_taskid"
+                        "crawlab_taskid": "$crawlab_taskid",
                     },
                 }
             }
         }
     ]
     results = aggregate_query(spider_heartbeat, pipeline)
-    for doc in results:
-        crawlab_item = sorted(doc["crawlab_item"], key=itemgetter("create_at"), reverse=True)
-        items: dict = crawlab_item[0]
-        spidercode = items.pop("spidercode")
-        if not aggregate_items.get(spidercode):
-            aggregate_items.setdefault(spidercode, items)
+    for items in results:
+        runtimes = len(items["crawlab_item"])  # number of times the crawl task ran

-    return aggregate_items
+        # sort the data by spider business type
+        business_type = items["business_type"]
+        business_type = "list" if business_type.endswith("List") else "detail"
+
+        hash_key = get_md5(**items)  # spider lookup key
+        # update the run count in the aggregated statistics
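+        # NOTE (assumption): hash_key and this business_type bucket are expected to already
+        # exist in aggregate_count_items (filled by aggregate_query_count from the same
+        # heartbeats); a batch seen only here would raise a KeyError.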
+        aggregate_count_items[hash_key][business_type]["runtimes"] = runtimes


 _runtime = get_runtime()  # statistics time
 aggregate_count_items = aggregate_query_count(_runtime)
 aggregate_crawl_list_items = aggregate_query_crawl_list(_runtime)
-aggregate_query_crawlab_items = aggregate_query_crawlab_information(_runtime)
-
-
-def get_node_and_taskid(spidercode, default=None):
-    """Get the latest spider worker node and task id"""
-    if aggregate_query_crawlab_items.get(spidercode):
-        default = aggregate_query_crawlab_items[spidercode]
-    return default
+aggregate_count_crawlab_update_runtimes(_runtime)


 def get_list_isgetdata(hash_key, default=0):
     """Whether the list page collected any data"""
     query_result = aggregate_count_items.get(hash_key)
     if query_result:
-        default = query_result["list_count"]
+        default = query_result["list"]["count"]
     return True if default > 0 else False


@@ -368,8 +339,25 @@ def get_list_allintimes(hash_key, default=0):

 def get_list_runtimes(hash_key, default=0):
     """List crawl run frequency"""
-    if aggregate_count_items.get(hash_key):
-        default = aggregate_count_items[hash_key]["list_runtimes"]
+    query_result = aggregate_count_items.get(hash_key)
+    if query_result and "list" in query_result:
+        default = aggregate_count_items[hash_key]["list"]["runtimes"]
+    return default
+
+
+def get_list_count(hash_key, default=0):
+    """Total number collected from list pages"""
+    query_result = aggregate_count_items.get(hash_key)
+    if query_result and "list" in query_result:
+        default = aggregate_count_items[hash_key]["list"]["count"]
+    return default
+
+
+def get_list_rel_count(hash_key, default=0):
+    """Total number of list items actually saved to the DB"""
+    query_result = aggregate_count_items.get(hash_key)
+    if query_result and "list" in query_result:
+        default = aggregate_count_items[hash_key]["list"]["rel_count"]
     return default


@@ -382,18 +370,26 @@ def get_list_nodatatimes(hash_key, default=-1):

 def get_detail_downloadnum(hash_key, default=0):
     """Detail page download count"""
-    if aggregate_count_items.get(hash_key):
-        default = aggregate_count_items[hash_key]["detail_count"]
+    query_result = aggregate_count_items.get(hash_key)
+    if query_result and "detail" in query_result:
+        default = aggregate_count_items[hash_key]["detail"]["count"]
     return default


+get_detail_count = get_detail_downloadnum
+
+
 def get_detail_downloadsuccessnum(hash_key, default=0):
     """Detail page successful download count"""
-    if aggregate_count_items.get(hash_key):
-        default = aggregate_count_items[hash_key]["detail_rel_count"]
+    query_result = aggregate_count_items.get(hash_key)
+    if query_result and "detail" in query_result:
+        default = aggregate_count_items[hash_key]["detail"]["rel_count"]
     return default


+get_detail_rel_count = get_detail_downloadsuccessnum
+
+
 def get_detail_downloadfailnum(**kwargs):
     """Detail page failed download count"""
     count = -1
@@ -414,32 +410,27 @@ def start_monitor():
         hash_key = get_md5(site, channel, spidercode)
         query_result = aggregate_count_items.get(hash_key)
         if query_result:
-            crawlab = get_node_and_taskid(spidercode)  # crawlab run details
-            if crawlab:
-                join_data["py_taskid"] = crawlab["crawlab_taskid"] or ""
-                join_data["py_nodename"] = crawlab["node_ip"] or ""
-
-            # aggregate query - detailed crawl statistics (from crawl heartbeats)
+            # aggregate query - crawl statistics
             result = query_result
-            join_data["frame"] = result["frame"]  # crawl framework
-            join_data["batch_no"] = result["batch_no"]
-            join_data["business_type"] = result["business_type"]
-            join_data["list_count"] = result["list_count"]
-            join_data["list_rel_count"] = result["list_rel_count"]
-            join_data["detail_count"] = result["detail_count"]
-            join_data["detail_rel_count"] = result["detail_rel_count"]
+            join_data["frame"] = result["frame"]  # crawl framework name
             join_data["spidercode_at_site_num"] = result["spidercode_at_site_num"]

-            # crawl summary - list data
+            # aggregated statistics - list crawl data
             join_data["list_isgetdata"] = get_list_isgetdata(hash_key)
             join_data["list_allintimes"] = get_list_allintimes(hash_key)
             join_data["list_runtimes"] = get_list_runtimes(hash_key)
             join_data["list_nodatatimes"] = get_list_nodatatimes(hash_key)
-            # crawl summary - detail data
+            join_data["list_count"] = get_list_count(hash_key)
+            join_data["list_rel_count"] = get_list_rel_count(hash_key)
+
+            # aggregated statistics - detail crawl data
+            join_data["detail_count"] = get_detail_count(hash_key)
+            join_data["detail_rel_count"] = get_detail_rel_count(hash_key)
             join_data["detail_downloadnum"] = get_detail_downloadnum(hash_key)
             join_data["detail_downloadsuccessnum"] = get_detail_downloadsuccessnum(hash_key)
             join_data["detail_downloadfailnum"] = get_detail_downloadfailnum(**join_data)
-        # whether monitoring is valid
+
+        # whether the daily crawl statistics are valid
         frame = join_data.get("frame")
         if frame and frame == "feapder":
             join_data["is_valid"] = True
@@ -448,7 +439,7 @@ def start_monitor():
         logger.info(f"{site} {channel} {spidercode} -- statistics done")
     # upload to the database
     save(summary_queue, spider_monitor)
-    logger.info(f"Spider monitoring - statistics complete")
+    logger.info(f"Spider monitoring - daily statistics complete")


 if __name__ == '__main__':