فهرست منبع

update:添加竞品采集列表统计

dongzhaorui 2 سال پیش
والد
کامیت
af87b0fd24
1 فایلهای تغییر یافته به همراه 61 افزوده شده و 3 حذف شده
  1. 61 3
      A数据处理/sync_data/summary_data.py

+ 61 - 3
A数据处理/sync_data/summary_data.py

@@ -30,6 +30,10 @@ spider_heartbeat = mongodb["spider_heartbeat"]
 # py_spiders list collection
 py_spiders_crawl_list = mongodb["crawl_data"]
 
+# Competing-product list collections
+ybw_list = mongodb["ybw_list"]
+zbytb_list = mongodb["zbytb_list"]
+
 # Summary table for list pages
 summary_table_of_list_pages = mongodb["list"]
 
@@ -88,7 +92,6 @@ def summary_data(document, runtime, only_count_list_page=False):
 
 def feapder_crawl_aggregate_of_list_pages(datestr=None):
     """feapder采集列表页数据汇总(前一天的数据)"""
-
     if datestr is None:
         today = datetime.now().date()
         yesterday = today + timedelta(days=-1)
@@ -180,8 +183,63 @@ def py_spiders_crawl_aggregate_of_list_pages(datestr=None):
         logger.info("[Summary]py_spiders数据汇总结束")
 
 
+def competing_products_crawl_aggregate(collection, datestr=None):
+    """Aggregate `collection` per channel for the day before `datestr` (default: today) and save the results to the list-page summary table."""
+    if datestr is not None:
+        today = datetime.fromisoformat(datestr).date()
+    else:
+        today = datetime.now().date()
+    yesterday = today + timedelta(days=-1)  # the summarized day is one day before `today`
+
+    publish_time = yesterday.strftime("%Y-%m-%d")
+    table_name = collection.name
+    pipeline = [
+        {
+            "$addFields": {  # NOTE(review): runs before $match, so it is evaluated on every document in the collection
+                "rel_count": {  # per-document flag: 1 when `count` != 0, otherwise 0
+                    "$cond": {
+                        "if": {"$ne": ["$count", 0]},
+                        "then": 1,
+                        "else": 0
+                    }
+                }
+            }
+        },
+        {"$match": {"publishtime": publish_time}},  # keep only documents whose publishtime equals the summarized day
+        {
+            "$group": {
+                "_id": "$channel",  # one output document per channel
+                "count": {"$sum": 1},  # total documents collected that day
+                "rel_count": {"$sum": "$rel_count"},  # sum of the flags above; NOTE(review): original comment said "count with 0 ES hits" but the flag is 1 when count != 0 - confirm
+                "spider_item": {
+                    "$addToSet": {  # distinct site/channel/spidercode combinations seen in the group
+                        "site": "$site",
+                        "channel": "$channel",
+                        "spidercode": "$spidercode",
+                        "business_type": "List"
+                    }
+                }
+            }
+        },
+    ]
+    cursor = collection.aggregate(pipeline, allowDiskUse=True)
+    try:
+        results = []
+        for doc in cursor:
+            results.extend(summary_data(doc, publish_time))
+        save(results, summary_table_of_list_pages)
+    finally:
+        client.close()  # NOTE(review): `client` comes from module scope outside this view; closing it here affects any later use - confirm
+        logger.info(f"[Summary]{table_name}数据汇总结束")
+
+
+def competing_products_crawl_aggregate_of_list_pages(datestr=None):
+    """Summarize list-page crawl data for the competing-product collections: ybw_list first, then zbytb_list."""
+    competing_products_crawl_aggregate(ybw_list, datestr)
+    competing_products_crawl_aggregate(zbytb_list, datestr)
+
+
 if __name__ == '__main__':  # script entry point: run each list-page summary with the default date
-    # feapder_crawl_aggregate_of_list_pages("2023-04-03")
     feapder_crawl_aggregate_of_list_pages()
-    # py_spiders_crawl_aggregate_of_list_pages("2023-04-04")
     py_spiders_crawl_aggregate_of_list_pages()
+    competing_products_crawl_aggregate_of_list_pages()  # added by this commit: competing-product lists