|
@@ -30,6 +30,10 @@ spider_heartbeat = mongodb["spider_heartbeat"]
|
|
|
# py_spiders列表
|
|
|
py_spiders_crawl_list = mongodb["crawl_data"]
|
|
|
|
|
|
+# 竞品列表
|
|
|
+ybw_list = mongodb["ybw_list"]
|
|
|
+zbytb_list = mongodb["zbytb_list"]
|
|
|
+
|
|
|
# 列表页汇总表
|
|
|
summary_table_of_list_pages = mongodb["list"]
|
|
|
|
|
@@ -88,7 +92,6 @@ def summary_data(document, runtime, only_count_list_page=False):
|
|
|
|
|
|
def feapder_crawl_aggregate_of_list_pages(datestr=None):
|
|
|
"""feapder采集列表页数据汇总(前一天的数据)"""
|
|
|
-
|
|
|
if datestr is None:
|
|
|
today = datetime.now().date()
|
|
|
yesterday = today + timedelta(days=-1)
|
|
@@ -180,8 +183,63 @@ def py_spiders_crawl_aggregate_of_list_pages(datestr=None):
|
|
|
logger.info("[Summary]py_spiders数据汇总结束")
|
|
|
|
|
|
|
|
|
def competing_products_crawl_aggregate(collection, datestr=None):
    """Aggregate one competing-product collection's list-page crawl data.

    Groups yesterday's documents (relative to *datestr*, or to the current
    date when omitted) by channel and appends the summarised rows to the
    shared list-page summary table.

    Args:
        collection: MongoDB collection to aggregate (e.g. ``ybw_list``).
        datestr: optional ISO date string ``"YYYY-MM-DD"``; defaults to now.
    """
    if datestr is not None:
        today = datetime.fromisoformat(datestr).date()
    else:
        today = datetime.now().date()
    yesterday = today - timedelta(days=1)

    publish_time = yesterday.strftime("%Y-%m-%d")
    table_name = collection.name
    pipeline = [
        # Filter first so $addFields only runs on yesterday's documents
        # (placing $match before the computed stage is semantically identical
        # for the $group below and avoids per-document work on filtered rows).
        {"$match": {"publishtime": publish_time}},
        {
            "$addFields": {
                # 1 when "count" is non-zero, else 0 — summed per group below.
                # NOTE(review): the original comment claimed this tallies docs
                # whose es lookup returned 0, but the condition counts the
                # opposite (count != 0) — confirm intent against the es step.
                "rel_count": {
                    "$cond": {
                        "if": {"$ne": ["$count", 0]},
                        "then": 1,
                        "else": 0
                    }
                }
            }
        },
        {
            "$group": {
                "_id": "$channel",
                "count": {"$sum": 1},  # 当天采集总数
                "rel_count": {"$sum": "$rel_count"},
                "spider_item": {
                    "$addToSet": {
                        "site": "$site",
                        "channel": "$channel",
                        "spidercode": "$spidercode",
                        "business_type": "List"
                    }
                }
            }
        },
    ]
    cursor = collection.aggregate(pipeline, allowDiskUse=True)
    try:
        results = []
        for doc in cursor:
            results.extend(summary_data(doc, publish_time))
        save(results, summary_table_of_list_pages)
    finally:
        # BUG FIX: do not close the shared MongoClient here — this function is
        # called once per competing-product collection, and closing the client
        # after the first call made every subsequent call fail against a
        # closed connection. Release only the cursor this call opened.
        cursor.close()
    logger.info(f"[Summary]{table_name}数据汇总结束")
|
|
|
+
|
|
|
+
|
|
|
def competing_products_crawl_aggregate_of_list_pages(datestr=None):
    """Summarise list-page crawl data for every competing-product collection."""
    for coll in (ybw_list, zbytb_list):
        competing_products_crawl_aggregate(coll, datestr)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Run every list-page aggregation job for the previous day's data.
    feapder_crawl_aggregate_of_list_pages()
    py_spiders_crawl_aggregate_of_list_pages()
    competing_products_crawl_aggregate_of_list_pages()
|