Ver Fonte

update:添加中国招标投标公共服务平台数据汇总

dongzhaorui há 2 anos
pai
commit
5e2943af9c
1 ficheiro alterado com 57 adições e 6 exclusões
  1. 57 6
      A数据处理/sync_data/summary.py

+ 57 - 6
A数据处理/sync_data/summary_data.py → A数据处理/sync_data/summary.py

@@ -14,10 +14,14 @@ from pymongo import MongoClient
 
 from log import logger
 
-# mongo
-MONGO_HOST = "172.17.4.87"
-MONGO_PORT = 27080
+MONGO_HOST = "127.0.0.1"
+MONGO_PORT = 27001
 MONGO_DB = "py_spider"
+
+# mongo
+# MONGO_HOST = "172.17.4.87"
+# MONGO_PORT = 27080
+# MONGO_DB = "py_spider"
 client = MongoClient(MONGO_HOST, MONGO_PORT)
 mongodb = client[MONGO_DB]
 
@@ -34,13 +38,16 @@ py_spiders_crawl_list = mongodb["crawl_data"]
 ybw_list = mongodb["ybw_list"]
 zbytb_list = mongodb["zbytb_list"]
 
+# 主题爬虫
+zgzb_list = mongodb["zgzb_list"]
+
 # 列表页汇总表
-summary_table_of_list_pages = mongodb["list"]
+# summary_table_of_list_pages = mongodb["list"]
+summary_table_of_list_pages = mongodb["123qqq"]
 
 
 def save(documents, collection):
     """保存数据"""
-
     is_list = isinstance(documents, list)
     documents = documents if is_list else [documents]
 
@@ -56,8 +63,10 @@ def save(documents, collection):
             logger.info(f"[Summary]{collection.name}-批量保存{count}条数据--已完成")
 
     # 提交剩余数据
-    collection.insert_many(data_lst)
+    if len(data_lst) > 0:
+        collection.insert_many(data_lst)
     logger.info(f"[Summary]{collection.name}-批量保存{count}条数据--已完成")
+    return count
 
 
 def summary_data(document, runtime, only_count_list_page=False):
@@ -239,7 +248,49 @@ def competing_products_crawl_aggregate_of_list_pages(datestr=None):
     competing_products_crawl_aggregate(zbytb_list, datestr)
 
 
+def zgzb_crawl_aggregate_of_list_pages(datestr=None):
+    if datestr is not None:
+        today = datetime.fromisoformat(datestr).date()
+    else:
+        today = datetime.now().date()
+    yesterday = today + timedelta(days=-1)
+
+    runtime = yesterday.strftime("%Y-%m-%d")
+    start_time = int(datetime.combine(yesterday, time()).timestamp())
+    end_time = int(datetime.combine(today, time()).timestamp())
+
+    pipeline = [
+        {"$match": {"comeintime": {"$gte": start_time, "$lt": end_time}}},
+        {
+            "$group": {
+                "_id": "$spidercode",
+                "count": {"$sum": 1},  # 当天采集总数
+                "rel_count": {"$sum": 1},  # 当天采集总数
+                "spider_item": {
+                    "$addToSet": {
+                        "site": "$site",
+                        "channel": "$channel",
+                        "spidercode": "$spidercode",
+                        "business_type": "List"
+                    }
+                }
+            }
+        },
+    ]
+    cursor = zgzb_list.aggregate(pipeline, allowDiskUse=True)
+    try:
+        results = []
+        for doc in cursor:
+            results.extend(summary_data(doc, runtime))
+        save(results, summary_table_of_list_pages)
+    finally:
+        client.close()
+        logger.info("[Summary]zgzb_list数据汇总结束")
+
+
 if __name__ == '__main__':
     feapder_crawl_aggregate_of_list_pages()
     py_spiders_crawl_aggregate_of_list_pages()
     competing_products_crawl_aggregate_of_list_pages()
+    zgzb_crawl_aggregate_of_list_pages()
+