@@ -14,10 +14,14 @@ from pymongo import MongoClient
 from log import logger
 
-# mongo
-MONGO_HOST = "172.17.4.87"
-MONGO_PORT = 27080
+MONGO_HOST = "127.0.0.1"
+MONGO_PORT = 27001
 MONGO_DB = "py_spider"
+
+# mongo
+# MONGO_HOST = "172.17.4.87"
+# MONGO_PORT = 27080
+# MONGO_DB = "py_spider"
 client = MongoClient(MONGO_HOST, MONGO_PORT)
 mongodb = client[MONGO_DB]
 
 
@@ -34,13 +38,16 @@ py_spiders_crawl_list = mongodb["crawl_data"]
 ybw_list = mongodb["ybw_list"]
 zbytb_list = mongodb["zbytb_list"]
 
+# topic crawler
+zgzb_list = mongodb["zgzb_list"]
+
 # summary table of list pages
-summary_table_of_list_pages = mongodb["list"]
+# summary_table_of_list_pages = mongodb["list"]
+summary_table_of_list_pages = mongodb["123qqq"]
 
 
 def save(documents, collection):
     """Save data"""
-
     is_list = isinstance(documents, list)
     documents = documents if is_list else [documents]
 
@@ -56,8 +63,10 @@ def save(documents, collection):
             logger.info(f"[Summary]{collection.name}-批量保存{count}条数据--已完成")
 
     # commit the remaining documents
-    collection.insert_many(data_lst)
+    if len(data_lst) > 0:
+        collection.insert_many(data_lst)
     logger.info(f"[Summary]{collection.name}-批量保存{count}条数据--已完成")
+    return count
 
 
 def summary_data(document, runtime, only_count_list_page=False):
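Note: pymongo's Collection.insert_many() raises InvalidOperation when handed an empty list, which is why the final flush in save() above is wrapped in a length check before insert_many is called, and why the total is now returned. A minimal standalone sketch of this guarded batch-save pattern (hypothetical helper, not the exact save() body from this file):

def save_batch(documents, collection, batch_size=100):
    """Insert documents in batches of batch_size, flushing the remainder only if non-empty."""
    buffer, count = [], 0
    for doc in documents:
        buffer.append(doc)
        count += 1
        if len(buffer) == batch_size:
            collection.insert_many(buffer)  # full batch
            buffer.clear()
    if buffer:  # insert_many([]) would raise InvalidOperation
        collection.insert_many(buffer)
    return count  # mirrors the new return value of save()
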
@@ -239,7 +248,49 @@ def competing_products_crawl_aggregate_of_list_pages(datestr=None):
     competing_products_crawl_aggregate(zbytb_list, datestr)
 
 
+def zgzb_crawl_aggregate_of_list_pages(datestr=None):
+    if datestr is not None:
+        today = datetime.fromisoformat(datestr).date()
+    else:
+        today = datetime.now().date()
+    yesterday = today + timedelta(days=-1)
+
+    runtime = yesterday.strftime("%Y-%m-%d")
+    start_time = int(datetime.combine(yesterday, time()).timestamp())
+    end_time = int(datetime.combine(today, time()).timestamp())
+
+    pipeline = [
+        {"$match": {"comeintime": {"$gte": start_time, "$lt": end_time}}},
+        {
+            "$group": {
+                "_id": "$spidercode",
+                "count": {"$sum": 1},  # total collected for the day
+                "rel_count": {"$sum": 1},  # total collected for the day
+                "spider_item": {
+                    "$addToSet": {
+                        "site": "$site",
+                        "channel": "$channel",
+                        "spidercode": "$spidercode",
+                        "business_type": "List"
+                    }
+                }
+            }
+        },
+    ]
+    cursor = zgzb_list.aggregate(pipeline, allowDiskUse=True)
+    try:
+        results = []
+        for doc in cursor:
+            results.extend(summary_data(doc, runtime))
+        save(results, summary_table_of_list_pages)
+    finally:
+        client.close()
+        logger.info("[Summary]zgzb_list数据汇总结束")
+
+
 if __name__ == '__main__':
     feapder_crawl_aggregate_of_list_pages()
     py_spiders_crawl_aggregate_of_list_pages()
     competing_products_crawl_aggregate_of_list_pages()
+    zgzb_crawl_aggregate_of_list_pages()
+
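For context, zgzb_crawl_aggregate_of_list_pages() above groups yesterday's zgzb_list documents by spidercode over a [yesterday 00:00, today 00:00) window of comeintime Unix timestamps, then passes each grouped document to summary_data() and saves the results. A rough illustration of the window arithmetic and of the shape a $group result takes (values are made up; summary_data() is defined elsewhere in this file and not shown here):

from datetime import datetime, time, timedelta

# Window: yesterday 00:00 (inclusive) to today 00:00 (exclusive), as local Unix timestamps.
today = datetime.fromisoformat("2024-01-16").date()                 # or datetime.now().date()
yesterday = today - timedelta(days=1)
start_time = int(datetime.combine(yesterday, time()).timestamp())   # 2024-01-15 00:00:00
end_time = int(datetime.combine(today, time()).timestamp())         # 2024-01-16 00:00:00

# Illustrative shape of one document emitted by the $group stage and fed to summary_data(doc, runtime):
grouped_doc = {
    "_id": "a_spidercode",   # grouped by spidercode
    "count": 120,            # documents matched in the window
    "rel_count": 120,        # same $sum expression, kept as a separate field
    "spider_item": [         # $addToSet: one entry per distinct site/channel combination
        {
            "site": "example-site",
            "channel": "example-channel",
            "spidercode": "a_spidercode",
            "business_type": "List",
        },
    ],
}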