|
@@ -30,6 +30,10 @@ spider_heartbeat = mongodb["spider_heartbeat"]
|
|
|
# py_spiders列表
|
|
|
py_spiders_crawl_list = mongodb["crawl_data"]
|
|
|
|
|
|
+# 竞品列表
|
|
|
+ybw_list = mongodb["ybw_list"]
|
|
|
+zbytb_list = mongodb["zbytb_list"]
|
|
|
+
|
|
|
# 列表页汇总表
|
|
|
summary_table_of_list_pages = mongodb["list"]
|
|
|
|
|
@@ -88,7 +92,6 @@ def summary_data(document, runtime, only_count_list_page=False):
|
|
|
|
|
|
def feapder_crawl_aggregate_of_list_pages(datestr=None):
|
|
|
"""feapder采集列表页数据汇总(前一天的数据)"""
|
|
|
-
|
|
|
if datestr is None:
|
|
|
today = datetime.now().date()
|
|
|
yesterday = today + timedelta(days=-1)
|
|
@@ -180,8 +183,63 @@ def py_spiders_crawl_aggregate_of_list_pages(datestr=None):
|
|
|
logger.info("[Summary]py_spiders数据汇总结束")
|
|
|
|
|
|
|
|
|
def competing_products_crawl_aggregate(collection, datestr=None):
    """Aggregate one competing-product collection's list-page crawl data.

    Groups yesterday's documents (relative to *datestr*, or to the current
    date when omitted) by channel and appends the summarised rows to the
    shared list-page summary table.

    Args:
        collection: MongoDB collection to aggregate (e.g. ``ybw_list``).
        datestr: optional ISO date string ``"YYYY-MM-DD"``; defaults to now.
    """
    if datestr is not None:
        today = datetime.fromisoformat(datestr).date()
    else:
        today = datetime.now().date()
    yesterday = today - timedelta(days=1)

    publish_time = yesterday.strftime("%Y-%m-%d")
    table_name = collection.name
    pipeline = [
        # Filter first so $addFields only runs on yesterday's documents
        # (placing $match before the computed stage is semantically identical
        # for the $group below and avoids per-document work on filtered rows).
        {"$match": {"publishtime": publish_time}},
        {
            "$addFields": {
                # 1 when "count" is non-zero, else 0 — summed per group below.
                # NOTE(review): the original comment claimed this tallies docs
                # whose es lookup returned 0, but the condition counts the
                # opposite (count != 0) — confirm intent against the es step.
                "rel_count": {
                    "$cond": {
                        "if": {"$ne": ["$count", 0]},
                        "then": 1,
                        "else": 0
                    }
                }
            }
        },
        {
            "$group": {
                "_id": "$channel",
                "count": {"$sum": 1},  # 当天采集总数
                "rel_count": {"$sum": "$rel_count"},
                "spider_item": {
                    "$addToSet": {
                        "site": "$site",
                        "channel": "$channel",
                        "spidercode": "$spidercode",
                        "business_type": "List"
                    }
                }
            }
        },
    ]
    cursor = collection.aggregate(pipeline, allowDiskUse=True)
    try:
        results = []
        for doc in cursor:
            results.extend(summary_data(doc, publish_time))
        save(results, summary_table_of_list_pages)
    finally:
        # BUG FIX: do not close the shared MongoClient here — this function is
        # called once per competing-product collection, and closing the client
        # after the first call made every subsequent call fail against a
        # closed connection. Release only the cursor this call opened.
        cursor.close()
    logger.info(f"[Summary]{table_name}数据汇总结束")
|
|
|
+
|
|
|
+
|
|
|
def competing_products_crawl_aggregate_of_list_pages(datestr=None):
    """Summarise list-page crawl data for every competing-product collection."""
    for coll in (ybw_list, zbytb_list):
        competing_products_crawl_aggregate(coll, datestr)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Run every list-page aggregation job for the previous day's data.
    feapder_crawl_aggregate_of_list_pages()
    py_spiders_crawl_aggregate_of_list_pages()
    competing_products_crawl_aggregate_of_list_pages()
|