@@ -25,10 +25,7 @@ mongodb = client[MONGO_DB]
 data_bak = mongodb["data_bak"]

 # heartbeat collection
-spider_heartbeat = mongodb["spider_heartbeat"]
-
-# py_spiders list
-py_spiders_crawl_list = mongodb["crawl_data"]
+spider_heartbeat = mongodb["pyspider_heartbeat"]

 # competing products list
 ybw_list = mongodb["ybw_list"]
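
The hunk above consolidates heartbeat reporting into a single "pyspider_heartbeat" collection and drops the separate py_spiders crawl-list handle. A minimal smoke test for the rename, assuming a reachable MongoDB; the URI and database name below are placeholders, not values from this file:

    from pymongo import MongoClient

    # Placeholder connection details; substitute the real MONGO_HOST / MONGO_DB.
    client = MongoClient("mongodb://localhost:27017")
    mongodb = client["py_spiders"]

    # After this change, heartbeats are expected in "pyspider_heartbeat";
    # the old "spider_heartbeat" and "crawl_data" handles are gone.
    assert "pyspider_heartbeat" in mongodb.list_collection_names()
    print(mongodb["pyspider_heartbeat"].estimated_document_count())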
@@ -133,60 +130,6 @@ def feapder_crawl_aggregate_of_list_pages(datestr=None):
     logger.info("[Summary]feapder data summary finished")


-def py_spiders_crawl_aggregate_of_list_pages(datestr=None):
-    """Summarize py_spiders list-page crawl data (previous day's data)."""
-    if datestr is not None:
-        today = datetime.fromisoformat(datestr).date()
-    else:
-        today = datetime.now().date()
-    yesterday = today + timedelta(days=-1)
-
-    runtime = yesterday.strftime("%Y-%m-%d")
-    start_time = int(datetime.combine(yesterday, time()).timestamp())
-    end_time = int(datetime.combine(today, time()).timestamp())
-
-    pipeline = [
-        {
-            "$addFields": {
-                "rel_count": {
-                    "$cond": {
-                        "if": {"$ne": ["$finished", True]},
-                        "then": 1,
-                        "else": 0
-                    }
-                }
-            }
-        },
-        {"$match": {"comeintime": {"$gte": start_time, "$lt": end_time}}},
-        {
-            "$group": {
-                "_id": "$spidercode",
-                "count": {"$sum": 1},  # total collected for the day
-                "rel_count": {"$sum": 1},  # total collected for the day
-                # "rel_count": {"$sum": "$rel_count"},  # detail pages collected for the day (successes only)
-                "spider_item": {
-                    "$addToSet": {
-                        "site": "$site",
-                        "channel": "$channel",
-                        "spidercode": "$spidercode",
-                        "business_type": "List"
-                    }
-                }
-            }
-        },
-        {"$sort": SON([("rel_count", -1)])}
-    ]
-    cursor = py_spiders_crawl_list.aggregate(pipeline, allowDiskUse=True)
-    try:
-        results = []
-        for doc in cursor:
-            results.extend(summary_data(doc, runtime))
-        save(results, summary_table_of_list_pages)
-    finally:
-        client.close()
-    logger.info("[Summary]py_spiders data summary finished")
-
-
 def competing_products_crawl_aggregate(collection, datestr=None):
     """Competing-products crawl aggregation query."""
     if datestr is not None:
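
The deleted function rolled up the previous day's py_spiders crawl by spider code. Three details worth keeping on record: the window was [yesterday 00:00, today 00:00) in epoch seconds; the $addFields/$cond stage computed a per-document 0/1 flag that the $group stage never used, since rel_count summed a literal 1 and therefore always equaled count (the commented-out $sum: "$rel_count" line was the variant that would have used the flag); and the per-group spider_item set was built with $addToSet. A minimal runnable sketch of that daily-window rollup, with placeholder connection details and the field names comeintime, spidercode, site, and channel assumed from the removed code:

    from datetime import datetime, time, timedelta

    from bson.son import SON
    from pymongo import MongoClient

    # Placeholder connection details, not values from this repo.
    client = MongoClient("mongodb://localhost:27017")
    crawl_list = client["py_spiders"]["crawl_data"]

    today = datetime.now().date()
    yesterday = today - timedelta(days=1)
    # [yesterday 00:00, today 00:00) as epoch seconds, as in the removed code.
    start_time = int(datetime.combine(yesterday, time()).timestamp())
    end_time = int(datetime.combine(today, time()).timestamp())

    pipeline = [
        {"$match": {"comeintime": {"$gte": start_time, "$lt": end_time}}},
        {
            "$group": {
                "_id": "$spidercode",
                "count": {"$sum": 1},  # documents crawled in the window
                "spider_item": {
                    "$addToSet": {"site": "$site", "channel": "$channel"}
                },
            }
        },
        # SON keeps the sort-key order explicit, as the removed pipeline did.
        {"$sort": SON([("count", -1)])},
    ]

    for doc in crawl_list.aggregate(pipeline, allowDiskUse=True):
        print(doc["_id"], doc["count"])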
@@ -285,7 +228,5 @@ def zgzb_crawl_aggregate_of_list_pages(datestr=None):

 if __name__ == '__main__':
     feapder_crawl_aggregate_of_list_pages()
-    py_spiders_crawl_aggregate_of_list_pages()
     competing_products_crawl_aggregate_of_list_pages()
     zgzb_crawl_aggregate_of_list_pages()
-
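
Each aggregate function in the entry point takes an optional ISO date string; in the deleted py_spiders function that argument selected the reference day, and the summary covered the day before it. Assuming the remaining functions follow the same convention, a hypothetical backfill call looks like:

    # Hypothetical backfill: summarize 2023-05-01 by passing the following day.
    feapder_crawl_aggregate_of_list_pages("2023-05-02")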