dongzhaorui 2 years ago
parent
commit
747d6985a2

+ 3 - 0
A数据处理/sync_data/README.md

@@ -0,0 +1,3 @@
+#### Data synchronization
+    Collected data is first pushed to RedisDB for temporary storage; a sync service then pushes it from RedisDB to the spider MongoDB production database at 5-minute intervals.
+    The data sync script is invoked by crontab.
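For reference, a minimal crontab entry matching the schedule described above could look like the line below; the interpreter and script paths are assumptions, not taken from this repository:

```
# hypothetical entry: run the sync script every 5 minutes
*/5 * * * * /usr/bin/python3 /path/to/sync_data/send_data.py >> /path/to/sync_data/logs/cron.log 2>&1
```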

+ 14 - 0
A数据处理/sync_data/log.py

@@ -0,0 +1,14 @@
+from pathlib import Path
+
+from loguru import logger
+
+_absolute = Path(__file__).absolute().parent
+_log_path = (_absolute / 'logs/sync_{time:YYYY-MM-DD}.log').resolve()
+logger.add(
+    _log_path,
+    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
+    level='INFO',
+    rotation='00:00',
+    retention='1 week',
+    encoding='utf-8',
+)
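A minimal sketch of how the other scripts in this commit consume the module (the log message itself is illustrative):

```python
from log import logger  # file sink: logs/sync_YYYY-MM-DD.log, rotated at midnight, kept for 1 week

logger.info("sync started")  # written to the file sink (and to loguru's default stderr sink)
```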

+ 332 - 0
A数据处理/sync_data/monitor_summary.py

@@ -0,0 +1,332 @@
+import hashlib
+from datetime import datetime, time, timedelta
+
+from bson.int64 import Int64
+from bson.son import SON
+from pymongo import MongoClient
+
+from log import logger
+
+# mongo
+# MONGO_HOST = "172.17.4.87"
+# MONGO_PORT = 27080
+
+MONGO_HOST = "127.0.0.1"
+MONGO_PORT = 27001
+client = MongoClient(MONGO_HOST, MONGO_PORT)
+
+MONGO_DB1 = "py_spider"
+MONGO_DB2 = "editor"
+
+mongodb1 = client[MONGO_DB1]
+mongodb2 = client[MONGO_DB2]
+
+# spider data collection
+data_bak = mongodb1["data_bak"]
+
+# heartbeat collection
+spider_heartbeat = mongodb1["spider_heartbeat"]
+
+# py_spiders crawl list
+py_spiders_crawl_list = mongodb1["crawl_data"]
+
+# spider monitoring summary collection
+spider_monitor = mongodb1["spider_monitor"]
+
+# luaconfig collection
+spider_lua_config = mongodb2["luaconfig"]
+
+
+def get_hash_key(*args):
+    """
+    @summary: generate a unique 32-character md5 digest
+    ---------
+    @param *args: the values joined for deduplication
+    ---------
+    @result: 7c8684bcbdfcea6697650aa53d7b1405
+    """
+    join_data = "_".join(args).encode()
+    return hashlib.md5(join_data).hexdigest()
+
+
+def save(documents, collection):
+    """batch-save documents"""
+
+    is_list = isinstance(documents, list)
+    documents = documents if is_list else [documents]
+
+    count = 0
+    data_lst = []
+    for item in documents:
+        item.pop("_id", None)
+        item.pop("business_type", None)
+        item["comeintime"] = Int64(datetime.now().timestamp())
+        data_lst.append(item)
+        count += 1
+        if len(data_lst) == 100:
+            collection.insert_many(data_lst)
+            data_lst.clear()
+            logger.info(f"{collection.name} - bulk saved {count} documents - done")
+
+    # insert the remaining documents
+    if data_lst:
+        collection.insert_many(data_lst)
+    logger.info(f"{collection.name} - bulk saved {count} documents - done")
+
+
+def get_runtime(datestr=None):
+    if datestr is None:
+        today = datetime.now().date()
+        yesterday = today + timedelta(days=-1)
+        datestr = yesterday.strftime("%Y-%m-%d")
+    return datestr
+
+
+def get_crawler_basic_information():
+    """basic information of each crawler"""
+    crawler_lst = []
+    q = {"platform": "python", "state": 11}
+    projection = {"_id": 0, "site": 1, "channel": 1, "modifyuser": 1, "modifyuserid": 1, "code": 1}
+    cursor = spider_lua_config.find(q, projection=projection)
+    try:
+        for doc in cursor:
+            crawler_lst.append({
+                "site": doc["site"],
+                "channel": doc["channel"],
+                "spidercode": doc["code"],
+                "modifyid": doc["modifyuserid"],
+                "modifyuser": doc["modifyuser"],
+            })
+    finally:
+        cursor.close()
+
+    logger.info(f"daily crawl report -- {len(crawler_lst)} spiders in total")
+    yield from crawler_lst
+
+
+def get_node_and_taskid(runtime, spidercode):
+    """get the spider's most recent worker node and crawlab task id"""
+    q = {"runtime": runtime, "spidercode": spidercode}
+    projection = {"node_ip": 1, "crawlab_taskid": 1, "_id": 0}
+    sort = [("_id", -1)]
+    result = spider_heartbeat.find_one(q, projection=projection, sort=sort)
+    return result
+
+
+def aggregate_query(runtime):
+    """feapder采集聚合查询"""
+    pipeline = [
+        {"$match": {"runtime": runtime}},
+        {
+            "$group": {
+                "_id": "$spider_id",
+                "rel_count": {"$sum": "$rel_count"},  # 实际下载量
+                "count": {"$sum": "$count"},  # 下载量
+                "spider_item": {
+                    "$addToSet": {
+                        "site": "$site",
+                        "channel": "$channel",
+                        "spidercode": "$spidercode",
+                        "business_type": "$business_type"
+                    }
+                }
+            }
+        },
+        {"$sort": SON([("rel_count", -1)])}
+    ]
+
+    aggregate_items = {}
+    website_lst = []
+
+    cursor = spider_heartbeat.aggregate(pipeline, allowDiskUse=True)
+    try:
+        for doc in cursor:
+            spider_item = doc["spider_item"]
+
+            spidercode_at_site_num = 0
+
+            for item in spider_item:
+                site = item["site"]
+                channel = item["channel"]
+                spidercode = item["spidercode"]
+
+                hash_key = get_hash_key(site, channel, spidercode)  # avoid data overlap when multiple sites share one spidercode
+
+                same_site = True
+                if site not in website_lst:
+                    same_site = False
+                    website_lst.append(site)
+
+                if not same_site and aggregate_items.get(hash_key):
+                    aggregate_items.get(hash_key)["spidercode_at_site_num"] += 1
+                else:
+                    spidercode_at_site_num += 1
+
+                if not aggregate_items.get(hash_key):
+                    data = {
+                        "business_type": item["business_type"],
+                        "spider_id": doc["_id"],
+                        "site": site,
+                        "channel": item["channel"],
+                        "spidercode": spidercode,
+                        "runtime": runtime,
+                        "spidercode_at_site_num": spidercode_at_site_num  # 爬虫代码对应的站点数量
+                    }
+
+                    is_list = str(item["business_type"]).endswith("List")
+                    if is_list:
+                        data["list_count"] = doc["count"]
+                        data["list_rel_count"] = doc["rel_count"]
+                        data["detail_count"] = 0
+                        data["detail_rel_count"] = 0
+                        data["list_runtimes"] = 1
+                        data["detail_runtimes"] = 0
+                    else:
+                        data["list_count"] = 0
+                        data["list_rel_count"] = 0
+                        data["detail_count"] = doc["count"]
+                        data["detail_rel_count"] = doc["rel_count"]
+                        data["detail_runtimes"] = 1
+                        data["list_runtimes"] = 0
+
+                    if len(spider_item) > 1:
+                        logger.warning(f"{spidercode} -> {site} -- mapping error")
+
+                    aggregate_items.setdefault(hash_key, data)
+
+                else:
+                    data = aggregate_items.get(hash_key)
+                    is_list = str(item["business_type"]).endswith("List")
+                    if is_list:
+                        data["list_count"] += doc["count"]
+                        data["list_rel_count"] += doc["rel_count"]
+                        data["list_runtimes"] += 1
+                    else:
+                        data["detail_count"] += doc["count"]
+                        data["detail_rel_count"] += doc["rel_count"]
+                        data["detail_runtimes"] += 1
+
+                    aggregate_items.update({hash_key: data})
+
+    finally:
+        cursor.close()
+
+    return aggregate_items
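For orientation, the pipeline above expects heartbeat documents roughly shaped as follows; the field names come from the pipeline, the values are invented:

```python
# illustrative heartbeat document (values are made up)
example_heartbeat = {
    "spider_id": "example-spider-id",   # $group key
    "site": "ExampleSite",
    "channel": "ExampleChannel",
    "spidercode": "a_example_channel",
    "business_type": "ExampleList",     # a suffix of "List" or "Detail" drives the branching above
    "runtime": "2023-04-03",            # matched against get_runtime()
    "count": 120,                       # downloads reported by this heartbeat
    "rel_count": 118,                   # actual inserts reported by this heartbeat
}
```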
+
+
+# computed once at import time: aggregate yesterday's heartbeat data
+runtime = get_runtime()
+aggregate_results = aggregate_query(runtime)
+
+
+def get_list_isgetdata(hash_key):
+    """whether the list page collected any data"""
+    count = 0
+    if aggregate_results.get(hash_key):
+        count += aggregate_results[hash_key]["list_count"]
+    return count > 0
+
+
+def get_list_allintimes(hash_key):
+    """total number of list items stored for the day"""
+    count = 0
+    if aggregate_results.get(hash_key):
+        count += aggregate_results[hash_key]["list_rel_count"]
+    return count
+
+
+def get_list_runtimes(hash_key):
+    """number of list-page crawl runs"""
+    count = 0
+    if aggregate_results.get(hash_key):
+        count += aggregate_results[hash_key]["list_runtimes"]
+    return count
+
+
+def get_detail_downloadnum(hash_key):
+    """detail-page download count"""
+    count = 0
+    if aggregate_results.get(hash_key):
+        count += aggregate_results[hash_key]["detail_count"]
+    return count
+
+
+def get_detail_downloadsuccessnum(hash_key):
+    """detail-page successful download count"""
+    count = 0
+    if aggregate_results.get(hash_key):
+        count += aggregate_results[hash_key]["detail_rel_count"]
+    return count
+
+
+def get_count(document, business_type: str):
+    if business_type.title() not in ["List", "Detail"]:
+        raise ValueError(f"invalid business_type: {business_type!r}")
+
+    if str(document["business_type"]).endswith(business_type):
+        return document["count"]
+    return 0
+
+
+def get_rel_count(document, business_type: str):
+    if business_type.title() not in ["List", "Detail"]:
+        raise ValueError(f"invalid business_type: {business_type!r}")
+
+    if str(document["business_type"]).endswith(business_type):
+        return document["rel_count"]
+    return 0
+
+
+def main():
+    summary_queue = []
+    crawlers = get_crawler_basic_information()
+    for crawler in crawlers:
+        site = crawler["site"]
+        channel = crawler["channel"]
+        spidercode = crawler["spidercode"]
+        hash_key = get_hash_key(site, channel, spidercode)
+
+        if aggregate_results.get(hash_key):
+            # merge data
+            join_data = {**crawler}
+            result = aggregate_results.get(hash_key)
+
+            join_data["spidercode_at_site_num"] = result["spidercode_at_site_num"]
+
+            join_data["business_type"] = result["business_type"]
+            join_data["spider_id"] = result["spider_id"]
+
+            join_data["list_count"] = result["list_count"]
+            join_data["list_rel_count"] = result["list_rel_count"]
+            join_data["detail_count"] = result["detail_count"]
+            join_data["detail_rel_count"] = result["detail_rel_count"]
+
+            # crawlab platform
+            crawlab = get_node_and_taskid(runtime, spidercode)
+            if crawlab:
+                join_data["py_taskid"] = crawlab["crawlab_taskid"]
+                join_data["py_nodename"] = crawlab["node_ip"]
+
+            join_data["list_isgetdata"] = get_list_isgetdata(hash_key)  # 列表页是否采集数据
+            join_data["list_allintimes"] = get_list_allintimes(hash_key)  # 日采集列表的总入库量
+            join_data["list_runtimes"] = get_list_runtimes(hash_key)  # 列表页采集运行频次
+
+            join_data["detail_downloadnum"] = get_detail_downloadnum(hash_key)  # 详情页下载量
+            join_data["detail_downloadsuccessnum"] = get_detail_downloadsuccessnum(hash_key)  # 详情页下载成功量
+            join_data["detail_downloadfailnum"] = join_data["detail_downloadnum"] - join_data["detail_downloadsuccessnum"]  # 下载详情失败数量
+
+        else:
+            join_data = {**crawler}
+            join_data["list_isgetdata"] = False
+            join_data["list_allintimes"] = -1
+            join_data["list_runtimes"] = -1
+            join_data["detail_downloadnum"] = -1
+            join_data["detail_downloadsuccessnum"] = -1
+            join_data["detail_downloadfailnum"] = -1
+
+        logger.info(f"{crawler['site']}-{crawler['channel']}-{spidercode}--完成统计")
+        summary_queue.append(join_data)
+
+    save(summary_queue, spider_monitor)
+
+
+if __name__ == '__main__':
+    main()

+ 240 - 0
A数据处理/sync_data/send_data.py

@@ -0,0 +1,240 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-02-21
+---------
+@summary: data upload (Redis to MongoDB), sync service for collected data
+---------
+@author: dzr
+"""
+
+import ast
+import time
+from concurrent.futures import ThreadPoolExecutor, wait
+from typing import Dict
+
+import redis
+from bson import int64
+from elasticsearch import Elasticsearch
+from func_timeout import func_set_timeout
+from func_timeout.exceptions import FunctionTimedOut
+from pymongo import MongoClient
+from redis._compat import unicode, long, basestring
+from redis.connection import Encoder as RedisEncoder
+from redis.exceptions import DataError
+
+from log import logger
+
+# mongo
+MONGO_HOST = "172.17.4.87"
+MONGO_PORT = 27080
+MONGO_DB = "py_spider"
+mcli = MongoClient(MONGO_HOST, MONGO_PORT)
+mongodb = mcli[MONGO_DB]
+
+
+# redis
+class Encoder(RedisEncoder):
+    """Encoder that also serializes bool/list/dict/tuple values via their string representation."""
+
+    def encode(self, value):
+        "Return a bytestring or bytes-like representation of the value"
+        if isinstance(value, (bytes, memoryview)):
+            return value
+        # elif isinstance(value, bool):
+        #     # special case bool since it is a subclass of int
+        #     raise DataError(
+        #         "Invalid input of type: 'bool'. Convert to a "
+        #         "bytes, string, int or float first."
+        #     )
+        elif isinstance(value, float):
+            value = repr(value).encode()
+        elif isinstance(value, (int, long)):
+            # python 2 repr() on longs is '123L', so use str() instead
+            value = str(value).encode()
+        elif isinstance(value, (list, dict, tuple)):
+            value = unicode(value)
+        elif not isinstance(value, basestring):
+            # a value we don't know how to deal with. throw an error
+            typename = type(value).__name__
+            raise DataError(
+                "Invalid input of type: '%s'. Convert to a "
+                "bytes, string, int or float first." % typename
+            )
+        if isinstance(value, unicode):
+            value = value.encode(self.encoding, self.encoding_errors)
+        return value
+
+
+REDIS_HOST = "172.17.4.232"
+REDIS_PORT = 7361
+REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
+REDIS_DB = 10
+
+# monkey-patch the module-level Encoder so new connections use the custom one
+redis.connection.Encoder = Encoder
+
+pool = redis.ConnectionPool(
+    host=REDIS_HOST,
+    port=REDIS_PORT,
+    password=REDISDB_USER_PASS,
+    db=REDIS_DB
+)
+rcli = redis.StrictRedis(connection_pool=pool, decode_responses=True)
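The Encoder override and the monkey-patch above allow whole dict/list items to be pushed to Redis without manual serialization; a hedged illustration (the key name and values are hypothetical):

```python
# illustrative only: with the patched Encoder a dict is accepted directly and is
# stored as its str() representation, e.g. b"{'title': 'example', 'comeintime': 1677000000}"
rcli.rpush("savemongo:example", {"title": "example", "comeintime": 1677000000})
```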
+
+# es
+ES_HOST = '172.17.145.178'
+ES_PORT = 9800
+ES_INDEX = 'biddingall'
+ecli = Elasticsearch([{"host": ES_HOST, "port": ES_PORT}])
+
+# delay interval (seconds), 12 hours
+DELAY = 43200
+
+
+def literal_eval(node_or_string):
+    try:
+        return ast.literal_eval(node_or_string)
+    except ValueError as e:
+        if 'malformed node or string' in e.args[0]:
+            from bson import Code, ObjectId  # brought into eval's scope (for ObjectId/Code values)
+            return eval(node_or_string)
+        else:
+            raise e
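The eval() fallback exists for values whose repr is not a plain Python literal, typically documents carrying an ObjectId; a small sketch under that assumption:

```python
# illustrative: ast.literal_eval rejects the ObjectId(...) call, so the helper
# falls back to eval() with ObjectId imported into its scope
s = "{'_id': ObjectId('640000000000000000000000'), 'title': 'example'}"
doc = literal_eval(s)  # -> dict with a real ObjectId instance
```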
+
+
+def date2ts(date_str):
+    """日期转时间戳"""
+    if ":" in date_str:
+        ts = int(time.mktime(time.strptime(date_str, "%Y-%m-%d %H:%M:%S")))
+    else:
+        ts = int(time.mktime(time.strptime(date_str, "%Y-%m-%d")))
+    return ts
+
+
+def es_query(title, publish_time):
+    """
+    query ES
+
+    :param title: document title
+    :param publish_time: publish time string
+    :return: number of matching documents
+    """
+    publish_time = date2ts(publish_time)
+    stime = publish_time - 432000  # 5 days earlier
+    etime = publish_time + 432000  # 5 days later
+    # query by title (phrase match) and publish-time range
+    query = {
+        "query": {
+            "bool": {
+                "must": [
+                    {
+                        "multi_match": {
+                            "query": title,
+                            "type": "phrase",
+                            "fields": ["title"]
+                        }
+                    },
+                    {"range": {'publishtime': {"from": stime, "to": etime}}}
+                ]
+            }
+        }
+    }
+    result = ecli.search(body=query, index=ES_INDEX, request_timeout=100)
+    # print(result['hits']['total'])
+    total = int(result['hits']['total'])
+    return total
+
+
+def get_redis_key(table, prefix="savemongo:"):
+    return prefix + table
+
+
+def rpush(name, values, is_redis_cluster=False):
+    """“将“values”推到列表“name”的尾部”"""
+    if isinstance(values, list):
+        pipe = rcli.pipeline()
+        if not is_redis_cluster:
+            pipe.multi()
+
+        for value in values:
+            pipe.rpush(name, value)
+        pipe.execute()
+    else:
+        return rcli.rpush(name, values)
+
+
+def insert_one(table, item: Dict):
+    """insert a single document into MongoDB"""
+    if item is not None:
+        item.pop('_id', '')
+        if item.get("comeintime"):
+            item['comeintime'] = int64.Int64(item['comeintime'])
+        try:
+            title = item.get('title')
+            result = mongodb[table].insert_one(item)
+            logger.info(f'{table}-{result.inserted_id}-{title} -- inserted')
+        except Exception as e:
+            rpush(get_redis_key(table), item)
+            logger.error(f"{table} -- insert failed and re-queued to Redis, reason: {e}")
+
+
+def sync_data(table):
+    redis_key = get_redis_key(table)
+    total = rcli.llen(redis_key)
+    logger.info(f"sync table: {table}, total to push: {total}")
+    for _ in range(total):
+        obj = rcli.lpop(redis_key)
+        if obj is None:
+            logger.warning(f'{table} invalid data: {obj}')
+            continue
+
+        try:
+            item = literal_eval(obj)
+            if table != 'mgp_list':
+                insert_one(table, item)
+            else:
+                title = item.get("item").get("title")
+                # delayed-push flow
+                if item.get("is_delay"):
+                    site = item.get("item").get("site")
+                    t_diff = int(time.time()) - item.get("comeintime")
+                    if t_diff <= DELAY:
+                        rpush(redis_key, item)
+                        logger.info(f"{site}-{title} - waited {t_diff}s - delayed insert")
+                # ES lookup flow
+                elif item.get("if_es"):
+                    pt = item.get("item").get("publishtime")
+                    if title is not None and es_query(title.strip(), pt) == 0:
+                        insert_one(table, item)
+                else:
+                    insert_one(table, item)
+        except Exception as e:
+            logger.error(f"{table} - failed to process {type(obj)}: {e}, data re-queued")
+            rpush(redis_key, obj)
+
+
+def err_msg(worker):
+    err = worker.exception()
+    if err:
+        logger.exception("worker err: {}".format(err))
+    return worker
+
+
+@func_set_timeout(60 * 20)
+def main():
+    logger.info("data sync started")
+    tables = ["mgp_list", "data_bak", "spider_heartbeat", "njpc_list", "data_njpc", "listdata_err"]
+    with ThreadPoolExecutor() as executor:  # avoid shadowing the module-level redis connection pool
+        futures = []
+        for table in tables:
+            f = executor.submit(sync_data, table)
+            f.add_done_callback(err_msg)
+            futures.append(f)
+        wait(futures)
+    logger.info("data sync finished")
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except FunctionTimedOut:
+        logger.warning("data sync timed out")

+ 180 - 0
A数据处理/sync_data/summary_data.py

@@ -0,0 +1,180 @@
+
+from datetime import datetime, time, timedelta
+
+from bson.int64 import Int64
+from bson.son import SON
+from pymongo import MongoClient
+
+from log import logger
+
+# mongo
+MONGO_HOST = "172.17.4.87"
+MONGO_PORT = 27080
+MONGO_DB = "py_spider"
+client = MongoClient(MONGO_HOST, MONGO_PORT)
+mongodb = client[MONGO_DB]
+
+# spider data collection
+data_bak = mongodb["data_bak"]
+
+# heartbeat collection
+spider_heartbeat = mongodb["spider_heartbeat"]
+
+# py_spiders crawl list
+py_spiders_crawl_list = mongodb["crawl_data"]
+
+# list-page summary collection
+summary_table_of_list_pages = mongodb["list"]
+
+
+def save(documents, collection):
+    """batch-save documents"""
+
+    is_list = isinstance(documents, list)
+    documents = documents if is_list else [documents]
+
+    count = 0
+    data_lst = []
+    for item in documents:
+        item.pop("_id", None)
+        data_lst.append(item)
+        count += 1
+        if len(data_lst) == 100:
+            collection.insert_many(data_lst)
+            data_lst.clear()
+            logger.info(f"{collection.name} - bulk saved {count} documents - done")
+
+    # insert the remaining documents
+    if data_lst:
+        collection.insert_many(data_lst)
+    logger.info(f"{collection.name} - bulk saved {count} documents - done")
+
+
+def summary_data(document, runtime, only_count_list_page=False):
+    """对聚合的数据进行汇总和分类"""
+    summary_lst = []
+    spider_item = document["spider_item"]
+    for item in spider_item:
+        spidercode = item["spidercode"]
+        site = item["site"]
+        data = {
+            "business_type": item["business_type"],
+            "site": site,
+            "channel": item["channel"],
+            "spidercode": spidercode,
+            "count": document["count"],
+            "rel_count": document["rel_count"],
+            "runtime": runtime,
+            "create_at": Int64(datetime.now().timestamp())
+        }
+        if len(spider_item) > 1:
+            logger.warning(f"{spidercode} -> {site} -- spidercode business mapping error")
+            data["warning"] = "spidercode business mapping error"
+
+        if only_count_list_page:
+            if str(item["business_type"]).endswith("List"):
+                summary_lst.append(data)
+            continue
+        summary_lst.append(data)
+
+    return summary_lst
+
+
+def feapder_crawl_aggregate_of_list_pages(datestr=None):
+    """feapder采集列表页数据汇总(前一天的数据)"""
+
+    if datestr is None:
+        today = datetime.now().date()
+        yesterday = today + timedelta(days=-1)
+        datestr = yesterday.strftime("%Y-%m-%d")
+
+    pipeline = [
+        {"$match": {"runtime": datestr}},
+        {
+            "$group": {
+                "_id": "$spider_id",
+                "rel_count": {"$sum": "$rel_count"},
+                "count": {"$sum": "$count"},
+                "spider_item": {
+                    "$addToSet": {
+                        "site": "$site",
+                        "channel": "$channel",
+                        "spidercode": "$spidercode",
+                        "business_type": "$business_type"
+                    }
+                }
+            }
+        },
+        {"$sort": SON([("rel_count", -1)])}
+    ]
+    #  The $group stage has a 100 MB memory limit; by default it raises an error when the limit is exceeded.
+    #  To process large datasets, set allowDiskUse=True so $group can spill to temporary files on disk.
+    cursor = spider_heartbeat.aggregate(pipeline, allowDiskUse=True)
+    try:
+        results = []
+        for doc in cursor:
+            results.extend(summary_data(doc, datestr, True))
+        save(results, summary_table_of_list_pages)
+    finally:
+        cursor.close()
+        logger.info("feapder data summary finished")
+
+
+def py_spiders_crawl_aggregate_of_list_pages(datestr=None):
+    """py_spiders采集列表页数据汇总(前一天的数据)"""
+    if datestr is not None:
+        today = datetime.fromisoformat(datestr).date()
+    else:
+        today = datetime.now().date()
+    yesterday = today + timedelta(days=-1)
+
+    runtime = yesterday.strftime("%Y-%m-%d")
+    start_time = int(datetime.combine(yesterday, time()).timestamp())
+    end_time = int(datetime.combine(today, time()).timestamp())
+
+    pipeline = [
+        {
+            "$addFields": {
+                "rel_count": {
+                    "$cond": {
+                        "if": {"$ne": ["$finished", True]},
+                        "then": 1,
+                        "else": 0
+                    }
+                }
+            }
+        },
+        {"$match": {"comeintime": {"$gte": start_time, "$lt": end_time}}},
+        {
+            "$group": {
+                "_id": "$spidercode",
+                "count": {"$sum": 1},  # 当天采集总数
+                "rel_count": {"$sum": 1},  # 当天采集总数
+                # "rel_count": {"$sum": "$rel_count"},  # 当天采集详情总数(仅成功)
+                "spider_item": {
+                    "$addToSet": {
+                        "site": "$site",
+                        "channel": "$channel",
+                        "spidercode": "$spidercode",
+                        "business_type": "List"
+                    }
+                }
+            }
+        },
+        {"$sort": SON([("rel_count", -1)])}
+    ]
+    cursor = py_spiders_crawl_list.aggregate(pipeline, allowDiskUse=True)
+    try:
+        results = []
+        for doc in cursor:
+            results.extend(summary_data(doc, runtime))
+        save(results, summary_table_of_list_pages)
+    finally:
+        client.close()
+        logger.info("py_spiders数据汇总结束")
+
+
+if __name__ == '__main__':
+    # feapder_crawl_aggregate_of_list_pages("2023-04-03")
+    feapder_crawl_aggregate_of_list_pages()
+    # py_spiders_crawl_aggregate_of_list_pages("2023-04-04")
+    py_spiders_crawl_aggregate_of_list_pages()