@@ -0,0 +1,332 @@
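+# Daily crawl report: joins python crawler configs from the lua config collection
+# with aggregated feapder heartbeat data and saves a per-spider summary to the
+# spider_monitor collection.
+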
+import hashlib
+from datetime import datetime, timedelta
+
+from bson.int64 import Int64
+from bson.son import SON
+from pymongo import MongoClient
+
+from log import logger
+
+# mongo
+# MONGO_HOST = "172.17.4.87"
+# MONGO_PORT = 27080
+
+MONGO_HOST = "127.0.0.1"
+MONGO_PORT = 27001
+client = MongoClient(MONGO_HOST, MONGO_PORT)
+
+MONGO_DB1 = "py_spider"
+MONGO_DB2 = "editor"
+
+mongodb1 = client[MONGO_DB1]
+mongodb2 = client[MONGO_DB2]
+
+# spider data collection
+data_bak = mongodb1["data_bak"]
+
+# heartbeat collection
+spider_heartbeat = mongodb1["spider_heartbeat"]
+
+# py_spiders crawl list
+py_spiders_crawl_list = mongodb1["crawl_data"]
+
+# list-page summary (daily report) collection
+spider_monitor = mongodb1["spider_monitor"]
+
+# luaconfig collection
+spider_lua_config = mongodb2["luaconfig"]
+
+
+def get_hash_key(*args):
+    """
+    @summary: build a unique 32-character md5 digest
+    ---------
+    @param *args: values joined to form the dedup key
+    ---------
+    @result: 7c8684bcbdfcea6697650aa53d7b1405
+    """
+    join_data = "_".join(map(str, args)).encode()
+    return hashlib.md5(join_data).hexdigest()
+
+
+def save(documents, collection):
+    """Save documents to a collection, inserting in batches of 100"""
+    is_list = isinstance(documents, list)
+    documents = documents if is_list else [documents]
+
+    count = 0
+    data_lst = []
+    for item in documents:
+        item.pop("_id", None)
+        item.pop("business_type", None)
+        item["comeintime"] = Int64(datetime.now().timestamp())
+        data_lst.append(item)
+        count += 1
+        if len(data_lst) == 100:
+            collection.insert_many(data_lst)
+            data_lst.clear()
+            logger.info(f"{collection.name} - batch saved {count} documents - done")
+
+    # flush the remaining documents (insert_many rejects an empty list)
+    if data_lst:
+        collection.insert_many(data_lst)
+    logger.info(f"{collection.name} - batch saved {count} documents - done")
+
+
+def get_runtime(datestr=None):
+    """Default reporting date: yesterday, formatted as %Y-%m-%d"""
+    if datestr is None:
+        today = datetime.now().date()
+        yesterday = today - timedelta(days=1)
+        datestr = yesterday.strftime("%Y-%m-%d")
+    return datestr
+
+
+def get_crawler_basic_information():
+    """Basic crawler information from the lua config collection"""
+    crawler_lst = []
+    q = {"platform": "python", "state": 11}
+    projection = {"_id": 0, "site": 1, "channel": 1, "modifyuser": 1, "modifyuserid": 1, "code": 1}
+    cursor = spider_lua_config.find(q, projection=projection)
+    try:
+        for doc in cursor:
+            crawler_lst.append({
+                "site": doc["site"],
+                "channel": doc["channel"],
+                "spidercode": doc["code"],
+                "modifyid": doc["modifyuserid"],
+                "modifyuser": doc["modifyuser"],
+            })
+    finally:
+        cursor.close()
+    logger.info(f"Spider daily report - {len(crawler_lst)} crawlers in total")
+    yield from crawler_lst
+
+
+def get_node_and_taskid(runtime, spidercode):
+    """Get the most recent worker node and crawlab task id for a spider"""
+    q = {"runtime": runtime, "spidercode": spidercode}
+    projection = {"node_ip": 1, "crawlab_taskid": 1, "_id": 0}
+    sort = [("_id", -1)]
+    result = spider_heartbeat.find_one(q, projection=projection, sort=sort)
+    return result
+
+
+def aggregate_query(runtime):
+    """Aggregate feapder crawl heartbeats for the given runtime date"""
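+    # Group heartbeats by spider_id: sum "count" (downloads) and "rel_count"
+    # (actual downloads), and collect every site/channel/spidercode/business_type
+    # combination the spider reported.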
+    pipeline = [
+        {"$match": {"runtime": runtime}},
+        {
+            "$group": {
+                "_id": "$spider_id",
+                "rel_count": {"$sum": "$rel_count"},  # actual download count
+                "count": {"$sum": "$count"},  # download count
+                "spider_item": {
+                    "$addToSet": {
+                        "site": "$site",
+                        "channel": "$channel",
+                        "spidercode": "$spidercode",
+                        "business_type": "$business_type"
+                    }
+                }
+            }
+        },
+        {"$sort": SON([("rel_count", -1)])}
+    ]
+
+    aggregate_items = {}
+    website_lst = []
+
+    cursor = spider_heartbeat.aggregate(pipeline, allowDiskUse=True)
+    try:
+        for doc in cursor:
+            spider_item = doc["spider_item"]
+
+            spidercode_at_site_num = 0
+
+            for item in spider_item:
+                site = item["site"]
+                channel = item["channel"]
+                spidercode = item["spidercode"]
+
+                # keyed by site+channel+spidercode so that sites sharing one
+                # spidercode do not overwrite each other's data
+                hash_key = get_hash_key(site, channel, spidercode)
+
+                same_site = True
+                if site not in website_lst:
+                    same_site = False
+                    website_lst.append(site)
+
+                if not same_site and aggregate_items.get(hash_key):
+                    aggregate_items[hash_key]["spidercode_at_site_num"] += 1
+                else:
+                    spidercode_at_site_num += 1
+
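+                # first occurrence of this site/channel/spidercode: initialise
+                # the counters for its business type; otherwise merge into the
+                # existing entry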
+                if not aggregate_items.get(hash_key):
+                    data = {
+                        "business_type": item["business_type"],
+                        "spider_id": doc["_id"],
+                        "site": site,
+                        "channel": item["channel"],
+                        "spidercode": spidercode,
+                        "runtime": runtime,
+                        "spidercode_at_site_num": spidercode_at_site_num  # number of sites served by this spidercode
+                    }
+
+                    is_list = str(item["business_type"]).endswith("List")
+                    if is_list:
+                        data["list_count"] = doc["count"]
+                        data["list_rel_count"] = doc["rel_count"]
+                        data["detail_count"] = 0
+                        data["detail_rel_count"] = 0
+                        data["list_runtimes"] = 1
+                        data["detail_runtimes"] = 0
+                    else:
+                        data["list_count"] = 0
+                        data["list_rel_count"] = 0
+                        data["detail_count"] = doc["count"]
+                        data["detail_rel_count"] = doc["rel_count"]
+                        data["detail_runtimes"] = 1
+                        data["list_runtimes"] = 0
+
+                    if len(spider_item) > 1:
+                        logger.warning(f"{spidercode} -> {site} - incorrect spidercode-to-site mapping")
+
+                    aggregate_items.setdefault(hash_key, data)
+
+                else:
+                    data = aggregate_items.get(hash_key)
+                    is_list = str(item["business_type"]).endswith("List")
+                    if is_list:
+                        data["list_count"] += doc["count"]
+                        data["list_rel_count"] += doc["rel_count"]
+                        data["list_runtimes"] += 1
+                    else:
+                        data["detail_count"] += doc["count"]
+                        data["detail_rel_count"] += doc["rel_count"]
+                        data["detail_runtimes"] += 1
+
+                    aggregate_items.update({hash_key: data})
+
+    finally:
+        cursor.close()
+    return aggregate_items
+
+
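+# The aggregation runs once at import time; the helpers below read from the
+# cached result keyed by get_hash_key(site, channel, spidercode).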
+runtime = get_runtime()
+aggregate_results = aggregate_query(runtime)
+
+
+def get_list_isgetdata(hash_key):
+    """Whether the list page collected any data"""
+    count = 0
+    if aggregate_results.get(hash_key):
+        count += aggregate_results[hash_key]["list_count"]
+    return count > 0
+
+
+def get_list_allintimes(hash_key):
+    """Total number of list items stored for the day"""
+    count = 0
+    if aggregate_results.get(hash_key):
+        count += aggregate_results[hash_key]["list_rel_count"]
+    return count
+
+
+def get_list_runtimes(hash_key):
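+    """Number of list-page crawl runs for the day"""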
+    count = 0
+    if aggregate_results.get(hash_key):
+        count += aggregate_results[hash_key]["list_runtimes"]
+    return count
+
+
+def get_detail_downloadnum(hash_key):
+    """Number of detail pages downloaded"""
+    count = 0
+    if aggregate_results.get(hash_key):
+        count += aggregate_results[hash_key]["detail_count"]
+    return count
+
+
+def get_detail_downloadsuccessnum(hash_key):
+    """Number of detail pages downloaded successfully"""
+    count = 0
+    if aggregate_results.get(hash_key):
+        count += aggregate_results[hash_key]["detail_rel_count"]
+    return count
+
+
+def get_count(document, business_type: str):
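+    """Download count of a heartbeat document when its business_type matches, else 0"""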
+    if business_type.title() not in ["List", "Detail"]:
+        raise ValueError("business_type must be 'List' or 'Detail'")
+
+    if str(document["business_type"]).endswith(business_type):
+        return document["count"]
+    return 0
+
+
+def get_rel_count(document, business_type: str):
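+    """Actual download count (rel_count) of a heartbeat document when its business_type matches, else 0"""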
+    if business_type.title() not in ["List", "Detail"]:
+        raise ValueError("business_type must be 'List' or 'Detail'")
+
+    if str(document["business_type"]).endswith(business_type):
+        return document["rel_count"]
+    return 0
+
+
+def main():
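+    """Build the daily summary for each crawler and save it to spider_monitor"""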
+    summary_queue = []
+    crawlers = get_crawler_basic_information()
+    for crawler in crawlers:
+        site = crawler["site"]
+        channel = crawler["channel"]
+        spidercode = crawler["spidercode"]
+        hash_key = get_hash_key(site, channel, spidercode)
+
+        if aggregate_results.get(hash_key):
+            # merge crawler info with the aggregated heartbeat data
+            join_data = {**crawler}
+            result = aggregate_results.get(hash_key)
+
+            join_data["spidercode_at_site_num"] = result["spidercode_at_site_num"]
+
+            join_data["business_type"] = result["business_type"]
+            join_data["spider_id"] = result["spider_id"]
+
+            join_data["list_count"] = result["list_count"]
+            join_data["list_rel_count"] = result["list_rel_count"]
+            join_data["detail_count"] = result["detail_count"]
+            join_data["detail_rel_count"] = result["detail_rel_count"]
+
+            # crawlab platform
+            crawlab = get_node_and_taskid(runtime, spidercode)
+            if crawlab:
+                join_data["py_taskid"] = crawlab["crawlab_taskid"]
+                join_data["py_nodename"] = crawlab["node_ip"]
+
+            join_data["list_isgetdata"] = get_list_isgetdata(hash_key)  # whether the list page collected data
+            join_data["list_allintimes"] = get_list_allintimes(hash_key)  # total list items stored for the day
+            join_data["list_runtimes"] = get_list_runtimes(hash_key)  # list-page crawl run count
+
+            join_data["detail_downloadnum"] = get_detail_downloadnum(hash_key)  # detail pages downloaded
+            join_data["detail_downloadsuccessnum"] = get_detail_downloadsuccessnum(hash_key)  # detail pages downloaded successfully
+            join_data["detail_downloadfailnum"] = join_data["detail_downloadnum"] - join_data["detail_downloadsuccessnum"]  # detail pages that failed to download
+
+        else:
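+            # no heartbeat data was aggregated for this crawler today; fill the
+            # metrics with -1 so the report still lists it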
+            join_data = {**crawler}
+            join_data["list_isgetdata"] = False
+            join_data["list_allintimes"] = -1
+            join_data["list_runtimes"] = -1
+            join_data["detail_downloadnum"] = -1
+            join_data["detail_downloadsuccessnum"] = -1
+            join_data["detail_downloadfailnum"] = -1
+
+        logger.info(f"{crawler['site']}-{crawler['channel']}-{spidercode} - statistics done")
+        summary_queue.append(join_data)
+
+    save(summary_queue, spider_monitor)
+
+
+if __name__ == '__main__':
+    main()
+    # close the shared MongoClient once all queries have finished
+    client.close()