# -*- coding: utf-8 -*-
"""
Created on 2023-04-05
---------
@summary: spider runtime monitoring (feapder) and daily crawl statistics
---------
@author: Dzr
"""
import hashlib
from datetime import datetime, timedelta

from bson.int64 import Int64
from bson.son import SON
from pymongo import MongoClient

from log import logger

# mongo
MONGO_HOST = "172.17.4.87"
MONGO_PORT = 27080
client = MongoClient(MONGO_HOST, MONGO_PORT)

MONGO_DB1 = "py_spider"
MONGO_DB2 = "editor"
mongodb1 = client[MONGO_DB1]
mongodb2 = client[MONGO_DB2]

# spider production data table
data_bak = mongodb1["data_bak"]
# spider heartbeat table
spider_heartbeat = mongodb1["pyspider_heartbeat"]
# daily crawl summary table
spider_monitor = mongodb1["spider_monitor"]
# crawl task (lua config) table
spider_lua_config = mongodb2["luaconfig"]

# special sites, exempt from the spidercode/site consistency check
special_sites = ["云南省政府采购网", "湖南省政府采购电子卖场"]
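
# The aggregations below all $match on "runtime", so an index on that field
# helps once the heartbeat collection grows. A minimal sketch (assumes index
# creation is permitted; left commented out so this monitor stays read-only):
# spider_heartbeat.create_index([("runtime", 1)])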


def get_md5(*args, **kwargs):
    """
    @summary: build a unique 32-character md5 digest
    ---------
    @param args: positional values joined into the composite dedup key
    @param kwargs: keyword values joined into the composite dedup key
    ---------
    @result: 7c8684bcbdfcea6697650aa53d7b1405
    """
    if len(args) != 1:
        conditions = ["site", "channel", "spidercode"]
        data_lst = list(filter(lambda x: x is not None, args))
        for k, v in kwargs.items():
            if k in conditions and v and v not in data_lst:
                data_lst.append(v)

        if len(data_lst) != 3:
            logger.error(f"missing key fields: {conditions}, got: {data_lst}")
            return None

        content = "_".join(sorted(data_lst))
    else:
        content = args[0]
    return hashlib.md5(str(content).encode()).hexdigest()
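
# Sanity check (hypothetical values): positional and keyword calls both reduce
# to the same sorted "site_channel_spidercode" string, so the digests match.
assert get_md5("siteA", "channelB", "codeC") == \
       get_md5(site="siteA", channel="channelB", spidercode="codeC")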


def get_runtime(datestr=None):
    """Return the statistics date; defaults to yesterday (%Y-%m-%d)."""
    if datestr is None:
        today = datetime.now().date()
        yesterday = today - timedelta(days=1)
        datestr = yesterday.strftime("%Y-%m-%d")
    return datestr


def save(documents, collection, ordered=False):
    """Insert documents in batches of 100."""
    is_list = isinstance(documents, list)
    documents = documents if is_list else [documents]

    data_lst = []
    for items in documents:
        items.pop("_id", None)
        items.pop("business_type", None)
        items["comeintime"] = Int64(datetime.now().timestamp())
        data_lst.append(items)
        if len(data_lst) == 100:
            ret = collection.insert_many(data_lst, ordered=ordered)
            logger.info(f"MongoDB {collection.name} saved {len(ret.inserted_ids)} documents")
            data_lst = []

    # flush the remaining partial batch
    if data_lst:
        ret = collection.insert_many(data_lst, ordered=ordered)
        logger.info(f"MongoDB {collection.name} saved {len(ret.inserted_ids)} documents")


def get_spider_lst():
    """Yield basic info for every online python spider."""
    crawler_lst = []
    q = {"platform": "python", "state": 11}
    projection = {
        "_id": 0, "site": 1, "channel": 1,
        "modifyuser": 1, "modifyuserid": 1, "code": 1, "event": 1
    }
    cursor = spider_lua_config.find(q, projection=projection)
    try:
        for doc in cursor:
            crawler_lst.append({
                "site": doc["site"],
                "channel": doc["channel"],
                "code": doc["code"],
                "modifyid": doc["modifyuserid"],
                "modifyuser": doc["modifyuser"],
                "event": doc["event"]
            })
    finally:
        # close the cursor, not the shared client; the client is still
        # needed by the aggregation queries that run later in this script
        cursor.close()

    logger.info(f"spider monitor - {len(crawler_lst)} spiders online")
    yield from crawler_lst


def aggregate_query(collection, pipeline, is_print_error=False):
    """
    Run an aggregation query.

    @param collection: MongoDB collection
    @param pipeline: aggregation pipeline
    @param is_print_error: whether to log errors to the console
    @return: generator over the aggregation results
    """
    results = []
    cursor = collection.aggregate(pipeline, allowDiskUse=True)
    try:
        for doc in cursor:
            results.append(doc)
    except Exception as e:
        if is_print_error:
            logger.exception(e)
    finally:
        cursor.close()  # keep the shared client open for later queries

    yield from results
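
# Usage sketch (hypothetical pipeline): count heartbeats per spidercode for a
# given day. Results are fully materialized before being yielded, so the
# cursor is already closed by the time the caller iterates.
#
#   pipeline = [{"$match": {"runtime": "2023-04-04"}},
#               {"$group": {"_id": "$spidercode", "n": {"$sum": 1}}}]
#   for row in aggregate_query(spider_heartbeat, pipeline):
#       logger.info(row)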


def aggregate_query_count(runtime):
    """
    Aggregate list and detail crawl volumes for feapder spiders.

    @param runtime: statistics date
    @return: dict keyed by md5(site, channel, spidercode)
    """
    spider_dict = {}  # spidercode -> sites seen, for the consistency check
    aggregate_items = {}
    pipeline = [
        {"$match": {"runtime": runtime}},
        {
            "$group": {
                "_id": "$batch_no",
                "rel_count": {"$sum": "$rel_count"},  # stored volume (deduplicated)
                "count": {"$sum": "$count"},  # download volume
                "site": {"$first": "$site"},
                "channel": {"$first": "$channel"},
                "spidercode": {"$first": "$spidercode"},
                "business_type": {"$first": "$business_type"},
            }
        },
        {"$sort": SON([("rel_count", -1)])}
    ]
    results = aggregate_query(spider_heartbeat, pipeline)
    for items in results:
        site = items["site"]
        channel = items["channel"]
        spidercode = items["spidercode"]

        business_type = items["business_type"]
        business_type = "list" if business_type.endswith("List") else "detail"

        hash_key = get_md5(**items)  # spider lookup key
        if not hash_key:
            logger.error(f"invalid batch number {items['_id']}")
            continue

        # bucket the data by spider business type
        data_dict = {
            "site": site,
            "channel": channel,
            "spidercode": spidercode,
            "runtime": runtime,
            "spidercode_at_site_num": 0,  # number of sites this spidercode maps to
            "frame": "feapder",  # crawl framework
            business_type: {
                "batch_no": items["_id"],
                "count": items["count"],
                "rel_count": items["rel_count"],
                "runtimes": 1
            }
        }
        if hash_key not in aggregate_items:
            aggregate_items[hash_key] = data_dict
        else:
            aggregate_items[hash_key][business_type] = data_dict[business_type]

        # monitor spider tasks: spidercode_at_site_num > 1 means the task was
        # set up incorrectly and must be reported to the data-sourcing team
        if site not in special_sites:
            spider = aggregate_items[hash_key]
            if spidercode not in spider_dict:
                spider["spidercode_at_site_num"] = 1
                values = {"keys": [hash_key], "sites": [site]}
                spider_dict.setdefault(spidercode, values)
            else:
                # same spidercode but a different site: increment the counter
                sites = spider_dict[spidercode]["sites"]
                if site not in sites:
                    keys = spider_dict[spidercode]["keys"]
                    for key in keys:
                        # update spidercode_at_site_num for every spider sharing this code
                        aggregate_items[key]["spidercode_at_site_num"] += 1

                    keys.append(hash_key)  # record the new spider
                    sites.append(site)  # record the new site
                    spider["spidercode_at_site_num"] = len(sites)
        else:
            aggregate_items[hash_key]["spidercode_at_site_num"] = 1

    return aggregate_items
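
# Shape of each aggregate_query_count() entry (illustrative, values invented):
# keyed by md5(site, channel, spidercode), with a "list" and/or "detail"
# sub-dict depending on which business types reported heartbeats that day.
#
#   {"site": "...", "channel": "...", "spidercode": "...",
#    "runtime": "2023-04-04", "spidercode_at_site_num": 1, "frame": "feapder",
#    "list": {"batch_no": "...", "count": 120, "rel_count": 100, "runtimes": 1}}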


def aggregate_query_crawl_list(runtime):
    """Aggregate list-spider crawl results for feapder spiders."""
    aggregate_items = {}
    pipeline = [
        {
            "$match": {
                "runtime": runtime,
                "business_type": {"$regex": "List"},
                "status_code": {"$ne": -1}
            }
        },
        {
            "$group": {
                "_id": "$batch_no",
                "count": {"$sum": "$count"},
                "rel_count": {"$sum": "$rel_count"},
                "site": {"$first": "$site"},
                "channel": {"$first": "$channel"},
                "spidercode": {"$first": "$spidercode"},
            }
        }
    ]
    results = aggregate_query(spider_heartbeat, pipeline)
    for items in results:
        hash_key = get_md5(**items)
        if not hash_key:
            continue

        if hash_key not in aggregate_items:
            aggregate_items[hash_key] = {"list_allintimes": 0, "list_nodatatimes": 0}

        if all([
            items["count"] > 0,
            items["rel_count"] > 0,
            items["count"] == items["rel_count"]
        ]):
            aggregate_items[hash_key]["list_allintimes"] += 1

        if items["count"] == 0:
            aggregate_items[hash_key]["list_nodatatimes"] += 1

    return aggregate_items


def aggregate_count_crawlab_update_runtimes(runtime):
    """Aggregate crawlab platform run info for feapder spiders."""
    pipeline = [
        {
            "$project": {
                "_id": 0,
                "site": 1,
                "channel": 1,
                "batch_no": 1,
                "spidercode": 1,
                "business_type": 1,
                "runtime": 1,
                "node_ip": 1,
                "crawlab_taskid": 1,
                "create_at": 1,
            }
        },
        {"$match": {"runtime": runtime}},
        {
            "$group": {
                "_id": "$batch_no",
                "business_type": {"$first": "$business_type"},
                "site": {"$first": "$site"},
                "channel": {"$first": "$channel"},
                "spidercode": {"$first": "$spidercode"},
                "crawlab_item": {
                    "$addToSet": {
                        "node_ip": "$node_ip",
                        "crawlab_taskid": "$crawlab_taskid",
                    },
                }
            }
        }
    ]
    results = aggregate_query(spider_heartbeat, pipeline)
    for items in results:
        runtimes = len(items["crawlab_item"])  # number of crawl task runs

        # bucket the data by spider business type
        business_type = items["business_type"]
        business_type = "list" if business_type.endswith("List") else "detail"

        hash_key = get_md5(**items)  # spider lookup key
        if not hash_key or hash_key not in aggregate_count_items:
            # skip batches that aggregate_query_count also skipped (missing key fields)
            continue

        # update the aggregated run count for this batch's business type
        if business_type in aggregate_count_items[hash_key]:
            aggregate_count_items[hash_key][business_type]["runtimes"] = runtimes


_runtime = get_runtime()  # statistics date
aggregate_count_items = aggregate_query_count(_runtime)
aggregate_crawl_list_items = aggregate_query_crawl_list(_runtime)
aggregate_count_crawlab_update_runtimes(_runtime)
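
# Note: these module-level aggregates run at import time, and order matters;
# aggregate_count_crawlab_update_runtimes() mutates the entries inside
# aggregate_count_items, so it has to run after aggregate_query_count().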


def get_list_isgetdata(hash_key, default=0):
    """Whether the list page collected any data."""
    query_result = aggregate_count_items.get(hash_key)
    if query_result and "list" in query_result:
        default = query_result["list"]["count"]
    return default > 0


def get_list_allintimes(hash_key, default=0):
    """Times the daily list download count equalled the stored count
    (net of title dedup and incremental/full dedup)."""
    if aggregate_crawl_list_items.get(hash_key):
        default = aggregate_crawl_list_items[hash_key]["list_allintimes"]
    return default


def get_list_runtimes(hash_key, default=0):
    """List-crawl run count."""
    query_result = aggregate_count_items.get(hash_key)
    if query_result and "list" in query_result:
        default = query_result["list"]["runtimes"]
    return default


def get_list_count(hash_key, default=0):
    """Total list downloads."""
    query_result = aggregate_count_items.get(hash_key)
    if query_result and "list" in query_result:
        default = query_result["list"]["count"]
    return default


def get_list_rel_count(hash_key, default=0):
    """Total list documents actually stored."""
    query_result = aggregate_count_items.get(hash_key)
    if query_result and "list" in query_result:
        default = query_result["list"]["rel_count"]
    return default


def get_list_nodatatimes(hash_key, default=-1):
    """Times the list crawl returned no data (after filtering)."""
    if aggregate_crawl_list_items.get(hash_key):
        default = aggregate_crawl_list_items[hash_key]["list_nodatatimes"]
    return default


def get_detail_downloadnum(hash_key, default=0):
    """Detail-page download volume."""
    query_result = aggregate_count_items.get(hash_key)
    if query_result and "detail" in query_result:
        default = query_result["detail"]["count"]
    return default


get_detail_count = get_detail_downloadnum


def get_detail_downloadsuccessnum(hash_key, default=0):
    """Detail-page successful download volume."""
    query_result = aggregate_count_items.get(hash_key)
    if query_result and "detail" in query_result:
        default = query_result["detail"]["rel_count"]
    return default


get_detail_rel_count = get_detail_downloadsuccessnum


def get_detail_downloadfailnum(**kwargs):
    """Detail-page failed download volume."""
    count = -1
    if kwargs["detail_downloadnum"] >= 0 and kwargs["detail_downloadsuccessnum"] >= 0:
        count = kwargs["detail_downloadnum"] - kwargs["detail_downloadsuccessnum"]
    return count


def start_monitor():
    summary_queue = []
    spider_lst = get_spider_lst()
    for spider in spider_lst:
        site = spider["site"]
        channel = spider["channel"]
        spidercode = spider["code"]

        join_data = {**spider, "is_valid": False}  # baseline spider record
        hash_key = get_md5(site, channel, spidercode)
        query_result = aggregate_count_items.get(hash_key)
        if query_result:
            # aggregate query - crawl statistics
            join_data["frame"] = query_result["frame"]  # framework name
            join_data["spidercode_at_site_num"] = query_result["spidercode_at_site_num"]
            # aggregation - list crawl data
            join_data["list_isgetdata"] = get_list_isgetdata(hash_key)
            join_data["list_allintimes"] = get_list_allintimes(hash_key)
            join_data["list_runtimes"] = get_list_runtimes(hash_key)
            join_data["list_nodatatimes"] = get_list_nodatatimes(hash_key)
            join_data["list_count"] = get_list_count(hash_key)
            join_data["list_rel_count"] = get_list_rel_count(hash_key)
            # aggregation - detail crawl data
            join_data["detail_count"] = get_detail_count(hash_key)
            join_data["detail_rel_count"] = get_detail_rel_count(hash_key)
            join_data["detail_downloadnum"] = get_detail_downloadnum(hash_key)
            join_data["detail_downloadsuccessnum"] = get_detail_downloadsuccessnum(hash_key)
            join_data["detail_downloadfailnum"] = get_detail_downloadfailnum(**join_data)

        # is the daily statistics record valid?
        if join_data.get("frame") == "feapder":
            join_data["is_valid"] = True

        summary_queue.append(join_data)
        logger.info(f"{site} {channel} {spidercode} - statistics done")

    # write to the database
    save(summary_queue, spider_monitor)
    logger.info("spider monitor - daily statistics finished")


if __name__ == '__main__':
    start_monitor()
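
# Note (assumption about deployment): get_runtime() defaults to yesterday, so
# this script is presumably scheduled once per day (e.g. via cron) shortly
# after midnight to summarize the previous day's crawl.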