
Coverage tooling: data timeliness statistics

jialuyao, 1 week ago
Parent
Commit bfb8ae4389
2 files changed, with 210 additions and 0 deletions
  1. tools/联通项目/data_timeliness.py (+138, -0)
  2. tools/联通项目/extract_site_es.py (+72, -0)

+ 138 - 0
tools/联通项目/data_timeliness.py

@@ -0,0 +1,138 @@
+from pymongo import MongoClient
+import pymysql
+from datetime import datetime, timedelta
+import pandas as pd
+
+
+def data_timeliness_analysis():
+    mongo_client = None
+    mysql_conn = None
+    try:
+        # ==================== MongoDB connection config ====================
+        mongo_client = MongoClient(
+            'mongodb://127.0.0.1:27087/',
+            unicode_decode_error_handler="ignore",
+            directConnection=True
+        )
+        mongo_db = mongo_client["jyqyfw"]
+        collection = mongo_db["usermail"]
+
+        # ==================== MySQL connection config ====================
+        mysql_conn = pymysql.connect(
+            host='172.20.45.129',
+            port=4000,
+            user='root',  # replace with the actual username
+            password='=PDT49#80Z!RVv52_z',  # replace with the actual password
+            database='quality',
+            charset='utf8mb4'
+        )
+
+        # ==================== Statistics logic ====================
+        # Time range: the previous full hour
+        now = datetime.now()
+        hour_start = (now - timedelta(hours=1)).replace(minute=0, second=0, microsecond=0)
+
+        # Build the query
+        query = {
+            "appid": "jyGQ1XQQsEAwNeSENOFR9D",
+            "createtime": {
+                "$gte": int(hour_start.timestamp()),
+                "$lt": int(now.replace(minute=0, second=0, microsecond=0).timestamp())
+            }
+        }
+
+        # Initialize the result counters
+        stats = {
+            'create_time': now.strftime('%Y-%m-%d %H:%M:%S'),
+            'total_count': 0,
+            'less_than_2h_count': 0,
+            'less_than_2h_ratio': 0.0,
+            '2h_to_3h_count': 0,
+            '2h_to_3h_ratio': 0.0,
+            '3h_to_4h_count': 0,
+            '3h_to_4h_ratio': 0.0,
+            '4h_to_8h_count': 0,
+            '4h_to_8h_ratio': 0.0,
+            'more_than_8h_count': 0,
+            'more_than_8h_ratio': 0.0
+        }
+
+        # Run the count query
+        total_count = collection.count_documents(query)
+        stats['total_count'] = total_count
+
+        if total_count > 0:
+            # Bucket each document by its latency
+            for doc in collection.find(query, {"createtime": 1, "publishtime": 1, "_id": 0}):
+                hours_diff = abs(doc["createtime"] - doc["publishtime"]) / 3600
+
+                if hours_diff <= 2:
+                    stats['less_than_2h_count'] += 1
+                elif 2 < hours_diff <= 3:
+                    stats['2h_to_3h_count'] += 1
+                elif 3 < hours_diff <= 4:
+                    stats['3h_to_4h_count'] += 1
+                elif 4 < hours_diff <= 8:
+                    stats['4h_to_8h_count'] += 1
+                else:
+                    stats['more_than_8h_count'] += 1
+
+            # Compute ratios (2 decimal places)
+            stats['less_than_2h_ratio'] = round(stats['less_than_2h_count'] / total_count * 100, 2)
+            stats['2h_to_3h_ratio'] = round(stats['2h_to_3h_count'] / total_count * 100, 2)
+            stats['3h_to_4h_ratio'] = round(stats['3h_to_4h_count'] / total_count * 100, 2)
+            stats['4h_to_8h_ratio'] = round(stats['4h_to_8h_count'] / total_count * 100, 2)
+            stats['more_than_8h_ratio'] = round(stats['more_than_8h_count'] / total_count * 100, 2)
+
+        # ==================== Generate the Excel report ====================
+        excel_data = {
+            "创建时间": [stats['create_time']],
+            "总量": [stats['total_count']],
+            "<=2小时数量": [stats['less_than_2h_count']],
+            "<=2小时占比": [f"{stats['less_than_2h_ratio']}%"],
+            "2-3小时数量": [stats['2h_to_3h_count']],
+            "2-3小时占比": [f"{stats['2h_to_3h_ratio']}%"],
+            "3-4小时数量": [stats['3h_to_4h_count']],
+            "3-4小时占比": [f"{stats['3h_to_4h_ratio']}%"],
+            "4-8小时数量": [stats['4h_to_8h_count']],
+            "4-8小时占比": [f"{stats['4h_to_8h_ratio']}%"],
+            ">8小时数量": [stats['more_than_8h_count']],
+            ">8小时占比": [f"{stats['more_than_8h_ratio']}%"]
+        }
+        pd.DataFrame(excel_data).to_excel("数据时效统计表.xlsx", index=False)
+
+        # ==================== Insert into MySQL ====================
+        with mysql_conn.cursor() as cursor:
+            sql = """
+            INSERT INTO data_timeliness_liantong (
+                create_time, total_count,
+                less_than_2h_count, less_than_2h_ratio,
+                `2h_to_3h_count`, `2h_to_3h_ratio`,
+                `3h_to_4h_count`, `3h_to_4h_ratio`,
+                `4h_to_8h_count`, `4h_to_8h_ratio`,
+                more_than_8h_count, more_than_8h_ratio
+            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+            """
+            cursor.execute(sql, (
+                stats['create_time'], stats['total_count'],
+                stats['less_than_2h_count'], stats['less_than_2h_ratio'],
+                stats['2h_to_3h_count'], stats['2h_to_3h_ratio'],
+                stats['3h_to_4h_count'], stats['3h_to_4h_ratio'],
+                stats['4h_to_8h_count'], stats['4h_to_8h_ratio'],
+                stats['more_than_8h_count'], stats['more_than_8h_ratio']
+            ))
+        mysql_conn.commit()
+
+        print("✅ 操作完成:")
+        print(f"1. Excel报表已生成(数据时效统计表.xlsx)")
+        print(f"2. 数据已插入MySQL(quality.data_timeliness_liantong)")
+        print(f"📊 统计时段:{hour_start.strftime('%H:%M')} - {now.strftime('%H:%M')}")
+        print(f"📝 统计结果:{stats}")
+
+    except Exception as e:
+        print(f"❌ 操作失败: {type(e).__name__}: {str(e)}")
+    finally:
+        if mongo_client:
+            mongo_client.close()
+        if mysql_conn:
+            mysql_conn.close()
+
+
+if __name__ == '__main__':
+    data_timeliness_analysis()
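
The INSERT above assumes that a quality.data_timeliness_liantong table already exists. Below is a minimal sketch of a compatible schema created through pymysql; the column names are taken from the INSERT statement, while the types (DATETIME, INT, DECIMAL), the surrogate key, and the placeholder credentials are assumptions rather than the production schema.

import pymysql

# Sketch only: a table layout compatible with the INSERT in data_timeliness_analysis().
# Column names come from the INSERT statement; the types and the surrogate key are assumed.
DDL = """
CREATE TABLE IF NOT EXISTS data_timeliness_liantong (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    create_time DATETIME,
    total_count INT,
    less_than_2h_count INT,
    less_than_2h_ratio DECIMAL(5, 2),
    `2h_to_3h_count` INT,
    `2h_to_3h_ratio` DECIMAL(5, 2),
    `3h_to_4h_count` INT,
    `3h_to_4h_ratio` DECIMAL(5, 2),
    `4h_to_8h_count` INT,
    `4h_to_8h_ratio` DECIMAL(5, 2),
    more_than_8h_count INT,
    more_than_8h_ratio DECIMAL(5, 2)
) DEFAULT CHARSET = utf8mb4
"""

conn = pymysql.connect(host='172.20.45.129', port=4000, user='root',
                       password='<password>',  # same credentials as the script above
                       database='quality', charset='utf8mb4')
try:
    with conn.cursor() as cursor:
        cursor.execute(DDL)
    conn.commit()
finally:
    conn.close()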

+ 72 - 0
tools/联通项目/extract_site_es.py

@@ -0,0 +1,72 @@
+from elasticsearch import Elasticsearch
+import urllib3
+from datetime import datetime
+import pandas as pd
+from tqdm import tqdm  # progress bar
+
+# Disable SSL warnings
+urllib3.disable_warnings()
+timeout = 180
+# Connection configuration (parameters taken from the reference screenshot)
+es_host = "http://127.0.0.1:19800"
+es_username = "jianyuGr"
+es_password = "we3g8glKfe#"
+
+def query_time_range_data():
+    try:
+        # Create the ES client
+        es = Elasticsearch(es_host, http_auth=(es_username, es_password), request_timeout=timeout)
+        # Verify the connection
+        if not es.ping():
+            raise ConnectionError("无法连接到ES集群")
+
+        # Initial query
+        query = {
+            "query": {
+                "range": {
+                    "publishtime": {
+                        # "gte": 1752422400,
+                        # "lte": 1753027199
+                        "gte": 1751299200,
+                        "lte": 1753372800
+                    }
+                }
+            },
+            "_source": ["site"],  # 只获取site字段
+            "size": 1000  # 每批获取量
+        }
+
+        # Initialize the scroll
+        resp = es.search(
+            index="bidding_v1",
+            scroll='2m',  # keep the scroll context alive for 2 minutes
+            body=query
+        )
+        scroll_id = resp['_scroll_id']
+        total = resp['hits']['total']['value']  # total hit count
+        print(f"📊 符合条件的数据总量: {total}条")
+
+        # Process the batches with a progress bar
+        data = []
+        with tqdm(total=total, desc="🔄 滚动搜索进度") as pbar:
+            while len(resp['hits']['hits']):
+                # Process the current batch
+                data.extend([hit['_source']['site'] for hit in resp['hits']['hits'] if 'site' in hit['_source']])
+                pbar.update(len(resp['hits']['hits']))
+
+                # Fetch the next batch
+                resp = es.scroll(scroll_id=scroll_id, scroll='2m')
+                scroll_id = resp['_scroll_id']
+
+        # Release the scroll context now that all batches have been consumed
+        es.clear_scroll(scroll_id=scroll_id)
+
+        # Deduplicate and export
+        unique_sites = list(set(data))
+        pd.DataFrame(unique_sites, columns=["site"]).to_excel("unique_sites.xlsx", index=False)
+        print(f"✅ 完成!去重后获得 {len(unique_sites)} 条唯一site数据")
+
+    except Exception as e:
+        print(f"❌ 错误: {str(e)}")
+        raise
+
+
+if __name__ == '__main__':
+    query_time_range_data()
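
As an alternative to the manual scroll loop above, elasticsearch-py ships a helpers.scan generator that renews the scroll and clears the scroll context once it is exhausted. Below is a minimal sketch of the same site extraction with it, reusing the connection parameters, index name, and publishtime range from the script above.

from elasticsearch import Elasticsearch, helpers
import pandas as pd

# Sketch only: the same extraction via helpers.scan, which handles scroll
# renewal and clears the scroll context when the generator is exhausted.
es = Elasticsearch("http://127.0.0.1:19800",
                   http_auth=("jianyuGr", "we3g8glKfe#"),
                   request_timeout=180)

scan_query = {
    "query": {"range": {"publishtime": {"gte": 1751299200, "lte": 1753372800}}},
    "_source": ["site"],
}

# Collect unique site values directly into a set
sites = {
    hit["_source"]["site"]
    for hit in helpers.scan(es, index="bidding_v1", query=scan_query, scroll="2m", size=1000)
    if "site" in hit["_source"]
}
pd.DataFrame(sorted(sites), columns=["site"]).to_excel("unique_sites.xlsx", index=False)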