@@ -0,0 +1,119 @@
+from datetime import datetime
+import pandas as pd
+from pymongo import MongoClient
+from openpyxl import load_workbook
+
+# MongoDB connection settings
+host = '172.20.45.129'
+port = 27002
+dbname = 'data_quality'
+collection_name = 'bid_analysis'
+
+# Create the MongoDB connection
+client = MongoClient(host, port)
+db = client[dbname]
+collection = db[collection_name]
+
+# Read data from MongoDB, keeping only records whose create_time equals 1740585600
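+# (1740585600 is a Unix timestamp: 2025-02-26 16:00 UTC, i.e. 2025-02-27 00:00 UTC+8)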
+query = {"create_time": 1740585600}
|
|
|
|
+data = pd.DataFrame(list(collection.find(query)))
|
|
|
|
+
|
|
|
|
+# Mapping from field names to Chinese display names (the '_qa' suffix is kept as-is)
+column_name_mapping = {
+    "area_qa": "省份",
+    "bidamount_qa": "中标金额",
+    "budget_qa": "预算",
+    "buyer_qa": "采购单位",
+    "com_package_qa": "分包",
+    "projectcode_qa": "项目编号",
+    "projectname_qa": "项目名称",
+    "title_qa": "标题",
+    "winner_qa": "中标单位",
+    "score": "标讯总分数",
+    "bidopentime_qa": "开标时间",
+    "publishtime_qa": "发布时间",
+    "toptype_qa": "信息一级分类",
+    "subtype_qa": "信息二级分类"
+}
+
+# Rename the columns to their Chinese display names
+data.rename(columns=column_name_mapping, inplace=True)
+
+# Fields to analyze (Chinese column names)
+qa_fields = ["标题", "项目名称", "中标单位", "项目编号", "采购单位", "中标金额", "省份", "分包"]
+
+# Only run the extraction when the error_type field exists
+if "error_type" in data.columns:
+    # Extract each field's error info; an empty dict {} means the field is correct, so map it to None
+    for qa_field in qa_fields:
+        # Map the Chinese column name back to its original English key (used to look up error_type)
+        english_field = next(k for k, v in column_name_mapping.items() if v == qa_field)
+        data[qa_field] = data['error_type'].apply(
+            lambda x: x.get(english_field) if isinstance(x, dict) and x.get(english_field) != {} else None
+        )
+
+    # Extract the overall bid score and coerce it to a numeric type
+    data['标讯总分数'] = data['error_type'].apply(
+        lambda x: x.get("score") if isinstance(x, dict) else None
+    )
+    data['标讯总分数'] = pd.to_numeric(data['标讯总分数'], errors='coerce')
+
+
+# Analyze one column: totals, accuracy, error rate and error reasons
+def analyze_column(dataframe, column_name):
+    if column_name not in dataframe.columns:
+        return 0, 0, 0, 0, 0, pd.Series(dtype='object')
+
+    field_series = dataframe[column_name]
+    total = len(field_series)
+    correct = field_series.isna().sum()  # NaN means correct (an empty dict {} was mapped to None above)
+    error = total - correct
+    accuracy = correct / total if total > 0 else 0
+    error_rate = error / total if total > 0 else 0
+    error_reasons = field_series.dropna().value_counts()
+
+    return total, correct, error, accuracy, error_rate, error_reasons
+
+
+# Collect the per-field results, one row per (field, error reason)
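+# Note: a field with no errors contributes no rows here, since its error_reasons Series is empty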
+expanded_analysis_results = []
+for qa_field in qa_fields:
+    total, correct, error, accuracy, error_rate, error_reasons = analyze_column(data, qa_field)
+    for reason, count in error_reasons.items():
+        expanded_analysis_results.append({
+            '字段': qa_field,
+            '总量': total,
+            '正确数量': correct,
+            '错误数量': error,
+            '正确率': f'{accuracy:.2%}',
+            '错误率': f'{error_rate:.2%}',
+            '错误原因': reason,
+            '错误次数': count
+        })
+
+# Convert the collected results to a DataFrame
+expanded_analysis_results_df = pd.DataFrame(expanded_analysis_results)
+
+# Score distribution and Excel export follow below
+
+
+# Score distribution analysis
+score_distribution_df = pd.DataFrame()
+if "标讯总分数" in data.columns:
+    score_counts = data['标讯总分数'].value_counts().sort_index()
+    total_scores = len(data['标讯总分数'])
+    score_percentages = (score_counts / total_scores).apply(lambda x: f'{x:.2%}')
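+    # Note: total_scores counts rows with a missing score too, so the percentages may sum to less than 100%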
+    score_distribution_df = pd.DataFrame({
+        '分数': score_counts.index,
+        '数量': score_counts.values,
+        '占比': score_percentages
+    })
+
+# Write the results to Excel
+with pd.ExcelWriter('质量分析报告.xlsx', engine='openpyxl') as writer:
+    expanded_analysis_results_df.to_excel(writer, sheet_name='字段分析结果', index=False)
+    if not score_distribution_df.empty:
+        score_distribution_df.to_excel(writer, sheet_name='分数分析结果', index=False)