|
@@ -44,33 +44,24 @@ print(data['信息二级分类'].value_counts(dropna=False))
|
|
|
# 关闭MongoDB连接
|
|
|
client.close()
|
|
|
# analyze_column 函数,处理 NaN 值
|
|
|
def analyze_column(dataframe, column_name):
    """Compute data-quality statistics for one column of *dataframe*.

    A cell is considered "correct" when it is NaN or an empty dict ``{}``;
    any other value counts as an error and its occurrences are tallied as
    error reasons.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The data to analyze.
    column_name : str
        Name of the column to check; it may be absent from the frame.

    Returns
    -------
    tuple
        ``(total, correct, error, accuracy, error_rate, error_reasons)``
        where ``error_reasons`` is a ``pd.Series`` mapping each offending
        value to its count (empty when the column does not exist).
    """
    if column_name not in dataframe.columns:
        # Missing column: treat every record as correct.
        total = len(dataframe)
        correct = total
        error = 0
        # BUG FIX: the column cannot be indexed when it does not exist, so
        # computing error_reasons from dataframe[column_name] here would
        # raise KeyError. Return an empty Series for this branch instead.
        error_reasons = pd.Series(dtype=object)
    else:
        # NaN and empty dict {} are correct; everything else is an error.
        total = len(dataframe[column_name])
        correct = dataframe[column_name].apply(lambda x: pd.isna(x) or x == {}).sum()
        error = total - correct

        # Collect the offending (non-NaN, non-{}) values as error reasons.
        error_reasons = dataframe[column_name].apply(
            lambda x: x if x != {} and not pd.isna(x) else None).dropna().value_counts()

    # Guard against division by zero on an empty frame/column.
    accuracy = correct / total if total > 0 else 0
    error_rate = error / total if total > 0 else 0

    return total, correct, error, accuracy, error_rate, error_reasons
|
|
|
|
|
@@ -123,13 +114,12 @@ fields_to_analyze = ["省份", "中标金额", "预算", "采购单位", "分包
|
|
|
expanded_analysis_results = []
|
|
|
|
|
|
for col in fields_to_analyze:
|
|
|
- total, correct, error, accuracy, error_rate, error_reasons = analyze_column(data, col,col in ['信息一级分类', '信息二级分类'])
|
|
|
+ total, correct, error, accuracy, error_rate, error_reasons = analyze_column(data, col)
|
|
|
|
|
|
chinese_name = column_name_mapping.get(col, col)
|
|
|
reformatted_error_reasons = reformat_error_reasons_safe(error_reasons)
|
|
|
|
|
|
for reason, count in reformatted_error_reasons.items():
|
|
|
- # 将错误原因转换为字符串,并去除括号和引号
|
|
|
reason = str(reason).replace('(', '').replace(',)', '').replace("'", '')
|
|
|
expanded_analysis_results.append({
|
|
|
'字段': chinese_name,
|
|
@@ -228,5 +218,5 @@ for sheet_name in temp_wb.sheetnames:
|
|
|
target.append(row)
|
|
|
|
|
|
# 保存最终的合并文件
|
|
|
-final_merged_file_path = '数据质量分析报告.xlsx' # 最终合并文件的路径
|
|
|
+final_merged_file_path = '质量分析报告.xlsx' # 最终合并文件的路径
|
|
|
wb.save(final_merged_file_path)
|