Browse Source

空值处理

lizhikun 1 year ago
parent
commit
2a5ba637a2
1 changed file with 13 additions and 23 deletions
  1. 13 23
      result_export.py

+ 13 - 23
result_export.py

@@ -44,33 +44,24 @@ print(data['信息二级分类'].value_counts(dropna=False))
 # 关闭MongoDB连接
 client.close()
 #  analyze_column 函数,处理 NaN 值
-def analyze_column(dataframe, column_name, special=False):
+def analyze_column(dataframe, column_name):
+    """Return (total, correct, error, accuracy, error_rate, error_reasons) for column_name."""
     if column_name not in dataframe.columns:
-        # 如果字段不存在,假设所有记录都是正确的
+        # Missing column: every record counts as correct, so there are no error reasons.
         total = len(dataframe)
         correct = total
         error = 0
-        accuracy = 1.0
-        error_rate = 0.0
-        error_reasons = pd.Series()
-    elif special:
-        # 特殊字段逻辑:存在且非空为错误
-        total = len(dataframe[column_name])
-        # 对于特殊字段,NaN 和空字典 {} 视为正确
-        correct = dataframe[column_name].apply(lambda x: pd.isna(x) or x == {}).sum()
-        error = total - correct
-        accuracy = correct / total
-        error_rate = error / total
-        error_reasons = dataframe[column_name].apply(
-            lambda x: x if x != {} and not pd.isna(x) else None).dropna().value_counts()
+        error_reasons = pd.Series(dtype=object)
     else:
-        # 常规字段逻辑
+        # NaN and the empty dict {} count as correct; any other value is an error reason.
         total = len(dataframe[column_name])
-        correct = dataframe[column_name].apply(lambda x: x == {}).sum()
+        correct = dataframe[column_name].apply(lambda x: pd.isna(x) or x == {}).sum()
         error = total - correct
-        accuracy = correct / total
-        error_rate = error / total
-        error_reasons = dataframe[column_name].apply(lambda x: x if x != {} else None).dropna().value_counts()
+        error_reasons = dataframe[column_name].apply(
+            lambda x: x if x != {} and not pd.isna(x) else None).dropna().value_counts()
+
+    accuracy = correct / total if total > 0 else 0
+    error_rate = error / total if total > 0 else 0
 
     return total, correct, error, accuracy, error_rate, error_reasons
 
@@ -123,13 +114,12 @@ fields_to_analyze = ["省份", "中标金额", "预算", "采购单位", "分包
 expanded_analysis_results = []
 
 for col in fields_to_analyze:
-    total, correct, error, accuracy, error_rate, error_reasons = analyze_column(data, col,col in ['信息一级分类', '信息二级分类'])
+    total, correct, error, accuracy, error_rate, error_reasons = analyze_column(data, col)
 
     chinese_name = column_name_mapping.get(col, col)
     reformatted_error_reasons = reformat_error_reasons_safe(error_reasons)
 
     for reason, count in reformatted_error_reasons.items():
-        # 将错误原因转换为字符串,并去除括号和引号
         reason = str(reason).replace('(', '').replace(',)', '').replace("'", '')
         expanded_analysis_results.append({
             '字段': chinese_name,
@@ -228,5 +218,5 @@ for sheet_name in temp_wb.sheetnames:
         target.append(row)
 
 # 保存最终的合并文件
-final_merged_file_path = '数据质量分析报告.xlsx'  # 最终合并文件的路径
+final_merged_file_path = '质量分析报告.xlsx'  # 最终合并文件的路径
 wb.save(final_merged_file_path)