Ver código fonte

测试示例

lizhikun 2 anos atrás
pai
commit
5737091a4c
1 arquivo alterado com 66 adições e 0 exclusões
  1. 66 0
      Dataquality/test.py

+ 66 - 0
Dataquality/test.py

@@ -0,0 +1,66 @@
+import pandas as pd
+import cpca
+import matplotlib.pyplot as plt
+import openpyxl
+
+# Data-quality check: compare the province cpca extracts from each row's
+# "title" with the province it extracts from the same row's "area".
+filename = 'C:\\Users\\25503\\PycharmProjects\\pythonProject\\dataqa\\2023-07-08.xlsx'
+
+df = pd.read_excel(filename)
+titles = df['title'].tolist()
+areas = df['area'].tolist()
+
+provinces = []
+correct_flags = []
+# Pair title and area by row position. Looking the area up by title text
+# returned the first matching row's area for every duplicated title and
+# rescanned the whole frame on each iteration.
+for title, area in zip(titles, areas):
+    df_temp = cpca.transform([title])
+    province = df_temp['省'][0]
+    provinces.append(province)
+    area_province = cpca.transform([area])['省'][0]
+    correct = province == area_province
+    correct_flags.append(correct)
+
+    # Print the per-row labelling trace.
+    print(f"Title: {title}")
+    print(f"Predicted Province: {province}")
+    print(f"Area: {area}")
+    print(f"Area Province: {area_province}")
+    print(f"Correct: {correct}")
+    print("-" * 30)
+
+df['predicted_province'] = provinces
+df['correct'] = correct_flags
+
+# Compute the accuracy and error rate (0.0 for an empty sheet, avoiding 0/0).
+total_count = len(df)
+correct_count = df['correct'].sum()
+error_count = total_count - correct_count
+accuracy = correct_count / total_count if total_count else 0.0
+error_rate = error_count / total_count if total_count else 0.0
+
+print(f"Total Count: {total_count}")
+print(f"Correct Count: {correct_count}")
+print(f"Error Count: {error_count}")
+print(f"Accuracy: {accuracy}")
+print(f"Error Rate: {error_rate}")
+
+# Draw a bar chart of correct vs. error counts.
+labels = ['Correct', 'Error']
+counts = [correct_count, error_count]
+
+plt.bar(labels, counts)
+plt.xlabel('Result')
+plt.ylabel('Count')
+plt.title('Result Analysis')
+plt.show()
+
+# Export the labelled data to an Excel file.
+output_filename = 'data.xlsx'
+
+# The 'with' statement saves and closes the workbook automatically.
+with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
+    df.to_excel(writer, index=False)