|
@@ -0,0 +1,132 @@
|
|
|
+from pymongo import MongoClient
|
|
|
+from bson import ObjectId
|
|
|
+from lib.mogodb_helper import MongoDBInterface
|
|
|
+
|
|
|
+#源1和源2,与源3(标准样本数据)对比,出结果
|
|
|
# MongoDB connection settings handed to the project's MongoDBInterface helper.
MongodbConfig = dict(
    ip_port="192.168.3.149:27180",
    db="data_quality",
)

# Shared helper handle; Compare() uses it to look candidate records up by _id.
mdb = MongoDBInterface(MongodbConfig)

# All-zero ObjectId (24 zero hex digits).
# NOTE(review): not referenced by the code visible in this file; it looks like
# a lower bound for an `_id >= max_id` range scan -- confirm before removing.
# A previously used alternative value was ObjectId("655ec5609aed6eb2ffa654ca").
max_id = ObjectId("000000000000000000000000")
|
|
|
+
|
|
|
+# columns_to_check = ['toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipackage']
|
|
|
+# 连接MongoDB数据库
|
|
|
+#源1--标准库 和 标准样本库对比
|
|
|
# (document key, report label) pairs, in the order in which mismatches and
# totals are printed.
_COMPARED_FIELDS = (
    ("toptype", "标讯一级分类"),
    ("subtype", "标讯二级分类"),
    ("area", "省份"),
    ("city", "城市"),
    ("buyer", "采购单位"),
    ("projectname", "项目名称"),
    ("projectcode", "项目编号"),
    ("budget", "预算"),
    ("s_winner", "中标单位"),
    ("bidamount", "中标金额"),
    ("multipackage", "是否分包"),
)


def _normalize_multipackage(multipackage, com_package):
    """Return the candidate's multi-package flag after the com_package override.

    A falsy ``multipackage`` becomes ``0``.  When ``com_package`` is non-empty
    it takes precedence: more than one entry yields ``1``, otherwise ``0``.
    """
    flag = multipackage if multipackage else 0
    if com_package:
        flag = 1 if len(com_package) > 1 else 0
    return flag


def Compare(collection_name):
    """Compare candidate records against the standard sample data and report.

    Reads rows from the ``standard_sample_data`` collection of the
    ``data_quality`` database, looks each one up by ``_id`` in
    ``collection_name`` through the module-level ``mdb`` helper, and counts,
    per field, how many records differ from the sample.  Each mismatch and the
    per-field totals are printed to stdout.

    Args:
        collection_name: Name of the collection holding the candidate records
            to be checked against the sample.

    Returns:
        A one-element tuple ``(toptype_mismatch_count,)``.
        NOTE(review): the trailing comma in the original ``return`` suggests a
        longer tuple was intended; the shape is kept for backward
        compatibility.
    """
    mismatch_counts = {field: 0 for field, _ in _COMPARED_FIELDS}

    # The context manager closes the client (and its connection pool) even if
    # the comparison raises; the original never closed it.
    with MongoClient(
        "192.168.3.149", 27180, unicode_decode_error_handler="ignore"
    ) as client:
        coll = client.data_quality["standard_sample_data"]

        # NOTE(review): the query is pinned to one hard-coded sample _id, which
        # looks like a debugging leftover -- confirm before widening it.
        query = {"_id": ObjectId("659214ad66cf0db42a4a985c")}
        for row in coll.find(query).sort("_id", 1):
            _id = row.get("_id", "")
            if not _id:
                continue

            result = mdb.find_by_id(collection_name, ObjectId(_id))
            if result is None:
                # str() is required: _id is an ObjectId, and ObjectId + str
                # raises TypeError.
                print(str(row["_id"]) + "在标准库没找到")
                continue
            if not result:
                continue

            candidate = {
                field: result.get(field, "") for field, _ in _COMPARED_FIELDS
            }
            # Multi-package is derived from whether com_package holds more
            # than one value, so the candidate flag is normalized first.
            # NOTE(review): com_package is read from the sample row, not from
            # the candidate record -- confirm this is intended.
            candidate["multipackage"] = _normalize_multipackage(
                candidate["multipackage"], row.get("com_package", "")
            )

            for field, label in _COMPARED_FIELDS:
                if row.get(field, "") != candidate[field]:
                    mismatch_counts[field] += 1
                    print(f"{label}不一致的id为:" + str(result.get("_id", "")))

    for field, label in _COMPARED_FIELDS:
        print(f"{label}错误总数为 {mismatch_counts[field]}")

    return mismatch_counts["toptype"],
|
|
|
# Run the comparison twice against the standard sample data:
#   source 1 -- rule-extracted data, source 2 -- LLM-extracted data.
# Each entry is (banner lines printed before the run, collection to check).
for banner_lines, candidate_collection in (
    (
        ("规则抽取的数据和标准样本库做对比结果如下:",),
        "standard_sample_data_all",
    ),
    (
        (
            "-----------------------------------------:",
            "大模型抽取的数据和标准样本库做对比结果如下:",
        ),
        "standard_sample_data_all_ai",
    ),
):
    for banner in banner_lines:
        print(banner)
    Compare(candidate_collection)
|