8 сар өмнө · fd2bea1f52
--- a/tools/从mongo库导出数据execl/mongo_to_execl.py
+++ b/tools/从mongo库导出数据execl/mongo_to_execl.py
@@ -21,7 +21,7 @@ def export_to_excel(db_name, collection_name, fields, output_file):
 
				     projection = {field: 1 for field in fields}
			
 
				 
			
 
				     # 查询数据
			
 
				-    data = collection.find({},projection)
			
 
				+    data = collection.find({"flag":3},projection)
			
 
				 
			
 
				     # 将数据转换为DataFrame
			
 
				     df = pd.DataFrame(list(data))
			
@@ -33,11 +33,11 @@ if __name__ == "__main__":
 
				     # 连接到 MongoDB
			
 
				 
			
 
				     db_name = 'data_quality'  # 替换为你的数据库名称
			
 
				-    collection_name = 'standard_sample_data_all' # 替换为你的集合名称
			
 
				-    # collection_name = 'bidding_20241128_ai'  # 替换为你的集合名称
			
 
				+    # collection_name = 'standard_sample_data_all' # 替换为你的集合名称
			
 
				+    collection_name = 'bidding_20241128_ai'  # 替换为你的集合名称
			
 
				     # 定义参数
			
 
				 
			
 
				-    fields = ['_id', 'site','toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipacket','href','jyhref']  # 替换为你需要导出的字段
			
 
				+    fields = ['_id', 'site','toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipackage','href','jyhref']  # 替换为你需要导出的字段
			
 
				     output_file = 'output.xlsx'
			
 
				 
			
 
				     # 调用函数导出数据
			
--- a/tools/基于标准样本数据对比/compare_standard_result.py
+++ b/tools/基于标准样本数据对比/compare_standard_result.py
@@ -0,0 +1,132 @@
 
				+from pymongo import MongoClient
			
 
				+from bson import ObjectId
			
 
				+from lib.mogodb_helper import MongoDBInterface
			
 
				+
			
 
				# Compare source 1 and source 2 against source 3 (the standard sample data)
				# and report the results.

				# MongoDB connection settings handed to the project's MongoDBInterface helper.
				MongodbConfig = dict(
				    ip_port="192.168.3.149:27180",
				    db="data_quality",
				)
				mdb = MongoDBInterface(MongodbConfig)

				# All-zero ObjectId (24 hex zeros). In the visible part of this file it is
				# only referenced by commented-out code.
				max_id = ObjectId("0" * 24)
			
 
				+# max_id = ObjectId("655ec5609aed6eb2ffa654ca")
			
 
				+
			
 
				+# columns_to_check = ['toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipackage']
			
 
				+# 连接MongoDB数据库
			
 
				+#源1--标准库 和 标准样本库对比
			
 
				def Compare(collection_name, sample_filter=None):
				    """Compare extracted records in ``collection_name`` with the standard samples.

				    Reads documents from the ``standard_sample_data`` collection of the
				    ``data_quality`` database, looks each one up by ``_id`` through
				    ``mdb.find_by_id(collection_name, ...)``, compares eleven fields, prints
				    the id of every mismatching record and finally prints the per-field
				    mismatch totals.

				    Args:
				        collection_name: Name of the collection that holds the extracted data
				            to be checked.
				        sample_filter: Optional MongoDB filter selecting which standard sample
				            documents are compared. When omitted, the single-document filter
				            that used to be hard-coded here is applied, so existing callers
				            behave as before.

				    Returns:
				        A 1-tuple holding the ``toptype`` mismatch count.
				    """
				    # (document key, label used in the printed messages), in report order.
				    fields = (
				        ("toptype", "标讯一级分类"),
				        ("subtype", "标讯二级分类"),
				        ("area", "省份"),
				        ("city", "城市"),
				        ("buyer", "采购单位"),
				        ("projectname", "项目名称"),
				        ("projectcode", "项目编号"),
				        ("budget", "预算"),
				        ("s_winner", "中标单位"),
				        ("bidamount", "中标金额"),
				        ("multipackage", "是否分包"),
				    )
				    mismatches = {key: 0 for key, _ in fields}

				    if sample_filter is None:
				        # NOTE(review): this single-id default looks like a debugging leftover;
				        # pass e.g. {"_id": {"$gte": max_id}} to compare the whole sample set.
				        sample_filter = {"_id": ObjectId("659214ad66cf0db42a4a985c")}

				    # Standard sample data source. The client is closed in ``finally`` so
				    # repeated calls do not accumulate open connections.
				    client = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore")
				    try:
				        coll = client.data_quality["standard_sample_data"]
				        for row in coll.find(sample_filter).sort("_id", 1):
				            _id = row.get("_id", "")
				            if not _id:
				                continue

				            result = mdb.find_by_id(collection_name, ObjectId(_id))
				            if result is None:
				                # str() is required: the id is an ObjectId, and ObjectId + str
				                # raises TypeError exactly when a record is missing.
				                print(str(_id) + "在标准库没找到")
				            if not result:
				                continue

				            extracted = {key: result.get(key, "") for key, _ in fields}

				            # Per the original author's note, LLM-extracted data signals a
				            # multi-package notice through ``com_package`` holding more than
				            # one value, so the flag is converted before comparing.
				            # NOTE(review): ``com_package`` is read from the standard sample
				            # row, not from ``result`` - confirm which side should supply it.
				            # NOTE(review): only the extracted side is normalised to 0, so a
				            # sample row without ``multipackage`` ("") never equals 0 - confirm.
				            package_flag = extracted["multipackage"] or 0
				            com_package = row.get("com_package", "")
				            if com_package:
				                package_flag = 1 if len(com_package) > 1 else 0
				            extracted["multipackage"] = package_flag

				            result_id = str(result.get("_id", ""))
				            for key, label in fields:
				                if row.get(key, "") != extracted[key]:
				                    mismatches[key] += 1
				                    print(label + "不一致的id为：" + result_id)
				    finally:
				        client.close()

				    for key, label in fields:
				        print(f"{label}错误总数为 {mismatches[key]}")

				    # NOTE(review): the original ended with ``return toptype_num,`` (trailing
				    # comma); the 1-tuple shape is kept so any caller unpacking it still works.
				    return (mismatches["toptype"],)
			
 
				# Run the comparison twice against the standard sample collection:
				#   source 1 - rule-extracted data, source 2 - LLM-extracted data.
				# Each entry is (lines printed before the run, collection to compare).
				for _banner_lines, _collection in (
				    (("规则抽取的数据和标准样本库做对比结果如下：",), "standard_sample_data_all"),
				    (
				        ("-----------------------------------------：", "大模型抽取的数据和标准样本库做对比结果如下："),
				        "standard_sample_data_all_ai",
				    ),
				):
				    for _line in _banner_lines:
				        print(_line)
				    Compare(_collection)
			
--- a/tools/测试绩效相关/test_efficiency.py
+++ b/tools/测试绩效相关/test_efficiency.py
@@ -0,0 +1,30 @@
 
				+'''
			
 
				+质量部测试效率计算
			
 
				+计算公式=本项目提交缺陷数量/本项目的测试时间
			
 
				+author：liumiaomiao
			
 
				+'''
			
 
				+from lib.mysql_tools import MysqlUtil
			
 
				+from datetime import datetime, timedelta
			
 
				def test_efficiency():
				    """Count rows created during the last seven days in ``dwd_f_nzj_baseinfo``.

				    Builds a ``[now - 7 days, now]`` window formatted as
				    ``%Y-%m-%d %H:%M:%S`` strings, runs a parameterised ``COUNT(*)`` query
				    through ``MysqlUtil``, prints the result and returns it.

				    Returns:
				        Whatever ``MysqlUtil.execute_sql`` returns for the ``COUNT(*)`` query.
				    """
				    # Read the clock once so both bounds derive from the same instant and the
				    # window is exactly seven days wide.
				    window_end = datetime.now()
				    window_start = window_end - timedelta(days=7)
				    timestamp_format = "%Y-%m-%d %H:%M:%S"
				    params = (
				        window_start.strftime(timestamp_format),
				        window_end.strftime(timestamp_format),
				    )

				    # SQL query; the bounds are passed as parameters, not interpolated.
				    mysql_query = "SELECT COUNT(*) FROM jianyu_subjectdb.dwd_f_nzj_baseinfo WHERE createtime >= %s AND createtime <= %s"

				    # NOTE(review): the connection targets the ``kanboard`` database while the
				    # query reads ``jianyu_subjectdb`` - confirm this is intended.
				    # NOTE(review): ``port`` is passed as a string; confirm the helper accepts it.
				    # NOTE(review): credentials are hard-coded; consider loading them from
				    # configuration or the environment.
				    conn = MysqlUtil.connect_to_mysql(host='127.0.0.1', port='4001', user='kanboard', password='K99b3e9qa9d',
				                                      database='kanboard')
				    count = MysqlUtil.execute_sql(conn, mysql_query, params)
				    print("拟在建baseinfo-mysql每周统计入库数量", count)
				    return count
			
--- a/tools/读execl数据入mongo库/execl_into_mongo.py
+++ b/tools/读execl数据入mongo库/execl_into_mongo.py
@@ -1,5 +1,6 @@
 
				 import pandas as pd
			
 
				 from pymongo import MongoClient
			
 
				+from bson.objectid import ObjectId
			
 
				 
			
 
				 # 连接到 MongoDB
			
 
				 client = MongoClient('mongodb://192.168.3.149:27180/')
			
@@ -13,9 +14,12 @@ excel_file_path = 'sample_data.xlsx'  # 替换为你的 Excel 文件路径
 
				 df = pd.read_excel(excel_file_path, na_values=['', 'NA', 'N/A'], dtype={'budget': float, 'bidamount': float})
			
 
				 
			
 
				 # 指定需要检查的列名
			
 
				-columns_to_check = ['_id', 'site','toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipacket','href','jyhref']  # 替换为你需要检查的列名
			
 
				+columns_to_check = ['_id', 'site','toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipackage','href','jyhref']  # 替换为你需要检查的列名
			
 
				 df[columns_to_check] = df[columns_to_check].where(pd.notnull(df[columns_to_check]), '')
			
 
				 
			
 
				+# 将 _id 列转换为 ObjectId 类型
			
 
				+df['_id'] = df['_id'].apply(lambda x: ObjectId(str(x)) if x != '' else x)
			
 
				+
			
 
				 # 将 DataFrame 转换为字典列表
			
 
				 data = df.to_dict(orient='records')