liumiaomiao 8 сар өмнө
parent
commit
fd2bea1f52

+ 4 - 4
tools/从mongo库导出数据execl/mongo_to_execl.py

@@ -21,7 +21,7 @@ def export_to_excel(db_name, collection_name, fields, output_file):
     projection = {field: 1 for field in fields}
 
     # 查询数据
-    data = collection.find({},projection)
+    data = collection.find({"flag":3},projection)
 
     # 将数据转换为DataFrame
     df = pd.DataFrame(list(data))
@@ -33,11 +33,11 @@ if __name__ == "__main__":
     # 连接到 MongoDB
 
     db_name = 'data_quality'  # 替换为你的数据库名称
-    collection_name = 'standard_sample_data_all' # 替换为你的集合名称
-    # collection_name = 'bidding_20241128_ai'  # 替换为你的集合名称
+    # collection_name = 'standard_sample_data_all' # 替换为你的集合名称
+    collection_name = 'bidding_20241128_ai'  # 替换为你的集合名称
     # 定义参数
 
-    fields = ['_id', 'site','toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipacket','href','jyhref']  # 替换为你需要导出的字段
+    fields = ['_id', 'site','toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipackage','href','jyhref']  # 替换为你需要导出的字段
     output_file = 'output.xlsx'
 
     # 调用函数导出数据

+ 132 - 0
tools/基于标准样本数据对比/compare_standard_result.py

@@ -0,0 +1,132 @@
+from pymongo import MongoClient
+from bson import ObjectId
+from lib.mogodb_helper import MongoDBInterface
+
+# Compare source 1 and source 2 against source 3 (the standard sample data) and report the results
+# MongoDB database
+MongodbConfig = {
+    "ip_port": "192.168.3.149:27180",
+    "db": "data_quality",
+}
+mdb = MongoDBInterface(MongodbConfig)
+
+max_id = ObjectId("0" * 24)
+# max_id = ObjectId("655ec5609aed6eb2ffa654ca")
+
+# columns_to_check = ['toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipackage']
+# Connect to the MongoDB database
+# Source 1 -- compare the standard library against the standard sample library
+def Compare(collection_name):
+    """Compare extracted records in ``collection_name`` with the standard samples.
+
+    Each document read from ``standard_sample_data`` is looked up by ``_id`` in
+    ``collection_name`` through ``mdb.find_by_id``.  Every compared field whose
+    value differs is reported with the record id, and a per-field mismatch
+    total is printed at the end.
+
+    Args:
+        collection_name: Name of the collection holding the data to check.
+
+    Returns:
+        A 1-tuple containing only the ``toptype`` mismatch count (kept as-is
+        for backward compatibility with the original return value).
+    """
+    # (field key, label used in the printed report), in report order.
+    field_labels = [
+        ("toptype", "标讯一级分类"),
+        ("subtype", "标讯二级分类"),
+        ("area", "省份"),
+        ("city", "城市"),
+        ("buyer", "采购单位"),
+        ("projectname", "项目名称"),
+        ("projectcode", "项目编号"),
+        ("budget", "预算"),
+        ("s_winner", "中标单位"),
+        ("bidamount", "中标金额"),
+        ("multipackage", "是否分包"),
+    ]
+    mismatch_counts = {field: 0 for field, _ in field_labels}
+
+    # Standard sample data source.
+    client = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore")
+    try:
+        coll = client.data_quality["standard_sample_data"]
+
+        # NOTE(review): the filter pins a single hard-coded _id, so only one
+        # sample is compared; the commented-out max_id range query looks like
+        # the intended full scan -- confirm before widening.
+        # for item in coll_user.find({"_id": {"$gte": max_id}}).sort("_id", 1):
+        for row in coll.find({"_id": ObjectId("659214ad66cf0db42a4a985c")}).sort("_id", 1):
+            _id = row.get("_id", "")
+            if not _id:
+                continue
+
+            result = mdb.find_by_id(collection_name, ObjectId(_id))
+            if result is None:
+                # _id is normally an ObjectId; convert explicitly so the
+                # concatenation cannot raise TypeError.
+                print(str(_id) + "在标准库没找到")
+                continue
+            if not result:
+                continue
+
+            extracted = {field: result.get(field, "") for field, _ in field_labels}
+            # A missing/empty multipackage flag counts as "not multi-package".
+            extracted["multipackage"] = extracted["multipackage"] or 0
+            # LLM-extracted data signals multi-package through com_package
+            # holding more than one value, so convert it to the 0/1 flag.
+            # NOTE(review): com_package is read from the sample row, not from
+            # the looked-up result -- confirm this is the intended source.
+            com_package = row.get("com_package", "")
+            if com_package:
+                extracted["multipackage"] = 1 if len(com_package) > 1 else 0
+
+            result_id = str(result.get("_id", ""))
+            for field, label in field_labels:
+                if row.get(field, "") != extracted[field]:
+                    mismatch_counts[field] += 1
+                    print(label + "不一致的id为:" + result_id)
+    finally:
+        # Release the per-call connection instead of leaking it.
+        client.close()
+
+    for field, label in field_labels:
+        print(f"{label}错误总数为 {mismatch_counts[field]}")
+    return mismatch_counts["toptype"],
+# Source 1: compare the rule-extracted data against the standard sample library
+print("规则抽取的数据和标准样本库做对比结果如下:")
+Compare("standard_sample_data_all")
+
+# Source 2: compare the LLM-extracted data against the standard sample library
+print("-----------------------------------------:")
+print("大模型抽取的数据和标准样本库做对比结果如下:")
+Compare("standard_sample_data_all_ai")

+ 30 - 0
tools/测试绩效相关/test_efficiency.py

@@ -0,0 +1,30 @@
+'''
+质量部测试效率计算
+计算公式=本项目提交缺陷数量/本项目的测试时间
+author:liumiaomiao
+'''
+from lib.mysql_tools import MysqlUtil
+from datetime import datetime, timedelta
+def test_efficiency():
+    """Count rows created in ``dwd_f_nzj_baseinfo`` during the last 7 days.
+
+    Opens a connection with ``MysqlUtil.connect_to_mysql``, runs a
+    parameterized ``COUNT(*)`` query bounded by ``createtime``, prints the
+    result and returns it.
+
+    Returns:
+        Whatever ``MysqlUtil.execute_sql`` returns for the count query
+        (presumably the row count -- verify against the helper).
+    """
+    # MySQL database connection settings (disabled alternative)
+    # mysql_db_config = {
+    #     'host': '192.168.3.149',
+    #     'port': 4000,
+    #     'user': 'datagroup',
+    #     'password': 'Dgrpdb#2024@36',
+    #     'database': 'jianyu_subjectdb',
+    #     'charset': 'utf8mb4'
+    # }
+
+    # Derive both bounds from one timestamp so the window is exactly 7 days.
+    now = datetime.now()
+    end_date = now.strftime("%Y-%m-%d %H:%M:%S")
+    start_date = (now - timedelta(days=7)).strftime("%Y-%m-%d %H:%M:%S")
+
+    # SQL query
+    mysql_query = "SELECT COUNT(*) FROM jianyu_subjectdb.dwd_f_nzj_baseinfo WHERE createtime >= %s AND createtime <= %s"
+    params = (start_date, end_date)
+    # NOTE(review): credentials are hard-coded and the connection targets the
+    # 'kanboard' database while the query reads jianyu_subjectdb -- confirm.
+    conn = MysqlUtil.connect_to_mysql(host='127.0.0.1', port='4001', user='kanboard', password='K99b3e9qa9d',
+                                      database='kanboard')
+    count = MysqlUtil.execute_sql(conn, mysql_query, params)
+    print("拟在建baseinfo-mysql每周统计入库数量", count)
+    return count

+ 5 - 1
tools/读execl数据入mongo库/execl_into_mongo.py

@@ -1,5 +1,6 @@
 import pandas as pd
 from pymongo import MongoClient
+from bson.objectid import ObjectId
 
 # 连接到 MongoDB
 client = MongoClient('mongodb://192.168.3.149:27180/')
@@ -13,9 +14,12 @@ excel_file_path = 'sample_data.xlsx'  # 替换为你的 Excel 文件路径
 df = pd.read_excel(excel_file_path, na_values=['', 'NA', 'N/A'], dtype={'budget': float, 'bidamount': float})
 
 # 指定需要检查的列名
-columns_to_check = ['_id', 'site','toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipacket','href','jyhref']  # 替换为你需要检查的列名
+columns_to_check = ['_id', 'site','toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipackage','href','jyhref']  # 替换为你需要检查的列名
 df[columns_to_check] = df[columns_to_check].where(pd.notnull(df[columns_to_check]), '')
 
+# 将 _id 列转换为 ObjectId 类型
+df['_id'] = df['_id'].apply(lambda x: ObjectId(str(x)) if x != '' else x)
+
 # 将 DataFrame 转换为字典列表
 data = df.to_dict(orient='records')