8 miesięcy temu · 3de5dd73c6
--- a/tools/从mongo导出数据/mongo_into_mongo.py
+++ b/tools/从mongo导出数据/mongo_into_mongo.py
@@ -9,7 +9,7 @@ MongodbConfig = {
 
				     "ip_port": "127.0.0.1:27088",
			
 
				     "user": "viewdata",
			
 
				     "password": "viewdata",
			
 
				-    "db": "qfw_ai",
			
 
				+    "db": "qfw",
			
 
				 }
			
 
				 
			
 
				 mdb = MongoDBInterface(MongodbConfig)
			
@@ -38,7 +38,8 @@ with MongoClient('192.168.3.149', 27180) as client:
 
				             if result==None:
			
 
				                 print(row["_id"]+"在大库没找到")
			
 
				             if result:
			
 
				-                result["_id"] = str(row["_id"])
			
 
				+                result["_id"] = ObjectId(row["_id"])
			
 
				+                insertdb.insert2db("standard_sample_data_all_ai",result)
			
 
				                 insertdb.insert2db("standard_sample_data_all",result)
			
 
				 
			
 
				 # 关闭数据库连接
			
--- a/tools/从mongo库导出数据execl/mongo_to_execl.py
+++ b/tools/从mongo库导出数据execl/mongo_to_execl.py
@@ -0,0 +1,46 @@
 
				+# 导入必要的库
			
 
				+from pymongo import MongoClient
			
 
				+import pandas as pd
			
 
				+
			
 
				+def export_to_excel(db_name, collection_name, fields, output_file):
			
 
				+    """
			
 
				+    从MongoDB导出特定字段到Excel文件。
			
 
				+
			
 
				+    参数:
			
 
				+    - db_name: 数据库名称
			
 
				+    - collection_name: 集合名称
			
 
				+    - fields: 要导出的字段列表（例如 ['name', 'age']）
			
 
				+    - output_file: 输出的Excel文件名
			
 
				+    """
			
 
				+    # 连接到MongoDB
			
 
				+    client = MongoClient('mongodb://192.168.3.149:27180/')
			
 
				+    db = client[db_name]
			
 
				+    collection = db[collection_name]
			
 
				+
			
 
				+    # 构建查询和投影
			
 
				+    projection = {field: 1 for field in fields}
			
 
				+
			
 
				+    # 查询数据
			
 
				+    data = collection.find({},projection)
			
 
				+
			
 
				+    # 将数据转换为DataFrame
			
 
				+    df = pd.DataFrame(list(data))
			
 
				+
			
 
				+    # 导出到Excel文件
			
 
				+    df.to_excel(output_file, index=False)
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # 连接到 MongoDB
			
 
				+
			
 
				+    db_name = 'data_quality'  # 替换为你的数据库名称
			
 
				+    collection_name = 'standard_sample_data_all' # 替换为你的集合名称
			
 
				+    # collection_name = 'bidding_20241128_ai'  # 替换为你的集合名称
			
 
				+    # 定义参数
			
 
				+
			
 
				+    fields = ['_id', 'site','toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipacket','href','jyhref']  # 替换为你需要导出的字段
			
 
				+    output_file = 'output.xlsx'
			
 
				+
			
 
				+    # 调用函数导出数据
			
 
				+    export_to_excel(db_name, collection_name, fields, output_file)
			
 
				+    print(f"数据已成功导出到 {output_file}")
			
 
				+
			
--- a/tools/样本数据导出/fix_site_data_export.py
+++ b/tools/样本数据导出/fix_site_data_export.py
@@ -0,0 +1,60 @@
 
				+from pymongo import MongoClient
			
 
				+#从已知站点导出N条样本数据,每个站点导出M条数据
			
 
				+
			
 
				+def sample_data(N,M):
			
 
				+    # 连接MongoDB数据库
			
 
				+    db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
			
 
				+    collection = db["bidding_20241128_ai"]
			
 
				+    # 把符合条件的站点名称存起来
			
 
				+    site_list = {"湖北政务服务网","广东省网上中介服务超市","黑龙江省政府采购电子卖场","湖北省政府采购网","欧贝易购","江西省网上中介服务超市","八戒公采","山西省政府采购电子卖场","政府采购云平台网上服务市场","政府采购频道"}
			
 
				+    # 初始化已标记的文档数量
			
 
				+    marked_count = 0
			
 
				+    marked_site_count = 0
			
 
				+    # 选取每个站点数据量
			
 
				+    for site in site_list:
			
 
				+        if marked_count >= N:
			
 
				+            break  # 如果已经达到或超过目标数量，停止处理
			
 
				+        marked_site_count = 0
			
 
				+        # 聚合查询
			
 
				+        pipeline = [
			
 
				+            {
			
 
				+                '$match': {'site': site}
			
 
				+            },
			
 
				+            {
			
 
				+                '$group': {
			
 
				+                     '_id': '$site',
			
 
				+                    'count': {'$sum': 1}
			
 
				+                }
			
 
				+            },
			
 
				+            {
			
 
				+                '$sort': {'count': -1}  # 根据 count 降序排序，可选
			
 
				+            }
			
 
				+        ]
			
 
				+        result = collection.aggregate(pipeline)
			
 
				+        for doc in result:
			
 
				+            print(f"Field Value: {doc['_id']}, Count: {doc['count']}")
			
 
				+            # 计算每次抽取的间隔
			
 
				+            jiange = int(doc['count'] / M)
			
 
				+
			
 
				+            # 从每个站点等间隔地取数据
			
 
				+            for i in range(M):
			
 
				+                if marked_site_count >= M:
			
 
				+                    break  # 再次检查是否已达到目标数量
			
 
				+
			
 
				+                for info in collection.find({"site": site}).sort("_id", 1).skip(i*jiange).limit(1):
			
 
				+                    print(f"Updating document with _id: {info['_id']}")
			
 
				+                    # 更新文档，设置标记
			
 
				+                    update_result = collection.update_one({"_id": info["_id"]}, {"$set": {"flag": 3}})
			
 
				+                    if update_result.modified_count == 0:
			
 
				+                        print("No document updated for _id:", info["_id"])
			
 
				+                    else:
			
 
				+                        print("Document updated successfully for _id:", info["_id"])
			
 
				+                    marked_site_count += 1
			
 
				+                    marked_count += 1
			
 
				+
			
 
				+                    if marked_site_count >= M:
			
 
				+                        break  # 再次检查是否已达到目标数量
			
 
				+
			
 
				+    print(f"Total marked documents: {marked_count}")
			
 
				+
			
 
				+sample_data(200,10)
			
--- a/tools/样本数据导出/sample_data_export.py
+++ b/tools/样本数据导出/sample_data_export.py
@@ -3,18 +3,17 @@ from pymongo import MongoClient
 
				 def sample_data(N):
			
 
				     # 连接MongoDB数据库
			
 
				     db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
			
 
				-    coll_user = db["bidding_919ai_norepeat"]
			
 
				+    coll_user = db["bidding_20241128_ai"]
			
 
				 
			
 
				     # 统计总的数据量
			
 
				-    # count_all = coll_user.estimated_document_count()
			
 
				-    count_all = coll_user.count_documents({"tag": 1})
			
 
				+    count_all = coll_user.estimated_document_count()
			
 
				+    # count_all = coll_user.count_documents({"tag": 1})
			
 
				     print("Total Document Count:", count_all)
			
 
				 
			
 
				     # 把符合条件的站点名称存起来
			
 
				     site_list = {}
			
 
				     n = 0
			
 
				     site_count = coll_user.aggregate([
			
 
				-                        {"$match": {"tag": 1}},
			
 
				                          {"$group": {"_id": "$site", "count": {"$sum": 1}}},
			
 
				                          {"$sort": {"count": -1}}])
			
 
				     for item in site_count:
			
@@ -51,10 +50,10 @@ def sample_data(N):
 
				             if marked_count >= N:
			
 
				                 break  # 再次检查是否已达到目标数量
			
 
				 
			
 
				-            for info in coll_user.find({"tag": 1, "site": key}).sort("title", 1).skip(i*2).limit(1):
			
 
				+            for info in coll_user.find({"site": key}).sort("title", 1).skip(i*2).limit(1):
			
 
				                 print(f"Updating document with _id: {info['_id']}")
			
 
				                 # 更新文档，设置标记
			
 
				-                update_result = coll_user.update_one({"_id": info["_id"]}, {"$set": {"flag": 9}})
			
 
				+                update_result = coll_user.update_one({"_id": info["_id"]}, {"$set": {"flag": 1}})
			
 
				                 if update_result.modified_count == 0:
			
 
				                     print("No document updated for _id:", info["_id"])
			
 
				                 else:
			
@@ -66,4 +65,4 @@ def sample_data(N):
 
				 
			
 
				     print(f"Total marked documents: {marked_count}")
			
 
				 
			
 
				-sample_data(1000)
			
 
				+sample_data(200)
			
--- a/tools/读execl数据入mongo库/execl_into_mongo.py
+++ b/tools/读execl数据入mongo库/execl_into_mongo.py
@@ -0,0 +1,27 @@
 
				+import pandas as pd
			
 
				+from pymongo import MongoClient
			
 
				+
			
 
				+# 连接到 MongoDB
			
 
				+client = MongoClient('mongodb://192.168.3.149:27180/')
			
 
				+db = client['data_quality']  # 替换为你的数据库名称
			
 
				+collection = db['standard_sample_data']  # 替换为你的集合名称
			
 
				+
			
 
				+# 读取 Excel 文件
			
 
				+excel_file_path = 'sample_data.xlsx'  # 替换为你的 Excel 文件路径
			
 
				+
			
 
				+# 尝试读取 Excel 文件，并指定 dtype 参数
			
 
				+df = pd.read_excel(excel_file_path, na_values=['', 'NA', 'N/A'], dtype={'budget': float, 'bidamount': float})
			
 
				+
			
 
				+# 指定需要检查的列名
			
 
				+columns_to_check = ['_id', 'site','toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipacket','href','jyhref']  # 替换为你需要检查的列名
			
 
				+df[columns_to_check] = df[columns_to_check].where(pd.notnull(df[columns_to_check]), '')
			
 
				+
			
 
				+# 将 DataFrame 转换为字典列表
			
 
				+data = df.to_dict(orient='records')
			
 
				+
			
 
				+# 插入数据到 MongoDB
			
 
				+if data:
			
 
				+    collection.insert_many(data)
			
 
				+    print("数据已成功插入到 MongoDB")
			
 
				+else:
			
 
				+    print("没有数据可插入")