liumiaomiao 8 miesięcy temu
rodzic
commit
3de5dd73c6

+ 3 - 2
tools/从mongo导出数据/mongo_into_mongo.py

@@ -9,7 +9,7 @@ MongodbConfig = {
     "ip_port": "127.0.0.1:27088",
     "user": "viewdata",
     "password": "viewdata",
-    "db": "qfw_ai",
+    "db": "qfw",
 }
 
 mdb = MongoDBInterface(MongodbConfig)
@@ -38,7 +38,8 @@ with MongoClient('192.168.3.149', 27180) as client:
             if result==None:
                 print(row["_id"]+"在大库没找到")
             if result:
-                result["_id"] = str(row["_id"])
+                result["_id"] = ObjectId(row["_id"])
+                insertdb.insert2db("standard_sample_data_all_ai",result)
                 insertdb.insert2db("standard_sample_data_all",result)
 
 # 关闭数据库连接

+ 46 - 0
tools/从mongo库导出数据execl/mongo_to_execl.py

@@ -0,0 +1,46 @@
+# Import the required libraries
+from pymongo import MongoClient
+import pandas as pd
+
+def export_to_excel(db_name, collection_name, fields, output_file):
+    """
+    从MongoDB导出特定字段到Excel文件。
+
+    参数:
+    - db_name: 数据库名称
+    - collection_name: 集合名称
+    - fields: 要导出的字段列表(例如 ['name', 'age'])
+    - output_file: 输出的Excel文件名
+    """
+    # 连接到MongoDB
+    client = MongoClient('mongodb://192.168.3.149:27180/')
+    db = client[db_name]
+    collection = db[collection_name]
+
+    # 构建查询和投影
+    projection = {field: 1 for field in fields}
+
+    # 查询数据
+    data = collection.find({},projection)
+
+    # 将数据转换为DataFrame
+    df = pd.DataFrame(list(data))
+
+    # 导出到Excel文件
+    df.to_excel(output_file, index=False)
+
+if __name__ == "__main__":
+    # 连接到 MongoDB
+
+    db_name = 'data_quality'  # 替换为你的数据库名称
+    collection_name = 'standard_sample_data_all' # 替换为你的集合名称
+    # collection_name = 'bidding_20241128_ai'  # 替换为你的集合名称
+    # 定义参数
+
+    fields = ['_id', 'site','toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipacket','href','jyhref']  # 替换为你需要导出的字段
+    output_file = 'output.xlsx'
+
+    # 调用函数导出数据
+    export_to_excel(db_name, collection_name, fields, output_file)
+    print(f"数据已成功导出到 {output_file}")
+

+ 60 - 0
tools/样本数据导出/fix_site_data_export.py

@@ -0,0 +1,60 @@
+from pymongo import MongoClient
+# Export N sample documents from a known list of sites, M documents per site
+
+def sample_data(N,M):
+    # 连接MongoDB数据库
+    db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
+    collection = db["bidding_20241128_ai"]
+    # 把符合条件的站点名称存起来
+    site_list = {"湖北政务服务网","广东省网上中介服务超市","黑龙江省政府采购电子卖场","湖北省政府采购网","欧贝易购","江西省网上中介服务超市","八戒公采","山西省政府采购电子卖场","政府采购云平台网上服务市场","政府采购频道"}
+    # 初始化已标记的文档数量
+    marked_count = 0
+    marked_site_count = 0
+    # 选取每个站点数据量
+    for site in site_list:
+        if marked_count >= N:
+            break  # 如果已经达到或超过目标数量,停止处理
+        marked_site_count = 0
+        # 聚合查询
+        pipeline = [
+            {
+                '$match': {'site': site}
+            },
+            {
+                '$group': {
+                     '_id': '$site',
+                    'count': {'$sum': 1}
+                }
+            },
+            {
+                '$sort': {'count': -1}  # 根据 count 降序排序,可选
+            }
+        ]
+        result = collection.aggregate(pipeline)
+        for doc in result:
+            print(f"Field Value: {doc['_id']}, Count: {doc['count']}")
+            # 计算每次抽取的间隔
+            jiange = int(doc['count'] / M)
+
+            # 从每个站点等间隔地取数据
+            for i in range(M):
+                if marked_site_count >= M:
+                    break  # 再次检查是否已达到目标数量
+
+                for info in collection.find({"site": site}).sort("_id", 1).skip(i*jiange).limit(1):
+                    print(f"Updating document with _id: {info['_id']}")
+                    # 更新文档,设置标记
+                    update_result = collection.update_one({"_id": info["_id"]}, {"$set": {"flag": 3}})
+                    if update_result.modified_count == 0:
+                        print("No document updated for _id:", info["_id"])
+                    else:
+                        print("Document updated successfully for _id:", info["_id"])
+                    marked_site_count += 1
+                    marked_count += 1
+
+                    if marked_site_count >= M:
+                        break  # 再次检查是否已达到目标数量
+
+    print(f"Total marked documents: {marked_count}")
+
+sample_data(200,10)

+ 6 - 7
tools/样本数据导出/sample_data_export.py

@@ -3,18 +3,17 @@ from pymongo import MongoClient
 def sample_data(N):
     # 连接MongoDB数据库
     db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
-    coll_user = db["bidding_919ai_norepeat"]
+    coll_user = db["bidding_20241128_ai"]
 
     # 统计总的数据量
-    # count_all = coll_user.estimated_document_count()
-    count_all = coll_user.count_documents({"tag": 1})
+    count_all = coll_user.estimated_document_count()
+    # count_all = coll_user.count_documents({"tag": 1})
     print("Total Document Count:", count_all)
 
     # 把符合条件的站点名称存起来
     site_list = {}
     n = 0
     site_count = coll_user.aggregate([
-                        {"$match": {"tag": 1}},
                          {"$group": {"_id": "$site", "count": {"$sum": 1}}},
                          {"$sort": {"count": -1}}])
     for item in site_count:
@@ -51,10 +50,10 @@ def sample_data(N):
             if marked_count >= N:
                 break  # 再次检查是否已达到目标数量
 
-            for info in coll_user.find({"tag": 1, "site": key}).sort("title", 1).skip(i*2).limit(1):
+            for info in coll_user.find({"site": key}).sort("title", 1).skip(i*2).limit(1):
                 print(f"Updating document with _id: {info['_id']}")
                 # 更新文档,设置标记
-                update_result = coll_user.update_one({"_id": info["_id"]}, {"$set": {"flag": 9}})
+                update_result = coll_user.update_one({"_id": info["_id"]}, {"$set": {"flag": 1}})
                 if update_result.modified_count == 0:
                     print("No document updated for _id:", info["_id"])
                 else:
@@ -66,4 +65,4 @@ def sample_data(N):
 
     print(f"Total marked documents: {marked_count}")
 
-sample_data(1000)
+sample_data(200)

+ 27 - 0
tools/读execl数据入mongo库/execl_into_mongo.py

@@ -0,0 +1,27 @@
+import pandas as pd
+from pymongo import MongoClient
+
+# 连接到 MongoDB
+client = MongoClient('mongodb://192.168.3.149:27180/')
+db = client['data_quality']  # 替换为你的数据库名称
+collection = db['standard_sample_data']  # 替换为你的集合名称
+
+# 读取 Excel 文件
+excel_file_path = 'sample_data.xlsx'  # 替换为你的 Excel 文件路径
+
+# 尝试读取 Excel 文件,并指定 dtype 参数
+df = pd.read_excel(excel_file_path, na_values=['', 'NA', 'N/A'], dtype={'budget': float, 'bidamount': float})
+
+# 指定需要检查的列名
+columns_to_check = ['_id', 'site','toptype','subtype','area','city','buyer','projectname','projectcode','budget','s_winner','bidamount','multipacket','href','jyhref']  # 替换为你需要检查的列名
+df[columns_to_check] = df[columns_to_check].where(pd.notnull(df[columns_to_check]), '')
+
+# 将 DataFrame 转换为字典列表
+data = df.to_dict(orient='records')
+
+# 插入数据到 MongoDB
+if data:
+    collection.insert_many(data)
+    print("数据已成功插入到 MongoDB")
+else:
+    print("没有数据可插入")