123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263 |
- from pymongo import MongoClient
# Export N sample documents from the known sites, taking M documents per site.
def sample_data(N, M, collection=None):
    """Flag evenly spaced sample documents from a fixed list of sites.

    For each known site, documents are read in ascending ``_id`` order and
    one document every ``count // M`` positions is updated with ``flag = 3``,
    until M documents of that site or N documents overall have been handled.

    Args:
        N: Maximum number of documents to mark across all sites.
        M: Maximum number of documents to mark for any single site.
        collection: Optional collection object to sample from. When omitted,
            a connection to the ``bidding_20250123`` collection of the
            ``data_quality`` database is opened and closed again before
            returning.

    Returns:
        The number of documents that were marked.
    """
    if collection is not None:
        return _mark_site_samples(collection, N, M)

    client = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore")
    try:
        return _mark_site_samples(client.data_quality["bidding_20250123"], N, M)
    finally:
        # Release the connection even when a query or update fails.
        client.close()


def _mark_site_samples(collection, total_limit, per_site_limit):
    """Mark the samples in ``collection`` and return how many were marked."""
    # A tuple (not a set) keeps the site order stable between runs, so the
    # same sites are sampled whenever the overall limit cuts the run short.
    sites = (
        "中粮贸易阳光采购平台",
        "航天电子采购平台",
        "青海省政府采购电子卖场",
        "金正大集团电子采购平台",
        "建华建材电子采购平台",
        "淄博市政府采购网上商城",
        "福建省宁德市政府采购网",
    )

    marked_count = 0
    for site in sites:
        remaining = total_limit - marked_count
        # Stop once the overall target is reached; a non-positive per-site
        # limit means there is nothing to sample (and would divide by zero).
        if remaining <= 0 or per_site_limit <= 0:
            break

        site_total = collection.count_documents({"site": site})
        if site_total == 0:
            continue
        print(f"Field Value: {site}, Count: {site_total}")

        # Sampling interval. Clamp to 1 so a site with fewer than
        # per_site_limit documents yields each document once instead of
        # returning its first document over and over.
        step = max(1, site_total // per_site_limit)
        # Never take more than the site holds or the overall limit allows.
        quota = min(per_site_limit, site_total, remaining)

        for index in range(quota):
            # Only the _id is needed to address the update.
            cursor = (
                collection.find({"site": site}, {"_id": 1})
                .sort("_id", 1)
                .skip(index * step)
                .limit(1)
            )
            for info in cursor:
                doc_id = info["_id"]
                print(f"Updating document with _id: {doc_id}")
                update_result = collection.update_one(
                    {"_id": doc_id}, {"$set": {"flag": 3}}
                )
                if update_result.modified_count == 0:
                    print("No document updated for _id:", doc_id)
                else:
                    print("Document updated successfully for _id:", doc_id)
                # Counted whether or not the update modified the document.
                marked_count += 1

    print(f"Total marked documents: {marked_count}")
    return marked_count
# Script entry: run the sampling with a target of 200 documents, 10 per site.
sample_data(N=200, M=10)
|