from pymongo import MongoClient

# Sites to sample from. A tuple (rather than a set) keeps the iteration order
# stable between runs, which matters once the N cap stops the loop early.
_SITES = (
    "中粮贸易阳光采购平台",
    "航天电子采购平台",
    "青海省政府采购电子卖场",
    "金正大集团电子采购平台",
    "建华建材电子采购平台",
    "淄博市政府采购网上商城",
    "福建省宁德市政府采购网",
)


def _mark_site_samples(collection, site, per_site, remaining):
    """Flag up to ``per_site`` evenly spaced documents of one site.

    Documents of ``site`` are ordered by ``_id`` and one document is picked
    every ``total // per_site`` positions; each picked document gets
    ``flag`` set to 3.

    Args:
        collection: pymongo collection holding the documents.
        site: Value of the ``site`` field to sample from.
        per_site: Maximum number of documents to flag for this site (M).
        remaining: How many more documents may be flagged overall.

    Returns:
        The number of documents selected (and flagged) for this site.
    """
    query = {"site": site}
    total = collection.count_documents(query)
    if total == 0:
        # Nothing stored for this site; nothing to report or sample.
        return 0
    print(f"Field Value: {site}, Count: {total}")

    # Never take more than the site has or than the global budget allows.
    quota = min(per_site, total, remaining)
    if quota <= 0:
        return 0

    # Sampling interval. It is clamped to 1 so that a site with fewer than
    # ``per_site`` documents yields each document once instead of re-reading
    # the first document over and over.
    step = max(total // per_site, 1)

    marked = 0
    for i in range(quota):
        info = collection.find_one(query, sort=[("_id", 1)], skip=i * step)
        if info is None:
            # The document disappeared between counting and reading.
            continue
        print(f"Updating document with _id: {info['_id']}")
        # Mark the sampled document.
        update_result = collection.update_one(
            {"_id": info["_id"]}, {"$set": {"flag": 3}}
        )
        if update_result.modified_count == 0:
            print("No document updated for _id:", info["_id"])
        else:
            print("Document updated successfully for _id:", info["_id"])
        marked += 1
    return marked


def sample_data(N, M):
    """Flag about N sample documents from the known sites, at most M per site.

    Sites are visited in a fixed order until N documents have been selected.
    Selected documents get ``flag`` set to 3 in the ``bidding_20250123``
    collection of the ``data_quality`` database.

    Args:
        N: Total number of documents to select across all sites.
        M: Number of documents to select from each site.
    """
    # The context manager closes the connection even if a query fails.
    with MongoClient(
        '192.168.3.149', 27180, unicode_decode_error_handler="ignore"
    ) as client:
        collection = client.data_quality["bidding_20250123"]

        marked_count = 0
        for site in _SITES:
            if marked_count >= N:
                break  # Target reached; stop visiting further sites.
            marked_count += _mark_site_samples(
                collection, site, M, N - marked_count
            )

    print(f"Total marked documents: {marked_count}")


sample_data(200, 10)