from pymongo import MongoClient


def sample_data_random(N: int, M: int) -> None:
    """Flag a random sample of documents from a fixed list of sites.

    Walks the known sites in order and, for each one, randomly picks up to
    ``M`` documents from ``data_quality.bidding_20250702`` and sets
    ``flag_liumiao`` to 1 on them.  Stops as soon as ``N`` documents have
    been flagged in total.  Progress is reported on stdout.

    Args:
        N: Overall number of documents to flag across all sites.
        M: Maximum number of documents to flag per site.
    """
    # A tuple (rather than a set) keeps the site order fixed, so the sites
    # that get sampled when N runs out are the same from run to run.
    site_list = (
        "应进必进阳光交易农村三资数字交易平台",
        "东方希望数字化采购平台",
        "首创股份电子商务平台",
        "南充市政府采购网上商城",
        "中国铁塔在线商务平台",
        "公采云",
        "甘肃省阳光招标采购平台",
        '招采通',
        '中国邮政电子采购与供应平台',
        '济南公共资源交易中心',
    )

    marked_count = 0

    # The context manager closes the client (and its connection pool) even
    # if one of the queries below raises.
    with MongoClient(
        '172.20.45.129',
        27002,
        unicode_decode_error_handler="ignore",
    ) as client:
        collection = client.data_quality.bidding_20250702

        for site in site_list:
            # Stop once the overall quota is met.  A non-positive M can
            # never produce a sample ($sample is documented to take a
            # positive size), so there is nothing to do in that case either.
            if marked_count >= N or M <= 0:
                break

            # Skip sites that have no documents at all.
            site_count = collection.count_documents({"site": site})
            if site_count == 0:
                print(f"站点 {site} 无数据,跳过")
                continue

            # Never take more than M per site, more than what is still
            # needed to reach N, or more than the site actually has.
            sample_size = min(M, N - marked_count, site_count)

            pipeline = [
                {"$match": {"site": site}},
                {"$sample": {"size": sample_size}},
                # Only the _id is needed for the update below.
                {"$project": {"_id": 1}},
            ]
            sampled_ids = [doc["_id"] for doc in collection.aggregate(pipeline)]
            if not sampled_ids:
                continue

            # Flag all sampled documents in a single round trip.
            update_result = collection.update_many(
                {"_id": {"$in": sampled_ids}},
                {"$set": {"flag_liumiao": 1}},
            )
            # modified_count reports documents the update actually changed,
            # so documents that already carried the flag do not count
            # toward N.
            marked_count += update_result.modified_count
            print(f"站点 {site}: 成功标记 {update_result.modified_count} 条数据")

    print(f"总计标记 {marked_count} 条数据")


sample_data_random(100, 10)