123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657 |
- from pymongo import MongoClient
# Pick N sample records from the known sites, up to M per site, by flagging them in place.
def sample_data_random(N, M):
    """Flag a random sample of documents, site by site.

    Visits a fixed list of sites in ``data_quality.bidding_20250702``. For
    each site it draws documents with a ``$sample`` aggregation stage and
    sets ``flag_liumiao`` to 1 on them, stopping once the running total
    reaches ``N``.

    Args:
        N: Upper bound on the total number of documents to flag.
        M: Upper bound on the number of documents to flag per site.

    Returns:
        The sum of ``modified_count`` over all per-site updates. When ``N``
        or ``M`` is not positive, returns 0 without contacting the server.
    """
    # A non-positive budget means there is nothing to flag. Returning here
    # also keeps a zero/negative size from ever being passed to $sample.
    if N <= 0 or M <= 0:
        print("总计标记 0 条数据")
        return 0

    # A tuple (not a set) so sites are visited in a fixed order and the
    # sites dropped when N runs out are the same on every run.
    sites = (
        "应进必进阳光交易农村三资数字交易平台",
        "东方希望数字化采购平台",
        "首创股份电子商务平台",
        "南充市政府采购网上商城",
        "中国铁塔在线商务平台",
        "公采云",
        "甘肃省阳光招标采购平台",
        "招采通",
        "中国邮政电子采购与供应平台",
        "济南公共资源交易中心",
    )

    marked_count = 0
    # The context manager closes the client even if a query raises.
    with MongoClient(
        "172.20.45.129",
        27002,
        unicode_decode_error_handler="ignore",
    ) as client:
        collection = client.data_quality.bidding_20250702

        for site in sites:
            remaining = N - marked_count
            if remaining <= 0:
                break

            site_filter = {"site": site}

            # Skip sites that have no documents at all.
            site_count = collection.count_documents(site_filter)
            if site_count == 0:
                print(f"站点 {site} 无数据,跳过")
                continue

            # Never ask for more than the per-site cap, the remaining
            # overall budget, or what the site actually holds.
            sample_size = min(M, remaining, site_count)
            pipeline = [
                {"$match": site_filter},
                {"$sample": {"size": sample_size}},
                {"$project": {"_id": 1}},  # only _id is needed for the update
            ]
            sampled_ids = [doc["_id"] for doc in collection.aggregate(pipeline)]
            if not sampled_ids:
                continue

            # Flag the whole sample with a single bulk update.
            update_result = collection.update_many(
                {"_id": {"$in": sampled_ids}},
                {"$set": {"flag_liumiao": 1}},
            )
            # NOTE(review): the budget is tracked via modified_count, so
            # sampled documents that the update did not change are not
            # counted toward N — confirm this is intended for re-runs.
            marked_count += update_result.modified_count
            print(f"站点 {site}: 成功标记 {update_result.modified_count} 条数据")

    print(f"总计标记 {marked_count} 条数据")
    return marked_count
# Script entry: flag up to 100 documents in total, at most 10 per site.
sample_data_random(100, 10)
|