fix_site_data_export.py 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. from pymongo import MongoClient
  2. #从已知站点导出N条样本数据,每个站点导出M条数据
  3. def sample_data_random(N, M):
  4. client = MongoClient('172.20.45.129', 27002, unicode_decode_error_handler="ignore")
  5. db = client.data_quality
  6. collection = db.bidding_20250702
  7. site_list = {
  8. "应进必进阳光交易农村三资数字交易平台",
  9. "东方希望数字化采购平台",
  10. "首创股份电子商务平台",
  11. "南充市政府采购网上商城",
  12. "中国铁塔在线商务平台",
  13. "公采云",
  14. "甘肃省阳光招标采购平台",
  15. '招采通',
  16. '中国邮政电子采购与供应平台',
  17. '济南公共资源交易中心'
  18. }
  19. marked_count = 0
  20. for site in site_list:
  21. if marked_count >= N:
  22. break
  23. # 检查该站点的数据量是否足够
  24. site_count = collection.count_documents({"site": site})
  25. if site_count == 0:
  26. print(f"站点 {site} 无数据,跳过")
  27. continue
  28. # 计算实际要抽的数量(避免超过 M 或剩余需要的 N - marked_count)
  29. sample_size = min(M, N - marked_count, site_count)
  30. # 使用 $sample 随机抽取
  31. pipeline = [
  32. {"$match": {"site": site}},
  33. {"$sample": {"size": sample_size}},
  34. {"$project": {"_id": 1}} # 只返回 _id,减少数据传输
  35. ]
  36. sampled_docs = list(collection.aggregate(pipeline))
  37. # 批量更新标记
  38. if sampled_docs:
  39. sampled_ids = [doc["_id"] for doc in sampled_docs]
  40. update_result = collection.update_many(
  41. {"_id": {"$in": sampled_ids}},
  42. {"$set": {"flag_liumiao": 1}}
  43. )
  44. marked_count += update_result.modified_count
  45. print(f"站点 {site}: 成功标记 {update_result.modified_count} 条数据")
  46. print(f"总计标记 {marked_count} 条数据")
  47. sample_data_random(100, 10)