# fix_site_data_export.py
from pymongo import MongoClient
  2. #从已知站点导出N条样本数据,每个站点导出M条数据
  3. def sample_data(N,M):
  4. # 连接MongoDB数据库
  5. db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
  6. collection = db["bidding_20250123"]
  7. # 把符合条件的站点名称存起来
  8. site_list = {"中粮贸易阳光采购平台","航天电子采购平台","青海省政府采购电子卖场","金正大集团电子采购平台",
  9. "建华建材电子采购平台",
  10. "淄博市政府采购网上商城",
  11. "福建省宁德市政府采购网"}
  12. # 初始化已标记的文档数量
  13. marked_count = 0
  14. marked_site_count = 0
  15. # 选取每个站点数据量
  16. for site in site_list:
  17. if marked_count >= N:
  18. break # 如果已经达到或超过目标数量,停止处理
  19. marked_site_count = 0
  20. # 聚合查询
  21. pipeline = [
  22. {
  23. '$match': {'site': site}
  24. },
  25. {
  26. '$group': {
  27. '_id': '$site',
  28. 'count': {'$sum': 1}
  29. }
  30. },
  31. {
  32. '$sort': {'count': -1} # 根据 count 降序排序,可选
  33. }
  34. ]
  35. result = collection.aggregate(pipeline)
  36. for doc in result:
  37. print(f"Field Value: {doc['_id']}, Count: {doc['count']}")
  38. # 计算每次抽取的间隔
  39. jiange = int(doc['count'] / M)
  40. # 从每个站点等间隔地取数据
  41. for i in range(M):
  42. if marked_site_count >= M:
  43. break # 再次检查是否已达到目标数量
  44. for info in collection.find({"site": site}).sort("_id", 1).skip(i*jiange).limit(1):
  45. print(f"Updating document with _id: {info['_id']}")
  46. # 更新文档,设置标记
  47. update_result = collection.update_one({"_id": info["_id"]}, {"$set": {"flag": 3}})
  48. if update_result.modified_count == 0:
  49. print("No document updated for _id:", info["_id"])
  50. else:
  51. print("Document updated successfully for _id:", info["_id"])
  52. marked_site_count += 1
  53. marked_count += 1
  54. if marked_site_count >= M:
  55. break # 再次检查是否已达到目标数量
  56. print(f"Total marked documents: {marked_count}")
  57. sample_data(200,10)