# sample_data_export.py
  1. from pymongo import MongoClient
  2. def sample_data(N):
  3. # 连接MongoDB数据库
  4. db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
  5. coll_user = db["bidding_919ai_norepeat"]
  6. # 统计总的数据量
  7. # count_all = coll_user.estimated_document_count()
  8. count_all = coll_user.count_documents({"tag": 1})
  9. print("Total Document Count:", count_all)
  10. # 把符合条件的站点名称存起来
  11. site_list = {}
  12. n = 0
  13. site_count = coll_user.aggregate([
  14. {"$match": {"tag": 1}},
  15. {"$group": {"_id": "$site", "count": {"$sum": 1}}},
  16. {"$sort": {"count": -1}}])
  17. for item in site_count:
  18. if (n / count_all) <= 0.95:
  19. n += item["count"]
  20. site_list[item["_id"]] = item["count"]
  21. # 计算每个站点相对于N的目标抽取数量的总和
  22. total_ratio = sum([min(site_list[key] / count_all, 1) for key in site_list])
  23. # 初始化已标记的文档数量
  24. marked_count = 0
  25. # 选取每个站点数据量
  26. for key in site_list:
  27. if marked_count >= N:
  28. break # 如果已经达到或超过目标数量,停止处理
  29. # 计算每个站点的目标比例
  30. target_ratio = min(site_list[key] / count_all, 1) / total_ratio
  31. # 计算每个站点应该抽取的文档数量,确保至少为1
  32. num = max(int(target_ratio * N), 2)
  33. # 如果加上这个站点的数量会超过总目标,调整数量
  34. num = min(num, N - marked_count)
  35. print(f"{key} - Count: {site_list[key]}, Num: {num}, Ratio: {target_ratio}")
  36. # 计算每次抽取的间隔
  37. jiange = int(site_list[key] / num)
  38. # 从每个站点等间隔地取数据
  39. for i in range(num):
  40. if marked_count >= N:
  41. break # 再次检查是否已达到目标数量
  42. for info in coll_user.find({"tag": 1, "site": key}).sort("title", 1).skip(i*2).limit(1):
  43. print(f"Updating document with _id: {info['_id']}")
  44. # 更新文档,设置标记
  45. update_result = coll_user.update_one({"_id": info["_id"]}, {"$set": {"flag": 9}})
  46. if update_result.modified_count == 0:
  47. print("No document updated for _id:", info["_id"])
  48. else:
  49. print("Document updated successfully for _id:", info["_id"])
  50. marked_count += 1
  51. if marked_count >= N:
  52. break # 再次检查是否已达到目标数量
  53. print(f"Total marked documents: {marked_count}")
# Module-level invocation: runs whenever this file is executed or imported,
# with a target of 1000 documents to flag.
sample_data(1000)