|
@@ -0,0 +1,67 @@
|
|
|
+from pymongo import MongoClient
|
|
|
+
|
|
|
def _select_top_sites(site_counts, total, coverage=0.95):
    """Pick sites, in the given order, until they cover ``coverage`` of ``total``.

    Args:
        site_counts: Iterable of ``{"_id": site, "count": n}`` items.
        total: Total number of documents the shares are measured against.
        coverage: Cumulative share of ``total`` after which no further site
            is added. The site that crosses the threshold is still included.

    Returns:
        Dict mapping site name to its document count, in iteration order.
    """
    selected = {}
    covered = 0
    for item in site_counts:
        # The covered share never shrinks, so once it exceeds the threshold no
        # later site can be selected; stop instead of draining the cursor.
        # The ``total <= 0`` guard avoids a ZeroDivisionError.
        if total <= 0 or covered / total > coverage:
            break
        covered += item["count"]
        selected[item["_id"]] = item["count"]
    return selected


def sample_data(N):
    """Flag up to ``N`` documents, spread across the largest sites.

    Sites are ranked by document count and kept until they cover 95% of the
    collection. Each kept site receives a quota proportional to its share
    (at least 1). Within a site, documents that match the ``title_qa.0303``
    filter and have no ``flag`` field yet are picked at equal intervals in
    ``_id`` order and updated with ``flag = 10``.

    Args:
        N: Maximum number of documents to flag.

    Returns:
        None. Progress and the final number of flagged documents are printed.
    """
    # Connect to the MongoDB database.
    client = MongoClient('192.168.3.206', 27080, unicode_decode_error_handler="ignore")
    try:
        coll_user = client.data_quality["bidding_20231122"]

        # Total number of documents (taken from collection metadata).
        count_all = coll_user.estimated_document_count()
        print("Total Document Count:", count_all)

        # Keep the sites that together make up the bulk of the collection.
        site_count = coll_user.aggregate([
            {"$group": {"_id": "$site", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}])
        site_list = _select_top_sites(site_count, count_all, 0.95)

        # Sum of the kept sites' shares, used to normalise each site's quota.
        total_ratio = sum(min(count / count_all, 1) for count in site_list.values())

        # Number of documents flagged so far.
        marked_count = 0

        # Work out and apply the quota of every kept site.
        for key, site_total in site_list.items():
            if marked_count >= N:
                break  # Target reached, stop processing.

            # Share of the overall target assigned to this site.
            target_ratio = min(site_total / count_all, 1) / total_ratio
            # Number of documents to take from this site, at least 1 ...
            num = max(int(target_ratio * N), 1)
            # ... but never more than what is still missing overall.
            num = min(num, N - marked_count)

            print(f"{key} - Count: {site_total}, Num: {num}, Ratio: {target_ratio}")

            query = {"title_qa.0303": "包含叠词,异常词汇,特殊词汇(测试,公告公告等)", "site": key, "flag": {"$exists": False}}

            # The interval must be based on the documents the query can
            # actually return, not on the site's total document count;
            # otherwise skip() overshoots the end of the result set.
            candidates = coll_user.count_documents(query)
            if candidates == 0:
                continue
            num = min(num, candidates)
            step = candidates // num

            # Collect all ids before updating anything: setting ``flag``
            # removes a document from the filtered result set, which would
            # shift every later skip() offset.
            picked_ids = []
            for i in range(num):
                cursor = coll_user.find(query, {"_id": 1}).sort("_id", 1).skip(i * step).limit(1)
                picked_ids.extend(info["_id"] for info in cursor)

            for doc_id in picked_ids:
                print(f"Updating document with _id: {doc_id}")
                # Flag the document.
                update_result = coll_user.update_one({"_id": doc_id}, {"$set": {"flag": 10}})
                if update_result.modified_count == 0:
                    print("No document updated for _id:", doc_id)
                else:
                    print("Document updated successfully for _id:", doc_id)
                    # Only count documents that were really flagged.
                    marked_count += 1

        print(f"Total marked documents: {marked_count}")
    finally:
        # Release the connection pool even if a query fails.
        client.close()
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Run the sampling only when executed as a script, so that importing this
    # module does not connect to MongoDB and flag documents as a side effect.
    sample_data(100)
|