|
@@ -3,18 +3,17 @@ from pymongo import MongoClient
|
|
|
def sample_data(N):
|
|
|
# 连接MongoDB数据库
|
|
|
db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
|
|
|
- coll_user = db["bidding_919ai_norepeat"]
|
|
|
+ coll_user = db["bidding_20241128_ai"]
|
|
|
|
|
|
# 统计总的数据量
|
|
|
- # count_all = coll_user.estimated_document_count()
|
|
|
- count_all = coll_user.count_documents({"tag": 1})
|
|
|
+ count_all = coll_user.estimated_document_count()
|
|
|
+ # count_all = coll_user.count_documents({"tag": 1})
|
|
|
print("Total Document Count:", count_all)
|
|
|
|
|
|
# 把符合条件的站点名称存起来
|
|
|
site_list = {}
|
|
|
n = 0
|
|
|
site_count = coll_user.aggregate([
|
|
|
- {"$match": {"tag": 1}},
|
|
|
{"$group": {"_id": "$site", "count": {"$sum": 1}}},
|
|
|
{"$sort": {"count": -1}}])
|
|
|
for item in site_count:
|
|
@@ -51,10 +50,10 @@ def sample_data(N):
|
|
|
if marked_count >= N:
|
|
|
break # 再次检查是否已达到目标数量
|
|
|
|
|
|
- for info in coll_user.find({"tag": 1, "site": key}).sort("title", 1).skip(i*2).limit(1):
|
|
|
+ for info in coll_user.find({"site": key}).sort("title", 1).skip(i*2).limit(1):
|
|
|
print(f"Updating document with _id: {info['_id']}")
|
|
|
# 更新文档,设置标记
|
|
|
- update_result = coll_user.update_one({"_id": info["_id"]}, {"$set": {"flag": 9}})
|
|
|
+ update_result = coll_user.update_one({"_id": info["_id"]}, {"$set": {"flag": 1}})
|
|
|
if update_result.modified_count == 0:
|
|
|
print("No document updated for _id:", info["_id"])
|
|
|
else:
|
|
@@ -66,4 +65,4 @@ def sample_data(N):
|
|
|
|
|
|
print(f"Total marked documents: {marked_count}")
|
|
|
|
|
|
-sample_data(1000)
|
|
|
+sample_data(200)
|