"""Find near-duplicate bid-notice documents in the ``liantong`` collection.

Documents whose ``flag`` is not ``"cfsj"`` are loaded, compared pairwise, and
every pair that shares a project code, has the target subtype, and has a
sufficiently similar title or project name is reported.
"""
import itertools

import Levenshtein
from bson import ObjectId
from pymongo import MongoClient

from lib.mongo_tools import MongoUtil, Data_save, MongoSentence

# A pair counts as a near-duplicate when the title OR the project name
# reaches this similarity.
SIMILARITY_THRESHOLD = 0.9

# Only pairs where BOTH documents carry this subtype are reported.
# Earlier revisions of this script also experimented with '采购意向', with
# "any equal subtype", and with matching on the "id" field instead.
TARGET_SUBTYPE = '竞谈'


def calculate_repeat_rate(str1, str2):
    """Return the similarity of two strings as a value in [0.0, 1.0].

    The similarity is ``1 - levenshtein_distance / max(len(str1), len(str2))``.
    Two empty strings are defined to be identical (``1.0``).

    Example: two project titles that differ only by a single "-" character
    score very close to 1.0.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The similarity ratio; ``1.0`` means the strings are equal.
    """
    max_length = max(len(str1), len(str2))
    if max_length == 0:
        return 1.0
    return 1 - Levenshtein.distance(str1, str2) / max_length


def _text_field(document, key):
    """Return ``document[key]``, mapping a missing or null value to ``""``."""
    value = document.get(key, "")
    # A field stored as null comes back as None, which Levenshtein.distance
    # rejects with a TypeError; treat it the same as a missing field.
    return "" if value is None else value


def _is_similar(first, second, key):
    """Tell whether the ``key`` fields of two documents reach the threshold."""
    rate = calculate_repeat_rate(_text_field(first, key), _text_field(second, key))
    return rate >= SIMILARITY_THRESHOLD


def find_duplicates(documents):
    """Map each document ``_id`` to the stringified ``_id``s of its duplicates.

    Two documents are duplicates when both have subtype ``TARGET_SUBTYPE``,
    their ``projectcode`` values are equal, and their ``title`` or
    ``projectname`` is at least ``SIMILARITY_THRESHOLD`` similar.

    Args:
        documents: Sequence of document dicts, in the order they should be
            compared (earlier documents become the keys of the result).

    Returns:
        A dict ``{_id of earlier document: [str(_id) of later duplicates]}``.
    """
    # Cheap filter first: a pair can only match when both sides have the
    # target subtype, so other documents never need a distance computation.
    candidates = [
        doc for doc in documents if doc.get("subtype", "") == TARGET_SUBTYPE
    ]

    duplicates = {}
    # combinations() yields (i, j) pairs with i < j in the same order as the
    # classic nested index loops, so the result ordering is unchanged.
    for first, second in itertools.combinations(candidates, 2):
        if first.get("projectcode", "") != second.get("projectcode", ""):
            continue
        # NOTE(review): two empty/missing fields score 1.0 and therefore pass
        # this gate -- confirm that is the intended behavior.
        # ``or`` short-circuits, so project names are only compared when the
        # titles are not already similar enough.
        if _is_similar(first, second, "title") or _is_similar(
            first, second, "projectname"
        ):
            duplicates.setdefault(first.get("_id", ""), []).append(
                str(second.get("_id", ""))
            )
    return duplicates


# Connect to the target database.
db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
coll_user = db["liantong"]

# Load every document not flagged "cfsj", oldest _id first.
documents = list(coll_user.find({"flag": {"$ne": "cfsj"}}).sort("_id", 1))
data_map = find_duplicates(documents)

# Print the results.
for key, value in data_map.items():
    print(f"{key} -> {value}")