12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849 |
- from pymongo import MongoClient
- from bson import ObjectId
- from lib.mongo_tools import MongoUtil,Data_save,MongoSentence
def _levenshtein_distance(a, b):
    """Return the Levenshtein edit distance between strings *a* and *b*.

    Pure-Python fallback (two-row dynamic programming, unit cost for
    insertion, deletion and substitution). It is only used when the optional
    ``Levenshtein`` C extension cannot be imported.
    """
    if a == b:
        return 0
    # Keep the shorter string in the inner loop so each row stays small.
    if len(a) < len(b):
        a, b = b, a
    if not b:
        return len(a)
    previous = list(range(len(b) + 1))
    for i, ch_a in enumerate(a, start=1):
        current = [i]
        for j, ch_b in enumerate(b, start=1):
            current.append(min(
                previous[j] + 1,                   # deletion
                current[j - 1] + 1,                # insertion
                previous[j - 1] + (ch_a != ch_b),  # substitution
            ))
        previous = current
    return previous[-1]


# Resolve the distance implementation once instead of re-running the import
# statement on every call (this function is called inside an O(n^2) loop).
try:
    from Levenshtein import distance as _edit_distance
except ImportError:
    _edit_distance = _levenshtein_distance


def calculate_repeat_rate(str1, str2):
    """Return the similarity of two strings as a float in ``[0.0, 1.0]``.

    Used to judge how much two bid-notice fields repeat each other. The
    similarity is ``1 - distance / max(len(str1), len(str2))`` where
    *distance* is the Levenshtein edit distance.

    Args:
        str1: First string. ``None`` is treated as the empty string.
        str2: Second string. ``None`` is treated as the empty string.

    Returns:
        ``1.0`` when both strings are empty (or ``None``); otherwise the
        normalised similarity, where ``1.0`` means identical.
    """
    # A document field that is stored as null comes back as None even when a
    # default is passed to dict.get(); treat it like a missing (empty) value.
    str1 = "" if str1 is None else str1
    str2 = "" if str2 is None else str2

    max_length = max(len(str1), len(str2))
    if max_length == 0:
        return 1.0
    return 1 - _edit_distance(str1, str2) / max_length
# Connect to the target database.
db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
coll_user = db["liantong"]

# Maps the _id of the earlier document of a matching pair to the list of
# stringified _ids of the later documents that were judged to duplicate it.
data_map = {}

# Load every document whose "flag" is not "cfsj", ordered by ascending _id.
documents = list(coll_user.find({"flag": {"$ne": "cfsj"}}).sort("_id", 1))

# A pair is reported when title OR projectname similarity reaches this value.
SIMILARITY_THRESHOLD = 0.9
# Only pairs where both documents carry this subtype are reported.
# Alternative filters tried earlier (kept here as a note instead of dead
# code): both subtypes equal to '采购意向', any equal subtype, and an exact
# match on the "id" field.
TARGET_SUBTYPE = '竞谈'

for i, doc_i in enumerate(documents):
    # Cheap filter first: a document with another subtype can never be part
    # of a reported pair, so skip the whole inner loop for it.
    if doc_i.get("subtype", "") != TARGET_SUBTYPE:
        continue

    # Hoist the per-document lookups out of the inner loop. `or ""` also
    # covers fields stored as null, which dict.get() returns as None.
    i_title = doc_i.get("title") or ""
    i_projectname = doc_i.get("projectname") or ""
    i_projectcode = doc_i.get("projectcode", "")

    for j in range(i + 1, len(documents)):
        doc_j = documents[j]

        # Apply the cheap equality checks before the expensive edit-distance
        # computation; the set of reported pairs is unchanged.
        if doc_j.get("subtype", "") != TARGET_SUBTYPE:
            continue
        if doc_j.get("projectcode", "") != i_projectcode:
            continue

        j_title = doc_j.get("title") or ""
        j_projectname = doc_j.get("projectname") or ""

        # NOTE(review): two empty titles (or two empty projectnames) score
        # 1.0, so pairs that both lack a field always pass this check.
        # Behaviour kept as in the original - confirm whether it is intended.
        if (calculate_repeat_rate(i_title, j_title) >= SIMILARITY_THRESHOLD
                or calculate_repeat_rate(i_projectname, j_projectname) >= SIMILARITY_THRESHOLD):
            data_map.setdefault(doc_i.get("_id", ""), []).append(str(doc_j.get("_id", "")))

# Print the result.
for key, value in data_map.items():
    print(f"{key} -> {value}")
|