# repeat.py - checks tender-notice records for near-duplicate entries.

from itertools import combinations

from bson import ObjectId
from pymongo import MongoClient

from lib.mongo_tools import MongoUtil, Data_save, MongoSentence
  4. #判断标讯数据的重复率
  5. def calculate_repeat_rate(str1, str2):
  6. import Levenshtein
  7. # str1 = "茹河、红河河流沟道生态治理与农田整治提升项目(六盘山森林生态保护修复单元)-彭阳县茹河流域乃河水库至店洼水库生态缓冲带修复工程一期"
  8. # str2 = "茹河、红河河流沟道生态治理与农田整治提升项目(六盘山森林生态保护修复单元)彭阳县茹河流域乃河水库至店洼水库生态缓冲带修复工程一期"
  9. distance = Levenshtein.distance(str1, str2)
  10. max_length = max(len(str1), len(str2))
  11. if max_length != 0:
  12. similarity = 1 - distance / max_length
  13. return similarity
  14. else:
  15. return 1.0
  16. # 连接到目标数据库
  17. db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
  18. coll_user = db["liantong"]
  19. data_map={}
  20. documents = list(coll_user.find({ "flag" : { "$ne" : "cfsj" } }).sort("_id",1))
  21. for i in range(len(documents)):
  22. for j in range(i+1,len(documents)):
  23. i_title=documents[i].get("title", "")
  24. j_title = documents[j].get("title", "")
  25. i_projectname=documents[i].get("projectname", "")
  26. j_projectname = documents[j].get("projectname", "")
  27. radio1=calculate_repeat_rate(i_title,j_title)
  28. radio2=calculate_repeat_rate(i_projectname,j_projectname)
  29. if radio1>=0.9 or radio2>=0.9:
  30. if documents[i].get("projectcode","")== documents[j].get("projectcode","") and documents[i].get("subtype","")== documents[j].get("subtype","")=='竞谈':
  31. data_map.setdefault(documents[i].get("_id",""),[]).append(str(documents[j].get("_id","")))
  32. # and documents[i].get("subtype","")== documents[j].get("subtype","")=='采购意向'
  33. # and documents[i].get("subtype","")==documents[j].get("subtype","")
  34. # for i in range(len(documents)):
  35. # for j in range(i+1,len(documents)):
  36. # i_id=documents[i].get("id", "")
  37. # j_id = documents[j].get("id", "")
  38. # if i_id==j_id :
  39. # data_map.setdefault(documents[i].get("_id",""),[]).append(str(documents[j].get("_id","")))
  40. # 打印结果
  41. for key, value in data_map.items():
  42. print(f"{key} -> {value}")