get_region.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. import cpca
  2. import re
  3. import pandas as pd
  4. def get_city_info(text):
  5. # 读取区县数据
  6. df_county = pd.read_excel("//Users//miaobao//Documents//work//PycharmProjects//data_quality//docs//区县.xlsx")
  7. # 读取乡镇数据
  8. df_town = pd.read_excel("//Users//miaobao//Documents//work//PycharmProjects//data_quality//docs//乡镇.xlsx")
  9. # 读取市级数据
  10. df_city = pd.read_excel("//Users//miaobao//Documents//work//PycharmProjects//data_quality//docs//市.xlsx")
  11. # 使用cpca库提取地名
  12. df = cpca.transform([text])
  13. province = None
  14. city = None
  15. district = None
  16. if not df.empty:
  17. province = df.iloc[0]['省']
  18. city = df.iloc[0]['市']
  19. district = df.iloc[0]['区']
  20. if province is None and city is None and district is None:
  21. # 使用正则表达式提取乡镇信息
  22. towns = re.findall(r'[\u4e00-\u9fa5]+镇|[\u4e00-\u9fa5]+乡', text)
  23. if towns:
  24. for town in towns:
  25. town_name = None
  26. if town in df_town['乡镇名称'].values or town in df_town['乡镇简称'].values:
  27. town_name = town
  28. # 根据乡镇名查找对应的区县代码
  29. town_info = df_town[df_town['乡镇名称'] == town_name].iloc[0]
  30. county_code = town_info['区县代码']
  31. # 根据区县代码查找对应的区县名称
  32. county_info = df_county[df_county['区县代码'] == county_code].iloc[0]
  33. county_name = county_info['区县名称']
  34. # 将区县名称转换成对应的省份名称
  35. province = county_name
  36. # 尝试在市级数据中查找对应的市信息
  37. city_code = county_info['城市代码']
  38. city_info = df_city[df_city['城市代码'] == city_code]
  39. if not city_info.empty:
  40. city = city_info.iloc[0]['城市名称']
  41. # 将城市名称转换成对应的省份名称
  42. df_city_result = cpca.transform([city])
  43. province = df_city_result.iloc[0]['省']
  44. break# 找到乡镇信息后跳出循环
  45. if not province and not city and not district and '区县代码' in df_county.columns:
  46. county_code = df.iloc[0]['区县代码']
  47. city_info = df_city[df_city['城市代码'] == county_code]
  48. if not city_info.empty:
  49. city = city_info.iloc[0]['城市名称']
  50. # 将城市名称转换成对应的省份名称
  51. df_city_result = cpca.transform([city])
  52. province = df_city_result.iloc[0]['省']
  53. county_info = df_county[df_county['区县代码'] == county_code].iloc[0]
  54. district = county_info['区县名称']
  55. return province, city, district
  56. if __name__ == '__main__':
  57. # 使用方法示例
  58. province, city, district = get_city_info("河南省开发的")
  59. if get_city_info("电动蝶阀待开发的")==[None,None,None]:
  60. print("44444")
  61. print(province, city, district)