123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081 |
- import cpca
- import re
- import pandas as pd
# Directory holding the reference spreadsheets 区县.xlsx, 乡镇.xlsx and 市.xlsx.
_DEFAULT_DATA_DIR = "//Users//miaobao//Documents//work//PycharmProjects//data_quality//docs"

# Runs of CJK characters that end in 镇 (town) or 乡 (township).
_TOWN_PATTERN = re.compile(r'[\u4e00-\u9fa5]+镇|[\u4e00-\u9fa5]+乡')

# data_dir -> (df_county, df_town, df_city). Filled on first use so the
# spreadsheets are parsed once per process instead of once per call.
_TABLE_CACHE = {}


def _load_tables(data_dir):
    """Return the (county, town, city) tables for *data_dir*, reading each file once."""
    if data_dir not in _TABLE_CACHE:
        _TABLE_CACHE[data_dir] = tuple(
            pd.read_excel(f"{data_dir}//{name}.xlsx")
            for name in ("区县", "乡镇", "市")
        )
    return _TABLE_CACHE[data_dir]


def _clean(value):
    """Collapse the "no match" markers None, NaN and empty string to None."""
    if value is None or value == "" or pd.isna(value):
        return None
    return value


def _first_match(table, mask):
    """Return the first row of *table* selected by *mask*, or None if none match."""
    rows = table[mask]
    return None if rows.empty else rows.iloc[0]


def _town_candidates(text):
    """Yield possible town names found in *text*, longest first for each regex hit."""
    for run in _TOWN_PATTERN.findall(text):
        # The pattern is greedy, so a hit such as "我在白沙镇" includes the
        # characters in front of the actual town name. Trying every suffix of
        # at least two characters lets an embedded name match the table.
        for start in range(len(run) - 1):
            yield run[start:]


def get_city_info(text, data_dir=_DEFAULT_DATA_DIR):
    """Extract the province, city and district mentioned in an address-like text.

    The text is first parsed with ``cpca.transform``; its ``省``, ``市`` and
    ``区`` columns are returned when at least one of them is filled. Otherwise
    the text is searched for town/township names (ending in ``镇`` or ``乡``),
    which are resolved through the reference spreadsheets:
    town -> ``区县代码`` -> county (``区县名称``) -> ``城市代码`` -> city
    (``城市名称``). The province is then obtained by running cpca on the city
    name.

    Args:
        text: The text to analyse. Empty, blank or non-string input yields
            ``(None, None, None)`` without touching cpca or the spreadsheets.
        data_dir: Directory containing ``区县.xlsx``, ``乡镇.xlsx`` and
            ``市.xlsx``. The files are only read when the town fallback is
            needed, and are cached per directory.

    Returns:
        A ``(province, city, district)`` tuple. Every element that could not
        be determined is ``None``.
    """
    if not isinstance(text, str) or not text.strip():
        return None, None, None

    province = city = district = None

    parsed = cpca.transform([text])
    if not parsed.empty:
        first = parsed.iloc[0]
        # Normalise so the fallback below triggers whether cpca reports a
        # missing level as None, NaN or an empty string.
        province = _clean(first['省'])
        city = _clean(first['市'])
        district = _clean(first['区'])
    if province or city or district:
        return province, city, district

    df_county, df_town, df_city = _load_tables(data_dir)
    for candidate in _town_candidates(text):
        # A town may be listed under its full name or its short name; search
        # both columns so a short-name hit does not end in an empty lookup.
        town = _first_match(
            df_town,
            (df_town['乡镇名称'] == candidate) | (df_town['乡镇简称'] == candidate),
        )
        if town is None:
            continue

        county = _first_match(df_county, df_county['区县代码'] == town['区县代码'])
        if county is None:
            # Town row points at an unknown county code; try the next candidate.
            continue
        district = county['区县名称']

        city_row = _first_match(df_city, df_city['城市代码'] == county['城市代码'])
        if city_row is not None:
            city = city_row['城市名称']
            # Let cpca map the city name to its province.
            resolved = cpca.transform([city])
            if not resolved.empty:
                province = _clean(resolved.iloc[0]['省'])
        break  # the first resolvable town wins

    return province, city, district
if __name__ == '__main__':
    # Usage example: text that contains a recognisable province.
    province, city, district = get_city_info("河南省开发的")
    # get_city_info returns a tuple, so compare against a tuple; a comparison
    # with a list is always False and the marker below would never print.
    if get_city_info("电动蝶阀待开发的") == (None, None, None):
        print("44444")
    print(province, city, district)
|