from functools import lru_cache
import re

import cpca
import pandas as pd

# Reference workbooks (county, town and city level administrative data).
_COUNTY_XLSX = "//Users//miaobao//Documents//work//PycharmProjects//data_quality//docs//区县.xlsx"
_TOWN_XLSX = "//Users//miaobao//Documents//work//PycharmProjects//data_quality//docs//乡镇.xlsx"
_CITY_XLSX = "//Users//miaobao//Documents//work//PycharmProjects//data_quality//docs//市.xlsx"

# Candidate town/township names: a run of CJK characters ending in 镇 or 乡.
# NOTE(review): the run is greedy, so any CJK text directly in front of the
# town name is captured with it and the exact-name lookup will then miss.
_TOWN_PATTERN = re.compile(r'[\u4e00-\u9fa5]+镇|[\u4e00-\u9fa5]+乡')

_NOT_FOUND = (None, None, None)


@lru_cache(maxsize=1)
def _load_reference_tables():
    """Read the county, town and city workbooks once per process.

    The frames are shared between calls, so callers must not mutate them.
    """
    return (
        pd.read_excel(_COUNTY_XLSX),
        pd.read_excel(_TOWN_XLSX),
        pd.read_excel(_CITY_XLSX),
    )


def _clean(value):
    """Map None, NaN and blank strings to None; return anything else as is."""
    if isinstance(value, str):
        return value if value.strip() else None
    return None if pd.isna(value) else value


def _province_of_city(city_name):
    """Return the province cpca reports for *city_name*, or None."""
    result = cpca.transform([city_name])
    if result.empty:
        return None
    return _clean(result.iloc[0]['省'])


def _lookup_city(df_city, city_code):
    """Return (province, city) for *city_code*, or (None, None) if unknown."""
    matches = df_city[df_city['城市代码'] == city_code]
    if matches.empty:
        return None, None
    city = matches.iloc[0]['城市名称']
    return _province_of_city(city), city


def _lookup_by_town(text, df_county, df_town, df_city):
    """Resolve (province, city, district) from a town name found in *text*."""
    for town in _TOWN_PATTERN.findall(text):
        # Accept either the full or the short town name, and select the row
        # with the same condition so a short-name hit cannot come back empty.
        town_rows = df_town[
            (df_town['乡镇名称'] == town) | (df_town['乡镇简称'] == town)
        ]
        if town_rows.empty:
            continue
        county_code = town_rows.iloc[0]['区县代码']
        county_rows = df_county[df_county['区县代码'] == county_code]
        if county_rows.empty:
            continue
        county = county_rows.iloc[0]
        province, city = _lookup_city(df_city, county['城市代码'])
        # The county name is the district; it is never a province.
        return province, city, county['区县名称']
    return _NOT_FOUND


def _lookup_by_county_code(df, df_county, df_city):
    """Resolve the location from a '区县代码' value in the cpca frame, if any."""
    if df.empty or '区县代码' not in df.columns:
        return _NOT_FOUND
    if '区县代码' not in df_county.columns:
        return _NOT_FOUND
    county_code = _clean(df.iloc[0]['区县代码'])
    if county_code is None:
        return _NOT_FOUND
    # NOTE(review): the county code is matched against the city-code column,
    # exactly as the original did - confirm the two code spaces really agree.
    province, city = _lookup_city(df_city, county_code)
    county_rows = df_county[df_county['区县代码'] == county_code]
    district = None if county_rows.empty else county_rows.iloc[0]['区县名称']
    return province, city, district


def get_city_info(text):
    """Extract the province, city and district mentioned in *text*.

    cpca is tried first. When it recognises nothing, a town name found in the
    text is resolved through the town, county and city workbooks, and as a
    last resort a '区县代码' value in the cpca result is used when present.

    Args:
        text: Free-form address or description text.

    Returns:
        A ``(province, city, district)`` tuple. Parts that could not be
        determined are ``None``; blank or NaN values from cpca are reported
        as ``None`` as well. Non-string or blank input yields all ``None``.
    """
    if not isinstance(text, str) or not text.strip():
        return _NOT_FOUND

    df_county, df_town, df_city = _load_reference_tables()

    # First pass: let cpca recognise the place names.
    df = cpca.transform([text])
    location = _NOT_FOUND
    if not df.empty:
        first = df.iloc[0]
        location = (_clean(first['省']), _clean(first['市']), _clean(first['区']))

    # Second pass: resolve a town name through the reference workbooks.
    if location == _NOT_FOUND:
        location = _lookup_by_town(text, df_county, df_town, df_city)

    # Last resort: a county code carried by the cpca result itself.
    if location == _NOT_FOUND:
        location = _lookup_by_county_code(df, df_county, df_city)

    return location


if __name__ == '__main__':
    # Usage example.
    province, city, district = get_city_info("河南省开发的")
    # The function returns a tuple, so compare against a tuple (a list never
    # compares equal to it).
    if get_city_info("电动蝶阀待开发的") == (None, None, None):
        print("44444")
    print(province, city, district)