|
@@ -1,16 +1,23 @@
|
|
|
|
+import os
|
|
import cpca
|
|
import cpca
|
|
import re
|
|
import re
|
|
import pandas as pd
|
|
import pandas as pd
|
|
|
|
|
|
def get_city_info(text):
|
|
def get_city_info(text):
|
|
- # 读取区县数据
|
|
|
|
- df_county = pd.read_excel("//Users//miaobao//Documents//work//PycharmProjects//data_quality//docs//区县.xlsx")
|
|
|
|
|
|
+ # 获取当前脚本所在目录的上一级目录
|
|
|
|
+ current_dir = os.path.dirname(__file__)
|
|
|
|
+ parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
|
|
|
|
|
|
|
|
+ # 读取区县数据
|
|
|
|
+ df_county_addr = os.path.join(parent_dir, 'docs', '区县.xlsx')
|
|
|
|
+ df_county=pd.read_excel(df_county_addr)
|
|
# 读取乡镇数据
|
|
# 读取乡镇数据
|
|
- df_town = pd.read_excel("//Users//miaobao//Documents//work//PycharmProjects//data_quality//docs//乡镇.xlsx")
|
|
|
|
|
|
+ df_town_addr = os.path.join(parent_dir, 'docs', '乡镇.xlsx')
|
|
|
|
+ df_town = pd.read_excel(df_town_addr)
|
|
|
|
|
|
# 读取市级数据
|
|
# 读取市级数据
|
|
- df_city = pd.read_excel("//Users//miaobao//Documents//work//PycharmProjects//data_quality//docs//市.xlsx")
|
|
|
|
|
|
+ df_city_addr = os.path.join(parent_dir, 'docs', '市.xlsx')
|
|
|
|
+ df_city = pd.read_excel(df_city_addr)
|
|
|
|
|
|
# 使用cpca库提取地名
|
|
# 使用cpca库提取地名
|
|
df = cpca.transform([text])
|
|
df = cpca.transform([text])
|
|
@@ -26,7 +33,7 @@ def get_city_info(text):
|
|
|
|
|
|
if province is None and city is None and district is None:
|
|
if province is None and city is None and district is None:
|
|
# 使用正则表达式提取乡镇信息
|
|
# 使用正则表达式提取乡镇信息
|
|
- towns = re.findall(r'[\u4e00-\u9fa5]+镇|[\u4e00-\u9fa5]+乡', text)
|
|
|
|
|
|
+ towns = re.findall(r'[\u4e00-\u9fa5]+镇|[\u4e00-\u9fa5]+乡|[\u4e00-\u9fa5]+街道|[\u4e00-\u9fa5]+庄|[\u4e00-\u9fa5]+营|[\u4e00-\u9fa5]+店', text)
|
|
if towns:
|
|
if towns:
|
|
for town in towns:
|
|
for town in towns:
|
|
town_name = None
|
|
town_name = None
|
|
@@ -55,27 +62,15 @@ def get_city_info(text):
|
|
province = df_city_result.iloc[0]['省']
|
|
province = df_city_result.iloc[0]['省']
|
|
|
|
|
|
break# 找到乡镇信息后跳出循环
|
|
break# 找到乡镇信息后跳出循环
|
|
-
|
|
|
|
- if not province and not city and not district and '区县代码' in df_county.columns:
|
|
|
|
- county_code = df.iloc[0]['区县代码']
|
|
|
|
- city_info = df_city[df_city['城市代码'] == county_code]
|
|
|
|
- if not city_info.empty:
|
|
|
|
- city = city_info.iloc[0]['城市名称']
|
|
|
|
-
|
|
|
|
- # 将城市名称转换成对应的省份名称
|
|
|
|
- df_city_result = cpca.transform([city])
|
|
|
|
- province = df_city_result.iloc[0]['省']
|
|
|
|
-
|
|
|
|
- county_info = df_county[df_county['区县代码'] == county_code].iloc[0]
|
|
|
|
- district = county_info['区县名称']
|
|
|
|
-
|
|
|
|
|
|
+ else:
|
|
|
|
+ continue
|
|
return province, city, district
|
|
return province, city, district
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
|
|
|
|
# 使用方法示例
|
|
# 使用方法示例
|
|
- province, city, district = get_city_info("河南省开发的")
|
|
|
|
- if get_city_info("电动蝶阀待开发的")==[None,None,None]:
|
|
|
|
|
|
+ province, city, district = get_city_info("杞县文化广电新闻出版旅游局")
|
|
|
|
+ if province==None or city==None or district==None:
|
|
print("44444")
|
|
print("44444")
|
|
print(province, city, district)
|
|
print(province, city, district)
|
|
|
|
|