123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687 |
- import os
- import re
- import pandas as pd
class AreaChecker:
    """Checks that flag records whose region fields look inconsistent.

    ``errors_tables`` maps a four-character rule code to its display name,
    the name and code of its parent category, and the bound method that
    implements the rule (``checkFn``).
    """

    # Columns copied from the area table into a lookup result.
    _AREA_COLUMNS = ("省份", "地市", "区县", "父级地区代码", "地区代码")

    # Field extractors for the announcement body.  Both the ASCII and the
    # full-width colon are accepted, and a value never spans several lines.
    _TAG_RE = re.compile(r"<.*?>")
    _UNIT_NAME_RE = re.compile(r"采购单位名称[::][ \t]*([^\n]+)\n")
    _UNIT_ADDRESS_RE = re.compile(r"采购单位地址[::][ \t]*([^\n]+)\n")
    _DISTRICT_CODE_RE = re.compile(r"项目所在行政区划编码[::]\s*(\d+)")

    # Lazily loaded area-code table; stays None until a check needs it.
    _area_table = None

    def __init__(self):
        self.errors_tables = {
            "0101": {
                "name": "全国类数据",
                "parent_name": "全国类型",
                "parent_code": "01",
                "checkFn": self.check0101,
            },
            "0201": {
                "name": "没有市县正文中有",
                "parent_name": "市县类型",
                # Was "01", which collided with the parent of rule 0101
                # although the parent category is a different one.
                "parent_code": "02",
                "checkFn": self.check0201,
            },
        }

    def check0101(self, area: str) -> bool:
        """Return True when *area* is exactly the nationwide marker "全国"."""
        return area == "全国"

    def check0201(self, area, city, district, detail, area_table=None) -> bool:
        """Flag a record whose province/city disagree with its body text.

        The body text is stripped of markup, then the purchasing-unit name,
        the purchasing-unit address and the project district code are
        extracted from it.

        * A district code in the text means the record is left alone.
        * Otherwise the unit name, then the unit address, is looked up in
          the area table.  The record is flagged when nothing matches or
          when the matched province/city differ from *area* / *city*.
        * When neither a name nor an address can be extracted there is
          nothing to compare against and the record is not flagged.

        NOTE(review): the *district* argument is never read; as in the
        previous implementation, only the code extracted from *detail* is
        used.  Confirm this is intended.

        Args:
            area: Province recorded for the entry.
            city: City recorded for the entry.
            district: Unused, kept for interface compatibility.
            detail: Body text, possibly containing markup.  A non-string
                value is treated as empty text.
            area_table: Optional DataFrame with the columns listed in
                ``_AREA_COLUMNS``.  Defaults to ``docs/area.xlsx`` located
                one directory above this module.

        Returns:
            True when the record should be flagged, False otherwise.
        """
        raw_text = detail if isinstance(detail, str) else ""
        text = self._TAG_RE.sub("", raw_text)

        unit_name = self._extract_field(self._UNIT_NAME_RE, text)
        unit_address = self._extract_field(self._UNIT_ADDRESS_RE, text)
        district = self._extract_field(self._DISTRICT_CODE_RE, text)

        print(f"提取到的区县: {district}")
        print(f"提取到的地市: {city}")
        print(f"提取到的省份: {area}")

        if district:
            print("district不为空,保持原有标记")
            return False

        if not unit_name and not unit_address:
            # Nothing in the text to look up, so there is no evidence of a
            # mismatch.
            return False

        # Only touch the workbook once a lookup is actually required.
        if area_table is None:
            area_table = self._load_area_table()

        city_info = None
        if unit_name:
            city_info = self._find_city_info(area_table, unit_name)
            print(f"根据单位名称找到的城市信息: {city_info}")
        if city_info is None and unit_address:
            city_info = self._find_city_info(area_table, unit_address)

        if city_info is None:
            print("未找到匹配的城市信息,返回true")
            return True

        if city_info["省份"] == area and city_info["地市"] == city:
            print("省份与区域一致,地市与城市一致,不做处理")
            return False

        print("省份或地市与区域不一致,标记为true")
        return True

    def _load_area_table(self):
        """Read ``docs/area.xlsx`` once and cache it on the instance."""
        if self._area_table is None:
            module_dir = os.path.dirname(os.path.abspath(__file__))
            parent_dir = os.path.dirname(module_dir)
            xls_file = os.path.join(parent_dir, 'docs', 'area.xlsx')
            self._area_table = pd.read_excel(xls_file)
        return self._area_table

    @staticmethod
    def _extract_field(pattern, text):
        """Return the stripped first group of *pattern* in *text*, or None."""
        match = pattern.search(text)
        if match is None:
            return None
        return match.group(1).strip() or None

    @classmethod
    def _find_city_info(cls, area_table, query):
        """Return the first area-table row matching *query*, or None.

        Two passes are made so that every match of the earlier rule keeps
        its result:

        1. *query* is a substring of the city name or of the area code.
        2. the city name is a substring of *query*, which is what resolves
           a unit name or address that merely mentions the city.
        """
        if not query:
            return None

        rows = area_table.to_dict("records")

        for row in rows:
            if query in str(row["地市"]) or query in str(row["地区代码"]):
                return {column: row[column] for column in cls._AREA_COLUMNS}

        for row in rows:
            city_name = row["地市"]
            # Skip NaN / blank cells: an empty name is "in" every string.
            if not isinstance(city_name, str):
                continue
            city_name = city_name.strip()
            if city_name and city_name in query:
                return {column: row[column] for column in cls._AREA_COLUMNS}

        return None
|