import os
import re

import pandas as pd


class AreaChecker(object):
    """Collection of anomaly checks for province / city / district fields.

    Every ``checkXXXX`` method returns ``True`` when the inspected value is
    considered abnormal and ``False`` otherwise.  ``errors_tables`` maps an
    error code to its display name, its parent category and the bound method
    that performs the check.
    """

    def __init__(self):
        # Cache for the area-code spreadsheet; filled on first use by
        # _load_area_table() so the file is read at most once per instance.
        self._area_df = None
        self.errors_tables = {
            "0101": {
                "name": "全国类数据",
                "parent_name": "全国类型",
                "parent_code": "01",
                "checkFn": self.check0101
            },
            # Rule 0201 is currently disabled; the entry is kept for reference.
            # NOTE(review): its parent_code is "01" although the code prefix
            # is "02" -- confirm before re-enabling.
            # "0201": {
            #     "name": "没有市县正文中有",
            #     "parent_name": "市县类型",
            #     "parent_code": "01",
            #     "checkFn": self.check0201
            # },
            "0301": {
                "name": "省份不在[2,3]个字之间",
                "parent_name": "长度异常类型",
                "parent_code": "03",
                "checkFn": self.check0301
            },
            "0302": {
                "name": "城市不在[3,11]个字之间",
                "parent_name": "长度异常类型",
                "parent_code": "03",
                "checkFn": self.check0302
            },
            "0303": {
                "name": "区县不在[2,15]个字之间",
                "parent_name": "长度异常类型",
                "parent_code": "03",
                "checkFn": self.check0303
            }
        }

    @staticmethod
    def _length_out_of_range(value, low, high) -> bool:
        """Return True unless *value* is a string with low <= len <= high."""
        # Missing values (None, NaN, ...) cannot satisfy the length rule, so
        # they are reported as abnormal instead of crashing on len().
        if not isinstance(value, str):
            return True
        return not (low <= len(value) <= high)

    def _load_area_table(self):
        """Return the area-code table, reading the spreadsheet on first use."""
        # getattr() keeps this working on instances built without __init__.
        table = getattr(self, "_area_df", None)
        if table is None:
            # The spreadsheet lives two directories above this file, in docs/.
            current_dir = os.path.dirname(__file__)
            xls_file = os.path.join(current_dir, '..', '..', 'docs', 'aera.xls')
            table = pd.read_excel(xls_file)
            self._area_df = table
        return table

    def _find_city_info(self, query):
        """Return the first area-table row matching *query*, or None.

        A row matches when *query* is a substring of its "地市" or "地区代码"
        cell (both compared as strings).
        """
        # NOTE(review): the test is `query in cell`, i.e. the whole unit name
        # or address must be contained in the city name / code.  The reverse
        # direction may have been intended -- confirm with the rule owner.
        for _, row in self._load_area_table().iterrows():
            if query in str(row["地市"]) or query in str(row["地区代码"]):
                return {
                    "省份": row["省份"],
                    "地市": row["地市"],
                    "区县": row["区县"],
                    "父级地区代码": row["父级地区代码"],
                    "地区代码": row["地区代码"]
                }
        return None

    def check0101(self, area: str) -> bool:
        """Return True (abnormal) when *area* is exactly "全国"."""
        return area == "全国"

    def check0201(self, area, city, district, detail) -> bool:
        """Cross-check *area* / *city* against the notice body *detail*.

        HTML-like tags are stripped from *detail*, then the purchasing unit
        name, the purchasing unit address and the project district code are
        extracted with regular expressions.

        * If a district code is found in the text, the record is left
          untouched and False is returned.
        * Otherwise the unit name, then the unit address, is looked up in the
          area table.  True (abnormal) is returned when nothing matches or
          when the matched province / city differ from *area* / *city*;
          False is returned when both agree.

        The *district* argument is accepted for interface compatibility but
        its incoming value is not used: only the code extracted from
        *detail* is considered.  Progress messages are printed to stdout.
        """
        # A non-string body (None, NaN) is treated as empty text.
        text = re.sub(r"<.*?>", "", detail if isinstance(detail, str) else "")

        # Purchasing unit name.
        unit_name_match = re.search(r"采购单位名称:([\s\S]+?)\n", text)
        purchase_unit_name = unit_name_match.group(1) if unit_name_match else None

        # Purchasing unit address.
        address_match = re.search(r"采购单位地址:([\s\S]+?)\n", text)
        city_from_unit_address = address_match.group(1) if address_match else None

        # Administrative division code of the project location.
        district_code_match = re.search(r"项目所在行政区划编码:(\d+)", text)
        district = district_code_match.group(1) if district_code_match else None

        print(f"提取到的区县: {district}")
        print(f"提取到的地市: {city}")
        print(f"提取到的省份: {area}")

        if district:
            print("district不为空,保持原有标记")
            return False  # keep the existing flag

        # Start from None so a notice without a unit name can still fall back
        # to the address lookup instead of hitting an unbound local.
        city_info = None
        if purchase_unit_name:
            city_info = self._find_city_info(purchase_unit_name)
            print(f"根据单位名称找到的城市信息: {city_info}")
        if city_info is None and city_from_unit_address:
            city_info = self._find_city_info(city_from_unit_address)

        if not city_info:
            print("未找到匹配的城市信息,返回true")
            return True

        if city_info["省份"] == area and city_info["地市"] == city:
            print("省份与区域一致,地市与城市一致,不做处理")
            return False  # keep the existing flag

        print("省份或地市与区域不一致,标记为true")
        return True

    # Province name must be 2 to 3 characters long.
    def check0301(self, area: str) -> bool:
        """Return True (abnormal) when *area* is not 2-3 characters long."""
        return self._length_out_of_range(area, 2, 3)

    # City name must be 3 to 11 characters long.
    def check0302(self, city: str) -> bool:
        """Return True (abnormal) when *city* is not 3-11 characters long."""
        return self._length_out_of_range(city, 3, 11)

    # District name must be 2 to 15 characters long.
    def check0303(self, district: str) -> bool:
        """Return True (abnormal) when *district* is not 2-15 characters long."""
        return self._length_out_of_range(district, 2, 15)