# area.py
import os
import re
from typing import Optional

import pandas as pd
  4. class AreaChecker(object):
  5. def __init__(self):
  6. self.errors_tables = {
  7. "0101": {
  8. "name": "全国类数据",
  9. "parent_name": "全国类型",
  10. "parent_code": "01",
  11. "checkFn": self.check0101
  12. },
  13. "0201": {
  14. "name": "没有市县正文中有",
  15. "parent_name": "市县类型",
  16. "parent_code": "01",
  17. "checkFn": self.check0201
  18. }
  19. }
  20. def check0101(self, area: str) -> bool:
  21. return area == "全国"
  22. def check0201(self, area, city, district, detail) -> bool:
  23. def find_city_info(query):
  24. for index, row in df.iterrows():
  25. if query in str(row["地市"]) or query in str(row["地区代码"]):
  26. return {
  27. "省份": row["省份"],
  28. "地市": row["地市"],
  29. "区县": row["区县"],
  30. "父级地区代码": row["父级地区代码"],
  31. "地区代码": row["地区代码"]
  32. }
  33. return None
  34. # 获取当前脚本所在目录的上一级目录
  35. current_dir = os.path.dirname(__file__)
  36. parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
  37. # 构建 Excel 文件路径
  38. xls_file = os.path.join(parent_dir, 'docs', 'area.xlsx')
  39. # 加载地区代码的XLS文件
  40. # xls_file = "C:\\Users\\25503\\PycharmProjects\\data_quality\\docs\\table_head_doc\\aera.xls"
  41. df = pd.read_excel(xls_file)
  42. detail = re.sub(r"<.*?>", "", detail)
  43. # 提取采购单位名称
  44. unit_name_pattern = r"采购单位名称:([\s\S]+?)\n"
  45. unit_name_match = re.search(unit_name_pattern, detail)
  46. purchase_unit_name = unit_name_match.group(1) if unit_name_match else None
  47. # 提取采购单位地址
  48. purchase_unit_address = r"采购单位地址:([\s\S]+?)\n"
  49. address_match = re.search(purchase_unit_address, detail)
  50. city_from_unit_address = address_match.group(1) if address_match else None
  51. # 提取项目所在行政区划编码
  52. project_district_code = r"项目所在行政区划编码:(\d+)"
  53. district_code_match = re.search(project_district_code, detail)
  54. district = district_code_match.group(1) if district_code_match else None
  55. print(f"提取到的区县: {district}")
  56. print(f"提取到的地市: {city}")
  57. print(f"提取到的省份: {area}")
  58. if not district:
  59. if purchase_unit_name:
  60. city_info = find_city_info(purchase_unit_name)
  61. print(f"根据单位名称找到的城市信息: {city_info}")
  62. if city_info is None and city_from_unit_address:
  63. city_info = find_city_info(city_from_unit_address)
  64. if city_info:
  65. if city_info["省份"] == area and city_info["地市"] == city:
  66. print("省份与区域一致,地市与城市一致,不做处理")
  67. else:
  68. print("省份或地市与区域不一致,标记为true")
  69. return True
  70. else:
  71. print("未找到匹配的城市信息,返回true")
  72. return True
  73. else:
  74. print("district不为空,保持原有标记")
  75. return False # 保持原有的标记