area.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134
  1. import os
  2. import re
  3. import pandas as pd
  4. class AreaChecker(object):
  5. def __init__(self):
  6. self.errors_tables = {
  7. "0101": {
  8. "name": "全国类数据",
  9. "parent_name": "全国类型",
  10. "parent_code": "01",
  11. "checkFn": self.check0101
  12. },
  13. # "0201": {
  14. # "name": "没有市县正文中有",
  15. # "parent_name": "市县类型",
  16. # "parent_code": "01",
  17. # "checkFn": self.check0201
  18. # },
  19. "0301": {
  20. "name": "省份不在[2,3]个字之间",
  21. "parent_name": "长度异常类型",
  22. "parent_code": "03",
  23. "checkFn": self.check0301
  24. },
  25. "0302": {
  26. "name": "城市不在[3,11]个字之间",
  27. "parent_name": "长度异常类型",
  28. "parent_code": "03",
  29. "checkFn": self.check0302
  30. },
  31. "0303": {
  32. "name": "区县不在[2,15]个字之间",
  33. "parent_name": "长度异常类型",
  34. "parent_code": "03",
  35. "checkFn": self.check0303
  36. }
  37. }
  38. def check0101(self, area: str) -> bool:
  39. """
  40. return true 代表返回异常
  41. """
  42. return area == "全国"
  43. def check0201(self, area, city, district, detail) -> bool:
  44. def find_city_info(query):
  45. for index, row in df.iterrows():
  46. if query in str(row["地市"]) or query in str(row["地区代码"]):
  47. return {
  48. "省份": row["省份"],
  49. "地市": row["地市"],
  50. "区县": row["区县"],
  51. "父级地区代码": row["父级地区代码"],
  52. "地区代码": row["地区代码"]
  53. }
  54. return None
  55. # 获取当前脚本所在目录的上一级目录
  56. current_dir = os.path.dirname(__file__)
  57. # parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
  58. # 构建 Excel 文件路径
  59. xls_file = os.path.join(current_dir, '..', '..', 'docs', 'aera.xls')
  60. # 加载地区代码的XLS文件
  61. # xls_file = "C:\\Users\\25503\\PycharmProjects\\data_quality\\docs\\table_head_doc\\aera.xls"
  62. df = pd.read_excel(xls_file)
  63. detail = re.sub(r"<.*?>", "", detail)
  64. # 提取采购单位名称
  65. unit_name_pattern = r"采购单位名称:([\s\S]+?)\n"
  66. unit_name_match = re.search(unit_name_pattern, detail)
  67. purchase_unit_name = unit_name_match.group(1) if unit_name_match else None
  68. # 提取采购单位地址
  69. purchase_unit_address = r"采购单位地址:([\s\S]+?)\n"
  70. address_match = re.search(purchase_unit_address, detail)
  71. city_from_unit_address = address_match.group(1) if address_match else None
  72. # 提取项目所在行政区划编码
  73. project_district_code = r"项目所在行政区划编码:(\d+)"
  74. district_code_match = re.search(project_district_code, detail)
  75. district = district_code_match.group(1) if district_code_match else None
  76. print(f"提取到的区县: {district}")
  77. print(f"提取到的地市: {city}")
  78. print(f"提取到的省份: {area}")
  79. if not district:
  80. if purchase_unit_name:
  81. city_info = find_city_info(purchase_unit_name)
  82. print(f"根据单位名称找到的城市信息: {city_info}")
  83. if city_info is None and city_from_unit_address:
  84. city_info = find_city_info(city_from_unit_address)
  85. if city_info:
  86. if city_info["省份"] == area and city_info["地市"] == city:
  87. print("省份与区域一致,地市与城市一致,不做处理")
  88. else:
  89. print("省份或地市与区域不一致,标记为true")
  90. return True
  91. else:
  92. print("未找到匹配的城市信息,返回true")
  93. return True
  94. else:
  95. print("district不为空,保持原有标记")
  96. return False # 保持原有的标记
  97. #省份不在[2,3]个字之间
  98. def check0301(self, area: str) -> bool:
  99. """
  100. return true 代表返回异常
  101. """
  102. if 2 <= len(area) <= 3:
  103. return False
  104. return True
  105. #城市不在[3,11]个字之间
  106. def check0302(self, city: str) -> bool:
  107. """
  108. return true 代表返回异常
  109. """
  110. if 3 <= len(city) <= 11:
  111. return False
  112. return True
  113. #区县不在[2,15]个字之间
  114. def check0303(self, district: str) -> bool:
  115. """
  116. return true 代表返回异常
  117. """
  118. if 2 <= len(district) <= 15:
  119. return False
  120. return True