# area.py (4.8 KB)
import os
import re
from typing import Any, Dict, Optional

import pandas as pd


  4. class AreaChecker(object):
  5. def __init__(self):
  6. self.errors_tables = {
  7. "0101": {
  8. "name": "全国类数据",
  9. "parent_name": "全国类型",
  10. "parent_code": "01",
  11. "checkFn": self.check0101
  12. },
  13. # "0201": {
  14. # "name": "没有市县正文中有",
  15. # "parent_name": "市县类型",
  16. # "parent_code": "01",
  17. # "checkFn": self.check0201
  18. # },
  19. "0301": {
  20. "name": "省份不在[2,3]个字之间",
  21. "parent_name": "长度异常类型",
  22. "parent_code": "03",
  23. "checkFn": self.check0301
  24. },
  25. "0401": {
  26. "name": "不在31省份中",
  27. "parent_name": "内容异常类型",
  28. "parent_code": "04",
  29. "checkFn": self.check0401
  30. }
  31. }
  32. def check0101(self, area: str) -> bool:
  33. """
  34. return true 代表返回异常
  35. """
  36. return area == "全国"
  37. def check0201(self, area, city, district, detail) -> bool:
  38. def find_city_info(query):
  39. for index, row in df.iterrows():
  40. if query in str(row["地市"]) or query in str(row["地区代码"]):
  41. return {
  42. "省份": row["省份"],
  43. "地市": row["地市"],
  44. "区县": row["区县"],
  45. "父级地区代码": row["父级地区代码"],
  46. "地区代码": row["地区代码"]
  47. }
  48. return None
  49. # 获取当前脚本所在目录的上一级目录
  50. current_dir = os.path.dirname(__file__)
  51. # parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
  52. # 构建 Excel 文件路径
  53. xls_file = os.path.join(current_dir, '..', '..', 'docs', 'aera.xls')
  54. # 加载地区代码的XLS文件
  55. # xls_file = "C:\\Users\\25503\\PycharmProjects\\data_quality\\docs\\table_head_doc\\aera.xls"
  56. df = pd.read_excel(xls_file)
  57. detail = re.sub(r"<.*?>", "", detail)
  58. # 提取采购单位名称
  59. unit_name_pattern = r"采购单位名称:([\s\S]+?)\n"
  60. unit_name_match = re.search(unit_name_pattern, detail)
  61. purchase_unit_name = unit_name_match.group(1) if unit_name_match else None
  62. # 提取采购单位地址
  63. purchase_unit_address = r"采购单位地址:([\s\S]+?)\n"
  64. address_match = re.search(purchase_unit_address, detail)
  65. city_from_unit_address = address_match.group(1) if address_match else None
  66. # 提取项目所在行政区划编码
  67. project_district_code = r"项目所在行政区划编码:(\d+)"
  68. district_code_match = re.search(project_district_code, detail)
  69. district = district_code_match.group(1) if district_code_match else None
  70. print(f"提取到的区县: {district}")
  71. print(f"提取到的地市: {city}")
  72. print(f"提取到的省份: {area}")
  73. if not district:
  74. if purchase_unit_name:
  75. city_info = find_city_info(purchase_unit_name)
  76. print(f"根据单位名称找到的城市信息: {city_info}")
  77. if city_info is None and city_from_unit_address:
  78. city_info = find_city_info(city_from_unit_address)
  79. if city_info:
  80. if city_info["省份"] == area and city_info["地市"] == city:
  81. print("省份与区域一致,地市与城市一致,不做处理")
  82. else:
  83. print("省份或地市与区域不一致,标记为true")
  84. return True
  85. else:
  86. print("未找到匹配的城市信息,返回true")
  87. return True
  88. else:
  89. print("district不为空,保持原有标记")
  90. return False # 保持原有的标记
  91. #省份不在[2,3]个字之间
  92. def check0301(self, area: str) -> bool:
  93. """
  94. return true 代表返回异常
  95. """
  96. if area == '':
  97. return True
  98. if 2 <= len(area) <= 3:
  99. return False
  100. return True
  101. def check0401(self, area: str) -> bool:
  102. """
  103. return true 代表返回异常
  104. """
  105. area_31 =[
  106. "河北", "山西", "辽宁", "吉林", "黑龙江",
  107. "江苏", "浙江", "安徽", "福建", "江西",
  108. "山东", "河南", "湖北", "湖南", "广东",
  109. "海南", "四川", "贵州", "云南", "陕西",
  110. "甘肃", "青海", "内蒙古", "广西", "西藏",
  111. "宁夏", "新疆", "北京", "天津", "上海",
  112. "重庆"
  113. ]
  114. if area in area_31:
  115. return False
  116. return True