# area.py (3.2 KB)

  1. import re
  2. import pandas as pd
  3. class AreaChecker(object):
  4. def __init__(self):
  5. self.errors_tables = {
  6. "0101": {
  7. "name": "全国类数据",
  8. "parent_name": "全国类型",
  9. "parent_code": "01",
  10. "checkFn": self.check0101
  11. },
  12. "0201": {
  13. "name": "没有市县正文中有",
  14. "parent_name": "市县类型",
  15. "parent_code": "01",
  16. "checkFn": self.check0201
  17. }
  18. }
  19. def check0101(self, area: str) -> bool:
  20. return area == "全国"
  21. def check0201(self, area, city, district, detail) -> bool:
  22. def find_city_info(query):
  23. for index, row in df.iterrows():
  24. if query in str(row["地市"]) or query in str(row["地区代码"]):
  25. return {
  26. "省份": row["省份"],
  27. "地市": row["地市"],
  28. "区县": row["区县"],
  29. "父级地区代码": row["父级地区代码"],
  30. "地区代码": row["地区代码"]
  31. }
  32. return None
  33. # 加载地区代码的XLS文件
  34. xls_file = "C:\\Users\\25503\\PycharmProjects\\data_quality\\docs\\table_head_doc\\aera.xls"
  35. df = pd.read_excel(xls_file)
  36. detail = re.sub(r"<.*?>", "", detail)
  37. # 提取采购单位名称
  38. unit_name_pattern = r"采购单位名称:([\s\S]+?)\n"
  39. unit_name_match = re.search(unit_name_pattern, detail)
  40. purchase_unit_name = unit_name_match.group(1) if unit_name_match else None
  41. # 提取采购单位地址
  42. purchase_unit_address = r"采购单位地址:([\s\S]+?)\n"
  43. address_match = re.search(purchase_unit_address, detail)
  44. city_from_unit_address = address_match.group(1) if address_match else None
  45. # 提取项目所在行政区划编码
  46. project_district_code = r"项目所在行政区划编码:(\d+)"
  47. district_code_match = re.search(project_district_code, detail)
  48. district = district_code_match.group(1) if district_code_match else None
  49. print(f"提取到的区县: {district}")
  50. print(f"提取到的地市: {city}")
  51. print(f"提取到的省份: {area}")
  52. if not district:
  53. if purchase_unit_name:
  54. city_info = find_city_info(purchase_unit_name)
  55. print(f"根据单位名称找到的城市信息: {city_info}")
  56. if city_info is None and city_from_unit_address:
  57. city_info = find_city_info(city_from_unit_address)
  58. if city_info:
  59. if city_info["省份"] == area and city_info["地市"] == city:
  60. print("省份与区域一致,地市与城市一致,不做处理")
  61. else:
  62. print("省份或地市与区域不一致,标记为true")
  63. return True
  64. else:
  65. print("未找到匹配的城市信息,返回true")
  66. return True
  67. else:
  68. print("district不为空,保持原有标记")
  69. return False # 保持原有的标记