|
@@ -0,0 +1,142 @@
|
|
|
+from pymongo import MongoClient
|
|
|
+import re
|
|
|
+import pandas as pd
|
|
|
+from bson.objectid import ObjectId
|
|
|
+
|
|
|
# Pre-compiled patterns used by extract_address / extract_buyer_info.
#
# address_pattern: group 1 is the text that follows the "采购单位地址:" label,
# running from the province ("…省") through the first 镇/乡/街/道 character
# that comes after the city, prefecture and county/district segments.
address_pattern = re.compile(
    r"采购单位地址:(.*?省(.*?市|.*?自治区)(.*?市|.*?自治州|.*?地区)(.*?[县区]|.*?市辖区)(.*?[镇乡街道]))")

# buyer_info_pattern: group 1 is the same address, including its
# "采购单位地址:" label, when it directly follows the "采购人信息:" heading.
# The "<>" marker after the heading is optional: process_detail removes every
# "<...>" sequence (which includes a bare "<>") before matching, so a
# mandatory "<>" could never match on that path. The group is non-capturing,
# so group numbering is unchanged.
buyer_info_pattern = re.compile(
    r"采购人信息:(?:<>)?(采购单位地址:.*?省(.*?市|.*?自治区)(.*?市|.*?自治州|.*?地区)(.*?[县区]|.*?市辖区)(.*?[镇乡街道]))")
|
|
|
+
|
|
|
def extract_address(detail):
    """Return the purchasing-unit address found in *detail*.

    Searches *detail* with the module-level ``address_pattern`` and returns
    its first capture group, or ``None`` when the pattern does not match.
    """
    found = address_pattern.search(detail)
    return found.group(1) if found else None
|
|
|
+
|
|
|
def extract_buyer_info(detail):
    """Return the buyer-information address block found in *detail*.

    Searches *detail* with the module-level ``buyer_info_pattern`` and
    returns its first capture group, or ``None`` when the pattern does not
    match.
    """
    found = buyer_info_pattern.search(detail)
    return found.group(1) if found else None
|
|
|
+
|
|
|
def extract_city_from_address(address):
    """Return the first segment after the province marker in *address*.

    The pattern expects "省" followed by three consecutive segments: one
    ending in 市/自治区, one ending in 市/自治州/地区, and one ending in 县/区.
    The first of those segments is returned; ``None`` is returned when the
    pattern does not match.
    """
    found = re.search(r"省(.*?市|.*?自治区)(.*?市|.*?自治州|.*?地区)(.*?[县区]|.*?市辖区)", address)
    if found is None:
        return None
    return found.group(1)
|
|
|
+
|
|
|
def process_detail(detail):
    """Extract two city candidates from an announcement's detail text.

    Every "<...>" sequence is removed from *detail* first. The cleaned text
    is then passed to ``extract_address`` and ``extract_buyer_info``; each
    result that is found is reduced with ``extract_city_from_address``.

    Returns a tuple ``(city_from_address, city_from_buyer_info)``; either
    element is ``None`` when the corresponding text was not found.
    """
    # Strip <...> tags before any matching.
    plain_text = re.sub(r"<.*?>", "", detail)

    # City taken from the purchasing-unit address line.
    unit_address = extract_address(plain_text)
    city_from_address = extract_city_from_address(unit_address) if unit_address else None

    # City taken from the address inside the buyer-information section.
    buyer_block = extract_buyer_info(plain_text)
    city_from_buyer_info = extract_city_from_address(buyer_block) if buyer_block else None

    return city_from_address, city_from_buyer_info
|
|
|
+
|
|
|
def add_region_to_db():
    """Re-check region labelling of ``bidding_source`` documents in MongoDB.

    Connects to MongoDB at 192.168.3.71:29099, iterates over
    ``quality.bidding_source`` and, per document, decides a ``region_qa``
    value from the document's ``area``/``city`` fields and its ``detail``
    text (with "<...>" sequences removed):

    * ``area == "全国"``: ``region_qa`` is set to ``"0101"``.
    * No district code in ``detail``: the purchasing-unit name, then the
      purchasing-unit address line, is looked up with ``find_city_info``.
      If nothing is found, or the found province/city differ from the
      document's ``area``/``city``, ``region_qa`` is set to ``"0201"``.
      If they agree, the document is left untouched.
    * District code present: the existing ``region_qa`` (``""`` when the
      field is absent) is written back.

    Returns ``None``; progress and decisions are printed to stdout.
    """
    # These patterns do not change between documents, so build them once.
    tag_re = re.compile(r"<.*?>")
    unit_name_re = re.compile(r"采购单位名称:([\s\S]+?)\n")
    unit_address_re = re.compile(r"采购单位地址:([\s\S]+?)\n")
    district_code_re = re.compile(r"项目所在行政区划编码:(\d+)")

    # The context manager closes the client when the run finishes or fails.
    with MongoClient('192.168.3.71', 29099, unicode_decode_error_handler="ignore") as client:
        db = client.quality
        bidding_source_collection = db["bidding_source"]

        count = 0

        # Fetch documents from the server in batches of 1000.
        for item in bidding_source_collection.find().batch_size(1000):
            count += 1
            # NOTE: the cap is tested before the document is handled, so a
            # run processes at most 9999 documents.
            if count >= 10000:
                break

            if count % 1000 == 0:
                print(count)

            # A missing, null or non-string detail is treated as empty text
            # instead of crashing the whole run inside re.sub.
            raw_detail = item.get("detail")
            detail = tag_re.sub("", raw_detail) if isinstance(raw_detail, str) else ""

            # Purchasing-unit name.
            unit_name_match = unit_name_re.search(detail)
            purchase_unit_name = unit_name_match.group(1) if unit_name_match else None

            # Purchasing-unit address line.
            address_match = unit_address_re.search(detail)
            city_from_unit_address = address_match.group(1) if address_match else None

            # Administrative district code of the project.
            district_code_match = district_code_re.search(detail)
            district_code = district_code_match.group(1) if district_code_match else None

            if item.get("area") == "全国":
                print("区域为全国,region_qa标记为0101")
                item_region_qa = "0101"

            elif not district_code:
                city_info = None
                if purchase_unit_name:
                    city_info = find_city_info(purchase_unit_name)

                # Fall back to the address line when the name gave no hit.
                if city_info is None and city_from_unit_address:
                    city_info = find_city_info(city_from_unit_address)

                if city_info:
                    # .get() so that documents lacking area/city count as a
                    # mismatch instead of raising KeyError.
                    if city_info["省份"] == item.get("area") and city_info["地市"] == item.get("city"):
                        print("省份与区域一致,地市与城市一致,不做处理")
                        # Nothing to write. Previously this branch fell
                        # through to update_one with item_region_qa either
                        # unbound or still holding the previous document's
                        # value.
                        continue
                    print("省份或地市与区域不一致,region_qa标记为0201")
                    item_region_qa = "0201"
                else:
                    print("未找到匹配的城市信息,region_qa标记为0201")
                    item_region_qa = "0201"

            else:
                print("district不为空,保持原有标记")
                item_region_qa = item.get("region_qa", "")  # keep the existing mark

            # Persist the decision for this document.
            bidding_source_collection.update_one({"_id": item["_id"]}, {"$set": {"region_qa": item_region_qa}})
|
|
|
+
|
|
|
def find_city_info(query):
    """Return the first region-table row whose city or code contains *query*.

    The module-level DataFrame ``df`` is scanned in row order. A row matches
    when *query* is a substring of the string form of its "地市" cell or of
    its "地区代码" cell.

    Args:
        query: Text (or a value convertible with ``str``) to look for.
            Surrounding whitespace is ignored.

    Returns:
        A dict with the keys "省份", "地市", "区县", "父级地区代码" and
        "地区代码" taken from the first matching row, or ``None`` when
        *query* is ``None``, blank, or matches no row.

    NOTE(review): the test is ``query in <cell>``, so a query longer than the
    cell text (for example a full organisation name or address) cannot
    match. Confirm whether the reverse containment was intended.
    """
    if query is None:
        return None

    # Captured text may carry trailing whitespace or a carriage return,
    # which would defeat the substring test.
    needle = str(query).strip()
    if not needle:
        # An empty string is a substring of every cell and would otherwise
        # return the first row of the table.
        return None

    for _, row in df.iterrows():
        if needle in str(row["地市"]) or needle in str(row["地区代码"]):
            return {
                "省份": row["省份"],
                "地市": row["地市"],
                "区县": row["区县"],
                "父级地区代码": row["父级地区代码"],
                "地区代码": row["地区代码"],
            }
    return None
|
|
|
+
|
|
|
# Load the region-code table (XLS). It stays at module level because
# find_city_info reads the module-level ``df``.
xls_file = "D:\\PycharmProjects\\Dataquality\\Dataquality\\dataquality\\area.xls"
df = pd.read_excel(xls_file)

# Run the job only when this file is executed as a script, so that importing
# the module (for example to reuse the extract_* helpers) does not start
# writing to MongoDB.
if __name__ == "__main__":
    add_region_to_db()
|