Răsfoiți Sursa

区域字段+正文检索

lizhikun 1 an în urmă
părinte
comite
413a388e21
2 a modificat fișierele cu 142 adăugiri și 0 ștergeri
  1. BIN
      Dataquality/dataquality/output_error.xlsx
  2. 142 0
      Dataquality/inspect_region.py

BIN
Dataquality/dataquality/output_error.xlsx


+ 142 - 0
Dataquality/inspect_region.py

@@ -0,0 +1,142 @@
+from pymongo import MongoClient
+import re
+import pandas as pd
+from bson.objectid import ObjectId
+
# Pre-compiled patterns that pull the purchaser's address out of an announcement body.
#
# Both patterns expect the address to read, in order:
#   ...省, then ...市 or ...自治区, then ...市 / ...自治州 / ...地区,
#   then ...县 / ...区 / ...市辖区, and to end at the first 镇, 乡, 街 or 道 character
#   (the final bracket is a character class, so any one of those four characters ends it).
# Group 1 of ``address_pattern`` is the address text that follows the heading.
address_pattern = re.compile(
    r"采购单位地址:(.*?省(.*?市|.*?自治区)(.*?市|.*?自治州|.*?地区)(.*?[县区]|.*?市辖区)(.*?[镇乡街道]))")
# Group 1 of ``buyer_info_pattern`` starts at the "采购单位地址:" heading itself.
# The "<>" placeholder after "采购人信息:" is optional: callers strip "<...>" tags
# (which also deletes a bare "<>") before searching, so a mandatory "<>" could
# never match on stripped text.
buyer_info_pattern = re.compile(
    r"采购人信息:(?:<>)?(采购单位地址:.*?省(.*?市|.*?自治区)(.*?市|.*?自治州|.*?地区)(.*?[县区]|.*?市辖区)(.*?[镇乡街道]))")
+
def extract_address(detail):
    """Return the purchaser address found in *detail*, or ``None``.

    The address is group 1 of the module-level ``address_pattern``, i.e. the
    text that follows the "采购单位地址:" heading.

    Args:
        detail: Announcement text to search. ``None`` or an empty string is
            accepted and yields ``None``.

    Returns:
        The matched address string, or ``None`` when nothing matches.
    """
    if not detail:
        # A missing/null detail field cannot contain an address; returning
        # early also avoids the TypeError that ``search(None)`` would raise.
        return None

    found = address_pattern.search(detail)
    return found.group(1) if found else None
+
def extract_buyer_info(detail):
    """Return the address line of the purchaser-information section, or ``None``.

    The result is group 1 of the module-level ``buyer_info_pattern``; it starts
    at the "采购单位地址:" heading that follows "采购人信息:".

    Args:
        detail: Announcement text to search. ``None`` or an empty string is
            accepted and yields ``None``.

    Returns:
        The matched text, or ``None`` when nothing matches.
    """
    if not detail:
        # Nothing to search; also avoids ``search(None)`` raising TypeError.
        return None

    found = buyer_info_pattern.search(detail)
    return found.group(1) if found else None
+
# Compiled once at import time instead of being rebuilt on every call.
# Group 1 is the first segment after "省" that ends in "市" or "自治区"; the two
# trailing groups only require that a prefecture-level and a county-level
# segment follow it.
_CITY_IN_ADDRESS_PATTERN = re.compile(
    r"省(.*?市|.*?自治区)(.*?市|.*?自治州|.*?地区)(.*?[县区]|.*?市辖区)")


def extract_city_from_address(address):
    """Return the city segment of *address*, or ``None``.

    Args:
        address: Address text such as the value returned by
            ``extract_address``. ``None`` or an empty string yields ``None``.

    Returns:
        The segment that directly follows "省" and ends in "市" or "自治区",
        or ``None`` when the address does not have the expected shape.
    """
    if not address:
        # Guard against None/"" so callers need not pre-check the value.
        return None

    found = _CITY_IN_ADDRESS_PATTERN.search(address)
    return found.group(1) if found else None
+
def process_detail(detail):
    """Extract the city named in an announcement body, from two places.

    Args:
        detail: Raw announcement text, possibly containing "<...>" tags.
            ``None`` or an empty string is accepted.

    Returns:
        A ``(city_from_address, city_from_buyer_info)`` tuple. Each element is
        the city taken from the purchaser address / purchaser-information
        section respectively, or ``None`` when that section was not found.
    """
    if not detail:
        # Null/empty detail: nothing to extract (and re.sub would reject None).
        return None, None

    # Drop "<...>" tags before looking for the plain address heading.
    stripped = re.sub(r"<.*?>", "", detail)

    address = extract_address(stripped)
    city_from_address = extract_city_from_address(address) if address else None

    # Tag stripping also deletes any literal "<>" placeholder, which the
    # buyer-info pattern may anchor on, so the raw text is tried first and the
    # stripped text second.
    buyer_info = extract_buyer_info(detail) or extract_buyer_info(stripped)
    city_from_buyer_info = (
        extract_city_from_address(buyer_info) if buyer_info else None
    )

    return city_from_address, city_from_buyer_info
+
# Field extractors, compiled once rather than on every document.
_UNIT_NAME_PATTERN = re.compile(r"采购单位名称:([\s\S]+?)\n")
_UNIT_ADDRESS_PATTERN = re.compile(r"采购单位地址:([\s\S]+?)\n")
_DISTRICT_CODE_PATTERN = re.compile(r"项目所在行政区划编码:(\d+)")


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ``None``."""
    found = pattern.search(text)
    return found.group(1) if found else None


def _classify_region(item):
    """Return the ``region_qa`` code to store for *item*, or ``None`` to leave it as is."""
    # ``detail`` may be absent or null; treat both as empty text.
    detail = re.sub(r"<.*?>", "", item.get("detail") or "")

    purchase_unit_name = _first_group(_UNIT_NAME_PATTERN, detail)
    purchase_unit_address = _first_group(_UNIT_ADDRESS_PATTERN, detail)
    district_code = _first_group(_DISTRICT_CODE_PATTERN, detail)

    if item.get("area") == "全国":
        print("区域为全国,region_qa标记为0101")
        return "0101"

    if district_code:
        # The announcement carries a district code: keep whatever mark the
        # document already has, which means there is nothing to write.
        print("district不为空,保持原有标记")
        return None

    # Try the purchasing unit's name first, then its address.
    city_info = None
    if purchase_unit_name:
        city_info = find_city_info(purchase_unit_name)
    if city_info is None and purchase_unit_address:
        city_info = find_city_info(purchase_unit_address)

    if not city_info:
        print("未找到匹配的城市信息,region_qa标记为0201")
        return "0201"

    # ``.get`` so a document without area/city counts as a mismatch instead of
    # raising KeyError.
    if city_info["省份"] == item.get("area") and city_info["地市"] == item.get("city"):
        print("省份与区域一致,地市与城市一致,不做处理")
        return None

    print("省份或地市与区域不一致,region_qa标记为0201")
    return "0201"


def add_region_to_db(max_items=10000):
    """Check the region fields of ``bidding_source`` documents and mark mismatches.

    Iterates the ``quality.bidding_source`` collection and sets ``region_qa``:

    * ``"0101"`` when the document's ``area`` is "全国";
    * ``"0201"`` when no district code is present in the detail text and the
      region looked up from the purchasing unit's name/address is missing or
      disagrees with the document's ``area``/``city``;
    * unchanged otherwise (district code present, or province and city agree).

    Documents whose mark would not change are not written. Previously the
    "agree" branch left the mark variable unset, so the first such document
    raised ``UnboundLocalError`` and later ones were stamped with the previous
    document's mark.

    Args:
        max_items: Maximum number of documents to process; ``None`` removes
            the cap. The cap is inclusive (the original loop stopped one
            document short of 10000).
    """
    # The context manager closes the connection when the run ends.
    with MongoClient('192.168.3.71', 29099, unicode_decode_error_handler="ignore") as client:
        bidding_source_collection = client.quality["bidding_source"]

        # The cursor fetches documents from the server 1000 at a time.
        cursor = bidding_source_collection.find().batch_size(1000)
        for count, item in enumerate(cursor, start=1):
            if max_items is not None and count > max_items:
                break

            # Progress indicator.
            if count % 1000 == 0:
                print(count)

            region_qa = _classify_region(item)
            if region_qa is None or region_qa == item.get("region_qa"):
                continue  # nothing to change for this document

            bidding_source_collection.update_one(
                {"_id": item["_id"]}, {"$set": {"region_qa": region_qa}}
            )
+
def find_city_info(query, table=None):
    """Look up the region-table row that corresponds to *query*.

    A row matches when *query* is contained in the row's "地市" value or in its
    "地区代码" value (the original rule, e.g. a short city name or a code
    fragment). Because callers pass whole unit names and addresses, a row also
    matches when its "地市" value is contained in *query*; such a row is only
    used if no row matches by the original rule, so earlier results are kept.

    Args:
        query: Text to look up. ``None``, empty or whitespace-only values
            return ``None`` (an empty string would otherwise match every row).
        table: DataFrame with the columns "省份", "地市", "区县", "父级地区代码"
            and "地区代码". Defaults to the module-level ``df``.

    Returns:
        A dict with those five keys taken from the matching row, or ``None``.
    """
    if not query:
        return None
    needle = str(query).strip()
    if not needle:
        return None

    if table is None:
        table = df  # module-level lookup table

    fields = ("省份", "地市", "区县", "父级地区代码", "地区代码")
    fallback = None
    for _, row in table.iterrows():
        city = str(row["地市"])
        if needle in city or needle in str(row["地区代码"]):
            return {field: row[field] for field in fields}
        # Remember the first row whose city name occurs inside the query;
        # skip missing cells, whose string form would be "nan".
        if fallback is None and pd.notna(row["地市"]) and city and city in needle:
            fallback = row

    if fallback is None:
        return None
    return {field: fallback[field] for field in fields}
+
# Region-code lookup table (XLS workbook); find_city_info reads it through the
# module-level ``df``.
xls_file = "D:\\PycharmProjects\\Dataquality\\Dataquality\\dataquality\\area.xls"
df = pd.read_excel(io=xls_file)

# Run the region check once the table is loaded.
add_region_to_db()