Browse Source

项目编号

lizhikun 1 year ago
parent
commit
42ad906750

+ 1 - 1
Dataquality/dataquality/es.py

@@ -2,7 +2,7 @@
 # -*- coding:utf-8 -*-
 # author : liumiaomiao
 #从es库中导出数据到测试环境mongo库
-from BasicMethods.es_tools import esutil
+from Dataquality.BasicMethods.es_tools import esutil
 from pymongo import MongoClient
 def ES_bidding(es_query):
     """

+ 5 - 5
Dataquality/dataquality/export_execl.py

@@ -65,8 +65,8 @@ def run_data_daochu():
     # mongo_list = ["_id", "title", "projectname","projectcode","buyer","s_winner", "bidamount","area","city","district","bidopentime",'href','title_qa','projectname_qa','region_qa','projectcode_qa','bidopentime_qa','budget_qa','bidamount_qa']
     # title_name = ['id', '标题', '项目名称', '项目编号', '采购单位', '中标单位', '中标金额', '省', '市', '县', '开标日期', '原文地址']
     # mongo_list = ["_id", "title", "projectname", "projectcode", "buyer", "s_winner", "bidamount", "area", "city","district", "bidopentime", 'href']
-    title_name = ['id','得分']
-    mongo_list = ["id", 'score']
+    title_name = ['id','项目名称','标题','项目编号','herf','projectcode_qa']
+    mongo_list = ["id", 'projectname', "title", "projectcode", 'href','projectcode_qa']
     data_list_correct = []
     data_list_error =[]
     for item in coll_user.find():
@@ -77,11 +77,11 @@ def run_data_daochu():
             info_temp.append(pankong(mongo_list[i],item))
         # if item["title_qa"] or item["projectname_qa"]or item["region_qa"]or item["projectcode_qa"]or item["bidopentime_qa"]or item["budget_qa"]or item["bidamount_qa"]:
         #是你所有要写入的数据
-        # if  "flag" in item and item["flag"] == 1 :
-        data_list_error.append(info_temp)
+        if item['projectcode_qa']!=[]:
+            data_list_error.append(info_temp)
         # else:
         #     data_list_correct.append(info_temp)
         # print(data_list_error)
-    writeExcel("output_error.xlsx", "统计详情", title_name, data_list_error)
+    writeExcel("projectcode.xlsx", "统计详情", title_name, data_list_error)
     # writeExcel("output_error.xlsx", "统计详情", title_name, data_list_error)
 run_data_daochu()

+ 9 - 14
Dataquality/dataquality/inspect_area.py

@@ -1,10 +1,6 @@
-'''
-区域字段数据质量检查脚本
-编写人:刘苗苗
-'''
 from bson import ObjectId
 from pymongo import MongoClient
-from BasicMethods.quality_tools import qu
+from BasicMethods.area_quality import au
 
 db = MongoClient('192.168.3.206', 27080, unicode_decode_error_handler="ignore").data_quality
 coll_user = db["bidding_20230707"]
@@ -20,39 +16,38 @@ def pankong(key, item):
 correct_count=0
 count = 0
 #find里面 单条数据用法:{"_id":ObjectId("64a8bb45990ffa1883accd78")}
-#for item in coll_user.find({"_id":ObjectId("64a8bb45990ffa1883accd78")})
 for item in coll_user.find().batch_size(1000):
     count += 1
     if count % 1000==0:
         print(count)
     title = pankong("title", item)
     #根据title抽取出来的省份
-    expect_area1 = qu.export_area(title)
+    expect_area1 = au.export_area(title)
     # print(expect_area1)
     # 根据buyer抽取出来的省份
     buyer = pankong("buyer", item)
-    expect_area = qu.export_area(buyer)
+    expect_area = au.export_area(buyer)
     # 根据buyeraddr抽取出来的省份
     buyeraddr = pankong("buyeraddr", item)
-    expect_area4 = qu.export_area(buyeraddr)
+    expect_area4 = au.export_area(buyeraddr)
     # print(expect_area)
     #根据s_winner抽取出来的省份
     s_winner=pankong("s_winner",item)
-    expect_area2 = qu.export_area(s_winner)
+    expect_area2 = au.export_area(s_winner)
     # 根据winneradder抽取出来的省份
     winneraddr = pankong("winneraddr", item)
-    expect_area3 = qu.export_area(winneraddr)
+    expect_area3 = au.export_area(winneraddr)
     # 根据agency抽取出来的省份
     agency = pankong("agency", item)
-    expect_area5 = qu.export_area(agency)
+    expect_area5 = au.export_area(agency)
     # 根据agencyaddr抽取出来的省份
     agencyaddr = pankong("agencyaddr", item)
-    expect_area6 = qu.export_area(agencyaddr)
+    expect_area6 = au.export_area(agencyaddr)
 
     #数据组抽取出来的省份,需要验证的字段
     tmp_area = pankong("area", item)
     #抽取出来的地区,通过函数在转换一遍
-    actual_area = qu.export_area(tmp_area)
+    actual_area = au.export_area(tmp_area)
     flag_buyer = 0
     flag_title = 0
     flag_s_winner = 0

+ 77 - 0
Dataquality/dataquality/inspect_projectcode.py

@@ -0,0 +1,77 @@
+from pymongo import MongoClient
+from bson import ObjectId
+import re
+
+def check_continuous_chinese(s):  # Return True if `s` contains a run of 8 or more consecutive characters in U+4E00..U+9FA5.
+    # NOTE(review): the original comment said "six or more", but the quantifier below is {8,}; confirm which threshold is intended.
+    pattern = r'[\u4e00-\u9fa5]{8,}'
+    result = re.search(pattern, s)  # a match anywhere in `s` counts, not only at the start
+    return bool(result)
+def inspect_projectcode():  # Tag every document in quality.bidding_source with a list of projectcode quality codes, stored in "projectcode_qa".
+    # Connect to MongoDB (host and port are hard-coded; unicode decode errors are set to "ignore")
+    db = MongoClient('192.168.3.71', 29099, unicode_decode_error_handler="ignore").quality
+    coll_user = db["bidding_source"]
+    count = 0
+    # Debug variant for a single document: for item in coll_user.find({"_id":ObjectId("64d466eab44bf08751d7a613")}):
+    for item in coll_user.find().batch_size(1000):
+        count += 1
+        if count % 1000 == 0:
+            print(count)  # progress indicator every 1000 documents
+
+        if "projectcode" not in item:
+            # The projectcode field is missing: mark projectcode_qa as ["0000"]
+            projectcode_qa = ["0000"]
+        # projectcode is present but has no value, while the body text may still contain one
+        elif not item["projectcode"]:
+            # projectcode is falsy: look at the detail field instead
+            if "detail" in item and "项目编号" in item["detail"]:
+                projectcode_qa = ["0101"]  # detail contains the literal "项目编号" ("project number"): mark as 0101
+            else:
+                projectcode_qa = []  # detail is absent or lacks "项目编号": nothing to flag
+        else:
+            projectcode = item["projectcode"]
+            projectcode_qa = []
+
+            # Length check: more than 40 characters -> 0103, 4 or fewer -> 0102. NOTE(review): len() assumes projectcode is a string — confirm against the collection schema.
+            if len(projectcode) > 40:
+                projectcode_qa.append("0103")
+            elif len(projectcode) <= 4:
+                projectcode_qa.append("0102")
+
+            def is_valid_date_format(s):  # NOTE(review): this helper is re-created each time this branch runs inside the loop
+                # Regex for an 8-digit string; it checks digits only, not that they form a real calendar date
+                date_format_regex = r'^(\d{8})$'
+
+                return re.match(date_format_regex, s) is not None
+
+            # Flag 0201 when projectcode is exactly 8 digits, i.e. it looks like a bare date (presumably YYYYMMDD) instead of a code
+            if len(projectcode) == 8 and is_valid_date_format(projectcode):
+                projectcode_qa.append("0201")
+
+            # Flag 0202 when projectcode contains any of the listed keywords ("null" or one of the Chinese terms in the pattern)
+            codeUnConReg = re.compile(r"(null|勘察|测试|设计|设备|标段|监理|范围|分包|月|日)")
+            if codeUnConReg.search(projectcode):
+                projectcode_qa.append("0202")
+
+            # Flag 0203 when no character is alphanumeric. NOTE(review): str.isalnum() is also True for CJK characters, so an all-Chinese projectcode is not flagged here — confirm that is intended.
+            if not any(char.isalnum() for char in projectcode):
+                projectcode_qa.append("0203")
+
+            # Flag 0301 when at least half of the characters are Chinese
+            chinese_chars = [char for char in projectcode if '\u4e00' <= char <= '\u9fff']  # characters in U+4E00..U+9FFF (check_continuous_chinese uses the narrower U+4E00..U+9FA5)
+            chinese_chars_ratio = len(chinese_chars) / len(projectcode)  # a string projectcode is non-empty here because the falsy case was handled above
+            if chinese_chars_ratio >= 0.5 :
+                projectcode_qa.append("0301")
+            # print(f"Project Code: {projectcode}")
+
+            # Flag 0302 when check_continuous_chinese() finds a long run of consecutive Chinese characters (its pattern requires 8 or more, although the original comment here said "more than six")
+            if check_continuous_chinese(projectcode):
+                projectcode_qa.append("0302")
+
+        print(item['_id'], projectcode_qa)
+        coll_user.update_one({"_id": item["_id"]}, {"$set": {"projectcode_qa": projectcode_qa}})  # write the result back onto the same document
+        print(f"Updating document with ID: {item['_id']} with projectcode_qa: {projectcode_qa}")  # NOTE(review): printed after the update has already been issued
+
+
+# Run the inspection. NOTE(review): this call is unguarded, so it also runs whenever the module is imported.
+inspect_projectcode()