Pārlūkot izejas kodu

项目编号优化

lizhikun 1 gadu atpakaļ
vecāks
revīzija
c61169d631
1 mainīts fails ar 15 papildinājumiem un 13 dzēšanām
  1. 15 13
      Dataquality/dataquality/inspect_projectcode.py

+ 15 - 13
Dataquality/dataquality/inspect_projectcode.py

@@ -3,8 +3,8 @@ from bson import ObjectId
 import re
 
 def check_continuous_chinese(s):
-    # 匹配连续出现个或更多汉字的情况
-    pattern = r'[\u4e00-\u9fa5]{8,}'
+    # 匹配连续出现9个或更多汉字的情况
+    pattern = r'[\u4e00-\u9fa5]{9,}'
     result = re.search(pattern, s)
     return bool(result)
 def inspect_projectcode():
@@ -33,23 +33,24 @@ def inspect_projectcode():
             projectcode_qa = []
 
             # 检查projectcode长度
-            if len(projectcode) > 40:
+            if len(projectcode) > 50:
                 projectcode_qa.append("0103")
-            elif len(projectcode) <= 4:
+            elif len(projectcode) <= 4 and len(projectcode) > 2:
                 projectcode_qa.append("0102")
 
             def is_valid_date_format(s):
-                # 使用正则表达式匹配8位数字字符串
-                date_format_regex = r'^(\d{8})$'
-
+                # 使用正则表达式匹配日期格式 XXXX/xx/xx
+                date_format_regex = r'^\d{4}/\d{2}/\d{2}$'
                 return re.match(date_format_regex, s) is not None
 
+            # 在你的检查函数中,添加以下代码:
+
             # 检查日期格式是否正确
-            if len(projectcode) == 8 and is_valid_date_format(projectcode):
+            if is_valid_date_format(projectcode):
                 projectcode_qa.append("0201")
 
             # 检查是否包含特殊字符
-            codeUnConReg = re.compile(r"(null|勘察|测试|设计|设备|标段|监理|范围|分包|月|日)")
+            codeUnConReg = re.compile(r"(null|勘察|测试|设计|监理|范围|分包|日)")
             if codeUnConReg.search(projectcode):
                 projectcode_qa.append("0202")
 
@@ -60,13 +61,14 @@ def inspect_projectcode():
             # 检查汉字占比
             chinese_chars = [char for char in projectcode if '\u4e00' <= char <= '\u9fff']  # 匹配汉字
             chinese_chars_ratio = len(chinese_chars) / len(projectcode)
-            if chinese_chars_ratio >= 0.5 :
+            if chinese_chars_ratio > 0.6 and "中国电信" not in item['projectcode']:
                 projectcode_qa.append("0301")
             # print(f"Project Code: {projectcode}")
 
-            # 检查汉字连续出现超过六个
-            if check_continuous_chinese(projectcode):
-                projectcode_qa.append("0302")
+            # 检查汉字连续出现9个或更多
+            if check_continuous_chinese(projectcode) and not any(
+                    substring in projectcode for substring in ["中国电信", "政府采购项目备案书", "竞争性谈判"]):
+                projectcode_qa.append("0303")
 
         print(item['_id'], projectcode_qa)
         coll_user.update_one({"_id": item["_id"]}, {"$set": {"projectcode_qa": projectcode_qa}})