|
@@ -3,8 +3,8 @@ from bson import ObjectId
|
|
|
import re
|
|
|
|
|
|
def check_continuous_chinese(s):
|
|
|
- # 匹配连续出现六个或更多汉字的情况
|
|
|
- pattern = r'[\u4e00-\u9fa5]{8,}'
|
|
|
+ # 匹配连续出现9个或更多汉字的情况
|
|
|
+ pattern = r'[\u4e00-\u9fa5]{9,}'
|
|
|
result = re.search(pattern, s)
|
|
|
return bool(result)
|
|
|
def inspect_projectcode():
|
|
@@ -33,23 +33,24 @@ def inspect_projectcode():
|
|
|
projectcode_qa = []
|
|
|
|
|
|
# 检查projectcode长度
|
|
|
- if len(projectcode) > 40:
|
|
|
+ if len(projectcode) > 50:
|
|
|
projectcode_qa.append("0103")
|
|
|
- elif len(projectcode) <= 4:
|
|
|
+ elif len(projectcode) <= 4 and len(projectcode) > 2:
|
|
|
projectcode_qa.append("0102")
|
|
|
|
|
|
def is_valid_date_format(s):
|
|
|
- # 使用正则表达式匹配8位数字字符串
|
|
|
- date_format_regex = r'^(\d{8})$'
|
|
|
-
|
|
|
+ # 使用正则表达式匹配日期格式 YYYY/MM/DD(只校验数字位数与分隔符,不校验日期是否真实存在)
|
|
|
+ date_format_regex = r'^\d{4}/\d{2}/\d{2}$'
|
|
|
return re.match(date_format_regex, s) is not None
|
|
|
|
|
|
+ # 以下各项检查针对 projectcode 逐项执行,命中的问题编码会追加到 projectcode_qa
|
|
|
+
|
|
|
# 检查日期格式是否正确
|
|
|
- if len(projectcode) == 8 and is_valid_date_format(projectcode):
|
|
|
+ if is_valid_date_format(projectcode):
|
|
|
projectcode_qa.append("0201")
|
|
|
|
|
|
# 检查是否包含特殊字符
|
|
|
- codeUnConReg = re.compile(r"(null|勘察|测试|设计|设备|标段|监理|范围|分包|月|日)")
|
|
|
+ codeUnConReg = re.compile(r"(null|勘察|测试|设计|监理|范围|分包|日)")
|
|
|
if codeUnConReg.search(projectcode):
|
|
|
projectcode_qa.append("0202")
|
|
|
|
|
@@ -60,13 +61,14 @@ def inspect_projectcode():
|
|
|
# 检查汉字占比
|
|
|
chinese_chars = [char for char in projectcode if '\u4e00' <= char <= '\u9fff'] # 匹配汉字
|
|
|
chinese_chars_ratio = len(chinese_chars) / len(projectcode)
|
|
|
- if chinese_chars_ratio >= 0.5 :
|
|
|
+ if chinese_chars_ratio > 0.6 and "中国电信" not in item['projectcode']:
|
|
|
projectcode_qa.append("0301")
|
|
|
# print(f"Project Code: {projectcode}")
|
|
|
|
|
|
- # 检查汉字连续出现超过六个
|
|
|
- if check_continuous_chinese(projectcode):
|
|
|
- projectcode_qa.append("0302")
|
|
|
+ # 检查汉字连续出现9个或以上(包含"中国电信"、"政府采购项目备案书"、"竞争性谈判"的除外)
|
|
|
+ if check_continuous_chinese(projectcode) and not any(
|
|
|
+ substring in projectcode for substring in ["中国电信", "政府采购项目备案书", "竞争性谈判"]):
|
|
|
+ projectcode_qa.append("0303")
|
|
|
|
|
|
print(item['_id'], projectcode_qa)
|
|
|
coll_user.update_one({"_id": item["_id"]}, {"$set": {"projectcode_qa": projectcode_qa}})
|