123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132 |
- import re
- from docs.config import general_config
- from util.sensitive_word import AcAutomation
- import csv
- from docs.config import abnormal_config
- from tables import CatchContentObject, fsc
class ProjectcodeChecker(object):
    """Anomaly checks for the project-code field.

    ``errors_tables`` maps a four-character error code to a descriptor dict
    with the keys ``name``, ``parent_name``, ``parent_code`` and ``checkFn``
    (the bound check method).  Every check returns ``True`` when the value
    is considered abnormal.
    """

    # Attachment texts longer than this many characters are skipped
    # (provisional limit carried over from the original implementation).
    MAX_ATTACH_TEXT_LEN = 300000

    # Patterns are compiled once at class creation instead of on every call.
    _DATE_FORMAT_RE = re.compile(r'^\d{4}/\d{2}/\d{2}$')
    _ABNORMAL_KEYWORD_RE = re.compile(r"(null|勘察|测试|设计|监理|范围|分包|日)")
    _CJK_RUN_RE = re.compile(r'[\u4e00-\u9fa5]{9,}')

    def __init__(self):
        # Keyword automaton used by check0101; built lazily on first use so
        # that constructing a checker performs no file I/O.
        self.check_projectcode_ac = None
        # NOTE(review): the parent_code values do not line up with the
        # parent_name groups (e.g. "0102" and "0103" share a parent_name but
        # carry different parent_code values) -- left untouched because
        # consumers may depend on them; confirm against the caller.
        self.errors_tables = {
            "0101": {
                "name": "项目编号字段无值但是正文疑似有值",
                "parent_name": "数值检测",
                "parent_code": "01",
                "checkFn": self.check0101
            },
            "0102": {
                "name": "长度大于2小于等于4",
                "parent_name": "长度类型",
                "parent_code": "02",
                "checkFn": self.check0102
            },
            "0103": {
                "name": "长度大于50",
                "parent_name": "长度类型",
                "parent_code": "03",
                "checkFn": self.check0103
            },
            "0201": {
                "name": "检查日期格式",
                "parent_name": "日期格式",
                "parent_code": "01",
                "checkFn": self.check0201
            },
            "0202": {
                "name": "包含异常关键字",
                "parent_name": "异常关键字",
                "parent_code": "02",
                "checkFn": self.check0202
            },
            "0203": {
                "name": "不包含数字字母",
                "parent_name": "不包含数字字母",
                "parent_code": "03",
                "checkFn": self.check0203
            },
            "0301": {
                "name": "汉字占比>60%且不包含中国电信",
                "parent_name": "汉字占比",
                "parent_code": "01",
                "checkFn": self.check0301
            },
            "0302": {
                "name": "连续汉字超过9个",
                "parent_name": "汉字占比",
                "parent_code": "03",
                "checkFn": self.check0302
            }
        }

    def _get_projectcode_ac(self):
        """Return the keyword automaton, loading it from the CSV on first use.

        The first column of every non-empty row of the configured CSV file
        is added as a keyword.  The automaton is cached on the instance so
        the file is read once per checker rather than once per call.
        """
        automaton = getattr(self, "check_projectcode_ac", None)
        if automaton is None:
            automaton = AcAutomation()
            # NOTE(review): the file is opened with the platform default
            # encoding, exactly as before -- confirm the CSV's encoding.
            with open(abnormal_config["table_field_config"]["path4"], "r", newline="") as f:
                for row in csv.reader(f):
                    # Blank lines come back as empty rows; indexing them
                    # would raise IndexError.
                    if row:
                        automaton.add_word(row[0])
            self.check_projectcode_ac = automaton
        return automaton

    @staticmethod
    def _text_has_keyword(automaton, catch_content, text, platform, document_id) -> bool:
        """Extract segments from *text* and search them for any keyword."""
        # The original comment states the extractor returns a dict; joining
        # a dict joins its keys -- TODO confirm the actual return type.
        contents = catch_content.public_attachment_catch(
            text, platform=platform, document_id=document_id)
        return bool(automaton.search("\n".join(contents)))

    def check0101(self, projectcode, detail, catch_content, attach_text) -> bool:
        """Flag an empty project code when the document seems to contain one.

        :param projectcode: value of the project-code field.
        :param detail: announcement body, passed to the extractor with
            ``platform="html"``.
        :param catch_content: object exposing ``public_attachment_catch``.
        :param attach_text: nested mapping of attachments; each innermost
            value is a dict that may carry an ``attach_url`` entry.
        :return: ``True`` (abnormal) when the field is empty and a keyword is
            found in the body or in any downloadable attachment.
        """
        # Only an empty field can be abnormal here; return before doing any
        # file or network I/O.
        if projectcode != "":
            return False

        automaton = self._get_projectcode_ac()
        if self._text_has_keyword(automaton, catch_content, detail, "html", "公告"):
            return True

        for attach_content in attach_text.values():
            if not attach_content:
                continue
            for topic_detail in attach_content.values():
                # Storage address of the attachment.
                attach_url = topic_detail.get("attach_url", "")
                if not attach_url:
                    continue
                # Fetch the attachment text; `st` is the success flag.
                st, content = fsc.download_text_content(attach_url)
                if not (st and content.strip()):
                    continue
                # Over-long texts are not processed.
                if len(content) > self.MAX_ATTACH_TEXT_LEN:
                    continue
                if self._text_has_keyword(automaton, catch_content, content,
                                          "attach", attach_url):
                    return True
        return False

    def check0102(self, projectcode: str) -> bool:
        """Return True when the length is greater than 2 and at most 4."""
        return 2 < len(projectcode) <= 4

    @staticmethod
    def check0103(projectcode: str) -> bool:
        """Return True when the length is greater than 50."""
        return len(projectcode) > 50

    def check0201(self, projectcode: str) -> bool:
        """Return True when the value looks like a ``dddd/dd/dd`` date."""
        return self._DATE_FORMAT_RE.match(projectcode) is not None

    def check0202(self, projectcode: str) -> bool:
        """Return True when the value contains one of the abnormal keywords."""
        return bool(self._ABNORMAL_KEYWORD_RE.search(projectcode))

    def check0203(self, projectcode: str) -> bool:
        """Return True when no character of the value is alphanumeric.

        NOTE(review): ``str.isalnum`` is also true for CJK characters, so a
        value made only of Chinese characters is NOT flagged -- confirm this
        is the intended meaning of "no digits or letters".
        """
        return not any(char.isalnum() for char in projectcode)

    def check0301(self, projectcode: str) -> bool:
        """Return True when over 60% of the characters are Chinese and the
        value does not contain "中国电信".

        An empty value has no characters to measure and is reported as
        normal (previously this raised ZeroDivisionError).
        """
        if not projectcode:
            return False
        chinese_count = sum(1 for char in projectcode if '\u4e00' <= char <= '\u9fff')
        chinese_chars_ratio = chinese_count / len(projectcode)
        return chinese_chars_ratio > 0.6 and "中国电信" not in projectcode

    def check0302(self, projectcode: str) -> bool:
        """Return True when the value has a run of 9 or more Chinese characters.

        NOTE(review): this uses the range U+4E00-U+9FA5 while check0301 uses
        U+4E00-U+9FFF -- kept as-is; confirm whether they should agree.
        """
        return self._CJK_RUN_RE.search(projectcode) is not None
|