import csv
import re

from docs.config import abnormal_config
from docs.config import general_config
from tables import CatchContentObject, fsc
from util.sensitive_word import AcAutomation


class ProjectcodeChecker(object):
    """Anomaly checks for the project-code field.

    ``errors_tables`` maps each four-digit error code to its display name,
    its parent category and the bound method that performs the check.
    Every ``checkXXXX`` method returns ``True`` when the value is abnormal.
    """

    # Attachment texts longer than this many characters are not scanned
    # (provisional limit carried over from the original implementation).
    MAX_ATTACH_TEXT_LENGTH = 300000

    # Patterns are compiled once at class creation instead of on every call.
    _DATE_FORMAT_RE = re.compile(r'^\d{4}/\d{2}/\d{2}$')
    _ABNORMAL_KEYWORD_RE = re.compile(r"(null|勘察|测试|设计|监理|范围|分包|日)")
    # NOTE(review): the table entry for 0302 reads "more than 9 consecutive
    # Chinese characters" while {9,} also matches exactly 9 -- confirm which
    # one is intended.
    # NOTE(review): this range ends at \u9fa5 whereas check0301 uses \u9fff --
    # confirm whether the two checks should share one range.
    _CHINESE_RUN_RE = re.compile(r'[\u4e00-\u9fa5]{9,}')
    _ASCII_ALNUM_RE = re.compile(r'[0-9A-Za-z]')

    def __init__(self):
        # Keyword matcher used by check0101; built lazily on first use.
        self.check_projectcode_ac = None
        # NOTE(review): "parent_code" follows the last two digits of the key
        # for every entry except "0302" (which carries "03") -- confirm.
        self.errors_tables = {
            "0101": {
                "name": "项目编号字段无值但是正文疑似有值",
                "parent_name": "数值检测",
                "parent_code": "01",
                "checkFn": self.check0101
            },
            "0102": {
                "name": "长度大于2小于等于4",
                "parent_name": "长度类型",
                "parent_code": "02",
                "checkFn": self.check0102
            },
            "0103": {
                "name": "长度大于50",
                "parent_name": "长度类型",
                "parent_code": "03",
                "checkFn": self.check0103
            },
            "0201": {
                "name": "检查日期格式",
                "parent_name": "日期格式",
                "parent_code": "01",
                "checkFn": self.check0201
            },
            "0202": {
                "name": "包含异常关键字",
                "parent_name": "异常关键字",
                "parent_code": "02",
                "checkFn": self.check0202
            },
            "0203": {
                "name": "不包含数字字母",
                "parent_name": "不包含数字字母",
                "parent_code": "03",
                "checkFn": self.check0203
            },
            "0301": {
                "name": "汉字占比>60%且不包含中国电信",
                "parent_name": "汉字占比",
                "parent_code": "01",
                "checkFn": self.check0301
            },
            "0302": {
                "name": "连续汉字超过9个",
                "parent_name": "汉字占比",
                "parent_code": "03",
                "checkFn": self.check0302
            }
        }

    def _load_projectcode_ac(self):
        """Return the keyword matcher, building it from the CSV on first use.

        The first column of every non-empty row of the file configured at
        ``abnormal_config["table_field_config"]["path4"]`` is added as a word.
        The matcher is cached on ``self.check_projectcode_ac`` so the file is
        read once per checker instance rather than once per checked record.
        """
        matcher = getattr(self, "check_projectcode_ac", None)
        if matcher is None:
            matcher = AcAutomation()
            with open(abnormal_config["table_field_config"]["path4"], "r") as f:
                for row in csv.reader(f):
                    # csv.reader yields [] for blank lines; skip them instead
                    # of failing on row[0].
                    if row:
                        matcher.add_word(row[0])
            self.check_projectcode_ac = matcher
        return matcher

    def check0101(self, projectcode, detail, catch_content, attach_text) -> bool:
        """Flag an empty project code when the text appears to contain one.

        :param projectcode: extracted project-code value.
        :param detail: announcement body passed to
            ``catch_content.public_attachment_catch`` with ``platform="html"``.
        :param catch_content: object exposing ``public_attachment_catch``.
        :param attach_text: mapping whose values are mappings of attachment
            entries; each entry may carry an ``"attach_url"``.
        :return: ``True`` means abnormal -- the field is empty but a keyword
            was found in the body or in one of the attachments.
        """
        matcher = self._load_projectcode_ac()

        # NOTE(review): the attachment scan below is treated as part of the
        # empty-field branch, in line with the check's name -- confirm.
        if projectcode != "":
            return False

        # The original comment states that the return value is a dict; it is
        # joined into a single string for matching.
        contents = catch_content.public_attachment_catch(
            detail, platform="html", document_id="公告")
        if matcher.search("\n".join(contents)):
            return True

        for attach_content in attach_text.values():
            if not attach_content:
                continue
            for topic_detail in attach_content.values():
                # Storage (OSS) address of the attachment.
                attach_url = topic_detail.get("attach_url", "")
                if not attach_url:
                    continue
                # Fetch the attachment text; `st` is the success flag.
                st, content = fsc.download_text_content(attach_url)
                if not (st and content.strip()):
                    continue
                # Over-long texts are not processed.
                if len(content) > self.MAX_ATTACH_TEXT_LENGTH:
                    continue
                contents = catch_content.public_attachment_catch(
                    content, platform="attach", document_id=attach_url)
                if matcher.search("\n".join(contents)):
                    return True
        return False

    def check0102(self, projectcode: str) -> bool:
        """Return True when the length is greater than 2 and at most 4."""
        return 2 < len(projectcode) <= 4

    @staticmethod
    def check0103(projectcode: str) -> bool:
        """Return True when the length is greater than 50."""
        return len(projectcode) > 50

    def check0201(self, projectcode: str) -> bool:
        """Return True when the value looks like a ``YYYY/MM/DD`` date."""
        return self._DATE_FORMAT_RE.match(projectcode) is not None

    def check0202(self, projectcode: str) -> bool:
        """Return True when the value contains one of the abnormal keywords."""
        return bool(self._ABNORMAL_KEYWORD_RE.search(projectcode))

    def check0203(self, projectcode: str) -> bool:
        """Return True when the value contains no ASCII digit or letter.

        ``str.isalnum()`` is also true for Chinese characters, so it cannot
        be used here: a code consisting only of Chinese text would never be
        reported as lacking digits and letters.
        """
        return self._ASCII_ALNUM_RE.search(projectcode) is None

    def check0301(self, projectcode: str) -> bool:
        """Return True when over 60% of the characters are Chinese and the
        value does not contain "中国电信".

        An empty value has no characters to measure and is reported as
        normal instead of dividing by zero.
        """
        if not projectcode:
            return False
        chinese_count = sum(
            1 for char in projectcode if '\u4e00' <= char <= '\u9fff')
        ratio = chinese_count / len(projectcode)
        return ratio > 0.6 and "中国电信" not in projectcode

    def check0302(self, projectcode: str) -> bool:
        """Return True when the value contains a run of 9 or more
        consecutive Chinese characters."""
        return self._CHINESE_RUN_RE.search(projectcode) is not None