QA
/
data_quality_server


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
							import re

from docs.config import general_config
from util.sensitive_word import AcAutomation
import csv
from docs.config import abnormal_config
from tables import CatchContentObject, fsc

class ProjectcodeChecker(object):
    """Quality checks for the project-code (`projectcode`) field.

    Every check returns ``True`` when the value looks abnormal. The checks
    are registered in ``self.errors_tables`` under a four-character code,
    together with a display name, a parent category name/code and the
    bound check function (``checkFn``).
    """

    # Attachments longer than this many characters are skipped by check0101.
    _MAX_ATTACH_CHARS = 300000

    # Patterns are compiled once at class creation instead of on every call.
    _DATE_FORMAT_RE = re.compile(r'^\d{4}/\d{2}/\d{2}$')
    _ABNORMAL_KEYWORD_RE = re.compile(r"(null|勘察|测试|设计|监理|范围|分包|日)")
    _LONG_CHINESE_RUN_RE = re.compile(r'[\u4e00-\u9fa5]{9,}')

    def __init__(self):
        # Registry of checks keyed by error code.
        # NOTE(review): the parent_code values do not follow an obvious
        # scheme relative to the keys (e.g. "0302" -> "03") -- confirm with
        # whoever consumes this table before relying on them.
        self.errors_tables = {
            "0101": {
                "name": "项目编号字段无值但是正文疑似有值",
                "parent_name": "数值检测",
                "parent_code": "01",
                "checkFn": self.check0101,
            },
            "0102": {
                "name": "长度大于2小于等于4",
                "parent_name": "长度类型",
                "parent_code": "02",
                "checkFn": self.check0102,
            },
            "0103": {
                "name": "长度大于50",
                "parent_name": "长度类型",
                "parent_code": "03",
                "checkFn": self.check0103,
            },
            "0201": {
                "name": "检查日期格式",
                "parent_name": "日期格式",
                "parent_code": "01",
                "checkFn": self.check0201,
            },
            "0202": {
                "name": "包含异常关键字",
                "parent_name": "异常关键字",
                "parent_code": "02",
                "checkFn": self.check0202,
            },
            "0203": {
                "name": "不包含数字字母",
                "parent_name": "不包含数字字母",
                "parent_code": "03",
                "checkFn": self.check0203,
            },
            "0301": {
                "name": "汉字占比>60%且不包含中国电信",
                "parent_name": "汉字占比",
                "parent_code": "01",
                "checkFn": self.check0301,
            },
            "0302": {
                "name": "连续汉字超过9个",
                "parent_name": "汉字占比",
                "parent_code": "03",
                "checkFn": self.check0302,
            },
        }

    def _get_projectcode_matcher(self):
        """Return the keyword matcher, building it from the CSV on first use.

        The matcher is an ``AcAutomation`` filled with the first column of
        the CSV file at ``abnormal_config["table_field_config"]["path4"]``.
        It is cached on ``self.check_projectcode_ac`` so the file is read
        once per instance rather than once per checked record.
        """
        matcher = getattr(self, "check_projectcode_ac", None)
        if matcher is None:
            matcher = AcAutomation()
            # newline="" is what the csv module recommends for reader input.
            # NOTE(review): the file is opened with the platform default
            # encoding, as before -- confirm the word list's real encoding.
            with open(abnormal_config["table_field_config"]["path4"], "r", newline="") as f:
                for row in csv.reader(f):
                    # csv.reader yields [] for blank lines; skip them
                    # instead of failing on row[0].
                    if row:
                        matcher.add_word(row[0])
            self.check_projectcode_ac = matcher
        return matcher

    @staticmethod
    def _text_mentions_projectcode(matcher, catch_content, text, platform, document_id) -> bool:
        """Extract fragments from *text* and report whether the matcher hits.

        ``catch_content.public_attachment_catch`` is called with the given
        ``platform`` / ``document_id``; its result is joined with newlines
        and searched. NOTE(review): the original comment says the result is
        a dict, in which case only its keys are joined -- confirm.
        """
        fragments = catch_content.public_attachment_catch(
            text, platform=platform, document_id=document_id
        )
        return bool(matcher.search("\n".join(fragments)))

    def check0101(self, projectcode: str, detail, catch_content, attach_text) -> bool:
        """Flag an empty project code when the content seems to contain one.

        :param projectcode: extracted project-code field value.
        :param detail: announcement body passed to
            ``catch_content.public_attachment_catch`` with platform "html".
        :param catch_content: object providing ``public_attachment_catch``.
        :param attach_text: mapping of attachment index to a mapping of
            topic index to a dict that may carry an ``attach_url``; may be
            empty or ``None``.
        :return: ``True`` (abnormal) when ``projectcode`` is empty and the
            keyword matcher hits in the body or in any downloadable
            attachment; ``False`` otherwise.
        """
        # A populated field is never abnormal for this check; return an
        # explicit bool and avoid loading the word list needlessly.
        if projectcode != "":
            return False

        matcher = self._get_projectcode_matcher()

        # Announcement body first.
        if self._text_mentions_projectcode(matcher, catch_content, detail, "html", "公告"):
            return True

        # Then every attachment that carries a storage URL.
        for attach_content in (attach_text or {}).values():
            if not attach_content:
                continue
            for topic_detail in attach_content.values():
                attach_url = topic_detail.get("attach_url", "")
                if not attach_url:
                    continue

                st, text = fsc.download_text_content(attach_url)
                # Only inspect attachments that downloaded successfully and
                # contain non-blank text.
                if not st or not text or not text.strip():
                    continue
                # Over-long texts are not processed.
                if len(text) > self._MAX_ATTACH_CHARS:
                    continue

                if self._text_mentions_projectcode(
                    matcher, catch_content, text, "attach", attach_url
                ):
                    return True
        return False

    def check0102(self, projectcode: str) -> bool:
        """Return ``True`` when the length is greater than 2 and at most 4."""
        return 2 < len(projectcode) <= 4

    @staticmethod
    def check0103(projectcode: str) -> bool:
        """Return ``True`` when the length is greater than 50."""
        return len(projectcode) > 50

    def check0201(self, projectcode: str) -> bool:
        """Return ``True`` when the value is shaped like ``dddd/dd/dd``."""
        return self._DATE_FORMAT_RE.match(projectcode) is not None

    def check0202(self, projectcode: str) -> bool:
        """Return ``True`` when the value contains an abnormal keyword."""
        return bool(self._ABNORMAL_KEYWORD_RE.search(projectcode))

    def check0203(self, projectcode: str) -> bool:
        """Return ``True`` when no character satisfies ``str.isalnum()``.

        NOTE(review): ``str.isalnum()`` is Unicode-aware, so Chinese
        characters count as alphanumeric here -- confirm that is intended.
        """
        return not any(char.isalnum() for char in projectcode)

    def check0301(self, projectcode: str) -> bool:
        """Return ``True`` when over 60% of characters are Chinese and the
        value does not contain "中国电信".

        An empty value has no characters to measure and is reported as
        normal (``False``) instead of dividing by zero.
        """
        if not projectcode:
            return False
        # NOTE(review): this range ends at U+9FFF while check0302 uses
        # U+9FA5 -- confirm whether the two should agree.
        chinese_count = sum(1 for char in projectcode if '\u4e00' <= char <= '\u9fff')
        ratio = chinese_count / len(projectcode)
        return ratio > 0.6 and "中国电信" not in projectcode

    def check0302(self, projectcode: str) -> bool:
        """Return ``True`` when the value has a run of 9 or more consecutive
        Chinese characters (U+4E00..U+9FA5)."""
        return self._LONG_CHINESE_RUN_RE.search(projectcode) is not None