"""Title field checks."""
import csv
import re

from docs.config import abnormal_config
from docs.config import general_config


class TitleChecker(object):
    """Detect anomalies in a title string.

    ``errors_tables`` maps an error code to a descriptor with a display
    ``name``, the ``parent_name`` / ``parent_code`` of its category, and
    ``checkFn``: a callable that takes the title and returns ``True`` when
    the title is anomalous for that rule.
    """

    # Share of non-Chinese characters above which ``check0201`` flags a title.
    # NOTE(review): the "0201" label says 55%, but the comparison has always
    # been against 0.5. Kept at 0.5 to preserve behaviour -- confirm which
    # value is intended.
    NON_CHINESE_RATIO_LIMIT = 0.5

    # Leading characters that make ``check0302`` flag a title.
    # NOTE(review): inside a character class ``|`` is a literal, so a title
    # starting with "|" also matches. Pattern kept as-is -- confirm intent.
    _BAD_PREFIX = re.compile(r"^[3|6|7|8|0|\.]")

    def __init__(self):
        self.errors_tables = {
            "0101": {
                "name": "标题长度小于等于5",
                "parent_name": "长度类型",
                "parent_code": "01",
                "checkFn": self.lt5,
            },
            "0102": {
                "name": "长度大于等于100",
                "parent_name": "长度类型",
                "parent_code": "01",
                "checkFn": self.gt100,
            },
            "0201": {
                "name": "非汉字占比>55%",
                "parent_name": "汉字占比",
                "parent_code": "02",
                "checkFn": self.check0201,
            },
            "0302": {
                "name": "不包含通用词汇(中标公告)",
                "parent_name": "语义表述不完整",
                "parent_code": "03",
                "checkFn": self.check0302,
            },
        }

    @staticmethod
    def gt100(title: str) -> bool:
        """Return ``True`` (anomalous) when the title has 100 or more characters.

        :param title: title text to check
        :return: ``True`` means the title is anomalous
        """
        return len(title) >= 100

    @staticmethod
    def lt5(title: str) -> bool:
        """Return ``True`` (anomalous) when the title has 5 or fewer characters.

        :param title: title text to check
        :return: ``True`` means the title is anomalous
        """
        return len(title) <= 5

    def check0201(self, title: str) -> bool:
        """Return ``True`` (anomalous) when too much of the title is non-Chinese.

        A character counts as Chinese when it lies in U+4E00..U+9FFF; every
        other character (letters, digits, punctuation, whitespace) counts as
        non-Chinese. The title is flagged when the non-Chinese share exceeds
        ``NON_CHINESE_RATIO_LIMIT``.

        :param title: title text to check
        :return: ``True`` means the title is anomalous; an empty title has no
            defined ratio and returns ``False``
        """
        if not title:
            # Avoid dividing by zero; the length rule already flags this case.
            return False
        non_chinese = sum(
            1 for ch in title if not ("\u4e00" <= ch <= "\u9fff")
        )
        return non_chinese / len(title) > self.NON_CHINESE_RATIO_LIMIT

    @staticmethod
    def _load_words(path: str) -> list:
        """Return the non-empty first-column values of the CSV file at *path*.

        Blank rows and rows whose first cell is empty are skipped: an empty
        word would match every title.
        """
        # NOTE(review): the file is decoded with the locale default encoding,
        # as before; pass an explicit encoding once the files' encoding is known.
        with open(path, "r", newline="") as f:
            return [row[0] for row in csv.reader(f) if row and row[0]]

    def check0302(self, title: str) -> bool:
        """Return ``True`` (anomalous) when the title fails the wording rules.

        The rules run in this order and the first hit decides:

        1. the title contains any word listed in the ``path6`` file;
        2. the title ends with any word listed in the ``path5`` file;
        3. the title starts with one of the characters in ``_BAD_PREFIX``;
        4. the title does not end with any word listed in the general
           config ``path`` file.

        Words are compared literally, not as regular expressions.

        :param title: title text to check
        :return: ``True`` means the title is anomalous. When the general word
            list is empty, rule 4 is skipped and ``False`` is returned.
        """
        abnormal_paths = abnormal_config["table_field_config"]

        # Rule 1: a forbidden word appears anywhere in the title.
        forbidden_words = self._load_words(abnormal_paths["path6"])
        if any(word in title for word in forbidden_words):
            return True

        # Rule 2: the title ends with a forbidden suffix.
        forbidden_suffixes = self._load_words(abnormal_paths["path5"])
        if title.endswith(tuple(forbidden_suffixes)):
            return True

        # Rule 3: the title starts with a forbidden character.
        if self._BAD_PREFIX.match(title) is not None:
            return True

        # Rule 4 (must stay last): the title has to end with one of the
        # generic words. Every configured word is tried, not just the first.
        generic_suffixes = self._load_words(
            general_config["table_field_config"]["path"]
        )
        if not generic_suffixes:
            return False
        return not title.endswith(tuple(generic_suffixes))