123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106 |
"""
Title field checks.
"""
- import re
- from docs.config import general_config
- from docs.config import abnormal_config
- import csv
class TitleChecker(object):
    """
    Rule-based checks for a title field.

    ``errors_tables`` maps a four-character error code to a rule description.
    In each entry ``name`` and ``parent_name`` are display labels,
    ``parent_code`` is the code of the rule group, and ``checkFn`` is a
    callable that takes the title string and returns True when the title
    violates the rule (i.e. the title is abnormal).
    """

    # Share of non-Chinese characters above which check0201 flags a title.
    # NOTE(review): the rule label and the original docstring say 55%, but the
    # code has always compared against 0.5; the runtime value is kept as is.
    _NON_CHINESE_RATIO_LIMIT = 0.5

    # Leading characters that mark a title as abnormal.
    # NOTE(review): inside a character class "|" is a literal, so a title that
    # starts with "|" is flagged as well; kept to preserve existing behaviour.
    _ABNORMAL_PREFIX = re.compile(r"^[3|6|7|8|0|\.]")

    def __init__(self):
        self.errors_tables = {
            "0101": {
                "name": "标题长度小于等于5",
                "parent_name": "长度类型",
                "parent_code": "01",
                "checkFn": self.lt5,
            },
            "0102": {
                "name": "长度大于等于100",
                "parent_name": "长度类型",
                "parent_code": "01",
                "checkFn": self.gt100,
            },
            "0201": {
                "name": "非汉字占比>55%",
                "parent_name": "汉字占比",
                "parent_code": "02",
                "checkFn": self.check0201,
            },
            "0302": {
                "name": "不包含通用词汇(中标公告)",
                "parent_name": "语义表述不完整",
                "parent_code": "03",
                "checkFn": self.check0302,
            },
        }

    @staticmethod
    def gt100(title: str) -> bool:
        """
        Check whether the title is too long (100 characters or more).

        :param title: title text to check
        :return: True means the title is abnormal
        """
        return len(title) >= 100

    @staticmethod
    def lt5(title: str) -> bool:
        """
        Check whether the title is too short (5 characters or fewer).

        :param title: title text to check
        :return: True means the title is abnormal
        """
        return len(title) <= 5

    def check0201(self, title: str) -> bool:
        """
        Check whether non-Chinese characters dominate the title.

        A character counts as Chinese when it lies in U+4E00..U+9FFF; every
        other character (letters, digits, punctuation, spaces) counts as
        non-Chinese.

        :param title: title text to check
        :return: True means the title is abnormal (the non-Chinese share is
            above the limit); an empty title returns False
        """
        if not title:
            # Nothing to measure; this also avoids dividing by zero below.
            return False
        non_chinese_count = sum(
            1 for char in title if not ("\u4e00" <= char <= "\u9fff")
        )
        return non_chinese_count / len(title) > self._NON_CHINESE_RATIO_LIMIT

    def check0302(self, title: str) -> bool:
        """
        Check whether the title lacks a general suffix or looks malformed.

        The checks run in this order and stop at the first hit; each word
        list is only read from disk when its check is reached:

        1. the title contains a word from ``abnormal_config`` ``path6``;
        2. the title ends with a word from ``abnormal_config`` ``path5``;
        3. the title starts with one of the abnormal leading characters;
        4. the title does not end with any word from ``general_config``
           ``path`` (evaluated last).

        :param title: title text to check
        :return: True means the title is abnormal
        """
        abnormal_paths = abnormal_config["table_field_config"]

        # The title contains an abnormal word.
        if self._contains_any(title, self._read_first_column(abnormal_paths["path6"])):
            return True

        # The title ends with an abnormal word.
        if self._ends_with_any(title, self._read_first_column(abnormal_paths["path5"])):
            return True

        # The title starts with an abnormal character.
        if self._ABNORMAL_PREFIX.match(title):
            return True

        # Checked last: the title must end with one of the general words.
        # Every listed word is considered, not only the first row of the file.
        general_words = self._read_first_column(
            general_config["table_field_config"]["path"]
        )
        return not self._ends_with_any(title, general_words)

    @staticmethod
    def _read_first_column(path: str) -> list:
        """Return the non-empty first cell of every row of the CSV file at *path*."""
        # newline="" lets the csv module do its own line-ending handling.
        # NOTE(review): the file is still decoded with the platform default
        # encoding, as before; confirm the encoding of the word lists.
        with open(path, "r", newline="") as f:
            # Blank lines and empty first cells are skipped: the former would
            # raise IndexError, the latter would match every title.
            return [row[0] for row in csv.reader(f) if row and row[0]]

    @staticmethod
    def _contains_any(title: str, words) -> bool:
        """Return True when *title* contains at least one of *words*."""
        return any(word in title for word in words)

    @staticmethod
    def _ends_with_any(title: str, words) -> bool:
        """Return True when *title* ends with at least one of *words*."""
        # Words are escaped so that entries containing regex metacharacters
        # ("?", ".", "(" ...) are matched literally. The "$" anchor is kept,
        # so a single trailing newline in the title is still tolerated.
        return any(
            re.search(re.escape(word) + "$", title) is not None for word in words
        )
|