1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- """
- 标题字段检查
- """
- import re
- from docs.config import general_config
- from util.sensitive_word import AcAutomation
- import csv
class TitleChecker:
    """Rule-based sanity checks for a title field.

    ``errors_tables`` maps a four-character error code to a rule description
    (``name``, ``parent_name``, ``parent_code``) and to the predicate that
    implements the rule (``checkFn``). Every predicate takes the title string
    and returns ``True`` when the title violates the rule.
    """

    # A title is flagged once its share of non-Chinese characters exceeds this
    # ratio; the value mirrors the ">55%" wording of rule 0201.
    _NON_CHINESE_RATIO_LIMIT = 0.55

    # Leading characters that make rule 0302 flag a title outright.
    _BAD_PREFIX = re.compile(r"^[36780.]")

    # "--" anywhere on the first line of the title ("." stops at a newline).
    _DOUBLE_DASH = re.compile(r".*--")

    # General-word matcher; built from the configured word list the first
    # time check0302 needs it and reused afterwards.
    check_general_ac = None

    def __init__(self):
        self.errors_tables = {
            "0101": {
                "name": "标题长度小于等于5",
                "parent_name": "长度类型",
                "parent_code": "01",
                "checkFn": self.lt5,
            },
            "0102": {
                "name": "长度大于等于100",
                "parent_name": "长度类型",
                "parent_code": "01",
                "checkFn": self.gt100,
            },
            "0201": {
                "name": "非汉字占比>55%",
                "parent_name": "汉字占比",
                "parent_code": "02",
                "checkFn": self.check0201,
            },
            "0302": {
                "name": "不包含通用词汇(中标公告)",
                "parent_name": "语义表述不完整",
                "parent_code": "03",
                "checkFn": self.check0302,
            },
        }

    @staticmethod
    def gt100(title: str) -> bool:
        """Rule 0102: the title is too long.

        :param title: title text to check
        :return: True (abnormal) when the title has 100 or more characters
        """
        return len(title) >= 100

    @staticmethod
    def lt5(title: str) -> bool:
        """Rule 0101: the title is too short.

        :param title: title text to check
        :return: True (abnormal) when the title has 5 or fewer characters
        """
        return len(title) <= 5

    def check0201(self, title: str) -> bool:
        """Rule 0201: more than 55% of the title is non-Chinese characters.

        A character counts as Chinese when it lies in U+4E00..U+9FFF; every
        other character (letters, digits, punctuation, whitespace) counts as
        non-Chinese.

        :param title: title text to check
        :return: True (abnormal) when the non-Chinese share exceeds 55%;
            False for an empty title, which has no characters to measure
        """
        if not title:
            # Avoid dividing by zero; the length rule covers empty titles.
            return False
        non_chinese = sum(
            1 for char in title if not ('\u4e00' <= char <= '\u9fff')
        )
        return non_chinese / len(title) > self._NON_CHINESE_RATIO_LIMIT

    def _general_word_matcher(self):
        """Return the general-word matcher, building it on first use.

        The matcher is an ``AcAutomation`` filled with the first column of the
        CSV file at ``general_config["table_field_config"]["path"]``. It is
        stored on ``self.check_general_ac`` so the file is read only once per
        checker instance.
        """
        if self.check_general_ac is None:
            matcher = AcAutomation()
            path = general_config["table_field_config"]["path"]
            # NOTE(review): assumes the word list is stored as UTF-8 - confirm.
            with open(path, "r", encoding="utf-8", newline="") as f:
                for row in csv.reader(f):
                    # Blank lines come back as empty rows; skip them and
                    # empty first cells instead of failing on row[0].
                    if row and row[0]:
                        matcher.add_word(row[0])
            self.check_general_ac = matcher
        return self.check_general_ac

    def check0302(self, title: str) -> bool:
        """Rule 0302: the title lacks a general term or looks malformed.

        The title is abnormal when any of the following holds:

        * it starts with one of ``3``, ``6``, ``7``, ``8``, ``0`` or ``.``;
        * its first line contains ``--``;
        * the general-word matcher finds none of its words in the title.

        :param title: title text to check
        :return: True (abnormal) when the title fails the rule
        """
        # Cheap pattern checks first, so the word list is only loaded when a
        # title actually reaches the vocabulary lookup.
        if self._BAD_PREFIX.match(title):
            return True
        if self._DOUBLE_DASH.match(title):
            return True
        # NOTE(review): reuses one matcher across calls; assumes
        # AcAutomation.search does not change the matcher's word set - confirm.
        return not self._general_word_matcher().search(title)
|