title.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. """
  2. 标题字段检查
  3. """
  4. import re
  5. from docs.config import general_config
  6. from docs.config import abnormal_config
  7. import csv
  8. class TitleChecker(object):
  9. """
  10. 标题字段检查
  11. """
  12. def __init__(self):
  13. self.errors_tables = {
  14. "0101": {
  15. "name": "标题长度小于等于5",
  16. "parent_name": "长度类型",
  17. "parent_code": "01",
  18. "checkFn": self.lt5
  19. },
  20. "0102": {
  21. "name": "长度大于等于100",
  22. "parent_name": "长度类型",
  23. "parent_code": "01",
  24. "checkFn": self.gt100
  25. },
  26. "0201":{
  27. "name": "非汉字占比>55%",
  28. "parent_name": "汉字占比",
  29. "parent_code": "02",
  30. "checkFn": self.check0201
  31. },
  32. # "0302": {
  33. # "name": "不包含通用词汇(中标公告)",
  34. # "parent_name": "语义表述不完整",
  35. # "parent_code": "03",
  36. # "checkFn": self.check0302
  37. # },
  38. "0303": {
  39. "name": "包含叠词,异常词汇,特殊词汇(测试,公告公告等)",
  40. "parent_name": "语义表述不完整",
  41. "parent_code": "03",
  42. "checkFn": self.check0303
  43. }
  44. }
  45. @staticmethod
  46. def gt100(title: str) -> bool:
  47. """
  48. 标题长度大于80
  49. :param title:
  50. :return:返回true 代表异常
  51. """
  52. return len(title) >= 100
  53. @staticmethod
  54. def lt5(title: str) -> bool:
  55. """
  56. 标题长度小于5
  57. :param title:
  58. :return:返回true 代表异常
  59. """
  60. return len(title) <= 5
  61. def check0201(self,title: str) -> bool:
  62. """
  63. self参数实例方法要写,静态类方法不用写
  64. 标题非汉字占比 >55%
  65. :param title:
  66. :return:返回true 代表异常
  67. """
  68. # chinese_chars = [char for char in title if '\u4e00' <= char <= '\u9fff'] # 匹配汉字
  69. non_chinese_chars = [char for char in title if not ('\u4e00' <= char <= '\u9fff')] # 匹配非汉字和非字母数字字符
  70. non_chinese_chars_radio = len(non_chinese_chars) / len(title)
  71. if non_chinese_chars_radio > 0.5:
  72. return True
  73. return False
  74. def check0302(self,title: str) -> bool:
  75. """
  76. 没有通用后缀
  77. :param title:
  78. :return:返回true 代表异常
  79. """
  80. #标题必须以通用词汇结尾
  81. with open(general_config["table_field_config"]["path"], "r") as f:
  82. reads = csv.reader(f)
  83. for w in reads:
  84. if w[0] in title:
  85. return False
  86. else:
  87. return True
  88. return False
  89. def check0303(self, title: str) -> bool:
  90. """
  91. 没有通用后缀
  92. :param title:
  93. :return:返回true 代表异常
  94. """
  95. #标题中包含异常字符
  96. with open(abnormal_config["table_field_config"]["path6"], "r") as f:
  97. reads = csv.reader(f)
  98. for w in reads:
  99. if w[0] in title:
  100. return True
  101. #标题以异常字符结尾
  102. #re.search()匹配整个字符串,并返回第一个成功的匹配,如果匹配失败,则返回None
  103. # with open(abnormal_config["table_field_config"]["path5"], "r") as f:
  104. # reads = csv.reader(f)
  105. # for w in reads:
  106. # ret=re.search(f"{w[0]}$", title)
  107. # if (ret!= None) or (ret.group() == w[0]):
  108. # return True
  109. p2 = re.search("[nbsp\..\...\.]$",title)
  110. #re.search():匹配整个字符串,并返回第一个成功的匹配,如果匹配失败,则返回None
  111. if p2!=None:
  112. return True
  113. #标题以异常字符开始
  114. p1 = re.search("^[36780\.]",title)
  115. if p1!=None:
  116. return True