title.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. """
  2. 标题字段检查
  3. """
  4. import re
  5. from docs.config import general_config
  6. from docs.config import abnormal_config
  7. import csv
  8. class TitleChecker(object):
  9. """
  10. 标题字段检查
  11. """
  12. def __init__(self):
  13. self.errors_tables = {
  14. "0101": {
  15. "name": "标题长度小于等于5",
  16. "parent_name": "长度类型",
  17. "parent_code": "01",
  18. "checkFn": self.lt5
  19. },
  20. "0102": {
  21. "name": "长度大于等于100",
  22. "parent_name": "长度类型",
  23. "parent_code": "01",
  24. "checkFn": self.gt100
  25. },
  26. "0201":{
  27. "name": "非汉字占比>55%",
  28. "parent_name": "汉字占比",
  29. "parent_code": "02",
  30. "checkFn": self.check0201
  31. },
  32. "0302": {
  33. "name": "不包含通用词汇(中标公告)",
  34. "parent_name": "语义表述不完整",
  35. "parent_code": "03",
  36. "checkFn": self.check0302
  37. }
  38. }
  39. @staticmethod
  40. def gt100(title: str) -> bool:
  41. """
  42. 标题长度大于80
  43. :param title:
  44. :return:返回true 代表异常
  45. """
  46. return len(title) >= 100
  47. @staticmethod
  48. def lt5(title: str) -> bool:
  49. """
  50. 标题长度小于5
  51. :param title:
  52. :return:返回true 代表异常
  53. """
  54. return len(title) <= 5
  55. def check0201(self,title: str) -> bool:
  56. """
  57. self参数实例方法要写,静态类方法不用写
  58. 标题非汉字占比 >55%
  59. :param title:
  60. :return:返回true 代表异常
  61. """
  62. # chinese_chars = [char for char in title if '\u4e00' <= char <= '\u9fff'] # 匹配汉字
  63. non_chinese_chars = [char for char in title if not ('\u4e00' <= char <= '\u9fff')] # 匹配非汉字和非字母数字字符
  64. non_chinese_chars_radio = len(non_chinese_chars) / len(title)
  65. if non_chinese_chars_radio > 0.5:
  66. return True
  67. return False
  68. def check0302(self,title: str) -> bool:
  69. """
  70. 没有通用后缀
  71. :param title:
  72. :return:返回true 代表异常
  73. """
  74. #标题中包含异常字符
  75. with open(abnormal_config["table_field_config"]["path6"], "r") as f:
  76. reads = csv.reader(f)
  77. for w in reads:
  78. if w[0] in title:
  79. return True
  80. #标题以异常字符结尾
  81. #re.search()匹配整个字符串,并返回第一个成功的匹配,如果匹配失败,则返回None
  82. with open(abnormal_config["table_field_config"]["path5"], "r") as f:
  83. reads = csv.reader(f)
  84. for w in reads:
  85. ret=re.search(f"{w[0]}$", title)
  86. if ret != None:
  87. return True
  88. #标题以异常字符开始
  89. p1 = re.compile(r"^[3|6|7|8|0|\.]")
  90. if p1.match(title):
  91. return True
  92. #放在最后判断
  93. #标题必须以通用词汇结尾
  94. with open(general_config["table_field_config"]["path"], "r") as f:
  95. reads = csv.reader(f)
  96. for w in reads:
  97. if re.search(f"{w[0]}$", title) !=None:
  98. return False
  99. else:
  100. return True
  101. return False