title.py 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. """
  2. 标题字段检查
  3. """
  4. import re
  5. from docs.config import general_config
  6. from util.sensitive_word import AcAutomation
  7. import csv
  8. class TitleChecker(object):
  9. """
  10. 标题字段检查
  11. """
  12. def __init__(self):
  13. self.errors_tables = {
  14. "0101": {
  15. "name": "标题长度小于等于5",
  16. "parent_name": "长度类型",
  17. "parent_code": "01",
  18. "checkFn": self.lt5
  19. },
  20. "0102": {
  21. "name": "长度大于等于100",
  22. "parent_name": "长度类型",
  23. "parent_code": "01",
  24. "checkFn": self.gt100
  25. },
  26. "0201":{
  27. "name": "非汉字占比>55%",
  28. "parent_name": "汉字占比",
  29. "parent_code": "02",
  30. "checkFn": self.check0201
  31. },
  32. "0302": {
  33. "name": "不包含通用词汇(中标公告)",
  34. "parent_name": "语义表述不完整",
  35. "parent_code": "03",
  36. "checkFn": self.check0302
  37. }
  38. }
  39. @staticmethod
  40. def gt100(title: str) -> bool:
  41. """
  42. 标题长度大于80
  43. :param title:
  44. :return:返回true 代表异常
  45. """
  46. return len(title) >= 100
  47. @staticmethod
  48. def lt5(title: str) -> bool:
  49. """
  50. 标题长度小于5
  51. :param title:
  52. :return:返回true 代表异常
  53. """
  54. return len(title) <= 5
  55. def check0201(self,title: str) -> bool:
  56. """
  57. self参数实例方法要写,静态类方法不用写
  58. 标题非汉字占比 >55%
  59. :param title:
  60. :return:返回true 代表异常
  61. """
  62. # chinese_chars = [char for char in title if '\u4e00' <= char <= '\u9fff'] # 匹配汉字
  63. non_chinese_chars = [char for char in title if not ('\u4e00' <= char <= '\u9fff')] # 匹配非汉字和非字母数字字符
  64. non_chinese_chars_radio = len(non_chinese_chars) / len(title)
  65. if non_chinese_chars_radio > 0.5:
  66. return True
  67. return False
  68. def check0302(self,title: str) -> bool:
  69. """
  70. 没有通用后缀
  71. :param title:
  72. :return:返回true 代表异常
  73. """
  74. self.check_general_ac = AcAutomation()
  75. with open(general_config["table_field_config"]["path"], "r") as f:
  76. reads = csv.reader(f)
  77. [self.check_general_ac.add_word(w[0]) for w in reads]
  78. p1 = re.compile(r"^[3|6|7|8|0|\.]")
  79. p2 = re.compile(".*--")
  80. if p1.match(title):
  81. print(11111)
  82. return True
  83. if p2.match(title):
  84. print(2222)
  85. return True
  86. if self.check_general_ac.search(title):
  87. return False
  88. return True