projectcode.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. import re
  2. from docs.config import general_config
  3. from util.sensitive_word import AcAutomation
  4. import csv
  5. from docs.config import abnormal_config
  6. from tables import CatchContentObject, fsc
  7. class ProjectcodeChecker(object):
  8. """
  9. 项目编号字段检查
  10. """
  11. def __init__(self):
  12. self.errors_tables = {
  13. "0101": {
  14. "name": "项目编号字段无值但是正文疑似有值",
  15. "parent_name": "数值检测",
  16. "parent_code": "01",
  17. "checkFn": self.check0101
  18. },
  19. "0102": {
  20. "name": "长度大于2小于等于4",
  21. "parent_name": "长度类型",
  22. "parent_code": "02",
  23. "checkFn": self.check0102
  24. },
  25. "0103": {
  26. "name": "长度大于50",
  27. "parent_name": "长度类型",
  28. "parent_code": "03",
  29. "checkFn": self.check0103
  30. },
  31. "0201": {
  32. "name": "检查日期格式",
  33. "parent_name": "日期格式",
  34. "parent_code": "01",
  35. "checkFn": self.check0201
  36. },
  37. "0202": {
  38. "name": "包含异常关键字",
  39. "parent_name": "异常关键字",
  40. "parent_code": "02",
  41. "checkFn": self.check0202
  42. },
  43. "0203": {
  44. "name": "不包含数字字母",
  45. "parent_name": "不包含数字字母",
  46. "parent_code": "03",
  47. "checkFn": self.check0203
  48. },
  49. "0301":{
  50. "name": "汉字占比>60%且不包含中国电信",
  51. "parent_name": "汉字占比",
  52. "parent_code": "01",
  53. "checkFn": self.check0301
  54. },
  55. "0302": {
  56. "name": "连续汉字超过9个",
  57. "parent_name": "汉字占比",
  58. "parent_code": "03",
  59. "checkFn": self.check0302
  60. }
  61. }
  62. def check0101(self,projectcode,detail,catch_content,attach_text) -> bool:
  63. '''
  64. :return:返回true 代表异常
  65. '''
  66. self.check_projectcode_ac = AcAutomation()
  67. with open(abnormal_config["table_field_config"]["path4"], "r") as f:
  68. reads = csv.reader(f)
  69. [self.check_projectcode_ac.add_word(w[0]) for w in reads]
  70. if projectcode == "":
  71. contents = catch_content.public_attachment_catch(detail, platform="html", document_id="公告") #返回值是字典
  72. content = "\n".join(contents) #字典处理成字符串
  73. if self.check_projectcode_ac.search(content):
  74. return True
  75. for attach_index, attach_content in attach_text.items():
  76. if attach_content:
  77. for topic_index, topic_detail in attach_content.items():
  78. # oss地址
  79. attach_url = topic_detail.get("attach_url", "")
  80. if attach_url:
  81. # 获取附件内容
  82. st, content = fsc.download_text_content(attach_url)
  83. # 下载成功
  84. # 超长文本不处理,暂定30万字
  85. if st and content.strip():
  86. if len(content) > 300000:
  87. continue
  88. # 开始检测
  89. contents = catch_content.public_attachment_catch(content, platform="attach",document_id=attach_url)
  90. content = "\n".join(contents)
  91. if self.check_projectcode_ac.search(content):
  92. return True
  93. return False
  94. # 检查projectcode长度小于等于4大于2
  95. def check0102(self,projectcode: str) -> bool:
  96. return 2 < len(projectcode) <= 4
  97. @staticmethod
  98. # 检查projectcode长度大于50
  99. def check0103( projectcode: str) -> bool:
  100. return len(projectcode) > 50
  101. def check0201(self, projectcode: str) -> bool:
  102. def is_valid_date_format(s):
  103. date_format_regex = r'^\d{4}/\d{2}/\d{2}$'
  104. return re.match(date_format_regex, s) is not None
  105. return is_valid_date_format(projectcode)
  106. def check0202(self, projectcode: str) -> bool:
  107. codeUnConReg = re.compile(r"(null|勘察|测试|设计|监理|范围|分包|日)")
  108. return bool(codeUnConReg.search(projectcode))
  109. def check0203(self, projectcode: str) -> bool:
  110. return not any(char.isalnum() for char in projectcode)
  111. def check0301(self, projectcode: str) -> bool:
  112. chinese_chars = [char for char in projectcode if '\u4e00' <= char <= '\u9fff']
  113. chinese_chars_ratio = len(chinese_chars) / len(projectcode)
  114. return chinese_chars_ratio > 0.6 and "中国电信" not in projectcode
  115. def check0302(self, projectcode: str) -> bool:
  116. return len(re.findall(r'[\u4e00-\u9fa5]{9,}', projectcode)) > 0