check_utils.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. import sys
  2. import os
  3. sys.path.append(os.path.dirname(os.getcwd()))
  4. import re
  5. from utils.execptions import (
  6. AccountError,
  7. CheckError,
  8. )
  9. __all__ = ['CheckText', 'CheckTask']
  10. class CheckContent:
  11. def __init__(self):
  12. self.sensitive_words = {
  13. '正式会员', '账户充值', 'VIP会员查阅', '>(注册)<', '>(登录)<', '高级会员',
  14. '标准会员', '点击支付',
  15. # '隐私政策及用户服务协议','.*<a href=\"(.*?)\">点击查看内容'
  16. }
  17. @staticmethod
  18. def check_text_length(val: str):
  19. if len(val) == 0:
  20. raise CheckError(code=10101, reason='文本内容为空')
  21. elif not re.findall(r'[\u4e00-\u9fa5]+', val, re.S):
  22. raise CheckError(code=10102, reason='不存在中文字符')
  23. else:
  24. '''清洗数字、字母、中文之外的干扰元素'''
  25. sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
  26. for pattern in sub_pattern:
  27. val = re.sub(pattern, '', val)
  28. # 若文本长度小于2,表示页面内容无详情内容
  29. if len(val) < 2:
  30. raise CheckError(code=10102, reason='页面无有效内容')
  31. @staticmethod
  32. def check_content(val: str):
  33. if val.count("部分文件可能不支持在线浏览"):
  34. raise CheckError(code=10103, reason='文件不支持在线浏览')
  35. @staticmethod
  36. def check_account_privilege(val: str):
  37. if val.count("高级会员"):
  38. raise AccountError(code=10011, reason='账号权限等级过低')
  39. elif "本招标项目仅供正式会员查阅" in val:
  40. raise AccountError(code=10012, reason='账号无会员访问权限')
  41. def check_sensitive_word(self, val: str):
  42. total = set()
  43. for word in self.sensitive_words:
  44. result = re.search(word, val)
  45. if result is not None:
  46. total.add(word)
  47. if len(total) > 0:
  48. raise CheckError(code=10104, reason='敏感词过滤')
  49. def __check(self, text):
  50. self.check_sensitive_word(text)
  51. self.check_text_length(text)
  52. self.check_content(text)
  53. self.check_account_privilege(text)
  54. def __call__(self, text: str, *args, **kwargs):
  55. self.__check(text)
  56. class CheckPrePareRequest:
  57. def __init__(self):
  58. self.crawl_keywords = {
  59. '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选', '意见征询',
  60. '更正公告', '废标', '补遗', '议价', '邀请', '资格预审', '竞标', '变更', '遴选',
  61. '磋商', '项目', '评审', '询比', '开标', '澄清', '比选', '中止', '采购', '竟价',
  62. '招投标', '拟建', '成交', '中标', '竞争性谈判', '工程', '验收公告', '更正',
  63. '单一来源', '变更公告', '合同', '违规', '评判', '监理', '竞价', '答疑',
  64. '终止', '系统'
  65. }
  66. @staticmethod
  67. def check_es_cache(title: str, publish_time: int, rows: dict):
  68. """
  69. :param title: 标题
  70. :param publish_time: 发布时间的时间戳(l_np_publishtime)
  71. :param rows: 采集内容
  72. """
  73. # retrieved_result = es_search(title, publish_time)
  74. retrieved_result = 0
  75. if retrieved_result != 0:
  76. '''es查询数据结果'''
  77. rows['count'] = retrieved_result
  78. raise CheckError(code=10105, reason='es已收录标题')
  79. def check_crawl_title(self, title: str):
  80. for keyword in self.crawl_keywords:
  81. valid_keyword = re.search(keyword, title)
  82. if valid_keyword is not None:
  83. print(valid_keyword)
  84. break
  85. else:
  86. raise CheckError(code=10106, reason='标题未检索到采集关键词', title=title)
  87. def __check(self, rows: dict):
  88. title, publish_time = rows['title'], rows['l_np_publishtime']
  89. self.check_crawl_title(title)
  90. self.check_es_cache(title, publish_time, rows)
  91. def __call__(self, rows: dict, *args, **kwargs):
  92. self.__check(rows)
  93. CheckText = CheckContent()
  94. CheckTask = CheckPrePareRequest()