import logging
import os
import re
import sys

# Put the parent of the current working directory on the import path before
# the project-local import below.
# NOTE(review): presumably this is what lets ``utils`` resolve when the
# process is started from a sub-directory -- confirm against the launcher.
sys.path.append(os.path.dirname(os.getcwd()))

from utils.execptions import (
    AccountError,
    CheckError,
)

__all__ = ['CheckText', 'CheckTask']

logger = logging.getLogger(__name__)


class CheckContent:
    """Validate the text of a fetched page.

    Calling an instance with the page text runs all checks in a fixed order
    and raises on the first one that fails; it returns ``None`` otherwise.
    """

    def __init__(self):
        # Every entry is used as a regular-expression pattern by
        # ``check_sensitive_word``, so the parentheses in '>(注册)<' and
        # '>(登录)<' are regex groups, not literal characters.
        # Disabled entries: '隐私政策及用户服务协议', '.*点击查看内容'
        self.sensitive_words = {
            '正式会员',
            '账户充值',
            'VIP会员查阅',
            '>(注册)<',
            '>(登录)<',
            '高级会员',
            '标准会员',
            '点击支付',
        }

    @staticmethod
    def check_text_length(val: str):
        """Reject text that is empty, has no Chinese characters, or is too short.

        :param val: page text to validate
        :raises CheckError: code 10101 when ``val`` is empty; code 10102 when
            it contains no character in the range U+4E00-U+9FA5, or when
            fewer than two characters remain after cleaning
        """
        if len(val) == 0:
            raise CheckError(code=10101, reason='文本内容为空')

        # Only existence matters here, so a single search is enough.
        if re.search(r'[\u4e00-\u9fa5]+', val, re.S) is None:
            raise CheckError(code=10102, reason='不存在中文字符')

        # Strip tag-like ``<...>`` spans first, then everything that is not
        # a digit, an ASCII letter or a Chinese character.
        for pattern in ('<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+'):
            val = re.sub(pattern, '', val)

        # Fewer than two characters left means the page carries no real
        # detail content. NOTE(review): this reuses code 10102 with a
        # different reason than the check above.
        if len(val) < 2:
            raise CheckError(code=10102, reason='页面无有效内容')

    @staticmethod
    def check_content(val: str):
        """Reject pages stating the file cannot be browsed online.

        :param val: page text to validate
        :raises CheckError: code 10103 when the marker sentence is present
        """
        if "部分文件可能不支持在线浏览" in val:
            raise CheckError(code=10103, reason='文件不支持在线浏览')

    @staticmethod
    def check_account_privilege(val: str):
        """Detect pages that are gated behind a membership level.

        :param val: page text to validate
        :raises AccountError: code 10011 when the text mentions '高级会员';
            code 10012 when it contains the formal-member-only notice
        """
        if "高级会员" in val:
            raise AccountError(code=10011, reason='账号权限等级过低')
        if "本招标项目仅供正式会员查阅" in val:
            raise AccountError(code=10012, reason='账号无会员访问权限')

    def check_sensitive_word(self, val: str):
        """Reject text matching any pattern in ``self.sensitive_words``.

        :param val: page text to validate
        :raises CheckError: code 10104 when at least one pattern matches
        """
        # Stop at the first hit; which pattern matched is never reported.
        if any(re.search(word, val) is not None
               for word in self.sensitive_words):
            raise CheckError(code=10104, reason='敏感词过滤')

    def __check(self, text):
        """Run every content check in order; the first failure propagates."""
        # NOTE(review): with the default word list, '高级会员' and '正式会员'
        # are both sensitive words, so any text that would trigger
        # ``check_account_privilege`` is already rejected here with
        # CheckError 10104. Confirm whether that ordering is intended before
        # relying on AccountError from ``__call__``.
        self.check_sensitive_word(text)
        self.check_text_length(text)
        self.check_content(text)
        self.check_account_privilege(text)

    def __call__(self, text: str, *args, **kwargs):
        """Validate ``text``; extra positional/keyword arguments are ignored."""
        self.__check(text)


class CheckPrePareRequest:
    """Validate a task row before it is requested.

    Calling an instance with the row dict checks the title against the crawl
    keywords and then runs the (currently stubbed) ES duplicate check.
    """

    def __init__(self):
        # Each entry is passed to ``re.search`` as a pattern against the title.
        self.crawl_keywords = {
            '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选',
            '意见征询', '更正公告', '废标', '补遗', '议价', '邀请', '资格预审',
            '竞标', '变更', '遴选', '磋商', '项目', '评审', '询比', '开标', '澄清',
            '比选', '中止', '采购', '竟价', '招投标', '拟建', '成交', '中标',
            '竞争性谈判', '工程', '验收公告', '更正', '单一来源', '变更公告', '合同',
            '违规', '评判', '监理', '竞价', '答疑', '终止', '系统',
        }

    @staticmethod
    def check_es_cache(title: str, publish_time: int, rows: dict):
        """Reject rows whose title is already indexed in ES.

        :param title: title of the item
        :param publish_time: publish-time timestamp (``l_np_publishtime``)
        :param rows: collected row; receives ``rows['count']`` on a hit
        :raises CheckError: code 10105 when the lookup result is non-zero
        """
        # TODO: the real lookup is disabled -- es_search(title, publish_time).
        # With the result pinned to 0 the branch below never runs.
        retrieved_result = 0
        if retrieved_result != 0:
            # Record the ES query result on the row before rejecting it.
            rows['count'] = retrieved_result
            raise CheckError(code=10105, reason='es已收录标题')

    def check_crawl_title(self, title: str):
        """Require the title to match at least one crawl keyword.

        :param title: title of the item
        :raises CheckError: code 10106 when no keyword matches
        """
        for keyword in self.crawl_keywords:
            if re.search(keyword, title) is not None:
                # Diagnostic only; goes to the logger rather than stdout.
                logger.debug('crawl keyword matched: %s', keyword)
                return
        raise CheckError(code=10106, reason='标题未检索到采集关键词', title=title)

    def __check(self, rows: dict):
        """Run the title check, then the ES duplicate check.

        Reads ``rows['title']`` and ``rows['l_np_publishtime']``; a missing
        key raises ``KeyError``.
        """
        title, publish_time = rows['title'], rows['l_np_publishtime']
        self.check_crawl_title(title)
        self.check_es_cache(title, publish_time, rows)

    def __call__(self, rows: dict, *args, **kwargs):
        """Validate ``rows``; extra positional/keyword arguments are ignored."""
        self.__check(rows)


# Shared, ready-to-call validator instances exported via ``__all__``.
CheckText = CheckContent()
CheckTask = CheckPrePareRequest()