import threading
import time

from common.log import logger
from crawler.bloom_filter.RedisBloomFilter import RedisFilter
from settings import (
    MGO_REPETITION,
    REQUIREMENT_PHRASE,
    SENSITIVE_WORDS
)


def _requirement_phrase(title: str) -> bool:
    """Return True if *title* contains at least one required keyword."""
    return any(word in title for word in REQUIREMENT_PHRASE)


def _sensitive_word(title: str) -> bool:
    """Return True if *title* contains at least one sensitive word."""
    return any(word in title for word in SENSITIVE_WORDS)


class Validator:
    """Title/URL validator backed by a Redis bloom filter for deduplication."""

    def __init__(self):
        self.rbf = RedisFilter(redis_key='duplicated_')
        # Capacity 1e9 items at a 0.001% target false-positive rate.
        self.rbf.start(1000000000, 0.00001)
        # Seconds between reload passes. NOTE: non-PEP8 name kept for
        # backward compatibility with any external readers of this attribute.
        self.loop_Interval = 7200
        self._sensitive_word = _sensitive_word
        self._requirement_phrase = _requirement_phrase

    def _load_filter_feature(self):
        """Periodically sync known URLs from Mongo into the bloom filter.

        Runs forever in a background thread. Malformed records (non-string
        ``url``) are deleted so they are not re-read on every pass. A failure
        of one pass is logged and retried on the next cycle.
        """
        while True:
            count = 0
            try:
                cursor = MGO_REPETITION.find(projection={'url': 1}).sort([('_id', -1)])
                for item in cursor:
                    url = item['url']
                    if not isinstance(url, str):
                        # Drop malformed records instead of looping over them forever.
                        MGO_REPETITION.delete_one({'_id': item['_id']})
                        continue
                    if not self.rbf.is_exists(url):
                        self.rbf.add(url)
                        count += 1
            except Exception:
                # BUGFIX: the original try/finally had no except clause, so a
                # transient Mongo/Redis error propagated out of the while-loop
                # and silently killed the loader thread. Log and retry instead.
                logger.exception('[站点判重]数据加载异常')
            finally:
                logger.info(f'[站点判重]数据加载:{len(self.rbf)}条,新增:{count}条')
                time.sleep(self.loop_Interval)

    def load_filter(self):
        """Start the background loader thread."""
        logger.info('[站点判重]初始化加载')
        threading.Thread(
            target=self._load_filter_feature,
            name='DuplicateRemoval_',
            # BUGFIX: the thread runs an infinite loop; without daemon=True it
            # prevented the interpreter from ever exiting cleanly.
            daemon=True
        ).start()

    def add_filter_feature(self, feature: str):
        """Record *feature* (a URL) as already seen."""
        self.rbf.add(feature)

    def words(self, title, task):
        """Validate *title*; on rejection, flag *task* and return False.

        Sets ``task['sensitive']`` when a sensitive word is present, or
        ``task['requirement']`` when no required keyword is present.
        """
        if self._sensitive_word(title):
            task['sensitive'] = True
            return False
        elif not self._requirement_phrase(title):
            task['requirement'] = True
            return False
        return True

    def url(self, base_url):
        """Return True if *base_url* has not been seen before."""
        return not self.rbf.is_exists(base_url)