import threading
import time

from common.log import logger
from crawler.bloom_filter.RedisBloomFilter import RedisFilter
from settings import (
    MGO_REMOVAL_DUPLICATE,
    REQUIREMENT_PHRASE
)


def _requirement_phrase(val: str) -> bool:
    """Return True if *val* contains any configured requirement keyword."""
    return any(word in val for word in REQUIREMENT_PHRASE)


class Validator:
    """URL/domain de-duplication validator backed by a Redis bloom filter.

    A background thread (started via ``load_filter``) periodically syncs
    known domains from MongoDB into the bloom filter; ``url`` / ``add_url``
    expose membership checks and inserts, and ``requirement_word`` checks
    text against the configured requirement keywords.
    """

    def __init__(self):
        self._rbf = RedisFilter(redis_key='RemovalDuplicate_')
        # capacity = 1e9 items, target false-positive rate = 0.001%
        self._rbf.start(1000000000, 0.00001)
        self._requirement_phrase = _requirement_phrase
        self._loop_interval = 7200  # seconds between Mongo -> bloom-filter syncs

    def _sync_data_rubbish(self):
        """Background loop: load domains from MongoDB into the bloom filter.

        Runs forever, scanning documents newest-first. Documents whose
        ``domain`` field is not a string are deleted as malformed.
        Transient Mongo/Redis errors are logged and the loop retries on
        the next cycle instead of silently killing the thread (the
        original ``try/finally`` had no ``except``, so any error
        terminated the sync thread permanently).
        """
        while True:
            count = 0
            cursor = MGO_REMOVAL_DUPLICATE.find(projection={'domain': 1})
            try:
                for item in cursor.sort([('_id', -1)]):
                    domain = item['domain']
                    if not isinstance(domain, str):
                        # malformed record — remove it from the collection
                        MGO_REMOVAL_DUPLICATE.delete_one({'_id': item['_id']})
                        continue
                    if not self._rbf.is_exists(domain):
                        self._rbf.add(domain)
                        count += 1
            except Exception:
                # keep the sync thread alive across transient DB errors
                logger.exception('[过滤器]数据同步异常')
            finally:
                logger.info(f'[过滤器]数据加载:{len(self._rbf)}条,新增:{count}条')
            time.sleep(self._loop_interval)

    def load_filter(self):
        """Start the background sync thread.

        The thread is marked daemon: it runs an infinite loop, and a
        non-daemon thread would prevent the interpreter from ever
        exiting.
        """
        logger.info('[过滤器]初始化加载')
        threading.Thread(
            target=self._sync_data_rubbish,
            name='RemovalDuplicate_',
            daemon=True,
        ).start()

    def add_url(self, url: str):
        """Record *url* in the bloom filter."""
        self._rbf.add(url)

    def requirement_word(self, val) -> bool:
        """Return True if *val* contains any configured requirement keyword."""
        return self._requirement_phrase(val)

    def url(self, url: str) -> bool:
        """Return True if *url* is already present in the bloom filter.

        NOTE(review): bloom filters admit false positives — a True here
        may occasionally be wrong; a False is definitive.
        """
        return self._rbf.is_exists(url)