@@ -1,59 +0,0 @@
-import threading
-import time
-
-from common.log import logger
-from crawler.bloom_filter.RedisBloomFilter import RedisFilter
-from settings import (
-    MGO_REMOVAL_DUPLICATE,
-    REQUIREMENT_PHRASE
-)
-
-
-def _requirement_phrase(val: str):
-    """Keyword matching."""
-    for word in REQUIREMENT_PHRASE:
-        if val.find(word) != -1:
-            return True
-    return False
-
-
-class Validator:
-
-    def __init__(self):
-        self._rbf = RedisFilter(redis_key='RemovalDuplicate_')
-        self._rbf.start(1000000000, 0.00001)
-        self._requirement_phrase = _requirement_phrase
-        self._loop_Interval = 7200
-
-    def _sync_data_rubbish(self):
-        while True:
-            count = 0
-            cursor = MGO_REMOVAL_DUPLICATE.find(projection={'domain': 1})
-            try:
-                for item in cursor.sort([('_id', -1)]):
-                    domain = item['domain']
-                    if not isinstance(domain, str):
-                        MGO_REMOVAL_DUPLICATE.delete_one({'_id': item['_id']})
-                        continue
-                    if not self._rbf.is_exists(domain):
-                        self._rbf.add(domain)
-                        count += 1
-            finally:
-                logger.info(f'[Filter] data loaded: {len(self._rbf)} entries, {count} new')
-            time.sleep(self._loop_Interval)
-
-    def load_filter(self):
-        logger.info(f'[Filter] initial load')
-        threading.Thread(
-            target=self._sync_data_rubbish,
-            name='RemovalDuplicate_'
-        ).start()
-
-    def add_url(self, url: str):
-        self._rbf.add(url)
-
-    def requirement_word(self, val):
-        return self._requirement_phrase(val)
-
-    def url(self, url: str):
-        return self._rbf.is_exists(url)
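
For reference, below is a minimal, self-contained sketch of the interface the deleted Validator exposed (add_url / url / requirement_word). The in-memory set stands in for the Redis bloom filter (RedisFilter) and the hard-coded keyword list stands in for REQUIREMENT_PHRASE; everything except the method names is an assumption, not code from this PR.

```python
# Hypothetical stand-in for the removed Validator: a plain set replaces the
# Redis bloom filter, so behavior is exact rather than probabilistic.
from typing import Iterable


class InMemoryValidator:
    def __init__(self, requirement_phrase: Iterable[str]):
        self._seen = set()                      # stand-in for the Redis bloom filter
        self._requirement_phrase = list(requirement_phrase)

    def add_url(self, url: str) -> None:
        self._seen.add(url)                     # mark the URL as already crawled

    def url(self, url: str) -> bool:
        return url in self._seen                # True means "seen before, skip it"

    def requirement_word(self, val: str) -> bool:
        return any(word in val for word in self._requirement_phrase)


if __name__ == '__main__':
    v = InMemoryValidator(requirement_phrase=['tender', 'bid'])
    print(v.url('https://example.com/a'))            # False: not seen yet
    v.add_url('https://example.com/a')
    print(v.url('https://example.com/a'))            # True: deduplicated next time
    print(v.requirement_word('public tender notice'))  # True: keyword hit
```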