|
@@ -4,7 +4,7 @@ import time
|
|
|
from common.log import logger
|
|
|
from crawler.bloom_filter.RedisBloomFilter import RedisFilter
|
|
|
from settings import (
|
|
|
- MGO_REPETITION,
|
|
|
+ MGO_REMOVAL_DUPLICATE,
|
|
|
REQUIREMENT_PHRASE,
|
|
|
SENSITIVE_WORDS
|
|
|
)
|
|
@@ -29,37 +29,38 @@ def _sensitive_word(title: str):
|
|
|
class Validator:
|
|
|
|
|
|
def __init__(self):
|
|
|
- self.rbf = RedisFilter(redis_key='duplicated_')
|
|
|
- self.rbf.start(1000000000, 0.00001)
|
|
|
- self.loop_Interval = 7200
|
|
|
+ self._rbf = RedisFilter(redis_key='RemovalDuplicate_')
|
|
|
+ self._rbf.start(1000000000, 0.00001)
|
|
|
self._sensitive_word = _sensitive_word
|
|
|
self._requirement_phrase = _requirement_phrase
|
|
|
+ self._loop_Interval = 7200
|
|
|
|
|
|
- def _load_filter_feature(self):
|
|
|
+ def _sync_data_rubbish(self):
|
|
|
while True:
|
|
|
count = 0
|
|
|
+ cursor = MGO_REMOVAL_DUPLICATE.find(projection={'domain': 1})
|
|
|
try:
|
|
|
- for item in MGO_REPETITION.find(projection={'url': 1}).sort([('_id', -1)]):
|
|
|
- url = item['url']
|
|
|
- if not isinstance(url, str):
|
|
|
- MGO_REPETITION.delete_one({'_id': item['_id']})
|
|
|
+ for item in cursor.sort([('_id', -1)]):
|
|
|
+ domain = item['domain']
|
|
|
+ if not isinstance(domain, str):
|
|
|
+ MGO_REMOVAL_DUPLICATE.delete_one({'_id': item['_id']})
|
|
|
continue
|
|
|
- if not self.rbf.is_exists(url):
|
|
|
- self.rbf.add(url)
|
|
|
+ if not self._rbf.is_exists(domain):
|
|
|
+ self._rbf.add(domain)
|
|
|
count += 1
|
|
|
finally:
|
|
|
- logger.info(f'[过滤器]数据加载:{len(self.rbf)}条,新增:{count}条')
|
|
|
- time.sleep(self.loop_Interval)
|
|
|
+ logger.info(f'[过滤器]数据加载:{len(self._rbf)}条,新增:{count}条')
|
|
|
+ time.sleep(self._loop_Interval)
|
|
|
|
|
|
def load_filter(self):
|
|
|
logger.info(f'[过滤器]初始化加载')
|
|
|
threading.Thread(
|
|
|
- target=self._load_filter_feature,
|
|
|
- name='DuplicateRemoval_'
|
|
|
+ target=self._sync_data_rubbish,
|
|
|
+ name='RemovalDuplicate_'
|
|
|
).start()
|
|
|
|
|
|
- def add_filter_feature(self, feature: str):
|
|
|
- self.rbf.add(feature)
|
|
|
+ def add_url(self, url: str):
|
|
|
+ self._rbf.add(url)
|
|
|
|
|
|
def words(self, title, task):
|
|
|
if self._sensitive_word(title):
|
|
@@ -70,5 +71,5 @@ class Validator:
|
|
|
return False
|
|
|
return True
|
|
|
|
|
|
- def url(self, base_url):
|
|
|
- return not self.rbf.is_exists(base_url)
|
|
|
+ def url(self, url: str):
|
|
|
+ return self._rbf.is_exists(url)
|