import threading
import time

from common.log import logger
from crawler.bloom_filter.RedisBloomFilter import RedisFilter
from settings import (
    MGO_REPETITION,
    REQUIREMENT_PHRASE,
    SENSITIVE_WORDS
)


def _requirement_phrase(title: str) -> bool:
    """Return True when the title contains at least one required keyword."""
    return any(word in title for word in REQUIREMENT_PHRASE)


def _sensitive_word(title: str) -> bool:
    """Return True when the title contains a sensitive word."""
    return any(word in title for word in SENSITIVE_WORDS)


class Validator:

    def __init__(self):
        self.rbf = RedisFilter(redis_key='duplicated_')
        # Bloom filter sizing: ~1e9 expected items at what appears to be
        # a 1e-5 target false-positive rate.
        self.rbf.start(1000000000, 0.00001)
        self.loop_interval = 7200  # seconds between reload passes

    def _load_filter_feature(self):
        """Periodically sync crawled urls from MongoDB into the Bloom filter."""
        while True:
            count = 0
            try:
                for item in MGO_REPETITION.find(projection={'url': 1}).sort([('_id', -1)]):
                    url = item['url']
                    if not isinstance(url, str):
                        # Purge malformed records so they are not loaded again.
                        MGO_REPETITION.delete_one({'_id': item['_id']})
                        continue
                    if not self.rbf.is_exists(url):
                        self.rbf.add(url)
                        count += 1
            except Exception:
                # Keep the loader thread alive across transient MongoDB errors.
                logger.exception('[site dedup] load pass failed')
            finally:
                logger.info(f'[site dedup] loaded: {len(self.rbf)} urls, added: {count}')
            time.sleep(self.loop_interval)

    def load_filter(self):
        logger.info('[site dedup] initial load')
        threading.Thread(
            target=self._load_filter_feature,
            name='DuplicateRemoval_',
            daemon=True,  # do not block interpreter shutdown
        ).start()

    def add_filter_feature(self, feature: str):
        self.rbf.add(feature)
- def title(self, title, task):
- if _sensitive_word(title):
- task['sensitive'] = True
- return False
- elif not _requirement_phrase(title):
- task['requirement'] = True
- return False
- return True

    def url(self, base_url):
        """Return True when the url has already been crawled."""
        return self.rbf.is_exists(base_url)
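
A minimal usage sketch follows, assuming the Redis and MongoDB backends from settings.py are reachable; the task dict keys and the example title and url are illustrative, not part of the project.

# Hypothetical driver showing the intended call pattern.
validator = Validator()
validator.load_filter()  # start the background MongoDB -> Bloom-filter sync

task = {'title': 'Example procurement notice', 'url': 'https://example.com/notice/1'}

# Title gate: reject sensitive titles, require at least one keyword.
if not validator.title(task['title'], task):
    print('rejected:', task)
# URL gate: is_exists() returning True means the url was already crawled.
elif validator.url(task['url']):
    print('duplicate:', task['url'])
else:
    validator.add_filter_feature(task['url'])
    print('accepted:', task['url'])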