verify.py 1.7 KB

import threading
import time

from common.log import logger
from crawler.bloom_filter.RedisBloomFilter import RedisFilter
from settings import (
    MGO_REMOVAL_DUPLICATE,
    REQUIREMENT_PHRASE
)

def _requirement_phrase(val: str):
    """Return True if the text contains any of the configured keywords."""
    for word in REQUIREMENT_PHRASE:
        if word in val:
            return True
    return False

class Validator:

    def __init__(self):
        # Redis-backed Bloom filter used for URL/domain deduplication.
        self._rbf = RedisFilter(redis_key='RemovalDuplicate_')
        self._rbf.start(1000000000, 0.00001)  # expected capacity, error rate
        self._requirement_phrase = _requirement_phrase
        self._loop_interval = 7200  # seconds between sync passes

    def _sync_data_rubbish(self):
        while True:
            count = 0
            cursor = MGO_REMOVAL_DUPLICATE.find(projection={'domain': 1})
            try:
                for item in cursor.sort([('_id', -1)]):
                    domain = item.get('domain')
                    if not isinstance(domain, str):
                        # Drop malformed records instead of feeding them to the filter.
                        MGO_REMOVAL_DUPLICATE.delete_one({'_id': item['_id']})
                        continue
                    if not self._rbf.is_exists(domain):
                        self._rbf.add(domain)
                        count += 1
            finally:
                logger.info(f'[Filter] loaded: {len(self._rbf)} records, new: {count}')
                time.sleep(self._loop_interval)

    def load_filter(self):
        logger.info('[Filter] initial load')
        # Run the sync loop in the background; daemon=True keeps the
        # endless loop from blocking interpreter shutdown.
        threading.Thread(
            target=self._sync_data_rubbish,
            name='RemovalDuplicate_',
            daemon=True
        ).start()

    def add_url(self, url: str):
        self._rbf.add(url)

    def requirement_word(self, val):
        return self._requirement_phrase(val)

    def url(self, url: str):
        # True if the URL has already been seen.
        return self._rbf.is_exists(url)
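
A minimal usage sketch, assuming the file is importable as verify, that settings supplies MGO_REMOVAL_DUPLICATE (a pymongo collection) and REQUIREMENT_PHRASE (an iterable of keyword strings), and that RedisFilter exposes add/is_exists/__len__ against a reachable Redis as used above:

# Illustrative only; assumes the settings and Redis setup described above.
from verify import Validator

validator = Validator()
validator.load_filter()  # start the background sync thread

url = 'http://www.example.com/notice/123.html'
if not validator.url(url):    # URL not seen yet
    validator.add_url(url)    # record it so later checks return True

# True if the text contains any keyword from REQUIREMENT_PHRASE
print(validator.requirement_word('page title text'))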