dongzhaorui 3 years ago
Parent
Commit
f29cefa23c
1 changed file with 20 additions and 19 deletions

+ 20 - 19
find_source/crawler/retrieve/verify.py

@@ -4,7 +4,7 @@ import time
 from common.log import logger
 from crawler.bloom_filter.RedisBloomFilter import RedisFilter
 from settings import (
-    MGO_REPETITION,
+    MGO_REMOVAL_DUPLICATE,
     REQUIREMENT_PHRASE,
     SENSITIVE_WORDS
 )
@@ -29,37 +29,38 @@ def _sensitive_word(title: str):
 class Validator:
 
     def __init__(self):
-        self.rbf = RedisFilter(redis_key='duplicated_')
-        self.rbf.start(1000000000, 0.00001)
-        self.loop_Interval = 7200
+        self._rbf = RedisFilter(redis_key='RemovalDuplicate_')
+        self._rbf.start(1000000000, 0.00001)
         self._sensitive_word = _sensitive_word
         self._requirement_phrase = _requirement_phrase
+        self._loop_Interval = 7200
 
-    def _load_filter_feature(self):
+    def _sync_data_rubbish(self):
         while True:
             count = 0
+            cursor = MGO_REMOVAL_DUPLICATE.find(projection={'domain': 1})
             try:
-                for item in MGO_REPETITION.find(projection={'url': 1}).sort([('_id', -1)]):
-                    url = item['url']
-                    if not isinstance(url, str):
-                        MGO_REPETITION.delete_one({'_id': item['_id']})
+                for item in cursor.sort([('_id', -1)]):
+                    domain = item['domain']
+                    if not isinstance(domain, str):
+                        MGO_REMOVAL_DUPLICATE.delete_one({'_id': item['_id']})
                         continue
-                    if not self.rbf.is_exists(url):
-                        self.rbf.add(url)
+                    if not self._rbf.is_exists(domain):
+                        self._rbf.add(domain)
                         count += 1
             finally:
-                logger.info(f'[过滤器]数据加载:{len(self.rbf)}条,新增:{count}条')
-                time.sleep(self.loop_Interval)
+                logger.info(f'[过滤器]数据加载:{len(self._rbf)}条,新增:{count}条')
+                time.sleep(self._loop_Interval)
 
     def load_filter(self):
         logger.info(f'[过滤器]初始化加载')
         threading.Thread(
-            target=self._load_filter_feature,
-            name='DuplicateRemoval_'
+            target=self._sync_data_rubbish,
+            name='RemovalDuplicate_'
         ).start()
 
-    def add_filter_feature(self, feature: str):
-        self.rbf.add(feature)
+    def add_url(self, url: str):
+        self._rbf.add(url)
 
     def words(self, title, task):
         if self._sensitive_word(title):
@@ -70,5 +71,5 @@ class Validator:
             return False
         return True
 
-    def url(self, base_url):
-        return not self.rbf.is_exists(base_url)
+    def url(self, url: str):
+        return self._rbf.is_exists(url)
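
Taken together, the commit swaps the MGO_REPETITION collection handle for MGO_REMOVAL_DUPLICATE, switches the sync loop from URL features to domains, makes the filter and loop internals private (_rbf, _sync_data_rubbish, _loop_Interval), and renames the public methods to add_url / url. Note that url() also drops the old negation: it previously returned True for unseen URLs, and after this commit it returns True when the URL is already in the Bloom filter, so callers must invert their checks. A minimal usage sketch under that reading; the Validator API and module path come from the diff, while the sample URL and the task dict are illustrative assumptions:

    # Usage sketch (not part of the commit) exercising the renamed Validator API.
    # The import path follows the repo layout; the URL and task dict are made up.
    from crawler.retrieve.verify import Validator

    validator = Validator()
    validator.load_filter()  # background thread syncs Mongo domains into the Bloom filter

    url = 'http://example.com/notice/1'      # hypothetical candidate URL
    task = {'title': 'example notice title'}  # hypothetical task record

    # After this commit url() returns True for URLs already in the filter,
    # so "unseen" is now expressed as `not validator.url(url)`.
    if not validator.url(url) and validator.words(task['title'], task):
        # ... fetch and process the page ...
        validator.add_url(url)  # record the URL so it is treated as a duplicate next time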