瀏覽代碼

过滤器:敏感词检索模块

dongzhaorui 3 年之前
父節點
當前提交
fd58c79842
共有 2 個文件被更改,包括 73 次插入和 0 次删除
  1. 1 0
      find_source/crawler/retrieve/__init__.py
  2. 72 0
      find_source/crawler/retrieve/verify.py

+ 1 - 0
find_source/crawler/retrieve/__init__.py

@@ -0,0 +1 @@
+from .verify import Validator

+ 72 - 0
find_source/crawler/retrieve/verify.py

@@ -0,0 +1,72 @@
+import threading
+import time
+
+from common.log import logger
+from crawler.bloom_filter.RedisBloomFilter import RedisFilter
+from settings import (
+    MGO_REPETITION,
+    REQUIREMENT_PHRASE,
+    SENSITIVE_WORDS
+)
+
+
+def _requirement_phrase(title: str):
+    """Report whether ``title`` contains any configured requirement phrase.
+
+    :param title: text to scan
+    :return: ``True`` if at least one entry of ``REQUIREMENT_PHRASE`` occurs
+        as a substring of ``title``, otherwise ``False``
+    """
+    # ``any`` stops at the first hit, exactly like an early ``return True``.
+    return any(title.find(phrase) >= 0 for phrase in REQUIREMENT_PHRASE)
+
+
+def _sensitive_word(title: str) -> bool:
+    """Report whether ``title`` contains any configured sensitive word.
+
+    :param title: text to scan
+    :return: ``True`` if at least one entry of ``SENSITIVE_WORDS`` occurs as
+        a substring of ``title``, otherwise ``False``
+    """
+    # A hit means the title IS sensitive. ``Validator.title`` flags the task
+    # as sensitive when this returns a truthy value, so returning ``False``
+    # on a hit (as the earlier version did) rejected every clean title and
+    # let the sensitive ones through.
+    return any(word in title for word in SENSITIVE_WORDS)
+
+
+class Validator:
+    """Title screening and URL de-duplication for crawl tasks.
+
+    URL checks go through ``self.rbf``, a ``RedisFilter`` instance; title
+    checks use the module-level ``_sensitive_word`` and
+    ``_requirement_phrase`` helpers.
+    """
+
+    def __init__(self):
+        self.rbf = RedisFilter(redis_key='duplicated_')
+        # NOTE(review): the arguments look like (capacity, error rate) -
+        # confirm against RedisFilter.start.
+        self.rbf.start(1000000000, 0.00001)
+        # Seconds to sleep between two reload passes (7200 s = 2 hours).
+        self.loop_Interval = 7200
+
+    def _load_filter_feature(self):
+        """Feed URLs from ``MGO_REPETITION`` into the filter, forever.
+
+        Each pass walks the collection from newest to oldest ``_id``,
+        deletes records whose ``url`` is not a string, and adds every URL
+        the filter does not already report as present. After each pass the
+        totals are logged and the loop sleeps for ``self.loop_Interval``
+        seconds. Never returns; meant to run in its own thread.
+        """
+        while True:
+            count = 0
+            try:
+                cursor = MGO_REPETITION.find(projection={'url': 1})
+                for item in cursor.sort([('_id', -1)]):
+                    url = item['url']
+                    if not isinstance(url, str):
+                        # Unusable record: remove it so it is not revisited.
+                        MGO_REPETITION.delete_one({'_id': item['_id']})
+                        continue
+                    if not self.rbf.is_exists(url):
+                        self.rbf.add(url)
+                        count += 1
+            except Exception:
+                # Without this handler the error would be re-raised after
+                # the sleep below and end the reload thread for good; log
+                # it and try again on the next pass instead.
+                logger.exception('[站点判重]数据加载异常')
+            finally:
+                logger.info(f'[站点判重]数据加载:{len(self.rbf)}条,新增:{count}条')
+                time.sleep(self.loop_Interval)
+
+    def load_filter(self):
+        """Start the background thread that runs ``_load_filter_feature``."""
+        logger.info('[站点判重]初始化加载')
+        threading.Thread(
+            target=self._load_filter_feature,
+            name='DuplicateRemoval_'
+        ).start()
+
+    def add_filter_feature(self, feature: str):
+        """Add ``feature`` to the filter."""
+        self.rbf.add(feature)
+
+    def title(self, title: str, task: dict) -> bool:
+        """Screen ``title`` and record the rejection reason on ``task``.
+
+        :param title: title text to check
+        :param task: mutable mapping; ``task['sensitive']`` is set to
+            ``True`` when ``_sensitive_word(title)`` is truthy, otherwise
+            ``task['requirement']`` is set to ``True`` when
+            ``_requirement_phrase(title)`` is falsy
+        :return: ``False`` if either flag was set, ``True`` otherwise
+        """
+        if _sensitive_word(title):
+            task['sensitive'] = True
+            return False
+        elif not _requirement_phrase(title):
+            task['requirement'] = True
+            return False
+        return True
+
+    def url(self, base_url):
+        """Return whatever the filter reports for ``base_url``.
+
+        :param base_url: URL to look up
+        :return: result of ``self.rbf.is_exists(base_url)``
+        """
+        return self.rbf.is_exists(base_url)