dongzhaorui 2 anni fa
parent
commit
adb20cad4a

+ 4 - 4
FworkSpider/feapder/dedup/__init__.py

@@ -15,7 +15,7 @@ from feapder.utils.tools import get_md5
 from .bloomfilter import BloomFilter, ScalableBloomFilter
 from .expirefilter import ExpireFilter
 from .litefilter import LiteFilter
-from .swordfishfilter import SwordFishFilter
+from .redisclusterfilter import RedisClusterFilter
 
 
 class Dedup:
@@ -23,7 +23,7 @@ class Dedup:
     MemoryFilter = 2
     ExpireFilter = 3
     LiteFilter = 4
-    SwordFishFilter = 5
+    RedisClusterFilter = 5
 
     def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
         if filter_type == Dedup.ExpireFilter:
@@ -43,8 +43,8 @@ class Dedup:
                 expire_time_record_key=expire_time_record_key,
                 redis_url=kwargs.get("redis_url"),
             )
-        elif filter_type == Dedup.SwordFishFilter:
-            self.dedup = SwordFishFilter(
+        elif filter_type == Dedup.RedisClusterFilter:
+            self.dedup = RedisClusterFilter(
                 redis_url=kwargs.get("redis_url"),
                 expire_time=kwargs.get("expire_time")
             )

+ 29 - 19
FworkSpider/feapder/dedup/swordfishfilter.py → FworkSpider/feapder/dedup/redisclusterfilter.py

@@ -2,21 +2,22 @@
 """
 Created on 2023-03-01
 ---------
-@summary:
+@summary: redis集群过滤
 ---------
 @author: dzr
 @email: dongzhaorui@topnet.net.cn
 """
-from Crypto.Hash import SHA256
+import copy
 
 from feapder.db.redisdb import RedisDB
 from feapder.dedup.basefilter import BaseFilter
+from feapder.utils.tools import get_sha256
 
 
-class SwordFishFilter(BaseFilter):
+class RedisClusterFilter(BaseFilter):
     redis_cluster = None
 
-    def __init__(self, redis_url, expire_time=None):
+    def __init__(self, redis_url, to_sha256: bool = True, expire_time=None):
         if not redis_url:
             raise ValueError("redis_url can't be None")
 
@@ -29,28 +30,35 @@ class SwordFishFilter(BaseFilter):
             )
 
         self._ex = expire_time or 86400 * 365 * 2  # 2年 = 86400 * 365 * 2
-        self._prefix = 'pylist_'
+        self._prefix1 = 'list_'
+        self._prefix2 = 'pylist_'
+
+        self._to_sha256 = to_sha256
 
     def __repr__(self):
         return "<RedisCluster: {}>".format(self.startup_nodes)
 
-    @staticmethod
-    def sha256_encrypt(info):
-        if info is None:
-            return ''
-        res = SHA256.new(info.encode('utf-8'))
-        data = res.hexdigest()
-        return data
+    def _deal_datas(self, datas):
+        if self._to_sha256:
+            if isinstance(datas, list):
+                keys = [get_sha256(data) for data in datas]
+            else:
+                keys = get_sha256(datas)
+        else:
+            keys = copy.deepcopy(datas)
 
-    def encrypt_datas(self, datas):
-        return [self.sha256_encrypt(data) for data in datas]
+        return keys
 
     def _exists(self, key):
         return self.redis_cluster.exists(key)
 
     def exists(self, key):
-        """全量检索或者列表页检索"""
-        if self._exists(key) > 0 or self._exists(self._prefix + key) > 0:
+        """全量检索/lua增量检索/python增量检索"""
+        if (
+                self._exists(key) > 0
+                or self._exists(self._prefix1 + key) > 0
+                or self._exists(self._prefix2 + key) > 0
+        ):
             return True
         return False
 
@@ -62,12 +70,14 @@ class SwordFishFilter(BaseFilter):
         """
         is_list = isinstance(keys, list)
         keys = keys if is_list else [keys]
-        encrypt_keys = self.encrypt_datas(keys)
+        encrypt_keys = self._deal_datas(keys)
 
         is_added = []
         for key in encrypt_keys:
             if not self.exists(key):
-                is_added.append(self.redis_cluster.set(self._prefix + key, 1, ex=self._ex))
+                is_added.append(
+                    self.redis_cluster.set(self._prefix2 + key, 1, ex=self._ex)
+                )
             else:
                 is_added.append(False)
 
@@ -81,7 +91,7 @@ class SwordFishFilter(BaseFilter):
         """
         is_list = isinstance(keys, list)
         keys = keys if is_list else [keys]
-        encrypt_keys = self.encrypt_datas(keys)
+        encrypt_keys = self._deal_datas(keys)
 
         is_exist = []
         for key in encrypt_keys: