瀏覽代碼

update:修复去重指纹的生成算法

dongzhaorui 2 年之前
父節點
當前提交
195a6eb78a

+ 2 - 2
FworkSpider/feapder/dedup/README.md

@@ -95,7 +95,7 @@ def test_filter():
 from feapder.dedup import Dedup
 
 def test_filter():
-    dedup = Dedup(Dedup.RedisClusterFilter, redis_url=["192.168.3.207:2179", "192.168.3.166:2379"], expire_time=100)
+    dedup = Dedup(Dedup.RedisFilter, redis_url=["192.168.3.207:2179", "192.168.3.166:2379"], expire_time=60)
 
     # 制造已存在数据
     datas = ["xxx", "bbb"]
@@ -113,7 +113,7 @@ def test_filter():
 from feapder.dedup import Dedup
 
 def test_filter():
-    dedup = Dedup(Dedup.RedisFilter, expire_time=100)
+    dedup = Dedup(Dedup.RedisFilter, expire_time=60)
 
     # 制造已存在数据
     datas = ["xxx", "bbb"]

+ 0 - 108
FworkSpider/feapder/dedup/redisclusterfilter.py

@@ -1,108 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-03-01
----------
-@summary: redis集群过滤
----------
-@author: dzr
-@email: dongzhaorui@topnet.net.cn
-"""
-import copy
-
-from feapder.db.redisdb import RedisDB
-from feapder.dedup.basefilter import BaseFilter
-from feapder.utils.tools import get_sha256
-
-
-class RedisClusterFilter(BaseFilter):
-    redis_cluster = None
-
-    def __init__(self, redis_url, to_sha256: bool = True, expire_time=None):
-        if not redis_url:
-            raise ValueError("redis_url can't be None")
-
-        self.startup_nodes = redis_url
-        if self.__class__.redis_cluster is None:
-            self.__class__.redis_cluster = RedisDB(
-                ip_ports=self.startup_nodes,
-                decode_responses=True,
-                user_pass='',
-            )
-
-        self._ex = expire_time or 86400 * 365 * 2  # 2年 = 86400 * 365 * 2
-        self._prefix1 = 'list_'
-        self._prefix2 = 'pylist_'
-
-        self._to_sha256 = to_sha256
-
-    def __repr__(self):
-        return "<RedisCluster: {}>".format(self.startup_nodes)
-
-    def _deal_datas(self, datas):
-        if self._to_sha256:
-            if isinstance(datas, list):
-                keys = [get_sha256(data) for data in datas]
-            else:
-                keys = get_sha256(datas)
-        else:
-            keys = copy.deepcopy(datas)
-
-        return keys
-
-    def _exists(self, key):
-        return self.redis_cluster.exists(key)
-
-    def exists(self, key):
-        """全量检索/lua增量检索/python增量检索"""
-        if (
-                self._exists(key) > 0
-                or self._exists(self._prefix1 + key) > 0
-                or self._exists(self._prefix2 + key) > 0
-        ):
-            return True
-        return False
-
-    def add(self, keys, *args, **kwargs):
-        """
-        添加数据
-        @param keys: 检查关键词在redis_cluster中是否存在,支持列表批量
-        @return: list / 单个值(如果数据已存在 返回 False 否则返回 True, 可以理解为是否添加成功)
-        """
-        is_list = isinstance(keys, list)
-        keys = keys if is_list else [keys]
-        encrypt_keys = self._deal_datas(keys)
-
-        is_added = []
-        for key in encrypt_keys:
-            if not self.exists(key):
-                is_added.append(
-                    self.redis_cluster.set(self._prefix2 + key, 1, ex=self._ex)
-                )
-            else:
-                is_added.append(False)
-
-        return is_added if is_list else is_added[0]
-
-    def get(self, keys):
-        """
-        检查数据是否存在
-        @param keys: list / 单个值
-        @return: list / 单个值 (存在返回True 不存在返回False)
-        """
-        is_list = isinstance(keys, list)
-        keys = keys if is_list else [keys]
-        encrypt_keys = self._deal_datas(keys)
-
-        is_exist = []
-        for key in encrypt_keys:
-            is_exist.append(self.exists(key))
-
-        # 判断数据本身是否重复
-        temp_set = set()
-        for i, key in enumerate(encrypt_keys):
-            if key in temp_set:
-                is_exist[i] = True
-            else:
-                temp_set.add(key)
-
-        return is_exist if is_list else is_exist[0]

+ 31 - 32
FworkSpider/feapder/dedup/redisfilter.py

@@ -2,55 +2,51 @@
 """
 Created on 2023-03-01
 ---------
-@summary: redis单机过滤
+@summary: redis集群/单机过滤
 ---------
 @author: dzr
 @email: dongzhaorui@topnet.net.cn
 """
-import copy
 
 from feapder.db.redisdb import RedisDB
 from feapder.dedup.basefilter import BaseFilter
-from feapder.utils.tools import get_sha256
+import feapder.utils.tools as tools
 
 
 class RedisFilter(BaseFilter):
     redis_db = None
 
-    def __init__(self, redis_url=None, to_sha256: bool = True, expire_time=None):
-        self._url = redis_url
+    def __init__(self, redis_url=None, expire_time=None):
         if not self.__class__.redis_db:
-            self.__class__.redis_db = RedisDB(url=redis_url)
+            if isinstance(redis_url, list) and len(redis_url) > 1:
+                self.__class__.redis_db = RedisDB(
+                    ip_ports=redis_url,
+                    decode_responses=True,
+                    user_pass='',
+                )  # 集群
+            else:
+                self.__class__.redis_db = RedisDB(url=redis_url)  # 单机
 
         self._ex = expire_time or 86400 * 365 * 2  # 2年 = 86400 * 365 * 2
         self._prefix1 = 'list_'
         self._prefix2 = 'pylist_'
 
-        self._to_sha256 = to_sha256
-
     def __repr__(self):
-        return "<RedisDB: {}>".format(self.redis_db)
-
-    def _deal_datas(self, datas):
-        if self._to_sha256:
-            if isinstance(datas, list):
-                keys = [get_sha256(data) for data in datas]
-            else:
-                keys = get_sha256(datas)
-        else:
-            keys = copy.deepcopy(datas)
-
-        return keys
-
-    def _exists(self, key):
-        return self.redis_db.exists(key)
+        return "<RedisFilter: {}>".format(self.redis_db)
 
     def exists(self, key):
         """全量检索/lua增量检索/python增量检索"""
+        if '&&' in key:
+            md5, sha256 = key.split("&&")
+            mixture = tools.get_sha256(md5)
+        else:
+            mixture = sha256 = key
+
         if (
-                self._exists(key) > 0
-                or self._exists(self._prefix1 + key) > 0
-                or self._exists(self._prefix2 + key) > 0
+                self.redis_db.exists(sha256) > 0
+                or self.redis_db.exists(self._prefix1 + sha256) > 0
+                or self.redis_db.exists(self._prefix2 + sha256) > 0
+                or self.redis_db.exists(self._prefix2 + mixture) > 0
         ):
             return True
         return False
@@ -63,13 +59,17 @@ class RedisFilter(BaseFilter):
         """
         is_list = isinstance(keys, list)
         keys = keys if is_list else [keys]
-        encrypt_keys = self._deal_datas(keys)
 
         is_added = []
-        for key in encrypt_keys:
+        for key in keys:
             if not self.exists(key):
+                if '&&' in key:
+                    md5, sha256 = key.split("&&")
+                else:
+                    sha256 = key
+
                 is_added.append(
-                    self.redis_db.set(self._prefix2 + key, 1, ex=self._ex)
+                    self.redis_db.set(self._prefix2 + sha256, 1, ex=self._ex)
                 )
             else:
                 is_added.append(False)
@@ -84,15 +84,14 @@ class RedisFilter(BaseFilter):
         """
         is_list = isinstance(keys, list)
         keys = keys if is_list else [keys]
-        encrypt_keys = self._deal_datas(keys)
 
         is_exist = []
-        for key in encrypt_keys:
+        for key in keys:
             is_exist.append(self.exists(key))
 
         # 判断数据本身是否重复
         temp_set = set()
-        for i, key in enumerate(encrypt_keys):
+        for i, key in enumerate(keys):
             if key in temp_set:
                 is_exist[i] = True
             else:

+ 2 - 8
FworkSpider/feapder/utils/tools.py

@@ -42,7 +42,6 @@ import execjs  # pip install PyExecJS
 import redis
 import requests
 import six
-from Crypto.Hash.SHA256 import SHA256Hash
 from requests.cookies import RequestsCookieJar
 from w3lib.url import canonicalize_url as _canonicalize_url
 
@@ -1745,14 +1744,9 @@ def get_sha256(*args):
     @result: 5580c91ea29bf5bd963f4c08dfcacd983566e44ecea1735102bc380576fd6f30
     """
 
-    # sha256 = hashlib.sha256()
-    # for arg in args:
-    #     sha256.update(str(arg).encode())
-    # return sha256.hexdigest()
-
-    sha256 = SHA256Hash()
+    sha256 = hashlib.sha256()
     for arg in args:
-        sha256.update(str(arg).encode('utf-8'))
+        sha256.update(str(arg).encode())
     return sha256.hexdigest()  # 64位
 
 

+ 1 - 1
FworkSpider/items/base_item.py

@@ -43,6 +43,6 @@ class SwordFishProjectItem(Item):
 
         if args:
             args = sorted(args)
-            return tools.get_md5(*args)
+            return tools.get_md5(*args) + "&&" + tools.get_sha256(*args)
         else:
             return None