|
@@ -9,36 +9,24 @@ Created on 2018-12-13 21:08
|
|
"""
|
|
"""
|
|
|
|
|
|
import copy
|
|
import copy
|
|
-from typing import Any, List, Union, Optional, Tuple, Callable
|
|
|
|
|
|
+from typing import Any, List, Union, Tuple, Callable, Optional
|
|
|
|
|
|
from feapder.utils.tools import get_md5
|
|
from feapder.utils.tools import get_md5
|
|
from .bloomfilter import BloomFilter, ScalableBloomFilter
|
|
from .bloomfilter import BloomFilter, ScalableBloomFilter
|
|
from .expirefilter import ExpireFilter
|
|
from .expirefilter import ExpireFilter
|
|
|
|
+from .litefilter import LiteFilter
|
|
|
|
+from .redisfilter import RedisFilter, MRedisFilter
|
|
|
|
|
|
|
|
|
|
class Dedup:
|
|
class Dedup:
|
|
BloomFilter = 1
|
|
BloomFilter = 1
|
|
MemoryFilter = 2
|
|
MemoryFilter = 2
|
|
ExpireFilter = 3
|
|
ExpireFilter = 3
|
|
|
|
+ LiteFilter = 4
|
|
|
|
+ RedisFilter = 5
|
|
|
|
+ MRedisFilter = 6
|
|
|
|
|
|
def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
|
|
def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
|
|
- """
|
|
|
|
- 去重过滤器 集成BloomFilter、MemoryFilter、ExpireFilter
|
|
|
|
- Args:
|
|
|
|
- filter_type: 过滤器类型 BloomFilter
|
|
|
|
- name: 过滤器名称 该名称会默认以dedup作为前缀 dedup:expire_set:[name]/dedup:bloomfilter:[name]。 默认ExpireFilter name=过期时间; BloomFilter name=dedup:bloomfilter:bloomfilter
|
|
|
|
- absolute_name: 过滤器绝对名称 不会加dedup前缀,当此值不为空时name参数无效
|
|
|
|
- expire_time: ExpireFilter的过期时间 单位为秒,其他两种过滤器不用指定
|
|
|
|
- error_rate: BloomFilter/MemoryFilter的误判率 默认为0.00001
|
|
|
|
- to_md5: 去重前是否将数据转为MD5,默认是
|
|
|
|
- redis_url: redis://[[username]:[password]]@localhost:6379/0
|
|
|
|
- BloomFilter 与 ExpireFilter 使用
|
|
|
|
- 默认会读取setting中的redis配置,若无setting,则需要专递redis_url
|
|
|
|
- initial_capacity: 单个布隆过滤器去重容量 默认100000000,当布隆过滤器容量满时会扩展下一个布隆过滤器
|
|
|
|
- error_rate:布隆过滤器的误判率 默认0.00001
|
|
|
|
- **kwargs:
|
|
|
|
- """
|
|
|
|
-
|
|
|
|
if filter_type == Dedup.ExpireFilter:
|
|
if filter_type == Dedup.ExpireFilter:
|
|
try:
|
|
try:
|
|
expire_time = kwargs["expire_time"]
|
|
expire_time = kwargs["expire_time"]
|
|
@@ -56,13 +44,22 @@ class Dedup:
|
|
expire_time_record_key=expire_time_record_key,
|
|
expire_time_record_key=expire_time_record_key,
|
|
redis_url=kwargs.get("redis_url"),
|
|
redis_url=kwargs.get("redis_url"),
|
|
)
|
|
)
|
|
-
|
|
|
|
|
|
+ elif filter_type == Dedup.RedisFilter:
|
|
|
|
+ self.dedup = RedisFilter(
|
|
|
|
+ ip_ports=kwargs.get("ip_ports"),
|
|
|
|
+ user_pass=kwargs.get("user_pass", ""),
|
|
|
|
+ redis_url=kwargs.get("redis_url"),
|
|
|
|
+ expire_time=kwargs.get("expire_time")
|
|
|
|
+ )
|
|
|
|
+ elif filter_type == Dedup.MRedisFilter:
|
|
|
|
+ self.dedup = MRedisFilter(
|
|
|
|
+ redis_conf=kwargs.get("redis_conf"),
|
|
|
|
+ expire_time=kwargs.get("expire_time")
|
|
|
|
+ )
|
|
else:
|
|
else:
|
|
initial_capacity = kwargs.get("initial_capacity", 100000000)
|
|
initial_capacity = kwargs.get("initial_capacity", 100000000)
|
|
error_rate = kwargs.get("error_rate", 0.00001)
|
|
error_rate = kwargs.get("error_rate", 0.00001)
|
|
- name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get(
|
|
|
|
- "name", "bloomfilter"
|
|
|
|
- )
|
|
|
|
|
|
+ name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get("name", "bloomfilter")
|
|
if filter_type == Dedup.BloomFilter:
|
|
if filter_type == Dedup.BloomFilter:
|
|
self.dedup = ScalableBloomFilter(
|
|
self.dedup = ScalableBloomFilter(
|
|
name=name,
|
|
name=name,
|
|
@@ -78,6 +75,8 @@ class Dedup:
|
|
error_rate=error_rate,
|
|
error_rate=error_rate,
|
|
bitarray_type=ScalableBloomFilter.BASE_MEMORY,
|
|
bitarray_type=ScalableBloomFilter.BASE_MEMORY,
|
|
)
|
|
)
|
|
|
|
+ elif filter_type == Dedup.LiteFilter:
|
|
|
|
+ self.dedup = LiteFilter()
|
|
else:
|
|
else:
|
|
raise ValueError(
|
|
raise ValueError(
|
|
"filter_type 类型错误,仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、Dedup.ExpireFilter"
|
|
"filter_type 类型错误,仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、Dedup.ExpireFilter"
|