|
@@ -25,8 +25,8 @@ class BloomFilter(object):
|
|
def __init__(self, data_size, error_rate=0.00001):
|
|
def __init__(self, data_size, error_rate=0.00001):
|
|
"""
|
|
"""
|
|
|
|
|
|
- :param data_size: 所需存放数据的数量
|
|
|
|
- :param error_rate: 可接受的误报率,默认0.00001
|
|
|
|
|
|
+ :param data_size: 数据量
|
|
|
|
+ :param error_rate: 误报率,默认0.00001
|
|
|
|
|
|
通过这两个参数来确定需要多少个哈希函数以及位数组的大小
|
|
通过这两个参数来确定需要多少个哈希函数以及位数组的大小
|
|
|
|
|
|
@@ -118,39 +118,50 @@ class BloomFilter(object):
|
|
|
|
|
|
def _adjust_param(self, data_size, error_rate):
|
|
def _adjust_param(self, data_size, error_rate):
|
|
"""
|
|
"""
|
|
- :param data_size:
|
|
|
|
- :param error_rate:
|
|
|
|
- :return:
|
|
|
|
-
|
|
|
|
通过数据量和期望的误报率 计算出 位数组大小 和 哈希函数的数量
|
|
通过数据量和期望的误报率 计算出 位数组大小 和 哈希函数的数量
|
|
- k为哈希函数个数 m为位数组大小
|
|
|
|
|
|
+
|
|
|
|
+ k为哈希函数个数 m为位数组大小
|
|
n为数据量 p为误报率
|
|
n为数据量 p为误报率
|
|
- m = - (nlnp)/(ln2)^2
|
|
|
|
|
|
+ m = n * abs(ln(P)) / (k * (ln(2) ** 2))
|
|
|
|
+
|
|
|
|
+ k = log2(1/P)
|
|
|
|
|
|
- k = (m/n) ln2
|
|
|
|
|
|
+ # solving for m = bits_per_slice
|
|
|
|
+ # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
|
|
|
|
+ # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
|
|
|
|
+ # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
|
|
|
|
+ # k ~= log2(1/P)
|
|
|
|
+
|
|
|
|
+ :param data_size: 数据量
|
|
|
|
+ :param error_rate: 误报率
|
|
|
|
+ :return: 位数组大小 和 哈希函数的数量
|
|
"""
|
|
"""
|
|
p = error_rate
|
|
p = error_rate
|
|
n = data_size
|
|
n = data_size
|
|
- """
|
|
|
|
- # M = num_bits (num_slices * bits_per_slice) 位数组大小
|
|
|
|
- # k = num_slicesk 哈希函数个数
|
|
|
|
- # P = error_rate (error_rate)
|
|
|
|
- # n = capacity(data_size)
|
|
|
|
- # k = log2(1/P)
|
|
|
|
-
|
|
|
|
- # solving for m = bits_per_slice
|
|
|
|
- # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
|
|
|
|
- # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
|
|
|
|
- # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
|
|
|
|
-
|
|
|
|
- # k = int(math.ceil(math.log(1.0 / p, 2)))
|
|
|
|
- # bits_per_slice = int(math.ceil((n * abs(math.log(p))) / (k * (math.log(2) ** 2))))
|
|
|
|
- # m = k * bits_per_slice
|
|
|
|
- # return m, k
|
|
|
|
- """
|
|
|
|
- m = - (n * (math.log(p, math.e)) / (math.log(2, math.e)) ** 2)
|
|
|
|
- k = m / n * math.log(2, math.e)
|
|
|
|
- return int(m), int(k)
|
|
|
|
|
|
+ k = int(math.ceil(math.log(1.0 / p, 2)))
|
|
|
|
+ bits_per_slice = int(math.ceil((n * abs(math.log(p))) / (k * (math.log(2) ** 2))))
|
|
|
|
+ m = k * bits_per_slice
|
|
|
|
+ return m, k
|
|
|
|
+
|
|
|
|
# Previous implementation, kept commented out for reference:
# def _adjust_param(self, data_size, error_rate):
#     """
#     Compute the bit-array size and the number of hash functions from the
#     data volume and the expected false-positive rate.
#
#     k = number of hash functions, m = bit-array size
#     n = data volume, p = false-positive rate
#     m = -(n * ln(p)) / (ln(2)) ** 2
#
#     k = (m / n) * ln(2)
#
#     :param data_size: data volume
#     :param error_rate: false-positive rate
#     :return: (m, k)
#     """
#     p = error_rate
#     n = data_size
#     m = int(- (n * (math.log(p, math.e)) / (math.log(2, math.e)) ** 2))
#     k = int(m / n * math.log(2, math.e))
#     return m, k
|
|
|
|
|
|
def __len__(self):
|
|
def __len__(self):
|
|
""""
|
|
""""
|