This commit is contained in:
Gregory Burd 2012-06-20 11:04:10 +01:00
parent 103766e41a
commit d30133a87a
2 changed files with 37 additions and 38 deletions

View file

@ -2,7 +2,7 @@
%% @reference [http://en.wikipedia.org/wiki/Bloom_filter]
-module(bloom).
-export([new/1, new/2, is_bloom/1, is_element/2, add_element/2, clear/1, count/1]).
-export([new/1, new/2, is_bloom/1, is_element/2, add_element/2, clear/1, count/1, filter_size/1]).
-import(math, [log/1, pow/2]).
-import(erlang, [phash2/2]).
@ -16,46 +16,46 @@
-endif.
-record(bloom, {
m = 0, % The size of the bitmap in bits.
bitmap = <<>>, % The bitmap.
k = 0, % The number of hashes.
n = 0, % The maximum number of keys.
keys = 0 % The current number of keys.
m = 0 :: non_neg_integer(), % The size of the bitmap in bits.
bitmap = <<>> :: binary(), % The bitmap.
k = 0 :: non_neg_integer(), % The number of hashes.
n = 0 :: non_neg_integer(), % The maximum number of keys.
keys = 0 :: non_neg_integer() % The current number of keys.
}).
%% @spec new(capacity) -> bloom().
%% @equiv new(capacity, 0.001)
-spec new(non_neg_integer()) -> #bloom{}.
new(N) -> new(N, 0.001).
%% @spec new(integer(), float()) -> bloom()
%% @doc Creates a new Bloom filter, given a maximum number of keys and a
%% false-positive error rate.
-spec new(non_neg_integer(), float()) -> #bloom{}.
new(N, E) when N > 0, is_float(E), E > 0, E =< 1 ->
{M, K} = calc_least_bits(N, E),
#bloom{m=M, bitmap = <<0:((M+7) div 8 * 8)>>, k=K, n=N}.
%% @spec clear(bloom()) -> bloom().
%% @doc Creates a new empty Bloom filter from an existing one.
clear(#bloom{#bitmap=Bitmap} = B) ->
B#bloom{<<0:bit_size(Bitmap)>>, n=0}.
-spec clear(#bloom{}) -> #bloom{}.
clear(#bloom{bitmap=Bitmap} = B) ->
B#bloom{bitmap = <<0:(erlang:bit_size(Bitmap))>>, n=0}.
%% @spec count(bloom()) -> unsigned().
%% @doc Returns the number of elements encoded into this Bloom filter.
count(#bloom{#keys=N}) ->
-spec count(#bloom{}) -> non_neg_integer().
count(#bloom{keys=N}) ->
N.
%% @spec filter_size(bloom()) -> unsigned().
%% @doc Returns the number of bits used in this Bloom filter.
filter_size(#bloom{#bitmap=Bitmap}) ->
-spec filter_size(#bloom{}) -> non_neg_integer().
filter_size(#bloom{bitmap=Bitmap}) ->
bit_size(Bitmap).
%% @spec is_bloom(bloom()) -> bool()
%% @doc Determines if the given argument is a bloom record.
-spec is_bloom(#bloom{}) -> true | false.
is_bloom(#bloom{}) -> true;
is_bloom(_) -> false.
%% @spec is_element(string(), bloom()) -> bool()
%% @doc Determines if the key is (probably) an element of the filter.
-spec is_element(term(), #bloom{}) -> true | false.
is_element(Key, B) -> is_element(Key, B, calc_idxs(Key, B)).
is_element(_, _, []) -> true;
is_element(Key, B, [Idx | T]) ->
@ -67,8 +67,8 @@ is_element(Key, B, [Idx | T]) ->
false -> false
end.
%% @spec add_element(string(), bloom()) -> bloom()
%% @doc Adds the key to the filter.
-spec add_element(term(), #bloom{}) -> #bloom{}.
add_element(Key, #bloom{keys=Keys, n=N, bitmap=Bitmap} = B) when Keys < N ->
Idxs = calc_idxs(Key, B),
Bitmap0 = set_bits(Bitmap, Idxs),
@ -77,6 +77,9 @@ add_element(Key, #bloom{keys=Keys, n=N, bitmap=Bitmap} = B) when Keys < N ->
false -> B#bloom{bitmap=Bitmap0, keys=Keys+1}
end.
%% @internal
%% @doc Set the bits at the provided index(s) to "1" in the binary.
-spec set_bits(binary(), list(non_neg_integer())) -> binary().
set_bits(Bin, []) -> Bin;
set_bits(Bin, [Idx | Idxs]) ->
ByteIdx = Idx div 8,
@ -85,16 +88,9 @@ set_bits(Bin, [Idx | Idxs]) ->
Byte0 = Byte bor Mask,
set_bits(<<Pre/binary, Byte0:8, Post/binary>>, Idxs).
%% set2(N, Bin) ->
%% <<L:N/bits, _:1, R/bits>> = Bin,
%% <<L/bits, 1:1, R/bits>>.
%% a(N, B) ->
%% fun (<<L:N/bits, _:1, R/bits>>) ->
%% <<L/bits, 1:1, R/bits>>
%% end(B).
% Find the optimal bitmap size and number of hashes.
%% @internal
%% @doc Find the optimal bitmap size and number of hashes.
%TODO -spec(non_neg_integer(), number()) -> non_neg_integer().
calc_least_bits(N, E) -> calc_least_bits(N, E, 1, 0, 0).
calc_least_bits(N, E, K, MinM, BestK) ->
M = -1 * K * N / log(1 - pow(E, 1/K)),
@ -105,8 +101,10 @@ calc_least_bits(N, E, K, MinM, BestK) ->
_ -> calc_least_bits(N, E, K+1, CurM, CurK)
end.
% This uses the "enhanced double hashing" algorithm.
% Todo: handle case of m > 2^32.
%% @internal
%% @doc This uses the "enhanced double hashing" algorithm.
%% TODO: handle case of m > 2^32.
%TODO -spec(term(), #bloom{}) -> list(non_neg_integer()).
calc_idxs(Key, #bloom{m=M, k=K}) ->
X = phash2(Key, M),
Y = phash2({"salt", Key}, M),

View file

@ -22,7 +22,7 @@
%% the ebloom module.
-module(ebloom).
-author('Dave Smith <dizzyd@dizzyd.com>').
-author('Greg Burd <greg@burd.me>').
-export([new/3,
insert/2,
@ -40,15 +40,16 @@
-spec new(integer(), float(), integer()) -> {ok, reference()}.
new(Count, FalseProb, _Seed) ->
bloom:new(Count, FalseProb).
{ok, bloom:new(Count, FalseProb)}.
-spec insert(reference(), binary()) -> ok.
insert(Ref, Bin) ->
bloom:add_element(Key, Ref).
bloom:add_element(Bin, Ref),
ok.
-spec contains(reference(), binary()) -> true | false.
contains(Ref, Bin) ->
is_element(Bin, Ref).
bloom:is_element(Bin, Ref).
-spec clear(reference()) -> ok.
clear(Ref) ->
@ -64,19 +65,19 @@ elements(Ref) ->
-spec effective_fpp(reference()) -> float().
effective_fpp(_Ref) ->
raise not_yet_implemented.
throw(not_yet_implemented).
-spec intersect(reference(), reference()) -> ok.
intersect(_Ref, _OtherRef) ->
raise not_yet_implemented.
throw(not_yet_implemented).
-spec union(reference(), reference()) -> ok.
union(_Ref, _OtherRef) ->
raise not_yet_implemented.
throw(not_yet_implemented).
-spec difference(reference(), reference()) -> ok.
difference(_Ref, _OtherRef) ->
raise not_yet_implemented.
throw(not_yet_implemented).
-spec serialize(reference()) -> binary().
serialize(Ref) ->