initial import to git
This commit is contained in:
commit
719418647c
2 changed files with 97 additions and 0 deletions
85
bloom.erl
Normal file
85
bloom.erl
Normal file
|
@ -0,0 +1,85 @@
|
|||
%% @doc Implementation of the Bloom filter data structure.
|
||||
%% @reference [http://en.wikipedia.org/wiki/Bloom_filter]
|
||||
|
||||
-module(bloom).
|
||||
-export([new/1, new/2, is_bloom/1, is_element/2, add_element/2]).
|
||||
-import(math, [log/1, pow/2]).
|
||||
-import(erlang, [phash2/2]).
|
||||
|
||||
%% Internal representation of a Bloom filter.
-record(bloom, {
    %% Number of addressable bits in the bitmap.
    m = 0 :: non_neg_integer(),
    %% Bit storage; allocated in whole bytes.
    bitmap = <<>> :: binary(),
    %% Number of hash-derived indexes computed per key.
    k = 0 :: non_neg_integer(),
    %% Capacity: the maximum number of keys the filter accepts.
    n = 0 :: non_neg_integer(),
    %% Number of keys counted as added so far.
    keys = 0 :: non_neg_integer()
}).
|
||||
|
||||
%% @spec new(integer()) -> bloom()
%% @doc Creates a new Bloom filter for at most `N' keys, using the default
%% false-positive error rate of 0.001.
%% @equiv new(N, 0.001)
new(N) ->
    DefaultErrorRate = 0.001,
    new(N, DefaultErrorRate).
|
||||
|
||||
%% @spec new(integer(), float()) -> bloom()
%% @doc Creates a new, empty Bloom filter sized for at most `N' keys at a
%% target false-positive error rate `E'.
%%
%% `N' must be a positive integer and `E' a float strictly between 0 and 1.
%% An error rate of exactly 1.0 is rejected up front: it would make the
%% sizing formula in calc_least_bits evaluate log(0) and fail with badarith.
%% Non-integer `N' (including atoms, which compare greater than any number)
%% is rejected as well. Invalid arguments raise function_clause.
new(N, E) when is_integer(N), N > 0, is_float(E), E > 0, E < 1 ->
    {M, K} = calc_least_bits(N, E),
    %% Round the bitmap up to a whole number of bytes so that every bit index
    %% in 0..M-1 falls inside an existing byte.
    BitmapBits = (M + 7) div 8 * 8,
    #bloom{m = M, bitmap = <<0:BitmapBits>>, k = K, n = N}.
|
||||
|
||||
%% @spec is_bloom(term()) -> bool()
%% @doc Returns true when the argument is a bloom record and false for any
%% other term.
is_bloom(Term) ->
    is_record(Term, bloom).
|
||||
|
||||
%% @spec is_element(term(), bloom()) -> bool()
%% @doc Tests whether the key is (probably) a member of the filter.
%% A `false' result means at least one of the key's bits is unset; `true'
%% means every bit derived from the key is set.
is_element(Key, B) ->
    Idxs = calc_idxs(Key, B),
    is_element(Key, B, Idxs).
|
||||
%% Walks the list of bit indexes and returns true only if every one of them
%% is set in the filter's bitmap; stops at the first unset bit.
is_element(_Key, _Bloom, []) ->
    true;
is_element(Key, Bloom, [Idx | Rest]) ->
    Offset = Idx div 8,
    <<_:Offset/binary, Octet:8, _/binary>> = Bloom#bloom.bitmap,
    %% Bits are numbered from the least significant bit of each byte.
    Bit = (Octet bsr (Idx rem 8)) band 1,
    Bit =:= 1 andalso is_element(Key, Bloom, Rest).
|
||||
|
||||
%% @spec add_element(term(), bloom()) -> bloom()
%% @doc Adds the key to the filter and returns the updated filter.
%% The key count is only incremented when adding the key actually changes
%% the bitmap, so re-adding a key leaves the filter untouched.
%% Only defined while the key count is below the capacity `n'; once the
%% filter is full the call fails with function_clause.
add_element(Key, #bloom{keys = Count, n = Capacity, bitmap = Before} = Filter)
  when Count < Capacity ->
    case set_bits(Before, calc_idxs(Key, Filter)) of
        %% No bit changed: treat the key as a duplicate.
        Before -> Filter;
        After -> Filter#bloom{bitmap = After, keys = Count + 1}
    end.
|
||||
|
||||
%% Returns a copy of the bitmap with every bit listed in the index list set.
%% Bit Idx lives in byte (Idx div 8), at position (Idx rem 8) counted from
%% the least significant bit.
set_bits(Bitmap, []) ->
    Bitmap;
set_bits(Bitmap, [Idx | Rest]) ->
    Offset = Idx div 8,
    <<Head:Offset/binary, Octet:8, Tail/binary>> = Bitmap,
    Updated = <<Head/binary, (Octet bor (1 bsl (Idx rem 8))):8, Tail/binary>>,
    set_bits(Updated, Rest).
|
||||
|
||||
% Finds the smallest bitmap size and the matching number of hashes.
% Tries every hash count K from 1 to 100, computes the number of bits needed
% to hold N keys at error rate E with K hashes, and keeps the K that needs
% the fewest bits (the lowest K wins a tie). Returns {Bits, K} with the bit
% count rounded to an integer via trunc(M) + 1.
calc_least_bits(N, E) ->
    BitsFor = fun(K) -> -1 * K * N / math:log(1 - math:pow(E, 1 / K)) end,
    KeepSmaller =
        fun(K, {SmallestM, _} = Best) ->
            case BitsFor(K) of
                M when M < SmallestM -> {M, K};
                _ -> Best
            end
        end,
    {MinM, BestK} = lists:foldl(KeepSmaller, {BitsFor(1), 1}, lists:seq(2, 100)),
    {trunc(MinM) + 1, BestK}.
|
||||
|
||||
% Computes the list of k bit indexes for a key, each in the range 0..m-1.
% This uses the "enhanced double hashing" algorithm: two hashes of the key
% seed the sequence and the remaining indexes are derived arithmetically.
% Todo: handle case of m > 2^32.
calc_idxs(Key, #bloom{m = Bits, k = Hashes}) ->
    First = erlang:phash2(Key, Bits),
    Step = erlang:phash2({"salt", Key}, Bits),
    calc_idxs(Bits, Hashes - 1, First, Step, [First]).
|
||||
% Derives the remaining indexes from the two seed hashes. Each step adds the
% current increment to the previous index and then bumps the increment by
% the countdown value, both modulo the bitmap size. Newer indexes are
% prepended, so the seed index ends up last in the returned list.
calc_idxs(_Bits, 0, _X, _Y, Idxs) ->
    Idxs;
calc_idxs(Bits, Remaining, X, Y, Idxs) ->
    NextX = (X + Y) rem Bits,
    NextY = (Y + Remaining) rem Bits,
    calc_idxs(Bits, Remaining - 1, NextX, NextY, [NextX | Idxs]).
|
12
bloom.pub
Normal file
12
bloom.pub
Normal file
|
@ -0,0 +1,12 @@
|
|||
{author, {"gray", "graygee@gmail.com", {2007, 10, 3}}}.
|
||||
{category, ["type"]}.
|
||||
{name, "bloom"}.
|
||||
{vsn, "0.01"}.
|
||||
{depends, []}.
|
||||
{keywords, ["bloomfilter", "bloom", "filter", "digest", "hash"]}.
|
||||
{summary, "Bloom filters"}.
|
||||
{abstract, "Implements the Bloom filter probabilistic data structure. "
|
||||
"Bloom filters are a space-efficient means to test whether an element is a "
|
||||
"member of a set."}.
|
||||
{home, "http://code.google.com/p/bloomerl/"}.
|
||||
{source, {erl, "http://bloomerl.googlecode.com/svn/trunk/bloom.erl"}}.
|
Loading…
Reference in a new issue