commit 719418647c165d80492ac23c4a0d7c9b586851e8
Author: gray
Date:   Tue Mar 31 04:26:00 2009 -0700

    initial import to git

diff --git a/bloom.erl b/bloom.erl
new file mode 100644
index 0000000..e03546b
--- /dev/null
+++ b/bloom.erl
@@ -0,0 +1,85 @@
+%% @doc Implementation of the Bloom filter data structure.
+%% @reference [http://en.wikipedia.org/wiki/Bloom_filter]
+
+-module(bloom).
+-export([new/1, new/2, is_bloom/1, is_element/2, add_element/2]).
+-import(math, [log/1, pow/2]).
+-import(erlang, [phash2/2]).
+
+-record(bloom, {
+    m = 0,          % The size of the bitmap in bits.
+    bitmap = <<>>,  % The bitmap.
+    k = 0,          % The number of hashes.
+    n = 0,          % The maximum number of keys.
+    keys = 0        % The current number of keys.
+}).
+
+%% @spec new(integer()) -> bloom()
+%% @equiv new(N, 0.001)
+new(N) -> new(N, 0.001).
+
+%% @spec new(integer(), float()) -> bloom()
+%% @doc Creates a new Bloom filter, given a maximum number of keys and a
+%% false-positive error rate. The key count must be positive and the error
+%% rate must be a float strictly between 0 and 1; anything else matches no
+%% clause. A rate of exactly 1.0 is rejected because the sizing formula takes
+%% log(1 - E^(1/K)), which is log(0) when E is 1.0.
+new(N, E) when N > 0, is_float(E), E > 0, E < 1 ->
+    {M, K} = calc_least_bits(N, E),
+    % The bitmap is M bits rounded up to a whole number of bytes, all zero.
+    #bloom{m=M, bitmap = <<0:((M+7) div 8 * 8)>>, k=K, n=N}.
+
+%% @spec is_bloom(term()) -> boolean()
+%% @doc Determines if the given argument is a bloom record.
+is_bloom(#bloom{}) -> true;
+is_bloom(_) -> false.
+
+%% @spec is_element(term(), bloom()) -> boolean()
+%% @doc Determines if the key is (probably) an element of the filter.
+is_element(Key, B) -> is_element(Key, B, calc_idxs(Key, B)).
+
+% Tests the bit at each index in turn: the first clear bit yields false, and
+% running out of indexes yields true.
+is_element(_, _, []) -> true;
+is_element(Key, B, [Idx | T]) ->
+    ByteIdx = Idx div 8,
+    <<_:ByteIdx/binary, Byte:8, _/binary>> = B#bloom.bitmap,
+    % Bit (Idx rem 8) of the byte, counted from the least significant bit.
+    Mask = 1 bsl (Idx rem 8),
+    case 0 =/= Byte band Mask of
+        true -> is_element(Key, B, T);
+        false -> false
+    end.
+
+%% @spec add_element(term(), bloom()) -> bloom()
+%% @doc Adds the key to the filter.
+add_element(Key, #bloom{keys=Keys, n=N, bitmap=Bitmap} = B) when Keys < N ->
+    % The guard only admits a filter holding fewer than n keys; a full filter
+    % matches no clause.
+    Idxs = calc_idxs(Key, B),
+    Bitmap0 = set_bits(Bitmap, Idxs),
+    case Bitmap0 =:= Bitmap of
+        % An unchanged bitmap means every bit for Key was already set.
+        true -> B; % Don't increment key count for duplicates.
+        false -> B#bloom{bitmap=Bitmap0, keys=Keys+1}
+    end.
+
+% Returns the bitmap with the bit at every listed index set. Bit Idx is bit
+% (Idx rem 8), counted from the least significant bit, of byte (Idx div 8);
+% this is the same layout is_element/3 reads.
+set_bits(Bin, []) -> Bin;
+set_bits(Bin, [Idx | Idxs]) ->
+    ByteIdx = Idx div 8,
+    % Split around the target byte so it can be rebuilt with the bit set.
+    <<Prefix:ByteIdx/binary, Byte:8, Suffix/binary>> = Bin,
+    Mask = 1 bsl (Idx rem 8),
+    Byte0 = Byte bor Mask,
+    set_bits(<<Prefix/binary, Byte0:8, Suffix/binary>>, Idxs).
+
+% Find the optimal bitmap size and number of hashes.
+%
+% For each hash count K from 1 to 100 the required bitmap size is
+%     M = -K * N / ln(1 - E^(1/K))
+% for N keys at false-positive rate E. Returns {trunc(M) + 1, K} for the K
+% giving the smallest M; on a tie the lowest such K wins.
+%
+% When E is so small that 1 - E^(1/K) rounds to 1.0 the logarithm is 0 and
+% that K has no finite size, so it is skipped rather than divided by.
+calc_least_bits(N, E) -> calc_least_bits(N, E, 1, none, 0).
+
+% MinM is the smallest size found so far (the atom none until one is found)
+% and BestK is the hash count that produced it.
+calc_least_bits(_, _, K, MinM, BestK) when K > 100 ->
+    {trunc(MinM) + 1, BestK};
+calc_least_bits(N, E, K, MinM, BestK) ->
+    case math:log(1 - math:pow(E, 1 / K)) of
+        LogTerm when LogTerm < 0 ->
+            M = -K * N / LogTerm,
+            case MinM =:= none orelse M < MinM of
+                true  -> calc_least_bits(N, E, K + 1, M, K);
+                false -> calc_least_bits(N, E, K + 1, MinM, BestK)
+            end;
+        _ ->
+            % Zero logarithm: no finite bitmap size exists for this K.
+            calc_least_bits(N, E, K + 1, MinM, BestK)
+    end.
+
+% Computes the K bitmap indexes for Key, each between 0 and M-1.
+% This uses the "enhanced double hashing" algorithm: two hashes of the key
+% seed a position and a stride, and each further index advances the position
+% by the stride while the stride itself grows by the remaining step count.
+% The returned list holds the most recently derived index first.
+% Todo: handle case of m > 2^32.
+calc_idxs(Key, #bloom{m=M, k=K}) ->
+    Start = erlang:phash2(Key, M),
+    Stride = erlang:phash2({"salt", Key}, M),
+    calc_idxs(M, K - 1, Start, Stride, [Start]).
+
+% Step counts down the indexes still to derive; recursion stops at 0.
+calc_idxs(M, Step, Pos, Stride, Idxs) when Step =/= 0 ->
+    NextPos = (Pos + Stride) rem M,
+    NextStride = (Stride + Step) rem M,
+    calc_idxs(M, Step - 1, NextPos, NextStride, [NextPos | Idxs]);
+calc_idxs(_, 0, _, _, Idxs) ->
+    Idxs.
diff --git a/bloom.pub b/bloom.pub
new file mode 100644
index 0000000..e452581
--- /dev/null
+++ b/bloom.pub
@@ -0,0 +1,12 @@
+{author, {"gray", "graygee@gmail.com", {2007, 10, 3}}}.
+{category, ["type"]}.
+{name, "bloom"}.
+{vsn, "0.01"}.
+{depends, []}.
+{keywords, ["bloomfilter", "bloom", "filter", "digest", "hash"]}.
+{summary, "Bloom filters"}.
+{abstract, "Implements the Bloom filter probabilistic data structure. "
+"Bloom filters are a space-efficient means to test whether an element is a "
+"member of a set."}.
+{home, "http://code.google.com/p/bloomerl/"}.
+{source, {erl, "http://bloomerl.googlecode.com/svn/trunk/bloom.erl"}}.