Use ebloom by default

This commit changes the default bloom filter to
be Basho's ebloom, which is significantly more
stable w.r.t. performance.  The code can still
read the old bloom filters; new files are written
with the new filters.

The default is controlled in src/hanoidb.hrl
using the USE_EBLOOM macro.
This commit is contained in:
Kresten Krab Thorup 2014-11-28 16:15:20 +01:00
parent a1bbadfb34
commit f4feca27e5
5 changed files with 66 additions and 25 deletions

View file

@ -4,7 +4,7 @@
{eunit_opts, [verbose, {report, {eunit_surefire, [{dir, "."}]}}]}.
{erl_opts, [%{d,'DEBUG',true},
{d,'USE_SCALABLE_BLOOM',true},
{d,'USE_EBLOOM',true},
{parse_transform, lager_transform},
fail_on_warning,
warn_unused_vars,

View file

@ -70,22 +70,15 @@
| value()
| filepos().
-ifdef(USE_SCALABLE_BLOOM).
-define(BLOOM_NEW(Size), {ok, hanoidb_bloom:bloom(Size, 0.01)}).
-define(BLOOM_TO_BIN(Bloom), hanoidb_bloom:encode(Bloom)). %% -> Binary
-define(BIN_TO_BLOOM(Bin), {ok, hanoidb_bloom:decode(Bin)}).
-define(BLOOM_INSERT(Bloom, Key), {ok, hanoidb_bloom:add(Key,Bloom)}).
-define(BLOOM_CONTAINS(Bloom, Key), hanoidb_bloom:member(Key, Bloom)). %% -> 'true' | 'false'
-ifdef(USE_EBLOOM).
-define(HANOI_BLOOM_TYPE, ebloom).
-else.
-define(BLOOM_NEW(Size), begin ebloom:new(Size, 0.01, Size) end).
-define(BLOOM_TO_BIN(Bloom), begin ebloom:serialize(Bloom) end). %% -> Binary
-define(BIN_TO_BLOOM(Bin), begin ebloom:deserialize(Bin) end).
-define(BLOOM_INSERT(Bloom, Key), begin ok=ebloom:insert(Bloom, Key), {ok, Bloom} end).
-define(BLOOM_CONTAINS(Bloom, Key), begin ebloom:member(Bloom, Key) end). %% -> 'true' | 'false'
-define(HANOI_BLOOM_TYPE, sbloom).
-endif.
-define(BLOOM_NEW(Size), hanoidb_util:bloom_new(Size, ?HANOI_BLOOM_TYPE)).
-define(BLOOM_TO_BIN(Bloom), hanoidb_util:bloom_to_bin(Bloom)).
-define(BIN_TO_BLOOM(Bin, Fmt), hanoidb_util:bin_to_bloom(Bin, Fmt)).
-define(BLOOM_INSERT(Bloom, Key), hanoidb_util:bloom_insert(Bloom, Key)).
-define(BLOOM_CONTAINS(Bloom, Key), hanoidb_util:bloom_contains(Bloom, Key)).

View file

@ -85,7 +85,7 @@ open(Name, Config) ->
{ok, <<RootPos:64/unsigned>>} = file:pread(File, FileInfo#file_info.size - 8, 8),
{ok, <<BloomSize:32/unsigned>>} = file:pread(File, FileInfo#file_info.size - 12, 4),
{ok, BloomData} = file:pread(File, (FileInfo#file_info.size - 12 - BloomSize), BloomSize),
{ok, Bloom} = ?BIN_TO_BLOOM(BloomData),
{ok, Bloom} = hanoidb_util:bin_to_bloom(BloomData),
%% read in the root node
Root =

View file

@ -38,7 +38,16 @@
, tstamp/0
, expiry_time/1
, has_expired/1
, ensure_expiry/1 ]).
, ensure_expiry/1
, bloom_type/1
, bloom_new/2
, bloom_to_bin/1
, bin_to_bloom/1
, bin_to_bloom/2
, bloom_insert/2
, bloom_contains/2
]).
-include("src/hanoidb.hrl").
@ -265,4 +274,43 @@ ensure_expiry(Opts) ->
ok
end.
%% @doc Return the implementation tag (`ebloom' or `sbloom') carried by a
%% tagged bloom filter tuple. Any other input fails with function_clause.
bloom_type({Tag, _Filter}) when Tag =:= ebloom; Tag =:= sbloom ->
    Tag.
%% @doc Create an empty bloom filter of the requested implementation and
%% return it as {ok, {Type, Filter}}.
%%   ebloom - ebloom:new(Size, 0.01, Size); the call must return {ok, Ref}.
%%   sbloom - hanoidb_bloom:bloom(Size, 0.01).
bloom_new(Size, ebloom) ->
    {ok, Ref} = ebloom:new(Size, 0.01, Size),
    {ok, {ebloom, Ref}};
bloom_new(Size, sbloom) ->
    Filter = hanoidb_bloom:bloom(Size, 0.01),
    {ok, {sbloom, Filter}}.
%% @doc Serialize a tagged bloom filter with the encoder that belongs to its
%% implementation: ebloom:serialize/1 for `ebloom', hanoidb_bloom:encode/1
%% for `sbloom'. The tag itself is not part of the returned value.
bloom_to_bin({ebloom, Ref}) ->
    ebloom:serialize(Ref);
bloom_to_bin({sbloom, Filter}) ->
    hanoidb_bloom:encode(Filter).
%% @doc Decode a serialized bloom filter whose type was not stored with it,
%% picking the decoder from the leading bytes of the blob:
%%   16#1F 16#8B (gzip magic number)         - decoded as `sbloom'
%%   131 (external term format version tag)  - legacy term-encoded filter
%%   anything else                           - decoded as `ebloom'
%% Returns {ok, {Type, Filter}}.
bin_to_bloom(<<16#1F, 16#8B, _/binary>> = GZipped) ->
    bin_to_bloom(GZipped, sbloom);
bin_to_bloom(<<131, _/binary>> = TermBin) ->
    %% The blob is already in external term format, so it must be decoded
    %% with binary_to_term/1; term_to_binary/1 would only re-encode it and
    %% hand back a bare binary instead of the {ok, _} tuple callers match on.
    %% NOTE(review): assumes the decoded legacy term is directly usable as an
    %% sbloom filter -- confirm against the old on-disk format.
    {ok, {sbloom, erlang:binary_to_term(TermBin)}};
bin_to_bloom(Blob) ->
    bin_to_bloom(Blob, ebloom).

%% @doc Decode Binary as a bloom filter of the given type and return it as
%% {ok, {Type, Filter}}. For `ebloom', ebloom:deserialize/1 must return
%% {ok, Ref}; anything else crashes with badmatch.
bin_to_bloom(Binary, sbloom) ->
    {ok, {sbloom, hanoidb_bloom:decode(Binary)}};
bin_to_bloom(Binary, ebloom) ->
    {ok, Bloom} = ebloom:deserialize(Binary),
    {ok, {ebloom, Bloom}}.
%% @doc Add Key to a tagged bloom filter and return {ok, TaggedFilter}.
%%   ebloom - ebloom:insert/2 must return `ok'; the same handle is returned.
%%   sbloom - the result of hanoidb_bloom:add/2 is returned under the tag.
bloom_insert({ebloom, Ref} = Tagged, Key) ->
    ok = ebloom:insert(Ref, Key),
    {ok, Tagged};
bloom_insert({sbloom, Filter}, Key) ->
    Updated = hanoidb_bloom:add(Key, Filter),
    {ok, {sbloom, Updated}}.
%% @doc Test Key for membership in a tagged bloom filter by delegating to
%% the matching implementation: ebloom:contains(Filter, Key) for `ebloom',
%% hanoidb_bloom:member(Key, Filter) for `sbloom' (note the argument order).
bloom_contains({ebloom, Ref}, Key) ->
    ebloom:contains(Ref, Key);
bloom_contains({sbloom, Filter}, Key) ->
    hanoidb_bloom:member(Key, Filter).

View file

@ -55,9 +55,9 @@
name :: string(),
bloom :: term(),
bloom :: {ebloom, term()} | {sbloom, term()},
block_size = ?NODE_SIZE :: integer(),
compress = none :: none | snappy | gzip, % | lz4,
compress = none :: none | snappy | gzip | lz4,
opts = [] :: list(any()),
value_count = 0 :: integer(),
@ -170,11 +170,11 @@ serialize(#state{ bloom=Bloom, index_file=File, index_file_pos=Position }=State)
exit({bad_position, Position, WrongPosition})
end,
ok = file:close(File),
erlang:term_to_binary( { State#state{ index_file=undefined, bloom=undefined }, ?BLOOM_TO_BIN(Bloom) } ).
erlang:term_to_binary( { State#state{ index_file=undefined, bloom=undefined }, ?BLOOM_TO_BIN(Bloom), hanoidb_util:bloom_type(Bloom) } ).
deserialize(Binary) ->
{State, Bin} = erlang:binary_to_term(Binary),
{ok, Bloom} = ?BIN_TO_BLOOM(Bin),
{State, Bin, Type} = erlang:binary_to_term(Binary),
{ok, Bloom} = ?BIN_TO_BLOOM(Bin, Type),
{ok, IdxFile} = do_open(State#state.name, State#state.opts, []),
State#state{ bloom=Bloom, index_file=IdxFile }.
@ -200,7 +200,7 @@ archive_nodes(#state{ nodes=[], last_node_pos=LastNodePos, last_node_size=_LastN
_ ->
LastNodePos
end,
Trailer = << 0:32/unsigned, BloomBin/binary, BloomSize:32/unsigned, RootPos:64/unsigned >>,
Trailer = [ << 0:32/unsigned>> , BloomBin, << BloomSize:32/unsigned, RootPos:64/unsigned >> ],
ok = file:write(IdxFile, Trailer),
ok = file:datasync(IdxFile),