Add bloom filter to btree index format

This commit is contained in:
Kresten Krab Thorup 2012-01-04 15:36:52 +01:00
parent 6e13f55044
commit 5af86b9e23
6 changed files with 47 additions and 19 deletions

View file

@ -2,5 +2,6 @@
{cover_enabled, true}.
{deps, [
{plain_fsm, "1.1.*", {git, "git://github.com/uwiger/plain_fsm", {branch, "master"}}}
{plain_fsm, "1.1.*", {git, "git://github.com/uwiger/plain_fsm", {branch, "master"}}},
{ebloom, "1.0.*", {git, "git://github.com/basho/ebloom.git", {branch, "master"}}}
]}.

View file

@ -269,7 +269,8 @@ begin_merge(State) ->
Owner = self(),
spawn_link(fun() ->
{ok, OutCount} = fractal_btree_merger:merge(AFileName, BFileName, XFileName),
{ok, OutCount} = fractal_btree_merger:merge(AFileName, BFileName, XFileName,
State#state.level * 2),
Owner ! {merge_done, OutCount, XFileName}
end),

View file

@ -4,12 +4,12 @@
%% Naive merge of two b-trees. A better implementation should iterate over leaves, not KVs.
%%
-export([merge/3]).
-export([merge/4]).
-record(state, { out, a_pid, b_pid }).
merge(A,B,C) ->
{ok, Out} = fractal_btree_writer:open(C),
merge(A,B,C, Size) ->
{ok, Out} = fractal_btree_writer:open(C, Size),
Owner = self(),
PID1 = spawn_link(fun() -> scan(Owner, A) end),
PID2 = spawn_link(fun() -> scan(Owner, B) end),

View file

@ -5,7 +5,7 @@
-export([open/1,close/1,lookup/2,fold/3]).
-record(node, { level, members=[] }).
-record(index, {file, root}).
-record(index, {file, root, bloom}).
open(Name) ->
@ -14,11 +14,15 @@ open(Name) ->
%% read root position
{ok, <<RootPos:64/unsigned>>} = file:pread(File, FileInfo#file_info.size-8, 8),
{ok, <<BloomSize:32/unsigned>>} = file:pread(File, FileInfo#file_info.size-12, 4),
{ok, BloomData} = file:pread(File, FileInfo#file_info.size-12-BloomSize ,BloomSize),
{ok, Bloom} = ebloom:deserialize(BloomData),
%% suck in the root
{ok, Root} = read_node(File, RootPos),
{ok, #index{file=File, root=Root}}.
{ok, #index{file=File, root=Root, bloom=Bloom}}.
fold(Fun, Acc0, #index{file=File}) ->
@ -43,8 +47,13 @@ close(#index{file=File}) ->
file:close(File).
lookup(#index{file=File, root=Node},Key) ->
lookup_in_node(File,Node,Key).
%% Look up Key in an opened index.
%% The bloom filter stored in the #index{} record is consulted first via
%% ebloom:contains/2. Only when it answers true is the tree searched with
%% lookup_in_node/3, starting from the root node held in the record.
%% When it answers false, the atom notfound is returned without calling
%% lookup_in_node/3.
lookup(#index{file=File, root=Node, bloom=Bloom}, Key) ->
case ebloom:contains(Bloom, Key) of
true ->
lookup_in_node(File,Node,Key);
false ->
notfound
end.
lookup_in_node(_File,#node{level=0,members=Members},Key) ->
case lists:keyfind(Key,1,Members) of

View file

@ -1,6 +1,11 @@
-module(fractal_btree_writer).
%%
%% Streaming btree writer. Accepts only monotonically increasing keys for put.
%%
%% The file trailer holds a serialized bloom filter of the keys added, its size, and the root position.
-define(NODE_SIZE, 2*1024).
-behavior(gen_server).
@ -9,7 +14,7 @@
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3]).
-export([open/1, add/3,close/1]).
-export([open/1, open/2, add/3,close/1]).
-record(node, { level, members=[], size=0 }).
@ -20,14 +25,20 @@
nodes = [] :: [ #node{} ],
name :: string()
name :: string(),
bloom
}).
%%% PUBLIC API
%% Start a writer process for the index Name.
%% Size is passed through to init/1, which uses it as the first argument of
%% ebloom:new/3 when creating the bloom filter for this file.
%% NOTE(review): presumably Size is the expected number of keys -- confirm
%% against the ebloom documentation.
%% Returns whatever gen_server:start/3 returns.
open(Name,Size) ->
gen_server:start(?MODULE, [Name,Size], []).
open(Name) ->
gen_server:start(?MODULE, [Name], []).
gen_server:start(?MODULE, [Name,2048], []).
add(Ref,Key,Data) ->
@ -39,15 +50,16 @@ close(Ref) ->
%%%
init([Name]) ->
init([Name,Size]) ->
% io:format("got name: ~p~n", [Name]),
{ok, IdxFile} = file:open( fractal_btree_util:index_file_name(Name),
[raw, exclusive, write, delayed_write]),
{ok, BloomFilter} = ebloom:new(Size, 0.01, 123),
{ok, #state{ name=Name,
index_file_pos=0, index_file=IdxFile
index_file_pos=0, index_file=IdxFile,
bloom = BloomFilter
}}.
handle_cast({add, Key, Data}, State) when is_binary(Key), is_binary(Data) ->
@ -79,9 +91,12 @@ code_change(_OldVsn, State, _Extra) ->
flush_nodes(#state{ nodes=[], last_node_pos=LastNodePos }=State) ->
flush_nodes(#state{ nodes=[], last_node_pos=LastNodePos, bloom=Ref }=State) ->
Trailer = << 0:8, LastNodePos:64/unsigned >>,
Bloom = ebloom:serialize(Ref),
BloomSize = byte_size(Bloom),
Trailer = << 0:32, Bloom/binary, BloomSize:32/unsigned, LastNodePos:64/unsigned >>,
IdxFile = State#state.index_file,
ok = file:write(IdxFile, Trailer),
@ -112,6 +127,8 @@ add_record(Level, Key, Value, #state{ nodes=[ #node{level=Level, members=List, s
NewSize = NodeSize + fractal_btree_util:estimate_node_size_increment(List, Key, Value),
ebloom:insert( State#state.bloom, Key ),
NodeMembers = [{Key,Value} | List],
if
NewSize >= ?NODE_SIZE ->

View file

@ -83,7 +83,7 @@ merge_test() ->
ok = fractal_btree_writer:close(BT2),
{Time,{ok,Count}} = timer:tc(fractal_btree_merger, merge, ["test1", "test2", "test3"]),
{Time,{ok,Count}} = timer:tc(fractal_btree_merger, merge, ["test1", "test2", "test3", 8]),
error_logger:info_msg("time to merge: ~p/sec (time=~p, count=~p)~n", [1000000/(Time/Count), Time/1000000, Count]),