From 5af86b9e23f997f7081c10172bca0705bd8eb19d Mon Sep 17 00:00:00 2001 From: Kresten Krab Thorup Date: Wed, 4 Jan 2012 15:36:52 +0100 Subject: [PATCH] Add bloom filter to btree index format --- rebar.config | 3 ++- src/fractal_btree_level.erl | 3 ++- src/fractal_btree_merger.erl | 6 +++--- src/fractal_btree_reader.erl | 17 +++++++++++++---- src/fractal_btree_writer.erl | 35 ++++++++++++++++++++++++++--------- test/fractal_btree_tests.erl | 2 +- 6 files changed, 47 insertions(+), 19 deletions(-) diff --git a/rebar.config b/rebar.config index a1dd19f..5ff7129 100644 --- a/rebar.config +++ b/rebar.config @@ -2,5 +2,6 @@ {cover_enabled, true}. {deps, [ - {plain_fsm, "1.1.*", {git, "git://github.com/uwiger/plain_fsm", {branch, "master"}}} + {plain_fsm, "1.1.*", {git, "git://github.com/uwiger/plain_fsm", {branch, "master"}}}, + {ebloom, "1.0.*", {git, "git://github.com/basho/ebloom.git", {branch, "master"}}} ]}. diff --git a/src/fractal_btree_level.erl b/src/fractal_btree_level.erl index c2f9c9b..cd2f55f 100644 --- a/src/fractal_btree_level.erl +++ b/src/fractal_btree_level.erl @@ -269,7 +269,8 @@ begin_merge(State) -> Owner = self(), spawn_link(fun() -> - {ok, OutCount} = fractal_btree_merger:merge(AFileName, BFileName, XFileName), + {ok, OutCount} = fractal_btree_merger:merge(AFileName, BFileName, XFileName, + State#state.level * 2), Owner ! {merge_done, OutCount, XFileName} end), diff --git a/src/fractal_btree_merger.erl b/src/fractal_btree_merger.erl index 39d7de0..74acd3b 100644 --- a/src/fractal_btree_merger.erl +++ b/src/fractal_btree_merger.erl @@ -4,12 +4,12 @@ %% Naive Merge of two b-trees. A better implementation should iterate leafs, not KV's %% --export([merge/3]). +-export([merge/4]). -record(state, { out, a_pid, b_pid }). -merge(A,B,C) -> - {ok, Out} = fractal_btree_writer:open(C), +merge(A,B,C, Size) -> + {ok, Out} = fractal_btree_writer:open(C, Size), Owner = self(), PID1 = spawn_link(fun() -> scan(Owner, A) end), PID2 = spawn_link(fun() -> scan(Owner, B) end), diff --git a/src/fractal_btree_reader.erl b/src/fractal_btree_reader.erl index e72b1e4..13654b7 100644 --- a/src/fractal_btree_reader.erl +++ b/src/fractal_btree_reader.erl @@ -5,7 +5,7 @@ -export([open/1,close/1,lookup/2,fold/3]). -record(node, { level, members=[] }). --record(index, {file, root}). +-record(index, {file, root, bloom}). open(Name) -> @@ -14,11 +14,15 @@ open(Name) -> %% read root position {ok, <>} = file:pread(File, FileInfo#file_info.size-8, 8), + {ok, <>} = file:pread(File, FileInfo#file_info.size-12, 4), + {ok, BloomData} = file:pread(File, FileInfo#file_info.size-12-BloomSize ,BloomSize), + + {ok, Bloom} = ebloom:deserialize(BloomData), %% suck in the root {ok, Root} = read_node(File, RootPos), - {ok, #index{file=File, root=Root}}. + {ok, #index{file=File, root=Root, bloom=Bloom}}. fold(Fun, Acc0, #index{file=File}) -> @@ -43,8 +47,13 @@ close(#index{file=File}) -> file:close(File). -lookup(#index{file=File, root=Node},Key) -> - lookup_in_node(File,Node,Key). +lookup(#index{file=File, root=Node, bloom=Bloom}, Key) -> + case ebloom:contains(Bloom, Key) of + true -> + lookup_in_node(File,Node,Key); + false -> + notfound + end. lookup_in_node(_File,#node{level=0,members=Members},Key) -> case lists:keyfind(Key,1,Members) of diff --git a/src/fractal_btree_writer.erl b/src/fractal_btree_writer.erl index e3ee7c6..70da656 100644 --- a/src/fractal_btree_writer.erl +++ b/src/fractal_btree_writer.erl @@ -1,6 +1,11 @@ - -module(fractal_btree_writer). +%% +%% Streaming btree writer. Accepts only monotonically increasing keys for put. +%% + +%% TODO: add a bloom filter to the file + -define(NODE_SIZE, 2*1024). -behavior(gen_server). @@ -9,7 +14,7 @@ -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). --export([open/1, add/3,close/1]). +-export([open/1, open/2, add/3,close/1]). -record(node, { level, members=[], size=0 }). @@ -20,14 +25,20 @@ nodes = [] :: [ #node{} ], - name :: string() + name :: string(), + + bloom }). %%% PUBLIC API +open(Name,Size) -> + gen_server:start(?MODULE, [Name,Size], []). + + open(Name) -> - gen_server:start(?MODULE, [Name], []). + gen_server:start(?MODULE, [Name,2048], []). add(Ref,Key,Data) -> @@ -39,15 +50,16 @@ close(Ref) -> %%% -init([Name]) -> +init([Name,Size]) -> % io:format("got name: ~p~n", [Name]), {ok, IdxFile} = file:open( fractal_btree_util:index_file_name(Name), [raw, exclusive, write, delayed_write]), - + {ok, BloomFilter} = ebloom:new(Size, 0.01, 123), {ok, #state{ name=Name, - index_file_pos=0, index_file=IdxFile + index_file_pos=0, index_file=IdxFile, + bloom = BloomFilter }}. handle_cast({add, Key, Data}, State) when is_binary(Key), is_binary(Data) -> @@ -79,9 +91,12 @@ code_change(_OldVsn, State, _Extra) -> -flush_nodes(#state{ nodes=[], last_node_pos=LastNodePos }=State) -> +flush_nodes(#state{ nodes=[], last_node_pos=LastNodePos, bloom=Ref }=State) -> - Trailer = << 0:8, LastNodePos:64/unsigned >>, + Bloom = ebloom:serialize(Ref), + BloomSize = byte_size(Bloom), + + Trailer = << 0:32, Bloom/binary, BloomSize:32/unsigned, LastNodePos:64/unsigned >>, IdxFile = State#state.index_file, ok = file:write(IdxFile, Trailer), @@ -112,6 +127,8 @@ add_record(Level, Key, Value, #state{ nodes=[ #node{level=Level, members=List, s NewSize = NodeSize + fractal_btree_util:estimate_node_size_increment(List, Key, Value), + ebloom:insert( State#state.bloom, Key ), + NodeMembers = [{Key,Value} | List], if NewSize >= ?NODE_SIZE -> diff --git a/test/fractal_btree_tests.erl b/test/fractal_btree_tests.erl index 9e89aac..699e0a8 100644 --- a/test/fractal_btree_tests.erl +++ b/test/fractal_btree_tests.erl @@ -83,7 +83,7 @@ merge_test() -> ok = fractal_btree_writer:close(BT2), - {Time,{ok,Count}} = timer:tc(fractal_btree_merger, merge, ["test1", "test2", "test3"]), + {Time,{ok,Count}} = timer:tc(fractal_btree_merger, merge, ["test1", "test2", "test3", 8]), error_logger:info_msg("time to merge: ~p/sec (time=~p, count=~p)~n", [1000000/(Time/Count), Time/1000000, Count]),