Use new on-disk page format
This update moves from kvlist to the on-disk bisect format (https://github.com/krestenkrab/vbisect), which means that tree pages are not deserialised when read from disk. This improves performance of GETs by ~20% and improves top-level latencies significantly. CRC32 checks are now on the page level.
This commit is contained in:
parent
f4feca27e5
commit
198da5ef5f
5 changed files with 445 additions and 64 deletions
|
@ -82,3 +82,13 @@
|
||||||
-define(BLOOM_INSERT(Bloom, Key), hanoidb_util:bloom_insert(Bloom, Key)).
|
-define(BLOOM_INSERT(Bloom, Key), hanoidb_util:bloom_insert(Bloom, Key)).
|
||||||
-define(BLOOM_CONTAINS(Bloom, Key), hanoidb_util:bloom_contains(Bloom, Key)).
|
-define(BLOOM_CONTAINS(Bloom, Key), hanoidb_util:bloom_contains(Bloom, Key)).
|
||||||
|
|
||||||
|
%% tags used in the on-disk representation
|
||||||
|
-define(TAG_KV_DATA, 16#80).
|
||||||
|
-define(TAG_DELETED, 16#81).
|
||||||
|
-define(TAG_POSLEN32, 16#82).
|
||||||
|
-define(TAG_TRANSACT, 16#83).
|
||||||
|
-define(TAG_KV_DATA2, 16#84).
|
||||||
|
-define(TAG_DELETED2, 16#85).
|
||||||
|
-define(TAG_END, 16#FF).
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -69,12 +69,12 @@ merge(A,B,C, Size, IsLastLevel, Options) ->
|
||||||
{ok, Out} = hanoidb_writer:init([C, [{size, Size} | Options]]),
|
{ok, Out} = hanoidb_writer:init([C, [{size, Size} | Options]]),
|
||||||
AKVs =
|
AKVs =
|
||||||
case hanoidb_reader:first_node(IXA) of
|
case hanoidb_reader:first_node(IXA) of
|
||||||
{node, AKV} -> AKV;
|
{kvlist, AKV} -> AKV;
|
||||||
none -> []
|
none -> []
|
||||||
end,
|
end,
|
||||||
BKVs =
|
BKVs =
|
||||||
case hanoidb_reader:first_node(IXB) of
|
case hanoidb_reader:first_node(IXB) of
|
||||||
{node, BKV} ->BKV;
|
{kvlist, BKV} ->BKV;
|
||||||
none -> []
|
none -> []
|
||||||
end,
|
end,
|
||||||
scan(IXA, IXB, Out, IsLastLevel, AKVs, BKVs, {0, none}).
|
scan(IXA, IXB, Out, IsLastLevel, AKVs, BKVs, {0, none}).
|
||||||
|
@ -177,7 +177,7 @@ scan(IXA, IXB, Out, IsLastLevel, AKVs, BKVs, {N, FromPID}) when N < 1, AKVs =/=
|
||||||
|
|
||||||
scan(IXA, IXB, Out, IsLastLevel, [], BKVs, Step) ->
|
scan(IXA, IXB, Out, IsLastLevel, [], BKVs, Step) ->
|
||||||
case hanoidb_reader:next_node(IXA) of
|
case hanoidb_reader:next_node(IXA) of
|
||||||
{node, AKVs} ->
|
{kvlist, AKVs} ->
|
||||||
scan(IXA, IXB, Out, IsLastLevel, AKVs, BKVs, Step);
|
scan(IXA, IXB, Out, IsLastLevel, AKVs, BKVs, Step);
|
||||||
end_of_data ->
|
end_of_data ->
|
||||||
hanoidb_reader:close(IXA),
|
hanoidb_reader:close(IXA),
|
||||||
|
@ -186,7 +186,7 @@ scan(IXA, IXB, Out, IsLastLevel, [], BKVs, Step) ->
|
||||||
|
|
||||||
scan(IXA, IXB, Out, IsLastLevel, AKVs, [], Step) ->
|
scan(IXA, IXB, Out, IsLastLevel, AKVs, [], Step) ->
|
||||||
case hanoidb_reader:next_node(IXB) of
|
case hanoidb_reader:next_node(IXB) of
|
||||||
{node, BKVs} ->
|
{kvlist, BKVs} ->
|
||||||
scan(IXA, IXB, Out, IsLastLevel, AKVs, BKVs, Step);
|
scan(IXA, IXB, Out, IsLastLevel, AKVs, BKVs, Step);
|
||||||
end_of_data ->
|
end_of_data ->
|
||||||
hanoidb_reader:close(IXB),
|
hanoidb_reader:close(IXB),
|
||||||
|
@ -248,7 +248,7 @@ scan_only(IX, Out, IsLastLevel, KVs, {N, FromPID}) when N < 1, KVs =/= [] ->
|
||||||
|
|
||||||
scan_only(IX, Out, IsLastLevel, [], {_, FromPID}=Step) ->
|
scan_only(IX, Out, IsLastLevel, [], {_, FromPID}=Step) ->
|
||||||
case hanoidb_reader:next_node(IX) of
|
case hanoidb_reader:next_node(IX) of
|
||||||
{node, KVs} ->
|
{kvlist, KVs} ->
|
||||||
scan_only(IX, Out, IsLastLevel, KVs, Step);
|
scan_only(IX, Out, IsLastLevel, KVs, Step);
|
||||||
end_of_data ->
|
end_of_data ->
|
||||||
case FromPID of
|
case FromPID of
|
||||||
|
|
|
@ -37,24 +37,23 @@
|
||||||
-export([serialize/1, deserialize/1]).
|
-export([serialize/1, deserialize/1]).
|
||||||
|
|
||||||
-record(node, {level :: non_neg_integer(),
|
-record(node, {level :: non_neg_integer(),
|
||||||
members=[] :: list(any()) }).
|
members=[] :: list(any()) | binary() }).
|
||||||
|
|
||||||
-record(index, {file :: file:io_device(),
|
-record(index, {file :: file:io_device(),
|
||||||
root :: #node{} | none,
|
root= none :: #node{} | none,
|
||||||
bloom :: term(),
|
bloom :: term(),
|
||||||
name :: string(),
|
name :: string(),
|
||||||
config=[] :: term() }).
|
config=[] :: term() }).
|
||||||
|
|
||||||
-type read_file() :: #index{}.
|
-type read_file() :: #index{}.
|
||||||
|
-export_type([read_file/0]).
|
||||||
|
|
||||||
-spec open(Name::string()) -> {ok, read_file()} | {error, any()}.
|
-spec open(Name::string()) -> {ok, read_file()} | {error, any()}.
|
||||||
open(Name) ->
|
open(Name) ->
|
||||||
open(Name, [random]).
|
open(Name, [random]).
|
||||||
|
|
||||||
-type config() :: [sequential | folding | random | {atom(), term()}].
|
-type config() :: [sequential | folding | random | {atom(), term()}].
|
||||||
|
|
||||||
-spec open(Name::string(), config()) -> {ok, read_file()} | {error, any()}.
|
-spec open(Name::string(), config()) -> {ok, read_file()} | {error, any()}.
|
||||||
|
|
||||||
open(Name, Config) ->
|
open(Name, Config) ->
|
||||||
case proplists:get_bool(sequential, Config) of
|
case proplists:get_bool(sequential, Config) of
|
||||||
true ->
|
true ->
|
||||||
|
@ -115,11 +114,15 @@ deserialize({seq_read_file, Index, Position}) ->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
fold(Fun, Acc0, #index{file=File}) ->
|
fold(Fun, Acc0, #index{file=File}) ->
|
||||||
{ok, Node} = read_node(File,?FIRST_BLOCK_POS),
|
{ok, Node} = read_node(File,?FIRST_BLOCK_POS),
|
||||||
fold0(File,fun({K,V},Acc) -> Fun(K,V,Acc) end,Node,Acc0).
|
fold0(File,fun({K,V},Acc) -> Fun(K,V,Acc) end,Node,Acc0).
|
||||||
|
|
||||||
fold0(File,Fun,#node{level=0, members=List},Acc0) ->
|
fold0(File,Fun,#node{level=0, members=BinPage},Acc0) when is_binary(BinPage) ->
|
||||||
|
Acc1 = vbisect:foldl(fun(K, V, Acc2) -> Fun({K, decode_binary_value(V)}, Acc2) end,Acc0,BinPage),
|
||||||
|
fold1(File,Fun,Acc1);
|
||||||
|
fold0(File,Fun,#node{level=0, members=List},Acc0) when is_list(List) ->
|
||||||
Acc1 = lists:foldl(Fun,Acc0,List),
|
Acc1 = lists:foldl(Fun,Acc0,List),
|
||||||
fold1(File,Fun,Acc1);
|
fold1(File,Fun,Acc1);
|
||||||
fold0(File,Fun,_InnerNode,Acc0) ->
|
fold0(File,Fun,_InnerNode,Acc0) ->
|
||||||
|
@ -133,22 +136,39 @@ fold1(File,Fun,Acc0) ->
|
||||||
fold0(File,Fun,Node,Acc0)
|
fold0(File,Fun,Node,Acc0)
|
||||||
end.
|
end.
|
||||||
|
|
||||||
-spec range_fold(function(), any(), #index{}, #key_range{}) ->
|
-spec range_fold(fun((binary(),binary(),any()) -> any()), any(), #index{}, #key_range{}) ->
|
||||||
{limit, any(), binary()} | {done, any()}.
|
{limit, any(), binary()} | {done, any()}.
|
||||||
range_fold(Fun, Acc0, #index{file=File,root=Root}, Range) ->
|
range_fold(Fun, Acc0, #index{file=File,root=Root}, Range) ->
|
||||||
case lookup_node(File,Range#key_range.from_key,Root,?FIRST_BLOCK_POS) of
|
case Range#key_range.from_key =< first_key(Root) of
|
||||||
|
true ->
|
||||||
|
{ok, _} = file:position(File, ?FIRST_BLOCK_POS),
|
||||||
|
range_fold_from_here(Fun, Acc0, File, Range, Range#key_range.limit);
|
||||||
|
false ->
|
||||||
|
case find_leaf_node(File,Range#key_range.from_key,Root,?FIRST_BLOCK_POS) of
|
||||||
{ok, {Pos,_}} ->
|
{ok, {Pos,_}} ->
|
||||||
{ok, _} = file:position(File, Pos),
|
{ok, _} = file:position(File, Pos),
|
||||||
do_range_fold(Fun, Acc0, File, Range, Range#key_range.limit);
|
range_fold_from_here(Fun, Acc0, File, Range, Range#key_range.limit);
|
||||||
{ok, Pos} ->
|
{ok, Pos} ->
|
||||||
{ok, _} = file:position(File, Pos),
|
{ok, _} = file:position(File, Pos),
|
||||||
do_range_fold(Fun, Acc0, File, Range, Range#key_range.limit);
|
range_fold_from_here(Fun, Acc0, File, Range, Range#key_range.limit);
|
||||||
none ->
|
none ->
|
||||||
{done, Acc0}
|
{done, Acc0}
|
||||||
|
end
|
||||||
end.
|
end.
|
||||||
|
|
||||||
fold_until_stop(Fun,Acc,List) ->
|
first_key(#node{members=Dict}) ->
|
||||||
fold_until_stop2(Fun, {continue, Acc}, List).
|
{_,FirstKey} = fold_until_stop(fun({K,_},_) -> {stop, K} end, none, Dict),
|
||||||
|
FirstKey.
|
||||||
|
|
||||||
|
fold_until_stop(Fun,Acc,List) when is_list(List) ->
|
||||||
|
fold_until_stop2(Fun, {continue, Acc}, List);
|
||||||
|
fold_until_stop(Fun,Acc0,Bin) when is_binary(Bin) ->
|
||||||
|
vbisect:fold_until_stop(fun({Key,VBin},Acc1) ->
|
||||||
|
% io:format("-> DOING ~p,~p~n", [Key,Acc1]),
|
||||||
|
Fun({Key, decode_binary_value(VBin)}, Acc1)
|
||||||
|
end,
|
||||||
|
Acc0,
|
||||||
|
Bin).
|
||||||
|
|
||||||
fold_until_stop2(_Fun,{stop,Result},_) ->
|
fold_until_stop2(_Fun,{stop,Result},_) ->
|
||||||
{stopped, Result};
|
{stopped, Result};
|
||||||
|
@ -170,7 +190,8 @@ get_value({Value, _TStamp}) ->
|
||||||
get_value(Value) ->
|
get_value(Value) ->
|
||||||
Value.
|
Value.
|
||||||
|
|
||||||
do_range_fold(Fun, Acc0, File, Range, undefined) ->
|
range_fold_from_here(Fun, Acc0, File, Range, undefined) ->
|
||||||
|
% io:format("RANGE_FOLD_FROM_HERE(~p,~p)~n", [Acc0,File]),
|
||||||
case next_leaf_node(File) of
|
case next_leaf_node(File) of
|
||||||
eof ->
|
eof ->
|
||||||
{done, Acc0};
|
{done, Acc0};
|
||||||
|
@ -185,18 +206,19 @@ do_range_fold(Fun, Acc0, File, Range, undefined) ->
|
||||||
false ->
|
false ->
|
||||||
{continue, Fun(Key, get_value(Value), Acc)}
|
{continue, Fun(Key, get_value(Value), Acc)}
|
||||||
end;
|
end;
|
||||||
(_, Acc) ->
|
(_Huh, Acc) ->
|
||||||
|
% io:format("SKIPPING ~p~n", [_Huh]),
|
||||||
{continue, Acc}
|
{continue, Acc}
|
||||||
end,
|
end,
|
||||||
Acc0,
|
Acc0,
|
||||||
Members) of
|
Members) of
|
||||||
{stopped, Result} -> Result;
|
{stopped, Result} -> Result;
|
||||||
{ok, Acc1} ->
|
{ok, Acc1} ->
|
||||||
do_range_fold(Fun, Acc1, File, Range, undefined)
|
range_fold_from_here(Fun, Acc1, File, Range, undefined)
|
||||||
end
|
end
|
||||||
end;
|
end;
|
||||||
|
|
||||||
do_range_fold(Fun, Acc0, File, Range, N0) ->
|
range_fold_from_here(Fun, Acc0, File, Range, N0) ->
|
||||||
case next_leaf_node(File) of
|
case next_leaf_node(File) of
|
||||||
eof ->
|
eof ->
|
||||||
{done, Acc0};
|
{done, Acc0};
|
||||||
|
@ -226,52 +248,74 @@ do_range_fold(Fun, Acc0, File, Range, N0) ->
|
||||||
{continue, Acc}
|
{continue, Acc}
|
||||||
end,
|
end,
|
||||||
{N0, Acc0},
|
{N0, Acc0},
|
||||||
Members) of
|
Members)
|
||||||
{stopped, Result} -> Result;
|
of
|
||||||
|
{stopped, Result} ->
|
||||||
|
Result;
|
||||||
{ok, {N2, Acc1}} ->
|
{ok, {N2, Acc1}} ->
|
||||||
do_range_fold(Fun, Acc1, File, Range, N2)
|
range_fold_from_here(Fun, Acc1, File, Range, N2)
|
||||||
end
|
end
|
||||||
end.
|
end.
|
||||||
|
|
||||||
lookup_node(_File,_FromKey,#node{level=0},Pos) ->
|
find_leaf_node(_File,_FromKey,#node{level=0},Pos) ->
|
||||||
{ok, Pos};
|
{ok, Pos};
|
||||||
lookup_node(File,FromKey,#node{members=Members,level=N},_) ->
|
find_leaf_node(File,FromKey,#node{members=Members,level=N},_) when is_list(Members) ->
|
||||||
case find_start(FromKey, Members) of
|
case find_start(FromKey, Members) of
|
||||||
{ok, ChildPos} when N==1 ->
|
|
||||||
{ok, ChildPos};
|
|
||||||
{ok, ChildPos} ->
|
{ok, ChildPos} ->
|
||||||
case read_node(File,ChildPos) of
|
recursive_find(File, FromKey, N, ChildPos);
|
||||||
{ok, ChildNode} ->
|
|
||||||
lookup_node(File,FromKey,ChildNode,ChildPos);
|
|
||||||
eof ->
|
|
||||||
none
|
|
||||||
end;
|
|
||||||
not_found ->
|
not_found ->
|
||||||
none
|
none
|
||||||
end;
|
end;
|
||||||
lookup_node(_,_,none,_) ->
|
find_leaf_node(File,FromKey,#node{members=Members,level=N},_) when is_binary(Members) ->
|
||||||
|
case vbisect:find_geq(FromKey,Members) of
|
||||||
|
{ok, _, <<?TAG_POSLEN32, Pos:64/unsigned, Len:32/unsigned>>} ->
|
||||||
|
% io:format("** FIND_LEAF_NODE(~p,~p) -> {~p,~p}~n", [FromKey, N, Pos,Len]),
|
||||||
|
recursive_find(File, FromKey, N, {Pos,Len});
|
||||||
|
none ->
|
||||||
|
% io:format("** FIND_LEAF_NODE(~p,~p) -> none~n", [FromKey, N]),
|
||||||
|
none
|
||||||
|
end;
|
||||||
|
find_leaf_node(_,_,none,_) ->
|
||||||
none.
|
none.
|
||||||
|
|
||||||
|
%% Continue the descent towards the leaf that may hold FromKey.
%% Level is the level of the inner node in which ChildPos was found.
%% For Level > 1 the child is read from File and searched in turn
%% (`none' if the read hits eof); for Level 1 the child position is
%% returned as-is without reading it (presumably because the children of
%% a level-1 node are leaves -- confirm against the writer).
recursive_find(File, FromKey, Level, ChildPos) when Level > 1 ->
    case read_node(File, ChildPos) of
        eof ->
            none;
        {ok, ChildNode} ->
            find_leaf_node(File, FromKey, ChildNode, ChildPos)
    end;
recursive_find(_File, _FromKey, 1, ChildPos) ->
    {ok, ChildPos}.
|
||||||
|
|
||||||
|
|
||||||
|
%% used by the merger, needs list value
|
||||||
first_node(#index{file=File}) ->
|
first_node(#index{file=File}) ->
|
||||||
case read_node(File, ?FIRST_BLOCK_POS) of
|
case read_node(File, ?FIRST_BLOCK_POS) of
|
||||||
{ok, #node{level=0, members=Members}} ->
|
{ok, #node{level=0, members=Members}} ->
|
||||||
{node, Members};
|
{kvlist, decode_member_list(Members)};
|
||||||
eof->
|
eof->
|
||||||
none
|
none
|
||||||
end.
|
end.
|
||||||
|
|
||||||
|
%% used by the merger, needs list value
|
||||||
next_node(#index{file=File}=_Index) ->
|
next_node(#index{file=File}=_Index) ->
|
||||||
case next_leaf_node(File) of
|
case next_leaf_node(File) of
|
||||||
{ok, #node{level=0, members=Members}} ->
|
{ok, #node{level=0, members=Members}} ->
|
||||||
{node, Members};
|
{kvlist, decode_member_list(Members)};
|
||||||
% {ok, #node{level=N}} when N>0 ->
|
|
||||||
% next_node(Index);
|
|
||||||
eof ->
|
eof ->
|
||||||
end_of_data
|
end_of_data
|
||||||
end.
|
end.
|
||||||
|
|
||||||
|
%% Return the members of a leaf page as a `[{Key, Value}]' list.
%% A page that is already a list is returned unchanged; a bisect-encoded
%% binary page is walked with vbisect:foldr/3 (so the resulting list keeps
%% the page's key order) and every stored value is decoded on the way.
decode_member_list(BinDict) when is_binary(BinDict) ->
    Prepend = fun(Key, Encoded, Tail) ->
                      [{Key, decode_binary_value(Encoded)} | Tail]
              end,
    vbisect:foldr(Prepend, [], BinDict);
decode_member_list(List) when is_list(List) ->
    List.
|
||||||
|
|
||||||
close(#index{file=undefined}) ->
|
close(#index{file=undefined}) ->
|
||||||
ok;
|
ok;
|
||||||
close(#index{file=File}) ->
|
close(#index{file=File}) ->
|
||||||
|
@ -297,11 +341,20 @@ lookup(#index{file=File, root=Node, bloom=Bloom}, Key) ->
|
||||||
end.
|
end.
|
||||||
|
|
||||||
lookup_in_node(_File,#node{level=0,members=Members}, Key) ->
|
lookup_in_node(_File,#node{level=0,members=Members}, Key) ->
|
||||||
case lists:keyfind(Key,1,Members) of
|
find_in_leaf(Key,Members);
|
||||||
false ->
|
|
||||||
not_found;
|
lookup_in_node(File,#node{members=Members},Key) when is_binary(Members) ->
|
||||||
{_,Value} ->
|
case vbisect:find_geq(Key,Members) of
|
||||||
{ok, Value}
|
{ok, _Key, <<?TAG_POSLEN32, Pos:64, Size:32>>} ->
|
||||||
|
% io:format("FOUND ~p @ ~p~n", [_Key, {Pos,Size}]),
|
||||||
|
case read_node(File,{Pos,Size}) of
|
||||||
|
{ok, Node} ->
|
||||||
|
lookup_in_node(File, Node, Key);
|
||||||
|
eof ->
|
||||||
|
not_found
|
||||||
|
end;
|
||||||
|
none ->
|
||||||
|
not_found
|
||||||
end;
|
end;
|
||||||
|
|
||||||
lookup_in_node(File,#node{members=Members},Key) ->
|
lookup_in_node(File,#node{members=Members},Key) ->
|
||||||
|
@ -416,3 +469,29 @@ next_leaf_node(File) ->
|
||||||
next_leaf_node(File)
|
next_leaf_node(File)
|
||||||
end.
|
end.
|
||||||
|
|
||||||
|
|
||||||
|
%% Look Key up in the members of a leaf page.
%% Members is either a `[{Key, Value}]' list or a bisect-encoded binary;
%% in the binary case the stored value is decoded before it is returned.
%% Returns `{ok, Value}' or `not_found'.
find_in_leaf(Key, Members) when is_list(Members) ->
    case lists:keyfind(Key, 1, Members) of
        false ->
            not_found;
        {_, Value} ->
            {ok, Value}
    end;
find_in_leaf(Key, Page) when is_binary(Page) ->
    case vbisect:find(Key, Page) of
        error ->
            not_found;
        {ok, Encoded} ->
            {ok, decode_binary_value(Encoded)}
    end.
|
||||||
|
|
||||||
|
%% Turn the tagged binary stored in a bisect page back into a term.
%% The first byte is the tag, which selects the layout of the rest:
%%   ?TAG_KV_DATA   -> Value
%%   ?TAG_KV_DATA2  -> {Value, TStamp}        (32-bit TStamp)
%%   ?TAG_DELETED   -> ?TOMBSTONE
%%   ?TAG_DELETED2  -> {?TOMBSTONE, TStamp}   (32-bit TStamp)
%%   ?TAG_POSLEN32  -> {Pos, Len}             (64-bit Pos, 32-bit Len)
%% Any other shape fails with function_clause.

%% plain and timestamped values
decode_binary_value(<<?TAG_KV_DATA, Payload/binary>>) ->
    Payload;
decode_binary_value(<<?TAG_KV_DATA2, TS:32, Payload/binary>>) ->
    {Payload, TS};

%% plain and timestamped tombstones
decode_binary_value(<<?TAG_DELETED>>) ->
    ?TOMBSTONE;
decode_binary_value(<<?TAG_DELETED2, TS:32>>) ->
    {?TOMBSTONE, TS};

%% pointer to another page
decode_binary_value(<<?TAG_POSLEN32, Offset:64, Length:32>>) ->
    {Offset, Length}.
|
||||||
|
|
|
@ -53,14 +53,10 @@
|
||||||
|
|
||||||
-define(ERLANG_ENCODED, 131).
|
-define(ERLANG_ENCODED, 131).
|
||||||
-define(CRC_ENCODED, 127).
|
-define(CRC_ENCODED, 127).
|
||||||
|
-define(BISECT_ENCODED, 126).
|
||||||
|
|
||||||
-define(TAG_KV_DATA, 16#80).
|
|
||||||
-define(TAG_DELETED, 16#81).
|
-define(FILE_ENCODING, bisect).
|
||||||
-define(TAG_POSLEN32, 16#82).
|
|
||||||
-define(TAG_TRANSACT, 16#83).
|
|
||||||
-define(TAG_KV_DATA2, 16#84).
|
|
||||||
-define(TAG_DELETED2, 16#85).
|
|
||||||
-define(TAG_END, 16#FF).
|
|
||||||
|
|
||||||
-compile({inline, [crc_encapsulate/1, crc_encapsulate_kv_entry/2 ]}).
|
-compile({inline, [crc_encapsulate/1, crc_encapsulate_kv_entry/2 ]}).
|
||||||
|
|
||||||
|
@ -143,18 +139,47 @@ uncompress(<<?GZIP_COMPRESSION, Data/binary>>) ->
|
||||||
zlib:gunzip(Data).
|
zlib:gunzip(Data).
|
||||||
|
|
||||||
encode_index_node(KVList, Method) ->
|
encode_index_node(KVList, Method) ->
|
||||||
TermData = [ ?TAG_END |
|
TermData =
|
||||||
|
case ?FILE_ENCODING of
|
||||||
|
bisect ->
|
||||||
|
Binary = vbisect:from_orddict(lists:map(fun binary_encode_kv/1, KVList)),
|
||||||
|
CRC = erlang:crc32(Binary),
|
||||||
|
[?BISECT_ENCODED, <<CRC:32>>, Binary];
|
||||||
|
hanoi2 ->
|
||||||
|
[ ?TAG_END |
|
||||||
lists:map(fun ({Key,Value}) ->
|
lists:map(fun ({Key,Value}) ->
|
||||||
crc_encapsulate_kv_entry(Key, Value)
|
crc_encapsulate_kv_entry(Key, Value)
|
||||||
end,
|
end,
|
||||||
KVList) ],
|
KVList) ]
|
||||||
|
end,
|
||||||
{MethodName, OutData} = compress(Method, TermData),
|
{MethodName, OutData} = compress(Method, TermData),
|
||||||
{ok, [MethodName | OutData]}.
|
{ok, [MethodName | OutData]}.
|
||||||
|
|
||||||
decode_index_node(Level, Data) ->
|
decode_index_node(Level, Data) ->
|
||||||
TermData = uncompress(Data),
|
TermData = uncompress(Data),
|
||||||
{ok, KVList} = decode_kv_list(TermData),
|
case decode_kv_list(TermData) of
|
||||||
{ok, {node, Level, KVList}}.
|
{ok, KVList} ->
|
||||||
|
{ok, {node, Level, KVList}};
|
||||||
|
{bisect, Binary} ->
|
||||||
|
% io:format("[page level=~p~n", [Level]),
|
||||||
|
% vbisect:foldl(fun(K,V,_) -> io:format(" ~p -> ~p,~n", [K,V]) end, 0, Binary),
|
||||||
|
% io:format("]~n",[]),
|
||||||
|
{ok, {node, Level, Binary}}
|
||||||
|
end.
|
||||||
|
|
||||||
|
|
||||||
|
%% Encode one `{Key, Value}' entry as `{Key, TaggedBinary}' for storage in
%% a bisect page. The clause order is significant: the `infinity' and
%% tombstone shapes must be tried before the generic `{Value, TStamp}' and
%% `{Pos, Len}' tuples.
%%   {V, infinity}        -> encoded as plain V
%%   {?TOMBSTONE, TStamp} -> ?TAG_DELETED2 + 32-bit TStamp
%%   ?TOMBSTONE           -> ?TAG_DELETED
%%   {Bin, TStamp}        -> ?TAG_KV_DATA2 + 32-bit TStamp + Bin
%%   Bin                  -> ?TAG_KV_DATA + Bin
%%   {Pos, Len}           -> ?TAG_POSLEN32 + 64-bit Pos + 32-bit Len
binary_encode_kv({K, {V, infinity}}) ->
    binary_encode_kv({K, V});
binary_encode_kv({K, {?TOMBSTONE, TS}}) ->
    Encoded = <<?TAG_DELETED2, TS:32>>,
    {K, Encoded};
binary_encode_kv({K, ?TOMBSTONE}) ->
    Encoded = <<?TAG_DELETED>>,
    {K, Encoded};
binary_encode_kv({K, {V, TS}}) when is_binary(V) ->
    Encoded = <<?TAG_KV_DATA2, TS:32, V/binary>>,
    {K, Encoded};
binary_encode_kv({K, V}) when is_binary(V) ->
    Encoded = <<?TAG_KV_DATA, V/binary>>,
    {K, Encoded};
binary_encode_kv({K, {Offset, Length}}) when Length < 16#ffffffff ->
    Encoded = <<?TAG_POSLEN32, Offset:64/unsigned, Length:32/unsigned>>,
    {K, Encoded}.
|
||||||
|
|
||||||
|
|
||||||
-spec crc_encapsulate_kv_entry(binary(), expvalue()) -> iolist().
|
-spec crc_encapsulate_kv_entry(binary(), expvalue()) -> iolist().
|
||||||
|
@ -193,7 +218,14 @@ decode_kv_list(<<?TAG_END, Custom/binary>>) ->
|
||||||
decode_kv_list(<<?ERLANG_ENCODED, _/binary>>=TermData) ->
|
decode_kv_list(<<?ERLANG_ENCODED, _/binary>>=TermData) ->
|
||||||
{ok, erlang:term_to_binary(TermData)};
|
{ok, erlang:term_to_binary(TermData)};
|
||||||
decode_kv_list(<<?CRC_ENCODED, Custom/binary>>) ->
|
decode_kv_list(<<?CRC_ENCODED, Custom/binary>>) ->
|
||||||
decode_crc_data(Custom, [], []).
|
decode_crc_data(Custom, [], []);
|
||||||
|
decode_kv_list(<<?BISECT_ENCODED, CRC:32/unsigned, Binary/binary>>) ->
|
||||||
|
CRCTest = erlang:crc32( Binary ),
|
||||||
|
if CRC == CRCTest ->
|
||||||
|
{bisect, Binary};
|
||||||
|
true ->
|
||||||
|
{bisect, vbisect:from_orddict([])}
|
||||||
|
end.
|
||||||
|
|
||||||
-spec decode_crc_data(binary(), list(), list()) -> {ok, [kventry()]} | {partial, [kventry()], iolist()}.
|
-spec decode_crc_data(binary(), list(), list()) -> {ok, [kventry()]} | {partial, [kventry()], iolist()}.
|
||||||
decode_crc_data(<<>>, [], Acc) ->
|
decode_crc_data(<<>>, [], Acc) ->
|
||||||
|
|
260
src/vbisect.erl
Normal file
260
src/vbisect.erl
Normal file
|
@ -0,0 +1,260 @@
|
||||||
|
|
||||||
|
|
||||||
|
-module(vbisect).
|
||||||
|
|
||||||
|
-export([from_orddict/1,
|
||||||
|
from_gb_tree/1,
|
||||||
|
to_gb_tree/1,
|
||||||
|
first_key/1,
|
||||||
|
find/2, find_geq/2,
|
||||||
|
foldl/3, foldr/3, fold_until_stop/3,
|
||||||
|
to_orddict/1,
|
||||||
|
merge/3]).
|
||||||
|
|
||||||
|
-define(MAGIC, "vbis").
|
||||||
|
-type key() :: binary().
|
||||||
|
-type value() :: binary().
|
||||||
|
-type bindict() :: binary().
|
||||||
|
|
||||||
|
-ifdef(TEST).
|
||||||
|
-include_lib("eunit/include/eunit.hrl").
|
||||||
|
-endif.
|
||||||
|
|
||||||
|
%% @doc Serialize a gb_trees tree with binary keys and values into a
%% bindict: the magic tag, a 32-bit entry count, then the recursively
%% encoded nodes. The tree is taken apart as its `{Count, Root}' tuple,
%% and only trees with at most 16#ffffffff entries are accepted.
-spec from_gb_tree(gb_trees:tree()) -> bindict().
from_gb_tree({EntryCount, Root}) when EntryCount =< 16#ffffffff ->
    {_TotalBytes, Encoded} = encode_gb_node(Root),
    Header = <<?MAGIC, EntryCount:32/unsigned>>,
    erlang:iolist_to_binary([Header, Encoded]).
|
||||||
|
|
||||||
|
%% Encode one gb_trees node `{Key, Value, Smaller, Bigger}' (or `nil').
%% Returns `{ByteSize, IOList}', where ByteSize is the size in bytes of
%% the flattened IOList. Layout of one node:
%%
%%   KeySize:16, Key, SmallerSize:32, <smaller subtree>,
%%   ValueSize:32, Value, <bigger subtree>
%%
%% The key length is stored in 16 bits and the value length in 32 bits.
%% A longer key or value would have its length silently truncated and so
%% corrupt the page; the guards make such input fail with function_clause
%% instead.
encode_gb_node({Key, Value, Smaller, Bigger})
  when is_binary(Key), is_binary(Value),
       byte_size(Key) =< 16#ffff,
       byte_size(Value) =< 16#ffffffff ->
    {BinSizeSmaller, IOSmaller} = encode_gb_node(Smaller),
    {BinSizeBigger, IOBigger} = encode_gb_node(Bigger),

    KeySize = byte_size(Key),
    ValueSize = byte_size(Value),

    %% 2 = key length field, 4 = value length field,
    %% 4 = smaller-subtree length field
    TotalSize = 2 + KeySize
        + 4 + ValueSize
        + 4 + BinSizeSmaller
        + BinSizeBigger,

    {TotalSize,
     [<<KeySize:16, Key/binary, BinSizeSmaller:32>>,
      IOSmaller,
      <<ValueSize:32, Value/binary>> | IOBigger]};

encode_gb_node(nil) ->
    {0, []}.
|
||||||
|
|
||||||
|
%% Rebuild the `{Count, RootNode}' tuple from a bindict: the entry count
%% is read from the header and the node data is decoded recursively.
to_gb_tree(<<?MAGIC, EntryCount:32, Encoded/binary>>) ->
    Root = to_gb_node(Encoded),
    {EntryCount, Root}.
|
||||||
|
|
||||||
|
%% Decode one serialized node, and recursively its two subtrees, into a
%% `{Key, Value, Smaller, Bigger}' tuple. An empty binary is the empty
%% subtree `nil'.
to_gb_node(<<KLen:16, K:KLen/binary,
             LeftLen:32, Left:LeftLen/binary,
             VLen:32, V:VLen/binary,
             Right/binary>>) ->
    LeftNode = to_gb_node(Left),
    RightNode = to_gb_node(Right),
    {K, V, LeftNode, RightNode};
to_gb_node(<<>>) ->
    nil.
|
||||||
|
|
||||||
|
%% @doc Exact-match lookup of Key in a bindict.
%% Returns `{ok, Value}' when the key is present and `error' otherwise.
-spec find(Key::key(), Dict::bindict()) ->
          {ok, value()} | error.
find(Key, <<?MAGIC, _Count:32, Nodes/binary>>) ->
    KeyLen = byte_size(Key),
    find_node(KeyLen, Key, Nodes).
|
||||||
|
|
||||||
|
%% Binary-search one encoded subtree for Key.
%% The head only measures the node's fields; the subtree to descend into
%% is then sliced out of the node by offset:
%%   6  = key length field (2) + smaller-subtree length field (4)
%%   10 = the above + value length field (4)
%% KeySize is only passed along to the recursive calls.
find_node(_KeySize, _Key, <<>>) ->
    error;
find_node(KeySize, Key, <<HereKeySize:16, HereKey:HereKeySize/binary,
                          SmallerSize:32, _:SmallerSize/binary,
                          ValueSize:32, Value:ValueSize/binary,
                          _/binary>> = Node) ->
    case HereKey of
        Key ->
            {ok, Value};
        _ when Key < HereKey ->
            Left = binary_part(Node, 6 + HereKeySize, SmallerSize),
            find_node(KeySize, Key, Left);
        _ ->
            Offset = 10 + HereKeySize + SmallerSize + ValueSize,
            Right = binary_part(Node, Offset, byte_size(Node) - Offset),
            find_node(KeySize, Key, Right)
    end.
|
||||||
|
|
||||||
|
%% Convert a bindict into a `[{Key, Value}]' list. The list is built by a
%% right fold that prepends each entry, so it comes out in ascending key
%% order.
to_orddict(BinDict) ->
    Prepend = fun(K, V, Pairs) -> [{K, V} | Pairs] end,
    foldr(Prepend, [], BinDict).
|
||||||
|
|
||||||
|
%% Merge two bindicts by decoding both into orddicts, merging them with
%% orddict:merge/3 (Fun resolves keys present in both), and re-encoding
%% the result.
merge(Fun, BinDict1, BinDict2) ->
    Merged = orddict:merge(Fun,
                           to_orddict(BinDict1),
                           to_orddict(BinDict2)),
    from_orddict(Merged).
|
||||||
|
|
||||||
|
%% @doc Return the first key visited by fold_until_stop/3, i.e. the
%% smallest key in the dictionary, or `none' when the dictionary is empty.
-spec first_key( bindict() ) -> binary() | none.
first_key(BinDict) ->
    StopAtFirst = fun({K, _V}, _Acc) -> {stop, K} end,
    case fold_until_stop(StopAtFirst, none, BinDict) of
        {stopped, FirstKey} ->
            FirstKey;
        {ok, Untouched} ->
            %% nothing was visited: the initial accumulator comes back
            Untouched
    end.
|
||||||
|
|
||||||
|
%% @doc Find the entry `{K,V}' with the largest K that is smaller than or
%% equal to Key. Note that, despite the "geq" in the name, this is a
%% floor lookup. It is good for an inner node, where each key is the
%% smallest key in the child node.
%% Returns `none' when every key in the dictionary is greater than Key.

-spec find_geq(Key::binary(), Binary::binary()) ->
          none | {ok, Key::key(), Value::value()}.

find_geq(Key, <<?MAGIC, _Count:32, Nodes/binary>>) ->
    KeyLen = byte_size(Key),
    find_geq_node(KeyLen, Key, Nodes, none).
|
||||||
|
|
||||||
|
%% Search one encoded subtree for the largest key that is =< Key.
%% Best carries the closest smaller entry seen so far on the way down and
%% is the answer once an empty subtree is reached. Offsets:
%%   6  = key length field (2) + smaller-subtree length field (4)
%%   10 = the above + value length field (4)
find_geq_node(_KeySize, _Key, <<>>, Best) ->
    Best;

find_geq_node(KeySize, Key, <<HereKeySize:16, HereKey:HereKeySize/binary,
                              SmallerSize:32, _:SmallerSize/binary,
                              ValueSize:32, Value:ValueSize/binary,
                              _/binary>> = Node, Best) ->
    Candidate = {ok, HereKey, Value},
    case HereKey of
        Key ->
            Candidate;
        _ when Key < HereKey ->
            %% this node is too big; whatever we had before still stands
            Left = binary_part(Node, 6 + HereKeySize, SmallerSize),
            find_geq_node(KeySize, Key, Left, Best);
        _ ->
            %% this node is smaller than Key, so it is the new best guess
            Offset = 10 + HereKeySize + SmallerSize + ValueSize,
            Right = binary_part(Node, Offset, byte_size(Node) - Offset),
            find_geq_node(KeySize, Key, Right, Candidate)
    end.
|
||||||
|
|
||||||
|
%% @doc Fold Fun over all entries of a bindict in ascending key order.
%% Fun is called as `Fun(Key, Value, AccIn)' and returns the next
%% accumulator.
-spec foldl(fun((Key::key(), Value::value(), Acc::term()) -> term()), term(), bindict()) ->
          term().
foldl(Fun, Acc0, <<?MAGIC, _Count:32, Nodes/binary>>) ->
    foldl_node(Fun, Acc0, Nodes).
|
||||||
|
|
||||||
|
%% In-order traversal of one encoded subtree: smaller subtree first, then
%% this node's own entry, then the bigger subtree.
foldl_node(Fun, Acc0, <<KLen:16, K:KLen/binary,
                        LeftLen:32, Left:LeftLen/binary,
                        VLen:32, V:VLen/binary,
                        Right/binary>>) ->
    AccLeft = foldl_node(Fun, Acc0, Left),
    foldl_node(Fun, Fun(K, V, AccLeft), Right);
foldl_node(_Fun, Acc, <<>>) ->
    Acc.
|
||||||
|
|
||||||
|
|
||||||
|
%% @doc Fold over a bindict in ascending key order with early exit.
%% Fun is called as `Fun({Key, Value}, Acc)' and returns either
%% `{continue, Acc1}' or `{stop, Result}'. The fold returns
%% `{stopped, Result}' if Fun stopped it, and `{ok, Acc}' if every entry
%% was visited.
-spec fold_until_stop(function(), term(), bindict()) -> {stopped, term()} | {ok, term()}.

fold_until_stop(Fun, Acc0, <<?MAGIC, _Count:32, Nodes/binary>>) ->
    fold_until_stop2(Fun, {continue, Acc0}, Nodes).
|
||||||
|
|
||||||
|
%% Worker for fold_until_stop/3. The second argument is the tagged state
%% returned by the previous callback: `{stop, Result}' ends the fold at
%% once; `{continue, Acc}' goes on with an in-order walk of the subtree.
fold_until_stop2(_Fun, {stop, Result}, _Rest) ->
    {stopped, Result};
fold_until_stop2(_Fun, {continue, Acc}, <<>>) ->
    {ok, Acc};
fold_until_stop2(Fun, {continue, _} = State,
                 <<KLen:16, K:KLen/binary,
                   LeftLen:32, Left:LeftLen/binary,
                   VLen:32, V:VLen/binary,
                   Right/binary>>) ->
    case fold_until_stop2(Fun, State, Left) of
        {ok, AccLeft} ->
            %% the smaller subtree ran to completion: visit this entry
            %% and let the callback decide whether to go on
            Next = Fun({K, V}, AccLeft),
            fold_until_stop2(Fun, Next, Right);
        {stopped, _} = Stopped ->
            Stopped
    end.
|
||||||
|
|
||||||
|
|
||||||
|
%% @doc Fold Fun over all entries of a bindict in descending key order.
%% Fun is called as `Fun(Key, Value, AccIn)' and returns the next
%% accumulator.
-spec foldr(fun((Key::key(), Value::value(), Acc::term()) -> term()), term(), bindict()) ->
          term().
foldr(Fun, Acc0, <<?MAGIC, _Count:32, Nodes/binary>>) ->
    foldr_node(Fun, Acc0, Nodes).
|
||||||
|
|
||||||
|
%% Reverse in-order traversal of one encoded subtree: bigger subtree
%% first, then this node's own entry, then the smaller subtree.
foldr_node(Fun, Acc0, <<KLen:16, K:KLen/binary,
                        LeftLen:32, Left:LeftLen/binary,
                        VLen:32, V:VLen/binary,
                        Right/binary>>) ->
    AccRight = foldr_node(Fun, Acc0, Right),
    foldr_node(Fun, Fun(K, V, AccRight), Left);
foldr_node(_Fun, Acc, <<>>) ->
    Acc.
|
||||||
|
|
||||||
|
|
||||||
|
%% Build a bindict from an orddict (a key-sorted `[{Key, Value}]' list)
%% by first building a gb_trees tree from it and then serializing that
%% tree.
from_orddict(OrdDict) ->
    Tree = gb_trees:from_orddict(OrdDict),
    from_gb_tree(Tree).
|
||||||
|
|
||||||
|
-ifdef(TEST).
|
||||||
|
|
||||||
|
%% Read benchmark, wrapped in a 600 second EUnit timeout: builds a
%% dictionary of N+1 sequential 64-bit keys, picks 1000 random keys to
%% read, and hands both to time_reads/3.
speed_test_() ->
    Benchmark =
        fun() ->
                Start = 100000000000000,
                N = 100000,
                Keys = lists:seq(Start, Start + N),
                KeyValuePairs = [{<<I:64/integer>>, <<255:8/integer>>} || I <- Keys],

                %% Will mostly be unique, if N is bigger than 10000
                PickKey = fun(_) ->
                                  <<(lists:nth(random:uniform(N), Keys)):64/integer>>
                          end,
                ReadKeys = lists:map(PickKey, lists:seq(1, 1000)),
                B = from_orddict(KeyValuePairs),
                time_reads(B, N, ReadKeys)
        end,
    {timeout, 600, Benchmark}.
|
||||||
|
|
||||||
|
|
||||||
|
%% find_geq/2 must return the entry with the largest key =< the probe,
%% and `none' when the probe is below every key.
geq_test() ->
    B = from_orddict([{<<2>>,<<2>>},{<<4>>,<<4>>},{<<6>>,<<6>>},{<<122>>,<<122>>}]),
    none = find_geq(<<1>>, B),
    %% {Probe, ExpectedFloor}; every value in B equals its key
    Expectations = [{<<2>>,   <<2>>},
                    {<<3>>,   <<2>>},
                    {<<5>>,   <<4>>},
                    {<<100>>, <<6>>},
                    {<<150>>, <<122>>}],
    lists:foreach(fun({Probe, Floor}) ->
                          {ok, Floor, Floor} = find_geq(Probe, B)
                  end,
                  Expectations),
    true.
|
||||||
|
|
||||||
|
|
||||||
|
%% Time 20 rounds of looking up every key of ReadKeys in dictionary B and
%% log the statistics through error_logger. The measurement runs in a
%% separate process; the caller waits for its `done' message for at most
%% one second and returns `ok' either way.
time_reads(B, Size, ReadKeys) ->
    Caller = self(),
    Benchmark =
        fun() ->
                Runs = 20,
                %% one timed pass over all read keys, in microseconds
                OneRun = fun() ->
                                 T0 = now(),
                                 find_many(B, ReadKeys),
                                 timer:now_diff(now(), T0)
                         end,
                Timings = [OneRun() || _ <- lists:seq(1, Runs)],

                MeanUs = lists:sum(Timings) / length(Timings),
                Rps = 1000000 / (MeanUs / 1000),
                error_logger:info_msg("Average over ~p runs, ~p keys in dict~n"
                                      "Average fetch ~p keys: ~p us, max: ~p us~n"
                                      "Average fetch 1 key: ~p us~n"
                                      "Theoretical sequential RPS: ~w~n",
                                      [Runs, Size, length(ReadKeys),
                                       MeanUs,
                                       lists:max(Timings),
                                       MeanUs / length(ReadKeys),
                                       trunc(Rps)]),

                Caller ! done
        end,
    spawn(Benchmark),
    receive done -> ok after 1000 -> ok end.
|
||||||
|
|
||||||
|
%% Look up every key of Keys in B and return how many of them were found.
-spec find_many(bindict(), [key()]) -> non_neg_integer().
find_many(B, Keys) ->
    %% the `{ok, _}' generator keeps the hits and drops the `error' results
    length([hit || K <- Keys, {ok, _} <- [find(K, B)]]).
|
||||||
|
|
||||||
|
-endif.
|
Loading…
Reference in a new issue