Add merkle library #48
3 changed files with 10 additions and 64 deletions
|
@ -6,7 +6,6 @@
|
|||
|
||||
{deps, [
|
||||
{lager, ".*", {git, "git://github.com/basho/lager.git", {tag, "2.2.0"}}},
|
||||
{merklet, ".*", {git, "https://github.com/ferd/merklet.git", {branch, "master"}}},
|
||||
{protobuffs, "0.8.*", {git, "git://github.com/basho/erlang_protobuffs.git", {tag, "0.8.1p4"}}},
|
||||
{riak_dt, ".*", {git, "git://github.com/basho/riak_dt.git", {branch, "develop"}}},
|
||||
{node_package, ".*", {git, "git://github.com/basho/node_package.git", {branch, "develop"}}},
|
||||
|
|
|
@ -21,12 +21,6 @@
|
|||
%% @doc Creates a Merkle tree per file based on the checksum data for
|
||||
%% a given data file.
|
||||
%%
|
||||
%% Has selectable backend, chosen at open.
|
||||
%%
|
||||
%% The default 'merklet' implementation uses the `merklet' library. Keys are
|
||||
%% encoded as `<<Offset:64, Size:32>>'; values are encoded as `<<Tag:8, Csum/binary>>'
|
||||
%% *or* as `<<0>>' for unwritten bytes, or `<<1>>' for trimmed bytes.
|
||||
%%
|
||||
%% The `naive' implementation representation is:
|
||||
%%
|
||||
%% `<<Offset:64, Size:32, 0>>' for unwritten bytes
|
||||
|
@ -35,8 +29,8 @@
|
|||
%%
|
||||
%% The tree feeds these leaf nodes into hashes representing chunks of a minimum
|
||||
%% size of 1024 KB (1 MB), but if the file size is larger, we will try
|
||||
%% to get about 100 chunks for what is called "Level 1." We aim for around 10 hashes at
|
||||
%% level 2, and then 2 hashes at level 3, and finally the root.
|
||||
%% to get about 100 chunks for the first rollup "Level 1." We aim for around 10
|
||||
%% hashes at level 2, and then 2 hashes at level 3, and finally the root.
|
||||
|
||||
-module(machi_merkle_tree).
|
||||
|
||||
|
@ -57,12 +51,8 @@
|
|||
|
||||
-define(TRIMMED, <<1>>).
|
||||
-define(UNWRITTEN, <<0>>).
|
||||
-define(ENCODE(Offset, Size), <<Offset:64/unsigned-big, Size:32/unsigned-big>>).
|
||||
-define(NAIVE_ENCODE(Offset, Size, Data), <<Offset:64/unsigned-big, Size:32/unsigned-big, Data/binary>>).
|
||||
|
||||
-define(NEW_MERKLET, undefined).
|
||||
-define(TIMEOUT, (10*1000)).
|
||||
|
||||
-define(MINIMUM_CHUNK, 1048576). %% 1024 * 1024
|
||||
-define(LEVEL_SIZE, 10).
|
||||
-define(H, sha).
|
||||
|
@ -70,13 +60,12 @@
|
|||
%% public API
|
||||
|
||||
open(Filename, DataDir) ->
|
||||
open(Filename, DataDir, merklet).
|
||||
open(Filename, DataDir, naive).
|
||||
|
||||
%% @doc Opens the Merkle tree for `Filename' using the backend named by `Type'.
%%
%% The tree itself is produced by load_filename/3; the result is wrapped in an
%% `#mt{}' record that remembers the filename and the backend that built it.
%% Always returns `{ok, #mt{}}' when load_filename/3 returns.
open(Filename, DataDir, Type) ->
    {ok, #mt{ filename = Filename,
              tree = load_filename(Filename, DataDir, Type),
              backend = Type }}.
|
||||
|
||||
tree(#mt{ tree = T, backend = merklet }) -> T;
|
||||
tree(#mt{ tree = T, backend = naive }) ->
|
||||
case T#naive.recalc of
|
||||
true -> build_tree(T);
|
||||
|
@ -90,19 +79,11 @@ diff(#mt{backend = naive, tree = T1}, #mt{backend = naive, tree = T2}) ->
|
|||
true -> same;
|
||||
false -> naive_diff(T1, T2)
|
||||
end;
|
||||
diff(#mt{backend = merklet, tree = T1}, #mt{backend = merklet, tree = T2}) ->
|
||||
case merklet:diff(T1, T2) of
|
||||
[] -> same;
|
||||
Diff -> Diff
|
||||
end;
|
||||
diff(_, _) -> error(badarg).
|
||||
|
||||
%% private
|
||||
|
||||
% @private
|
||||
load_filename(Filename, DataDir, merklet) ->
|
||||
{_Last, M} = do_load(Filename, DataDir, fun insert_csum/2, ?NEW_MERKLET),
|
||||
M;
|
||||
load_filename(Filename, DataDir, naive) ->
|
||||
{Last, M} = do_load(Filename, DataDir, fun insert_csum_naive/2, []),
|
||||
ChunkSize = max(?MINIMUM_CHUNK, Last div 100),
|
||||
|
@ -117,16 +98,6 @@ do_load(Filename, DataDir, FoldFun, AccInit) ->
|
|||
Acc.
|
||||
|
||||
% @private
%% Adds one `{Offset, Size, Csum}' checksum entry to the merklet accumulator
%% `{Last, MT}', where `Last' is the offset at which the previous entry ended.
%% Passed to do_load/4 as its fold function by load_filename/3.
%%
%% Returns `{NextLast, NewMT}' where `NextLast' is the end of the entry just
%% inserted.
insert_csum({Offset, Size, _Csum}=In, {Last, MT}) ->
    case Offset =:= Last of
        true ->
            %% no gap here, insert a record
            {Last+Size, update_merkle_tree(In, MT)};
        false ->
            %% gap here, insert unwritten record
            %% *AND* insert written record
            Hole = Offset - Last,
            MT0 = update_merkle_tree({Last, Hole, unwritten}, MT),
            {Offset+Size, update_merkle_tree(In, MT0)}
    end.
|
||||
|
||||
insert_csum_naive({Last, Size, _Csum}=In, {Last, MT}) ->
|
||||
%% no gap
|
||||
{Last+Size, update_acc(In, MT)};
|
||||
|
@ -136,32 +107,20 @@ insert_csum_naive({Offset, Size, _Csum}=In, {Last, MT}) ->
|
|||
{Offset+Size, update_acc(In, MT0)}.
|
||||
|
||||
% @private
%% Inserts one byte range into the merklet tree `MT'.
%%
%% The key is `?ENCODE(Offset, Size)'. The value is `?UNWRITTEN' or `?TRIMMED'
%% when the third element is the atom `unwritten' or `trimmed'; any other
%% third element is stored unchanged as the value.
update_merkle_tree({Offset, Size, Csum}, MT) ->
    Value = case Csum of
                unwritten -> ?UNWRITTEN;
                trimmed -> ?TRIMMED;
                _ -> Csum
            end,
    merklet:insert({?ENCODE(Offset, Size), Value}, MT).
|
||||
|
||||
update_acc({Offset, Size, unwritten}, MT) ->
|
||||
[ {Offset, Size, ?NAIVE_ENCODE(Offset, Size, ?UNWRITTEN)} | MT ];
|
||||
update_acc({Offset, Size, trimmed}, MT) ->
|
||||
[ {Offset, Size, ?NAIVE_ENCODE(Offset, Size, ?TRIMMED)} | MT ];
|
||||
update_acc({Offset, Size, Csum}, MT) ->
|
||||
update_acc({Offset, Size, <<_Tag:8, Csum/binary>>}, MT) ->
|
||||
[ {Offset, Size, ?NAIVE_ENCODE(Offset, Size, Csum)} | MT ].
|
||||
|
||||
%% @private
%% Recomputes every hash level of a naive tree from its leaves.
%%
%% Level 1 comes from build_level_1/4 (leaves plus the tree's chunk size),
%% levels 2 and 3 come from build_int_level/4 applied to the level below, and
%% the root comes from build_root/2 applied to level 3. Each stage is seeded
%% with a fresh `crypto:hash_init(?H)' context. The returned record carries the
%% new root and levels and has `recalc' reset to `false'.
build_tree(MT = #naive{ leaves = L, chunk_size = ChunkSize }) ->
    FreshCtx = fun() -> crypto:hash_init(?H) end,
    lager:debug("Leaves: ~p~n", [L]),

    Level1 = build_level_1(ChunkSize, L, 1, [ FreshCtx() ]),
    lager:debug("Lvl1: ~p~n", [Level1]),

    %% NOTE(review): this is 0 when there are fewer than ?LEVEL_SIZE level-1
    %% hashes -- confirm build_int_level/4 copes with a group size of 0.
    PerLevel2 = length(Level1) div ?LEVEL_SIZE,
    Level2 = build_int_level(PerLevel2, Level1, 1, [ FreshCtx() ]),
    lager:debug("Lvl2: ~p~n", [Level2]),

    PerLevel3 = length(Level2) div 2,
    Level3 = build_int_level(PerLevel3, Level2, 1, [ FreshCtx() ]),
    lager:debug("Lvl3: ~p~n", [Level3]),

    RootHash = build_root(Level3, FreshCtx()),
    lager:debug("Root: ~p~n", [RootHash]),

    MT#naive{ root = RootHash,
              lvl1 = Level1,
              lvl2 = Level2,
              lvl3 = Level3,
              recalc = false }.
|
||||
|
||||
build_root([], Ctx) ->
|
||||
|
@ -189,19 +148,9 @@ build_level_1(Size, [{Pos, Len, Hash}|T], Multiple, [ Ctx | Rest ])
|
|||
build_level_1(Size, T, Multiple, [ crypto:hash_update(Ctx, Hash) | Rest ]).
|
||||
|
||||
%% @private
%% Compares the level-1 hashes of two naive trees position by position.
%%
%% Each level-1 hash is paired with its 1-based position, and the result lists
%% every `{Position, Hash}' pair of the first tree that has no identical pair
%% in the second tree. Each such pair is reported as
%% `{(Position-1)*CS2, CS2, Hash}', where `CS2' is the chunk size of the
%% second tree.
naive_diff(#naive{lvl1 = L1}, #naive{lvl1=L2, chunk_size=CS2}) ->
    lager:debug("naive diff: Our lvl1: ~p~n", [L1]),
    lager:debug("naive diff: Their chunk size: ~p, lvl1: ~p~n", [CS2, L2]),

    %% Number the hashes 1..N. lists:seq/2 builds the index list, which is
    %% then zipped with the hashes; the index list and the hash list must be
    %% two separate arguments to lists:zip/2.
    Set1 = gb_sets:from_list(lists:zip(lists:seq(1, length(L1)), L1)),
    Set2 = gb_sets:from_list(lists:zip(lists:seq(1, length(L2)), L2)),

    %% The byte ranges whose level-1 hash differs between the two lists.
    %% NOTE(review): the subtraction keeps the entries of list 1 that are
    %% missing from list 2; confirm this is the intended direction.
    %%
    %% We have to decide what to do now - should we filter the
    %% leaf nodes using these ranges and find specific divergence
    %% between Tree1 and Tree2?
    %%
    %% Or should we do something else?
    [ {(X-1)*CS2, CS2, SHA} || {X, SHA} <- gb_sets:to_list(gb_sets:subtract(Set1, Set2)) ].
|
||||
|
||||
|
||||
|
|
|
@ -26,9 +26,13 @@
|
|||
-include_lib("eunit/include/eunit.hrl").
|
||||
-include_lib("kernel/include/file.hrl").
|
||||
|
||||
-define(TESTFILE, "yza^4c784dc2-19bf-4ac6-91f6-58bbe5aa88e0^1").
|
||||
-define(GAP_CHANCE, 0.10).
|
||||
|
||||
%% unit tests
|
||||
|
||||
|
||||
%% Define BENCH (or remove these ifdefs) if benchmarking is desired.
|
||||
-ifdef(BENCH).
|
||||
choose_filename() ->
|
||||
random_from_list([
|
||||
"def^c5ea7511-d649-47d6-a8c3-2b619379c237^1",
|
||||
|
@ -140,7 +144,6 @@ torture_test(C) ->
|
|||
ok = file:close(F).
|
||||
|
||||
run_torture_test() ->
|
||||
{MTime, M} = timer:tc(fun() -> merklet_torture() end),
|
||||
{NTime, N} = timer:tc(fun() -> naive_torture() end),
|
||||
|
||||
MSize = byte_size(term_to_binary(M)),
|
||||
|
@ -148,12 +151,6 @@ run_torture_test() ->
|
|||
|
||||
{MSize, MTime, NSize, NTime}.
|
||||
|
||||
%% Builds a merklet tree from every entry produced by torture_generator/0,
%% starting from `undefined'. Each entry is keyed by its offset and size packed
%% as `<<Offset:64, Size:32>>', with the entry's hash as the value.
merklet_torture() ->
    Insert = fun({O, S, Sha}, Acc) ->
                     Key = <<O:64/unsigned-big, S:32/unsigned-big>>,
                     merklet:insert({Key, Sha}, Acc)
             end,
    lists:foldl(Insert, undefined, torture_generator()).
|
||||
|
||||
naive_torture() ->
|
||||
N = lists:foldl(fun(T, Acc) -> machi_merkle_tree:update_acc(T, Acc) end, [], torture_generator()),
|
||||
T = #naive{ leaves = lists:reverse(N), chunk_size = 10010, recalc = true },
|
||||
|
@ -161,3 +158,4 @@ naive_torture() ->
|
|||
|
||||
%% Generates the benchmark input: one `{Offset, 1, Sha}' triple for every
%% offset from 1024 to 1000000 inclusive.
%%
%% The hash input pairs the offset with os:timestamp/0 instead of using the
%% deprecated now/0: the offset keeps every input distinct within a run, and
%% the timestamp makes the hashes differ between runs.
torture_generator() ->
    [ {O, 1, crypto:hash(sha, term_to_binary({O, os:timestamp()}))} || O <- lists:seq(1024, 1000000) ].
|
||||
-endif. % BENCH
|
||||
|
|
Loading…
Reference in a new issue