Incremental merge refactor, step #2

Incremental merge now uses a new strategy. Instead of
doing the same amount of merge work at every level, we
compute the total merge workload, do as much of it as
possible at the first level, subtract the work done
there, and delegate the remainder to the next level,
and so on.

The effect is that we do more I/O on fewer files,
making the workload of the incremental merge more
sequential.
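
Illustration (not part of the patch): on a nursery flush the top level is now
asked for a total budget of (MaxLevel - ?TOP_LEVEL + 1) * ?BTREE_SIZE(?TOP_LEVEL)
units of merge work, and each level keeps what it still owes and delegates the
rest downward. The helper below is a hypothetical sketch that mirrors the
WorkLeftHere / WorkToDoHere / DelegateWork computation added to
hanoi_level:main_loop/1 in this commit; it is not code from the patch.

%% Hypothetical sketch of the per-level work split (not in the patch).
%% TotalUnmergedHere corresponds to total_unmerged(State) for this level,
%% WorkDoneHere to the level's new work_done counter.
split_step(HowMuch, TotalUnmergedHere, WorkDoneHere) ->
    WorkLeftHere = max(0, TotalUnmergedHere - WorkDoneHere),
    WorkToDoHere = min(WorkLeftHere, HowMuch),
    DelegateWork = max(0, HowMuch - WorkToDoHere),
    {WorkToDoHere, DelegateWork}.

For example, split_step(1024, 512, 256) returns {256, 768}: this level sends
256 units to its merge worker and delegates the remaining 768 units to the
next level.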
Kresten Krab Thorup 2012-04-24 00:31:28 +02:00
parent 9fb8a5e73f
commit ff36e401b7
3 changed files with 72 additions and 47 deletions

View file

@@ -197,13 +197,13 @@ init([Dir, Opts]) ->
     case file:read_file_info(Dir) of
         {ok, #file_info{ type=directory }} ->
             {ok, TopLevel, MaxLevel} = open_levels(Dir,Opts),
-            {ok, Nursery} = hanoi_nursery:recover(Dir, TopLevel);
+            {ok, Nursery} = hanoi_nursery:recover(Dir, TopLevel, MaxLevel);
         {error, E} when E =:= enoent ->
             ok = file:make_dir(Dir),
             {ok, TopLevel} = hanoi_level:open(Dir, ?TOP_LEVEL, undefined, Opts, self()),
             MaxLevel = ?TOP_LEVEL,
-            {ok, Nursery} = hanoi_nursery:new(Dir)
+            {ok, Nursery} = hanoi_nursery:new(Dir, MaxLevel)
     end,
     {ok, #state{ top=TopLevel, dir=Dir, nursery=Nursery, opt=Opts, max_level=MaxLevel }}.
@@ -233,12 +233,12 @@ open_levels(Dir,Options) ->
     file:delete(filename:join(Dir,"nursery.data")),

     {TopLevel, MaxMerge} =
-        lists:foldl( fun(LevelNo, {Prev, Max}) ->
+        lists:foldl( fun(LevelNo, {Prev, MergeWork0}) ->
                              {ok, Level} = hanoi_level:open(Dir,LevelNo,Prev,Options,self()),
-                             NextMax = max(Max, hanoi_level:unmerged_count(Level)),
-                             {Level, NextMax}
+                             MergeWork = MergeWork0 + hanoi_level:unmerged_count(Level),
+                             {Level, MergeWork}
                      end,
                      {undefined, 0},
                      lists:seq(MaxLevel, min(?TOP_LEVEL, MinLevel), -1)),
@@ -262,8 +262,10 @@ parse_level(FileName) ->
     end.

-handle_info({bottom_level, N}, State) when N > State#state.max_level ->
-    {noreply,State#state{ max_level = N }};
+handle_info({bottom_level, N}, #state{ nursery=Nursery }=State)
+  when N > State#state.max_level ->
+    {noreply,State#state{ max_level = N,
+                          nursery= hanoi_nursery:set_max_level(Nursery, N) }};

 handle_info(Info,State) ->
     error_logger:error_msg("Unknown info ~p~n", [Info]),
@@ -333,9 +335,9 @@ do_put(Key, Value, State=#state{ nursery=Nursery, top=Top }) ->
     {ok, Nursery2} = hanoi_nursery:add_maybe_flush(Key, Value, Nursery, Top),
     {ok, State#state{ nursery=Nursery2 }}.

-flush_nursery(State=#state{nursery=Nursery, top=Top, dir=Dir}) ->
+flush_nursery(State=#state{nursery=Nursery, top=Top, dir=Dir, max_level=MaxLevel}) ->
     ok = hanoi_nursery:finish(Nursery, Top),
-    {ok, Nursery2} = hanoi_nursery:new(Dir),
+    {ok, Nursery2} = hanoi_nursery:new(Dir, MaxLevel),
     {ok, State#state{ nursery=Nursery2 }}.

 start_app() ->

View file

@@ -50,7 +50,7 @@
 -record(state, {
           a, b, c, next, dir, level, inject_done_ref, merge_pid, folding = [],
           step_next_ref, step_caller, step_merge_ref,
-          opts = [], owner
+          opts = [], owner, work_done=0
          }).

 %%%%% PUBLIC OPERATIONS
@@ -209,7 +209,7 @@ initialize2(State) ->
 check_begin_merge_then_loop(State=#state{a=BT1, b=BT2, merge_pid=undefined})
   when BT1/=undefined, BT2 /= undefined ->
     {ok, MergePID} = begin_merge(State),
-    main_loop(State#state{merge_pid=MergePID });
+    main_loop(State#state{merge_pid=MergePID,work_done=0 });

 check_begin_merge_then_loop(State) ->
     main_loop(State).
@@ -247,13 +247,7 @@ main_loop(State = #state{ next=Next }) ->
         ?REQ(From, unmerged_count) ->
-            Files =
-                (if State#state.b == undefined -> 0; true -> 1 end)
-                + (if State#state.c == undefined -> 0; true -> 1 end),
-            Amount = Files * ?BTREE_SIZE( State#state.level ),
-            reply(From, Amount),
+            reply(From, total_unmerged(State)),
             main_loop(State);

         %% replies OK when there is no current step in progress
@@ -261,7 +255,11 @@ main_loop(State = #state{ next=Next }) ->
           when State#state.step_merge_ref == undefined,
               State#state.step_next_ref == undefined ->
            reply(From, ok),
-           self() ! ?REQ(undefined, {step, HowMuch}),
+           if HowMuch > 0 ->
+                   self() ! ?REQ(undefined, {step, HowMuch});
+              true ->
+                   ok
+           end,
            main_loop(State);

        %% accept step any time there is not an outstanding step
@ -271,30 +269,37 @@ main_loop(State = #state{ next=Next }) ->
State#state.step_next_ref == undefined State#state.step_next_ref == undefined
-> ->
WorkLeftHere = max(0,total_unmerged(State) - State#state.work_done),
WorkToDoHere = min(WorkLeftHere, HowMuch),
DelegateWork = max(0,HowMuch - WorkToDoHere),
%% delegate the step request to the next level %% delegate the step request to the next level
if Next =:= undefined -> if Next =:= undefined; DelegateWork == 0 ->
DelegateRef = undefined; DelegateRef = undefined;
true -> true ->
DelegateRef = send_request(Next, {step, HowMuch}) DelegateRef = send_request(Next, {step, DelegateWork})
end, end,
%% if there is a merge worker, send him a step message if (State#state.merge_pid == undefined)
case State#state.merge_pid of orelse (WorkToDoHere =< 0) ->
undefined -> MergeRef = undefined,
MergeRef = undefined; NewWorkDone = State#state.work_done;
MergePID -> true ->
MergePID = State#state.merge_pid,
MergeRef = monitor(process, MergePID), MergeRef = monitor(process, MergePID),
MergePID ! {step, {self(), MergeRef}, HowMuch} MergePID ! {step, {self(), MergeRef}, WorkToDoHere},
NewWorkDone = State#state.work_done + WorkToDoHere
end, end,
if (Next =:= undefined) andalso (MergeRef =:= undefined) -> if (Next =:= undefined) andalso (MergeRef =:= undefined) ->
%% nothing to do ... just return OK %% nothing to do ... just return OK
State2 = reply_step_ok(State#state { step_caller = StepFrom }), State2 = reply_step_ok(State#state { step_caller = StepFrom }),
main_loop(State2); main_loop(State2#state { work_done = NewWorkDone });
true -> true ->
main_loop(State#state{ step_next_ref=DelegateRef, main_loop(State#state{ step_next_ref=DelegateRef,
step_caller=StepFrom, step_caller=StepFrom,
step_merge_ref=MergeRef step_merge_ref=MergeRef,
work_done = NewWorkDone
}) })
end; end;
@@ -545,6 +550,15 @@ reply_step_ok(State) ->
     end,
     State#state{ step_caller=undefined }.

+total_unmerged(State) ->
+    Files =
+        (if State#state.b == undefined -> 0; true -> 1 end)
+        + (if State#state.c == undefined -> 0; true -> 1 end),
+
+    Files * ?BTREE_SIZE( State#state.level ).
+
 do_lookup(_Key, []) ->
     not_found;
 do_lookup(_Key, [Pid]) when is_pid(Pid) ->

View file

@@ -25,38 +25,40 @@
 -module(hanoi_nursery).
 -author('Kresten Krab Thorup <krab@trifork.com>').

--export([new/1, recover/2, add/3, finish/2, lookup/2, add_maybe_flush/4]).
--export([do_level_fold/3]).
+-export([new/2, recover/3, add/3, finish/2, lookup/2, add_maybe_flush/4]).
+-export([do_level_fold/3, set_max_level/2]).

 -include("include/hanoi.hrl").
 -include("hanoi.hrl").
 -include_lib("kernel/include/file.hrl").

--record(nursery, { log_file, dir, cache, total_size=0, count=0, last_sync=now() }).
+-record(nursery, { log_file, dir, cache, total_size=0, count=0,
+                   last_sync=now(), max_level }).

--spec new(string()) -> {ok, #nursery{}} | {error, term()}.
+-spec new(string(), integer()) -> {ok, #nursery{}} | {error, term()}.

 -define(LOGFILENAME(Dir), filename:join(Dir, "nursery.log")).

-new(Directory) ->
+new(Directory, MaxLevel) ->
     {ok, File} = file:open( ?LOGFILENAME(Directory),
                             [raw, exclusive, write, delayed_write, append]),
-    {ok, #nursery{ log_file=File, dir=Directory, cache= gb_trees:empty() }}.
+    {ok, #nursery{ log_file=File, dir=Directory, cache= gb_trees:empty(),
+                   max_level=MaxLevel}}.

-recover(Directory, TopLevel) ->
+recover(Directory, TopLevel, MaxLevel) ->
     case file:read_file_info( ?LOGFILENAME(Directory) ) of
         {ok, _} ->
-            ok = do_recover(Directory, TopLevel),
-            new(Directory);
+            ok = do_recover(Directory, TopLevel, MaxLevel),
+            new(Directory, MaxLevel);
         {error, enoent} ->
-            new(Directory)
+            new(Directory, MaxLevel)
     end.

-do_recover(Directory, TopLevel) ->
+do_recover(Directory, TopLevel, MaxLevel) ->
     %% repair the log file; storing it in nursery2
     LogFileName = ?LOGFILENAME(Directory),
-    {ok, Nursery} = read_nursery_from_log(Directory),
+    {ok, Nursery} = read_nursery_from_log(Directory, MaxLevel),
     ok = finish(Nursery, TopLevel),
@@ -65,11 +67,11 @@ do_recover(Directory, TopLevel) ->
     ok.

-read_nursery_from_log(Directory) ->
+read_nursery_from_log(Directory, MaxLevel) ->
     {ok, LogFile} = file:open( ?LOGFILENAME(Directory), [raw, read, read_ahead, binary] ),
     {ok, Cache} = load_good_chunks(LogFile, gb_trees:empty()),
     ok = file:close(LogFile),
-    {ok, #nursery{ dir=Directory, cache=Cache, count=gb_trees:size(Cache) }}.
+    {ok, #nursery{ dir=Directory, cache=Cache, count=gb_trees:size(Cache), max_level=MaxLevel }}.

 %% Just read the log file into a cache (gb_tree).
 %% If any errors happen here, then we simply ignore them and return
@@ -148,7 +150,10 @@ lookup(Key, #nursery{ cache=Cache }) ->
 % Finish this nursery (encode it to a btree, and delete the nursery file)
 % @end
 -spec finish(Nursery::#nursery{}, TopLevel::pid()) -> ok.
-finish(#nursery{ dir=Dir, cache=Cache, log_file=LogFile, total_size=_TotalSize, count=Count }, TopLevel) ->
+finish(#nursery{ dir=Dir, cache=Cache, log_file=LogFile,
+                 total_size=_TotalSize, count=Count,
+                 max_level=MaxLevel
+               }, TopLevel) ->

     %% first, close the log file (if it is open)
     if LogFile /= undefined ->
@@ -181,7 +186,8 @@ finish(#nursery{ dir=Dir, cache=Cache, log_file=LogFile, total_size=_TotalSize,
             %% issue some work if this is a top-level inject (blocks until previous such
             %% incremental merge is finished).
-            hanoi_level:incremental_merge(TopLevel, ?BTREE_SIZE(?TOP_LEVEL)),
+            hanoi_level:incremental_merge(TopLevel,
+                                          (MaxLevel-?TOP_LEVEL+1)*?BTREE_SIZE(?TOP_LEVEL)),
             ok;

         _ ->
@@ -193,14 +199,14 @@ finish(#nursery{ dir=Dir, cache=Cache, log_file=LogFile, total_size=_TotalSize,
     file:delete(LogFileName),
     ok.

-add_maybe_flush(Key, Value, Nursery=#nursery{ dir=Dir }, Top) ->
+add_maybe_flush(Key, Value, Nursery=#nursery{ dir=Dir, max_level=MaxLevel }, Top) ->
     case add(Nursery, Key, Value) of
         {ok, _} = OK ->
             OK;
         {full, Nursery2} ->
             ok = hanoi_nursery:finish(Nursery2, Top),
             {error, enoent} = file:read_file_info( filename:join(Dir, "nursery.log")),
-            hanoi_nursery:new(Dir)
+            hanoi_nursery:new(Dir, MaxLevel)
     end.

 do_level_fold(#nursery{ cache=Cache }, FoldWorkerPID, KeyRange) ->
@@ -217,3 +223,6 @@ do_level_fold(#nursery{ cache=Cache }, FoldWorkerPID, KeyRange) ->
                   gb_trees:to_list(Cache)),
     FoldWorkerPID ! {level_done, Ref},
     ok.
+
+set_max_level(Nursery = #nursery{}, MaxLevel) ->
+    Nursery#nursery{ max_level = MaxLevel }.