WIP: move to stats via ETS, success/failure propagates, yay!
This commit is contained in:
parent
cad84442bb
commit
4ae0f94649
2 changed files with 77 additions and 48 deletions
|
@ -1943,6 +1943,39 @@ perhaps_start_repair(
|
||||||
perhaps_start_repair(S) ->
|
perhaps_start_repair(S) ->
|
||||||
S.
|
S.
|
||||||
|
|
||||||
|
do_repair(
|
||||||
|
#ch_mgr{name=MyName,
|
||||||
|
proj=#projection_v1{upi=UPI,
|
||||||
|
repairing=[Dst|_]=Repairing,
|
||||||
|
members_dict=MembersDict}}=_S_copy,
|
||||||
|
Opts, ap_mode=_RepairMode) ->
|
||||||
|
T1 = os:timestamp(),
|
||||||
|
RepairId = proplists:get_value(repair_id, Opts, id1),
|
||||||
|
error_logger:info_msg("Repair start: tail ~p of ~p -> ~p, ID ~p\n",
|
||||||
|
[MyName, UPI, Repairing, RepairId]),
|
||||||
|
|
||||||
|
ETS = ets:new(repair_stats, [private, set]),
|
||||||
|
ETS_T_Keys = [t_in_files, t_in_chunks, t_in_bytes,
|
||||||
|
t_out_files, t_out_chunks, t_out_bytes,
|
||||||
|
t_bad_chunks, t_elapsed_seconds],
|
||||||
|
[ets:insert(ETS, {K, 0}) || K <- ETS_T_Keys],
|
||||||
|
|
||||||
|
Res = machi_chain_repair:repair_ap(MyName, Repairing, UPI,
|
||||||
|
MembersDict, ETS, Opts),
|
||||||
|
T2 = os:timestamp(),
|
||||||
|
Elapsed = (timer:now_diff(T2, T1) div 1000) / 1000,
|
||||||
|
ets:insert(ETS, {t_elapsed_seconds, Elapsed}),
|
||||||
|
Summary = case Res of ok -> "success";
|
||||||
|
_ -> "FAILURE"
|
||||||
|
end,
|
||||||
|
Stats = [{K, ets:lookup_element(ETS, K, 2)} || K <- ETS_T_Keys],
|
||||||
|
error_logger:info_msg("Repair ~s: tail ~p of ~p finished repair ID ~p: "
|
||||||
|
"~p\nStats ~w\n",
|
||||||
|
[Summary, MyName, UPI, RepairId, Res, Stats]),
|
||||||
|
timer:sleep(12345),
|
||||||
|
ets:delete(ETS),
|
||||||
|
exit({todo, Res}).
|
||||||
|
|
||||||
sanitize_repair_state(#ch_mgr{name=MyName,
|
sanitize_repair_state(#ch_mgr{name=MyName,
|
||||||
repair_start=Start,
|
repair_start=Start,
|
||||||
repair_final_status=Res,
|
repair_final_status=Res,
|
||||||
|
@ -1957,22 +1990,6 @@ sanitize_repair_state(#ch_mgr{name=MyName,
|
||||||
sanitize_repair_state(S) ->
|
sanitize_repair_state(S) ->
|
||||||
S.
|
S.
|
||||||
|
|
||||||
do_repair(
|
|
||||||
#ch_mgr{name=MyName,
|
|
||||||
proj=#projection_v1{upi=UPI,
|
|
||||||
repairing=[Dst|_]=Repairing,
|
|
||||||
members_dict=MembersDict}}=_S_copy,
|
|
||||||
Opts, ap_mode=_RepairMode) ->
|
|
||||||
RepairId = proplists:get_value(repair_id, Opts, id1),
|
|
||||||
error_logger:info_msg("Chain tail ~p of ~p starting repair ~p of ~p\n",
|
|
||||||
[MyName, UPI, RepairId, Repairing]),
|
|
||||||
{ok, Info} = machi_chain_repair:repair_ap(MyName, Repairing, UPI,
|
|
||||||
MembersDict, Opts),
|
|
||||||
error_logger:info_msg("Chain tail ~p of ~p finished repair ~p: ~p\n",
|
|
||||||
[MyName, UPI, RepairId, Info]),
|
|
||||||
timer:sleep(12345),
|
|
||||||
exit({todo, Info}).
|
|
||||||
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
|
||||||
perhaps_call_t(S, Partitions, FLU, DoIt) ->
|
perhaps_call_t(S, Partitions, FLU, DoIt) ->
|
||||||
|
|
|
@ -46,23 +46,14 @@
|
||||||
-define(VERB(Fmt), if Verb -> io:format(Fmt ); true -> ok end).
|
-define(VERB(Fmt), if Verb -> io:format(Fmt ); true -> ok end).
|
||||||
-define(VERB(Fmt, Args), if Verb -> io:format(Fmt, Args); true -> ok end).
|
-define(VERB(Fmt, Args), if Verb -> io:format(Fmt, Args); true -> ok end).
|
||||||
|
|
||||||
-record(stat, {
|
-export([repair_cp/4, repair_ap/6]).
|
||||||
i_files = 0,
|
|
||||||
i_chs = 0,
|
|
||||||
i_bytes = 0,
|
|
||||||
o_files = 0,
|
|
||||||
o_chs = 0,
|
|
||||||
o_bytes = 0}
|
|
||||||
).
|
|
||||||
|
|
||||||
-export([repair_cp/4, repair_ap/5]).
|
|
||||||
|
|
||||||
repair_cp(Src, Dst, MembersDict, Opts) ->
|
repair_cp(Src, Dst, MembersDict, Opts) ->
|
||||||
%% TODO: add missing function: wipe away any trace of chunks
|
%% TODO: add missing function: wipe away any trace of chunks
|
||||||
%% are present on Dst but missing on Src.
|
%% are present on Dst but missing on Src.
|
||||||
exit(todo_cp_mode).
|
exit(todo_cp_mode).
|
||||||
|
|
||||||
repair_ap(Src, Repairing, UPI, MembersDict, Opts) ->
|
repair_ap(Src, Repairing, UPI, MembersDict, ETS, Opts) ->
|
||||||
%% Use process dict so that 'after' clause can always quit all
|
%% Use process dict so that 'after' clause can always quit all
|
||||||
%% proxy pids.
|
%% proxy pids.
|
||||||
put(proxies_dict, orddict:new()),
|
put(proxies_dict, orddict:new()),
|
||||||
|
@ -86,14 +77,15 @@ repair_ap(Src, Repairing, UPI, MembersDict, Opts) ->
|
||||||
RepairMode = proplists:get_value(repair_mode, Opts, repair),
|
RepairMode = proplists:get_value(repair_mode, Opts, repair),
|
||||||
[ok = repair_file(ap_mode, RepairMode, File, Size, MissingList,
|
[ok = repair_file(ap_mode, RepairMode, File, Size, MissingList,
|
||||||
proplists:get_value(verbose, Opts, true),
|
proplists:get_value(verbose, Opts, true),
|
||||||
Src, ProxiesDict) ||
|
Src, ProxiesDict, ETS) ||
|
||||||
{File, {Size, MissingList}} <- MissingFileSummary],
|
{File, {Size, MissingList}} <- MissingFileSummary],
|
||||||
{ok, [yo_no_error, todo_stats_here]}
|
ok
|
||||||
catch
|
catch
|
||||||
What:Why ->
|
What:Why ->
|
||||||
|
Stack = erlang:get_stacktrace(),
|
||||||
io:format(user, "What Why ~p ~p @\n\t~p\n",
|
io:format(user, "What Why ~p ~p @\n\t~p\n",
|
||||||
[What, Why, erlang:get_stacktrace()]),
|
[What, Why, Stack]),
|
||||||
{error, yo_error}
|
{error, {What, Why, Stack}}
|
||||||
after
|
after
|
||||||
[(catch machi_proxy_flu1_client:quit(Pid)) ||
|
[(catch machi_proxy_flu1_client:quit(Pid)) ||
|
||||||
Pid <- orddict:to_list(get(proxies_dict))]
|
Pid <- orddict:to_list(get(proxies_dict))]
|
||||||
|
@ -135,12 +127,12 @@ append_file_dict(Proxy, FLU_name, D) ->
|
||||||
%% read while the current write is still in progress.
|
%% read while the current write is still in progress.
|
||||||
|
|
||||||
repair_file(ap_mode, RepairMode,
|
repair_file(ap_mode, RepairMode,
|
||||||
File, Size, [], Verb, Src, ProxiesDict) ->
|
File, Size, [], Verb, Src, ProxiesDict, ETS) ->
|
||||||
?VERB("~p: ~s: present on both: ", [Src, File]),
|
?VERB("~p: ~s: present on both: ", [Src, File]),
|
||||||
?VERB("TODO!\n"), ok;
|
?VERB("TODO!\n"), ok;
|
||||||
%%TODO: repair_both_present(File, Size, RepairMode, V, SrcS, SrcS2, DstS, DstS2);
|
%%TODO: repair_both_present(File, Size, RepairMode, V, SrcS, SrcS2, DstS, DstS2);
|
||||||
repair_file(ap_mode, RepairMode,
|
repair_file(ap_mode, RepairMode,
|
||||||
File, Size, MissingList, Verb, Src, ProxiesDict) ->
|
File, Size, MissingList, Verb, Src, ProxiesDict, ETS) ->
|
||||||
case lists:member(Src, MissingList) of
|
case lists:member(Src, MissingList) of
|
||||||
true ->
|
true ->
|
||||||
?VERB("~p: ~s -> ~p, skipping: not on source server\n",
|
?VERB("~p: ~s -> ~p, skipping: not on source server\n",
|
||||||
|
@ -155,11 +147,11 @@ repair_file(ap_mode, RepairMode,
|
||||||
orddict:filter(fun(K, _V) -> lists:member(K, MissingList) end,
|
orddict:filter(fun(K, _V) -> lists:member(K, MissingList) end,
|
||||||
ProxiesDict),
|
ProxiesDict),
|
||||||
SrcProxy = orddict:fetch(Src, ProxiesDict),
|
SrcProxy = orddict:fetch(Src, ProxiesDict),
|
||||||
ok = copy_file(File, SrcProxy, MissingProxiesDict, Verb),
|
ok = copy_file(File, SrcProxy, MissingProxiesDict, Verb, ETS),
|
||||||
?VERB("done\n", [])
|
?VERB("done\n", [])
|
||||||
end.
|
end.
|
||||||
|
|
||||||
copy_file(File, SrcProxy, MissingProxiesDict, Verb) ->
|
copy_file(File, SrcProxy, MissingProxiesDict, Verb, ETS) ->
|
||||||
%% Use the first source socket to enumerate the chunks & checksums.
|
%% Use the first source socket to enumerate the chunks & checksums.
|
||||||
%% Use the second source socket to copy each chunk.
|
%% Use the second source socket to copy each chunk.
|
||||||
N = length(orddict:to_list(MissingProxiesDict)),
|
N = length(orddict:to_list(MissingProxiesDict)),
|
||||||
|
@ -167,34 +159,54 @@ copy_file(File, SrcProxy, MissingProxiesDict, Verb) ->
|
||||||
?SHORT_TIMEOUT),
|
?SHORT_TIMEOUT),
|
||||||
{ok, CheckSums} = machi_proxy_flu1_client:checksum_list(
|
{ok, CheckSums} = machi_proxy_flu1_client:checksum_list(
|
||||||
SrcProxy, EpochID, File, ?LONG_TIMEOUT),
|
SrcProxy, EpochID, File, ?LONG_TIMEOUT),
|
||||||
|
EtsKeys = [{in_files, t_in_files}, {in_chunks, t_in_chunks},
|
||||||
|
{in_bytes, t_in_bytes}, {out_files, t_out_files},
|
||||||
|
{out_chunks, t_out_chunks}, {out_bytes, t_out_bytes}],
|
||||||
|
[ets:insert(ETS, {L_K, 0}) || {L_K, _T_K} <- EtsKeys],
|
||||||
CopyChunks =
|
CopyChunks =
|
||||||
fun({Offset, Size, CSum}, #stat{i_chs=In_Cs, i_bytes=In_Bs,
|
fun({Offset, Size, CSum}, {ok, ETS, _, _} = Acc) ->
|
||||||
o_chs=Out_Cs, o_bytes=Out_Bs}=Acc) ->
|
case ets:lookup_element(ETS, in_chunks, 2) rem 100 of
|
||||||
if In_Cs rem 100 == 0 -> ?VERB(".", []);
|
0 -> ?VERB(".", []);
|
||||||
true -> ok end,
|
_ -> ok
|
||||||
|
end,
|
||||||
|
_T1 = os:timestamp(),
|
||||||
{ok, Chunk} = machi_proxy_flu1_client:read_chunk(
|
{ok, Chunk} = machi_proxy_flu1_client:read_chunk(
|
||||||
SrcProxy, EpochID, File, Offset, Size),
|
SrcProxy, EpochID, File, Offset, Size),
|
||||||
|
_T2 = os:timestamp(),
|
||||||
case machi_util:checksum_chunk(Chunk) of
|
case machi_util:checksum_chunk(Chunk) of
|
||||||
CSum_now when CSum_now == CSum ->
|
CSum_now when CSum_now == CSum ->
|
||||||
[begin
|
[begin
|
||||||
|
_T3 = os:timestamp(),
|
||||||
ok = machi_proxy_flu1_client:write_chunk(
|
ok = machi_proxy_flu1_client:write_chunk(
|
||||||
DstProxy, EpochID, File, Offset, Chunk)
|
DstProxy, EpochID, File, Offset, Chunk),
|
||||||
|
_T4 = os:timestamp()
|
||||||
end || {_FLU, DstProxy} <- MissingProxiesDict],
|
end || {_FLU, DstProxy} <- MissingProxiesDict],
|
||||||
Acc#stat{i_chs=In_Cs + 1, i_bytes=In_Bs + Size,
|
ets:update_counter(ETS, in_chunks, 1),
|
||||||
o_chs=Out_Cs + N, o_bytes=Out_Bs+(N*Size)};
|
ets:update_counter(ETS, in_bytes, Size),
|
||||||
|
ets:update_counter(ETS, out_chunks, N),
|
||||||
|
ets:update_counter(ETS, out_bytes, N*Size),
|
||||||
|
Acc;
|
||||||
CSum_now ->
|
CSum_now ->
|
||||||
error_logger:error_msg(
|
error_logger:error_msg(
|
||||||
"TODO: Checksum failure: "
|
"TODO: Checksum failure: "
|
||||||
"file ~p offset ~p size ~p: "
|
"file ~p offset ~p size ~p: "
|
||||||
"expected ~p got ~p\n",
|
"expected ~p got ~p\n",
|
||||||
[File, Offset, Size, CSum, CSum_now]),
|
[File, Offset, Size, CSum, CSum_now]),
|
||||||
exit({todo_csum_error,
|
ets:update_counter(ETS, t_bad_chunks, 1),
|
||||||
{File, Offset, Size, CSum, CSum_now}})
|
Acc
|
||||||
end
|
end;
|
||||||
|
(_, _=Acc) -> % failure: skip rest of file
|
||||||
|
Acc
|
||||||
end,
|
end,
|
||||||
Stats = lists:foldl(CopyChunks, #stat{i_files=1,o_files=N}, CheckSums),
|
ets:update_counter(ETS, t_in_files, 1),
|
||||||
#stat{i_chs=In_Cs, o_chs=Out_Cs} = Stats,
|
ets:update_counter(ETS, t_out_files, N),
|
||||||
?VERB("copied ~w chunks to ~w replicas, ", [In_Cs, Out_Cs]),
|
_CopyRes = lists:foldl(CopyChunks, {ok, ETS, x, x}, CheckSums),
|
||||||
|
?VERB("copied ~w chunks to ~w chunks, ",
|
||||||
|
[ets:lookup_element(ETS, K, 2) || K <- [in_chunks, out_chunks] ]),
|
||||||
|
|
||||||
|
%% Copy this file's stats to the total counts.
|
||||||
|
[ets:update_counter(ETS, T_K, ets:lookup_element(ETS, L_K, 2)) ||
|
||||||
|
{L_K, T_K} <- EtsKeys],
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
%% copy_file_proc_checksum_fun(File, SrcS, DstS, _Verbose) ->
|
%% copy_file_proc_checksum_fun(File, SrcS, DstS, _Verbose) ->
|
||||||
|
|
Loading…
Reference in a new issue