WIP: narrowing in on repair problems due to double-write errors

This commit is contained in:
Scott Lystig Fritchie 2016-02-09 01:27:58 +09:00
parent fbb0203f67
commit a7f42d636e
5 changed files with 32 additions and 11 deletions

View file

@ -2967,7 +2967,8 @@ zerf_find_last_annotated(FLU, MajoritySize, S) ->
end.
perhaps_verbose_c111(P_latest2, S) ->
case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of
case true of
%%TODO put me back: case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of
true ->
Dbg2X = lists:keydelete(react, 1,
P_latest2#projection_v1.dbg2) ++
@ -2975,16 +2976,18 @@ perhaps_verbose_c111(P_latest2, S) ->
P_latest2x = P_latest2#projection_v1{dbg2=Dbg2X}, % limit verbose len.
Last2 = get(last_verbose),
Summ2 = machi_projection:make_summary(P_latest2x),
if P_latest2#projection_v1.upi == [],
(S#ch_mgr.proj)#projection_v1.upi /= [] ->
%% if P_latest2#projection_v1.upi == [],
%% (S#ch_mgr.proj)#projection_v1.upi /= [] ->
if true ->
<<CSumRep:4/binary,_/binary>> =
P_latest2#projection_v1.epoch_csum,
io:format(user, "\n~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), (S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, S#ch_mgr.name]);
io:format(user, "~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), (S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, S#ch_mgr.name]);
true ->
ok
end,
case proplists:get_value(private_write_verbose,
S#ch_mgr.opts) of
%% TODO put me back: case proplists:get_value(private_write_verbose,
%% S#ch_mgr.opts) of
case true of
true when Summ2 /= Last2 ->
put(last_verbose, Summ2),
?V("\n~s ~p uses plain: ~w \n",

View file

@ -274,7 +274,19 @@ make_repair_directives3([{Offset, Size, CSum, _FLU}=A|Rest0],
%% byte range from all FLUs
%% 3b. Log big warning about data loss.
%% 4. Log any other checksum discrepancies as they are found.
exit({todo_repair_sanity_check, ?LINE, File, Offset, As})
QQ = [begin
Pxy = orddict:fetch(FLU, ProxiesDict),
{ok, EpochID} = machi_proxy_flu1_client:get_epoch_id(
Pxy, ?SHORT_TIMEOUT),
NSInfo = undefined,
XX = machi_proxy_flu1_client:read_chunk(
Pxy, NSInfo, EpochID, File, Offset, Size, undefined,
?SHORT_TIMEOUT),
{FLU, XX}
end || {__Offset, __Size, __CSum, FLU} <- As],
exit({todo_repair_sanity_check, ?LINE, File, Offset, {as,As}, {qq,QQ}})
%% exit({todo_repair_sanity_check, ?LINE, File, Offset, As})
end,
%% List construction guarantees us that at least one ?MAX_OFFSET
%% item remains. Sort order + our "taking" of all exact Offset+Size

View file

@ -786,9 +786,9 @@ do_repair_chunk2([], ReturnMode, Chunk, _CSum, _Repaired, _NSInfo, File, Offset,
%% TODO: add stats for # of repairs, length(_Repaired)-1, etc etc?
case ReturnMode of
read ->
{ok, Chunk, S};
{reply, {ok, {[Chunk], []}}, S};
{append, Offset, Size, File} ->
{ok, {Offset, Size, File}, S}
{reply, {ok, {[{Offset, Size, File}], []}}, S}
end;
do_repair_chunk2([First|Rest]=ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, Offset,
Size, Depth, STime, #state{epoch_id=EpochID, proxies_dict=PD}=S) ->

View file

@ -231,10 +231,14 @@ find_or_make_filename(Tid, DataDir, NS, NSLocator, Prefix, N) ->
end.
generate_filename(DataDir, NS, NSLocator, Prefix, N) ->
{A,B,C} = erlang:now(),
TODO = lists:flatten(filename:basename(DataDir) ++ "," ++ io_lib:format("~w,~w,~w", [A,B,C])),
{F, _} = machi_util:make_data_filename(
DataDir,
NS, NSLocator, Prefix,
generate_uuid_v4_str(),
TODO,
%% TODO put me back!!
%% generate_uuid_v4_str(),
N),
binary_to_list(F).

View file

@ -121,7 +121,9 @@ append(CRIndex, Bin, #state{verbose=V}=S) ->
NSInfo = #ns_info{},
NoCSum = <<>>,
Opts1 = #append_opts{},
io:format(user, "append_chunk ~p ~P ->\n", [Prefix, Bin, 6]),
Res = (catch machi_cr_client:append_chunk(C, NSInfo, Prefix, Bin, NoCSum, Opts1, sec(1))),
io:format(user, "append_chunk ~p ~P ->\n ~p\n", [Prefix, Bin, 6, Res]),
case Res of
{ok, {_Off, Len, _FileName}=Key} ->
case ets:insert_new(?WRITTEN_TAB, {Key, Bin}) of
@ -188,6 +190,7 @@ change_partition(Partition,
[] -> ?V("## Turn OFF partition: ~w~n", [Partition]);
_ -> ?V("## Turn ON partition: ~w~n", [Partition])
end || Verbose],
io:format(user, "partition ~p\n", [Partition]),
machi_partition_simulator:always_these_partitions(Partition),
_ = machi_partition_simulator:get(FLUNames),
%% Don't wait for stable chain, tick will be executed on demand
@ -456,7 +459,6 @@ assert_chunk(C, {Off, Len, FileName}=Key, Bin) ->
FileNameStr = binary_to_list(FileName),
%% TODO : Use CSum instead of binary (after discussion about CSum has calmed down?)
NSInfo = undefined,
io:format(user, "TODO fix broken read_chunk mod ~s line ~w\n", [?MODULE, ?LINE]),
case (catch machi_cr_client:read_chunk(C, NSInfo, FileName, Off, Len, undefined, sec(3))) of
{ok, {[{FileNameStr, Off, Bin, _}], []}} ->
ok;