From a7f42d636e05f5fb82a669049af0a733995972cd Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 9 Feb 2016 01:27:58 +0900 Subject: [PATCH] WIP: narrowing in on repair problems due to double-write errors --- src/machi_chain_manager1.erl | 15 +++++++++------ src/machi_chain_repair.erl | 14 +++++++++++++- src/machi_cr_client.erl | 4 ++-- src/machi_flu_filename_mgr.erl | 6 +++++- test/machi_ap_repair_eqc.erl | 4 +++- 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 7f112d0..075d834 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2967,7 +2967,8 @@ zerf_find_last_annotated(FLU, MajoritySize, S) -> end. perhaps_verbose_c111(P_latest2, S) -> - case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of + case true of + %%TODO put me back: case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of true -> Dbg2X = lists:keydelete(react, 1, P_latest2#projection_v1.dbg2) ++ @@ -2975,16 +2976,18 @@ perhaps_verbose_c111(P_latest2, S) -> P_latest2x = P_latest2#projection_v1{dbg2=Dbg2X}, % limit verbose len. 
Last2 = get(last_verbose), Summ2 = machi_projection:make_summary(P_latest2x), - if P_latest2#projection_v1.upi == [], - (S#ch_mgr.proj)#projection_v1.upi /= [] -> + %% if P_latest2#projection_v1.upi == [], + %% (S#ch_mgr.proj)#projection_v1.upi /= [] -> + if true -> <<CSumRep:4/binary, _/binary>> = P_latest2#projection_v1.epoch_csum, - io:format(user, "\n~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), (S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, S#ch_mgr.name]); + io:format(user, "~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), (S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, S#ch_mgr.name]); true -> ok end, - case proplists:get_value(private_write_verbose, - S#ch_mgr.opts) of + %% TODO put me back: case proplists:get_value(private_write_verbose, + %% S#ch_mgr.opts) of + case true of true when Summ2 /= Last2 -> put(last_verbose, Summ2), ?V("\n~s ~p uses plain: ~w \n", diff --git a/src/machi_chain_repair.erl b/src/machi_chain_repair.erl index 146fe65..f249268 100644 --- a/src/machi_chain_repair.erl +++ b/src/machi_chain_repair.erl @@ -274,7 +274,19 @@ make_repair_directives3([{Offset, Size, CSum, _FLU}=A|Rest0], %% byte range from all FLUs %% 3b. Log big warning about data loss. %% 4. Log any other checksum discrepencies as they are found.
- exit({todo_repair_sanity_check, ?LINE, File, Offset, As}) + QQ = [begin + Pxy = orddict:fetch(FLU, ProxiesDict), + {ok, EpochID} = machi_proxy_flu1_client:get_epoch_id( + Pxy, ?SHORT_TIMEOUT), + NSInfo = undefined, + XX = machi_proxy_flu1_client:read_chunk( + Pxy, NSInfo, EpochID, File, Offset, Size, undefined, + ?SHORT_TIMEOUT), + {FLU, XX} + end || {__Offset, __Size, __CSum, FLU} <- As], + + exit({todo_repair_sanity_check, ?LINE, File, Offset, {as,As}, {qq,QQ}}) + %% exit({todo_repair_sanity_check, ?LINE, File, Offset, As}) end, %% List construction guarantees us that there's at least one ?MAX_OFFSET %% item remains. Sort order + our "taking" of all exact Offset+Size diff --git a/src/machi_cr_client.erl b/src/machi_cr_client.erl index cc4a508..19a6d1a 100644 --- a/src/machi_cr_client.erl +++ b/src/machi_cr_client.erl @@ -786,9 +786,9 @@ do_repair_chunk2([], ReturnMode, Chunk, _CSum, _Repaired, _NSInfo, File, Offset, %% TODO: add stats for # of repairs, length(_Repaired)-1, etc etc? case ReturnMode of read -> - {ok, Chunk, S}; + {reply, {ok, {[Chunk], []}}, S}; {append, Offset, Size, File} -> - {ok, {Offset, Size, File}, S} + {reply, {ok, {[{Offset, Size, File}], []}}, S} end; do_repair_chunk2([First|Rest]=ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, Offset, Size, Depth, STime, #state{epoch_id=EpochID, proxies_dict=PD}=S) -> diff --git a/src/machi_flu_filename_mgr.erl b/src/machi_flu_filename_mgr.erl index 7140266..1e504df 100644 --- a/src/machi_flu_filename_mgr.erl +++ b/src/machi_flu_filename_mgr.erl @@ -231,10 +231,14 @@ find_or_make_filename(Tid, DataDir, NS, NSLocator, Prefix, N) -> end. generate_filename(DataDir, NS, NSLocator, Prefix, N) -> +{A,B,C} = erlang:now(), +TODO = lists:flatten(filename:basename(DataDir) ++ "," ++ io_lib:format("~w,~w,~w", [A,B,C])), {F, _} = machi_util:make_data_filename( DataDir, NS, NSLocator, Prefix, - generate_uuid_v4_str(), +TODO, + %% TODO put me back!! + %% generate_uuid_v4_str(), N), binary_to_list(F). 
diff --git a/test/machi_ap_repair_eqc.erl b/test/machi_ap_repair_eqc.erl index bbb8717..0f5f5a2 100644 --- a/test/machi_ap_repair_eqc.erl +++ b/test/machi_ap_repair_eqc.erl @@ -121,7 +121,9 @@ append(CRIndex, Bin, #state{verbose=V}=S) -> NSInfo = #ns_info{}, NoCSum = <<>>, Opts1 = #append_opts{}, +io:format(user, "append_chunk ~p ~P ->\n", [Prefix, Bin, 6]), Res = (catch machi_cr_client:append_chunk(C, NSInfo, Prefix, Bin, NoCSum, Opts1, sec(1))), +io:format(user, "append_chunk ~p ~P ->\n ~p\n", [Prefix, Bin, 6, Res]), case Res of {ok, {_Off, Len, _FileName}=Key} -> case ets:insert_new(?WRITTEN_TAB, {Key, Bin}) of @@ -188,6 +190,7 @@ change_partition(Partition, [] -> ?V("## Turn OFF partition: ~w~n", [Partition]); _ -> ?V("## Turn ON partition: ~w~n", [Partition]) end || Verbose], + io:format(user, "partition ~p\n", [Partition]), machi_partition_simulator:always_these_partitions(Partition), _ = machi_partition_simulator:get(FLUNames), %% Don't wait for stable chain, tick will be executed on demand @@ -456,7 +459,6 @@ assert_chunk(C, {Off, Len, FileName}=Key, Bin) -> FileNameStr = binary_to_list(FileName), %% TODO : Use CSum instead of binary (after disuccsion about CSum is calmed down?) NSInfo = undefined, - io:format(user, "TODO fix broken read_chunk mod ~s line ~w\n", [?MODULE, ?LINE]), case (catch machi_cr_client:read_chunk(C, NSInfo, FileName, Off, Len, undefined, sec(3))) of {ok, {[{FileNameStr, Off, Bin, _}], []}} -> ok;