From 0b88a12c166874b9a85189c7d61c0f17a0e43513 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Mon, 3 Nov 2014 20:17:56 +0900 Subject: [PATCH] WIP: Debugging cruft, egadz, but improving (see below) So, this still pops up occasionally: % rebar skip_deps=true -v eunit suites=machi_flu0_test,machi_chain_manager1 [...] a private: [{epoch,223},{author,a},{upi,[a,b]},{repair,[]},{down,[c]},{d,[{author_proc,react},{nodes_up,[a,b]}]},{d2,[{up_nodz,[a,b]},{hooray,{v2,{2014,11,3},{20,19,57}}}]}] b private: [{epoch,224},{author,b},{upi,[b,a]},{repair,[]},{down,[c]},{d,[{author_proc,react},{nodes_up,[a,b]}]},{d2,[{up_nodz,[a,b]},{hooray,{v2,{2014,11,3},{20,19,57}}}]}] c private: [{epoch,191},{author,c},{upi,[c]},{repair,[]},{down,[a,b]},{d,[{author_proc,react},{nodes_up,[c]}]},{d2,[{up_nodz,[c]},{hooray,{v2,{2014,11,3},{20,19,57}}}]}] The mis-ordering between [a,b] and [b,a] happens after the partition settled on the islands of [a,b] and [c]. { c100 , ? LINE , _AnyOtherReturnValue } {c100,734, {err,error, {badmatch,[a,b]}, from, [{epoch,70}, {author,a}, {upi,[a]}, {repair,[b]}, {down,[c]}, {d, [{author_proc,react}, {nodes_up,[a,b]}]}, {d2,[]}], to, [{epoch,194}, {author,b}, {upi,[b,a]}, {repair,[]}, {down,[c]}, {d, [{author_proc,react}, {nodes_up,[a,b]}]}, {d2,[]}], relative_to,a,stack,[...] --- .../poc-machi/src/machi_chain_manager1.erl | 81 ++++++++++++++++--- .../test/machi_partition_simulator.erl | 9 ++- 2 files changed, 73 insertions(+), 17 deletions(-) diff --git a/prototype/poc-machi/src/machi_chain_manager1.erl b/prototype/poc-machi/src/machi_chain_manager1.erl index 121aaf6..9a4c4cd 100644 --- a/prototype/poc-machi/src/machi_chain_manager1.erl +++ b/prototype/poc-machi/src/machi_chain_manager1.erl @@ -385,7 +385,7 @@ calc_projection(#ch_mgr{proj=LastProj, runenv=RunEnv} = S, RelativeToServer, %% that there are no partitions at all? calc_projection(_OldThreshold, _NoPartitionThreshold, LastProj, - RelativeToServer, Dbg, #ch_mgr{name=MyName,runenv=RunEnv1}=S) -> + _RelativeToServer, Dbg, #ch_mgr{name=MyName,runenv=RunEnv1}=S) -> #projection{epoch_number=OldEpochNum, all_members=All_list, upi=OldUPI_list, @@ -416,7 +416,9 @@ calc_projection(_OldThreshold, _NoPartitionThreshold, LastProj, {Prob, RunEnvX} = roll_dice(100, RunEnv2), if Prob =< 50 andalso (NewUPI_list == [] orelse - (RelativeToServer == hd(NewUPI_list))) -> + true) -> + %% TODO fix + %% (RelativeToServer == hd(NewUPI_list))) -> {NewUPI_list ++ [H], T, RunEnvX}; true -> {NewUPI_list, OldRepairing_list, RunEnvX} @@ -455,7 +457,8 @@ calc_up_nodes(#ch_mgr{name=MyName, proj=Proj, runenv=RunEnv1}=S) -> calc_up_nodes(MyName, AllMembers, RunEnv1) -> %% Seed1 = proplists:get_value(seed, RunEnv1), - Partitions2 = machi_partition_simulator:get(AllMembers), + {Partitions2, _Islands} = machi_partition_simulator:get(AllMembers), + catch put(react, [{partitions,Partitions2},{islands,_Islands}|get(react)]), UpNodes = lists:sort( [Node || Node <- AllMembers, not lists:member({MyName, Node}, Partitions2), @@ -520,12 +523,15 @@ rank_projection(#projection{author_server=Author, (2*N + length(UPI_list)). do_react_to_env(S) -> + put(react, []), react_to_env_A10(S). react_to_env_A10(S) -> + put(react, [a10|get(react)]), react_to_env_A20(0, S). react_to_env_A20(Retries, #ch_mgr{myflu=MyFLU} = S) -> + put(react, [a20|get(react)]), %% io:format(user, "current: ~w\n", [make_projection_summary(S#ch_mgr.proj)]), RelativeToServer = MyFLU, {P_newprop, S2} = calc_projection(S, RelativeToServer, @@ -534,6 +540,7 @@ react_to_env_A20(Retries, #ch_mgr{myflu=MyFLU} = S) -> react_to_env_A30(Retries, P_newprop, S2). react_to_env_A30(Retries, P_newprop, S) -> + put(react, [a30|get(react)]), {UnanimousTag, P_latest, ReadExtra, S2} = do_cl_read_latest_public_projection(true, S), LatestEpoch = P_latest#projection.epoch_number, ?D({UnanimousTag, LatestEpoch}), @@ -546,9 +553,9 @@ react_to_env_A30(Retries, P_newprop, S) -> LatestUnanimousP = if UnanimousTag == unanimous andalso - All_UPI_Repairing_were_unanimous -> true; - UnanimousTag == unanimous -> false; - UnanimousTag == not_unanimous -> false; + All_UPI_Repairing_were_unanimous -> put(react, [{a30,?LINE}|get(react)]),true; + UnanimousTag == unanimous -> put(react, [{a30,?LINE,UPI_Repairing_FLUs, UnanimousFLUs}|get(react)]),false; + UnanimousTag == not_unanimous -> put(react, [{a30,?LINE}|get(react)]),false; true -> exit({badbad, UnanimousTag}) end, react_to_env_A40(Retries, P_newprop, P_latest, @@ -556,17 +563,28 @@ react_to_env_A30(Retries, P_newprop, S) -> react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, #ch_mgr{myflu=MyFLU, proj=P_current}=S) -> + put(react, [a40|get(react)]), [{Rank_newprop, _}] = rank_projections([P_newprop], P_current), [{Rank_latest, _}] = rank_projections([P_latest], P_current), LatestAuthorDownP = lists:member(P_latest#projection.author_server, P_newprop#projection.down), + if P_newprop#projection.epoch_number == 666 + orelse P_latest#projection.epoch_number == 666 -> + io:format(user, "\nBUMMER\nLatest ~p\nNewprop ~p\nRunenv ~p\nTsns ~w", [make_projection_summary(P_newprop), make_projection_summary(P_latest), S#ch_mgr.runenv, lists:reverse(get(react))]), + exit(bummer); + true -> + ok + end, + %% Proj = S#ch_mgr.proj, if Proj#projection.epoch_number >= 7 -> ?Dw({Rank_newprop,Rank_latest}); true -> ok end, if P_latest#projection.epoch_number > P_current#projection.epoch_number orelse not LatestUnanimousP -> + put(react, [{a40, ?LINE, P_latest#projection.epoch_number > P_current#projection.epoch_number, not LatestUnanimousP}|get(react)]), + ?D({a40,?LINE}), ?D(P_latest#projection.epoch_number), ?D(P_current#projection.epoch_number), @@ -581,6 +599,7 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, P_latest#projection.epoch_number < P_current#projection.epoch_number orelse P_latest /= P_current -> + put(react, [{a40, ?LINE}|get(react)]), ?D({a40,?LINE}), %% Both of these cases are rare. Elsewhere, the code @@ -606,6 +625,7 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, %% A40a (see flowchart) Rank_newprop > Rank_latest -> + put(react, [{a40, ?LINE}|get(react)]), ?D({a40,?LINE}), react_to_env_C300(P_newprop, P_latest, S); @@ -615,11 +635,13 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, (P_newprop#projection.upi /= P_latest#projection.upi orelse P_newprop#projection.repairing /= P_latest#projection.repairing) -> + put(react, [{a40, ?LINE}|get(react)]), ?D({a40,?LINE}), react_to_env_C300(P_newprop, P_latest, S); %% A40c (see flowchart) LatestAuthorDownP -> + put(react, [{a40, ?LINE}|get(react)]), ?D({a40,?LINE}), %% TODO: I believe that membership in the @@ -644,18 +666,22 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, react_to_env_C300(P_newprop, P_latest, S); true -> + put(react, [{a40, ?LINE}|get(react)]), ?D({a40,?LINE}), {{no_change, P_latest#projection.epoch_number}, S} end. react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, Rank_newprop, Rank_latest, #ch_mgr{name=MyName}=S) -> + put(react, [b10|get(react)]), if LatestUnanimousP -> + put(react, [{b10, ?LINE}|get(react)]), ?D({b10, ?LINE}), react_to_env_C100(P_newprop, P_latest, S); Retries > 2 -> + put(react, [{b10, ?LINE}|get(react)]), ?D({b10, ?LINE}), %% The author of P_latest is too slow or crashed. @@ -665,6 +691,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, Rank_latest >= Rank_newprop andalso P_latest#projection.author_server /= MyName -> + put(react, [{b10, ?LINE}|get(react)]), ?D({b10, ?LINE}), %% Give the author of P_latest an opportunite to write a @@ -673,6 +700,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, react_to_env_C200(Retries, P_latest, S); true -> + put(react, [{b10, ?LINE}|get(react)]), ?D({b10, ?LINE}), %% P_newprop is best, so let's write it. @@ -681,12 +709,29 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, react_to_env_C100(P_newprop, P_latest, #ch_mgr{myflu=MyFLU, proj=P_current}=S) -> - case projection_transition_is_sane(P_current, P_latest, MyFLU) of - true -> + put(react, [c100|get(react)]), + I_am_UPI_in_newprop_p = lists:member(MyFLU, P_newprop#projection.upi), + I_am_Repairing_in_latest_p = lists:member(MyFLU, P_latest#projection.repairing), + ShortCircuit_p = + P_latest#projection.epoch_number > P_current#projection.epoch_number + andalso + I_am_UPI_in_newprop_p + andalso + I_am_Repairing_in_latest_p, + + case {ShortCircuit_p, projection_transition_is_sane(P_current, P_latest, + MyFLU)} of + {true, _} -> + %% Someone else believes that I am repairing. We assume + %% that nobody is being Byzantine, so we'll believe it. + %% We ignore our proposal and try to go with the latest. +?D(short_circuitshort_circuitshort_circuitshort_circuitshort_circuit), + react_to_env_C110(P_latest, S); + {_, true} -> ?D({c100, ?LINE}), react_to_env_C110(P_latest, S); - _AnyOtherReturnValue -> -?D({c100, ?LINE, _AnyOtherReturnValue}), timer:sleep(50), + {_, _AnyOtherReturnValue} -> +?D({c100, ?LINE, _AnyOtherReturnValue}), %% %% P_latest is known to be crap. %% %% By process of elimination, P_newprop is best, %% %% so let's write it. @@ -699,6 +744,7 @@ react_to_env_C100(P_newprop, P_latest, end. react_to_env_C110(P_latest, #ch_mgr{myflu=MyFLU} = S) -> + put(react, [c110|get(react)]), %% TOOD: Should we carry along any extra info that that would be useful %% in the dbg2 list? Extra_todo = [], @@ -712,10 +758,12 @@ react_to_env_C110(P_latest, #ch_mgr{myflu=MyFLU} = S) -> react_to_env_C120(P_latest, S). react_to_env_C120(P_latest, S) -> + put(react, [c120|get(react)]), {{now_using, P_latest#projection.epoch_number}, S#ch_mgr{proj=P_latest, proj_proposed=none}}. react_to_env_C200(Retries, P_latest, S) -> + put(react, [c200|get(react)]), try yo:tell_author_yo(P_latest#projection.author_server) catch Type:Err -> @@ -725,15 +773,18 @@ react_to_env_C200(Retries, P_latest, S) -> react_to_env_C210(Retries, S). react_to_env_C210(Retries, S) -> + put(react, [c210|get(react)]), %% TODO: implement the ranked sleep thingie? - timer:sleep(10), + timer:sleep(100), react_to_env_C220(Retries, S). react_to_env_C220(Retries, S) -> + put(react, [c220|get(react)]), react_to_env_A20(Retries + 1, S). react_to_env_C300(#projection{epoch_number=Epoch_newprop}=P_newprop, #projection{epoch_number=_Epoch_latest}=_P_latest, S) -> + put(react, [c300|get(react)]), NewEpoch = erlang:max(Epoch_newprop, _Epoch_latest) + 1, P_newprop2 = P_newprop#projection{epoch_number=NewEpoch}, %% %% Let's return to the old epoch thingie and see what happens......... @@ -742,11 +793,13 @@ react_to_env_C300(#projection{epoch_number=Epoch_newprop}=P_newprop, react_to_env_C310(update_projection_checksum(P_newprop2), S). react_to_env_C310(P_newprop, S) -> + put(react, [{c310,make_projection_summary(P_newprop)}|get(react)]), Epoch = P_newprop#projection.epoch_number, {_Res, S2} = cl_write_public_proj_skip_local_error(Epoch, P_newprop, S), %% MyFLU=S#ch_mgr.myflu, ?D({c310, MyFLU, Epoch, _Res}), timer:sleep(200), %% MPS = mps(P_newprop), ?D(MPS), ?D({c310, _Res}), + put(react, [{c310,_Res}|get(react)]), react_to_env_A10(S2). @@ -931,9 +984,11 @@ perhaps_call(#ch_mgr{name=MyName, myflu=MyFLU}, Partitions, FLU, DoIt) -> false -> Res; _ -> + (catch put(react, [timeout2|get(react)])), exit(timeout) end; _ -> + (catch put(react, [{timeout1,me,MyFLU,to,FLU,RemoteFLU_p,Partitions}|get(react)])), exit(timeout) end. @@ -1043,7 +1098,7 @@ nonunanimous_setup_and_fix_test() -> end. zoof_test() -> - machi_partition_simulator:start_link({1,2,3}, 50, 50), + machi_partition_simulator:start_link({111,222,333}, 50, 10), {ok, FLUa} = machi_flu0:start_link(a), {ok, FLUb} = machi_flu0:start_link(b), @@ -1087,7 +1142,7 @@ zoof_test() -> end, DoIt(), - machi_partition_simulator:reset_thresholds(100, 0), + machi_partition_simulator:reset_thresholds(999, 0), DoIt(), DoIt(), diff --git a/prototype/poc-machi/test/machi_partition_simulator.erl b/prototype/poc-machi/test/machi_partition_simulator.erl index 5ff3cb4..c5bbc38 100644 --- a/prototype/poc-machi/test/machi_partition_simulator.erl +++ b/prototype/poc-machi/test/machi_partition_simulator.erl @@ -64,18 +64,19 @@ reset_thresholds(OldThreshold, NoPartitionThreshold) -> init({Seed, OldThreshold, NoPartitionThreshold}) -> {ok, #state{seed=Seed, - old_partitions=[], + old_partitions={[],[]}, old_threshold=OldThreshold, no_partition_threshold=NoPartitionThreshold}}. handle_call({get, Nodes}, _From, S) -> - {Seed2, Partitions2} = + {Seed2, Partitions} = calc_network_partitions(Nodes, S#state.seed, S#state.old_partitions, S#state.old_threshold, S#state.no_partition_threshold), - {reply, Partitions2, S#state{seed=Seed2}}; + {reply, Partitions, S#state{seed=Seed2, + old_partitions=Partitions}}; handle_call({reset_thresholds, OldThreshold, NoPartitionThreshold}, _From, S) -> {reply, ok, S#state{old_threshold=OldThreshold, no_partition_threshold=NoPartitionThreshold}}; @@ -125,7 +126,7 @@ make_network_partition_locations(Nodes, Seed1) -> [Nd || {Weight, Nd} <- WeightsNodes, (Max - IslandSep) =< Weight, Weight < Max] || Max <- lists:seq(IslandSep + 1, 101, IslandSep)], - {Seed2, lists:usort(make_islands(Islands))}. + {Seed2, {lists:usort(make_islands(Islands)), Islands}}. make_islands([]) -> [];