WIP: Debugging cruft, egadz, but improving (see below)

So, this inconsistency still pops up occasionally:

    % rebar skip_deps=true -v eunit suites=machi_flu0_test,machi_chain_manager1
    [...]
    a private: [{epoch,223},{author,a},{upi,[a,b]},{repair,[]},{down,[c]},{d,[{author_proc,react},{nodes_up,[a,b]}]},{d2,[{up_nodz,[a,b]},{hooray,{v2,{2014,11,3},{20,19,57}}}]}]
    b private: [{epoch,224},{author,b},{upi,[b,a]},{repair,[]},{down,[c]},{d,[{author_proc,react},{nodes_up,[a,b]}]},{d2,[{up_nodz,[a,b]},{hooray,{v2,{2014,11,3},{20,19,57}}}]}]
    c private: [{epoch,191},{author,c},{upi,[c]},{repair,[]},{down,[a,b]},{d,[{author_proc,react},{nodes_up,[c]}]},{d2,[{up_nodz,[c]},{hooray,{v2,{2014,11,3},{20,19,57}}}]}]

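The mis-ordering of the UPI list ([a,b] on a versus [b,a] on b) happens after
the partition has settled on the islands [a,b] and [c].  Debug output from
C100 when the projection transition sanity check does not return true: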

    { c100 , ? LINE , _AnyOtherReturnValue } {c100,734,
                                          {err,error,
                                           {badmatch,[a,b]},
                                           from,
                                           [{epoch,70},
                                            {author,a},
                                            {upi,[a]},
                                            {repair,[b]},
                                            {down,[c]},
                                            {d,
                                             [{author_proc,react},
                                              {nodes_up,[a,b]}]},
                                            {d2,[]}],
                                           to,
                                           [{epoch,194},
                                            {author,b},
                                            {upi,[b,a]},
                                            {repair,[]},
                                            {down,[c]},
                                            {d,
                                             [{author_proc,react},
                                              {nodes_up,[a,b]}]},
                                            {d2,[]}],
                                           relative_to,a,stack,[...]
This commit is contained in:
Scott Lystig Fritchie 2014-11-03 20:17:56 +09:00
parent 5d0eed865a
commit 0b88a12c16
2 changed files with 73 additions and 17 deletions

View file

@ -385,7 +385,7 @@ calc_projection(#ch_mgr{proj=LastProj, runenv=RunEnv} = S, RelativeToServer,
%% that there are no partitions at all? %% that there are no partitions at all?
calc_projection(_OldThreshold, _NoPartitionThreshold, LastProj, calc_projection(_OldThreshold, _NoPartitionThreshold, LastProj,
RelativeToServer, Dbg, #ch_mgr{name=MyName,runenv=RunEnv1}=S) -> _RelativeToServer, Dbg, #ch_mgr{name=MyName,runenv=RunEnv1}=S) ->
#projection{epoch_number=OldEpochNum, #projection{epoch_number=OldEpochNum,
all_members=All_list, all_members=All_list,
upi=OldUPI_list, upi=OldUPI_list,
@ -416,7 +416,9 @@ calc_projection(_OldThreshold, _NoPartitionThreshold, LastProj,
{Prob, RunEnvX} = roll_dice(100, RunEnv2), {Prob, RunEnvX} = roll_dice(100, RunEnv2),
if Prob =< 50 andalso (NewUPI_list == [] if Prob =< 50 andalso (NewUPI_list == []
orelse orelse
(RelativeToServer == hd(NewUPI_list))) -> true) ->
%% TODO fix
%% (RelativeToServer == hd(NewUPI_list))) ->
{NewUPI_list ++ [H], T, RunEnvX}; {NewUPI_list ++ [H], T, RunEnvX};
true -> true ->
{NewUPI_list, OldRepairing_list, RunEnvX} {NewUPI_list, OldRepairing_list, RunEnvX}
@ -455,7 +457,8 @@ calc_up_nodes(#ch_mgr{name=MyName, proj=Proj, runenv=RunEnv1}=S) ->
calc_up_nodes(MyName, AllMembers, RunEnv1) -> calc_up_nodes(MyName, AllMembers, RunEnv1) ->
%% Seed1 = proplists:get_value(seed, RunEnv1), %% Seed1 = proplists:get_value(seed, RunEnv1),
Partitions2 = machi_partition_simulator:get(AllMembers), {Partitions2, _Islands} = machi_partition_simulator:get(AllMembers),
catch put(react, [{partitions,Partitions2},{islands,_Islands}|get(react)]),
UpNodes = lists:sort( UpNodes = lists:sort(
[Node || Node <- AllMembers, [Node || Node <- AllMembers,
not lists:member({MyName, Node}, Partitions2), not lists:member({MyName, Node}, Partitions2),
@ -520,12 +523,15 @@ rank_projection(#projection{author_server=Author,
(2*N + length(UPI_list)). (2*N + length(UPI_list)).
do_react_to_env(S) -> do_react_to_env(S) ->
put(react, []),
react_to_env_A10(S). react_to_env_A10(S).
react_to_env_A10(S) -> react_to_env_A10(S) ->
put(react, [a10|get(react)]),
react_to_env_A20(0, S). react_to_env_A20(0, S).
react_to_env_A20(Retries, #ch_mgr{myflu=MyFLU} = S) -> react_to_env_A20(Retries, #ch_mgr{myflu=MyFLU} = S) ->
put(react, [a20|get(react)]),
%% io:format(user, "current: ~w\n", [make_projection_summary(S#ch_mgr.proj)]), %% io:format(user, "current: ~w\n", [make_projection_summary(S#ch_mgr.proj)]),
RelativeToServer = MyFLU, RelativeToServer = MyFLU,
{P_newprop, S2} = calc_projection(S, RelativeToServer, {P_newprop, S2} = calc_projection(S, RelativeToServer,
@ -534,6 +540,7 @@ react_to_env_A20(Retries, #ch_mgr{myflu=MyFLU} = S) ->
react_to_env_A30(Retries, P_newprop, S2). react_to_env_A30(Retries, P_newprop, S2).
react_to_env_A30(Retries, P_newprop, S) -> react_to_env_A30(Retries, P_newprop, S) ->
put(react, [a30|get(react)]),
{UnanimousTag, P_latest, ReadExtra, S2} = {UnanimousTag, P_latest, ReadExtra, S2} =
do_cl_read_latest_public_projection(true, S), do_cl_read_latest_public_projection(true, S),
LatestEpoch = P_latest#projection.epoch_number, ?D({UnanimousTag, LatestEpoch}), LatestEpoch = P_latest#projection.epoch_number, ?D({UnanimousTag, LatestEpoch}),
@ -546,9 +553,9 @@ react_to_env_A30(Retries, P_newprop, S) ->
LatestUnanimousP = LatestUnanimousP =
if UnanimousTag == unanimous if UnanimousTag == unanimous
andalso andalso
All_UPI_Repairing_were_unanimous -> true; All_UPI_Repairing_were_unanimous -> put(react, [{a30,?LINE}|get(react)]),true;
UnanimousTag == unanimous -> false; UnanimousTag == unanimous -> put(react, [{a30,?LINE,UPI_Repairing_FLUs, UnanimousFLUs}|get(react)]),false;
UnanimousTag == not_unanimous -> false; UnanimousTag == not_unanimous -> put(react, [{a30,?LINE}|get(react)]),false;
true -> exit({badbad, UnanimousTag}) true -> exit({badbad, UnanimousTag})
end, end,
react_to_env_A40(Retries, P_newprop, P_latest, react_to_env_A40(Retries, P_newprop, P_latest,
@ -556,17 +563,28 @@ react_to_env_A30(Retries, P_newprop, S) ->
react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP,
#ch_mgr{myflu=MyFLU, proj=P_current}=S) -> #ch_mgr{myflu=MyFLU, proj=P_current}=S) ->
put(react, [a40|get(react)]),
[{Rank_newprop, _}] = rank_projections([P_newprop], P_current), [{Rank_newprop, _}] = rank_projections([P_newprop], P_current),
[{Rank_latest, _}] = rank_projections([P_latest], P_current), [{Rank_latest, _}] = rank_projections([P_latest], P_current),
LatestAuthorDownP = lists:member(P_latest#projection.author_server, LatestAuthorDownP = lists:member(P_latest#projection.author_server,
P_newprop#projection.down), P_newprop#projection.down),
if P_newprop#projection.epoch_number == 666
orelse P_latest#projection.epoch_number == 666 ->
io:format(user, "\nBUMMER\nLatest ~p\nNewprop ~p\nRunenv ~p\nTsns ~w", [make_projection_summary(P_newprop), make_projection_summary(P_latest), S#ch_mgr.runenv, lists:reverse(get(react))]),
exit(bummer);
true ->
ok
end,
%% Proj = S#ch_mgr.proj, if Proj#projection.epoch_number >= 7 -> ?Dw({Rank_newprop,Rank_latest}); true -> ok end, %% Proj = S#ch_mgr.proj, if Proj#projection.epoch_number >= 7 -> ?Dw({Rank_newprop,Rank_latest}); true -> ok end,
if if
P_latest#projection.epoch_number > P_current#projection.epoch_number P_latest#projection.epoch_number > P_current#projection.epoch_number
orelse orelse
not LatestUnanimousP -> not LatestUnanimousP ->
put(react, [{a40, ?LINE, P_latest#projection.epoch_number > P_current#projection.epoch_number, not LatestUnanimousP}|get(react)]),
?D({a40,?LINE}), ?D({a40,?LINE}),
?D(P_latest#projection.epoch_number), ?D(P_latest#projection.epoch_number),
?D(P_current#projection.epoch_number), ?D(P_current#projection.epoch_number),
@ -581,6 +599,7 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP,
P_latest#projection.epoch_number < P_current#projection.epoch_number P_latest#projection.epoch_number < P_current#projection.epoch_number
orelse orelse
P_latest /= P_current -> P_latest /= P_current ->
put(react, [{a40, ?LINE}|get(react)]),
?D({a40,?LINE}), ?D({a40,?LINE}),
%% Both of these cases are rare. Elsewhere, the code %% Both of these cases are rare. Elsewhere, the code
@ -606,6 +625,7 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP,
%% A40a (see flowchart) %% A40a (see flowchart)
Rank_newprop > Rank_latest -> Rank_newprop > Rank_latest ->
put(react, [{a40, ?LINE}|get(react)]),
?D({a40,?LINE}), ?D({a40,?LINE}),
react_to_env_C300(P_newprop, P_latest, S); react_to_env_C300(P_newprop, P_latest, S);
@ -615,11 +635,13 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP,
(P_newprop#projection.upi /= P_latest#projection.upi (P_newprop#projection.upi /= P_latest#projection.upi
orelse orelse
P_newprop#projection.repairing /= P_latest#projection.repairing) -> P_newprop#projection.repairing /= P_latest#projection.repairing) ->
put(react, [{a40, ?LINE}|get(react)]),
?D({a40,?LINE}), ?D({a40,?LINE}),
react_to_env_C300(P_newprop, P_latest, S); react_to_env_C300(P_newprop, P_latest, S);
%% A40c (see flowchart) %% A40c (see flowchart)
LatestAuthorDownP -> LatestAuthorDownP ->
put(react, [{a40, ?LINE}|get(react)]),
?D({a40,?LINE}), ?D({a40,?LINE}),
%% TODO: I believe that membership in the %% TODO: I believe that membership in the
@ -644,18 +666,22 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP,
react_to_env_C300(P_newprop, P_latest, S); react_to_env_C300(P_newprop, P_latest, S);
true -> true ->
put(react, [{a40, ?LINE}|get(react)]),
?D({a40,?LINE}), ?D({a40,?LINE}),
{{no_change, P_latest#projection.epoch_number}, S} {{no_change, P_latest#projection.epoch_number}, S}
end. end.
react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
Rank_newprop, Rank_latest, #ch_mgr{name=MyName}=S) -> Rank_newprop, Rank_latest, #ch_mgr{name=MyName}=S) ->
put(react, [b10|get(react)]),
if if
LatestUnanimousP -> LatestUnanimousP ->
put(react, [{b10, ?LINE}|get(react)]),
?D({b10, ?LINE}), ?D({b10, ?LINE}),
react_to_env_C100(P_newprop, P_latest, S); react_to_env_C100(P_newprop, P_latest, S);
Retries > 2 -> Retries > 2 ->
put(react, [{b10, ?LINE}|get(react)]),
?D({b10, ?LINE}), ?D({b10, ?LINE}),
%% The author of P_latest is too slow or crashed. %% The author of P_latest is too slow or crashed.
@ -665,6 +691,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
Rank_latest >= Rank_newprop Rank_latest >= Rank_newprop
andalso andalso
P_latest#projection.author_server /= MyName -> P_latest#projection.author_server /= MyName ->
put(react, [{b10, ?LINE}|get(react)]),
?D({b10, ?LINE}), ?D({b10, ?LINE}),
%% Give the author of P_latest an opportunite to write a %% Give the author of P_latest an opportunite to write a
@ -673,6 +700,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
react_to_env_C200(Retries, P_latest, S); react_to_env_C200(Retries, P_latest, S);
true -> true ->
put(react, [{b10, ?LINE}|get(react)]),
?D({b10, ?LINE}), ?D({b10, ?LINE}),
%% P_newprop is best, so let's write it. %% P_newprop is best, so let's write it.
@ -681,12 +709,29 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
react_to_env_C100(P_newprop, P_latest, react_to_env_C100(P_newprop, P_latest,
#ch_mgr{myflu=MyFLU, proj=P_current}=S) -> #ch_mgr{myflu=MyFLU, proj=P_current}=S) ->
case projection_transition_is_sane(P_current, P_latest, MyFLU) of put(react, [c100|get(react)]),
true -> I_am_UPI_in_newprop_p = lists:member(MyFLU, P_newprop#projection.upi),
I_am_Repairing_in_latest_p = lists:member(MyFLU, P_latest#projection.repairing),
ShortCircuit_p =
P_latest#projection.epoch_number > P_current#projection.epoch_number
andalso
I_am_UPI_in_newprop_p
andalso
I_am_Repairing_in_latest_p,
case {ShortCircuit_p, projection_transition_is_sane(P_current, P_latest,
MyFLU)} of
{true, _} ->
%% Someone else believes that I am repairing. We assume
%% that nobody is being Byzantine, so we'll believe it.
%% We ignore our proposal and try to go with the latest.
?D(short_circuitshort_circuitshort_circuitshort_circuitshort_circuit),
react_to_env_C110(P_latest, S);
{_, true} ->
?D({c100, ?LINE}), ?D({c100, ?LINE}),
react_to_env_C110(P_latest, S); react_to_env_C110(P_latest, S);
_AnyOtherReturnValue -> {_, _AnyOtherReturnValue} ->
?D({c100, ?LINE, _AnyOtherReturnValue}), timer:sleep(50), ?D({c100, ?LINE, _AnyOtherReturnValue}),
%% %% P_latest is known to be crap. %% %% P_latest is known to be crap.
%% %% By process of elimination, P_newprop is best, %% %% By process of elimination, P_newprop is best,
%% %% so let's write it. %% %% so let's write it.
@ -699,6 +744,7 @@ react_to_env_C100(P_newprop, P_latest,
end. end.
react_to_env_C110(P_latest, #ch_mgr{myflu=MyFLU} = S) -> react_to_env_C110(P_latest, #ch_mgr{myflu=MyFLU} = S) ->
put(react, [c110|get(react)]),
%% TOOD: Should we carry along any extra info that that would be useful %% TOOD: Should we carry along any extra info that that would be useful
%% in the dbg2 list? %% in the dbg2 list?
Extra_todo = [], Extra_todo = [],
@ -712,10 +758,12 @@ react_to_env_C110(P_latest, #ch_mgr{myflu=MyFLU} = S) ->
react_to_env_C120(P_latest, S). react_to_env_C120(P_latest, S).
react_to_env_C120(P_latest, S) -> react_to_env_C120(P_latest, S) ->
put(react, [c120|get(react)]),
{{now_using, P_latest#projection.epoch_number}, {{now_using, P_latest#projection.epoch_number},
S#ch_mgr{proj=P_latest, proj_proposed=none}}. S#ch_mgr{proj=P_latest, proj_proposed=none}}.
react_to_env_C200(Retries, P_latest, S) -> react_to_env_C200(Retries, P_latest, S) ->
put(react, [c200|get(react)]),
try try
yo:tell_author_yo(P_latest#projection.author_server) yo:tell_author_yo(P_latest#projection.author_server)
catch Type:Err -> catch Type:Err ->
@ -725,15 +773,18 @@ react_to_env_C200(Retries, P_latest, S) ->
react_to_env_C210(Retries, S). react_to_env_C210(Retries, S).
react_to_env_C210(Retries, S) -> react_to_env_C210(Retries, S) ->
put(react, [c210|get(react)]),
%% TODO: implement the ranked sleep thingie? %% TODO: implement the ranked sleep thingie?
timer:sleep(10), timer:sleep(100),
react_to_env_C220(Retries, S). react_to_env_C220(Retries, S).
react_to_env_C220(Retries, S) -> react_to_env_C220(Retries, S) ->
put(react, [c220|get(react)]),
react_to_env_A20(Retries + 1, S). react_to_env_A20(Retries + 1, S).
react_to_env_C300(#projection{epoch_number=Epoch_newprop}=P_newprop, react_to_env_C300(#projection{epoch_number=Epoch_newprop}=P_newprop,
#projection{epoch_number=_Epoch_latest}=_P_latest, S) -> #projection{epoch_number=_Epoch_latest}=_P_latest, S) ->
put(react, [c300|get(react)]),
NewEpoch = erlang:max(Epoch_newprop, _Epoch_latest) + 1, NewEpoch = erlang:max(Epoch_newprop, _Epoch_latest) + 1,
P_newprop2 = P_newprop#projection{epoch_number=NewEpoch}, P_newprop2 = P_newprop#projection{epoch_number=NewEpoch},
%% %% Let's return to the old epoch thingie and see what happens......... %% %% Let's return to the old epoch thingie and see what happens.........
@ -742,11 +793,13 @@ react_to_env_C300(#projection{epoch_number=Epoch_newprop}=P_newprop,
react_to_env_C310(update_projection_checksum(P_newprop2), S). react_to_env_C310(update_projection_checksum(P_newprop2), S).
react_to_env_C310(P_newprop, S) -> react_to_env_C310(P_newprop, S) ->
put(react, [{c310,make_projection_summary(P_newprop)}|get(react)]),
Epoch = P_newprop#projection.epoch_number, Epoch = P_newprop#projection.epoch_number,
{_Res, S2} = cl_write_public_proj_skip_local_error(Epoch, P_newprop, S), {_Res, S2} = cl_write_public_proj_skip_local_error(Epoch, P_newprop, S),
%% MyFLU=S#ch_mgr.myflu, ?D({c310, MyFLU, Epoch, _Res}), timer:sleep(200), %% MyFLU=S#ch_mgr.myflu, ?D({c310, MyFLU, Epoch, _Res}), timer:sleep(200),
%% MPS = mps(P_newprop), ?D(MPS), %% MPS = mps(P_newprop), ?D(MPS),
?D({c310, _Res}), ?D({c310, _Res}),
put(react, [{c310,_Res}|get(react)]),
react_to_env_A10(S2). react_to_env_A10(S2).
@ -931,9 +984,11 @@ perhaps_call(#ch_mgr{name=MyName, myflu=MyFLU}, Partitions, FLU, DoIt) ->
false -> false ->
Res; Res;
_ -> _ ->
(catch put(react, [timeout2|get(react)])),
exit(timeout) exit(timeout)
end; end;
_ -> _ ->
(catch put(react, [{timeout1,me,MyFLU,to,FLU,RemoteFLU_p,Partitions}|get(react)])),
exit(timeout) exit(timeout)
end. end.
@ -1043,7 +1098,7 @@ nonunanimous_setup_and_fix_test() ->
end. end.
zoof_test() -> zoof_test() ->
machi_partition_simulator:start_link({1,2,3}, 50, 50), machi_partition_simulator:start_link({111,222,333}, 50, 10),
{ok, FLUa} = machi_flu0:start_link(a), {ok, FLUa} = machi_flu0:start_link(a),
{ok, FLUb} = machi_flu0:start_link(b), {ok, FLUb} = machi_flu0:start_link(b),
@ -1087,7 +1142,7 @@ zoof_test() ->
end, end,
DoIt(), DoIt(),
machi_partition_simulator:reset_thresholds(100, 0), machi_partition_simulator:reset_thresholds(999, 0),
DoIt(), DoIt(),
DoIt(), DoIt(),

View file

@ -64,18 +64,19 @@ reset_thresholds(OldThreshold, NoPartitionThreshold) ->
init({Seed, OldThreshold, NoPartitionThreshold}) -> init({Seed, OldThreshold, NoPartitionThreshold}) ->
{ok, #state{seed=Seed, {ok, #state{seed=Seed,
old_partitions=[], old_partitions={[],[]},
old_threshold=OldThreshold, old_threshold=OldThreshold,
no_partition_threshold=NoPartitionThreshold}}. no_partition_threshold=NoPartitionThreshold}}.
handle_call({get, Nodes}, _From, S) -> handle_call({get, Nodes}, _From, S) ->
{Seed2, Partitions2} = {Seed2, Partitions} =
calc_network_partitions(Nodes, calc_network_partitions(Nodes,
S#state.seed, S#state.seed,
S#state.old_partitions, S#state.old_partitions,
S#state.old_threshold, S#state.old_threshold,
S#state.no_partition_threshold), S#state.no_partition_threshold),
{reply, Partitions2, S#state{seed=Seed2}}; {reply, Partitions, S#state{seed=Seed2,
old_partitions=Partitions}};
handle_call({reset_thresholds, OldThreshold, NoPartitionThreshold}, _From, S) -> handle_call({reset_thresholds, OldThreshold, NoPartitionThreshold}, _From, S) ->
{reply, ok, S#state{old_threshold=OldThreshold, {reply, ok, S#state{old_threshold=OldThreshold,
no_partition_threshold=NoPartitionThreshold}}; no_partition_threshold=NoPartitionThreshold}};
@ -125,7 +126,7 @@ make_network_partition_locations(Nodes, Seed1) ->
[Nd || {Weight, Nd} <- WeightsNodes, [Nd || {Weight, Nd} <- WeightsNodes,
(Max - IslandSep) =< Weight, Weight < Max] (Max - IslandSep) =< Weight, Weight < Max]
|| Max <- lists:seq(IslandSep + 1, 101, IslandSep)], || Max <- lists:seq(IslandSep + 1, 101, IslandSep)],
{Seed2, lists:usort(make_islands(Islands))}. {Seed2, {lists:usort(make_islands(Islands)), Islands}}.
make_islands([]) -> make_islands([]) ->
[]; [];