WIP: bugfix for rare flapping infinite loop (done^2 fix I hope)
How can even computer? So, there's a flavor of the flapping infinite loop problem that can happen without flapping being detected (by the existing flapping detector, that is). That detector relies on a series of accepted projections to converge to a single projection repeated X times. However, it's possible to have a race with a simulated repair "finishing" that causes a problem so that no more projections are ever accepted. Oops. See also: new comments in do_react_to_env().
This commit is contained in:
parent
57b7122035
commit
b8c642aaa7
1 changed files with 60 additions and 61 deletions
|
@ -68,7 +68,7 @@
|
|||
flaps=0 :: integer(),
|
||||
flap_start = ?NOT_FLAPPING_START
|
||||
:: {{'epk', integer()}, erlang:timestamp()},
|
||||
flap_not_sanes :: orddict:orddict(),
|
||||
not_sanes :: orddict:orddict(),
|
||||
repair_worker :: 'undefined' | pid(),
|
||||
repair_start :: 'undefined' | erlang:timestamp(),
|
||||
repair_final_status :: 'undefined' | term(),
|
||||
|
@ -226,7 +226,7 @@ init({MyName, InitMembersDict, MgrOpts}) ->
|
|||
flap_limit=length(All_list) + 50,
|
||||
timer='undefined',
|
||||
proj_history=queue:new(),
|
||||
flap_not_sanes=orddict:new(),
|
||||
not_sanes=orddict:new(),
|
||||
runenv=RunEnv,
|
||||
opts=MgrOpts},
|
||||
{_, S2} = do_set_chain_members_dict(MembersDict, S),
|
||||
|
@ -805,8 +805,46 @@ do_react_to_env(#ch_mgr{name=MyName,
|
|||
S2#ch_mgr{proj=NewProj, members_dict=NewMembersDict}}
|
||||
end;
|
||||
do_react_to_env(S) ->
|
||||
%% The not_sanes manager counting dictionary is not strictly
|
||||
%% limited to flapping scenarios. (Though the mechanism first
|
||||
%% started as a way to deal with rare flapping scenarios.)
|
||||
%%
|
||||
%% I believe that the problem cannot happen in real life, but it can
|
||||
%% happen in simulated environments, especially if the simulation for
|
||||
%% repair can be approximately infinitely fast.
|
||||
%%
|
||||
%% For example:
|
||||
%% P_current: epoch=1135, UPI=[b,e,a], Repairing=[c,d], author=e
|
||||
%%
|
||||
%% Now a partition happens, a & b are on an island, c & d & e on
|
||||
%% the other island.
|
||||
%%
|
||||
%% P_newprop: epoch=1136, UPI=[e,c], Repairing=[d], author=e
|
||||
%%
|
||||
%% Why does e think that this is feasible? Well, the old UPI was
|
||||
%% [b,e,a], and we know that a & b are partitioned away from e.
|
||||
%% Therefore e chooses the best UPI, [e]. However, the simulator
|
||||
%% now also says, hey, there are nodes in the repairing list, so
|
||||
%% let's simulate a repair ... and the repair goes infinitely
|
||||
%% quickly ...and the epoch is stable during the repair period
|
||||
%% (i.e., both e/repairer and c/repairee remained in the same
|
||||
%% epoch 1135) ... so e decides that the simulated repair is
|
||||
%% "finished" and it's time to add the repairee to the tail of the
|
||||
%% UPI ... so that's why 1136's UPI=[e,c].
|
||||
%%
|
||||
%% I'll try to add a condition to the simulated repair to try to
|
||||
%% make slightly fewer assumptions in a row. However, I believe
|
||||
%% it's a good idea to keep this too-many-not_sane-transition-
|
||||
%% attempts counter very generic (i.e., not specific for flapping
|
||||
%% as it once was).
|
||||
%%
|
||||
%% The not_sanes counter dict should be reset each time we start
|
||||
%% an iteration. One could argue that state only for a single
|
||||
%% iteration shouldn't go in #ch_mgr but should be a separate arg
|
||||
%% threaded through each of the FSM funcs.
|
||||
%% TODO possible refactoring task?
|
||||
put(react, []),
|
||||
react_to_env_A10(S).
|
||||
react_to_env_A10(S#ch_mgr{not_sanes=orddict:new()}).
|
||||
|
||||
react_to_env_A10(S) ->
|
||||
?REACT(a10),
|
||||
|
@ -986,7 +1024,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
|
|||
?REACT({a30, ?LINE, [{inner_summary,
|
||||
machi_projection:make_summary(P_inner2)}]}),
|
||||
%% Adjust the outer projection's #flap_i info.
|
||||
?V("~w,", [{'YOYO',MyName,NewEpoch}]),
|
||||
?V("~w,", [{'FLAP',MyName,NewEpoch}]),
|
||||
#projection_v1{flap=OldFlap} = P_newprop3,
|
||||
NewFlap = OldFlap#flap_i{flapping_me=true},
|
||||
?REACT({a30, ?LINE, [flap_continue,
|
||||
|
@ -1360,7 +1398,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
|
|||
react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
|
||||
flap=Flap_latest0}=P_latest,
|
||||
#ch_mgr{name=MyName, proj=P_current,
|
||||
flap_not_sanes=NotSanesDict0}=S) ->
|
||||
not_sanes=NotSanesDict0}=S) ->
|
||||
?REACT(c100),
|
||||
|
||||
Sane = projection_transition_is_sane(P_current, P_latest, MyName),
|
||||
|
@ -1382,18 +1420,16 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
|
|||
_ when P_current#projection_v1.epoch_number == 0 ->
|
||||
%% Epoch == 0 is reserved for first-time, just booting conditions.
|
||||
?REACT({c100, ?LINE, [first_write]}),
|
||||
erase(perhaps_reset_loop),
|
||||
if Sane == true -> ok; true -> ?V("insane-~w-~w@~w,", [MyName, P_newprop#projection_v1.epoch_number, ?LINE]) end, %%% DELME!!!
|
||||
react_to_env_C110(P_latest, S);
|
||||
true ->
|
||||
?REACT({c100, ?LINE, [sane]}),
|
||||
erase(perhaps_reset_loop),
|
||||
if Sane == true -> ok; true -> ?V("insane-~w-~w@~w,", [MyName, P_newprop#projection_v1.epoch_number, ?LINE]) end, %%% DELME!!!
|
||||
react_to_env_C110(P_latest, S);
|
||||
%% 20150715: I've seen this loop happen with {expected_author2,X}
|
||||
%% where nobody agrees, weird.
|
||||
false when is_record(Flap_latest, flap_i) andalso
|
||||
Flap_latest#flap_i.flapping_me == true ->
|
||||
_ ->
|
||||
?REACT({c100, ?LINE}),
|
||||
?V("\n\n1YOYO ~w breaking the cycle of ~p\n", [MyName, machi_projection:make_summary(P_latest)]),
|
||||
%% This is a fun case. We had just enough asymmetric partition
|
||||
%% to cause the chain to fragment into two *incompatible* and
|
||||
%% *overlapping membership* chains, but the chain fragmentation
|
||||
|
@ -1435,62 +1471,30 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
|
|||
%%
|
||||
%% So, we're going to keep track in #ch_mgr state for the number
|
||||
%% of times that this insane judgement has happened.
|
||||
|
||||
%%
|
||||
%% See also: comment in do_react_to_env() about
|
||||
%% non-flapping-scenario that can also cause us to want to
|
||||
%% collapse to the none_projection to break a
|
||||
%% livelock/infinite loop.
|
||||
react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
|
||||
P_newprop, P_latest, S);
|
||||
{expected_author2,_}=_ExpectedErr when
|
||||
is_record(Flap_latest, flap_i) andalso
|
||||
Flap_latest#flap_i.flapping_me == true ->
|
||||
?REACT({c100, ?LINE}),
|
||||
react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
|
||||
P_newprop, P_latest, S);
|
||||
{expected_author2,_ExpectedAuthor2}=_ExpectedErr ->
|
||||
case get(perhaps_reset_loop) of
|
||||
undefined ->
|
||||
put(perhaps_reset_loop, 1),
|
||||
?REACT({c100, ?LINE, [not_sane, get(why2), _ExpectedErr]}),
|
||||
react_to_env_C300(P_newprop, P_latest, S);
|
||||
X when X > ?TOO_FREQUENT_BREAKER ->
|
||||
%% Ha, yes, this is possible. For example:
|
||||
%% outer: author=e,upi=[b,a,d],repair=[c,e]
|
||||
%% inner: author=e,upi=[b,e], repair=[]
|
||||
%% In this case, the transition from inner to outer by A30
|
||||
%% has chosen the wrong author. We have two choices.
|
||||
%% 1. Accept this transition, because it really was the
|
||||
%% safe & transition-approved UPI+repeairing that we
|
||||
%% were using while we were flapping. I'm 99% certain
|
||||
%% that this is safe. TODO: Verify
|
||||
%% 2. I'm not yet 100% certain that #1 is safe, so instead
|
||||
%% we fall back to the one thing that we know is safe:
|
||||
%% the 'none' projection, which lets the chain rebuild
|
||||
%% itself normally during future iterations.
|
||||
?REACT({c100, ?LINE}),
|
||||
react_to_env_C103(P_latest, S);
|
||||
X ->
|
||||
put(perhaps_reset_loop, X+1),
|
||||
?REACT({c100, ?LINE, [not_sane, get(why2), _ExpectedErr]}),
|
||||
react_to_env_C300(P_newprop, P_latest, S)
|
||||
end;
|
||||
_AnyOtherReturnValue ->
|
||||
%% P_latest is not sane.
|
||||
%% By process of elimination, P_newprop is best,
|
||||
%% so let's write it.
|
||||
?REACT({c100, ?LINE, [not_sane, get(why2), _AnyOtherReturnValue]}),
|
||||
erase(perhaps_reset_loop),
|
||||
react_to_env_C300(P_newprop, P_latest, S)
|
||||
P_newprop, P_latest, S)
|
||||
end.
|
||||
|
||||
react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
|
||||
P_newprop, P_latest, S) ->
|
||||
NotSanesDict = orddict:update_counter(Author_latest, 1, NotSanesDict0),
|
||||
S2 = S#ch_mgr{flap_not_sanes=NotSanesDict},
|
||||
S2 = S#ch_mgr{not_sanes=NotSanesDict},
|
||||
case orddict:fetch(Author_latest, NotSanesDict) of
|
||||
N when N > ?TOO_FREQUENT_BREAKER ->
|
||||
?V("\n\nYOYO ~w breaking the cycle of ~p\n", [MyName, machi_projection:make_summary(P_latest)]),
|
||||
?REACT({c100, ?LINE, [{not_sanes_author_count, N}]}),
|
||||
react_to_env_C103(P_latest, S2);
|
||||
N ->
|
||||
?V("YOYO,~w,~w,~w,",[MyName, P_latest#projection_v1.epoch_number,N]),
|
||||
?REACT({c100, ?LINE, [{not_sanes_author_count, N}]}),
|
||||
%% P_latest is not sane.
|
||||
%% By process of elimination, P_newprop is best,
|
||||
%% so let's write it.
|
||||
react_to_env_C300(P_newprop, P_latest, S2)
|
||||
end.
|
||||
|
||||
|
@ -1507,8 +1511,7 @@ react_to_env_C103(#projection_v1{epoch_number=Epoch_latest,
|
|||
P_none = machi_projection:update_checksum(P_none1),
|
||||
%% Use it, darn it, because it's 100% safe. And exit flapping state.
|
||||
react_to_env_C100(P_none, P_none, S#ch_mgr{flaps=0,
|
||||
flap_start=?NOT_FLAPPING_START,
|
||||
flap_not_sanes=orddict:new()}).
|
||||
flap_start=?NOT_FLAPPING_START}).
|
||||
|
||||
react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) ->
|
||||
?REACT(c110),
|
||||
|
@ -1632,8 +1635,7 @@ react_to_env_C310(P_newprop, S) ->
|
|||
|
||||
calculate_flaps(P_newprop, _P_current, _FlapLimit,
|
||||
#ch_mgr{name=MyName, proj_history=H, flap_start=FlapStart,
|
||||
flaps=Flaps, flap_not_sanes=NotSanesDict0,
|
||||
runenv=RunEnv1}=S) ->
|
||||
flaps=Flaps, runenv=RunEnv1}=S) ->
|
||||
HistoryPs = queue:to_list(H),
|
||||
Ps = HistoryPs ++ [P_newprop],
|
||||
UniqueProposalSummaries = lists:usort([{P#projection_v1.upi,
|
||||
|
@ -1695,7 +1697,6 @@ calculate_flaps(P_newprop, _P_current, _FlapLimit,
|
|||
true ->
|
||||
NewFlapStart = FlapStart
|
||||
end,
|
||||
NotSanesDict = NotSanesDict0,
|
||||
|
||||
%% Wow, this behavior is almost spooky.
|
||||
%%
|
||||
|
@ -1725,7 +1726,6 @@ calculate_flaps(P_newprop, _P_current, _FlapLimit,
|
|||
{_N, _} ->
|
||||
NewFlaps = 0,
|
||||
NewFlapStart = ?NOT_FLAPPING_START,
|
||||
NotSanesDict = orddict:new(),
|
||||
AllFlapCounts = [],
|
||||
AllHosed = []
|
||||
end,
|
||||
|
@ -1748,8 +1748,7 @@ calculate_flaps(P_newprop, _P_current, _FlapLimit,
|
|||
%% It isn't doing what I'd originally intended. Fix it.
|
||||
{machi_projection:update_checksum(P_newprop#projection_v1{
|
||||
flap=FlappingI}),
|
||||
S#ch_mgr{flaps=NewFlaps, flap_start=NewFlapStart,
|
||||
flap_not_sanes=NotSanesDict, runenv=RunEnv1}}.
|
||||
S#ch_mgr{flaps=NewFlaps, flap_start=NewFlapStart, runenv=RunEnv1}}.
|
||||
|
||||
make_flapping_i() ->
|
||||
make_flapping_i({{epk,-1},?NOT_FLAPPING}, 0, [], [], []).
|
||||
|
|
Loading…
Reference in a new issue