WIP: bugfix for rare flapping infinite loop (done^2 fix I hope)
How can even computer? So, there's a flavor of the flapping infinite loop problem that can happen without flapping being detected (by the existing flapping detector, that is). That detector relies on a series of accepted projections converging to a single projection that is repeated X times. However, it's possible for a race with a simulated repair "finishing" to cause a problem where no more projections are ever accepted, which means the detector never sees the repetition it needs and never fires. Oops. See also: new comments in do_react_to_env().
This commit is contained in:
parent
57b7122035
commit
b8c642aaa7
1 changed file with 60 additions and 61 deletions
|
@ -68,7 +68,7 @@
|
||||||
flaps=0 :: integer(),
|
flaps=0 :: integer(),
|
||||||
flap_start = ?NOT_FLAPPING_START
|
flap_start = ?NOT_FLAPPING_START
|
||||||
:: {{'epk', integer()}, erlang:timestamp()},
|
:: {{'epk', integer()}, erlang:timestamp()},
|
||||||
flap_not_sanes :: orddict:orddict(),
|
not_sanes :: orddict:orddict(),
|
||||||
repair_worker :: 'undefined' | pid(),
|
repair_worker :: 'undefined' | pid(),
|
||||||
repair_start :: 'undefined' | erlang:timestamp(),
|
repair_start :: 'undefined' | erlang:timestamp(),
|
||||||
repair_final_status :: 'undefined' | term(),
|
repair_final_status :: 'undefined' | term(),
|
||||||
|
@ -226,7 +226,7 @@ init({MyName, InitMembersDict, MgrOpts}) ->
|
||||||
flap_limit=length(All_list) + 50,
|
flap_limit=length(All_list) + 50,
|
||||||
timer='undefined',
|
timer='undefined',
|
||||||
proj_history=queue:new(),
|
proj_history=queue:new(),
|
||||||
flap_not_sanes=orddict:new(),
|
not_sanes=orddict:new(),
|
||||||
runenv=RunEnv,
|
runenv=RunEnv,
|
||||||
opts=MgrOpts},
|
opts=MgrOpts},
|
||||||
{_, S2} = do_set_chain_members_dict(MembersDict, S),
|
{_, S2} = do_set_chain_members_dict(MembersDict, S),
|
||||||
|
@ -805,8 +805,46 @@ do_react_to_env(#ch_mgr{name=MyName,
|
||||||
S2#ch_mgr{proj=NewProj, members_dict=NewMembersDict}}
|
S2#ch_mgr{proj=NewProj, members_dict=NewMembersDict}}
|
||||||
end;
|
end;
|
||||||
do_react_to_env(S) ->
|
do_react_to_env(S) ->
|
||||||
|
%% The not_sanes manager counting dictionary is not strictly
|
||||||
|
%% limited to flapping scenarios. (Though the mechanism first
|
||||||
|
%% started as a way to deal with rare flapping scenarios.)
|
||||||
|
%%
|
||||||
|
%% I believe that the problem cannot happen in real life, but it can
|
||||||
|
%% happen in simulated environments, especially if the simulation for
|
||||||
|
%% repair can be approximately infinitely fast.
|
||||||
|
%%
|
||||||
|
%% For example:
|
||||||
|
%% P_current: epoch=1135, UPI=[b,e,a], Repairing=[c,d], author=e
|
||||||
|
%%
|
||||||
|
%% Now a partition happens, a & b are on an island, c & d & e on
|
||||||
|
%% the other island.
|
||||||
|
%%
|
||||||
|
%% P_newprop: epoch=1136, UPI=[e,c], Repairing=[d], author=e
|
||||||
|
%%
|
||||||
|
%% Why does e think that this is feasible? Well, the old UPI was
|
||||||
|
%% [b,e,a], and we know that a & b are partitioned away from e.
|
||||||
|
%% Therefore e chooses the best UPI, [e]. However, the simulator
|
||||||
|
%% now also says, hey, there are nodes in the repairing list, so
|
||||||
|
%% let's simulate a repair ... and the repair goes infinitely
|
||||||
|
%% quickly ...and the epoch is stable during the repair period
|
||||||
|
%% (i.e., both e/repairer and c/repairee remained in the same
|
||||||
|
%% epoch 1135) ... so e decides that the simulated repair is
|
||||||
|
%% "finished" and it's time to add the repairee to the tail of the
|
||||||
|
%% UPI ... so that's why 1136's UPI=[e,c].
|
||||||
|
%%
|
||||||
|
%% I'll try to add a condition to the simulated repair to try to
|
||||||
|
%% make slightly fewer assumptions in a row. However, I believe
|
||||||
|
%% it's a good idea to keep this too-many-not_sane-transition-
|
||||||
|
%% attempts counter very generic (i.e., not specific for flapping
|
||||||
|
%% as it once was).
|
||||||
|
%%
|
||||||
|
%% The not_sanes counter dict should be reset each time we start
|
||||||
|
%% an iteration. One could argue that state only for a single
|
||||||
|
%% iteration shouldn't go in #ch_mgr but should be a separate arg
|
||||||
|
%% threaded through each of the FSM funcs.
|
||||||
|
%% TODO possible refactoring task?
|
||||||
put(react, []),
|
put(react, []),
|
||||||
react_to_env_A10(S).
|
react_to_env_A10(S#ch_mgr{not_sanes=orddict:new()}).
|
||||||
|
|
||||||
react_to_env_A10(S) ->
|
react_to_env_A10(S) ->
|
||||||
?REACT(a10),
|
?REACT(a10),
|
||||||
|
@ -986,7 +1024,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
|
||||||
?REACT({a30, ?LINE, [{inner_summary,
|
?REACT({a30, ?LINE, [{inner_summary,
|
||||||
machi_projection:make_summary(P_inner2)}]}),
|
machi_projection:make_summary(P_inner2)}]}),
|
||||||
%% Adjust the outer projection's #flap_i info.
|
%% Adjust the outer projection's #flap_i info.
|
||||||
?V("~w,", [{'YOYO',MyName,NewEpoch}]),
|
?V("~w,", [{'FLAP',MyName,NewEpoch}]),
|
||||||
#projection_v1{flap=OldFlap} = P_newprop3,
|
#projection_v1{flap=OldFlap} = P_newprop3,
|
||||||
NewFlap = OldFlap#flap_i{flapping_me=true},
|
NewFlap = OldFlap#flap_i{flapping_me=true},
|
||||||
?REACT({a30, ?LINE, [flap_continue,
|
?REACT({a30, ?LINE, [flap_continue,
|
||||||
|
@ -1360,7 +1398,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
|
||||||
react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
|
react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
|
||||||
flap=Flap_latest0}=P_latest,
|
flap=Flap_latest0}=P_latest,
|
||||||
#ch_mgr{name=MyName, proj=P_current,
|
#ch_mgr{name=MyName, proj=P_current,
|
||||||
flap_not_sanes=NotSanesDict0}=S) ->
|
not_sanes=NotSanesDict0}=S) ->
|
||||||
?REACT(c100),
|
?REACT(c100),
|
||||||
|
|
||||||
Sane = projection_transition_is_sane(P_current, P_latest, MyName),
|
Sane = projection_transition_is_sane(P_current, P_latest, MyName),
|
||||||
|
@ -1382,18 +1420,16 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
|
||||||
_ when P_current#projection_v1.epoch_number == 0 ->
|
_ when P_current#projection_v1.epoch_number == 0 ->
|
||||||
%% Epoch == 0 is reserved for first-time, just booting conditions.
|
%% Epoch == 0 is reserved for first-time, just booting conditions.
|
||||||
?REACT({c100, ?LINE, [first_write]}),
|
?REACT({c100, ?LINE, [first_write]}),
|
||||||
erase(perhaps_reset_loop),
|
if Sane == true -> ok; true -> ?V("insane-~w-~w@~w,", [MyName, P_newprop#projection_v1.epoch_number, ?LINE]) end, %%% DELME!!!
|
||||||
react_to_env_C110(P_latest, S);
|
react_to_env_C110(P_latest, S);
|
||||||
true ->
|
true ->
|
||||||
?REACT({c100, ?LINE, [sane]}),
|
?REACT({c100, ?LINE, [sane]}),
|
||||||
erase(perhaps_reset_loop),
|
if Sane == true -> ok; true -> ?V("insane-~w-~w@~w,", [MyName, P_newprop#projection_v1.epoch_number, ?LINE]) end, %%% DELME!!!
|
||||||
react_to_env_C110(P_latest, S);
|
react_to_env_C110(P_latest, S);
|
||||||
%% 20150715: I've seen this loop happen with {expected_author2,X}
|
%% 20150715: I've seen this loop happen with {expected_author2,X}
|
||||||
%% where nobody agrees, weird.
|
%% where nobody agrees, weird.
|
||||||
false when is_record(Flap_latest, flap_i) andalso
|
_ ->
|
||||||
Flap_latest#flap_i.flapping_me == true ->
|
|
||||||
?REACT({c100, ?LINE}),
|
?REACT({c100, ?LINE}),
|
||||||
?V("\n\n1YOYO ~w breaking the cycle of ~p\n", [MyName, machi_projection:make_summary(P_latest)]),
|
|
||||||
%% This is a fun case. We had just enough asymmetric partition
|
%% This is a fun case. We had just enough asymmetric partition
|
||||||
%% to cause the chain to fragment into two *incompatible* and
|
%% to cause the chain to fragment into two *incompatible* and
|
||||||
%% *overlapping membership* chains, but the chain fragmentation
|
%% *overlapping membership* chains, but the chain fragmentation
|
||||||
|
@ -1435,62 +1471,30 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
|
||||||
%%
|
%%
|
||||||
%% So, we're going to keep track in #ch_mgr state for the number
|
%% So, we're going to keep track in #ch_mgr state for the number
|
||||||
%% of times that this insane judgement has happened.
|
%% of times that this insane judgement has happened.
|
||||||
|
%%
|
||||||
|
%% See also: comment in do_react_to_env() about
|
||||||
|
%% non-flapping-scenario that can also cause us to want to
|
||||||
|
%% collapse to the none_projection to break a
|
||||||
|
%% livelock/infinite loop.
|
||||||
react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
|
react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
|
||||||
P_newprop, P_latest, S);
|
P_newprop, P_latest, S)
|
||||||
{expected_author2,_}=_ExpectedErr when
|
|
||||||
is_record(Flap_latest, flap_i) andalso
|
|
||||||
Flap_latest#flap_i.flapping_me == true ->
|
|
||||||
?REACT({c100, ?LINE}),
|
|
||||||
react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
|
|
||||||
P_newprop, P_latest, S);
|
|
||||||
{expected_author2,_ExpectedAuthor2}=_ExpectedErr ->
|
|
||||||
case get(perhaps_reset_loop) of
|
|
||||||
undefined ->
|
|
||||||
put(perhaps_reset_loop, 1),
|
|
||||||
?REACT({c100, ?LINE, [not_sane, get(why2), _ExpectedErr]}),
|
|
||||||
react_to_env_C300(P_newprop, P_latest, S);
|
|
||||||
X when X > ?TOO_FREQUENT_BREAKER ->
|
|
||||||
%% Ha, yes, this is possible. For example:
|
|
||||||
%% outer: author=e,upi=[b,a,d],repair=[c,e]
|
|
||||||
%% inner: author=e,upi=[b,e], repair=[]
|
|
||||||
%% In this case, the transition from inner to outer by A30
|
|
||||||
%% has chosen the wrong author. We have two choices.
|
|
||||||
%% 1. Accept this transition, because it really was the
|
|
||||||
%% safe & transition-approved UPI+repeairing that we
|
|
||||||
%% were using while we were flapping. I'm 99% certain
|
|
||||||
%% that this is safe. TODO: Verify
|
|
||||||
%% 2. I'm not yet 100% certain that #1 is safe, so instead
|
|
||||||
%% we fall back to the one thing that we know is safe:
|
|
||||||
%% the 'none' projection, which lets the chain rebuild
|
|
||||||
%% itself normally during future iterations.
|
|
||||||
?REACT({c100, ?LINE}),
|
|
||||||
react_to_env_C103(P_latest, S);
|
|
||||||
X ->
|
|
||||||
put(perhaps_reset_loop, X+1),
|
|
||||||
?REACT({c100, ?LINE, [not_sane, get(why2), _ExpectedErr]}),
|
|
||||||
react_to_env_C300(P_newprop, P_latest, S)
|
|
||||||
end;
|
|
||||||
_AnyOtherReturnValue ->
|
|
||||||
%% P_latest is not sane.
|
|
||||||
%% By process of elimination, P_newprop is best,
|
|
||||||
%% so let's write it.
|
|
||||||
?REACT({c100, ?LINE, [not_sane, get(why2), _AnyOtherReturnValue]}),
|
|
||||||
erase(perhaps_reset_loop),
|
|
||||||
react_to_env_C300(P_newprop, P_latest, S)
|
|
||||||
end.
|
end.
|
||||||
|
|
||||||
react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
|
react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
|
||||||
P_newprop, P_latest, S) ->
|
P_newprop, P_latest, S) ->
|
||||||
NotSanesDict = orddict:update_counter(Author_latest, 1, NotSanesDict0),
|
NotSanesDict = orddict:update_counter(Author_latest, 1, NotSanesDict0),
|
||||||
S2 = S#ch_mgr{flap_not_sanes=NotSanesDict},
|
S2 = S#ch_mgr{not_sanes=NotSanesDict},
|
||||||
case orddict:fetch(Author_latest, NotSanesDict) of
|
case orddict:fetch(Author_latest, NotSanesDict) of
|
||||||
N when N > ?TOO_FREQUENT_BREAKER ->
|
N when N > ?TOO_FREQUENT_BREAKER ->
|
||||||
?V("\n\nYOYO ~w breaking the cycle of ~p\n", [MyName, machi_projection:make_summary(P_latest)]),
|
?V("\n\nYOYO ~w breaking the cycle of ~p\n", [MyName, machi_projection:make_summary(P_latest)]),
|
||||||
?REACT({c100, ?LINE, [{not_sanes_author_count, N}]}),
|
?REACT({c100, ?LINE, [{not_sanes_author_count, N}]}),
|
||||||
react_to_env_C103(P_latest, S2);
|
react_to_env_C103(P_latest, S2);
|
||||||
N ->
|
N ->
|
||||||
|
?V("YOYO,~w,~w,~w,",[MyName, P_latest#projection_v1.epoch_number,N]),
|
||||||
?REACT({c100, ?LINE, [{not_sanes_author_count, N}]}),
|
?REACT({c100, ?LINE, [{not_sanes_author_count, N}]}),
|
||||||
|
%% P_latest is not sane.
|
||||||
|
%% By process of elimination, P_newprop is best,
|
||||||
|
%% so let's write it.
|
||||||
react_to_env_C300(P_newprop, P_latest, S2)
|
react_to_env_C300(P_newprop, P_latest, S2)
|
||||||
end.
|
end.
|
||||||
|
|
||||||
|
@ -1507,8 +1511,7 @@ react_to_env_C103(#projection_v1{epoch_number=Epoch_latest,
|
||||||
P_none = machi_projection:update_checksum(P_none1),
|
P_none = machi_projection:update_checksum(P_none1),
|
||||||
%% Use it, darn it, because it's 100% safe. And exit flapping state.
|
%% Use it, darn it, because it's 100% safe. And exit flapping state.
|
||||||
react_to_env_C100(P_none, P_none, S#ch_mgr{flaps=0,
|
react_to_env_C100(P_none, P_none, S#ch_mgr{flaps=0,
|
||||||
flap_start=?NOT_FLAPPING_START,
|
flap_start=?NOT_FLAPPING_START}).
|
||||||
flap_not_sanes=orddict:new()}).
|
|
||||||
|
|
||||||
react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) ->
|
react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) ->
|
||||||
?REACT(c110),
|
?REACT(c110),
|
||||||
|
@ -1632,8 +1635,7 @@ react_to_env_C310(P_newprop, S) ->
|
||||||
|
|
||||||
calculate_flaps(P_newprop, _P_current, _FlapLimit,
|
calculate_flaps(P_newprop, _P_current, _FlapLimit,
|
||||||
#ch_mgr{name=MyName, proj_history=H, flap_start=FlapStart,
|
#ch_mgr{name=MyName, proj_history=H, flap_start=FlapStart,
|
||||||
flaps=Flaps, flap_not_sanes=NotSanesDict0,
|
flaps=Flaps, runenv=RunEnv1}=S) ->
|
||||||
runenv=RunEnv1}=S) ->
|
|
||||||
HistoryPs = queue:to_list(H),
|
HistoryPs = queue:to_list(H),
|
||||||
Ps = HistoryPs ++ [P_newprop],
|
Ps = HistoryPs ++ [P_newprop],
|
||||||
UniqueProposalSummaries = lists:usort([{P#projection_v1.upi,
|
UniqueProposalSummaries = lists:usort([{P#projection_v1.upi,
|
||||||
|
@ -1695,7 +1697,6 @@ calculate_flaps(P_newprop, _P_current, _FlapLimit,
|
||||||
true ->
|
true ->
|
||||||
NewFlapStart = FlapStart
|
NewFlapStart = FlapStart
|
||||||
end,
|
end,
|
||||||
NotSanesDict = NotSanesDict0,
|
|
||||||
|
|
||||||
%% Wow, this behavior is almost spooky.
|
%% Wow, this behavior is almost spooky.
|
||||||
%%
|
%%
|
||||||
|
@ -1725,7 +1726,6 @@ calculate_flaps(P_newprop, _P_current, _FlapLimit,
|
||||||
{_N, _} ->
|
{_N, _} ->
|
||||||
NewFlaps = 0,
|
NewFlaps = 0,
|
||||||
NewFlapStart = ?NOT_FLAPPING_START,
|
NewFlapStart = ?NOT_FLAPPING_START,
|
||||||
NotSanesDict = orddict:new(),
|
|
||||||
AllFlapCounts = [],
|
AllFlapCounts = [],
|
||||||
AllHosed = []
|
AllHosed = []
|
||||||
end,
|
end,
|
||||||
|
@ -1748,8 +1748,7 @@ calculate_flaps(P_newprop, _P_current, _FlapLimit,
|
||||||
%% It isn't doing what I'd originally intended. Fix it.
|
%% It isn't doing what I'd originally intended. Fix it.
|
||||||
{machi_projection:update_checksum(P_newprop#projection_v1{
|
{machi_projection:update_checksum(P_newprop#projection_v1{
|
||||||
flap=FlappingI}),
|
flap=FlappingI}),
|
||||||
S#ch_mgr{flaps=NewFlaps, flap_start=NewFlapStart,
|
S#ch_mgr{flaps=NewFlaps, flap_start=NewFlapStart, runenv=RunEnv1}}.
|
||||||
flap_not_sanes=NotSanesDict, runenv=RunEnv1}}.
|
|
||||||
|
|
||||||
make_flapping_i() ->
|
make_flapping_i() ->
|
||||||
make_flapping_i({{epk,-1},?NOT_FLAPPING}, 0, [], [], []).
|
make_flapping_i({{epk,-1},?NOT_FLAPPING}, 0, [], [], []).
|
||||||
|
|
Loading…
Reference in a new issue