WIP: bugfix for rare flapping infinite loop (done^2 fix I hope)

How can even computer? So, there's a flavor of the flapping infinite loop problem that can happen without flapping being detected (by the existing flapping detector, that is). That detector relies on a series of accepted projections converging to a single projection repeated X times. However, a race with a simulated repair "finishing" can leave the chain in a state where no more projections are ever accepted, so the detector never has a chance to fire. Oops. See also: new comments in do_react_to_env().
2015-07-19 00:43:10 +09:00 · 2015-07-19 00:43:10 +09:00 · b8c642aaa7
commit b8c642aaa7
parent 57b7122035
1 changed files with 60 additions and 61 deletions
--- a/src/machi_chain_manager1.erl
+++ b/src/machi_chain_manager1.erl
@ -68,7 +68,7 @@
          flaps=0         :: integer(),
          flap_start = ?NOT_FLAPPING_START
                          :: {{'epk', integer()}, erlang:timestamp()},
-          flap_not_sanes  :: orddict:orddict(),
+          not_sanes       :: orddict:orddict(),
          repair_worker   :: 'undefined' | pid(),
          repair_start    :: 'undefined' | erlang:timestamp(),
          repair_final_status :: 'undefined' | term(),
@ -226,7 +226,7 @@ init({MyName, InitMembersDict, MgrOpts}) ->
                flap_limit=length(All_list) + 50,
                timer='undefined',
                proj_history=queue:new(),
-                flap_not_sanes=orddict:new(),
+                not_sanes=orddict:new(),
                runenv=RunEnv,
                opts=MgrOpts},
    {_, S2} = do_set_chain_members_dict(MembersDict, S),
@ -805,8 +805,46 @@ do_react_to_env(#ch_mgr{name=MyName,
             S2#ch_mgr{proj=NewProj, members_dict=NewMembersDict}}
    end;
 do_react_to_env(S) ->
+    %% The not_sanes counting dictionary in the manager's state is not
+    %% strictly limited to flapping scenarios.  (Though the mechanism
+    %% first started as a way to deal with rare flapping scenarios.)
+    %%
+    %% I believe that the infinite loop described below cannot happen in
+    %% real life, but it can happen in simulated environments, especially
+    %% if the simulated repair can be approximately infinitely fast.
+    %%
+    %% For example:
+    %%   P_current: epoch=1135, UPI=[b,e,a], Repairing=[c,d], author=e
+    %%
+    %%   Now a partition happens, a & b are on an island, c & d & e on
+    %%   the other island.
+    %%
+    %%   P_newprop: epoch=1136, UPI=[e,c], Repairing=[d], author=e
+    %%
+    %% Why does e think that this is feasible?  Well, the old UPI was
+    %% [b,e,a], and we know that a & b are partitioned away from e.
+    %% Therefore e chooses the best UPI, [e].  However, the simulator
+    %% now also says, hey, there are nodes in the repairing list, so
+    %% let's simulate a repair ... and the repair goes infinitely
+    %% quickly ... and the epoch is stable during the repair period
+    %% (i.e., both e/repairer and c/repairee remained in the same
+    %% epoch 1135) ... so e decides that the simulated repair is
+    %% "finished" and it's time to add the repairee to the tail of the
+    %% UPI ... so that's why 1136's UPI=[e,c].
+    %%
+    %% I'll try to add a condition to the simulated repair to try to
+    %% make slightly fewer assumptions in a row.  However, I believe
+    %% it's a good idea to keep this too-many-not_sane-transition-
+    %% attempts counter very generic (i.e., not specific for flapping
+    %% as it once was).
+    %%
+    %% The not_sanes counter dict should be reset each time we start
+    %% an iteration.  One could argue that state only for a single
+    %% iteration shouldn't go in #ch_mgr but should be a separate arg
+    %% threaded through each of the FSM funcs.
+    %% TODO possible refactoring task?
    put(react, []),
-    react_to_env_A10(S).
+    react_to_env_A10(S#ch_mgr{not_sanes=orddict:new()}).

 react_to_env_A10(S) ->
    ?REACT(a10),
@ -986,7 +1024,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
                ?REACT({a30, ?LINE, [{inner_summary,
                                    machi_projection:make_summary(P_inner2)}]}),
                %% Adjust the outer projection's #flap_i info.
-                ?V("~w,", [{'YOYO',MyName,NewEpoch}]),
+                ?V("~w,", [{'FLAP',MyName,NewEpoch}]),
                #projection_v1{flap=OldFlap} = P_newprop3,
                NewFlap = OldFlap#flap_i{flapping_me=true},
                ?REACT({a30, ?LINE, [flap_continue,
@ -1360,7 +1398,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
 react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
                                            flap=Flap_latest0}=P_latest,
                  #ch_mgr{name=MyName, proj=P_current,
-                          flap_not_sanes=NotSanesDict0}=S) ->
+                          not_sanes=NotSanesDict0}=S) ->
    ?REACT(c100),

    Sane = projection_transition_is_sane(P_current, P_latest, MyName),
@ -1382,18 +1420,16 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
        _ when P_current#projection_v1.epoch_number == 0 ->
            %% Epoch == 0 is reserved for first-time, just booting conditions.
            ?REACT({c100, ?LINE, [first_write]}),
-            erase(perhaps_reset_loop),
+            if Sane == true -> ok;  true -> ?V("insane-~w-~w@~w,", [MyName, P_newprop#projection_v1.epoch_number, ?LINE]) end, %%% DELME!!!
            react_to_env_C110(P_latest, S);
        true ->
            ?REACT({c100, ?LINE, [sane]}),
-            erase(perhaps_reset_loop),
+            if Sane == true -> ok;  true -> ?V("insane-~w-~w@~w,", [MyName, P_newprop#projection_v1.epoch_number, ?LINE]) end, %%% DELME!!!
            react_to_env_C110(P_latest, S);
        %% 20150715: I've seen this loop happen with {expected_author2,X}
        %% where nobody agrees, weird.
-        false when is_record(Flap_latest, flap_i) andalso
-                   Flap_latest#flap_i.flapping_me == true ->
+        _ ->
            ?REACT({c100, ?LINE}),
-            ?V("\n\n1YOYO ~w breaking the cycle of ~p\n", [MyName, machi_projection:make_summary(P_latest)]),
            %% This is a fun case.  We had just enough asymmetric partition
            %% to cause the chain to fragment into two *incompatible* and
            %% *overlapping membership* chains, but the chain fragmentation
@ -1435,62 +1471,30 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
            %%
            %% So, we're going to keep track in #ch_mgr state for the number
            %% of times that this insane judgement has happened.
-
+            %%
+            %% See also: comment in do_react_to_env() about
+            %% non-flapping-scenario that can also cause us to want to
+            %% collapse to the none_projection to break a
+            %% livelock/infinite loop.
            react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
-                                    P_newprop, P_latest, S);
-        {expected_author2,_}=_ExpectedErr when
-                   is_record(Flap_latest, flap_i) andalso
-                   Flap_latest#flap_i.flapping_me == true ->
-            ?REACT({c100, ?LINE}),
-            react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
-                                    P_newprop, P_latest, S);
-        {expected_author2,_ExpectedAuthor2}=_ExpectedErr ->
-            case get(perhaps_reset_loop) of
-                undefined ->
-                    put(perhaps_reset_loop, 1),
-                    ?REACT({c100, ?LINE, [not_sane, get(why2), _ExpectedErr]}),
-                    react_to_env_C300(P_newprop, P_latest, S);
-                X when X > ?TOO_FREQUENT_BREAKER ->
-                    %% Ha, yes, this is possible.  For example:
-                    %% outer: author=e,upi=[b,a,d],repair=[c,e]
-                    %% inner: author=e,upi=[b,e],  repair=[]
-                    %% In this case, the transition from inner to outer by A30
-                    %% has chosen the wrong author.  We have two choices.
-                    %% 1. Accept this transition, because it really was the
-                    %%    safe & transition-approved UPI+repeairing that we
-                    %%    were using while we were flapping.  I'm 99% certain
-                    %%    that this is safe.  TODO: Verify
-                    %% 2. I'm not yet 100% certain that #1 is safe, so instead
-                    %%    we fall back to the one thing that we know is safe:
-                    %%    the 'none' projection, which lets the chain rebuild
-                    %%    itself normally during future iterations.
-                    ?REACT({c100, ?LINE}),
-                    react_to_env_C103(P_latest, S);
-                X ->
-                    put(perhaps_reset_loop, X+1),
-                    ?REACT({c100, ?LINE, [not_sane, get(why2), _ExpectedErr]}),
-                    react_to_env_C300(P_newprop, P_latest, S)
-            end;
-        _AnyOtherReturnValue ->
-            %% P_latest is not sane.
-            %% By process of elimination, P_newprop is best,
-            %% so let's write it.
-            ?REACT({c100, ?LINE, [not_sane, get(why2), _AnyOtherReturnValue]}),
-            erase(perhaps_reset_loop),
-            react_to_env_C300(P_newprop, P_latest, S)
+                                    P_newprop, P_latest, S)
    end.

 react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
                        P_newprop, P_latest, S) ->
    NotSanesDict = orddict:update_counter(Author_latest, 1, NotSanesDict0),
-    S2 = S#ch_mgr{flap_not_sanes=NotSanesDict},
+    S2 = S#ch_mgr{not_sanes=NotSanesDict},
    case orddict:fetch(Author_latest, NotSanesDict) of
        N when N > ?TOO_FREQUENT_BREAKER ->
            ?V("\n\nYOYO ~w breaking the cycle of ~p\n", [MyName, machi_projection:make_summary(P_latest)]),
            ?REACT({c100, ?LINE, [{not_sanes_author_count, N}]}),
            react_to_env_C103(P_latest, S2);
        N ->
+           ?V("YOYO,~w,~w,~w,",[MyName, P_latest#projection_v1.epoch_number,N]),
            ?REACT({c100, ?LINE, [{not_sanes_author_count, N}]}),
+            %% P_latest is not sane.
+            %% By process of elimination, P_newprop is best,
+            %% so let's write it.
            react_to_env_C300(P_newprop, P_latest, S2)
    end.

@ -1507,8 +1511,7 @@ react_to_env_C103(#projection_v1{epoch_number=Epoch_latest,
    P_none = machi_projection:update_checksum(P_none1),
    %% Use it, darn it, because it's 100% safe.  And exit flapping state.
    react_to_env_C100(P_none, P_none, S#ch_mgr{flaps=0,
-                                               flap_start=?NOT_FLAPPING_START,
-                                               flap_not_sanes=orddict:new()}).
+                                               flap_start=?NOT_FLAPPING_START}).

 react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) ->
    ?REACT(c110),
@ -1632,8 +1635,7 @@ react_to_env_C310(P_newprop, S) ->

 calculate_flaps(P_newprop, _P_current, _FlapLimit,
                #ch_mgr{name=MyName, proj_history=H, flap_start=FlapStart,
-                        flaps=Flaps, flap_not_sanes=NotSanesDict0,
-                        runenv=RunEnv1}=S) ->
+                        flaps=Flaps, runenv=RunEnv1}=S) ->
    HistoryPs = queue:to_list(H),
    Ps = HistoryPs ++ [P_newprop],
    UniqueProposalSummaries = lists:usort([{P#projection_v1.upi,
@ -1695,7 +1697,6 @@ calculate_flaps(P_newprop, _P_current, _FlapLimit,
               true ->
                    NewFlapStart = FlapStart
            end,
-            NotSanesDict = NotSanesDict0,

            %% Wow, this behavior is almost spooky.
            %%
@ -1725,7 +1726,6 @@ calculate_flaps(P_newprop, _P_current, _FlapLimit,
        {_N, _} ->
            NewFlaps = 0,
            NewFlapStart = ?NOT_FLAPPING_START,
-            NotSanesDict = orddict:new(),
            AllFlapCounts = [],
            AllHosed = []
    end,
@ -1748,8 +1748,7 @@ calculate_flaps(P_newprop, _P_current, _FlapLimit,
    %% It isn't doing what I'd originally intended.  Fix it.
    {machi_projection:update_checksum(P_newprop#projection_v1{
                                                         flap=FlappingI}),
-     S#ch_mgr{flaps=NewFlaps, flap_start=NewFlapStart,
-              flap_not_sanes=NotSanesDict, runenv=RunEnv1}}.
+     S#ch_mgr{flaps=NewFlaps, flap_start=NewFlapStart, runenv=RunEnv1}}.

 make_flapping_i() ->
    make_flapping_i({{epk,-1},?NOT_FLAPPING}, 0, [], [], []).