From 6c543dfc1854ed1692da3d2122ae5411e6b5562d Mon Sep 17 00:00:00 2001
From: Scott Lystig Fritchie
Date: Mon, 14 Sep 2015 14:42:52 +0900
Subject: [PATCH] Re-use the flapping criteria for a different use (more)

Hooray, very early on I hit a simulator example that kicked in and
tested this change.  (A deterministic fault injection method for
testing would probably also be valuable.)

    machi_chain_manager1_converge_demo:t(7, [{private_write_verbose,true}]).

We switched partitions in the simulator like this:

    SET partitions = [{b,f},{c,f},{d,e},{f,e}] (2 of 90252) at {14,37,5}
    ...
    Stable projection at epoch 1429 upi=[b,c,g,a,d],repairing=[]
    ...
    SET partitions = [{b,d},{c,b},{d,c},{f,a}] (3 of 90252) at {14,37,44}

Part of the chain reassembled quickly through the following UPIs:
[g], then [g,e], then [g,e,f], via a series of successful simulated
repairs.  For the first two repairs, all parties (e & f & g) were
unanimous about the projections.  For the final repair, strangely,
not all three adopted the [g,e,f] chain: e said nothing, while f & g
used it.

Also weird: g then immediately moved f back to repairing,
upi=[g,e],repairing=[f].  Then e also adopted this chain of 2.  From
that point forward, f kept trying to use upi=[g,e,f],[] and the
others kept trying to use only upi=[g,e],[f].  There were lots of
messages from g saying that it's insane (correctly!) to try
calc=1487:[g,e],[f] -> 1494:[g,e,f],[] without a valid repair author.

It's worth checking why g dropped from [g,e,f] -> [g,e].  But even
so, this new use of the flapping counter & reset via C103 is working.

... Ah, now I understand.  The very occasional undefined socket bug
in machi_flu1_client appears to be the cause: g had a one-time
problem talking with f and so decided that f had been down long
enough to warrant the shorter UPI.  The other participants didn't
have any such problem with f and so kept f in the UPI.  This would
have been a deadlock/infinite-loop case without someone deciding to
reset state.
---
 src/machi_chain_manager1.erl | 79 +++++++++++++++++++++++-------------
 1 file changed, 51 insertions(+), 28 deletions(-)

diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl
index 0a5d101..276498d 100644
--- a/src/machi_chain_manager1.erl
+++ b/src/machi_chain_manager1.erl
@@ -102,6 +102,9 @@
 %% Amount of epoch number skip-ahead for set_chain_members call
 -define(SET_CHAIN_MEMBERS_EPOCH_SKIP, 1111).
 
+%% Maximum length of the history of adopted projections (via C120).
+-define(MAX_HISTORY_LENGTH, 30).
+
 %% API
 -export([start_link/2, start_link/3, stop/1, ping/1,
          set_chain_members/2, set_chain_members/3, set_active/2,
@@ -1487,26 +1490,6 @@ io:format(user, "CONFIRM debug question line ~w\n", [?LINE]),
             end
     end.
 
-react_to_env_A49(P_latest, FinalProps, #ch_mgr{consistency_mode=cp_mode,
-                                               proj=P_current} = S) ->
-    ?REACT(a49),
-    %% Using the none projection as our new P_current does *not* work:
-    %% if we forget what P_current is, then we risk not being able to
-    %% detect an insane chain transition or else risk a false positive
-    %% insane check.
-    %%
-    %% Instead, we will create an implicit annotation in P_current
-    %% that will force A29 to always use the projection from
-    %% make_zerf() as the basis for our next transition calculations.
-    %% In this wacky case, we break the checksum on P_current so that
-    %% A29's epoch_id comparison will always be unequal and thus
-    %% always trigger make_zerf().
-    Dbg = P_current#projection_v1.dbg,
-    P_current2 = P_current#projection_v1{epoch_csum= <<"broken">>,
-                                         dbg=[{zerf_backstop,true},
-                                              {zerf_in,a49}|Dbg]},
-    react_to_env_A50(P_latest, FinalProps, set_proj(S, P_current2)).
-
 react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) ->
     ?REACT(a50),
     ?REACT({a50, ?LINE, [{current_epoch, P_current#projection_v1.epoch_number},
@@ -1519,7 +1502,8 @@ react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) ->
 
 react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, P_current_calc,
                  _AmHosedP, Rank_newprop, Rank_latest,
-                 #ch_mgr{name=MyName, proj=P_current}=S) ->
+                 #ch_mgr{name=MyName, proj=P_current,
+                         proj_history=History}=S) ->
     ?REACT(b10),
     P_current_upi = if is_record(P_current, projection_v1) ->
@@ -1556,9 +1540,50 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, P_current_calc,
                           true ->
                                true
                        end,
+    HistoryList = queue:to_list(History),
+    UniqueHistories = lists:usort(HistoryList),
+    UniqueHistoryTrigger_p = length(HistoryList) > (?MAX_HISTORY_LENGTH-1)
+                             andalso case UniqueHistories of
+                                         [ {[],[]} ]       -> false;
+                                         [ {[MyName],[]} ] -> false;
+                                         [ _ ]             -> true;
+                                         _                 -> false
+                                     end,
     ?REACT({b10,?LINE,[{newprop_epoch,P_newprop#projection_v1.epoch_number},
-                       {is_relevant_to_me_p,IsRelevantToMe_p}]}),
+                       {is_relevant_to_me_p,IsRelevantToMe_p},
+                       {unique_histories,UniqueHistories},
+                       {unique_history_trigger_p,UniqueHistoryTrigger_p}]}),
     if
+        UniqueHistoryTrigger_p ->
+            ?REACT({b10, ?LINE, []}),
+            %% We need to do something drastic: we're repeating ourselves.  In
+            %% a former version of this code, we called this "flapping" and
+            %% would enter an alternative projection calculation mode in order
+            %% to dampen the flapping.  In today's version of the code, we use
+            %% the machi_fitness service to help us figure out who is causing
+            %% the flapping so that we can exclude them from our projection
+            %% calculations.
+            %%
+            %% In this hypothetical example (i.e., I haven't witnessed this
+            %% actually happening with this code, but it's a valid scenario, I
+            %% believe), with a chain length of 7, during an asymmetric
+            %% partition scenario, we could have:
+            %%     f suggests        : upi=[b,g,f], repairing=[c]
+            %%     c,b,g all suggest : upi=[c,b,g], repairing=[f]
+            %%
+            %% The projection ranking and UPI lengths are the same, but the
+            %% UPIs are incompatible.  In the Git history, very recently, we
+            %% hit a case very similar to this one.  That fix, after more
+            %% thought, wasn't sufficient.  A more comprehensive fix, I believe,
+            %% is to re-use the older "flapping" detection mechanism for this
+            %% worst-case condition: go to C103, fall back to the shortest safe
+            %% projection, tell the fitness server that we're administratively
+            %% down for a short while (to signal to other state machines that
+            %% they need to adapt to our bad situation), and then resume.
+
+            io:format(user, "\nCONFIRM dbg *************************** ~w UniqueHistoryTrigger_p\n", [MyName]),
+            react_to_env_C103(P_newprop, P_latest, P_current_calc, S);
+
         LatestUnanimousP andalso IsRelevantToMe_p ->
             ?REACT({b10, ?LINE,
                     [{latest_unanimous_p, LatestUnanimousP},
@@ -1671,8 +1696,7 @@ react_to_env_C100(P_newprop,
     end.
 
 react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
-                        P_newprop, P_latest, P_current_calc,
-                        #ch_mgr{consistency_mode=CMode} = S) ->
+                        P_newprop, P_latest, P_current_calc, S) ->
     NotSanesDict = orddict:update_counter(Author_latest, 1, NotSanesDict0),
     S2 = S#ch_mgr{not_sanes=NotSanesDict, sane_transitions=0},
     case orddict:fetch(Author_latest, NotSanesDict) of
@@ -1788,9 +1812,7 @@ react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) ->
 
 react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H,
                                                 sane_transitions=Xtns}=S) ->
     ?REACT(c120),
-    %% TODO: revisit this constant?
-    MaxLength = length(P_latest#projection_v1.all_members) * 1.5,
-    H2 = add_and_trunc_history(P_latest, H, MaxLength),
+    H2 = add_and_trunc_history(P_latest, H, ?MAX_HISTORY_LENGTH),
     %% diversion_c120_verbose_goop(P_latest, S),
     ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}),
@@ -1808,8 +1830,9 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H,
     {{now_using, FinalProps, P_latest#projection_v1.epoch_number}, S3}.
 
 add_and_trunc_history(P_latest, H, MaxLength) ->
+    Latest_U_R = {P_latest#projection_v1.upi, P_latest#projection_v1.repairing},
    H2 = if P_latest#projection_v1.epoch_number > 0 ->
-                 queue:in(P_latest, H);
+                 queue:in(Latest_U_R, H);
             true ->
                  H
          end,
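
Aside, not part of the patch itself: below is a minimal standalone sketch of
the B10 history-repetition trigger described above, for poking at in an
Erlang shell.  The module and function names (history_trigger_sketch,
add_and_trunc/3, trigger_p/3, demo/0) are hypothetical and exist only for
this example; the 30-entry limit mirrors ?MAX_HISTORY_LENGTH, and the
history entries are the same {UPI, Repairing} pairs that
add_and_trunc_history/3 stores in the patch.

    %% Standalone sketch only; none of this module is part of the patch.
    -module(history_trigger_sketch).
    -export([add_and_trunc/3, trigger_p/3, demo/0]).

    -define(MAX_HISTORY_LENGTH, 30).

    %% Append the {UPI, Repairing} pair of the latest adopted projection,
    %% dropping the oldest entry once the queue exceeds MaxLength.
    add_and_trunc(UPI_Repairing, H, MaxLength) ->
        H2 = queue:in(UPI_Repairing, H),
        case queue:len(H2) of
            Len when Len > MaxLength ->
                {_Oldest, H3} = queue:out(H2),
                H3;
            _ ->
                H2
        end.

    %% True iff the history is full and every adopted projection in it is
    %% identical, excluding the two "boring" steady states: the none
    %% projection {[],[]} and the all-alone projection {[MyName],[]}.
    trigger_p(MyName, H, MaxLength) ->
        HistoryList = queue:to_list(H),
        length(HistoryList) > (MaxLength - 1)
            andalso case lists:usort(HistoryList) of
                        [{[], []}]       -> false;
                        [{[MyName], []}] -> false;
                        [_]              -> true;
                        _                -> false
                    end.

    %% A manager 'f' that keeps re-adopting upi=[g,e],repairing=[f] trips
    %% the trigger once the history fills; the none projection never does.
    demo() ->
        Fill = fun(Entry) ->
                       lists:foldl(fun(_, H) ->
                                           add_and_trunc(Entry, H, ?MAX_HISTORY_LENGTH)
                                   end, queue:new(), lists:seq(1, ?MAX_HISTORY_LENGTH))
               end,
        true  = trigger_p(f, Fill({[g,e], [f]}), ?MAX_HISTORY_LENGTH),
        false = trigger_p(f, Fill({[], []}),     ?MAX_HISTORY_LENGTH),
        ok.

Running history_trigger_sketch:demo() in a shell returns ok, i.e. only a
non-trivial projection that repeats for the whole history window trips the
trigger, which is the condition B10 uses to jump to C103 and reset.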