From e14493373bc068b9b7ac72b7454eb085568d4433 Mon Sep 17 00:00:00 2001
From: Scott Lystig Fritchie <slfritchie@snookles.com>
Date: Mon, 20 Jul 2015 14:04:25 +0900
Subject: [PATCH] Bugfix: add missing reset of not_sanes dictionary, fix
 comments

---
 src/machi_chain_manager1.erl | 50 +++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl
index 2832f22..0d78825 100644
--- a/src/machi_chain_manager1.erl
+++ b/src/machi_chain_manager1.erl
@@ -69,6 +69,7 @@
           flap_start = ?NOT_FLAPPING_START
                           :: {{'epk', integer()}, erlang:timestamp()},
           not_sanes       :: orddict:orddict(),
+          sane_transitions = 0 :: non_neg_integer(),
           repair_worker   :: 'undefined' | pid(),
           repair_start    :: 'undefined' | erlang:timestamp(),
           repair_final_status :: 'undefined' | term(),
@@ -845,13 +846,23 @@ do_react_to_env(S) ->
     %% attempts counter very generic (i.e., not specific for flapping
     %% as it once was).
     %%
-    %% The not_sanes counter dict should be reset each time we start
-    %% an iteration.  One could argue that state only for a single
-    %% iteration shouldn't go in #ch_mgr but should be a separate arg
-    %% threaded through each of the FSM funcs.
-    %% TODO possible refactoring task?
+    %% The not_sanes counter dict should be reset when we have had at
+    %% least 3 state transitions that did not have a not_sane
+    %% suggested projection transition or whenever we fall back to the
+    %% none_projection.
+    %%
+    %% We'll probably implement a very simple counter that may/will be
+    %% *inaccurate* by at most one -- so any reset test should ignore
+    %% counter values of 0 & 1.
+    %%
     put(react, []),
-    react_to_env_A10(S#ch_mgr{not_sanes=orddict:new()}).
+    if S#ch_mgr.sane_transitions > 3 ->         % TODO review this constant
+            %% ?V("Skr,~w,", [S#ch_mgr.name]),
+            react_to_env_A10(S#ch_mgr{not_sanes=orddict:new()});
+       true ->
+            %% ?V("Sk,~w,~w,", [S#ch_mgr.name, S#ch_mgr.sane_transitions]),
+            react_to_env_A10(S)
+    end.
 
 react_to_env_A10(S) ->
     ?REACT(a10),
@@ -1435,8 +1446,8 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
             react_to_env_C110(P_latest, S);
         %% 20150715: I've seen this loop happen with {expected_author2,X}
         %% where nobody agrees, weird.
-        _ ->
-            ?REACT({c100, ?LINE}),
+        DoctorSays ->
+            ?REACT({c100, ?LINE, [{not_sane, DoctorSays}]}),
             %% This is a fun case.  We had just enough asymmetric partition
             %% to cause the chain to fragment into two *incompatible* and
             %% *overlapping membership* chains, but the chain fragmentation
@@ -1490,7 +1501,7 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
 react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
                         P_newprop, P_latest, S) ->
     NotSanesDict = orddict:update_counter(Author_latest, 1, NotSanesDict0),
-    S2 = S#ch_mgr{not_sanes=NotSanesDict},
+    S2 = S#ch_mgr{not_sanes=NotSanesDict, sane_transitions=0},
     case orddict:fetch(Author_latest, NotSanesDict) of
         N when N > ?TOO_FREQUENT_BREAKER ->
             ?V("\n\nYOYO ~w breaking the cycle of ~p\n", [MyName, machi_projection:make_summary(P_latest)]),
@@ -1506,9 +1517,9 @@ react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
     end.
 
 react_to_env_C103(#projection_v1{epoch_number=Epoch_latest,
-                                   all_members=All_list,
-                                   members_dict=MembersDict} = P_latest,
-                  #ch_mgr{name=MyName}=S) ->
+                                 all_members=All_list,
+                                 members_dict=MembersDict} = P_latest,
+                  #ch_mgr{name=MyName, proj=P_current}=S) ->
     #projection_v1{epoch_number=Epoch_latest,
                    all_members=All_list,
                    members_dict=MembersDict} = P_latest,
@@ -1517,8 +1528,16 @@ react_to_env_C103(#projection_v1{epoch_number=Epoch_latest,
                                     dbg=[{none_projection,true}]},
     P_none = machi_projection:update_checksum(P_none1),
     %% Use it, darn it, because it's 100% safe.  And exit flapping state.
+    ?REACT({c103, ?LINE,
+            [{current_epoch, P_current#projection_v1.epoch_number},
+             {none_projection_epoch, Epoch_latest}]}),
+    %% Reset the not_sanes count dictionary here, or else an already
+    %% ?TOO_FREQUENT_BREAKER count for an author might prevent a
+    %% transition from C100_inner()->C300, which can lead to infinite
+    %% looping C100->C103->C100.
     react_to_env_C100(P_none, P_none, S#ch_mgr{flaps=0,
-                                               flap_start=?NOT_FLAPPING_START}).
+                                               flap_start=?NOT_FLAPPING_START,
+                                               not_sanes=orddict:new()}).
 
 react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) ->
     ?REACT(c110),
@@ -1578,7 +1597,8 @@ react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) ->
     end,
     react_to_env_C120(P_latest, [], S).
 
-react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H}=S) ->
+react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H,
+                                                sane_transitions=Xtns}=S) ->
     ?REACT(c120),
     H2 = queue:in(P_latest, H),
     H3 = case queue:len(H2) of
@@ -1597,7 +1617,7 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H}=S) ->
 
     ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}),
     {{now_using, FinalProps, P_latest#projection_v1.epoch_number},
-     S#ch_mgr{proj=P_latest, proj_history=H3}}.
+     S#ch_mgr{proj=P_latest, proj_history=H3, sane_transitions=Xtns + 1}}.
 
 react_to_env_C200(Retries, P_latest, S) ->
     ?REACT(c200),