Bugfix: add missing reset of not_sanes dictionary, fix comments

2015-07-20 14:04:25 +09:00 · 2015-07-20 14:04:25 +09:00 · e14493373b
commit e14493373b
parent f7ef8c54f5
1 changed files with 35 additions and 15 deletions
--- a/src/machi_chain_manager1.erl
+++ b/src/machi_chain_manager1.erl
@ -69,6 +69,7 @@
          flap_start = ?NOT_FLAPPING_START
                          :: {{'epk', integer()}, erlang:timestamp()},
          not_sanes       :: orddict:orddict(),
+          sane_transitions = 0 :: non_neg_integer(),
          repair_worker   :: 'undefined' | pid(),
          repair_start    :: 'undefined' | erlang:timestamp(),
          repair_final_status :: 'undefined' | term(),
@ -845,13 +846,23 @@ do_react_to_env(S) ->
    %% attempts counter very generic (i.e., not specific for flapping
    %% as it once was).
    %%
-    %% The not_sanes counter dict should be reset each time we start
-    %% an iteration.  One could argue that state only for a single
-    %% iteration shouldn't go in #ch_mgr but should be a separate arg
-    %% threaded through each of the FSM funcs.
-    %% TODO possible refactoring task?
+    %% The not_sanes counter dict should be reset when we have had at
+    %% least 3 state transitions that did not have a not_sane
+    %% suggested projection transition or whenever we fall back to the
+    %% none_projection.
+    %%
+    %% We'll probably implement a very simple counter that may/will be
+    %% *inaccurate* by at most one -- so any reset test should ignore
+    %% counter values of 0 & 1.
+    %%
    put(react, []),
-    react_to_env_A10(S#ch_mgr{not_sanes=orddict:new()}).
+    if S#ch_mgr.sane_transitions > 3 ->         % TODO review this constant
+            %% ?V("Skr,~w,", [S#ch_mgr.name]),
+            react_to_env_A10(S#ch_mgr{not_sanes=orddict:new()});
+       true ->
+            %% ?V("Sk,~w,~w,", [S#ch_mgr.name, S#ch_mgr.sane_transitions]),
+            react_to_env_A10(S)
+    end.

 react_to_env_A10(S) ->
    ?REACT(a10),
@ -1435,8 +1446,8 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
            react_to_env_C110(P_latest, S);
        %% 20150715: I've seen this loop happen with {expected_author2,X}
        %% where nobody agrees, weird.
-        _ ->
-            ?REACT({c100, ?LINE}),
+        DoctorSays ->
+            ?REACT({c100, ?LINE, [{not_sane, DoctorSays}]}),
            %% This is a fun case.  We had just enough asymmetric partition
            %% to cause the chain to fragment into two *incompatible* and
            %% *overlapping membership* chains, but the chain fragmentation
@ -1490,7 +1501,7 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest,
 react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
                        P_newprop, P_latest, S) ->
    NotSanesDict = orddict:update_counter(Author_latest, 1, NotSanesDict0),
-    S2 = S#ch_mgr{not_sanes=NotSanesDict},
+    S2 = S#ch_mgr{not_sanes=NotSanesDict, sane_transitions=0},
    case orddict:fetch(Author_latest, NotSanesDict) of
        N when N > ?TOO_FREQUENT_BREAKER ->
            ?V("\n\nYOYO ~w breaking the cycle of ~p\n", [MyName, machi_projection:make_summary(P_latest)]),
@ -1508,7 +1519,7 @@ react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
 react_to_env_C103(#projection_v1{epoch_number=Epoch_latest,
                                 all_members=All_list,
                                 members_dict=MembersDict} = P_latest,
-                  #ch_mgr{name=MyName}=S) ->
+                  #ch_mgr{name=MyName, proj=P_current}=S) ->
    #projection_v1{epoch_number=Epoch_latest,
                   all_members=All_list,
                   members_dict=MembersDict} = P_latest,
@ -1517,8 +1528,16 @@ react_to_env_C103(#projection_v1{epoch_number=Epoch_latest,
                                    dbg=[{none_projection,true}]},
    P_none = machi_projection:update_checksum(P_none1),
    %% Use it, darn it, because it's 100% safe.  And exit flapping state.
+    ?REACT({c103, ?LINE,
+            [{current_epoch, P_current#projection_v1.epoch_number},
+             {none_projection_epoch, Epoch_latest}]}),
+    %% Reset the not_sanes count dictionary here, or else an already
+    %% ?TOO_FREQUENT_BREAKER count for an author might prevent a
+    %% transition from C100_inner()->C300, which can lead to infinite
+    %% looping C100->C103->C100.
    react_to_env_C100(P_none, P_none, S#ch_mgr{flaps=0,
-                                               flap_start=?NOT_FLAPPING_START}).
+                                               flap_start=?NOT_FLAPPING_START,
+                                               not_sanes=orddict:new()}).

 react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) ->
    ?REACT(c110),
@ -1578,7 +1597,8 @@ react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) ->
    end,
    react_to_env_C120(P_latest, [], S).

-react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H}=S) ->
+react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H,
+                                                sane_transitions=Xtns}=S) ->
    ?REACT(c120),
    H2 = queue:in(P_latest, H),
    H3 = case queue:len(H2) of
@ -1597,7 +1617,7 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H}=S) ->

    ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}),
    {{now_using, FinalProps, P_latest#projection_v1.epoch_number},
-     S#ch_mgr{proj=P_latest, proj_history=H3}}.
+     S#ch_mgr{proj=P_latest, proj_history=H3, sane_transitions=Xtns + 1}}.

 react_to_env_C200(Retries, P_latest, S) ->
    ?REACT(c200),