WIP: By Jove, I believe the chain manager is working

2015-04-14 15:30:24 +09:00 · 2015-04-14 15:30:24 +09:00 · 59936eda62
commit 59936eda62
parent 09051aecce
4 changed files with 104 additions and 61 deletions
--- a/TODO-shortterm.org
+++ b/TODO-shortterm.org
@ -7,7 +7,7 @@
  Done via compare() func.

 ** DONE Change all protocol ops to add epoch ID
-** TODO Add projection store to each FLU.
+** DONE Add projection store to each FLU.

 *** DONE What should the API look like? (borrow from chain mgr PoC?)

@ -23,8 +23,14 @@ method as append/write where there's a variable size blob.  But we'll
 format that blob as a term_to_binary().  Then dispatch to a single
 func, and pattern match Erlang style in that func.

-*** TODO Do it.
+*** DONE Do it.

+** TODO Fix all known bugs with Chain Manager
+
+*** DONE Fix known bugs
+*** TODO Clean up crufty TODO comments and other obvious cruft
+
+** TODO Finish OTP'izing the Chain Manager with FLU & proj store processes
 ** TODO Change all protocol ops to enforce the epoch ID
 ** TODO Add projection wedging logic to each FLU.

--- a/include/machi_projection.hrl
+++ b/include/machi_projection.hrl
@ -44,11 +44,11 @@
          epoch_number    :: pv1_epoch_n(),
          epoch_csum      :: pv1_csum(),
          author_server   :: pv1_server(),
-          creation_time   :: pv1_timestamp(),
          all_members     :: [pv1_server()],
-          down            :: [pv1_server()],
+          creation_time   :: pv1_timestamp(),
          upi             :: [pv1_server()],
          repairing       :: [pv1_server()],
+          down            :: [pv1_server()],
          dbg             :: list(), %proplist(), is checksummed
          dbg2            :: list(), %proplist(), is not checksummed
          members_dict    :: p_srvr_dict()
--- a/src/machi_chain_manager1.erl
+++ b/src/machi_chain_manager1.erl
@ -139,10 +139,7 @@ init({MyName, MembersDict, MgrOpts}) ->
              {flapping_i, Opt(flapping, [])},
              {up_nodes, Opt(up_nodes, not_init_yet)}],
    ActiveP = Opt(active_mode, true),
-    Down_list = All_list -- [MyName],
-    UPI_list = [MyName],
-    NoneProj = machi_projection:new(MyName, MembersDict,
-                                    Down_list, UPI_list, [], []),
+    NoneProj = make_none_projection(MyName, All_list, MembersDict),
    Proxies = orddict:fold(
                fun(K, P, Acc) ->
                        {ok, Pid} = ?FLU_PC:start_link(P),
@ -220,10 +217,15 @@ code_change(_OldVsn, S, _Extra) ->

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

+make_none_projection(MyName, All_list, MembersDict) ->
+    Down_list = All_list,
+    UPI_list = [],
+    machi_projection:new(MyName, MembersDict, UPI_list, Down_list, [], []).
+
 set_active_timer(#ch_mgr{name=MyName, members_dict=MembersDict}=S) ->
    FLU_list = [P#p_srvr.name || {_,P} <- orddict:to_list(MembersDict)],
    USec = calc_sleep_ranked_order(1000, 2000, MyName, FLU_list),
-    {ok, TRef} = timer:send_interval(USec, yo_yo_yo),
+    {ok, TRef} = timer:send_interval(USec, yo_yo_yo_todo),
    S#ch_mgr{timer=TRef}.

 do_cl_write_public_proj(Proj, S) ->
@ -516,6 +518,29 @@ rank_and_sort_projections(Ps, CurrentProj) ->
 %% Caller must ensure all Projs are of the same epoch number.
 %% If the caller gives us projections with different epochs, we assume
 %% that the caller is doing an OK thing.
+%%
+%% TODO: This implementation currently gives higher rank to the last
+%%       member of All_list, which is typically/always/TODO-CLARIFY
+%%       sorted.  That's fine, but there's a source of unnecessary
+%%       churn: during repair, we assume that the head of the chain is
+%%       the coordinator of the repair.  So any time that the head
+%%       makes a repair-related transition, that projection may get
+%%       quickly replaced by an identical projection that merely has
+%%       higher rank because it's authored by a higher-ranked member.
+%%       Worst case, for chain len=4:
+%%          E+0: author=a, upi=[a], repairing=[b,c,d]
+%%          E+1: author=b, upi=[a], repairing=[b,c,d] (**)
+%%          E+2: author=c, upi=[a], repairing=[b,c,d] (**)
+%%          E+3: author=d, upi=[a], repairing=[b,c,d] (**)
+%%          E+4: author=a, upi=[a,b], repairing=[c,d]
+%%          E+5: author=b, upi=[a,b], repairing=[c,d] (**)
+%%          E+6: author=c, upi=[a,b], repairing=[c,d] (**)
+%%          E+7: author=d, upi=[a,b], repairing=[c,d] (**)
+%%          E+... 6 more (**) epochs when c & d finish their respective repairs.
+%%       Ideally, the "(**)" epochs are avoidable churn.
+%%       Perhaps this means that we should change the responsibility
+%%       for repair management to the highest ranking member of the
+%%       UPI_list?

 rank_projections(Projs, CurrentProj) ->
    #projection_v1{all_members=All_list} = CurrentProj,
@ -528,11 +553,8 @@ rank_projection(#projection_v1{upi=[]}, _MemberRank, _N) ->
    -100;
 rank_projection(#projection_v1{author_server=Author,
                               upi=UPI_list,
-                               repairing=Repairing_list,
-                               dbg=Dbg}, MemberRank, N) ->
-    RankBoost = proplists:get_value({'rank_boost!', Author}, Dbg, 0),
+                               repairing=Repairing_list}, MemberRank, N) ->
    AuthorRank = orddict:fetch(Author, MemberRank),
-    RankBoost +
    AuthorRank +
        (  N * length(Repairing_list)) +
        (N*N * length(UPI_list)).
@ -581,8 +603,9 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
                 #ch_mgr{name=MyName, proj=P_current,
                         flap_limit=FlapLimit} = S) ->
    ?REACT(a30),
-   io:format(user, "HEE30s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- get(react), is_atom(X) orelse element(1,X) == b10])]),
+   %% io:format(user, "HEE30s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- get(react), is_atom(X) orelse element(1,X) == b10])]),
    {P_newprop1, S2} = calc_projection(S, MyName),
+    ?REACT({a30, ?LINE, [{current, machi_projection:make_summary(S#ch_mgr.proj)}]}),
    ?REACT({a30, ?LINE, [{newprop1, machi_projection:make_summary(P_newprop1)}]}),

    %% Are we flapping yet?
@ -595,7 +618,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
    NewEpoch = erlang:max(Epoch_newprop2, Epoch_latest) + 1,
    P_newprop3 = P_newprop2#projection_v1{epoch_number=NewEpoch},
    ?REACT({a30, ?LINE, [{newprop3, machi_projection:make_summary(P_newprop3)}]}),
-    if MyName == 'd' -> io:format(user, "QQQQQ ~w P_latest is ~w\n", [MyName, machi_projection:make_summary(P_latest)]); true -> ok end,
+    %% if MyName == 'd' -> io:format(user, "QQQQQ ~w P_latest is ~w\n", [MyName, machi_projection:make_summary(P_latest)]); true -> ok end,

    {P_newprop10, S10} =
        case get_flap_count(P_newprop3) of
@ -637,9 +660,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
                               andalso
                               P_oldinner#projection_v1.down ==
                               P_inner#projection_v1.down ->
-                                    %% HRM, distrust?...
-                                    %% P_oldinner#projection_v1.epoch_number;
-                                    P_oldinner#projection_v1.epoch_number + 1;
+                                    P_oldinner#projection_v1.epoch_number;
                               true ->
                                    P_oldinner#projection_v1.epoch_number + 1
                            end
@ -741,36 +762,50 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
            {_, _}        -> false
        end,

-    P_newprop20 =
-        if Kicker_p orelse MoveFromInnerToNorm_p ->
-                %% TODO this clause probably needs adjustment.
-                FlapHack = {flapping_i,
-                            [{flap_count, {{epk,-1},?NOT_FLAPPING},0},
-                             {all_hosed, []},
-                             {all_flap_counts, []},
-                             {bad, []}]},
-                RankBoost = {{'rank_boost!', MyName}, 4242},
-                P_inner2A = inner_projection_or_self(P_current),
-                P_inner2B =
-                    P_inner2A#projection_v1{epoch_number=
+    if Kicker_p orelse MoveFromInnerToNorm_p ->
+            ClauseInfo = [{inner_kicker, Kicker_p},
+                          {move_from_inner, MoveFromInnerToNorm_p}],
+            ?REACT({a30, ?LINE, ClauseInfo}),
+            %% %% 2015-04-14: YEAH, this appears to work!
+            %% %% 1. Create a "safe" projection that is upi=[],repairing=[]
+            %% %% 2. Declare it to be best & latest by pure fiat.
+            %% %%    (The C100 transition will double-check that it's safe.)
+            %% %% 3. Jump to C100.  Then, for the next iteration,
+            %% %%    our P_current state to a smallest-possible-score
+            %% %%    state ... and let the chain reassemble itself from
+            %% %%    length zero.
+            %% #projection_v1{epoch_number=Epoch_newprop10, all_members=All_list,
+            %%                members_dict=MembersDict} = P_newprop10,
+            %% P_noneprop0 = make_none_projection(MyName, All_list, MembersDict),
+            %% P_noneprop1 = P_noneprop0#projection_v1{epoch_number=Epoch_newprop10},
+            %% %% Just to be clear, we clobber any flapping info by setting dbg.
+            %% P_noneprop = P_noneprop1#projection_v1{dbg=ClauseInfo},
+            %% react_to_env_C100(P_noneprop, P_latest, S);
+
+            %% 2015-04-14: Let's experiment with using the current inner
+            %% projection (or, if there really is no inner, just P_current).
+            %% This is safe because it's already P_current and by assumption,
+            %% anything that made it through the logical maze to get here
+            %% is safe.  So re-using it with a higher epoch number doesn't
+            %% make any significant change.
+            %%
+            %% Yeah, it appears to work, also, nice!  This can help save some
+            %% repair operations (compared to the other safe thing to do
+            %% here, which uses make_none_projection() to build & repair the
+            %% entire chain from scratch).
+
+            P_inner2A = inner_projection_or_self(P_current),
+            P_inner2B =
+                P_inner2A#projection_v1{epoch_number=
                                        P_newprop10#projection_v1.epoch_number,
-                                        dbg=[FlapHack,RankBoost]},
-                io:format(user, "QQQ ~w switching to inner: ~w\n", [MyName, machi_projection:make_summary(P_inner2B)]),
+                                        dbg=ClauseInfo},
+            react_to_env_C100(P_inner2B, P_latest, S);

-                LEFT OFF HERE ... what if we:
-                   1. Create a "safe" projection that is upi=[],repairing=[]
-                   2. Declare it to be best & latest by pure fiat.
-                   3. Jump to C100?/C110? to a cycle of iteration,
-                      push our P_current state to a smallest-possible-score
-                      state, then let the rest reassemble itself.
-
-                P_inner2B;
-           true ->
-                P_newprop10
-        end,
-
-    react_to_env_A40(Retries, P_newprop20, P_latest,
-                     LatestUnanimousP, S10).
+       true ->
+            ?REACT({a30, ?LINE}),
+            react_to_env_A40(Retries, P_newprop10, P_latest,
+                             LatestUnanimousP, S10)
+    end.

 react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP,
                 #ch_mgr{name=MyName, proj=P_current}=S) ->
@ -895,7 +930,7 @@ react_to_env_A50(P_latest, FinalProps, S) ->
    ?REACT(a50),

    _HH = get(react),
-   io:format(user, "HEE50s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- _HH, is_atom(X)])]),
+    %% io:format(user, "HEE50s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- _HH, is_atom(X)])]),
    %% io:format(user, "HEE50 ~w ~w ~p\n", [S#ch_mgr.name, self(), lists:reverse(_HH)]),

    ?REACT({a50, ?LINE, [{latest_epoch, P_latest#projection_v1.epoch_number},
@ -1086,10 +1121,10 @@ react_to_env_C100(P_newprop, P_latest,
            %% By process of elimination, P_newprop is best,
            %% so let's write it.
 io:format(user, "\nUrp: ~p ~p ~p ~p\n", [MyName, ShortCircuit_p, _AnyOtherReturnValue, Inner_sane_p]),
-io:format(user, "c100 P_newprop : ~w\n", [machi_projection:make_summary(P_newprop)]),
-io:format(user, "c100 P_newpropY: ~w\n", [machi_projection:make_summary(P_newpropY)]),
-io:format(user, "c100 P_latest : ~w\n", [machi_projection:make_summary(P_latest)]),
-io:format(user, "c100 P_latestY: ~w\n", [machi_projection:make_summary(P_latestY)]),
+%% io:format(user, "c100 P_newprop : ~w\n", [machi_projection:make_summary(P_newprop)]),
+%% io:format(user, "c100 P_newpropY: ~w\n", [machi_projection:make_summary(P_newpropY)]),
+%% io:format(user, "c100 P_latest : ~w\n", [machi_projection:make_summary(P_latest)]),
+%% io:format(user, "c100 P_latestY: ~w\n", [machi_projection:make_summary(P_latestY)]),
            ?REACT({c100, ?LINE, [not_sane]}),
            react_to_env_C300(P_newprop, P_latest, S)
    end.
@ -1151,7 +1186,7 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H} = S) ->
         end,

    HH = get(react),
-    io:format(user, "HEE120s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- HH, is_atom(X)])]),
+    %% io:format(user, "HEE120s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- HH, is_atom(X)])]),
    %% io:format(user, "HEE120 ~w ~w ~p\n", [S#ch_mgr.name, self(), lists:reverse(HH)]),

    ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}),
@ -1196,7 +1231,7 @@ react_to_env_C310(P_newprop, S) ->
    ?REACT(c310),
    Epoch = P_newprop#projection_v1.epoch_number,
    {WriteRes, S2} = cl_write_public_proj_skip_local_error(Epoch, P_newprop, S),
-    io:format(user, "QQQ ~w public write ~w: ~w\n", [S#ch_mgr.name, machi_projection:make_summary(P_newprop), WriteRes]),
+    %% io:format(user, "QQQ ~w public write ~w: ~w\n", [S#ch_mgr.name, machi_projection:make_summary(P_newprop), WriteRes]),
    ?REACT({c310, ?LINE,
            [{newprop, machi_projection:make_summary(P_newprop)},
            {write_result, WriteRes}]}),
@ -1428,12 +1463,14 @@ projection_transition_is_sane(
    true = sets:is_disjoint(DownS2, RepairingS2),
    true = sets:is_disjoint(UPIS2, RepairingS2),

-    %% The author must not be down.
-    false = lists:member(AuthorServer1, Down_list1),
-    false = lists:member(AuthorServer2, Down_list2),
+    %% TODO relaxing this is ok, perhaps?
+    %% %% The author must not be down.
+    %% false = lists:member(AuthorServer1, Down_list1),
+    %% false = lists:member(AuthorServer2, Down_list2),
+    %% TODO relaxing this is ok, perhaps, also?
    %% The author must be in either the UPI or repairing list.
-    true = lists:member(AuthorServer1, UPI_list1 ++ Repairing_list1),
-    true = lists:member(AuthorServer2, UPI_list2 ++ Repairing_list2),
+    %% true = lists:member(AuthorServer1, UPI_list1 ++ Repairing_list1),
+    %% true = lists:member(AuthorServer2, UPI_list2 ++ Repairing_list2),

    %% Additions to the UPI chain may only be at the tail
    UPI_common_prefix = find_common_prefix(UPI_list1, UPI_list2),
--- a/test/machi_chain_manager1_converge_demo.erl
+++ b/test/machi_chain_manager1_converge_demo.erl
@ -215,7 +215,7 @@ convergence_demo_testfun(NumFLUs) ->
                     [receive
                          done ->
                              ok
-                      after 995000 ->
+                      after 120*1000 ->
                              exit(icky_timeout)
                      end || _ <- Pids]
             end,
@ -267,7 +267,7 @@ convergence_demo_testfun(NumFLUs) ->
           io:format(user, "\nSweet, all_hosed are identical-or-islands-inconclusive.\n", []),
           timer:sleep(1000),
           ok
-       %% end || Partition <- AllPartitionCombinations
+       end || Partition <- AllPartitionCombinations
       %% end || Partition <- [ [{a,b},{b,d},{c,b}],
       %%                       [{a,b},{b,d},{c,b}, {a,b},{b,a},{a,c},{c,a},{a,d},{d,a}],
       %%                       %% [{a,b},{b,d},{c,b}, {b,a},{a,b},{b,c},{c,b},{b,d},{d,b}],
@ -275,7 +275,7 @@ convergence_demo_testfun(NumFLUs) ->
       %%                       [{a,b},{b,d},{c,b}, {d,a},{a,d},{d,b},{b,d},{d,c},{c,d}] ]
       %% end || Partition <- [ [{a,b}, {b,c}],
       %%                       [{a,b}, {c,b}]  ]
-       end || Partition <- [ [{a,b}, {b,c}]  ]  %% hosed-not-equal @ 3 FLUs
+       %% end || Partition <- [ [{a,b}, {b,c}]  ]  %% hosed-not-equal @ 3 FLUs
       %% end || Partition <- [ [{b,d}] ]
       %% end || Partition <- [ [{a,b}, {b,a}] ]
       %% end || Partition <- [ [{a,b}, {b,a}, {a,c},{c,a}] ]