WIP: By Jove, I believe the chain manager is working

This commit is contained in:
Scott Lystig Fritchie 2015-04-14 15:30:24 +09:00
parent 09051aecce
commit 59936eda62
4 changed files with 104 additions and 61 deletions

View file

@ -7,7 +7,7 @@
Done via compare() func.
** DONE Change all protocol ops to add epoch ID
** TODO Add projection store to each FLU.
** DONE Add projection store to each FLU.
*** DONE What should the API look like? (borrow from chain mgr PoC?)
@ -23,8 +23,14 @@ method as append/write where there's a variable size blob. But we'll
format that blob as a term_to_binary(). Then dispatch to a single
func, and pattern match Erlang style in that func.
*** TODO Do it.
*** DONE Do it.
** TODO Fix all known bugs with Chain Manager
*** DONE Fix known bugs
*** TODO Clean up crufty TODO comments and other obvious cruft
** TODO Finish OTP'izing the Chain Manager with FLU & proj store processes
** TODO Change all protocol ops to enforce the epoch ID
** TODO Add projection wedging logic to each FLU.

View file

@ -44,11 +44,11 @@
epoch_number :: pv1_epoch_n(),
epoch_csum :: pv1_csum(),
author_server :: pv1_server(),
creation_time :: pv1_timestamp(),
all_members :: [pv1_server()],
down :: [pv1_server()],
creation_time :: pv1_timestamp(),
upi :: [pv1_server()],
repairing :: [pv1_server()],
down :: [pv1_server()],
dbg :: list(), %proplist(), is checksummed
dbg2 :: list(), %proplist(), is not checksummed
members_dict :: p_srvr_dict()

View file

@ -139,10 +139,7 @@ init({MyName, MembersDict, MgrOpts}) ->
{flapping_i, Opt(flapping, [])},
{up_nodes, Opt(up_nodes, not_init_yet)}],
ActiveP = Opt(active_mode, true),
Down_list = All_list -- [MyName],
UPI_list = [MyName],
NoneProj = machi_projection:new(MyName, MembersDict,
Down_list, UPI_list, [], []),
NoneProj = make_none_projection(MyName, All_list, MembersDict),
Proxies = orddict:fold(
fun(K, P, Acc) ->
{ok, Pid} = ?FLU_PC:start_link(P),
@ -220,10 +217,15 @@ code_change(_OldVsn, S, _Extra) ->
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
make_none_projection(MyName, All_list, MembersDict) ->
Down_list = All_list,
UPI_list = [],
machi_projection:new(MyName, MembersDict, UPI_list, Down_list, [], []).
set_active_timer(#ch_mgr{name=MyName, members_dict=MembersDict}=S) ->
FLU_list = [P#p_srvr.name || {_,P} <- orddict:to_list(MembersDict)],
USec = calc_sleep_ranked_order(1000, 2000, MyName, FLU_list),
{ok, TRef} = timer:send_interval(USec, yo_yo_yo),
{ok, TRef} = timer:send_interval(USec, yo_yo_yo_todo),
S#ch_mgr{timer=TRef}.
do_cl_write_public_proj(Proj, S) ->
@ -516,6 +518,29 @@ rank_and_sort_projections(Ps, CurrentProj) ->
%% Caller must ensure all Projs are of the same epoch number.
%% If the caller gives us projections with different epochs, we assume
%% that the caller is doing an OK thing.
%%
%% TODO: This implementation currently gives higher rank to the last
%% member of All_list, which is typically/always/TODO-CLARIFY
%% sorted. That's fine, but there's a source of unnecessary
%% churn: during repair, we assume that the head of the chain is
%% the coordinator of the repair. So any time that the head
%% makes a repair-related transition, that projection may get
%% quickly replaced by an identical projection that merely has
%% higher rank because it's authored by a higher-ranked member.
%% Worst case, for chain len=4:
%% E+0: author=a, upi=[a], repairing=[b,c,d]
%% E+1: author=b, upi=[a], repairing=[b,c,d] (**)
%% E+2: author=c, upi=[a], repairing=[b,c,d] (**)
%% E+3: author=d, upi=[a], repairing=[b,c,d] (**)
%% E+4: author=a, upi=[a,b], repairing=[c,d]
%% E+5: author=b, upi=[a,b], repairing=[c,d] (**)
%% E+6: author=c, upi=[a,b], repairing=[c,d] (**)
%% E+7: author=d, upi=[a,b], repairing=[c,d] (**)
%% E+... 6 more (**) epochs when c & d finish their respective repairs.
%% Ideally, the "(**)" epochs are avoidable churn.
%% Perhaps this means that we should change the responsibility
%% for repair management to the highest ranking member of the
%% UPI_list?
rank_projections(Projs, CurrentProj) ->
#projection_v1{all_members=All_list} = CurrentProj,
@ -528,11 +553,8 @@ rank_projection(#projection_v1{upi=[]}, _MemberRank, _N) ->
-100;
rank_projection(#projection_v1{author_server=Author,
upi=UPI_list,
repairing=Repairing_list,
dbg=Dbg}, MemberRank, N) ->
RankBoost = proplists:get_value({'rank_boost!', Author}, Dbg, 0),
repairing=Repairing_list}, MemberRank, N) ->
AuthorRank = orddict:fetch(Author, MemberRank),
RankBoost +
AuthorRank +
( N * length(Repairing_list)) +
(N*N * length(UPI_list)).
@ -581,8 +603,9 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
#ch_mgr{name=MyName, proj=P_current,
flap_limit=FlapLimit} = S) ->
?REACT(a30),
io:format(user, "HEE30s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- get(react), is_atom(X) orelse element(1,X) == b10])]),
%% io:format(user, "HEE30s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- get(react), is_atom(X) orelse element(1,X) == b10])]),
{P_newprop1, S2} = calc_projection(S, MyName),
?REACT({a30, ?LINE, [{current, machi_projection:make_summary(S#ch_mgr.proj)}]}),
?REACT({a30, ?LINE, [{newprop1, machi_projection:make_summary(P_newprop1)}]}),
%% Are we flapping yet?
@ -595,7 +618,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
NewEpoch = erlang:max(Epoch_newprop2, Epoch_latest) + 1,
P_newprop3 = P_newprop2#projection_v1{epoch_number=NewEpoch},
?REACT({a30, ?LINE, [{newprop3, machi_projection:make_summary(P_newprop3)}]}),
if MyName == 'd' -> io:format(user, "QQQQQ ~w P_latest is ~w\n", [MyName, machi_projection:make_summary(P_latest)]); true -> ok end,
%% if MyName == 'd' -> io:format(user, "QQQQQ ~w P_latest is ~w\n", [MyName, machi_projection:make_summary(P_latest)]); true -> ok end,
{P_newprop10, S10} =
case get_flap_count(P_newprop3) of
@ -637,9 +660,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
andalso
P_oldinner#projection_v1.down ==
P_inner#projection_v1.down ->
%% HRM, distrust?...
%% P_oldinner#projection_v1.epoch_number;
P_oldinner#projection_v1.epoch_number + 1;
P_oldinner#projection_v1.epoch_number;
true ->
P_oldinner#projection_v1.epoch_number + 1
end
@ -741,36 +762,50 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
{_, _} -> false
end,
P_newprop20 =
if Kicker_p orelse MoveFromInnerToNorm_p ->
%% TODO this clause probably needs adjustment.
FlapHack = {flapping_i,
[{flap_count, {{epk,-1},?NOT_FLAPPING},0},
{all_hosed, []},
{all_flap_counts, []},
{bad, []}]},
RankBoost = {{'rank_boost!', MyName}, 4242},
P_inner2A = inner_projection_or_self(P_current),
P_inner2B =
P_inner2A#projection_v1{epoch_number=
if Kicker_p orelse MoveFromInnerToNorm_p ->
ClauseInfo = [{inner_kicker, Kicker_p},
{move_from_inner, MoveFromInnerToNorm_p}],
?REACT({a30, ?LINE, ClauseInfo}),
%% %% 2015-04-14: YEAH, this appears to work!
%% %% 1. Create a "safe" projection that is upi=[],repairing=[]
%% %% 2. Declare it to be best & latest by pure fiat.
%% %% (The C100 transition will double-check that it's safe.)
%% %% 3. Jump to C100. Then, for the next iteration,
%% %% our P_current state to a smallest-possible-score
%% %% state ... and let the chain reassemble itself from
%% %% length zero.
%% #projection_v1{epoch_number=Epoch_newprop10, all_members=All_list,
%% members_dict=MembersDict} = P_newprop10,
%% P_noneprop0 = make_none_projection(MyName, All_list, MembersDict),
%% P_noneprop1 = P_noneprop0#projection_v1{epoch_number=Epoch_newprop10},
%% %% Just to be clear, we clobber any flapping info by setting dbg.
%% P_noneprop = P_noneprop1#projection_v1{dbg=ClauseInfo},
%% react_to_env_C100(P_noneprop, P_latest, S);
%% 2015-04-14: Let's experiment with using the current inner
%% projection (or, if there really is no inner, just P_current).
%% This is safe because it's already P_current and by assumption,
%% anything that made it through the logical maze to get here
%% is safe. So re-using it with a higher epoch number doesn't
%% make any significant change.
%%
%% Yeah, it appears to work, also, nice! This can help save some
%% repair operations (compared to the other safe thing to do
%% here, which uses make_none_projection() to build & repair the
%% entire chain from scratch).
P_inner2A = inner_projection_or_self(P_current),
P_inner2B =
P_inner2A#projection_v1{epoch_number=
P_newprop10#projection_v1.epoch_number,
dbg=[FlapHack,RankBoost]},
io:format(user, "QQQ ~w switching to inner: ~w\n", [MyName, machi_projection:make_summary(P_inner2B)]),
dbg=ClauseInfo},
react_to_env_C100(P_inner2B, P_latest, S);
LEFT OFF HERE ... what if we:
1. Create a "safe" projection that is upi=[],repairing=[]
2. Declare it to be best & latest by pure fiat.
3. Jump to C100?/C110? to a cycle of iteration,
push our P_current state to a smallest-possible-score
state, then let the rest reassemble itself.
P_inner2B;
true ->
P_newprop10
end,
react_to_env_A40(Retries, P_newprop20, P_latest,
LatestUnanimousP, S10).
true ->
?REACT({a30, ?LINE}),
react_to_env_A40(Retries, P_newprop10, P_latest,
LatestUnanimousP, S10)
end.
react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP,
#ch_mgr{name=MyName, proj=P_current}=S) ->
@ -895,7 +930,7 @@ react_to_env_A50(P_latest, FinalProps, S) ->
?REACT(a50),
_HH = get(react),
io:format(user, "HEE50s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- _HH, is_atom(X)])]),
%% io:format(user, "HEE50s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- _HH, is_atom(X)])]),
%% io:format(user, "HEE50 ~w ~w ~p\n", [S#ch_mgr.name, self(), lists:reverse(_HH)]),
?REACT({a50, ?LINE, [{latest_epoch, P_latest#projection_v1.epoch_number},
@ -1086,10 +1121,10 @@ react_to_env_C100(P_newprop, P_latest,
%% By process of elimination, P_newprop is best,
%% so let's write it.
io:format(user, "\nUrp: ~p ~p ~p ~p\n", [MyName, ShortCircuit_p, _AnyOtherReturnValue, Inner_sane_p]),
io:format(user, "c100 P_newprop : ~w\n", [machi_projection:make_summary(P_newprop)]),
io:format(user, "c100 P_newpropY: ~w\n", [machi_projection:make_summary(P_newpropY)]),
io:format(user, "c100 P_latest : ~w\n", [machi_projection:make_summary(P_latest)]),
io:format(user, "c100 P_latestY: ~w\n", [machi_projection:make_summary(P_latestY)]),
%% io:format(user, "c100 P_newprop : ~w\n", [machi_projection:make_summary(P_newprop)]),
%% io:format(user, "c100 P_newpropY: ~w\n", [machi_projection:make_summary(P_newpropY)]),
%% io:format(user, "c100 P_latest : ~w\n", [machi_projection:make_summary(P_latest)]),
%% io:format(user, "c100 P_latestY: ~w\n", [machi_projection:make_summary(P_latestY)]),
?REACT({c100, ?LINE, [not_sane]}),
react_to_env_C300(P_newprop, P_latest, S)
end.
@ -1151,7 +1186,7 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H} = S) ->
end,
HH = get(react),
io:format(user, "HEE120s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- HH, is_atom(X)])]),
%% io:format(user, "HEE120s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- HH, is_atom(X)])]),
%% io:format(user, "HEE120 ~w ~w ~p\n", [S#ch_mgr.name, self(), lists:reverse(HH)]),
?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}),
@ -1196,7 +1231,7 @@ react_to_env_C310(P_newprop, S) ->
?REACT(c310),
Epoch = P_newprop#projection_v1.epoch_number,
{WriteRes, S2} = cl_write_public_proj_skip_local_error(Epoch, P_newprop, S),
io:format(user, "QQQ ~w public write ~w: ~w\n", [S#ch_mgr.name, machi_projection:make_summary(P_newprop), WriteRes]),
%% io:format(user, "QQQ ~w public write ~w: ~w\n", [S#ch_mgr.name, machi_projection:make_summary(P_newprop), WriteRes]),
?REACT({c310, ?LINE,
[{newprop, machi_projection:make_summary(P_newprop)},
{write_result, WriteRes}]}),
@ -1428,12 +1463,14 @@ projection_transition_is_sane(
true = sets:is_disjoint(DownS2, RepairingS2),
true = sets:is_disjoint(UPIS2, RepairingS2),
%% The author must not be down.
false = lists:member(AuthorServer1, Down_list1),
false = lists:member(AuthorServer2, Down_list2),
%% TODO relaxing this is ok, perhaps?
%% %% The author must not be down.
%% false = lists:member(AuthorServer1, Down_list1),
%% false = lists:member(AuthorServer2, Down_list2),
%% TODO relaxing this is ok, perhaps, also?
%% The author must be in either the UPI or repairing list.
true = lists:member(AuthorServer1, UPI_list1 ++ Repairing_list1),
true = lists:member(AuthorServer2, UPI_list2 ++ Repairing_list2),
%% true = lists:member(AuthorServer1, UPI_list1 ++ Repairing_list1),
%% true = lists:member(AuthorServer2, UPI_list2 ++ Repairing_list2),
%% Additions to the UPI chain may only be at the tail
UPI_common_prefix = find_common_prefix(UPI_list1, UPI_list2),

View file

@ -215,7 +215,7 @@ convergence_demo_testfun(NumFLUs) ->
[receive
done ->
ok
after 995000 ->
after 120*1000 ->
exit(icky_timeout)
end || _ <- Pids]
end,
@ -267,7 +267,7 @@ convergence_demo_testfun(NumFLUs) ->
io:format(user, "\nSweet, all_hosed are identical-or-islands-inconclusive.\n", []),
timer:sleep(1000),
ok
%% end || Partition <- AllPartitionCombinations
end || Partition <- AllPartitionCombinations
%% end || Partition <- [ [{a,b},{b,d},{c,b}],
%% [{a,b},{b,d},{c,b}, {a,b},{b,a},{a,c},{c,a},{a,d},{d,a}],
%% %% [{a,b},{b,d},{c,b}, {b,a},{a,b},{b,c},{c,b},{b,d},{d,b}],
@ -275,7 +275,7 @@ convergence_demo_testfun(NumFLUs) ->
%% [{a,b},{b,d},{c,b}, {d,a},{a,d},{d,b},{b,d},{d,c},{c,d}] ]
%% end || Partition <- [ [{a,b}, {b,c}],
%% [{a,b}, {c,b}] ]
end || Partition <- [ [{a,b}, {b,c}] ] %% hosed-not-equal @ 3 FLUs
%% end || Partition <- [ [{a,b}, {b,c}] ] %% hosed-not-equal @ 3 FLUs
%% end || Partition <- [ [{b,d}] ]
%% end || Partition <- [ [{a,b}, {b,a}] ]
%% end || Partition <- [ [{a,b}, {b,a}, {a,c},{c,a}] ]