From 09051aecce6ee79170d9a4a997cdfa010ef48b6a Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 14 Apr 2015 00:54:38 +0900 Subject: [PATCH 1/6] WIP: experiments for transitioning out of inner/nested projection state --- src/machi_chain_manager1.erl | 169 +++++++++++++++++--- test/machi_chain_manager1_converge_demo.erl | 6 +- 2 files changed, 150 insertions(+), 25 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 90ca727..380c39e 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -528,10 +528,12 @@ rank_projection(#projection_v1{upi=[]}, _MemberRank, _N) -> -100; rank_projection(#projection_v1{author_server=Author, upi=UPI_list, - repairing=Repairing_list}, MemberRank, N) -> + repairing=Repairing_list, + dbg=Dbg}, MemberRank, N) -> + RankBoost = proplists:get_value({'rank_boost!', Author}, Dbg, 0), AuthorRank = orddict:fetch(Author, MemberRank), - %% (AuthorRank-AuthorRank) + % feels unstable???? - AuthorRank + % feels stable + RankBoost + + AuthorRank + ( N * length(Repairing_list)) + (N*N * length(UPI_list)). @@ -579,6 +581,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, #ch_mgr{name=MyName, proj=P_current, flap_limit=FlapLimit} = S) -> ?REACT(a30), + io:format(user, "HEE30s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- get(react), is_atom(X) orelse element(1,X) == b10])]), {P_newprop1, S2} = calc_projection(S, MyName), ?REACT({a30, ?LINE, [{newprop1, machi_projection:make_summary(P_newprop1)}]}), @@ -587,15 +590,17 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, %% Move the epoch number up ... originally done in C300. #projection_v1{epoch_number=Epoch_newprop2}=P_newprop2, - #projection_v1{epoch_number=Epoch_latest}=P_latest, + #projection_v1{epoch_number=Epoch_latest, + author_server=Author_latest}=P_latest, NewEpoch = erlang:max(Epoch_newprop2, Epoch_latest) + 1, P_newprop3 = P_newprop2#projection_v1{epoch_number=NewEpoch}, ?REACT({a30, ?LINE, [{newprop3, machi_projection:make_summary(P_newprop3)}]}), + if MyName == 'd' -> io:format(user, "QQQQQ ~w P_latest is ~w\n", [MyName, machi_projection:make_summary(P_latest)]); true -> ok end, {P_newprop10, S10} = case get_flap_count(P_newprop3) of {_, P_newprop3_flap_count} when P_newprop3_flap_count >= FlapLimit -> - AllHosed = get_all_hosed(S3), + AllHosed = get_all_hosed(P_newprop3), {P_i, S_i} = calc_projection(S3, MyName, AllHosed), P_inner = case lists:member(MyName, AllHosed) of false -> @@ -649,8 +654,122 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, _ -> {P_newprop3, S3} end, + %% QQQ WHAT IF? + %% "What if we checked P_current for flap count, and it was > 0?" + %% "What if we checked P_newprop10 for flap count, and it was = 0?" + %% "If P_current also has an inner proj, then pull that inner proj..." + %% "out and use *it* for P_newprop20/yeah!!?????" + %% + %% QQQ 2015-04-13: New thinking + %% + %% -1. Hey, I'm wondering, duh, isn't #4 the right thing???? + %% + %% 0. I think I'm leaning toward trying to use option #3 below first. + %% If that doesn't work, then consider #1 or #2? + %% + %% 1. There are a couple of approaches: a CRDT-style thing, info + %% is always growing and always mergeable (and probably having a + %% pruning problem?). So add a piece of data to the projection + %% that is always merged from all parties that says that a FLU F + %% now believes that the flapping episode that started at epoch E1 + %% has now ended as of epoch E2. 
(It would probably be prunable
+    %% so that for every FLU we maintain only the last one or two
+    %% flapping-episode-finished events?)
+    %%
+    %% 2. We could do something like query the public (and private?)
+    %% stores of all participants when we're flapping, to find
+    %% transient information that's written in some epoch E-d prior to
+    %% what we've witnessed in our latest-public-projection-read that
+    %% gave us news of the outside world via P_latest. ??
+    %%
+    %% 3. If we see P_latest come in from some other author (not us),
+    %% and it no longer has a flapping-started epoch counter that
+    %% matches what we recall from previous flaps, then we should reset
+    %% our flap count to zero and propose the last inner projection?
+    %% That makes a safe (?) transition from flapping to not-flapping,
+    %% yeah?
+    %%
+    %% 4. Wait a sec. We *KNOW* from our code below ...
+    %%        case {inner_projection_exists(P_current),
+    %%              inner_projection_exists(P_newprop10)} of
+    %%            {true, false} ->
+    %% ... that *P_newprop10* says that we're no longer flapping. Yay.
+    %% So we should just use the last inner proj, P_current's inner proj.
+    %% Hrrrmmmm, except that's what we're just trying to do brute-force here?
+    %% So, what's wrong with what we're doing here, again???
+    %%
+    %% AAAAH, right. The case I'm dealing with right now is an asymmetric
+    %% partition in a 4 member chain that affects all_hosed=[a,b,c] but
+    %% member D is *NOT* noticing anything different in the current scheme:
+    %% {inner_projection_exists(current), inner_projection_exists(new)}
+    %% is {true, true}.
+    %% Yes, that hypothesis is confirmed by time-honored io:format() tracing.
+    %%
+    %% So, we need something to kick a silly member like 'd' out of its
+    %% am-still-flapping rut. So, let's try this:
+    %% If we see a P_latest from author != MyName, and if
+    %% P_latest's author's flap count is 0, but that same member's
+    %% flap count in P_current is non-zero, then we assume that author
+    %% has moved out of flapping state and that we ought to do the same.
+    %%
+    %% Hrm, well, the 'rank_boost!' thing isn't doing what I thought it
+    %% would. So, to resume in the morning ... see the LEFT OFF HERE below.

-    react_to_env_A40(Retries, P_newprop10, P_latest,
+    %% Remember!  P_current is this manager's private in-use projection.
+    %% Its epoch is always less than or equal to P_latest's epoch!
+    Current_flap_counts = get_all_flap_counts(P_current),
+    Latest_authors_flap_count_current = proplists:get_value(
+                                      Author_latest, Current_flap_counts),
+    Latest_flap_counts = get_all_flap_counts(P_latest),
+    Latest_authors_flap_count_latest = proplists:get_value(
+                                       Author_latest, Latest_flap_counts),
+    Kicker_p = case {Latest_authors_flap_count_current,
+                     Latest_authors_flap_count_latest} of
+                   {NotUndef, undefined} when NotUndef /= undefined ->
+                       true;
+                   {_, _} ->
+                       false
+               end,
+
+    %% Here's a more common reason for moving from inner projection to
+    %% a normal projection: the old proj has an inner but the newprop
+    %% does not.
+    MoveFromInnerToNorm_p =
+        case {inner_projection_exists(P_current),
+              inner_projection_exists(P_newprop10)} of
+            {true, false} -> true;
+            {_, _} -> false
+        end,
+
+    P_newprop20 =
+        if Kicker_p orelse MoveFromInnerToNorm_p ->
+                %% TODO this clause probably needs adjustment.
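+                %% (Aside: the {flap_count, ...} element built just below
+                %% is a 3-tuple, and proplists:get_value/3 returns a value
+                %% only for 2-tuples {Key, Value} -- on a key match of any
+                %% other tuple size it stops and returns the default, e.g.
+                %%     proplists:get_value(k, [{k, a, b}], d) =:= d.
+                %% So get_flap_count/1 on this projection falls through to
+                %% its {0,0} default, which still reads as "flap count
+                %% zero", i.e. the reset that this clause intends.)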
+ FlapHack = {flapping_i, + [{flap_count, {{epk,-1},?NOT_FLAPPING},0}, + {all_hosed, []}, + {all_flap_counts, []}, + {bad, []}]}, + RankBoost = {{'rank_boost!', MyName}, 4242}, + P_inner2A = inner_projection_or_self(P_current), + P_inner2B = + P_inner2A#projection_v1{epoch_number= + P_newprop10#projection_v1.epoch_number, + dbg=[FlapHack,RankBoost]}, + io:format(user, "QQQ ~w switching to inner: ~w\n", [MyName, machi_projection:make_summary(P_inner2B)]), + + LEFT OFF HERE ... what if we: + 1. Create a "safe" projection that is upi=[],repairing=[] + 2. Declare it to be best & latest by pure fiat. + 3. Jump to C100?/C110? to a cycle of iteration, + push our P_current state to a smallest-possible-score + state, then let the rest reassemble itself. + + P_inner2B; + true -> + P_newprop10 + end, + + react_to_env_A40(Retries, P_newprop20, P_latest, LatestUnanimousP, S10). react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, @@ -776,7 +895,7 @@ react_to_env_A50(P_latest, FinalProps, S) -> ?REACT(a50), _HH = get(react), -% io:format(user, "HEE50s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- _HH, is_atom(X)])]), + io:format(user, "HEE50s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- _HH, is_atom(X)])]), %% io:format(user, "HEE50 ~w ~w ~p\n", [S#ch_mgr.name, self(), lists:reverse(_HH)]), ?REACT({a50, ?LINE, [{latest_epoch, P_latest#projection_v1.epoch_number}, @@ -796,7 +915,13 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, if LatestUnanimousP -> - ?REACT({b10, ?LINE, [{latest_unanimous_p, LatestUnanimousP}]}), + %% ?REACT({b10, ?LINE, [{latest_unanimous_p, LatestUnanimousP}]}), + ?REACT({b10, ?LINE, [{latest_unanimous_p, LatestUnanimousP}, + {latest_epoch,P_latest#projection_v1.epoch_number}, + {latest_author,P_latest#projection_v1.author_server}, + {newprop_epoch,P_newprop#projection_v1.epoch_number}, + {newprop_author,P_newprop#projection_v1.author_server} +]}), put(b10_hack, false), react_to_env_C100(P_newprop, P_latest, S); @@ -894,6 +1019,7 @@ io:format(user, "<--x=~w-oooo-~w-~w-~w->", [X, MyName, P_newprop_flap_count,Flap true -> ?REACT({b10, ?LINE}), + ?REACT({b10, ?LINE, [{retries,Retries},{rank_latest, Rank_latest}, {rank_newprop, Rank_newprop}, {latest_author, P_latest#projection_v1.author_server}]}), % TODO debug delete me! put(b10_hack, false), %% P_newprop is best, so let's write it. @@ -1070,12 +1196,13 @@ react_to_env_C310(P_newprop, S) -> ?REACT(c310), Epoch = P_newprop#projection_v1.epoch_number, {WriteRes, S2} = cl_write_public_proj_skip_local_error(Epoch, P_newprop, S), + io:format(user, "QQQ ~w public write ~w: ~w\n", [S#ch_mgr.name, machi_projection:make_summary(P_newprop), WriteRes]), ?REACT({c310, ?LINE, [{newprop, machi_projection:make_summary(P_newprop)}, {write_result, WriteRes}]}), react_to_env_A10(S2). -calculate_flaps(P_newprop, _P_current, FlapLimit, +calculate_flaps(P_newprop, _P_current, _FlapLimit, #ch_mgr{name=MyName, proj_history=H, flap_start=FlapStart, flaps=Flaps, runenv=RunEnv0} = S) -> RunEnv1 = replace(RunEnv0, [{flapping_i, []}]), @@ -1175,16 +1302,18 @@ calculate_flaps(P_newprop, _P_current, FlapLimit, AllHosed = [] end, - %% If there's at least one count in AllFlapCounts that isn't my - %% flap count, and if it's over the flap limit, then consider them - %% settled. 
- AllFlapCountsSettled = lists:keydelete(MyName, 1, AllFlapCounts) /= [] - andalso - my_find_minmost(AllFlapCounts) >= FlapLimit, + %% 2015-04-13: TODO: this whole notion of "settled" flap counts + %% has not worked as initially planned. Remove it all. + %% %% If there's at least one count in AllFlapCounts that isn't my + %% %% flap count, and if it's over the flap limit, then consider them + %% %% settled. + %% AllFlapCountsSettled = lists:keydelete(MyName, 1, AllFlapCounts) /= [] + %% andalso + %% my_find_minmost(AllFlapCounts) >= FlapLimit, FlappingI = {flapping_i, [{flap_count, {NewFlapStart, NewFlaps}}, {all_hosed, AllHosed}, {all_flap_counts, lists:sort(AllFlapCounts)}, - {all_flap_counts_settled, AllFlapCountsSettled}, + %% {all_flap_counts_settled, AllFlapCountsSettled}, {bad,BadFLUs}, {da_downu, DownUnion}, % debugging aid {da_hosedtu, HosedTransUnion}, % debugging aid @@ -1501,7 +1630,7 @@ get_raw_flapping_i(#projection_v1{dbg=Dbg}) -> proplists:get_value(flapping_i, Dbg, []). get_flap_count(P) -> - proplists:get_value(flap_count, get_raw_flapping_i(P), 0). + proplists:get_value(flap_count, get_raw_flapping_i(P), {0,0}). get_all_flap_counts(P) -> proplists:get_value(all_flap_counts, get_raw_flapping_i(P), []). @@ -1515,11 +1644,7 @@ get_all_flap_counts_counts(P) -> end. get_all_hosed(P) when is_record(P, projection_v1)-> - proplists:get_value(all_hosed, get_raw_flapping_i(P), []); -get_all_hosed(S) when is_record(S, ch_mgr) -> - proplists:get_value(all_hosed, - proplists:get_value(flapping_i, S#ch_mgr.runenv, []), - []). + proplists:get_value(all_hosed, get_raw_flapping_i(P), []). merge_flap_counts(FlapCounts) -> merge_flap_counts(FlapCounts, orddict:new()). diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index 1d2c537..9b585fc 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -273,9 +273,9 @@ convergence_demo_testfun(NumFLUs) -> %% %% [{a,b},{b,d},{c,b}, {b,a},{a,b},{b,c},{c,b},{b,d},{d,b}], %% [{a,b},{b,d},{c,b}, {c,a},{a,c},{c,b},{b,c},{c,d},{d,c}], %% [{a,b},{b,d},{c,b}, {d,a},{a,d},{d,b},{b,d},{d,c},{c,d}] ] - end || Partition <- [ [{a,b}, {b,c}], - [{a,b}, {c,b}] ] - %% end || Partition <- [ [{a,b}, {b,c}] ] %% hosed-not-equal @ 3 FLUs + %% end || Partition <- [ [{a,b}, {b,c}], + %% [{a,b}, {c,b}] ] + end || Partition <- [ [{a,b}, {b,c}] ] %% hosed-not-equal @ 3 FLUs %% end || Partition <- [ [{b,d}] ] %% end || Partition <- [ [{a,b}, {b,a}] ] %% end || Partition <- [ [{a,b}, {b,a}, {a,c},{c,a}] ] From 59936eda62954743eff64d5aea24a6b7e6f4f151 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 14 Apr 2015 15:30:24 +0900 Subject: [PATCH 2/6] WIP: By Jove, I believe the chain manager is working --- TODO-shortterm.org | 10 +- include/machi_projection.hrl | 4 +- src/machi_chain_manager1.erl | 145 ++++++++++++-------- test/machi_chain_manager1_converge_demo.erl | 6 +- 4 files changed, 104 insertions(+), 61 deletions(-) diff --git a/TODO-shortterm.org b/TODO-shortterm.org index 428bddc..216d65d 100644 --- a/TODO-shortterm.org +++ b/TODO-shortterm.org @@ -7,7 +7,7 @@ Done via compare() func. ** DONE Change all protocol ops to add epoch ID -** TODO Add projection store to each FLU. +** DONE Add projection store to each FLU. *** DONE What should the API look like? (borrow from chain mgr PoC?) @@ -23,8 +23,14 @@ method as append/write where there's a variable size blob. But we'll format that blob as a term_to_binary(). 
Then dispatch to a single func, and pattern match Erlang style in that func. -*** TODO Do it. +*** DONE Do it. +** TODO Fix all known bugs with Chain Manager + +*** DONE Fix known bugs +*** TODO Clean up crufty TODO comments and other obvious cruft + +** TODO Finish OTP'izing the Chain Manager with FLU & proj store processes ** TODO Change all protocol ops to enforce the epoch ID ** TODO Add projection wedging logic to each FLU. diff --git a/include/machi_projection.hrl b/include/machi_projection.hrl index 59baf03..670116f 100644 --- a/include/machi_projection.hrl +++ b/include/machi_projection.hrl @@ -44,11 +44,11 @@ epoch_number :: pv1_epoch_n(), epoch_csum :: pv1_csum(), author_server :: pv1_server(), - creation_time :: pv1_timestamp(), all_members :: [pv1_server()], - down :: [pv1_server()], + creation_time :: pv1_timestamp(), upi :: [pv1_server()], repairing :: [pv1_server()], + down :: [pv1_server()], dbg :: list(), %proplist(), is checksummed dbg2 :: list(), %proplist(), is not checksummed members_dict :: p_srvr_dict() diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 380c39e..9971a0a 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -139,10 +139,7 @@ init({MyName, MembersDict, MgrOpts}) -> {flapping_i, Opt(flapping, [])}, {up_nodes, Opt(up_nodes, not_init_yet)}], ActiveP = Opt(active_mode, true), - Down_list = All_list -- [MyName], - UPI_list = [MyName], - NoneProj = machi_projection:new(MyName, MembersDict, - Down_list, UPI_list, [], []), + NoneProj = make_none_projection(MyName, All_list, MembersDict), Proxies = orddict:fold( fun(K, P, Acc) -> {ok, Pid} = ?FLU_PC:start_link(P), @@ -220,10 +217,15 @@ code_change(_OldVsn, S, _Extra) -> %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +make_none_projection(MyName, All_list, MembersDict) -> + Down_list = All_list, + UPI_list = [], + machi_projection:new(MyName, MembersDict, UPI_list, Down_list, [], []). + set_active_timer(#ch_mgr{name=MyName, members_dict=MembersDict}=S) -> FLU_list = [P#p_srvr.name || {_,P} <- orddict:to_list(MembersDict)], USec = calc_sleep_ranked_order(1000, 2000, MyName, FLU_list), - {ok, TRef} = timer:send_interval(USec, yo_yo_yo), + {ok, TRef} = timer:send_interval(USec, yo_yo_yo_todo), S#ch_mgr{timer=TRef}. do_cl_write_public_proj(Proj, S) -> @@ -516,6 +518,29 @@ rank_and_sort_projections(Ps, CurrentProj) -> %% Caller must ensure all Projs are of the same epoch number. %% If the caller gives us projections with different epochs, we assume %% that the caller is doing an OK thing. +%% +%% TODO: This implementation currently gives higher rank to the last +%% member of All_list, which is typically/always/TODO-CLARIFY +%% sorted. That's fine, but there's a source of unnecessary +%% churn: during repair, we assume that the head of the chain is +%% the coordinator of the repair. So any time that the head +%% makes a repair-related transition, that projection may get +%% quickly replaced by an identical projection that merely has +%% higher rank because it's authored by a higher-ranked member. +%% Worst case, for chain len=4: +%% E+0: author=a, upi=[a], repairing=[b,c,d] +%% E+1: author=b, upi=[a], repairing=[b,c,d] (**) +%% E+2: author=c, upi=[a], repairing=[b,c,d] (**) +%% E+3: author=d, upi=[a], repairing=[b,c,d] (**) +%% E+4: author=a, upi=[a,b], repairing=[c,d] +%% E+5: author=b, upi=[a,b], repairing=[c,d] (**) +%% E+6: author=c, upi=[a,b], repairing=[c,d] (**) +%% E+7: author=d, upi=[a,b], repairing=[c,d] (**) +%% E+... 
6 more (**) epochs when c & d finish their respective repairs. +%% Ideally, the "(**)" epochs are avoidable churn. +%% Perhaps this means that we should change the responsibility +%% for repair management to the highest ranking member of the +%% UPI_list? rank_projections(Projs, CurrentProj) -> #projection_v1{all_members=All_list} = CurrentProj, @@ -528,11 +553,8 @@ rank_projection(#projection_v1{upi=[]}, _MemberRank, _N) -> -100; rank_projection(#projection_v1{author_server=Author, upi=UPI_list, - repairing=Repairing_list, - dbg=Dbg}, MemberRank, N) -> - RankBoost = proplists:get_value({'rank_boost!', Author}, Dbg, 0), + repairing=Repairing_list}, MemberRank, N) -> AuthorRank = orddict:fetch(Author, MemberRank), - RankBoost + AuthorRank + ( N * length(Repairing_list)) + (N*N * length(UPI_list)). @@ -581,8 +603,9 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, #ch_mgr{name=MyName, proj=P_current, flap_limit=FlapLimit} = S) -> ?REACT(a30), - io:format(user, "HEE30s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- get(react), is_atom(X) orelse element(1,X) == b10])]), + %% io:format(user, "HEE30s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- get(react), is_atom(X) orelse element(1,X) == b10])]), {P_newprop1, S2} = calc_projection(S, MyName), + ?REACT({a30, ?LINE, [{current, machi_projection:make_summary(S#ch_mgr.proj)}]}), ?REACT({a30, ?LINE, [{newprop1, machi_projection:make_summary(P_newprop1)}]}), %% Are we flapping yet? @@ -595,7 +618,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, NewEpoch = erlang:max(Epoch_newprop2, Epoch_latest) + 1, P_newprop3 = P_newprop2#projection_v1{epoch_number=NewEpoch}, ?REACT({a30, ?LINE, [{newprop3, machi_projection:make_summary(P_newprop3)}]}), - if MyName == 'd' -> io:format(user, "QQQQQ ~w P_latest is ~w\n", [MyName, machi_projection:make_summary(P_latest)]); true -> ok end, + %% if MyName == 'd' -> io:format(user, "QQQQQ ~w P_latest is ~w\n", [MyName, machi_projection:make_summary(P_latest)]); true -> ok end, {P_newprop10, S10} = case get_flap_count(P_newprop3) of @@ -637,9 +660,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, andalso P_oldinner#projection_v1.down == P_inner#projection_v1.down -> - %% HRM, distrust?... - %% P_oldinner#projection_v1.epoch_number; - P_oldinner#projection_v1.epoch_number + 1; + P_oldinner#projection_v1.epoch_number; true -> P_oldinner#projection_v1.epoch_number + 1 end @@ -741,36 +762,50 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, {_, _} -> false end, - P_newprop20 = - if Kicker_p orelse MoveFromInnerToNorm_p -> - %% TODO this clause probably needs adjustment. - FlapHack = {flapping_i, - [{flap_count, {{epk,-1},?NOT_FLAPPING},0}, - {all_hosed, []}, - {all_flap_counts, []}, - {bad, []}]}, - RankBoost = {{'rank_boost!', MyName}, 4242}, - P_inner2A = inner_projection_or_self(P_current), - P_inner2B = - P_inner2A#projection_v1{epoch_number= + if Kicker_p orelse MoveFromInnerToNorm_p -> + ClauseInfo = [{inner_kicker, Kicker_p}, + {move_from_inner, MoveFromInnerToNorm_p}], + ?REACT({a30, ?LINE, ClauseInfo}), + %% %% 2015-04-14: YEAH, this appears to work! + %% %% 1. Create a "safe" projection that is upi=[],repairing=[] + %% %% 2. Declare it to be best & latest by pure fiat. + %% %% (The C100 transition will double-check that it's safe.) + %% %% 3. Jump to C100. Then, for the next iteration, + %% %% our P_current state to a smallest-possible-score + %% %% state ... 
and let the chain reassemble itself from + %% %% length zero. + %% #projection_v1{epoch_number=Epoch_newprop10, all_members=All_list, + %% members_dict=MembersDict} = P_newprop10, + %% P_noneprop0 = make_none_projection(MyName, All_list, MembersDict), + %% P_noneprop1 = P_noneprop0#projection_v1{epoch_number=Epoch_newprop10}, + %% %% Just to be clear, we clobber any flapping info by setting dbg. + %% P_noneprop = P_noneprop1#projection_v1{dbg=ClauseInfo}, + %% react_to_env_C100(P_noneprop, P_latest, S); + + %% 2015-04-14: Let's experiment with using the current inner + %% projection (or, if there really is no inner, just P_current). + %% This is safe because it's already P_current and by assumption, + %% anything that made it through the logical maze to get here + %% is safe. So re-using it with a higher epoch number doesn't + %% make any significant change. + %% + %% Yeah, it appears to work, also, nice! This can help save some + %% repair operations (compared to the other safe thing to do + %% here, which uses make_none_projection() to build & repair the + %% entire chain from scratch). + + P_inner2A = inner_projection_or_self(P_current), + P_inner2B = + P_inner2A#projection_v1{epoch_number= P_newprop10#projection_v1.epoch_number, - dbg=[FlapHack,RankBoost]}, - io:format(user, "QQQ ~w switching to inner: ~w\n", [MyName, machi_projection:make_summary(P_inner2B)]), + dbg=ClauseInfo}, + react_to_env_C100(P_inner2B, P_latest, S); - LEFT OFF HERE ... what if we: - 1. Create a "safe" projection that is upi=[],repairing=[] - 2. Declare it to be best & latest by pure fiat. - 3. Jump to C100?/C110? to a cycle of iteration, - push our P_current state to a smallest-possible-score - state, then let the rest reassemble itself. - - P_inner2B; - true -> - P_newprop10 - end, - - react_to_env_A40(Retries, P_newprop20, P_latest, - LatestUnanimousP, S10). + true -> + ?REACT({a30, ?LINE}), + react_to_env_A40(Retries, P_newprop10, P_latest, + LatestUnanimousP, S10) + end. react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, #ch_mgr{name=MyName, proj=P_current}=S) -> @@ -895,7 +930,7 @@ react_to_env_A50(P_latest, FinalProps, S) -> ?REACT(a50), _HH = get(react), - io:format(user, "HEE50s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- _HH, is_atom(X)])]), + %% io:format(user, "HEE50s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- _HH, is_atom(X)])]), %% io:format(user, "HEE50 ~w ~w ~p\n", [S#ch_mgr.name, self(), lists:reverse(_HH)]), ?REACT({a50, ?LINE, [{latest_epoch, P_latest#projection_v1.epoch_number}, @@ -1086,10 +1121,10 @@ react_to_env_C100(P_newprop, P_latest, %% By process of elimination, P_newprop is best, %% so let's write it. 
io:format(user, "\nUrp: ~p ~p ~p ~p\n", [MyName, ShortCircuit_p, _AnyOtherReturnValue, Inner_sane_p]), -io:format(user, "c100 P_newprop : ~w\n", [machi_projection:make_summary(P_newprop)]), -io:format(user, "c100 P_newpropY: ~w\n", [machi_projection:make_summary(P_newpropY)]), -io:format(user, "c100 P_latest : ~w\n", [machi_projection:make_summary(P_latest)]), -io:format(user, "c100 P_latestY: ~w\n", [machi_projection:make_summary(P_latestY)]), +%% io:format(user, "c100 P_newprop : ~w\n", [machi_projection:make_summary(P_newprop)]), +%% io:format(user, "c100 P_newpropY: ~w\n", [machi_projection:make_summary(P_newpropY)]), +%% io:format(user, "c100 P_latest : ~w\n", [machi_projection:make_summary(P_latest)]), +%% io:format(user, "c100 P_latestY: ~w\n", [machi_projection:make_summary(P_latestY)]), ?REACT({c100, ?LINE, [not_sane]}), react_to_env_C300(P_newprop, P_latest, S) end. @@ -1151,7 +1186,7 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H} = S) -> end, HH = get(react), - io:format(user, "HEE120s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- HH, is_atom(X)])]), + %% io:format(user, "HEE120s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- HH, is_atom(X)])]), %% io:format(user, "HEE120 ~w ~w ~p\n", [S#ch_mgr.name, self(), lists:reverse(HH)]), ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}), @@ -1196,7 +1231,7 @@ react_to_env_C310(P_newprop, S) -> ?REACT(c310), Epoch = P_newprop#projection_v1.epoch_number, {WriteRes, S2} = cl_write_public_proj_skip_local_error(Epoch, P_newprop, S), - io:format(user, "QQQ ~w public write ~w: ~w\n", [S#ch_mgr.name, machi_projection:make_summary(P_newprop), WriteRes]), + %% io:format(user, "QQQ ~w public write ~w: ~w\n", [S#ch_mgr.name, machi_projection:make_summary(P_newprop), WriteRes]), ?REACT({c310, ?LINE, [{newprop, machi_projection:make_summary(P_newprop)}, {write_result, WriteRes}]}), @@ -1428,12 +1463,14 @@ projection_transition_is_sane( true = sets:is_disjoint(DownS2, RepairingS2), true = sets:is_disjoint(UPIS2, RepairingS2), - %% The author must not be down. - false = lists:member(AuthorServer1, Down_list1), - false = lists:member(AuthorServer2, Down_list2), + %% TODO relaxing this is ok, perhaps? + %% %% The author must not be down. + %% false = lists:member(AuthorServer1, Down_list1), + %% false = lists:member(AuthorServer2, Down_list2), + %% TODO relaxing this is ok, perhaps, also? %% The author must be in either the UPI or repairing list. 
- true = lists:member(AuthorServer1, UPI_list1 ++ Repairing_list1), - true = lists:member(AuthorServer2, UPI_list2 ++ Repairing_list2), + %% true = lists:member(AuthorServer1, UPI_list1 ++ Repairing_list1), + %% true = lists:member(AuthorServer2, UPI_list2 ++ Repairing_list2), %% Additions to the UPI chain may only be at the tail UPI_common_prefix = find_common_prefix(UPI_list1, UPI_list2), diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index 9b585fc..176e721 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -215,7 +215,7 @@ convergence_demo_testfun(NumFLUs) -> [receive done -> ok - after 995000 -> + after 120*1000 -> exit(icky_timeout) end || _ <- Pids] end, @@ -267,7 +267,7 @@ convergence_demo_testfun(NumFLUs) -> io:format(user, "\nSweet, all_hosed are identical-or-islands-inconclusive.\n", []), timer:sleep(1000), ok - %% end || Partition <- AllPartitionCombinations + end || Partition <- AllPartitionCombinations %% end || Partition <- [ [{a,b},{b,d},{c,b}], %% [{a,b},{b,d},{c,b}, {a,b},{b,a},{a,c},{c,a},{a,d},{d,a}], %% %% [{a,b},{b,d},{c,b}, {b,a},{a,b},{b,c},{c,b},{b,d},{d,b}], @@ -275,7 +275,7 @@ convergence_demo_testfun(NumFLUs) -> %% [{a,b},{b,d},{c,b}, {d,a},{a,d},{d,b},{b,d},{d,c},{c,d}] ] %% end || Partition <- [ [{a,b}, {b,c}], %% [{a,b}, {c,b}] ] - end || Partition <- [ [{a,b}, {b,c}] ] %% hosed-not-equal @ 3 FLUs + %% end || Partition <- [ [{a,b}, {b,c}] ] %% hosed-not-equal @ 3 FLUs %% end || Partition <- [ [{b,d}] ] %% end || Partition <- [ [{a,b}, {b,a}] ] %% end || Partition <- [ [{a,b}, {b,a}, {a,c},{c,a}] ] From 9e587b3d119adc7d680c147c26cb5c63d30e4091 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 14 Apr 2015 16:17:49 +0900 Subject: [PATCH 3/6] WIP: crufty TODO & comment cleanup --- TODO-shortterm.org | 4 +- src/machi_chain_manager1.erl | 257 +++++--------------- test/machi_chain_manager1_converge_demo.erl | 4 +- 3 files changed, 70 insertions(+), 195 deletions(-) diff --git a/TODO-shortterm.org b/TODO-shortterm.org index 216d65d..3abd967 100644 --- a/TODO-shortterm.org +++ b/TODO-shortterm.org @@ -28,7 +28,9 @@ func, and pattern match Erlang style in that func. ** TODO Fix all known bugs with Chain Manager *** DONE Fix known bugs -*** TODO Clean up crufty TODO comments and other obvious cruft +*** DONE Clean up crufty TODO comments and other obvious cruft +*** TODO Re-add verification step of stable epochs, including inner projections! +*** TODO Attempt to remove cruft items in flapping_i? ** TODO Finish OTP'izing the Chain Manager with FLU & proj store processes ** TODO Change all protocol ops to enforce the epoch ID diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 9971a0a..b27c974 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -541,6 +541,8 @@ rank_and_sort_projections(Ps, CurrentProj) -> %% Perhaps this means that we should change the responsibility %% for repair management to the highest ranking member of the %% UPI_list? +%% TODO Hrrrmmmmm ... what about the TODO comment in A40's A40a clause? +%% That could perhaps resolve this same problem in a better way? 
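+%% To make the (**) churn above concrete, here is a worked example of
+%% the score computed by rank_projection/3. The member ranks a=1 .. d=4
+%% are an illustrative assumption only; this diff does not show how
+%% MemberRank is actually built, beyond "higher rank to the last member".
+%%
+%%     rank_example() ->
+%%         N = 4,                              % length(All_list)
+%%         Score = fun(AuthorRank, UPI, Repairing) ->
+%%                         AuthorRank + (N * length(Repairing))
+%%                                    + (N*N * length(UPI))
+%%                 end,
+%%         29 = Score(1, [a], [b,c,d]),        % E+0, author a
+%%         32 = Score(4, [a], [b,c,d]),        % E+3: identical chain, but
+%%         ok.                                 % higher rank wins => churn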
rank_projections(Projs, CurrentProj) -> #projection_v1{all_members=All_list} = CurrentProj, @@ -603,7 +605,6 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, #ch_mgr{name=MyName, proj=P_current, flap_limit=FlapLimit} = S) -> ?REACT(a30), - %% io:format(user, "HEE30s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- get(react), is_atom(X) orelse element(1,X) == b10])]), {P_newprop1, S2} = calc_projection(S, MyName), ?REACT({a30, ?LINE, [{current, machi_projection:make_summary(S#ch_mgr.proj)}]}), ?REACT({a30, ?LINE, [{newprop1, machi_projection:make_summary(P_newprop1)}]}), @@ -618,7 +619,6 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, NewEpoch = erlang:max(Epoch_newprop2, Epoch_latest) + 1, P_newprop3 = P_newprop2#projection_v1{epoch_number=NewEpoch}, ?REACT({a30, ?LINE, [{newprop3, machi_projection:make_summary(P_newprop3)}]}), - %% if MyName == 'd' -> io:format(user, "QQQQQ ~w P_latest is ~w\n", [MyName, machi_projection:make_summary(P_latest)]); true -> ok end, {P_newprop10, S10} = case get_flap_count(P_newprop3) of @@ -675,49 +675,22 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, _ -> {P_newprop3, S3} end, - %% QQQ WHAT IF? - %% "What if we checked P_current for flap count, and it was > 0?" - %% "What if we checked P_newprop10 for flap count, and it was = 0?" - %% "If P_current also has an inner proj, then pull that inner proj..." - %% "out and use *it* for P_newprop20/yeah!!?????" + + %% Here's a more common reason for moving from inner projection to + %% a normal projection: the old proj has an inner but the newprop + %% does not. + MoveFromInnerToNorm_p = + case {inner_projection_exists(P_current), + inner_projection_exists(P_newprop10)} of + {true, false} -> true; + {_, _} -> false + end, + + %% If P_current says that we believe that we're currently flapping, + %% and if P_newprop10 says that we're no longer flapping, then we + %% really ought to stop flapping, right. %% - %% QQQ 2015-04-13: New thinking - %% - %% -1. Hey, I'm wondering, duh, isn't #4 the right thing???? - %% - %% 0. I think I'm leaning toward trying to use option #3 below first. - %% If that doesn't work, then consider #1 or #2? - %% - %% 1. There are a couple of approaches: a CRDT-style thing, info - %% is always growing and always mergeable (and probably having a - %% pruning problem?). So add a piece of data to the projection - %% that is always merged from all parties that says that a FLU F - %% now believes that the flapping episode that started at epoch E1 - %% has now ended as of epoch E2. (It would probably be prunable - %% that for every FLU we maintain only the last two or one - %% flapping episode finished events?) - %% - %% 2. We could do something like query the public (and private?) - %% stores of all participants when we've flapping, to find - %% transient information that's written in some epoch E-d prior to - %% what we've witnessed in our latest-public-projection-read that - %% gave us news of the outside world via P_latest. ?? - %% - %% 3. If we see P_latest come in from some other author (not us), - %% and it no longer has an flapping started epoch counter that - %% matches what recall from previous flaps, then we should reset - %% our flap count to zero and propose the last inner projection? - %% That makes a safe (?) transition from flapping to not-flapping, - %% yeah? - %% - %% 4. What a sec. We *KNOW* from our code below ... 
- %% case {inner_projection_exists(P_current), - %% inner_projection_exists(P_newprop10)} of - %% {true, false} -> - %% ... that *P_newprop10* says that we're no longer flapping. Yay. - %% So we should just use the last inner proj, P_current's inner proj. - %% Hrrrmmmm, except that's what we're just trying to do brute-force here? - %% So, what's wrong with what we're doing here, again??? + %% Not quite so simple.... %% %% AAAAH, right. The case I'm dealing with right now is an asymmetric %% partition in a 4 member chain that affects all_hosed=[a,b,c] but @@ -727,17 +700,15 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, %% Yes, that hypothesis is confirmed by time-honored io:format() tracing. %% %% So, we need something to kick a silly member like 'd' out of its - %% am-still-flapping rut. So, let's try this: - %% If we see a P_latest from author != MyName, and if it has a - %% P_latest's author's flap count is 0, but that same member's + %% rut of am-still-flapping. So, let's try this: + %% If we see a P_latest from author != MyName, and if P_latest's + %% author's flap count is now 0 (latest!), but that same member's %% flap count in P_current is non-zero, then we assume that author - %% has moved out of flapping state and that we ought to do the same. - %% - %% Hrm, well, the 'rank_boost!' thing isn't doing what I thought it - %% would. So, to resume in the morning ... see the LEFT OFF HERE below. + %% has moved out of flapping state and that therefore we ought to do + %% the same. %% Remember! P_current is this manager's private in-use projection. - %% It is always older or equal to P_latest's epoch! + %% It is always less than or equal to P_latest's epoch! Current_flap_counts = get_all_flap_counts(P_current), Latest_authors_flap_count_current = proplists:get_value( Author_latest, Current_flap_counts), @@ -752,17 +723,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, false end, - %% Here's a more common reason for moving from inner projection to - %% a normal projection: the old prob has an inner but the newprop - %% does not. - MoveFromInnerToNorm_p = - case {inner_projection_exists(P_current), - inner_projection_exists(P_newprop10)} of - {true, false} -> true; - {_, _} -> false - end, - - if Kicker_p orelse MoveFromInnerToNorm_p -> + if MoveFromInnerToNorm_p orelse Kicker_p -> ClauseInfo = [{inner_kicker, Kicker_p}, {move_from_inner, MoveFromInnerToNorm_p}], ?REACT({a30, ?LINE, ClauseInfo}), @@ -792,7 +753,16 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, %% Yeah, it appears to work, also, nice! This can help save some %% repair operations (compared to the other safe thing to do %% here, which uses make_none_projection() to build & repair the - %% entire chain from scratch). + %% entire chain from scratch). Note that this isn't a guarantee + %% that repair steps will be minimized: for a 4-member cluster + %% that has an asymmetric partition which organizes 3 clusters of + %% inner-upi=[a], inner-upi=[b], and inner-upi[c,d], there is no + %% guarantee (yet?) that the [c,d] chain will be the UPI basis + %% for repairs when the partition is healed: the quickest author + %% after the healing will make that choice for everyone. + %% TODO: Perhaps that quickest author should consult all of the + %% other private stores, check their inner, and if there is a + %% higher rank there, then goto C200 for a wait-and-see cycle? 
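+            %% (Aside, a sketch of the helper used just below -- inferred
+            %% from how {inner_projection, P_inner2} is stashed in dbg up
+            %% in A30, not necessarily the actual implementation:
+            %%
+            %%     inner_projection_or_self(P) ->
+            %%         case proplists:get_value(inner_projection,
+            %%                                  P#projection_v1.dbg) of
+            %%             undefined -> P;
+            %%             P_inner   -> P_inner
+            %%         end.
+            %%
+            %% inner_projection_exists/1 is presumably the same lookup,
+            %% tested against 'undefined'.)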
P_inner2A = inner_projection_or_self(P_current), P_inner2B = @@ -864,7 +834,7 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, %% A40a (see flowchart) Rank_newprop > Rank_latest -> - ?REACT({b10, ?LINE, + ?REACT({a40, ?LINE, [{rank_latest, Rank_latest}, {rank_newprop, Rank_newprop}, {latest_author, P_latest#projection_v1.author_server}]}), @@ -928,11 +898,6 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, react_to_env_A50(P_latest, FinalProps, S) -> ?REACT(a50), - - _HH = get(react), - %% io:format(user, "HEE50s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- _HH, is_atom(X)])]), - %% io:format(user, "HEE50 ~w ~w ~p\n", [S#ch_mgr.name, self(), lists:reverse(_HH)]), - ?REACT({a50, ?LINE, [{latest_epoch, P_latest#projection_v1.epoch_number}, {final_props, FinalProps}]}), {{no_change, FinalProps, P_latest#projection_v1.epoch_number}, S}. @@ -943,20 +908,16 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, ?REACT(b10), {_P_newprop_flap_time, P_newprop_flap_count} = get_flap_count(P_newprop), - _LatestAllFlapCounts = get_all_flap_counts_counts(P_latest), - %% Transitive flap counts are buggy: the current method to observe - %% them is **buggy**. - %% P_latest_trans_flap_count = my_find_minmost(LatestAllFlapCounts), if LatestUnanimousP -> - %% ?REACT({b10, ?LINE, [{latest_unanimous_p, LatestUnanimousP}]}), - ?REACT({b10, ?LINE, [{latest_unanimous_p, LatestUnanimousP}, - {latest_epoch,P_latest#projection_v1.epoch_number}, - {latest_author,P_latest#projection_v1.author_server}, - {newprop_epoch,P_newprop#projection_v1.epoch_number}, - {newprop_author,P_newprop#projection_v1.author_server} -]}), + ?REACT({b10, ?LINE, + [{latest_unanimous_p, LatestUnanimousP}, + {latest_epoch,P_latest#projection_v1.epoch_number}, + {latest_author,P_latest#projection_v1.author_server}, + {newprop_epoch,P_newprop#projection_v1.epoch_number}, + {newprop_author,P_newprop#projection_v1.author_server} + ]}), put(b10_hack, false), react_to_env_C100(P_newprop, P_latest, S); @@ -965,25 +926,11 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, %% I am flapping ... what else do I do? ?REACT({b10, ?LINE, [i_am_flapping, {newprop_flap_count, P_newprop_flap_count}, - %% {latest_trans_flap_count, P_latest_trans_flap_count}, {flap_limit, FlapLimit}]}), _B10Hack = get(b10_hack), - %% if _B10Hack == false andalso P_newprop_flap_count - FlapLimit - 3 =< 0 -> io:format(user, "{FLAP: ~w flaps ~w}!\n", [S#ch_mgr.name, P_newprop_flap_count]), put(b10_hack, true); true -> ok end, io:format(user, "{FLAP: ~w flaps ~w}!\n", [S#ch_mgr.name, P_newprop_flap_count]), -%io:format(user, "FLAP: ~w flapz ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- get(react), is_atom(X)])]), - if - %% So, if we noticed a flap count by some FLU X with a - %% count below FlapLimit, then X crashes so that X's - %% flap count remains below FlapLimit, then we could get - %% stuck forever? Hrm, except that 'crashes' ought to be - %% detected by our own failure detector and get us out of - %% this current flapping situation, right? TODO - %% - %% 2015-04-10: TODO Flap count detection, as it has - %% been attempted before now, is buggy. - %% %% MEANWHILE, we have learned some things about this %% algorithm in the past few months. With the introduction %% of the "inner projection" concept, we know that the inner @@ -999,7 +946,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, %% doesn't give an accurate sense of global flapping state. 
%% FlapLimit is enough to be able to tell us to slow down. - true -> %% P_latest_trans_flap_count >= FlapLimit -> + true -> %% We already know that I'm flapping. We need to %% signal to the rest of the world that I'm writing %% and flapping and churning, so we cannot always @@ -1017,7 +964,6 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, end, FinalProps = [{my_flap_limit, FlapLimit}, {throttle_seconds, ThrottleTime}], -io:format(user, "<--x=~w-.--~w-~w-~w->", [X, MyName, P_newprop_flap_count,FlapLimit]), react_to_env_A50(P_latest, FinalProps, S2); true -> %% It is our moral imperative to write so that @@ -1025,7 +971,6 @@ io:format(user, "<--x=~w-.--~w-~w-~w->", [X, MyName, P_newprop_flap_count,FlapLi %% everyone notices then eventually falls into %% consensus. ?REACT({b10, ?LINE, [flap_continue]}), -io:format(user, "<--x=~w-oooo-~w-~w-~w->", [X, MyName, P_newprop_flap_count,FlapLimit]), react_to_env_C300(P_newprop, P_latest, S2) end end; @@ -1065,21 +1010,6 @@ react_to_env_C100(P_newprop, P_latest, #ch_mgr{name=MyName, proj=P_current}=S) -> ?REACT(c100), - %% TODO 2015-04-10 - %% OK, well, we need to be checking sanity on inner projections here, - %% but how to do it is still a bit of a mystery. - %% - %% If the *Y bindings are identical to incoming args, then we aren't - %% checking at all. That's bad, but we don't go into Infinite Loops of - %% ReallyReallyBad. - - P_newpropY = P_newprop, - P_latestY = P_latest, - P_currentY = P_current, - %% P_newpropY = inner_projection_or_self(P_newprop), - %% P_latestY = inner_projection_or_self(P_latest), - %% P_currentY = inner_projection_or_self(P_current), - I_am_UPI_in_newprop_p = lists:member(MyName, P_newprop#projection_v1.upi), I_am_Repairing_in_latest_p = lists:member(MyName, P_latest#projection_v1.repairing), @@ -1092,13 +1022,6 @@ react_to_env_C100(P_newprop, P_latest, Current_sane_p = projection_transition_is_sane(P_current, P_latest, MyName), - Inner_sane_p = - if P_currentY == P_current, P_latestY == P_latest -> - true; - true -> - projection_transition_is_sane(P_currentY, P_latestY, MyName) - end, - case {ShortCircuit_p, Current_sane_p} of _ when P_current#projection_v1.epoch_number == 0 -> %% Epoch == 0 is reserved for first-time, just booting conditions. @@ -1110,43 +1033,24 @@ react_to_env_C100(P_newprop, P_latest, %% am/should be repairing. We ignore our proposal and try %% to go with the latest. ?REACT({c100, ?LINE, [repairing_short_circuit]}), - if Inner_sane_p == false -> io:format(user, "QQQ line ~p false\n", [?LINE]), timer:sleep(500); true -> ok end, react_to_env_C110(P_latest, S); - {_, true} when Inner_sane_p -> + {_, true} -> ?REACT({c100, ?LINE, [sane]}), - if Inner_sane_p == false -> io:format(user, "QQQ line ~p false\n", [?LINE]), timer:sleep(500); true -> ok end, react_to_env_C110(P_latest, S); {_, _AnyOtherReturnValue} -> - %% P_latest is not sane or else P_latestY is not sane. + %% P_latest is not sane. %% By process of elimination, P_newprop is best, %% so let's write it. 
-io:format(user, "\nUrp: ~p ~p ~p ~p\n", [MyName, ShortCircuit_p, _AnyOtherReturnValue, Inner_sane_p]), -%% io:format(user, "c100 P_newprop : ~w\n", [machi_projection:make_summary(P_newprop)]), -%% io:format(user, "c100 P_newpropY: ~w\n", [machi_projection:make_summary(P_newpropY)]), -%% io:format(user, "c100 P_latest : ~w\n", [machi_projection:make_summary(P_latest)]), -%% io:format(user, "c100 P_latestY: ~w\n", [machi_projection:make_summary(P_latestY)]), ?REACT({c100, ?LINE, [not_sane]}), react_to_env_C300(P_newprop, P_latest, S) end. react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) -> ?REACT(c110), - %% TOOD: Should we carry along any extra info that that would be useful - %% in the dbg2 list? Extra_todo = [], - RunEnv = S#ch_mgr.runenv, - Islands = proplists:get_value(network_islands, RunEnv), - P_latest2 = machi_projection:update_dbg2( - P_latest, - [%% {network_islands, Islands}, - %% {hooray, {v2, date(), time()}} - Islands--Islands - |Extra_todo]), + P_latest2 = machi_projection:update_dbg2(P_latest, Extra_todo), MyNamePid = proxy_pid(MyName, S), - %% TODO: We need to fetch the inner projection, if it exists, and - %% write it to the private store. Put the outer projection - %% into dbg2 for forensics and perhaps re-start use? ok = ?FLU_PC:write_projection(MyNamePid, private, P_latest2, ?TO), case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of true -> @@ -1185,10 +1089,6 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H} = S) -> H2 end, - HH = get(react), - %% io:format(user, "HEE120s ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse([X || X <- HH, is_atom(X)])]), - %% io:format(user, "HEE120 ~w ~w ~p\n", [S#ch_mgr.name, self(), lists:reverse(HH)]), - ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}), {{now_using, FinalProps, P_latest#projection_v1.epoch_number}, S#ch_mgr{proj=P_latest, proj_history=H3}}. @@ -1231,7 +1131,6 @@ react_to_env_C310(P_newprop, S) -> ?REACT(c310), Epoch = P_newprop#projection_v1.epoch_number, {WriteRes, S2} = cl_write_public_proj_skip_local_error(Epoch, P_newprop, S), - %% io:format(user, "QQQ ~w public write ~w: ~w\n", [S#ch_mgr.name, machi_projection:make_summary(P_newprop), WriteRes]), ?REACT({c310, ?LINE, [{newprop, machi_projection:make_summary(P_newprop)}, {write_result, WriteRes}]}), @@ -1337,36 +1236,26 @@ calculate_flaps(P_newprop, _P_current, _FlapLimit, AllHosed = [] end, - %% 2015-04-13: TODO: this whole notion of "settled" flap counts - %% has not worked as initially planned. Remove it all. - %% %% If there's at least one count in AllFlapCounts that isn't my - %% %% flap count, and if it's over the flap limit, then consider them - %% %% settled. - %% AllFlapCountsSettled = lists:keydelete(MyName, 1, AllFlapCounts) /= [] - %% andalso - %% my_find_minmost(AllFlapCounts) >= FlapLimit, FlappingI = {flapping_i, [{flap_count, {NewFlapStart, NewFlaps}}, {all_hosed, AllHosed}, {all_flap_counts, lists:sort(AllFlapCounts)}, - %% {all_flap_counts_settled, AllFlapCountsSettled}, - {bad,BadFLUs}, - {da_downu, DownUnion}, % debugging aid - {da_hosedtu, HosedTransUnion}, % debugging aid - {da_downreports, [{P#projection_v1.epoch_number, P#projection_v1.author_server, P#projection_v1.down} || P <- [BestP|NotBestPs]]} % debugging aid - ]}, + {bad,BadFLUs}]}, Dbg2 = [FlappingI|P_newprop#projection_v1.dbg], - %% SLF TODO: 2015-03-04: I'm growing increasingly suspicious of + %% TODO: 2015-03-04: I'm growing increasingly suspicious of %% the 'runenv' variable that's threaded through all this code. 
- %% It isn't doing what I'd originally intended. And I think that - %% the flapping information that we've just constructed here is - %% going to get lost, and that's a shame. Fix it. + %% It isn't doing what I'd originally intended. Fix it. RunEnv2 = replace(RunEnv1, [FlappingI]), - %% NOTE: If we'd increment of flaps here, that doesn't mean that - %% someone's public proj store has been updated. For example, + %% NOTE: Just because we increment flaps here, there's no correlation + %% to successful public proj store writes! For example, %% if we loop through states C2xx a few times, we would incr %% flaps each time ... but the C2xx path doesn't write a new - %% proposal to everyone's public proj stores, and there's no - %% guarantee that anyone else as written a new public proj either. + %% proposal to everyone's public proj stores. Similarly, + %% if we go through to C300, we will *try* to write to all public + %% stores, but the C3xx path doesn't care if all of those write + %% attempts *fail*. Our flap count is a rough heuristic only, and + %% a large local flaps count gives no concrete guarantee that any + %% communication has been successful with any other part of the + %% cluster. {machi_projection:update_checksum(P_newprop#projection_v1{dbg=Dbg2}), S#ch_mgr{flaps=NewFlaps, flap_start=NewFlapStart, runenv=RunEnv2}}. @@ -1435,7 +1324,7 @@ projection_transition_is_sane( true = is_binary(CSum1) andalso is_binary(CSum2), {_,_,_} = CreationTime1, {_,_,_} = CreationTime2, - true = is_atom(AuthorServer1) andalso is_atom(AuthorServer2), % todo will probably change + true = is_atom(AuthorServer1) andalso is_atom(AuthorServer2), % todo type may change? true = is_list(All_list1) andalso is_list(All_list2), true = is_list(Down_list1) andalso is_list(Down_list2), true = is_list(UPI_list1) andalso is_list(UPI_list2), @@ -1463,15 +1352,6 @@ projection_transition_is_sane( true = sets:is_disjoint(DownS2, RepairingS2), true = sets:is_disjoint(UPIS2, RepairingS2), - %% TODO relaxing this is ok, perhaps? - %% %% The author must not be down. - %% false = lists:member(AuthorServer1, Down_list1), - %% false = lists:member(AuthorServer2, Down_list2), - %% TODO relaxing this is ok, perhaps, also? - %% The author must be in either the UPI or repairing list. - %% true = lists:member(AuthorServer1, UPI_list1 ++ Repairing_list1), - %% true = lists:member(AuthorServer2, UPI_list2 ++ Repairing_list2), - %% Additions to the UPI chain may only be at the tail UPI_common_prefix = find_common_prefix(UPI_list1, UPI_list2), if UPI_common_prefix == [] -> @@ -1593,10 +1473,7 @@ projection_transition_is_sane( %% then adopts that projection (and unwedges %% itself, etc etc). - %% io:format(user, "QQQ: RetrospectiveP ~p\n", [RetrospectiveP]), - %% io:format(user, "QQQ: UPI_2_suffix ~p\n", [UPI_2_suffix]), - %% io:format(user, "QQQ: UPI_2_suffix_from_UPI1 ~p\n", [UPI_2_suffix_from_UPI1]), - %% io:format(user, "QQQ: UPI_2_suffix_from_Repairing1 ~p\n", [UPI_2_suffix_from_Repairing1]), + exit({todo, revisit, ?MODULE, ?LINE}), io:format(user, "|~p,~p TODO revisit|", [?MODULE, ?LINE]), ok; @@ -1606,9 +1483,11 @@ projection_transition_is_sane( %% normal has a UPI that has nothing to do with %% RelativeToServer a.k.a. me. 
%% from: - %% {epoch,847},{author,c},{upi,[c]},{repair,[]},{down,[a,b,d]}, + %% {epoch,847},{author,c},{upi,[c]},{repair,[]}, + %% {down,[a,b,d]} %% to: - %% {epoch,848},{author,a},{upi,[a]},{repair,[]},{down,[b,c,d]}, + %% {epoch,848},{author,a},{upi,[a]},{repair,[]}, + %% {down,[b,c,d]} if UPI_2_suffix == [AuthorServer2] -> true; not RetrospectiveP -> @@ -1623,12 +1502,6 @@ projection_transition_is_sane( S1 = machi_projection:make_summary(P1), S2 = machi_projection:make_summary(P2), Trace = erlang:get_stacktrace(), - %% %% TODO: this history goop is useful sometimes for debugging but - %% %% not for any "real" use. Get rid of it, for the long term. - %% H = (catch [{FLUName, Type, P#projection_v1.epoch_number, machi_projection:make_summary(P)} || - %% FLUName <- P1#projection_v1.all_members, - %% Type <- [public,private], - %% P <- ?FLU_PC:proj_get_all(orddict:fetch(FLUName, What?), Type)]), {err, _Type, _Err, from, S1, to, S2, relative_to, RelativeToServer, history, (catch lists:sort([no_history])), stack, Trace} @@ -1691,7 +1564,7 @@ merge_flap_counts([], D) -> merge_flap_counts([FlapCount|Rest], D1) -> %% We know that FlapCount is list({Actor, {{_epk,FlapStartTime},NumFlaps}}). D2 = orddict:from_list(FlapCount), - D2 = orddict:from_list(FlapCount), + D2 = orddict:from_list(FlapCount), %% If the FlapStartTimes are identical, then pick the bigger flap count. %% If the FlapStartTimes differ, then pick the larger start time tuple. D3 = orddict:merge(fun(_Key, {{_,T1}, NF1}= V1, {{_,T2}, NF2}=V2) diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index 176e721..a807c3c 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -267,7 +267,7 @@ convergence_demo_testfun(NumFLUs) -> io:format(user, "\nSweet, all_hosed are identical-or-islands-inconclusive.\n", []), timer:sleep(1000), ok - end || Partition <- AllPartitionCombinations + %% end || Partition <- AllPartitionCombinations %% end || Partition <- [ [{a,b},{b,d},{c,b}], %% [{a,b},{b,d},{c,b}, {a,b},{b,a},{a,c},{c,a},{a,d},{d,a}], %% %% [{a,b},{b,d},{c,b}, {b,a},{a,b},{b,c},{c,b},{b,d},{d,b}], @@ -278,7 +278,7 @@ convergence_demo_testfun(NumFLUs) -> %% end || Partition <- [ [{a,b}, {b,c}] ] %% hosed-not-equal @ 3 FLUs %% end || Partition <- [ [{b,d}] ] %% end || Partition <- [ [{a,b}, {b,a}] ] - %% end || Partition <- [ [{a,b}, {b,a}, {a,c},{c,a}] ] + end || Partition <- [ [{a,b}, {b,a}, {a,c},{c,a}] ] %% end || Partition <- [ [{a,b}], %% [{b,a}] ] %% end || Partition <- [ [{a,b}, {c,b}], From 90df6552568bb30165d999af0657b512d8ca26da Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 14 Apr 2015 16:32:47 +0900 Subject: [PATCH 4/6] WIP: Ha! 
There's a bug, this verbose logging change made it easier to see --- src/machi_chain_manager1.erl | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index b27c974..c6f8f3b 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1059,16 +1059,15 @@ react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) -> {HH,MM,SS} = time(), case inner_projection_exists(P_latest2) of false -> - ok; + io:format(user, "\n~2..0w:~2..0w:~2..0w.~3..0w ~p uses plain: ~w\n", + [HH,MM,SS,MSec, S#ch_mgr.name, + machi_projection:make_summary(P_latest2)]); true -> P_inner = inner_projection_or_self(P_latest2), - io:format(user, "\n~2..0w:~2..0w:~2..0w.~3..0w ~p uses INNER: ~w\n", + io:format(user, "\n~2..0w:~2..0w:~2..0w.~3..0w ~p uses inner: ~w\n", [HH,MM,SS,MSec, S#ch_mgr.name, machi_projection:make_summary(P_inner)]) - end, - io:format(user, "\n~2..0w:~2..0w:~2..0w.~3..0w ~p uses: ~w\n", - [HH,MM,SS,MSec, S#ch_mgr.name, - machi_projection:make_summary(P_latest2)]); + end; _ -> ok end, From 02bc7fe0bc56c5fe3c59296a3e4a95c944336a44 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 14 Apr 2015 18:19:08 +0900 Subject: [PATCH 5/6] WIP: Fix bug that flaps inside an inner projection, oops! --- TODO-shortterm.org | 1 + src/machi_chain_manager1.erl | 41 ++++++++++++++++++++- test/machi_chain_manager1_converge_demo.erl | 6 +-- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/TODO-shortterm.org b/TODO-shortterm.org index 3abd967..8dc7a75 100644 --- a/TODO-shortterm.org +++ b/TODO-shortterm.org @@ -43,3 +43,4 @@ func, and pattern match Erlang style in that func. *** TODO Make chain manager code flexible enough to run "real world" or "sim" ** TODO Replace registered name use from FLU write/append dispatcher ** TODO Move the FLU server to gen_server behavior? +** TODO Implement real data repair, orchestrated by the chain manager diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index c6f8f3b..3bcb462 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -666,6 +666,11 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, end end, + %% TODO: When we implement the real chain repair function, we + %% need to keep in mind that an inner projection with + %% up nodes > 1, repair is required there! In the + %% current simulator, repair is not simulated and + %% finished (and then growing the UPI list). Fix. P_inner2 = P_inner#projection_v1{epoch_number=FinalInnerEpoch}, InnerInfo = [{inner_summary, machi_projection:make_summary(P_inner2)}, {inner_projection, P_inner2}], @@ -772,7 +777,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, react_to_env_C100(P_inner2B, P_latest, S); true -> - ?REACT({a30, ?LINE}), + ?REACT({a30, ?LINE, []}), react_to_env_A40(Retries, P_newprop10, P_latest, LatestUnanimousP, S10) end. @@ -908,8 +913,36 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, ?REACT(b10), {_P_newprop_flap_time, P_newprop_flap_count} = get_flap_count(P_newprop), + UnanimousLatestInnerNotRelevant_p = + case inner_projection_exists(P_latest) of + true when P_latest#projection_v1.author_server /= MyName -> + #projection_v1{down=Down_inner} = inner_projection_or_self( + P_latest), + case lists:member(MyName, Down_inner) of + true -> + %% Some foreign author's inner projection thinks that + %% I'm down. Silly! We ought to ignore this one. 
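+                        %% Concrete illustration, with hypothetical values
+                        %% in the shape of the [{a,b},{b,a},{a,c},{c,a}]
+                        %% demo partition: author 'b' may flap into an
+                        %% inner projection with upi=[b,c,d], down=[a].
+                        %% But 'a' can still talk to 'd', so adopting that
+                        %% unanimous P_latest would sideline 'a' for no
+                        %% good reason; instead 'a' skips C100 and writes
+                        %% its own proposal via C300.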
+ ?REACT({b10, ?LINE, [{down_inner, Down_inner}]}), + true; + false -> + ?REACT({b10, ?LINE, [{down_inner, Down_inner}]}), + false + end; + _Else_u -> + false + end, if + LatestUnanimousP + andalso + UnanimousLatestInnerNotRelevant_p -> + ?REACT({b10, ?LINE, []}), + put(b10_hack, false), + + %% Do not go to C100, because we want to ignore this latest + %% proposal. Write ours instead via C300. + react_to_env_C300(P_newprop, P_latest, S); + LatestUnanimousP -> ?REACT({b10, ?LINE, [{latest_unanimous_p, LatestUnanimousP}, @@ -992,7 +1025,9 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, {latest_author, P_latest#projection_v1.author_server}]}), put(b10_hack, false), - %% Give the author of P_latest an opportunite to write a + %% TODO: Is a UnanimousLatestInnerNotRelevant_p test needed in this clause??? + + %% Give the author of P_latest an opportunity to write a %% new projection in a new epoch to resolve this mixed %% opinion. react_to_env_C200(Retries, P_latest, S); @@ -1087,6 +1122,8 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H} = S) -> _ -> H2 end, + %% HH = [if is_atom(X) -> X; is_tuple(X) -> {element(1,X), element(2,X)} end || X <- get(react), is_atom(X) orelse size(X) == 3], + %% io:format(user, "HEE120 ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse(HH)]), ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}), {{now_using, FinalProps, P_latest#projection_v1.epoch_number}, diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index a807c3c..de6db4b 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -278,9 +278,9 @@ convergence_demo_testfun(NumFLUs) -> %% end || Partition <- [ [{a,b}, {b,c}] ] %% hosed-not-equal @ 3 FLUs %% end || Partition <- [ [{b,d}] ] %% end || Partition <- [ [{a,b}, {b,a}] ] - end || Partition <- [ [{a,b}, {b,a}, {a,c},{c,a}] ] - %% end || Partition <- [ [{a,b}], - %% [{b,a}] ] + %% end || Partition <- [ [{a,b}, {b,a}, {a,c},{c,a}] ] + end || Partition <- [ [{a,b}], + [{b,a}] ] %% end || Partition <- [ [{a,b}, {c,b}], %% [{a,b}, {b,c}] ] %% end || Partition <- [ [{a,b}, {b,c}, {c,d}], From 55492c1cac74f3d8cd9f6dd9a2b894382d91621e Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 18 Apr 2015 01:42:47 +0900 Subject: [PATCH 6/6] Update on the status of prototype/chain-manager code: now moved to TOP/src on --- prototype/README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/prototype/README.md b/prototype/README.md index 02088be..a89473f 100644 --- a/prototype/README.md +++ b/prototype/README.md @@ -53,11 +53,12 @@ This is a very early experiment to try to create a distributed "rough consensus" algorithm that is sufficient & safe for managing the order of a Chain Replication chain, its members, and its chain order. -* Code status: **active**! - Unlike the other code projects in this repository's `prototype` directory, the chain management code is still under active -development. It is quite likely (as of early March 2015) that this -code will be robust enough to move to the "real" Machi code base soon. - +development. However, the chain manager code here in the `prototype` +subdirectory will remain frozen in time. +Efforts in April 2015 have moved the chain manager code to the "top level" +of the repository. All new work is being merged weekly into the `master` +branch, see `src/machi_chain_manager1.erl` and related source at the top of +the repo.