From 96ca7b70822f0aa06beb07320e017015b93faa2d Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 7 Jul 2015 01:29:37 +0900 Subject: [PATCH] Bugfix for rare race between just-finished repair and flapping ending Fix for today: We are going to game the system. We know that C100 is going to be checking authorship relative to P_current's UPI's tail. Therefore, we're just going to set it here. Why??? Because we have been using this projection safely for the entire flapping period! ... The only other way I see is to allow C100 to carve out an exception if the repair finished PLUS author_server check fails PLUS if we came from here, but that feels a bit fragile to me: if some code factoring happens in projection_transition_is_saneprojection_transition_is_sane() or elsewhere that causes the author_server check to be something-other-than-the-final-thing-checked, then such a refactoring would likely cause an even harder bug to find & fix. Conditions tested: 5 FLUs plus alternating partitions of: [ [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{b,a},{d,e}], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [] ]. --- src/machi_chain_manager1.erl | 80 ++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index b61d2d1..c7e47c8 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -966,7 +966,9 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, ?REACT({a30, ?LINE, DbgX}), {P_newprop3#projection_v1{dbg=DbgX, inner=P_inner2}, S_i}; - _ -> + {_, P_newprop3_flap_count} -> + ?REACT({a30, ?LINE,[{newprop3_flap_count,P_newprop3_flap_count}, + {flap_limit, FlapLimit}]}), {P_newprop3, S3} end, @@ -1019,6 +1021,8 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, if MoveFromInnerToNorm_p orelse Kicker_p -> ClauseInfo = [{inner_kicker, Kicker_p}, + {inner_kicker2, {Latest_authors_flap_count_current, + Latest_authors_flap_count_latest}}, {move_from_inner, MoveFromInnerToNorm_p}], ?REACT({a30, ?LINE, ClauseInfo}), %% %% 2015-04-14: YEAH, this appears to work! @@ -1075,13 +1079,24 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, %% transition safety check, and so all 5 spin in an infinite loop, %% cool! %% - %% Fix for today: Send a signal (through a new func arg) to C100 - %% that we're moving from inner to outer. If the - %% 'expected_author2' error is the only sanity check that fails at - %% C100, then that's OK, because: 1. We've lost track of the - %% author, so we can't satisfy the check 100% of the time. (We - %% have the option of picking the - + %% Fix for today: We are going to game the system. We know that + %% C100 is going to be checking authorship relative to P_current's + %% UPI's tail. Therefore, we're just going to set it here. + %% Why??? Because we have been using this projection safely for + %% the entire flapping period! ... The only other way I see is to + %% allow C100 to carve out an exception if the repair finished + %% PLUS author_server check fails PLUS if we came from here, but + %% that feels a bit fragile to me: if some code factoring happens + %% in projection_transition_is_saneprojection_transition_is_sane() + %% or elsewhere that causes the author_server check to be + %% something-other-than-the-final-thing-checked, then such a + %% refactoring would likely cause an even harder bug to find & + %% fix. Conditions tested: 5 FLUs plus alternating partitions of: + %% [ + %% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], + %% [{b,a},{d,e}], + %% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [] + %% ]. %% %% TODO: Perhaps that quickest author should consult all of the %% other private stores, check their inner, and if there is a @@ -1092,18 +1107,26 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, P_inner2A = inner_projection_or_self(P_current), ResetEpoch = P_newprop10#projection_v1.epoch_number, - ResetAuthor = P_current#projection_v1.author_server, - ClauseInfo2 = [{old_author, P_inner2A#projection_v1.author_server}, + ResetAuthor = case P_current#projection_v1.upi of + [] -> + %% Drat, fall back to current's author. + P_current#projection_v1.author_server; + _ -> + lists:last(P_current#projection_v1.upi) + end, + ClauseInfo2 = [{move_from_inner_to_outer, true}, + {old_author, P_inner2A#projection_v1.author_server}, {reset_author, ResetAuthor}, {reset_epoch, ResetEpoch}], P_inner2B = - P_inner2A#projection_v1{epoch_number=ResetEpoch, - author_server=ResetAuthor, - dbg=ClauseInfo++ClauseInfo2}, + machi_projection:update_checksum( + P_inner2A#projection_v1{epoch_number=ResetEpoch, + author_server=ResetAuthor, + dbg=ClauseInfo++ClauseInfo2}), ReactI = [{inner2b,machi_projection:make_summary(P_inner2B)}], ?REACT({a30, ?LINE, ReactI}), -io:format(user, "HEE30 ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse(get(react))]), timer:sleep(100), - react_to_env_C100(P_inner2B, P_latest, reset_loop_maybe, S); +io:format(user, "HEE30 ~w ~w ~P\n", [S#ch_mgr.name, self(), get(react), 250]), timer:sleep(100), + react_to_env_C100(P_inner2B, P_latest, S); true -> ?REACT({a30, ?LINE, []}), react_to_env_A40(Retries, P_newprop10, P_latest, @@ -1281,7 +1304,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, ]}), put(b10_hack, false), - react_to_env_C100(P_newprop, P_latest, undefined, S); + react_to_env_C100(P_newprop, P_latest, S); P_newprop_flap_count >= FlapLimit -> %% I am flapping ... what else do I do? @@ -1374,7 +1397,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, react_to_env_C300(P_newprop, P_latest, S) end. -react_to_env_C100(P_newprop, P_latest, PerhapsReset, +react_to_env_C100(P_newprop, P_latest, #ch_mgr{name=MyName, proj=P_current}=S) -> ?REACT(c100), @@ -1395,20 +1418,15 @@ react_to_env_C100(P_newprop, P_latest, PerhapsReset, erase(perhaps_reset_loop), react_to_env_C110(P_latest, S); _AnyOtherReturnValue -> -io:format(user, "RESETLOOP: ~p ~w ~P", [MyName, get(perhaps_reset_loop), get(react), 70]), - if PerhapsReset == reset_loop_maybe -> - case get(perhaps_reset_loop) of - undefined -> - put(perhaps_reset_loop, 1); - X when X > 10 -> - Msg = lists:flatten( - io_lib:format("~P", [get(react), 200])), - exit({not_supposed_to_happen, ?MODULE, ?LINE, Msg}); - X -> - put(perhaps_reset_loop, X+1) - end; - PerhapsReset == undefined -> - ok + case get(perhaps_reset_loop) of + undefined -> + put(perhaps_reset_loop, 1); + X when X > 15 -> + Msg = lists:flatten( + io_lib:format("~P", [get(react), 300])), + exit({not_supposed_to_happen, ?MODULE, ?LINE, Msg}); + X -> + put(perhaps_reset_loop, X+1) end, %% P_latest is not sane. %% By process of elimination, P_newprop is best,