Bugfix for rare race between just-finished repair and flapping ending

Fix for today: We are going to game the system.  We know that
    C100 is going to be checking authorship relative to P_current's
    UPI's tail.  Therefore, we're just going to set it here.
    Why???  Because we have been using this projection safely for
    the entire flapping period!  ... The only other way I see is to
    allow C100 to carve out an exception if the repair finished
    PLUS author_server check fails PLUS if we came from here, but
    that feels a bit fragile to me: if some code factoring happens
    in projection_transition_is_sane()
    or elsewhere that causes the author_server check to be
    something-other-than-the-final-thing-checked, then such a
    refactoring would likely cause an even harder bug to find &
    fix.  Conditions tested: 5 FLUs plus alternating partitions of:
    [
     [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
     [{b,a},{d,e}],
     [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], []
    ].
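
For reference, a minimal Erlang sketch of the "game the system" step described
above (the record and function names here are illustrative stand-ins, not the
committed code; the real #projection_v1{} record and
machi_projection:update_checksum/1 live in the machi source): point the inner
projection's author_server at the tail of P_current's UPI before calling C100,
falling back to P_current's author when the UPI is empty.

    -module(reset_author_sketch).
    -export([reset_inner_author/2]).

    %% Trimmed-down stand-in for machi's #projection_v1{} record.
    -record(projection_v1, {epoch_number, author_server, upi = [], dbg = []}).

    %% C100 checks authorship against the tail of P_current's UPI, so reset
    %% the inner projection's author to that server (the one we have trusted
    %% for the entire flapping period).
    reset_inner_author(P_inner, P_current) ->
        ResetAuthor = case P_current#projection_v1.upi of
                          [] ->
                              %% No UPI: fall back to current's author.
                              P_current#projection_v1.author_server;
                          UPI ->
                              lists:last(UPI)
                      end,
        P_inner#projection_v1{author_server=ResetAuthor}.
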
Scott Lystig Fritchie 2015-07-07 01:29:37 +09:00
parent 54b5014446
commit 96ca7b7082

@@ -966,7 +966,9 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
?REACT({a30, ?LINE, DbgX}),
{P_newprop3#projection_v1{dbg=DbgX,
inner=P_inner2}, S_i};
_ ->
{_, P_newprop3_flap_count} ->
?REACT({a30, ?LINE,[{newprop3_flap_count,P_newprop3_flap_count},
{flap_limit, FlapLimit}]}),
{P_newprop3, S3}
end,
@@ -1019,6 +1021,8 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
if MoveFromInnerToNorm_p orelse Kicker_p ->
ClauseInfo = [{inner_kicker, Kicker_p},
{inner_kicker2, {Latest_authors_flap_count_current,
Latest_authors_flap_count_latest}},
{move_from_inner, MoveFromInnerToNorm_p}],
?REACT({a30, ?LINE, ClauseInfo}),
%% %% 2015-04-14: YEAH, this appears to work!
@@ -1075,13 +1079,24 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
%% transition safety check, and so all 5 spin in an infinite loop,
%% cool!
%%
%% Fix for today: Send a signal (through a new func arg) to C100
%% that we're moving from inner to outer. If the
%% 'expected_author2' error is the only sanity check that fails at
%% C100, then that's OK, because: 1. We've lost track of the
%% author, so we can't satisfy the check 100% of the time. (We
%% have the option of picking the
%% Fix for today: We are going to game the system. We know that
%% C100 is going to be checking authorship relative to P_current's
%% UPI's tail. Therefore, we're just going to set it here.
%% Why??? Because we have been using this projection safely for
%% the entire flapping period! ... The only other way I see is to
%% allow C100 to carve out an exception if the repair finished
%% PLUS author_server check fails PLUS if we came from here, but
%% that feels a bit fragile to me: if some code factoring happens
%% in projection_transition_is_sane()
%% or elsewhere that causes the author_server check to be
%% something-other-than-the-final-thing-checked, then such a
%% refactoring would likely cause an even harder bug to find &
%% fix. Conditions tested: 5 FLUs plus alternating partitions of:
%% [
%% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
%% [{b,a},{d,e}],
%% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], []
%% ].
%%
%% TODO: Perhaps that quickest author should consult all of the
%% other private stores, check their inner, and if there is a
@@ -1092,18 +1107,26 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
P_inner2A = inner_projection_or_self(P_current),
ResetEpoch = P_newprop10#projection_v1.epoch_number,
ResetAuthor = P_current#projection_v1.author_server,
ClauseInfo2 = [{old_author, P_inner2A#projection_v1.author_server},
ResetAuthor = case P_current#projection_v1.upi of
[] ->
%% Drat, fall back to current's author.
P_current#projection_v1.author_server;
_ ->
lists:last(P_current#projection_v1.upi)
end,
ClauseInfo2 = [{move_from_inner_to_outer, true},
{old_author, P_inner2A#projection_v1.author_server},
{reset_author, ResetAuthor},
{reset_epoch, ResetEpoch}],
P_inner2B =
machi_projection:update_checksum(
P_inner2A#projection_v1{epoch_number=ResetEpoch,
author_server=ResetAuthor,
dbg=ClauseInfo++ClauseInfo2},
dbg=ClauseInfo++ClauseInfo2}),
ReactI = [{inner2b,machi_projection:make_summary(P_inner2B)}],
?REACT({a30, ?LINE, ReactI}),
io:format(user, "HEE30 ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse(get(react))]), timer:sleep(100),
react_to_env_C100(P_inner2B, P_latest, reset_loop_maybe, S);
io:format(user, "HEE30 ~w ~w ~P\n", [S#ch_mgr.name, self(), get(react), 250]), timer:sleep(100),
react_to_env_C100(P_inner2B, P_latest, S);
true ->
?REACT({a30, ?LINE, []}),
react_to_env_A40(Retries, P_newprop10, P_latest,
@@ -1281,7 +1304,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
]}),
put(b10_hack, false),
react_to_env_C100(P_newprop, P_latest, undefined, S);
react_to_env_C100(P_newprop, P_latest, S);
P_newprop_flap_count >= FlapLimit ->
%% I am flapping ... what else do I do?
@@ -1374,7 +1397,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
react_to_env_C300(P_newprop, P_latest, S)
end.
react_to_env_C100(P_newprop, P_latest, PerhapsReset,
react_to_env_C100(P_newprop, P_latest,
#ch_mgr{name=MyName, proj=P_current}=S) ->
?REACT(c100),
@@ -1395,20 +1418,15 @@ react_to_env_C100(P_newprop, P_latest, PerhapsReset,
erase(perhaps_reset_loop),
react_to_env_C110(P_latest, S);
_AnyOtherReturnValue ->
io:format(user, "RESETLOOP: ~p ~w ~P", [MyName, get(perhaps_reset_loop), get(react), 70]),
if PerhapsReset == reset_loop_maybe ->
case get(perhaps_reset_loop) of
undefined ->
put(perhaps_reset_loop, 1);
X when X > 10 ->
X when X > 15 ->
Msg = lists:flatten(
io_lib:format("~P", [get(react), 200])),
io_lib:format("~P", [get(react), 300])),
exit({not_supposed_to_happen, ?MODULE, ?LINE, Msg});
X ->
put(perhaps_reset_loop, X+1)
end;
PerhapsReset == undefined ->
ok
end,
%% P_latest is not sane.
%% By process of elimination, P_newprop is best,