Bugfix for rare race between just-finished repair and flapping ending
Fix for today: We are going to game the system. We know that C100 is going to be checking authorship relative to P_current's UPI's tail. Therefore, we're just going to set it here. Why??? Because we have been using this projection safely for the entire flapping period! ... The only other way I see is to allow C100 to carve out an exception if the repair finished PLUS author_server check fails PLUS if we came from here, but that feels a bit fragile to me: if some code factoring happens in projection_transition_is_saneprojection_transition_is_sane() or elsewhere that causes the author_server check to be something-other-than-the-final-thing-checked, then such a refactoring would likely cause an even harder bug to find & fix. Conditions tested: 5 FLUs plus alternating partitions of: [ [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{b,a},{d,e}], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [] ].
This commit is contained in:
parent
54b5014446
commit
96ca7b7082
1 changed files with 49 additions and 31 deletions
|
@ -966,7 +966,9 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
|
|||
?REACT({a30, ?LINE, DbgX}),
|
||||
{P_newprop3#projection_v1{dbg=DbgX,
|
||||
inner=P_inner2}, S_i};
|
||||
_ ->
|
||||
{_, P_newprop3_flap_count} ->
|
||||
?REACT({a30, ?LINE,[{newprop3_flap_count,P_newprop3_flap_count},
|
||||
{flap_limit, FlapLimit}]}),
|
||||
{P_newprop3, S3}
|
||||
end,
|
||||
|
||||
|
@ -1019,6 +1021,8 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
|
|||
|
||||
if MoveFromInnerToNorm_p orelse Kicker_p ->
|
||||
ClauseInfo = [{inner_kicker, Kicker_p},
|
||||
{inner_kicker2, {Latest_authors_flap_count_current,
|
||||
Latest_authors_flap_count_latest}},
|
||||
{move_from_inner, MoveFromInnerToNorm_p}],
|
||||
?REACT({a30, ?LINE, ClauseInfo}),
|
||||
%% %% 2015-04-14: YEAH, this appears to work!
|
||||
|
@ -1075,13 +1079,24 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
|
|||
%% transition safety check, and so all 5 spin in an infinite loop,
|
||||
%% cool!
|
||||
%%
|
||||
%% Fix for today: Send a signal (through a new func arg) to C100
|
||||
%% that we're moving from inner to outer. If the
|
||||
%% 'expected_author2' error is the only sanity check that fails at
|
||||
%% C100, then that's OK, because: 1. We've lost track of the
|
||||
%% author, so we can't satisfy the check 100% of the time. (We
|
||||
%% have the option of picking the
|
||||
|
||||
%% Fix for today: We are going to game the system. We know that
|
||||
%% C100 is going to be checking authorship relative to P_current's
|
||||
%% UPI's tail. Therefore, we're just going to set it here.
|
||||
%% Why??? Because we have been using this projection safely for
|
||||
%% the entire flapping period! ... The only other way I see is to
|
||||
%% allow C100 to carve out an exception if the repair finished
|
||||
%% PLUS author_server check fails PLUS if we came from here, but
|
||||
%% that feels a bit fragile to me: if some code factoring happens
|
||||
%% in projection_transition_is_saneprojection_transition_is_sane()
|
||||
%% or elsewhere that causes the author_server check to be
|
||||
%% something-other-than-the-final-thing-checked, then such a
|
||||
%% refactoring would likely cause an even harder bug to find &
|
||||
%% fix. Conditions tested: 5 FLUs plus alternating partitions of:
|
||||
%% [
|
||||
%% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
|
||||
%% [{b,a},{d,e}],
|
||||
%% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], []
|
||||
%% ].
|
||||
%%
|
||||
%% TODO: Perhaps that quickest author should consult all of the
|
||||
%% other private stores, check their inner, and if there is a
|
||||
|
@ -1092,18 +1107,26 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
|
|||
|
||||
P_inner2A = inner_projection_or_self(P_current),
|
||||
ResetEpoch = P_newprop10#projection_v1.epoch_number,
|
||||
ResetAuthor = P_current#projection_v1.author_server,
|
||||
ClauseInfo2 = [{old_author, P_inner2A#projection_v1.author_server},
|
||||
ResetAuthor = case P_current#projection_v1.upi of
|
||||
[] ->
|
||||
%% Drat, fall back to current's author.
|
||||
P_current#projection_v1.author_server;
|
||||
_ ->
|
||||
lists:last(P_current#projection_v1.upi)
|
||||
end,
|
||||
ClauseInfo2 = [{move_from_inner_to_outer, true},
|
||||
{old_author, P_inner2A#projection_v1.author_server},
|
||||
{reset_author, ResetAuthor},
|
||||
{reset_epoch, ResetEpoch}],
|
||||
P_inner2B =
|
||||
machi_projection:update_checksum(
|
||||
P_inner2A#projection_v1{epoch_number=ResetEpoch,
|
||||
author_server=ResetAuthor,
|
||||
dbg=ClauseInfo++ClauseInfo2},
|
||||
dbg=ClauseInfo++ClauseInfo2}),
|
||||
ReactI = [{inner2b,machi_projection:make_summary(P_inner2B)}],
|
||||
?REACT({a30, ?LINE, ReactI}),
|
||||
io:format(user, "HEE30 ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse(get(react))]), timer:sleep(100),
|
||||
react_to_env_C100(P_inner2B, P_latest, reset_loop_maybe, S);
|
||||
io:format(user, "HEE30 ~w ~w ~P\n", [S#ch_mgr.name, self(), get(react), 250]), timer:sleep(100),
|
||||
react_to_env_C100(P_inner2B, P_latest, S);
|
||||
true ->
|
||||
?REACT({a30, ?LINE, []}),
|
||||
react_to_env_A40(Retries, P_newprop10, P_latest,
|
||||
|
@ -1281,7 +1304,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
|
|||
]}),
|
||||
put(b10_hack, false),
|
||||
|
||||
react_to_env_C100(P_newprop, P_latest, undefined, S);
|
||||
react_to_env_C100(P_newprop, P_latest, S);
|
||||
|
||||
P_newprop_flap_count >= FlapLimit ->
|
||||
%% I am flapping ... what else do I do?
|
||||
|
@ -1374,7 +1397,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
|
|||
react_to_env_C300(P_newprop, P_latest, S)
|
||||
end.
|
||||
|
||||
react_to_env_C100(P_newprop, P_latest, PerhapsReset,
|
||||
react_to_env_C100(P_newprop, P_latest,
|
||||
#ch_mgr{name=MyName, proj=P_current}=S) ->
|
||||
?REACT(c100),
|
||||
|
||||
|
@ -1395,20 +1418,15 @@ react_to_env_C100(P_newprop, P_latest, PerhapsReset,
|
|||
erase(perhaps_reset_loop),
|
||||
react_to_env_C110(P_latest, S);
|
||||
_AnyOtherReturnValue ->
|
||||
io:format(user, "RESETLOOP: ~p ~w ~P", [MyName, get(perhaps_reset_loop), get(react), 70]),
|
||||
if PerhapsReset == reset_loop_maybe ->
|
||||
case get(perhaps_reset_loop) of
|
||||
undefined ->
|
||||
put(perhaps_reset_loop, 1);
|
||||
X when X > 10 ->
|
||||
X when X > 15 ->
|
||||
Msg = lists:flatten(
|
||||
io_lib:format("~P", [get(react), 200])),
|
||||
io_lib:format("~P", [get(react), 300])),
|
||||
exit({not_supposed_to_happen, ?MODULE, ?LINE, Msg});
|
||||
X ->
|
||||
put(perhaps_reset_loop, X+1)
|
||||
end;
|
||||
PerhapsReset == undefined ->
|
||||
ok
|
||||
end,
|
||||
%% P_latest is not sane.
|
||||
%% By process of elimination, P_newprop is best,
|
||||
|
|
Loading…
Reference in a new issue