Bugfix: undo the jump directly from A30 -> C100.
This commit is contained in:
parent
ed7dcd14db
commit
e9e4c54b25
1 changed file with 12 additions and 81 deletions
|
@ -1040,86 +1040,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
|
||||||
Latest_authors_flap_count_latest}},
|
Latest_authors_flap_count_latest}},
|
||||||
{move_from_inner, MoveFromInnerToNorm_p}],
|
{move_from_inner, MoveFromInnerToNorm_p}],
|
||||||
?REACT({a30, ?LINE, ClauseInfo}),
|
?REACT({a30, ?LINE, ClauseInfo}),
|
||||||
%% %% 2015-04-14: YEAH, this appears to work!
|
%% Move from inner projection to outer.
|
||||||
%% %% 1. Create a "safe" projection that is upi=[],repairing=[]
|
|
||||||
%% %% 2. Declare it to be best & latest by pure fiat.
|
|
||||||
%% %% (The C100 transition will double-check that it's safe.)
|
|
||||||
%% %% 3. Jump to C100. Then, for the next iteration,
|
|
||||||
%% %% our P_current state to a smallest-possible-score
|
|
||||||
%% %% state ... and let the chain reassemble itself from
|
|
||||||
%% %% length zero.
|
|
||||||
%% #projection_v1{epoch_number=Epoch_newprop10, all_members=All_list,
|
|
||||||
%% members_dict=MembersDict} = P_newprop10,
|
|
||||||
%% P_noneprop0 = make_none_projection(MyName, All_list, MembersDict),
|
|
||||||
%% P_noneprop1 = P_noneprop0#projection_v1{epoch_number=Epoch_newprop10},
|
|
||||||
%% %% Just to be clear, we clobber any flapping info by setting dbg.
|
|
||||||
%% P_noneprop = P_noneprop1#projection_v1{dbg=ClauseInfo},
|
|
||||||
%% react_to_env_C100(P_noneprop, P_latest, S);
|
|
||||||
|
|
||||||
%% 2015-04-14: Let's experiment with using the current inner
|
|
||||||
%% projection (or, if there really is no inner, just P_current).
|
|
||||||
%% This is safe because it's already P_current and by assumption,
|
|
||||||
%% anything that made it through the logical maze to get here
|
|
||||||
%% is safe. So re-using it with a higher epoch number doesn't
|
|
||||||
%% make any significant change.
|
|
||||||
%%
|
|
||||||
%% Yeah, it appears to work, also, nice! This can help save some
|
|
||||||
%% repair operations (compared to the other safe thing to do
|
|
||||||
%% here, which uses make_none_projection() to build & repair the
|
|
||||||
%% entire chain from scratch). Note that this isn't a guarantee
|
|
||||||
%% that repair steps will be minimized: for a 4-member cluster
|
|
||||||
%% that has an asymmetric partition which organizes 3 clusters of
|
|
||||||
%% inner-upi=[a], inner-upi=[b], and inner-upi=[c,d], there is no
|
|
||||||
%% guarantee (yet?) that the [c,d] chain will be the UPI basis
|
|
||||||
%% for repairs when the partition is healed: the quickest author
|
|
||||||
%% after the healing will make that choice for everyone.
|
|
||||||
%%
|
|
||||||
%% 2015-07-06: Ha! This works, almost all of the time. But there
|
|
||||||
%% is a bug.
|
|
||||||
%%
|
|
||||||
%% The bug: if a repair has finished near the time that we fall
|
|
||||||
%% out of flapping mode and back to normal (one of the reasons
|
|
||||||
%% that we are here), then it's possible to have a situation like
|
|
||||||
%% this:
|
|
||||||
%% outer: {epoch,4638},{author,c},{upi,[e,c]},{repair,[d,a,b]}
|
|
||||||
%% inner: {epoch,4539},{author,e},{upi,[e,c,d]},{repair,[]}
|
|
||||||
%%
|
|
||||||
%% Code prior to today would simply use the inner projection and
|
|
||||||
%% only keep the outer's epoch number. However, if we do that,
|
|
||||||
%% then C100 will fail a sanity check: author e cannot add d to
|
|
||||||
%% the end of the UPI, only C is allowed to do that.
|
|
||||||
%%
|
|
||||||
%% After checking all 5 participants, they all agree with the
|
|
||||||
%% outer and inner shown above. But all 5 fail their C100
|
|
||||||
%% transition safety check, and so all 5 spin in an infinite loop,
|
|
||||||
%% cool!
|
|
||||||
%%
|
|
||||||
%% Fix for today: We are going to game the system. We know that
|
|
||||||
%% C100 is going to be checking authorship relative to P_current's
|
|
||||||
%% UPI's tail. Therefore, we're just going to set it here.
|
|
||||||
%% Why??? Because we have been using this projection safely for
|
|
||||||
%% the entire flapping period! ... The only other way I see is to
|
|
||||||
%% allow C100 to carve out an exception if the repair finished
|
|
||||||
%% PLUS author_server check fails PLUS if we came from here, but
|
|
||||||
%% that feels a bit fragile to me: if some code factoring happens
|
|
||||||
%% in projection_transition_is_sane()
|
|
||||||
%% or elsewhere that causes the author_server check to be
|
|
||||||
%% something-other-than-the-final-thing-checked, then such a
|
|
||||||
%% refactoring would likely cause an even harder bug to find &
|
|
||||||
%% fix. Conditions tested: 5 FLUs plus alternating partitions of:
|
|
||||||
%% [
|
|
||||||
%% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
|
|
||||||
%% [{b,a},{d,e}],
|
|
||||||
%% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], []
|
|
||||||
%% ].
|
|
||||||
%%
|
|
||||||
%% TODO: Perhaps that quickest author should consult all of the
|
|
||||||
%% other private stores, check their inner, and if there is a
|
|
||||||
%% higher rank there, then goto C200 for a wait-and-see cycle?
|
|
||||||
%% TODO: 2015-07-04 The suggestion in TODO above appears very good.
|
|
||||||
%% Also, at least some of the time, MyName is included
|
|
||||||
%% in the down list, quite odd! Go investigate that.
|
|
||||||
|
|
||||||
P_inner2A = inner_projection_or_self(P_current),
|
P_inner2A = inner_projection_or_self(P_current),
|
||||||
ResetEpoch = P_newprop10#projection_v1.epoch_number,
|
ResetEpoch = P_newprop10#projection_v1.epoch_number,
|
||||||
ResetAuthor = case P_current#projection_v1.upi of
|
ResetAuthor = case P_current#projection_v1.upi of
|
||||||
|
@ -1140,7 +1061,17 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra,
|
||||||
dbg=ClauseInfo++ClauseInfo2}),
|
dbg=ClauseInfo++ClauseInfo2}),
|
||||||
ReactI = [{inner2b,machi_projection:make_summary(P_inner2B)}],
|
ReactI = [{inner2b,machi_projection:make_summary(P_inner2B)}],
|
||||||
?REACT({a30, ?LINE, ReactI}),
|
?REACT({a30, ?LINE, ReactI}),
|
||||||
react_to_env_C100(P_inner2B, P_latest, S);
|
%% In the past, we've tried:
|
||||||
|
%% react_to_env_C100(P_inner2B, P_latest, S);
|
||||||
|
%%
|
||||||
|
%% But we *know* that direct transition is racy/buggy: if
|
||||||
|
%% P_latest UPIs are not unanimous, then we run the risk of
|
||||||
|
%% non-disjoint UPIs; state B10 exists for a reason!
|
||||||
|
%%
|
||||||
|
%% So, we're going to use P_inner2B as our new proposal and run
|
||||||
|
%% it through the regular system, as we did prior to 2015-04-14.
|
||||||
|
react_to_env_A40(Retries, P_inner2B, P_latest,
|
||||||
|
LatestUnanimousP, S10);
|
||||||
true ->
|
true ->
|
||||||
?REACT({a30, ?LINE, []}),
|
?REACT({a30, ?LINE, []}),
|
||||||
react_to_env_A40(Retries, P_newprop10, P_latest,
|
react_to_env_A40(Retries, P_newprop10, P_latest,
|
||||||
|
|
Loading…
Reference in a new issue