Fix repairs when everyone is in a stable flapping state

Scott Lystig Fritchie 2015-08-22 21:27:01 +09:00
parent 2b2facaba2
commit 0414da783a


@@ -690,14 +690,17 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg,
     NewUPI_list =
         [X || X <- OldUPI_list, lists:member(X, Up) andalso
                   not lists:member(X, OldWitness_list)],
-    #projection_v1{upi=CurrentUPI_list} = CurrentProj,
-    LastInCurrentUPI = case CurrentUPI_list of
-                           [] -> does_not_exist_because_upi_is_empty;
-                           [_|_] -> lists:last(CurrentUPI_list)
+    %% If we are flapping (AllHosed /= [] is a good enough proxy), then we
+    %% do our repair checks based on the inner projection only.  There is
+    %% no value in doing repair checks against the churning outer
+    %% projection while flapping.
+    RepChk_Proj = if AllHosed == [] ->
+                          CurrentProj;
+                     true ->
+                          inner_projection_or_self(CurrentProj)
                   end,
-    LastInNewUPI = case NewUPI_list of
+    RepChk_LastInUPI = case RepChk_Proj#projection_v1.upi of
                            [] -> does_not_exist_because_upi_is_empty;
-                           [_|_] -> lists:last(NewUPI_list)
+                           [_|_] -> lists:last(RepChk_Proj#projection_v1.upi)
                        end,
     Repairing_list2 = [X || X <- OldRepairing_list,
                             lists:member(X, Up),
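Review note: `inner_projection_or_self/1` is the pivot of this hunk. A minimal sketch of its plausible shape, assuming (for illustration only) that the inner projection rides in the projection's `dbg` proplist under an `inner_projection` key:

    %% Sketch, not this module's actual implementation: return the stable
    %% inner projection when one exists, otherwise the projection itself.
    %% The dbg-proplist storage and the key name are assumptions.
    inner_projection_or_self(P) ->
        case proplists:get_value(inner_projection, P#projection_v1.dbg) of
            undefined -> P;
            P_inner   -> P_inner
        end.

During stable flapping the outer projection's epoch keeps climbing on every iteration while the inner projection stays fixed, so `RepChk_Proj` gives the repair checks a stable basis for comparison.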
@@ -709,7 +712,7 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg,
         {[], []} ->
             D_foo=[d_foo1],
             {NewUPI_list, [], RunEnv2};
-        {[], [H|T]} when RelativeToServer == LastInNewUPI ->
+        {[], [H|T]} when RelativeToServer == RepChk_LastInUPI ->
             %% The author is the tail of the UPI list. Let's see if
             %% *everyone* in the UPI+repairing lists is using our
             %% projection. This is to simulate a requirement that repair
@@ -719,9 +722,9 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg,
             %% TODO create a real API call for fetching this info?
             SameEpoch_p = check_latest_private_projections_same_epoch(
                             NewUPI_list ++ Repairing_list2,
-                            S#ch_mgr.proj, Partitions, S),
+                            RepChk_Proj, Partitions, S),
             if Simulator_p andalso SimRepair_p andalso
-               SameEpoch_p andalso RelativeToServer == LastInCurrentUPI ->
+               SameEpoch_p andalso RelativeToServer == RepChk_LastInUPI ->
                     D_foo=[{repair_airquote_done, {we_agree, (S#ch_mgr.proj)#projection_v1.epoch_number}}],
                     {NewUPI_list ++ [H], T, RunEnv2};
                not (Simulator_p andalso SimRepair_p)
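Review note: when the author is the UPI tail and every participant agrees on the epoch, the head of the repairing list is promoted onto the end of the UPI. A toy shell example with hypothetical server atoms a, b, c, d:

    1> {UPI, Repairing} = {[a,b], [c,d]}.
    {[a,b],[c,d]}
    2> [H|T] = Repairing.
    [c,d]
    3> {UPI ++ [H], T}.
    {[a,b,c],[d]}

This mirrors the `{NewUPI_list ++ [H], T, RunEnv2}` result above: c joins the UPI, and d keeps repairing.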
@@ -817,6 +820,7 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg,
     {P3, S#ch_mgr{runenv=RunEnv3}, Up}.
 
 check_latest_private_projections_same_epoch(FLUs, MyProj, Partitions, S) ->
+    #projection_v1{epoch_number=MyEpoch, epoch_csum=MyCSum} = MyProj,
     %% NOTE: The caller must provide us with the FLUs list for all
     %%       FLUs that must be up & available right now.  So any
     %%       failure of perhaps_call_t() means that we must return
@@ -828,17 +832,17 @@ check_latest_private_projections_same_epoch(FLUs, MyProj, Partitions, S) ->
                     ?FLU_PC:read_latest_projection(Pid, private, ?TO)
             end,
             case perhaps_call_t(S, Partitions, FLU, F) of
-                {ok, RemotePrivateProj} ->
-                    if MyProj#projection_v1.epoch_number ==
-                       RemotePrivateProj#projection_v1.epoch_number
-                       andalso
-                       MyProj#projection_v1.epoch_csum ==
-                       RemotePrivateProj#projection_v1.epoch_csum ->
+                {ok, RPJ} ->
+                    #projection_v1{epoch_number=RemoteEpoch,
+                                   epoch_csum=RemoteCSum} =
+                        inner_projection_or_self(RPJ),
+                    if MyEpoch == RemoteEpoch,
+                       MyCSum == RemoteCSum ->
                             true;
                        true ->
                             false
                     end;
-                _ ->
+                _Else ->
                     false
             end
     end,
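Review note: the change above compares inner-projection epochs on both sides of the agreement check. A condensed sketch of the predicate this hunk now implements, using the same record fields (helper as sketched earlier):

    %% Sketch: true iff both sides agree on the epoch of their *inner*
    %% projections (falling back to the outer one when no inner exists).
    same_inner_epoch_p(P_mine, P_remote) ->
        #projection_v1{epoch_number=E1, epoch_csum=C1} =
            inner_projection_or_self(P_mine),
        #projection_v1{epoch_number=E2, epoch_csum=C2} =
            inner_projection_or_self(P_remote),
        E1 == E2 andalso C1 == C2.

Without the `inner_projection_or_self/1` call, flapping peers would never report the same epoch, so repair could never be declared done.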
@@ -1344,7 +1348,19 @@ a30_make_inner_projection(P_current, P_newprop3, P_latest, Up,
                      ordsets:from_list(UPI_latest_i ++ Repairing_latest_i)),
             if SameEnough_p ->
                     ?REACT({a30, ?LINE, []}),
-                    P_latest_i;
+                    case P_current_has_inner_p andalso
+                         (UPI_current_x /= P_i3#projection_v1.upi orelse
+                          Repairing_current_x /= P_i3#projection_v1.repairing)
+                    of
+                        true ->
+                            %% Current proj is inner *and* our new
+                            %% proposed inner proj differs substantially
+                            %% from the current.  Don't use latest or
+                            %% current.
+                            false;
+                        false ->
+                            P_latest_i
+                    end;
                CurrentHasInner_and_LatestIsDisjoint_p ->
                     ?REACT({a30, ?LINE, []}),
                     P_current_ios;
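Review note: the new guard can be read as a standalone predicate. A sketch under the same variable meanings as the hunk above (the function name is invented for illustration):

    %% Sketch: reuse the latest inner projection only when our freshly
    %% computed inner projection (P_i3) has the same membership as the
    %% current inner projection.
    reuse_latest_inner_p(P_current_has_inner_p, UPI_current_x,
                         Repairing_current_x, P_i3) ->
        not (P_current_has_inner_p andalso
             (UPI_current_x /= P_i3#projection_v1.upi orelse
              Repairing_current_x /= P_i3#projection_v1.repairing)).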
@@ -1541,9 +1557,40 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP,
         true ->
             ?REACT({a40, ?LINE, [true]}),
+            GoTo50_p =
+                case inner_projection_exists(P_current) andalso
+                     inner_projection_exists(P_newprop) andalso
+                     inner_projection_exists(P_latest) of
+                    true ->
+                        %% All three projections are flapping ... do we have a
+                        %% new projection (probably due to repair) that is
+                        %% worth suggesting via C300?
+                        #projection_v1{epoch_number=Epoch_currenti} =
+                            inner_projection_or_self(P_current),
+                        #projection_v1{epoch_number=Epoch_newpropi} =
+                            inner_projection_or_self(P_newprop),
+                        ?REACT({a40, ?LINE, [{epoch_currenti,Epoch_currenti},
+                                             {epoch_newpropi,Epoch_newpropi}]}),
+                        if Epoch_currenti > Epoch_newpropi ->
+                                %% Inner has a newer epoch, don't go to A50.
+                                ?REACT({a40, ?LINE, []}),
+                                false;
+                           true ->
+                                ?REACT({a40, ?LINE, []}),
+                                true
+                        end;
+                    false ->
+                        ?REACT({a40, ?LINE, []}),
+                        true
+                end,
+            if GoTo50_p ->
+                    ?REACT({a40, ?LINE, []}),
                     FinalProps = [{throttle_seconds, 0}],
-            react_to_env_A50(P_latest, FinalProps, S)
+                    react_to_env_A50(P_latest, FinalProps, S);
+               true ->
+                    ?REACT({a40, ?LINE, []}),
+                    react_to_env_C300(P_newprop, P_latest, S)
+            end
     end.
 
 react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) ->
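Review note: the A40 addition boils down to one decision. A condensed sketch, using this module's helpers; the function name is invented for illustration:

    %% Sketch: fall through to A50 unless all three projections are
    %% flapping *and* the current inner epoch is ahead of our proposal,
    %% in which case C300 should suggest the newer inner projection.
    goto_a50_p(P_current, P_newprop, P_latest) ->
        AllInner_p = inner_projection_exists(P_current) andalso
                     inner_projection_exists(P_newprop) andalso
                     inner_projection_exists(P_latest),
        case AllInner_p of
            false -> true;
            true ->
                #projection_v1{epoch_number=E_cur} =
                    inner_projection_or_self(P_current),
                #projection_v1{epoch_number=E_new} =
                    inner_projection_or_self(P_newprop),
                E_cur =< E_new
        end.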
@@ -1781,7 +1828,7 @@ react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName,
     S2 = S#ch_mgr{not_sanes=NotSanesDict, sane_transitions=0},
     case orddict:fetch(Author_latest, NotSanesDict) of
         N when N > ?TOO_FREQUENT_BREAKER ->
-            ?V("\n\nYOYO ~w breaking the cycle of ~p\n", [MyName, machi_projection:make_summary(P_latest)]),
+            ?V("\n\nYOYO ~w breaking the cycle of:\n  current: ~w\n  new    : ~w\n", [MyName, machi_projection:make_summary(S#ch_mgr.proj), machi_projection:make_summary(P_latest)]),
             ?REACT({c100, ?LINE, [{not_sanes_author_count, N}]}),
             react_to_env_C103(P_latest, S2);
         N ->
@@ -1811,6 +1858,7 @@ react_to_env_C103(#projection_v1{epoch_number=Epoch_latest,
     ?REACT({c103, ?LINE,
             [{current_epoch, P_current#projection_v1.epoch_number},
              {none_projection_epoch, Epoch_latest}]}),
+    timer:sleep(5000),                          % Let someone else clean up
     %% Reset the not_sanes count dictionary here, or else an already
     %% ?TOO_FREQUENT_BREAKER count for an author might prevent a
     %% transition from C100_inner()->C300, which can lead to infinite
@@ -2154,7 +2202,7 @@ if LeaveFlapping_p -> io:format(user, "CALC_FLAP: ~w: flapping_now ~w start ~w l
     if AmFlapping_p ->
             S2;
        true ->
-            clear_flapping_state(S2)
+            clear_most_flapping_state(S2)
     end}.
 
 make_flapping_i() ->
@@ -2234,7 +2282,7 @@ projection_transition_is_sane(P1, P2, RelativeToServer, RetrospectiveP) ->
             ?RETURN2(Else)
     end.
 
-projection_transition_is_sane_final_review(_P1, P2,
+projection_transition_is_sane_final_review(P1, P2,
                                            {expected_author2,UPI1_tail}=Else) ->
     %% Reminder: P1 & P2 are outer projections
     %%
@@ -2252,12 +2300,25 @@ projection_transition_is_sane_final_review(_P1, P2,
     %%
     %% So, there's a special return value that tells us to try to check for
     %% the correct authorship here.
-    if UPI1_tail == P2#projection_v1.author_server ->
+    P1HasInner_p = inner_projection_exists(P1),
+    P2HasInner_p = inner_projection_exists(P2),
+    P1_LastInnerUPI = case (inner_projection_or_self(P1))#projection_v1.upi of
+                          P1InnerUPI=[_|_] when P1HasInner_p ->
+                              lists:last(P1InnerUPI);
+                          _ ->
+                              no_such_author
+                      end,
+    if P1HasInner_p, P2HasInner_p ->
+            if UPI1_tail == P1_LastInnerUPI ->
+                    ?RETURN2(true);
+               true ->
+                    ?RETURN2(Else)
+            end;
+       UPI1_tail == P2#projection_v1.author_server ->
            ?RETURN2(true);
        true ->
            ?RETURN2({gazzuknkgazzuknk, Else, gazzuknk})
     end;
 
 projection_transition_is_sane_final_review(
     #projection_v1{mode=CMode1}=_P1,
     #projection_v1{mode=CMode2}=_P2,
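Review note: the new first branch judges authorship against the tail of P1's inner UPI when both projections are flapping. Worked example with hypothetical values: if P1's inner UPI is [a,b,c], then P1_LastInnerUPI is c, so an {expected_author2, c} complaint is forgiven, while {expected_author2, b} still falls through to Else. A sketch of just that test, isolated for illustration (the real code also requires both projections to have inner projections):

    %% Sketch: the inner-UPI authorship test in isolation.
    inner_author_ok_p(UPI1_tail, P1) ->
        case (inner_projection_or_self(P1))#projection_v1.upi of
            InnerUPI=[_|_] -> UPI1_tail == lists:last(InnerUPI);
            []             -> false     % the no_such_author case above
        end.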
@@ -2819,11 +2880,14 @@ all_hosed_history(#projection_v1{epoch_number=_Epoch, flap=Flap},
     end.
 
 clear_flapping_state(S) ->
+    S2 = clear_most_flapping_state(S),
+    S2#ch_mgr{not_sanes=orddict:new()}.
+
+clear_most_flapping_state(S) ->
     S#ch_mgr{flap_count=0,
              flap_start=?NOT_FLAPPING_START,
-             flap_counts_last=[],
-             not_sanes=orddict:new()}.
+             %% Do not clear flap_last_up.
+             flap_counts_last=[]}.
 
 full_majority_size(N) when is_integer(N) ->
     (N div 2) + 1;
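Review note: the split between the two resets is deliberate. calc_flaps (the hunk at 2202 above) now calls clear_most_flapping_state/1, so the not_sanes author counts survive ordinary flap-state resets and are only emptied by clear_flapping_state/1 (e.g., via the C103 path). A usage sketch, with a hypothetical #ch_mgr value S0:

    %% Sketch: the observable difference between the two resets.
    demo(S0) ->
        S1 = clear_most_flapping_state(S0),   % not_sanes preserved
        S2 = clear_flapping_state(S0),        % not_sanes emptied
        {S1#ch_mgr.not_sanes =:= S0#ch_mgr.not_sanes,   % true
         S2#ch_mgr.not_sanes =:= orddict:new()}.        % true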