WIP: bugfix for rare flapping infinite loop (unfinished)
This commit is contained in:
parent
0f667c4356
commit
6f9a603e99
1 changed files with 111 additions and 57 deletions
|
@ -1246,7 +1246,6 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
|
||||||
andalso
|
andalso
|
||||||
UnanimousLatestInnerNotRelevant_p ->
|
UnanimousLatestInnerNotRelevant_p ->
|
||||||
?REACT({b10, ?LINE, []}),
|
?REACT({b10, ?LINE, []}),
|
||||||
put(b10_hack, false),
|
|
||||||
|
|
||||||
%% Do not go to C100, because we want to ignore this latest
|
%% Do not go to C100, because we want to ignore this latest
|
||||||
%% proposal. Write ours instead via C300.
|
%% proposal. Write ours instead via C300.
|
||||||
|
@ -1260,7 +1259,6 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
|
||||||
{newprop_epoch,P_newprop#projection_v1.epoch_number},
|
{newprop_epoch,P_newprop#projection_v1.epoch_number},
|
||||||
{newprop_author,P_newprop#projection_v1.author_server}
|
{newprop_author,P_newprop#projection_v1.author_server}
|
||||||
]}),
|
]}),
|
||||||
put(b10_hack, false),
|
|
||||||
|
|
||||||
react_to_env_C100(P_newprop, P_latest, S);
|
react_to_env_C100(P_newprop, P_latest, S);
|
||||||
|
|
||||||
|
@ -1269,17 +1267,14 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
|
||||||
?REACT({b10, ?LINE, [i_am_flapping,
|
?REACT({b10, ?LINE, [i_am_flapping,
|
||||||
{newprop_flap_count, P_newprop_flap_count},
|
{newprop_flap_count, P_newprop_flap_count},
|
||||||
{flap_limit, FlapLimit}]}),
|
{flap_limit, FlapLimit}]}),
|
||||||
_B10Hack = get(b10_hack),
|
|
||||||
case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of
|
case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of
|
||||||
true ->
|
true ->
|
||||||
io:format(user, "{FLAP: ~w flaps ~w}! ", [S#ch_mgr.name, P_newprop_flap_count]);
|
io:format(user, "{FLAP: ~w flaps ~w}! ", [S#ch_mgr.name, P_newprop_flap_count]);
|
||||||
_ ->
|
_ ->
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
|
|
||||||
if
|
|
||||||
%% MEANWHILE, we have learned some things about this
|
%% MEANWHILE, we have learned some things about this
|
||||||
%% algorithm in the past few months. With the introduction
|
%% algorithm in the past many months. With the introduction
|
||||||
%% of the "inner projection" concept, we know that the inner
|
%% of the "inner projection" concept, we know that the inner
|
||||||
%% projection may be stable but the "outer" projection will
|
%% projection may be stable but the "outer" projection will
|
||||||
%% continue to be flappy for as long as there's an
|
%% continue to be flappy for as long as there's an
|
||||||
|
@ -1293,7 +1288,6 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
|
||||||
%% doesn't give an accurate sense of global flapping state.
|
%% doesn't give an accurate sense of global flapping state.
|
||||||
%% FlapLimit is enough to be able to tell us to slow down.
|
%% FlapLimit is enough to be able to tell us to slow down.
|
||||||
|
|
||||||
true ->
|
|
||||||
%% We already know that I'm flapping. We need to
|
%% We already know that I'm flapping. We need to
|
||||||
%% signal to the rest of the world that I'm writing
|
%% signal to the rest of the world that I'm writing
|
||||||
%% and flapping and churning, so we cannot always
|
%% and flapping and churning, so we cannot always
|
||||||
|
@ -1317,14 +1311,28 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
|
||||||
%% the flap cycle continues enough times so that
|
%% the flap cycle continues enough times so that
|
||||||
%% everyone notices then eventually falls into
|
%% everyone notices then eventually falls into
|
||||||
%% consensus.
|
%% consensus.
|
||||||
?REACT({b10, ?LINE, [flap_continue]}),
|
#projection_v1{all_members=All_list,
|
||||||
react_to_env_C300(P_newprop, P_latest, S2)
|
down=Down_list,
|
||||||
end
|
flap=OldFlap} = P_newprop,
|
||||||
|
#flap_i{all_flap_counts=AllFlapCounts} = OldFlap,
|
||||||
|
PossibleFlappers = All_list -- Down_list,
|
||||||
|
SeenFlappers =
|
||||||
|
[FLU || {FLU, {{{epk,_},_}, Cnt}} <- AllFlapCounts,
|
||||||
|
Cnt >= FlapLimit],
|
||||||
|
FlappingAll = (PossibleFlappers -- SeenFlappers) == [],
|
||||||
|
%% io:format(user, ", ~w of ~w is ~W, ", [lists:sort(SeenFlappers), lists:sort(PossibleFlappers), FlappingAll, 5]),
|
||||||
|
NewFlap = OldFlap#flap_i{flapping_me=true,
|
||||||
|
flapping_all=FlappingAll},
|
||||||
|
P_newprop2 = machi_projection:update_checksum(
|
||||||
|
P_newprop#projection_v1{flap=NewFlap}),
|
||||||
|
?REACT({b10, ?LINE, [flap_continue,
|
||||||
|
{flapping_me, true},
|
||||||
|
{flapping_all, FlappingAll}]}),
|
||||||
|
react_to_env_C300(P_newprop2, P_latest, S2)
|
||||||
end;
|
end;
|
||||||
|
|
||||||
Retries > 2 ->
|
Retries > 2 ->
|
||||||
?REACT({b10, ?LINE, [{retries, Retries}]}),
|
?REACT({b10, ?LINE, [{retries, Retries}]}),
|
||||||
put(b10_hack, false),
|
|
||||||
|
|
||||||
%% The author of P_latest is too slow or crashed.
|
%% The author of P_latest is too slow or crashed.
|
||||||
%% Let's try to write P_newprop and see what happens!
|
%% Let's try to write P_newprop and see what happens!
|
||||||
|
@ -1337,7 +1345,6 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
|
||||||
[{rank_latest, Rank_latest},
|
[{rank_latest, Rank_latest},
|
||||||
{rank_newprop, Rank_newprop},
|
{rank_newprop, Rank_newprop},
|
||||||
{latest_author, P_latest#projection_v1.author_server}]}),
|
{latest_author, P_latest#projection_v1.author_server}]}),
|
||||||
put(b10_hack, false),
|
|
||||||
|
|
||||||
%% TODO: Is a UnanimousLatestInnerNotRelevant_p test needed in this clause???
|
%% TODO: Is a UnanimousLatestInnerNotRelevant_p test needed in this clause???
|
||||||
|
|
||||||
|
@ -1349,7 +1356,6 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
|
||||||
true ->
|
true ->
|
||||||
?REACT({b10, ?LINE}),
|
?REACT({b10, ?LINE}),
|
||||||
?REACT({b10, ?LINE, [{retries,Retries},{rank_latest, Rank_latest}, {rank_newprop, Rank_newprop}, {latest_author, P_latest#projection_v1.author_server}]}), % TODO debug delete me!
|
?REACT({b10, ?LINE, [{retries,Retries},{rank_latest, Rank_latest}, {rank_newprop, Rank_newprop}, {latest_author, P_latest#projection_v1.author_server}]}), % TODO debug delete me!
|
||||||
put(b10_hack, false),
|
|
||||||
|
|
||||||
%% P_newprop is best, so let's write it.
|
%% P_newprop is best, so let's write it.
|
||||||
react_to_env_C300(P_newprop, P_latest, S)
|
react_to_env_C300(P_newprop, P_latest, S)
|
||||||
|
@ -1359,6 +1365,10 @@ react_to_env_C100(P_newprop, P_latest,
|
||||||
#ch_mgr{name=MyName, proj=P_current}=S) ->
|
#ch_mgr{name=MyName, proj=P_current}=S) ->
|
||||||
?REACT(c100),
|
?REACT(c100),
|
||||||
|
|
||||||
|
%% Note: The value of `Sane' may be `true', `false', or `term() /= true'.
|
||||||
|
%% The error value `false' is reserved for chain order violations.
|
||||||
|
%% Any other non-true value can be used for projection structure
|
||||||
|
%% construction errors, checksum error, etc.
|
||||||
Sane = projection_transition_is_sane(P_current, P_latest, MyName),
|
Sane = projection_transition_is_sane(P_current, P_latest, MyName),
|
||||||
case Sane of
|
case Sane of
|
||||||
_ when P_current#projection_v1.epoch_number == 0 ->
|
_ when P_current#projection_v1.epoch_number == 0 ->
|
||||||
|
@ -1682,6 +1692,11 @@ projection_transition_is_sane_retrospective(P1, P2, RelativeToServer) ->
|
||||||
projection_transition_is_sane(P1, P2, RelativeToServer) ->
|
projection_transition_is_sane(P1, P2, RelativeToServer) ->
|
||||||
projection_transition_is_sane(P1, P2, RelativeToServer, false).
|
projection_transition_is_sane(P1, P2, RelativeToServer, false).
|
||||||
|
|
||||||
|
%% @doc Check if a projection transition is sane & safe.
|
||||||
|
%%
|
||||||
|
%% NOTE: The return value convention is `true' for sane/safe and
|
||||||
|
%% `term() /= true' for any unsafe/insane value.
|
||||||
|
|
||||||
projection_transition_is_sane(P1, P2, RelativeToServer, RetrospectiveP) ->
|
projection_transition_is_sane(P1, P2, RelativeToServer, RetrospectiveP) ->
|
||||||
put(why2, []),
|
put(why2, []),
|
||||||
case projection_transition_is_sane_with_si_epoch(
|
case projection_transition_is_sane_with_si_epoch(
|
||||||
|
@ -1741,6 +1756,12 @@ projection_transition_is_sane_final_review(_P1, P2,
|
||||||
projection_transition_is_sane_final_review(_P1, _P2, Else) ->
|
projection_transition_is_sane_final_review(_P1, _P2, Else) ->
|
||||||
?RETURN2(Else).
|
?RETURN2(Else).
|
||||||
|
|
||||||
|
%% @doc Check if a projection transition is sane & safe with a
|
||||||
|
%% strictly increasing epoch number.
|
||||||
|
%%
|
||||||
|
%% NOTE: The return value convention is `true' for sane/safe and
|
||||||
|
%% `term() /= true' for any unsafe/insane value.
|
||||||
|
|
||||||
projection_transition_is_sane_with_si_epoch(
|
projection_transition_is_sane_with_si_epoch(
|
||||||
#projection_v1{epoch_number=Epoch1} = P1,
|
#projection_v1{epoch_number=Epoch1} = P1,
|
||||||
#projection_v1{epoch_number=Epoch2} = P2,
|
#projection_v1{epoch_number=Epoch2} = P2,
|
||||||
|
@ -1749,11 +1770,22 @@ projection_transition_is_sane_with_si_epoch(
|
||||||
P1, P2, RelativeToServer, RetrospectiveP) of
|
P1, P2, RelativeToServer, RetrospectiveP) of
|
||||||
true ->
|
true ->
|
||||||
%% Must be a strictly increasing epoch.
|
%% Must be a strictly increasing epoch.
|
||||||
?RETURN2(Epoch2 > Epoch1);
|
case Epoch2 > Epoch1 of
|
||||||
|
true ->
|
||||||
|
?RETURN2(true);
|
||||||
|
false ->
|
||||||
|
?RETURN2({epoch_not_si, Epoch2, 'not_gt', Epoch1})
|
||||||
|
end;
|
||||||
Else ->
|
Else ->
|
||||||
?RETURN2(Else)
|
?RETURN2(Else)
|
||||||
end.
|
end.
|
||||||
|
|
||||||
|
%% @doc Check if a projection transition is sane & safe with the
|
||||||
|
%% exception of a strictly increasing epoch number (equality is ok).
|
||||||
|
%%
|
||||||
|
%% NOTE: The return value convention is `true' for sane/safe and
|
||||||
|
%% `term() /= true' for any unsafe/insane value.
|
||||||
|
|
||||||
projection_transition_is_sane_except_si_epoch(
|
projection_transition_is_sane_except_si_epoch(
|
||||||
#projection_v1{epoch_number=Epoch1,
|
#projection_v1{epoch_number=Epoch1,
|
||||||
epoch_csum=CSum1,
|
epoch_csum=CSum1,
|
||||||
|
@ -1816,6 +1848,9 @@ projection_transition_is_sane_except_si_epoch(
|
||||||
true = sets:is_disjoint(DownS2, RepairingS2),
|
true = sets:is_disjoint(DownS2, RepairingS2),
|
||||||
true = sets:is_disjoint(UPIS2, RepairingS2),
|
true = sets:is_disjoint(UPIS2, RepairingS2),
|
||||||
|
|
||||||
|
%% We won't check the checksum of P1, but we will of P2.
|
||||||
|
P2 = machi_projection:update_checksum(P2),
|
||||||
|
|
||||||
%% Hooray, all basic properties of the projection's elements are
|
%% Hooray, all basic properties of the projection's elements are
|
||||||
%% not obviously bad. Now let's check if the UPI+Repairing->UPI
|
%% not obviously bad. Now let's check if the UPI+Repairing->UPI
|
||||||
%% transition is good.
|
%% transition is good.
|
||||||
|
@ -1828,6 +1863,8 @@ projection_transition_is_sane_except_si_epoch(
|
||||||
S1 = machi_projection:make_summary(P1),
|
S1 = machi_projection:make_summary(P1),
|
||||||
S2 = machi_projection:make_summary(P2),
|
S2 = machi_projection:make_summary(P2),
|
||||||
Trace = erlang:get_stacktrace(),
|
Trace = erlang:get_stacktrace(),
|
||||||
|
%% There are basic data structure checks only, do not return `false'
|
||||||
|
%% here.
|
||||||
{err, _Type, _Err, from, S1, to, S2, relative_to, RelativeToServer,
|
{err, _Type, _Err, from, S1, to, S2, relative_to, RelativeToServer,
|
||||||
history, (catch lists:sort([no_history])),
|
history, (catch lists:sort([no_history])),
|
||||||
stack, Trace}
|
stack, Trace}
|
||||||
|
@ -2069,17 +2106,26 @@ simple_chain_state_transition_is_sane(UPI1, Repair1, UPI2) ->
|
||||||
?RETURN2(simple_chain_state_transition_is_sane(undefined, UPI1, Repair1,
|
?RETURN2(simple_chain_state_transition_is_sane(undefined, UPI1, Repair1,
|
||||||
undefined, UPI2)).
|
undefined, UPI2)).
|
||||||
|
|
||||||
|
%% @doc Simple check if a projection transition is sane & safe: we assume
|
||||||
|
%% that the caller has checked basic projection data structure contents.
|
||||||
|
%%
|
||||||
|
%% NOTE: The return value convention is `true' for sane/safe and
|
||||||
|
%% `term() /= true' for any unsafe/insane value.
|
||||||
|
|
||||||
simple_chain_state_transition_is_sane(_Author1, UPI1, Repair1, Author2, UPI2) ->
|
simple_chain_state_transition_is_sane(_Author1, UPI1, Repair1, Author2, UPI2) ->
|
||||||
{KeepsDels, Orders} = mk(UPI1, Repair1, UPI2),
|
{KeepsDels, Orders} = mk(UPI1, Repair1, UPI2),
|
||||||
NumKeeps = length([x || keep <- KeepsDels]),
|
NumKeeps = length([x || keep <- KeepsDels]),
|
||||||
NumOrders = length(Orders),
|
NumOrders = length(Orders),
|
||||||
Answer1 = false == lists:member(error, Orders)
|
NoErrorInOrders = (false == lists:member(error, Orders)),
|
||||||
andalso Orders == lists:sort(Orders)
|
OrdersOK = (Orders == lists:sort(Orders)),
|
||||||
andalso length(UPI2) == NumKeeps + NumOrders,
|
UPI2LengthOK = (length(UPI2) == NumKeeps + NumOrders),
|
||||||
catch (put(react, [{sane,author1,_Author1, upi1,UPI1, repair1,Repair1,
|
Answer1 = NoErrorInOrders andalso OrdersOK andalso UPI2LengthOK,
|
||||||
|
catch ?REACT({simple, ?LINE,
|
||||||
|
[{sane, answer1,Answer1,
|
||||||
|
author1,_Author1, upi1,UPI1, repair1,Repair1,
|
||||||
author2,Author2, upi2,UPI2,
|
author2,Author2, upi2,UPI2,
|
||||||
keepsdels,KeepsDels, orders,Orders, numKeeps,NumKeeps,
|
keepsdels,KeepsDels, orders,Orders, numKeeps,NumKeeps,
|
||||||
numOrders,NumOrders, answer1,Answer1}|get(react)])),
|
numOrders,NumOrders, answer1,Answer1}]}),
|
||||||
if not Answer1 ->
|
if not Answer1 ->
|
||||||
?RETURN2(Answer1);
|
?RETURN2(Answer1);
|
||||||
true ->
|
true ->
|
||||||
|
@ -2104,6 +2150,14 @@ simple_chain_state_transition_is_sane(_Author1, UPI1, Repair1, Author2, UPI2) ->
|
||||||
end
|
end
|
||||||
end.
|
end.
|
||||||
|
|
||||||
|
%% @doc Check if a projection transition is sane & safe: we assume
|
||||||
|
%% that the caller has checked basic projection data structure contents.
|
||||||
|
%%
|
||||||
|
%% NOTE: The return value convention is `true' for sane/safe and `term() /=
|
||||||
|
%% true' for any unsafe/insane value. This function (and its callee
|
||||||
|
%% functions) are the only functions (throughout all of the chain state
|
||||||
|
%% transition sanity checking functions) that is allowed to return `false'.
|
||||||
|
|
||||||
chain_state_transition_is_sane(Author1, UPI1, Repair1, Author2, UPI2) ->
|
chain_state_transition_is_sane(Author1, UPI1, Repair1, Author2, UPI2) ->
|
||||||
ToSelfOnly_p = if UPI2 == [Author2] -> true;
|
ToSelfOnly_p = if UPI2 == [Author2] -> true;
|
||||||
true -> false
|
true -> false
|
||||||
|
|
Loading…
Reference in a new issue