Tweak IsRelevantToMe_p in B10 (more)
Last night we hit a rare case of failed convergence: f was out of sync with the rest of the world.

    f: upi=[b,g,f] repairing=[a,c]

The "rest of the world" was using a larger chain:

    *: upi=[c,b,g,a], repairing=[f]

f refused to join the larger chain because of the way that IsRelevantToMe_p was being calculated before this commit.

Hrrrm, though, I'm not convinced that this particular problem is fixed 100% by this patch. What if the chain lengths were the same but the UPIs were also incompatible? E.g., if I remove 'a' from the "real world (in the partition simulator)" example above:

    f: upi=[b,g,f] repairing=[c]
    *: upi=[c,b,g], repairing=[f]

Hrmmmmm, I may need to reintroduce the my-recent-adopted-projection-flapping-like-counter thingie to try to break this kind of incompatible deadlock.
This commit is contained in:
parent
62186395ed
commit
fdf78bdbbc
3 changed files with 42 additions and 10 deletions
|
@ -384,8 +384,9 @@ handle_info({'DOWN',_Ref,process,Worker,Res},
|
|||
{noreply, S#ch_mgr{ignore_timer=false,
|
||||
repair_worker=undefined,
|
||||
repair_final_status=Res}};
|
||||
handle_info(Msg, S) ->
|
||||
case get(todo_bummer) of undefined -> io:format("TODO: got ~p\n", [Msg]);
|
||||
handle_info(Msg, #ch_mgr{name=MyName}=S) ->
|
||||
case get(todo_bummer) of undefined -> io:format("TODO: ~w got ~p\n",
|
||||
[MyName, Msg]);
|
||||
_ -> ok
|
||||
end,
|
||||
put(todo_bummer, true),
|
||||
|
@ -1371,6 +1372,14 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP,
|
|||
{current_epoch, P_current#projection_v1.epoch_number},
|
||||
{latest_unanimous_p, LatestUnanimousP}]}),
|
||||
|
||||
%% TODO 2015-09-14: Should rank be factored in here? If P_latest
|
||||
%% rank is definitely lower than current rank (or perhaps lower
|
||||
%% than P_newprop rank?), then don't accept it. Hrm, I'm not sure
|
||||
%% how that would ripple through the rest of this state machine &
|
||||
%% interactions with the other state machines, hrmmmm. For a real
|
||||
%% example, if current upi/rep = [c,b,g,a],[f], going to P_latest
|
||||
%% of [b,g,f],[a,c] doesn't make sense (even if we ignore the UPI
|
||||
%% sanity problem in this example).
|
||||
react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
|
||||
P_current_calc, AmHosedP,
|
||||
Rank_newprop, Rank_latest, S);
|
||||
|
@ -1536,14 +1545,14 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, P_current_calc,
|
|||
false;
|
||||
I_am_in_P_latest_repairing ->
|
||||
%% If I'm already in the current UPI, and the
|
||||
%% UPI is longer than 1 (i.e., more than just
|
||||
%% me), then it makes no sense to leave the UPI
|
||||
%% to go to someone else's suggestion of
|
||||
%% current UPI is longer than P_latest's UPI,
|
||||
%% then it makes no sense to leave the UPI to
|
||||
%% go to someone else's suggestion of
|
||||
%% repairing. If I'm the only member of
|
||||
%% P_current UPI, then sure, then having me
|
||||
%% join a repairing list is relevant.
|
||||
not (lists:member(MyName, P_current_upi) andalso
|
||||
length(P_current_upi) > 1);
|
||||
length(P_current_upi) > length(P_latest_upi));
|
||||
true ->
|
||||
true
|
||||
end,
|
||||
|
|
|
@ -667,7 +667,17 @@ do_pb_request_common(Sock, ReqID, Req, GetReply_p) ->
|
|||
{error, {badmatch, Noo, erlang:get_stacktrace()}};
|
||||
error:{badmatch,_}=BadMatch ->
|
||||
put(bad_sock, Sock),
|
||||
{error, {badmatch, BadMatch, erlang:get_stacktrace()}}
|
||||
{error, {badmatch, BadMatch, erlang:get_stacktrace()}};
|
||||
error:Whoa ->
|
||||
put(bad_sock, Sock),
|
||||
%% TODO: The machi_chain_manager1_converge_demo:t() test can
|
||||
%% create a large number of these errors when moving from
|
||||
%% no partitions to many partitions:
|
||||
%% Whoa undefined: function_clause
|
||||
%% In theory this is harmless, because the client will retry
|
||||
%% with a new socket. But, fix it anyway.
|
||||
io:format(user, "DBG Whoa ~w: ~w\n", [Sock, Whoa]),
|
||||
{error, {whoa, Whoa, erlang:get_stacktrace()}}
|
||||
end.
|
||||
|
||||
filter_sock_error_result({error, closed}) ->
|
||||
|
|
|
@ -269,7 +269,7 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) ->
|
|||
%% If stable, return true to short circuit remaining
|
||||
private_projections_are_stable(Namez, DoIt)
|
||||
end, false, lists:seq(0, MaxIters)),
|
||||
io:format(user, "\nSweet, private projections are stable\n", []),
|
||||
io:format(user, "\nSweet, private projections are stable at ~w\n", [time()]),
|
||||
io:format(user, "\t~P\n", [get(stable), 14]),
|
||||
io:format(user, "Rolling sanity check ... ", []),
|
||||
PrivProjs = [{Name, begin
|
||||
|
@ -411,7 +411,8 @@ make_partition_list(All_list) ->
|
|||
%% Concat = _X_Ys1 ++ _X_Ys2 ++ _X_Ys3,
|
||||
Concat = _X_Ys1 ++ _X_Ys2 ++ _X_Ys3 ++ _X_Ys4,
|
||||
NoPartitions = lists:duplicate(trunc(length(Concat) * 0.1), []),
|
||||
random_sort(lists:usort([lists:sort(L) || L <- Concat]) ++ NoPartitions).
|
||||
uniq_reverse(random_sort(lists:usort([lists:sort(L) || L <- Concat])
|
||||
++ NoPartitions)).
|
||||
|
||||
%% %% for len=5 and 2 witnesses
|
||||
%% [
|
||||
|
@ -720,9 +721,21 @@ get_latest_inner_proj_summ(FLU) ->
|
|||
EpochID = {E, CSum4},
|
||||
{EpochID, UPI, Repairing, Down, Witnesses, Inner_p}.
|
||||
|
||||
uniq_reverse(L) ->
|
||||
uniq_reverse(L, []).
|
||||
|
||||
uniq_reverse([], Acc) ->
|
||||
Acc;
|
||||
uniq_reverse([H|T], []) ->
|
||||
uniq_reverse(T, [H]);
|
||||
uniq_reverse([Same|T], [Same|_]=Acc) ->
|
||||
uniq_reverse(T, Acc);
|
||||
uniq_reverse([H|T], Acc) ->
|
||||
uniq_reverse(T, [H|Acc]).
|
||||
|
||||
random_sort(L) ->
|
||||
random:seed(now()),
|
||||
L1 = [{random:uniform(99999), X} || X <- L],
|
||||
L1 = [{random:uniform(), X} || X <- L],
|
||||
[X || {_, X} <- lists:sort(L1)].
|
||||
|
||||
foo(NumFLUs, MgrOpts0) ->
|
||||
|
|
Loading…
Reference in a new issue