Tweak IsRelevantToMe_p in B10 (more)

Last night we hit a rare case of failed convergence.

f was out of sync with the rest of the world.
f: upi=[b,g,f] repairing=[a,c]
The "rest of the world" was using a larger chain:
*: upi=[c,b,g,a], repairing=[f]

And f refused to join the larger chain because of the way that
IsRelevantToMe_p was being calculated before this commit.

However, I'm not convinced that this patch fixes this particular
problem 100%.  What if the two UPI lists were the same length
but still incompatible with each other?  E.g., if I remove 'a' from
the "real world" (in the partition simulator) example above:

f: upi=[b,g,f] repairing=[c]
*: upi=[c,b,g], repairing=[f]

Hmm, I may need to reintroduce the flapping-like counter of my
recently adopted projections to try to break this kind of
incompatible deadlock.
This commit is contained in:
Scott Lystig Fritchie 2015-09-14 13:40:34 +09:00
parent 62186395ed
commit fdf78bdbbc
3 changed files with 42 additions and 10 deletions

View file

@ -384,8 +384,9 @@ handle_info({'DOWN',_Ref,process,Worker,Res},
{noreply, S#ch_mgr{ignore_timer=false,
repair_worker=undefined,
repair_final_status=Res}};
handle_info(Msg, S) ->
case get(todo_bummer) of undefined -> io:format("TODO: got ~p\n", [Msg]);
handle_info(Msg, #ch_mgr{name=MyName}=S) ->
case get(todo_bummer) of undefined -> io:format("TODO: ~w got ~p\n",
[MyName, Msg]);
_ -> ok
end,
put(todo_bummer, true),
@ -1371,6 +1372,14 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP,
{current_epoch, P_current#projection_v1.epoch_number},
{latest_unanimous_p, LatestUnanimousP}]}),
%% TODO 2015-09-14: Should rank be factored in here? If P_latest
%% rank is definitely lower than current rank (or perhaps lower
%% than P_newprop rank?), then don't accept it. Hrm, I'm not sure
%% how that would ripple through the rest of this state machine &
%% interactions with the other state machines, hrmmmm. For a real
%% example, if current upi/rep = [c,b,g,a],[f], going to P_latest
%% of [b,g,f],[a,c] doesn't make sense (even if we ignore the UPI
%% sanity problem in this example).
react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP,
P_current_calc, AmHosedP,
Rank_newprop, Rank_latest, S);
@ -1536,14 +1545,14 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, P_current_calc,
false;
I_am_in_P_latest_repairing ->
%% If I'm already in the current UPI, and the
%% UPI is longer than 1 (i.e., more than just
%% me), then it makes no sense to leave the UPI
%% to go to someone else's suggestion of
%% current UPI is longer than P_latest's UPI,
%% then it makes no sense to leave the UPI to
%% go to someone else's suggestion of
%% repairing. If I'm the only member of
%% P_current UPI, then sure, then having me
%% join a repairing list is relevant.
not (lists:member(MyName, P_current_upi) andalso
length(P_current_upi) > 1);
length(P_current_upi) > length(P_latest_upi));
true ->
true
end,

View file

@ -667,7 +667,17 @@ do_pb_request_common(Sock, ReqID, Req, GetReply_p) ->
{error, {badmatch, Noo, erlang:get_stacktrace()}};
error:{badmatch,_}=BadMatch ->
put(bad_sock, Sock),
{error, {badmatch, BadMatch, erlang:get_stacktrace()}}
{error, {badmatch, BadMatch, erlang:get_stacktrace()}};
error:Whoa ->
put(bad_sock, Sock),
%% TODO: The machi_chain_manager1_converge_demo:t() test can
%% create a large number of these errors when moving from
%% no partitions to many partitions:
%% Whoa undefined: function_clause
%% In theory this is harmless, because the client will retry
%% with a new socket. But, fix it anyway.
io:format(user, "DBG Whoa ~w: ~w\n", [Sock, Whoa]),
{error, {whoa, Whoa, erlang:get_stacktrace()}}
end.
filter_sock_error_result({error, closed}) ->

View file

@ -269,7 +269,7 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) ->
%% If stable, return true to short circuit remaining
private_projections_are_stable(Namez, DoIt)
end, false, lists:seq(0, MaxIters)),
io:format(user, "\nSweet, private projections are stable\n", []),
io:format(user, "\nSweet, private projections are stable at ~w\n", [time()]),
io:format(user, "\t~P\n", [get(stable), 14]),
io:format(user, "Rolling sanity check ... ", []),
PrivProjs = [{Name, begin
@ -411,7 +411,8 @@ make_partition_list(All_list) ->
%% Concat = _X_Ys1 ++ _X_Ys2 ++ _X_Ys3,
Concat = _X_Ys1 ++ _X_Ys2 ++ _X_Ys3 ++ _X_Ys4,
NoPartitions = lists:duplicate(trunc(length(Concat) * 0.1), []),
random_sort(lists:usort([lists:sort(L) || L <- Concat]) ++ NoPartitions).
uniq_reverse(random_sort(lists:usort([lists:sort(L) || L <- Concat])
++ NoPartitions)).
%% %% for len=5 and 2 witnesses
%% [
@ -720,9 +721,21 @@ get_latest_inner_proj_summ(FLU) ->
EpochID = {E, CSum4},
{EpochID, UPI, Repairing, Down, Witnesses, Inner_p}.
%% Reverse the list L while collapsing every run of adjacent,
%% exactly-equal elements into a single element.  Only neighbouring
%% duplicates are dropped; equal elements separated by a different
%% element are all kept.
uniq_reverse(L) ->
    uniq_reverse(L, []).

%% Worker: Seen holds the elements kept so far, most recent first,
%% so it is already in reversed order when the input runs out.
uniq_reverse([], Seen) ->
    Seen;
uniq_reverse([X|Rest], Seen) ->
    case Seen of
        [X|_] ->
            %% X exactly matches the element kept last: skip it.
            uniq_reverse(Rest, Seen);
        _ ->
            %% Empty accumulator or a different element: keep X.
            uniq_reverse(Rest, [X|Seen])
    end.
random_sort(L) ->
random:seed(now()),
L1 = [{random:uniform(99999), X} || X <- L],
L1 = [{random:uniform(), X} || X <- L],
[X || {_, X} <- lists:sort(L1)].
foo(NumFLUs, MgrOpts0) ->