WIP: Move convergence demo to new module machi_chain_manager1_converg_demo.erl
This commit is contained in:
parent
6cd9dfc977
commit
2b1eb9b144
2 changed files with 412 additions and 360 deletions
412
test/machi_chain_manager1_converge_demo.erl
Normal file
412
test/machi_chain_manager1_converge_demo.erl
Normal file
|
@ -0,0 +1,412 @@
|
||||||
|
%% -------------------------------------------------------------------
|
||||||
|
%%
|
||||||
|
%% Machi: a small village of replicated files
|
||||||
|
%%
|
||||||
|
%% Copyright (c) 2014 Basho Technologies, Inc. All Rights Reserved.
|
||||||
|
%%
|
||||||
|
%% This file is provided to you under the Apache License,
|
||||||
|
%% Version 2.0 (the "License"); you may not use this file
|
||||||
|
%% except in compliance with the License. You may obtain
|
||||||
|
%% a copy of the License at
|
||||||
|
%%
|
||||||
|
%% http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
%%
|
||||||
|
%% Unless required by applicable law or agreed to in writing,
|
||||||
|
%% software distributed under the License is distributed on an
|
||||||
|
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
%% KIND, either express or implied. See the License for the
|
||||||
|
%% specific language governing permissions and limitations
|
||||||
|
%% under the License.
|
||||||
|
%%
|
||||||
|
%% -------------------------------------------------------------------
|
||||||
|
-module(machi_chain_manager1_converge_demo).
|
||||||
|
|
||||||
|
-include("machi.hrl").
|
||||||
|
-include("machi_projection.hrl").
|
||||||
|
|
||||||
|
-define(MGR, machi_chain_manager1).
|
||||||
|
|
||||||
|
-define(D(X), io:format(user, "~s ~p\n", [??X, X])).
|
||||||
|
-define(Dw(X), io:format(user, "~s ~w\n", [??X, X])).
|
||||||
|
-define(FLU_C, machi_flu1_client).
|
||||||
|
-define(FLU_PC, machi_proxy_flu1_client).
|
||||||
|
|
||||||
|
-compile(export_all).
|
||||||
|
|
||||||
|
-ifdef(TEST).
|
||||||
|
|
||||||
|
-ifdef(EQC).
|
||||||
|
-include_lib("eqc/include/eqc.hrl").
|
||||||
|
%% -include_lib("eqc/include/eqc_statem.hrl").
|
||||||
|
-define(QC_OUT(P),
|
||||||
|
eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)).
|
||||||
|
-endif.
|
||||||
|
|
||||||
|
-include_lib("eunit/include/eunit.hrl").
|
||||||
|
|
||||||
|
short_doc() ->
|
||||||
|
"
|
||||||
|
A visualization of the convergence behavior of the chain self-management
|
||||||
|
algorithm for Machi.
|
||||||
|
1. Set up 4 FLUs and chain manager pairs.
|
||||||
|
2. Create a number of different network partition scenarios, where
|
||||||
|
(simulated) partitions may be symmetric or asymmetric. Then halt changing
|
||||||
|
the partitions and keep the simulated network stable and broken.
|
||||||
|
3. Run a number of iterations of the algorithm in parallel by poking each
|
||||||
|
of the manager processes on a random'ish basis.
|
||||||
|
4. Afterward, fetch the chain transition changes made by each FLU and
|
||||||
|
verify that no transition was unsafe.
|
||||||
|
|
||||||
|
During the iteration periods, the following is a cheatsheet for the output.
|
||||||
|
See the internal source for interpreting the rest of the output.
|
||||||
|
|
||||||
|
'Let loose the dogs of war!' Network instability
|
||||||
|
'SET partitions = ' Network stability (but broken)
|
||||||
|
'x uses:' The FLU x has made an internal state transition. The rest of
|
||||||
|
the line is a dump of internal state.
|
||||||
|
'{t}' This is a tick event which triggers one of the manager processes
|
||||||
|
to evaluate its environment and perhaps make a state transition.
|
||||||
|
|
||||||
|
A long chain of '{t}{t}{t}{t}' means that the chain state has settled
|
||||||
|
to a stable configuration, which is the goal of the algorithm.
|
||||||
|
Press control-c to interrupt....".
|
||||||
|
|
||||||
|
long_doc() ->
|
||||||
|
"
|
||||||
|
'Let loose the dogs of war!'
|
||||||
|
|
||||||
|
The simulated network is very unstable for a few seconds.
|
||||||
|
|
||||||
|
'x uses'
|
||||||
|
|
||||||
|
After a single iteration, server x has determined that the chain
|
||||||
|
should be defined by the upi, repair, and down list in this record.
|
||||||
|
If all participants reach the same conclusion at the same epoch
|
||||||
|
number (and checksum, see next item below), then the chain is
|
||||||
|
stable, fully configured, and can provide full service.
|
||||||
|
|
||||||
|
'epoch,E'
|
||||||
|
|
||||||
|
The epoch number for this decision is E. The checksum of the full
|
||||||
|
record is not shown. For purposes of the protocol, a server will
|
||||||
|
'wedge' itself and refuse service (until a new config is chosen)
|
||||||
|
whenever: a). it sees a bigger epoch number mentioned somewhere, or
|
||||||
|
b). it sees the same epoch number but a different checksum. In case
|
||||||
|
of b), there was a network partition that has healed, and both sides
|
||||||
|
had chosen to operate with an identical epoch number but different
|
||||||
|
chain configs.
|
||||||
|
|
||||||
|
'upi', 'repair', and 'down'
|
||||||
|
|
||||||
|
Members in the chain that are fully in sync and thus preserving the
|
||||||
|
Update Propagation Invariant, up but under repair (simulated), and
|
||||||
|
down, respectively.
|
||||||
|
|
||||||
|
'ps,[some list]'
|
||||||
|
|
||||||
|
The list of asymmetric network partitions. {a,b} means that a
|
||||||
|
cannot send to b, but b can send to a.
|
||||||
|
|
||||||
|
This partition list is recorded for debugging purposes but is *not*
|
||||||
|
used by the algorithm. The algorithm only 'feels' its effects via
|
||||||
|
simulated timeout whenever there's a partition in one of the
|
||||||
|
messaging directions.
|
||||||
|
|
||||||
|
'nodes_up,[list]'
|
||||||
|
|
||||||
|
The best guess right now of which ndoes are up, relative to the
|
||||||
|
author node, specified by '{author,X}'
|
||||||
|
|
||||||
|
'SET partitions = [some list]'
|
||||||
|
|
||||||
|
All subsequent iterations should have a stable list of partitions,
|
||||||
|
i.e. the 'ps' list described should be stable.
|
||||||
|
|
||||||
|
'{FLAP: x flaps n}!'
|
||||||
|
|
||||||
|
Server x has detected that it's flapping/oscillating after iteration
|
||||||
|
n of a naive/1st draft detection algorithm.
|
||||||
|
".
|
||||||
|
|
||||||
|
%% convergence_demo_test_() ->
|
||||||
|
%% {timeout, 98*300, fun() -> convergence_demo_testfun() end}.
|
||||||
|
|
||||||
|
%% convergence_demo_testfun() ->
|
||||||
|
%% convergence_demo_testfun(3).
|
||||||
|
|
||||||
|
t() ->
|
||||||
|
t(3).
|
||||||
|
|
||||||
|
t(N) ->
|
||||||
|
convergence_demo_testfun(N).
|
||||||
|
|
||||||
|
convergence_demo_testfun(NumFLUs) ->
|
||||||
|
timer:sleep(100),
|
||||||
|
io:format(user, short_doc(), []),
|
||||||
|
%% Faster test startup, commented: timer:sleep(3000),
|
||||||
|
|
||||||
|
FLU_biglist = [a,b,c,d,e,f,g],
|
||||||
|
All_list = lists:sublist(FLU_biglist, NumFLUs),
|
||||||
|
io:format(user, "\nSET # of FLus = ~w members ~w).\n",
|
||||||
|
[NumFLUs, All_list]),
|
||||||
|
machi_partition_simulator:start_link({111,222,33}, 0, 100),
|
||||||
|
_ = machi_partition_simulator:get(All_list),
|
||||||
|
|
||||||
|
Namez =
|
||||||
|
[begin
|
||||||
|
{ok, Pid} = machi_flu0:start_link(Name),
|
||||||
|
{Name, Pid}
|
||||||
|
end || Name <- All_list ],
|
||||||
|
|
||||||
|
MgrOpts = [private_write_verbose],
|
||||||
|
MgrNamez =
|
||||||
|
[begin
|
||||||
|
{ok, MPid} = ?MGR:start_link(Name, All_list, FLUPid, MgrOpts),
|
||||||
|
{Name, MPid}
|
||||||
|
end || {Name, FLUPid} <- Namez],
|
||||||
|
try
|
||||||
|
[{_, Ma}|_] = MgrNamez,
|
||||||
|
{ok, P1} = ?MGR:test_calc_projection(Ma, false),
|
||||||
|
P1Epoch = P1#projection_v1.epoch_number,
|
||||||
|
[ok = machi_flu0:proj_write(FLUPid, P1Epoch, public, P1) ||
|
||||||
|
{_, FLUPid} <- Namez, FLUPid /= Ma],
|
||||||
|
|
||||||
|
machi_partition_simulator:reset_thresholds(10, 50),
|
||||||
|
_ = machi_partition_simulator:get(All_list),
|
||||||
|
|
||||||
|
Parent = self(),
|
||||||
|
DoIt = fun(Iters, S_min, S_max) ->
|
||||||
|
io:format(user, "\nDoIt: top\n\n", []),
|
||||||
|
Pids = [spawn(fun() ->
|
||||||
|
random:seed(now()),
|
||||||
|
[begin
|
||||||
|
erlang:yield(),
|
||||||
|
S_max_rand = random:uniform(
|
||||||
|
S_max + 1),
|
||||||
|
io:format(user, "{t}", []),
|
||||||
|
Elapsed =
|
||||||
|
?MGR:sleep_ranked_order(
|
||||||
|
S_min, S_max_rand,
|
||||||
|
M_name, All_list),
|
||||||
|
_ = ?MGR:test_react_to_env(MMM),
|
||||||
|
%% if M_name == d ->
|
||||||
|
%% [_ = ?MGR:test_react_to_env(MMM) ||
|
||||||
|
%% _ <- lists:seq(1,3)],
|
||||||
|
%% superunfair;
|
||||||
|
%% true ->
|
||||||
|
%% ok
|
||||||
|
%% end,
|
||||||
|
%% Be more unfair by not
|
||||||
|
%% sleeping here.
|
||||||
|
%% timer:sleep(S_max - Elapsed),
|
||||||
|
Elapsed
|
||||||
|
end || _ <- lists:seq(1, Iters)],
|
||||||
|
Parent ! done
|
||||||
|
end) || {M_name, MMM} <- MgrNamez ],
|
||||||
|
[receive
|
||||||
|
done ->
|
||||||
|
ok
|
||||||
|
after 995000 ->
|
||||||
|
exit(icky_timeout)
|
||||||
|
end || _ <- Pids]
|
||||||
|
end,
|
||||||
|
|
||||||
|
_XandYs1 = [[{X,Y}] || X <- All_list, Y <- All_list, X /= Y],
|
||||||
|
_XandYs2 = [[{X,Y}, {A,B}] || X <- All_list, Y <- All_list, X /= Y,
|
||||||
|
A <- All_list, B <- All_list, A /= B,
|
||||||
|
X /= A],
|
||||||
|
_XandYs3 = [[{X,Y}, {A,B}, {C,D}] || X <- All_list, Y <- All_list, X /= Y,
|
||||||
|
A <- All_list, B <- All_list, A /= B,
|
||||||
|
C <- All_list, D <- All_list, C /= D,
|
||||||
|
X /= A, X /= C, A /= C],
|
||||||
|
%% AllPartitionCombinations = _XandYs1 ++ _XandYs2,
|
||||||
|
%% AllPartitionCombinations = _XandYs3,
|
||||||
|
AllPartitionCombinations = _XandYs1 ++ _XandYs2 ++ _XandYs3,
|
||||||
|
?D({?LINE, length(AllPartitionCombinations)}),
|
||||||
|
|
||||||
|
machi_partition_simulator:reset_thresholds(10, 50),
|
||||||
|
io:format(user, "\nLet loose the dogs of war!\n", []),
|
||||||
|
DoIt(30, 0, 0),
|
||||||
|
[begin
|
||||||
|
io:format(user, "\nSET partitions = ~w.\n", [ [] ]),machi_partition_simulator:no_partitions(),
|
||||||
|
[DoIt(50, 10, 100) || _ <- [1,2,3]],
|
||||||
|
|
||||||
|
%% machi_partition_simulator:reset_thresholds(10, 50),
|
||||||
|
%% io:format(user, "\nLet loose the dogs of war!\n", []),
|
||||||
|
%% DoIt(30, 0, 0),
|
||||||
|
|
||||||
|
machi_partition_simulator:always_these_partitions(Partition),
|
||||||
|
io:format(user, "\nSET partitions = ~w.\n", [Partition]),
|
||||||
|
[DoIt(50, 10, 100) || _ <- [1,2,3,4] ],
|
||||||
|
PPP =
|
||||||
|
[begin
|
||||||
|
PPPallPubs = machi_flu0:proj_list_all(FLU, public),
|
||||||
|
[begin
|
||||||
|
{ok, Pr} = machi_flu0:proj_read(FLU, PPPepoch, public),
|
||||||
|
{Pr#projection_v1.epoch_number, FLUName, Pr}
|
||||||
|
end || PPPepoch <- PPPallPubs]
|
||||||
|
end || {FLUName, FLU} <- Namez],
|
||||||
|
io:format(user, "PPP ~p\n", [lists:sort(lists:append(PPP))]),
|
||||||
|
|
||||||
|
%%%%%%%% {stable,true} = {stable,private_projections_are_stable(Namez, DoIt)},
|
||||||
|
{hosed_ok,true} = {hosed_ok,all_hosed_lists_are_identical(Namez, Partition)},
|
||||||
|
io:format(user, "\nSweet, all_hosed are identical-or-islands-inconclusive.\n", []),
|
||||||
|
timer:sleep(1000),
|
||||||
|
ok
|
||||||
|
end || Partition <- AllPartitionCombinations
|
||||||
|
%% end || Partition <- [ [{a,b},{b,d},{c,b}],
|
||||||
|
%% [{a,b},{b,d},{c,b}, {a,b},{b,a},{a,c},{c,a},{a,d},{d,a}],
|
||||||
|
%% %% [{a,b},{b,d},{c,b}, {b,a},{a,b},{b,c},{c,b},{b,d},{d,b}],
|
||||||
|
%% [{a,b},{b,d},{c,b}, {c,a},{a,c},{c,b},{b,c},{c,d},{d,c}],
|
||||||
|
%% [{a,b},{b,d},{c,b}, {d,a},{a,d},{d,b},{b,d},{d,c},{c,d}] ]
|
||||||
|
%% end || Partition <- [ [{a,b}, {b,c}],
|
||||||
|
%% [{a,b}, {c,b}] ]
|
||||||
|
%% end || Partition <- [ [{a,b}, {b,c}] ] %% hosed-not-equal @ 3 FLUs
|
||||||
|
%% end || Partition <- [ [{a,b}],
|
||||||
|
%% [{b,a}] ]
|
||||||
|
%% end || Partition <- [ [{a,b}, {c,b}],
|
||||||
|
%% [{a,b}, {b,c}] ]
|
||||||
|
%% end || Partition <- [ [{a,b}, {b,c}, {c,d}],
|
||||||
|
%% [{a,b}, {b,c},{b,d}, {c,d}],
|
||||||
|
%% [{b,a}, {b,c}, {c,d}],
|
||||||
|
%% [{a,b}, {c,b}, {c,d}],
|
||||||
|
%% [{a,b}, {b,c}, {d,c}] ]
|
||||||
|
%% end || Partition <- [ [{a,b}, {b,c}, {c,d}, {d,e}],
|
||||||
|
%% [{b,a}, {b,c}, {c,d}, {d,e}],
|
||||||
|
%% [{a,b}, {c,b}, {c,d}, {d,e}],
|
||||||
|
%% [{a,b}, {b,c}, {d,c}, {d,e}],
|
||||||
|
%% [{a,b}, {b,c}, {c,d}, {e,d}] ]
|
||||||
|
%% end || Partition <- [ [{c,a}] ]
|
||||||
|
%% end || Partition <- [ [{c,a}], [{c,b}, {a, b}] ]
|
||||||
|
%% end || Partition <- [ [{a,b},{b,a}, {a,c},{c,a}, {a,d},{d,a}],
|
||||||
|
%% [{a,b},{b,a}, {a,c},{c,a}, {a,d},{d,a}, {b,c}],
|
||||||
|
%% [{a,b},{b,a}, {a,c},{c,a}, {a,d},{d,a}, {c,d}] ]
|
||||||
|
%% end || Partition <- [ [{a,b}],
|
||||||
|
%% [{a,b}, {a,b},{b,a},{a,c},{c,a},{a,d},{d,a}],
|
||||||
|
%% [{a,b}, {b,a},{a,b},{b,c},{c,b},{b,d},{d,b}],
|
||||||
|
%% [{a,b}, {c,a},{a,c},{c,b},{b,c},{c,d},{d,c}],
|
||||||
|
%% [{a,b}, {d,a},{a,d},{d,b},{b,d},{d,c},{c,d}] ]
|
||||||
|
],
|
||||||
|
%% exit(end_experiment),
|
||||||
|
|
||||||
|
io:format(user, "\nSET partitions = []\n", []),
|
||||||
|
io:format(user, "We should see convergence to 1 correct chain.\n", []),
|
||||||
|
machi_partition_simulator:no_partitions(),
|
||||||
|
[DoIt(50, 10, 100) || _ <- [1]],
|
||||||
|
io:format(user, "Sweet, finishing early\n", []), exit(yoyoyo_testing_hack),
|
||||||
|
%% WARNING: In asymmetric partitions, private_projections_are_stable()
|
||||||
|
%% will never be true; code beyond this point on the -exp3
|
||||||
|
%% branch is bit-rotted, sorry!
|
||||||
|
true = private_projections_are_stable(Namez, DoIt),
|
||||||
|
io:format(user, "~s\n", [os:cmd("date")]),
|
||||||
|
|
||||||
|
%% We are stable now ... analyze it.
|
||||||
|
|
||||||
|
%% Create a report where at least one FLU has written a
|
||||||
|
%% private projection.
|
||||||
|
Report = machi_chain_manager1_test:unanimous_report(Namez),
|
||||||
|
%% ?D(Report),
|
||||||
|
|
||||||
|
%% Report is ordered by Epoch. For each private projection
|
||||||
|
%% written during any given epoch, confirm that all chain
|
||||||
|
%% members appear in only one unique chain, i.e., the sets of
|
||||||
|
%% unique chains are disjoint.
|
||||||
|
true = machi_chain_manager1_test:all_reports_are_disjoint(Report),
|
||||||
|
|
||||||
|
%% Given the report, we flip it around so that we observe the
|
||||||
|
%% sets of chain transitions relative to each FLU.
|
||||||
|
R_Chains = [machi_chain_manager1_test:extract_chains_relative_to_flu(
|
||||||
|
FLU, Report) || FLU <- All_list],
|
||||||
|
%% ?D(R_Chains),
|
||||||
|
R_Projs = [{FLU, [machi_chain_manager1_test:chain_to_projection(
|
||||||
|
FLU, Epoch, UPI, Repairing, All_list) ||
|
||||||
|
{Epoch, UPI, Repairing} <- E_Chains]} ||
|
||||||
|
{FLU, E_Chains} <- R_Chains],
|
||||||
|
|
||||||
|
%% For each chain transition experienced by a particular FLU,
|
||||||
|
%% confirm that each state transition is OK.
|
||||||
|
try
|
||||||
|
[{FLU, true} = {FLU, ?MGR:projection_transitions_are_sane(Ps, FLU)} ||
|
||||||
|
{FLU, Ps} <- R_Projs],
|
||||||
|
io:format(user, "\nAll sanity checks pass, hooray!\n", [])
|
||||||
|
catch _Err:_What ->
|
||||||
|
io:format(user, "Report ~p\n", [Report]),
|
||||||
|
exit({line, ?LINE, _Err, _What})
|
||||||
|
end,
|
||||||
|
%% ?D(R_Projs),
|
||||||
|
|
||||||
|
ok
|
||||||
|
after
|
||||||
|
[ok = ?MGR:stop(MgrPid) || {_, MgrPid} <- MgrNamez],
|
||||||
|
[ok = machi_flu0:stop(FLUPid) || {_, FLUPid} <- Namez],
|
||||||
|
ok = machi_partition_simulator:stop()
|
||||||
|
end.
|
||||||
|
|
||||||
|
private_projections_are_stable(Namez, PollFunc) ->
|
||||||
|
Private1 = [machi_flu0:proj_get_latest_num(FLU, private) ||
|
||||||
|
{_Name, FLU} <- Namez],
|
||||||
|
PollFunc(5, 1, 10),
|
||||||
|
Private2 = [machi_flu0:proj_get_latest_num(FLU, private) ||
|
||||||
|
{_Name, FLU} <- Namez],
|
||||||
|
true = (Private1 == Private2).
|
||||||
|
|
||||||
|
all_hosed_lists_are_identical(Namez, Partition0) ->
|
||||||
|
Partition = lists:usort(Partition0),
|
||||||
|
Ps = [machi_flu0:proj_read_latest(FLU, private) || {_Name, FLU} <- Namez],
|
||||||
|
UniqueAllHoseds = lists:usort([machi_chain_manager1:get_all_hosed(P) ||
|
||||||
|
{ok, P} <- Ps]),
|
||||||
|
Members = [M || {M, _Pid} <- Namez],
|
||||||
|
Islands = machi_partition_simulator:partitions2num_islands(
|
||||||
|
Members, Partition),
|
||||||
|
%% io:format(user, "all_hosed_lists_are_identical:\n", []),
|
||||||
|
%% io:format(user, " Uniques = ~p Islands ~p\n Partition ~p\n",
|
||||||
|
%% [Uniques, Islands, Partition]),
|
||||||
|
case length(UniqueAllHoseds) of
|
||||||
|
1 ->
|
||||||
|
true;
|
||||||
|
%% TODO: With the addition of the digraph stuff below, the clause
|
||||||
|
%% below probably isn't necessary anymore, since the
|
||||||
|
%% digraph calculation should catch complete partition islands?
|
||||||
|
_ when Islands == 'many' ->
|
||||||
|
%% There are at least two partitions, so yes, it's quite
|
||||||
|
%% possible that the all_hosed lists may differ.
|
||||||
|
%% TODO Fix this up to be smarter about fully-isolated
|
||||||
|
%% islands of partition.
|
||||||
|
true;
|
||||||
|
_ ->
|
||||||
|
DG = digraph:new(),
|
||||||
|
Connection = machi_partition_simulator:partition2connection(
|
||||||
|
Members, Partition),
|
||||||
|
[digraph:add_vertex(DG, X) || X <- Members],
|
||||||
|
[digraph:add_edge(DG, X, Y) || {X,Y} <- Connection],
|
||||||
|
Any =
|
||||||
|
lists:any(
|
||||||
|
fun(X) ->
|
||||||
|
NotX = Members -- [X],
|
||||||
|
lists:any(
|
||||||
|
fun(Y) ->
|
||||||
|
%% There must be a shortest path of length
|
||||||
|
%% two in both directions, otherwise
|
||||||
|
%% the read projection call will fail.
|
||||||
|
%% And it's that failure that we're
|
||||||
|
%% interested in here.
|
||||||
|
XtoY = digraph:get_short_path(DG, X, Y),
|
||||||
|
YtoX = digraph:get_short_path(DG, Y, X),
|
||||||
|
(XtoY == false orelse
|
||||||
|
length(XtoY) > 2)
|
||||||
|
orelse
|
||||||
|
(YtoX == false orelse
|
||||||
|
length(YtoX) > 2)
|
||||||
|
end, NotX)
|
||||||
|
end, Members),
|
||||||
|
digraph:delete(DG),
|
||||||
|
if Any == true ->
|
||||||
|
%% There's a missing path of length 2 between some
|
||||||
|
%% two FLUs, so yes, there's going to be
|
||||||
|
%% non-identical all_hosed lists.
|
||||||
|
true;
|
||||||
|
true ->
|
||||||
|
false % There's no excuse, buddy
|
||||||
|
end
|
||||||
|
end.
|
||||||
|
-endif. % TEST
|
|
@ -254,365 +254,5 @@ nonunanimous_setup_and_fix_test() ->
|
||||||
ok = machi_partition_simulator:stop()
|
ok = machi_partition_simulator:stop()
|
||||||
end.
|
end.
|
||||||
|
|
||||||
short_doc() ->
|
|
||||||
"
|
|
||||||
A visualization of the convergence behavior of the chain self-management
|
|
||||||
algorithm for Machi.
|
|
||||||
1. Set up 4 FLUs and chain manager pairs.
|
|
||||||
2. Create a number of different network partition scenarios, where
|
|
||||||
(simulated) partitions may be symmetric or asymmetric. Then halt changing
|
|
||||||
the partitions and keep the simulated network stable and broken.
|
|
||||||
3. Run a number of iterations of the algorithm in parallel by poking each
|
|
||||||
of the manager processes on a random'ish basis.
|
|
||||||
4. Afterward, fetch the chain transition changes made by each FLU and
|
|
||||||
verify that no transition was unsafe.
|
|
||||||
|
|
||||||
During the iteration periods, the following is a cheatsheet for the output.
|
|
||||||
See the internal source for interpreting the rest of the output.
|
|
||||||
|
|
||||||
'Let loose the dogs of war!' Network instability
|
|
||||||
'SET partitions = ' Network stability (but broken)
|
|
||||||
'x uses:' The FLU x has made an internal state transition. The rest of
|
|
||||||
the line is a dump of internal state.
|
|
||||||
'{t}' This is a tick event which triggers one of the manager processes
|
|
||||||
to evaluate its environment and perhaps make a state transition.
|
|
||||||
|
|
||||||
A long chain of '{t}{t}{t}{t}' means that the chain state has settled
|
|
||||||
to a stable configuration, which is the goal of the algorithm.
|
|
||||||
Press control-c to interrupt....".
|
|
||||||
|
|
||||||
long_doc() ->
|
|
||||||
"
|
|
||||||
'Let loose the dogs of war!'
|
|
||||||
|
|
||||||
The simulated network is very unstable for a few seconds.
|
|
||||||
|
|
||||||
'x uses'
|
|
||||||
|
|
||||||
After a single iteration, server x has determined that the chain
|
|
||||||
should be defined by the upi, repair, and down list in this record.
|
|
||||||
If all participants reach the same conclusion at the same epoch
|
|
||||||
number (and checksum, see next item below), then the chain is
|
|
||||||
stable, fully configured, and can provide full service.
|
|
||||||
|
|
||||||
'epoch,E'
|
|
||||||
|
|
||||||
The epoch number for this decision is E. The checksum of the full
|
|
||||||
record is not shown. For purposes of the protocol, a server will
|
|
||||||
'wedge' itself and refuse service (until a new config is chosen)
|
|
||||||
whenever: a). it sees a bigger epoch number mentioned somewhere, or
|
|
||||||
b). it sees the same epoch number but a different checksum. In case
|
|
||||||
of b), there was a network partition that has healed, and both sides
|
|
||||||
had chosen to operate with an identical epoch number but different
|
|
||||||
chain configs.
|
|
||||||
|
|
||||||
'upi', 'repair', and 'down'
|
|
||||||
|
|
||||||
Members in the chain that are fully in sync and thus preserving the
|
|
||||||
Update Propagation Invariant, up but under repair (simulated), and
|
|
||||||
down, respectively.
|
|
||||||
|
|
||||||
'ps,[some list]'
|
|
||||||
|
|
||||||
The list of asymmetric network partitions. {a,b} means that a
|
|
||||||
cannot send to b, but b can send to a.
|
|
||||||
|
|
||||||
This partition list is recorded for debugging purposes but is *not*
|
|
||||||
used by the algorithm. The algorithm only 'feels' its effects via
|
|
||||||
simulated timeout whenever there's a partition in one of the
|
|
||||||
messaging directions.
|
|
||||||
|
|
||||||
'nodes_up,[list]'
|
|
||||||
|
|
||||||
The best guess right now of which ndoes are up, relative to the
|
|
||||||
author node, specified by '{author,X}'
|
|
||||||
|
|
||||||
'SET partitions = [some list]'
|
|
||||||
|
|
||||||
All subsequent iterations should have a stable list of partitions,
|
|
||||||
i.e. the 'ps' list described should be stable.
|
|
||||||
|
|
||||||
'{FLAP: x flaps n}!'
|
|
||||||
|
|
||||||
Server x has detected that it's flapping/oscillating after iteration
|
|
||||||
n of a naive/1st draft detection algorithm.
|
|
||||||
".
|
|
||||||
|
|
||||||
convergence_demo_testTODO_() ->
|
|
||||||
{timeout, 98*300, fun() -> convergence_demo_testfun() end}.
|
|
||||||
|
|
||||||
convergence_demo_testfun() ->
|
|
||||||
convergence_demo_testfun(3).
|
|
||||||
|
|
||||||
convergence_demo_testfun(NumFLUs) ->
|
|
||||||
timer:sleep(100),
|
|
||||||
io:format(user, short_doc(), []),
|
|
||||||
%% Faster test startup, commented: timer:sleep(3000),
|
|
||||||
|
|
||||||
FLU_biglist = [a,b,c,d,e,f,g],
|
|
||||||
All_list = lists:sublist(FLU_biglist, NumFLUs),
|
|
||||||
io:format(user, "\nSET # of FLus = ~w members ~w).\n",
|
|
||||||
[NumFLUs, All_list]),
|
|
||||||
machi_partition_simulator:start_link({111,222,33}, 0, 100),
|
|
||||||
_ = machi_partition_simulator:get(All_list),
|
|
||||||
|
|
||||||
Namez =
|
|
||||||
[begin
|
|
||||||
{ok, Pid} = machi_flu0:start_link(Name),
|
|
||||||
{Name, Pid}
|
|
||||||
end || Name <- All_list ],
|
|
||||||
|
|
||||||
MgrOpts = [private_write_verbose],
|
|
||||||
MgrNamez =
|
|
||||||
[begin
|
|
||||||
{ok, MPid} = ?MGR:start_link(Name, All_list, FLUPid, MgrOpts),
|
|
||||||
{Name, MPid}
|
|
||||||
end || {Name, FLUPid} <- Namez],
|
|
||||||
try
|
|
||||||
[{_, Ma}|_] = MgrNamez,
|
|
||||||
{ok, P1} = ?MGR:test_calc_projection(Ma, false),
|
|
||||||
P1Epoch = P1#projection_v1.epoch_number,
|
|
||||||
[ok = machi_flu0:proj_write(FLUPid, P1Epoch, public, P1) ||
|
|
||||||
{_, FLUPid} <- Namez, FLUPid /= Ma],
|
|
||||||
|
|
||||||
machi_partition_simulator:reset_thresholds(10, 50),
|
|
||||||
_ = machi_partition_simulator:get(All_list),
|
|
||||||
|
|
||||||
Parent = self(),
|
|
||||||
DoIt = fun(Iters, S_min, S_max) ->
|
|
||||||
io:format(user, "\nDoIt: top\n\n", []),
|
|
||||||
Pids = [spawn(fun() ->
|
|
||||||
random:seed(now()),
|
|
||||||
[begin
|
|
||||||
erlang:yield(),
|
|
||||||
S_max_rand = random:uniform(
|
|
||||||
S_max + 1),
|
|
||||||
io:format(user, "{t}", []),
|
|
||||||
Elapsed =
|
|
||||||
?MGR:sleep_ranked_order(
|
|
||||||
S_min, S_max_rand,
|
|
||||||
M_name, All_list),
|
|
||||||
_ = ?MGR:test_react_to_env(MMM),
|
|
||||||
%% if M_name == d ->
|
|
||||||
%% [_ = ?MGR:test_react_to_env(MMM) ||
|
|
||||||
%% _ <- lists:seq(1,3)],
|
|
||||||
%% superunfair;
|
|
||||||
%% true ->
|
|
||||||
%% ok
|
|
||||||
%% end,
|
|
||||||
%% Be more unfair by not
|
|
||||||
%% sleeping here.
|
|
||||||
%% timer:sleep(S_max - Elapsed),
|
|
||||||
Elapsed
|
|
||||||
end || _ <- lists:seq(1, Iters)],
|
|
||||||
Parent ! done
|
|
||||||
end) || {M_name, MMM} <- MgrNamez ],
|
|
||||||
[receive
|
|
||||||
done ->
|
|
||||||
ok
|
|
||||||
after 995000 ->
|
|
||||||
exit(icky_timeout)
|
|
||||||
end || _ <- Pids]
|
|
||||||
end,
|
|
||||||
|
|
||||||
_XandYs1 = [[{X,Y}] || X <- All_list, Y <- All_list, X /= Y],
|
|
||||||
_XandYs2 = [[{X,Y}, {A,B}] || X <- All_list, Y <- All_list, X /= Y,
|
|
||||||
A <- All_list, B <- All_list, A /= B,
|
|
||||||
X /= A],
|
|
||||||
_XandYs3 = [[{X,Y}, {A,B}, {C,D}] || X <- All_list, Y <- All_list, X /= Y,
|
|
||||||
A <- All_list, B <- All_list, A /= B,
|
|
||||||
C <- All_list, D <- All_list, C /= D,
|
|
||||||
X /= A, X /= C, A /= C],
|
|
||||||
%% AllPartitionCombinations = _XandYs1 ++ _XandYs2,
|
|
||||||
%% AllPartitionCombinations = _XandYs3,
|
|
||||||
AllPartitionCombinations = _XandYs1 ++ _XandYs2 ++ _XandYs3,
|
|
||||||
?D({?LINE, length(AllPartitionCombinations)}),
|
|
||||||
|
|
||||||
machi_partition_simulator:reset_thresholds(10, 50),
|
|
||||||
io:format(user, "\nLet loose the dogs of war!\n", []),
|
|
||||||
DoIt(30, 0, 0),
|
|
||||||
[begin
|
|
||||||
io:format(user, "\nSET partitions = ~w.\n", [ [] ]),machi_partition_simulator:no_partitions(),
|
|
||||||
[DoIt(50, 10, 100) || _ <- [1,2,3]],
|
|
||||||
|
|
||||||
%% machi_partition_simulator:reset_thresholds(10, 50),
|
|
||||||
%% io:format(user, "\nLet loose the dogs of war!\n", []),
|
|
||||||
%% DoIt(30, 0, 0),
|
|
||||||
|
|
||||||
machi_partition_simulator:always_these_partitions(Partition),
|
|
||||||
io:format(user, "\nSET partitions = ~w.\n", [Partition]),
|
|
||||||
[DoIt(50, 10, 100) || _ <- [1,2,3,4] ],
|
|
||||||
PPP =
|
|
||||||
[begin
|
|
||||||
PPPallPubs = machi_flu0:proj_list_all(FLU, public),
|
|
||||||
[begin
|
|
||||||
{ok, Pr} = machi_flu0:proj_read(FLU, PPPepoch, public),
|
|
||||||
{Pr#projection_v1.epoch_number, FLUName, Pr}
|
|
||||||
end || PPPepoch <- PPPallPubs]
|
|
||||||
end || {FLUName, FLU} <- Namez],
|
|
||||||
io:format(user, "PPP ~p\n", [lists:sort(lists:append(PPP))]),
|
|
||||||
|
|
||||||
%%%%%%%% {stable,true} = {stable,private_projections_are_stable(Namez, DoIt)},
|
|
||||||
{hosed_ok,true} = {hosed_ok,all_hosed_lists_are_identical(Namez, Partition)},
|
|
||||||
io:format(user, "\nSweet, all_hosed are identical-or-islands-inconclusive.\n", []),
|
|
||||||
timer:sleep(1000),
|
|
||||||
ok
|
|
||||||
end || Partition <- AllPartitionCombinations
|
|
||||||
%% end || Partition <- [ [{a,b},{b,d},{c,b}],
|
|
||||||
%% [{a,b},{b,d},{c,b}, {a,b},{b,a},{a,c},{c,a},{a,d},{d,a}],
|
|
||||||
%% %% [{a,b},{b,d},{c,b}, {b,a},{a,b},{b,c},{c,b},{b,d},{d,b}],
|
|
||||||
%% [{a,b},{b,d},{c,b}, {c,a},{a,c},{c,b},{b,c},{c,d},{d,c}],
|
|
||||||
%% [{a,b},{b,d},{c,b}, {d,a},{a,d},{d,b},{b,d},{d,c},{c,d}] ]
|
|
||||||
%% end || Partition <- [ [{a,b}, {b,c}],
|
|
||||||
%% [{a,b}, {c,b}] ]
|
|
||||||
%% end || Partition <- [ [{a,b}, {b,c}] ] %% hosed-not-equal @ 3 FLUs
|
|
||||||
%% end || Partition <- [ [{a,b}],
|
|
||||||
%% [{b,a}] ]
|
|
||||||
%% end || Partition <- [ [{a,b}, {c,b}],
|
|
||||||
%% [{a,b}, {b,c}] ]
|
|
||||||
%% end || Partition <- [ [{a,b}, {b,c}, {c,d}],
|
|
||||||
%% [{a,b}, {b,c},{b,d}, {c,d}],
|
|
||||||
%% [{b,a}, {b,c}, {c,d}],
|
|
||||||
%% [{a,b}, {c,b}, {c,d}],
|
|
||||||
%% [{a,b}, {b,c}, {d,c}] ]
|
|
||||||
%% end || Partition <- [ [{a,b}, {b,c}, {c,d}, {d,e}],
|
|
||||||
%% [{b,a}, {b,c}, {c,d}, {d,e}],
|
|
||||||
%% [{a,b}, {c,b}, {c,d}, {d,e}],
|
|
||||||
%% [{a,b}, {b,c}, {d,c}, {d,e}],
|
|
||||||
%% [{a,b}, {b,c}, {c,d}, {e,d}] ]
|
|
||||||
%% end || Partition <- [ [{c,a}] ]
|
|
||||||
%% end || Partition <- [ [{c,a}], [{c,b}, {a, b}] ]
|
|
||||||
%% end || Partition <- [ [{a,b},{b,a}, {a,c},{c,a}, {a,d},{d,a}],
|
|
||||||
%% [{a,b},{b,a}, {a,c},{c,a}, {a,d},{d,a}, {b,c}],
|
|
||||||
%% [{a,b},{b,a}, {a,c},{c,a}, {a,d},{d,a}, {c,d}] ]
|
|
||||||
%% end || Partition <- [ [{a,b}],
|
|
||||||
%% [{a,b}, {a,b},{b,a},{a,c},{c,a},{a,d},{d,a}],
|
|
||||||
%% [{a,b}, {b,a},{a,b},{b,c},{c,b},{b,d},{d,b}],
|
|
||||||
%% [{a,b}, {c,a},{a,c},{c,b},{b,c},{c,d},{d,c}],
|
|
||||||
%% [{a,b}, {d,a},{a,d},{d,b},{b,d},{d,c},{c,d}] ]
|
|
||||||
],
|
|
||||||
%% exit(end_experiment),
|
|
||||||
|
|
||||||
io:format(user, "\nSET partitions = []\n", []),
|
|
||||||
io:format(user, "We should see convergence to 1 correct chain.\n", []),
|
|
||||||
machi_partition_simulator:no_partitions(),
|
|
||||||
[DoIt(50, 10, 100) || _ <- [1]],
|
|
||||||
io:format(user, "Sweet, finishing early\n", []), exit(yoyoyo_testing_hack),
|
|
||||||
%% WARNING: In asymmetric partitions, private_projections_are_stable()
|
|
||||||
%% will never be true; code beyond this point on the -exp3
|
|
||||||
%% branch is bit-rotted, sorry!
|
|
||||||
true = private_projections_are_stable(Namez, DoIt),
|
|
||||||
io:format(user, "~s\n", [os:cmd("date")]),
|
|
||||||
|
|
||||||
%% We are stable now ... analyze it.
|
|
||||||
|
|
||||||
%% Create a report where at least one FLU has written a
|
|
||||||
%% private projection.
|
|
||||||
Report = unanimous_report(Namez),
|
|
||||||
%% ?D(Report),
|
|
||||||
|
|
||||||
%% Report is ordered by Epoch. For each private projection
|
|
||||||
%% written during any given epoch, confirm that all chain
|
|
||||||
%% members appear in only one unique chain, i.e., the sets of
|
|
||||||
%% unique chains are disjoint.
|
|
||||||
true = all_reports_are_disjoint(Report),
|
|
||||||
|
|
||||||
%% Given the report, we flip it around so that we observe the
|
|
||||||
%% sets of chain transitions relative to each FLU.
|
|
||||||
R_Chains = [extract_chains_relative_to_flu(FLU, Report) ||
|
|
||||||
FLU <- All_list],
|
|
||||||
%% ?D(R_Chains),
|
|
||||||
R_Projs = [{FLU, [chain_to_projection(FLU, Epoch, UPI, Repairing,
|
|
||||||
All_list) ||
|
|
||||||
{Epoch, UPI, Repairing} <- E_Chains]} ||
|
|
||||||
{FLU, E_Chains} <- R_Chains],
|
|
||||||
|
|
||||||
%% For each chain transition experienced by a particular FLU,
|
|
||||||
%% confirm that each state transition is OK.
|
|
||||||
try
|
|
||||||
[{FLU, true} = {FLU, ?MGR:projection_transitions_are_sane(Ps, FLU)} ||
|
|
||||||
{FLU, Ps} <- R_Projs],
|
|
||||||
io:format(user, "\nAll sanity checks pass, hooray!\n", [])
|
|
||||||
catch _Err:_What ->
|
|
||||||
io:format(user, "Report ~p\n", [Report]),
|
|
||||||
exit({line, ?LINE, _Err, _What})
|
|
||||||
end,
|
|
||||||
%% ?D(R_Projs),
|
|
||||||
|
|
||||||
ok
|
|
||||||
after
|
|
||||||
[ok = ?MGR:stop(MgrPid) || {_, MgrPid} <- MgrNamez],
|
|
||||||
[ok = machi_flu0:stop(FLUPid) || {_, FLUPid} <- Namez],
|
|
||||||
ok = machi_partition_simulator:stop()
|
|
||||||
end.
|
|
||||||
|
|
||||||
private_projections_are_stable(Namez, PollFunc) ->
|
|
||||||
Private1 = [machi_flu0:proj_get_latest_num(FLU, private) ||
|
|
||||||
{_Name, FLU} <- Namez],
|
|
||||||
PollFunc(5, 1, 10),
|
|
||||||
Private2 = [machi_flu0:proj_get_latest_num(FLU, private) ||
|
|
||||||
{_Name, FLU} <- Namez],
|
|
||||||
true = (Private1 == Private2).
|
|
||||||
|
|
||||||
all_hosed_lists_are_identical(Namez, Partition0) ->
|
|
||||||
Partition = lists:usort(Partition0),
|
|
||||||
Ps = [machi_flu0:proj_read_latest(FLU, private) || {_Name, FLU} <- Namez],
|
|
||||||
UniqueAllHoseds = lists:usort([machi_chain_manager1:get_all_hosed(P) ||
|
|
||||||
{ok, P} <- Ps]),
|
|
||||||
Members = [M || {M, _Pid} <- Namez],
|
|
||||||
Islands = machi_partition_simulator:partitions2num_islands(
|
|
||||||
Members, Partition),
|
|
||||||
%% io:format(user, "all_hosed_lists_are_identical:\n", []),
|
|
||||||
%% io:format(user, " Uniques = ~p Islands ~p\n Partition ~p\n",
|
|
||||||
%% [Uniques, Islands, Partition]),
|
|
||||||
case length(UniqueAllHoseds) of
|
|
||||||
1 ->
|
|
||||||
true;
|
|
||||||
%% TODO: With the addition of the digraph stuff below, the clause
|
|
||||||
%% below probably isn't necessary anymore, since the
|
|
||||||
%% digraph calculation should catch complete partition islands?
|
|
||||||
_ when Islands == 'many' ->
|
|
||||||
%% There are at least two partitions, so yes, it's quite
|
|
||||||
%% possible that the all_hosed lists may differ.
|
|
||||||
%% TODO Fix this up to be smarter about fully-isolated
|
|
||||||
%% islands of partition.
|
|
||||||
true;
|
|
||||||
_ ->
|
|
||||||
DG = digraph:new(),
|
|
||||||
Connection = machi_partition_simulator:partition2connection(
|
|
||||||
Members, Partition),
|
|
||||||
[digraph:add_vertex(DG, X) || X <- Members],
|
|
||||||
[digraph:add_edge(DG, X, Y) || {X,Y} <- Connection],
|
|
||||||
Any =
|
|
||||||
lists:any(
|
|
||||||
fun(X) ->
|
|
||||||
NotX = Members -- [X],
|
|
||||||
lists:any(
|
|
||||||
fun(Y) ->
|
|
||||||
%% There must be a shortest path of length
|
|
||||||
%% two in both directions, otherwise
|
|
||||||
%% the read projection call will fail.
|
|
||||||
%% And it's that failure that we're
|
|
||||||
%% interested in here.
|
|
||||||
XtoY = digraph:get_short_path(DG, X, Y),
|
|
||||||
YtoX = digraph:get_short_path(DG, Y, X),
|
|
||||||
(XtoY == false orelse
|
|
||||||
length(XtoY) > 2)
|
|
||||||
orelse
|
|
||||||
(YtoX == false orelse
|
|
||||||
length(YtoX) > 2)
|
|
||||||
end, NotX)
|
|
||||||
end, Members),
|
|
||||||
digraph:delete(DG),
|
|
||||||
if Any == true ->
|
|
||||||
%% There's a missing path of length 2 between some
|
|
||||||
%% two FLUs, so yes, there's going to be
|
|
||||||
%% non-identical all_hosed lists.
|
|
||||||
true;
|
|
||||||
true ->
|
|
||||||
false % There's no excuse, buddy
|
|
||||||
end
|
|
||||||
end.
|
|
||||||
|
|
||||||
-endif. % not PULSE
|
-endif. % not PULSE
|
||||||
-endif. % TEST
|
-endif. % TEST
|
||||||
|
|
Loading…
Reference in a new issue