machi/test/machi_chain_manager1_converge_demo.erl

555 lines
23 KiB
Erlang
Raw Normal View History

%% -------------------------------------------------------------------
%%
%% Machi: a small village of replicated files
%%
%% Copyright (c) 2014-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(machi_chain_manager1_converge_demo).
-include("machi.hrl").
-include("machi_projection.hrl").
-define(MGR, machi_chain_manager1).
-define(D(X), io:format(user, "~s ~p\n", [??X, X])).
-define(Dw(X), io:format(user, "~s ~w\n", [??X, X])).
-define(FLU_C, machi_flu1_client).
-define(FLU_PC, machi_proxy_flu1_client).
-compile(export_all).
-ifdef(TEST).
2015-06-02 13:13:15 +00:00
-ifndef(PULSE).
-ifdef(EQC).
-include_lib("eqc/include/eqc.hrl").
%% -include_lib("eqc/include/eqc_statem.hrl").
-define(QC_OUT(P),
eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)).
-endif.
-include_lib("eunit/include/eunit.hrl").
short_doc() ->
"
A visualization of the convergence behavior of the chain self-management
algorithm for Machi.
1. Set up 4 FLUs and chain manager pairs.
2. Create a number of different network partition scenarios, where
(simulated) partitions may be symmetric or asymmetric. Then halt changing
the partitions and keep the simulated network stable and broken.
3. Run a number of iterations of the algorithm in parallel by poking each
of the manager processes on a random'ish basis.
4. Afterward, fetch the chain transition changes made by each FLU and
verify that no transition was unsafe.
During the iteration periods, the following is a cheatsheet for the output.
See the internal source for interpreting the rest of the output.
'Let loose the dogs of war!' Network instability
'SET partitions = ' Network stability (but broken)
'x uses:' The FLU x has made an internal state transition. The rest of
the line is a dump of internal state.
'{t}' This is a tick event which triggers one of the manager processes
to evaluate its environment and perhaps make a state transition.
A long chain of '{t}{t}{t}{t}' means that the chain state has settled
to a stable configuration, which is the goal of the algorithm.
Press control-c to interrupt....".
long_doc() ->
"
'Let loose the dogs of war!'
The simulated network is very unstable for a few seconds.
'x uses'
After a single iteration, server x has determined that the chain
should be defined by the upi, repair, and down list in this record.
If all participants reach the same conclusion at the same epoch
number (and checksum, see next item below), then the chain is
stable, fully configured, and can provide full service.
'epoch,E'
The epoch number for this decision is E. The checksum of the full
record is not shown. For purposes of the protocol, a server will
'wedge' itself and refuse service (until a new config is chosen)
whenever: a). it sees a bigger epoch number mentioned somewhere, or
b). it sees the same epoch number but a different checksum. In case
of b), there was a network partition that has healed, and both sides
had chosen to operate with an identical epoch number but different
chain configs.
'upi', 'repair', and 'down'
Members in the chain that are fully in sync and thus preserving the
Update Propagation Invariant, up but under repair (simulated), and
down, respectively.
'ps,[some list]'
The list of asymmetric network partitions. {a,b} means that a
cannot send to b, but b can send to a.
This partition list is recorded for debugging purposes but is *not*
used by the algorithm. The algorithm only 'feels' its effects via
simulated timeout whenever there's a partition in one of the
messaging directions.
'nodes_up,[list]'
The best guess right now of which ndoes are up, relative to the
author node, specified by '{author,X}'
'SET partitions = [some list]'
All subsequent iterations should have a stable list of partitions,
i.e. the 'ps' list described should be stable.
'{FLAP: x flaps n}!'
Server x has detected that it's flapping/oscillating after iteration
n of a naive/1st draft detection algorithm.
".
%% ' silly Emacs syntax highlighting....
2015-04-09 12:32:04 +00:00
%% convergence_demo_test_() ->
%% {timeout, 98*300, fun() -> convergence_demo_testfun() end}.
2015-04-09 12:32:04 +00:00
%% convergence_demo_testfun() ->
%% convergence_demo_testfun(3).
-define(DEFAULT_MGR_OPTS, [{private_write_verbose, false},
{active_mode,false},
{use_partition_simulator, true}]).
t() ->
t(3).
t(N) ->
t(N, ?DEFAULT_MGR_OPTS).
t(N, MgrOpts) ->
convergence_demo_testfun(N, MgrOpts).
convergence_demo_testfun(NumFLUs, MgrOpts0) ->
timer:sleep(100),
2015-04-09 12:08:15 +00:00
%% Faster test startup, commented: io:format(user, short_doc(), []),
%% Faster test startup, commented: timer:sleep(3000),
2015-04-09 12:08:15 +00:00
TcpPort = 62877,
ok = filelib:ensure_dir("/tmp/c/not-used"),
2015-08-22 05:56:26 +00:00
FluInfo = [
{a,TcpPort+0,"/tmp/c/data.a"}, {b,TcpPort+1,"/tmp/c/data.b"},
2015-07-06 07:11:14 +00:00
{c,TcpPort+2,"/tmp/c/data.c"}, {d,TcpPort+3,"/tmp/c/data.d"},
{e,TcpPort+4,"/tmp/c/data.e"}, {f,TcpPort+5,"/tmp/c/data.f"},
{g,TcpPort+6,"/tmp/c/data.g"}, {h,TcpPort+7,"/tmp/c/data.h"},
2015-08-22 05:56:26 +00:00
{i,TcpPort+8,"/tmp/c/data.i"}, {j,TcpPort+9,"/tmp/c/data.j"},
{k,TcpPort+10,"/tmp/c/data.k"}, {l,TcpPort+11,"/tmp/c/data.l"},
{m,TcpPort+12,"/tmp/c/data.m"}, {n,TcpPort+13,"/tmp/c/data.n"},
{o,TcpPort+14,"/tmp/c/data.o"}, {p,TcpPort+15,"/tmp/c/data.p"},
{q,TcpPort+16,"/tmp/c/data.q"}, {r,TcpPort+17,"/tmp/c/data.r"}
],
2015-04-09 12:08:15 +00:00
FLU_biglist = [X || {X,_,_} <- FluInfo],
All_list = lists:sublist(FLU_biglist, NumFLUs),
2015-06-02 11:55:18 +00:00
io:format(user, "\nSET # of FLUs = ~w members ~w).\n",
[NumFLUs, All_list]),
machi_partition_simulator:start_link({111,222,33}, 0, 100),
_ = machi_partition_simulator:get(All_list),
2015-04-09 12:08:15 +00:00
Ps = [#p_srvr{name=Name,address="localhost",port=Port} ||
{Name,Port,_Dir} <- lists:sublist(FluInfo, NumFLUs)],
PsDirs = lists:zip(Ps,
[Dir || {_,_,Dir} <- lists:sublist(FluInfo, NumFLUs)]),
FLU_pids = [machi_flu1_test:setup_test_flu(Name, Port, Dir) ||
2015-04-10 02:08:17 +00:00
{#p_srvr{name=Name,port=Port}, Dir} <- PsDirs],
2015-04-09 12:08:15 +00:00
Namez = [begin
{ok, PPid} = ?FLU_PC:start_link(P),
{Name, PPid}
2015-04-10 02:08:17 +00:00
end || {#p_srvr{name=Name}=P, _Dir} <- PsDirs],
2015-04-09 12:08:15 +00:00
MembersDict = machi_projection:make_members_dict(Ps),
MgrOpts = MgrOpts0 ++ ?DEFAULT_MGR_OPTS,
MgrNamez =
[begin
2015-04-09 12:08:15 +00:00
{ok, MPid} = ?MGR:start_link(P#p_srvr.name, MembersDict, MgrOpts),
{P#p_srvr.name, MPid}
end || P <- Ps],
try
[{_, Ma}|_] = MgrNamez,
{ok, P1} = ?MGR:test_calc_projection(Ma, false),
2015-04-09 12:08:15 +00:00
[ok = ?FLU_PC:write_projection(FLUPid, public, P1) ||
{_, FLUPid} <- Namez, FLUPid /= Ma],
machi_partition_simulator:reset_thresholds(10, 50),
_ = machi_partition_simulator:get(All_list),
Parent = self(),
DoIt = fun(Iters, S_min, S_max) ->
%% io:format(user, "\nDoIt: top\n\n", []),
io:format(user, "DoIt, ", []),
Pids = [spawn(fun() ->
random:seed(now()),
[begin
erlang:yield(),
S_max_rand = random:uniform(
S_max + 1),
%% io:format(user, "{t}", []),
Elapsed =
?MGR:sleep_ranked_order(
S_min, S_max_rand,
M_name, All_list),
_ = ?MGR:trigger_react_to_env(MMM),
%% Be more unfair by not
%% sleeping here.
2015-06-02 09:10:45 +00:00
% timer:sleep(S_max - Elapsed),
Elapsed
end || _ <- lists:seq(1, Iters)],
Parent ! done
end) || {M_name, MMM} <- MgrNamez ],
[receive
done ->
ok
after 120*1000 ->
exit(icky_timeout)
end || _ <- Pids]
end,
machi_partition_simulator:reset_thresholds(10, 50),
io:format(user, "\nLet loose the dogs of war!\n", []),
%% machi_partition_simulator:always_these_partitions([]),
%% io:format(user, "\nPuppies for everyone!\n", []),
2015-07-06 07:11:14 +00:00
[DoIt(30, 0, 0) || _ <- lists:seq(1,2)],
AllPs = make_partition_list(All_list),
PartitionCounts = lists:zip(AllPs, lists:seq(1, length(AllPs))),
MaxIters = NumFLUs * (NumFLUs + 1) * 6,
[begin
machi_partition_simulator:always_these_partitions(Partition),
io:format(user, "\nSET partitions = ~w (~w of ~w) at ~w\n",
[Partition, Count, length(AllPs), time()]),
true = lists:foldl(
fun(_, true) ->
true;
(_, _) ->
%% Run a few iterations
[DoIt(10, 10, 50) || _ <- lists:seq(1, 6)],
%% If stable, return true to short circuit remaining
private_projections_are_stable(Namez, DoIt)
end, false, lists:seq(0, MaxIters)),
2015-06-02 09:10:45 +00:00
io:format(user, "\nSweet, private projections are stable\n", []),
io:format(user, "\t~P\n", [get(stable), 14]),
WIP: major fixups to the chmgr state transition checking (more below) So, the PULSE test is failing, which is good. However, I believe that the failures are all due to the model now being *too strict*. The model is now catching failures which are now benign, I think. {bummer_NOT_DISJOINT,{[a,b,b,c,d], [{a,not_in_this_epoch}, {b,not_in_this_epoch}, {c,"[{epoch,1546},{author,c},{upi,[c]},{repair,[b]},{down,[a,d]},{d,[{ps,[{a,c},{c,a},{a,d},{b,d},{c,d}]},{nodes_up,[b,c]}]},{d2,[]}]"}, {d,"[{epoch,1546},{author,d},{upi,[d]},{repair,[a,b]},{down,[c]},{d,[{ps,[{c,b},{d,c}]},{nodes_up,[a,b,d]}]},{d2,[]}]"}]}}}, In this and all other examples, the UPIs are disjoint but the repairs are not disjoint. I believe the model ought to be ignoring the repair list. {bummer_NOT_DISJOINT,{[a,a,b], [{a,"[{epoch,1174},{author,a},{upi,[a]},{repair,[]},{down,[b]},{d,[{ps,[{a,b},{b,a}]},{nodes_up,[a]}]},{d2,[]}]"}, {b,"[{epoch,1174},{author,b},{upi,[b]},{repair,[a]},{down,[]},{d,[{ps,[]},{nodes_up,[a,b]}]},{d2,[]}]"}]}}}, or {bummer_NOT_DISJOINT,{[c,c,e], [{a,not_in_this_epoch}, {b,not_in_this_epoch}, {c,"[{epoch,1388},{author,c},{upi,[c]},{repair,[]},{down,[a,b,d,e]},{d,[{ps,[{a,b},{a,c},{c,a},{a,d},{d,a},{e,a},{c,b},{b,e},{e,b},{c,d},{e,c},{e,d}]},{nodes_up,[c]}]},{d2,[]}]"}, {d,not_in_this_epoch}, {e,"[{epoch,1388},{author,e},{upi,[e]},{repair,[c]},{down,[a,b,d]},{d,[{ps,[{a,b},{b,a},{a,c},{c,a},{a,d},{d,a},{a,e},{e,a},{b,c},{c,b},{b,d},{b,e},{e,b},{c,d},{d,c},{d,e},{e,d}]},{nodes_up,[c,e]}]},{d2,[]}]"}]}}},
2015-07-07 13:11:19 +00:00
io:format(user, "Rolling sanity check ... ", []),
MaxFiles = 1*1000,
WIP: major fixups to the chmgr state transition checking (more below) So, the PULSE test is failing, which is good. However, I believe that the failures are all due to the model now being *too strict*. The model is now catching failures which are now benign, I think. {bummer_NOT_DISJOINT,{[a,b,b,c,d], [{a,not_in_this_epoch}, {b,not_in_this_epoch}, {c,"[{epoch,1546},{author,c},{upi,[c]},{repair,[b]},{down,[a,d]},{d,[{ps,[{a,c},{c,a},{a,d},{b,d},{c,d}]},{nodes_up,[b,c]}]},{d2,[]}]"}, {d,"[{epoch,1546},{author,d},{upi,[d]},{repair,[a,b]},{down,[c]},{d,[{ps,[{c,b},{d,c}]},{nodes_up,[a,b,d]}]},{d2,[]}]"}]}}}, In this and all other examples, the UPIs are disjoint but the repairs are not disjoint. I believe the model ought to be ignoring the repair list. {bummer_NOT_DISJOINT,{[a,a,b], [{a,"[{epoch,1174},{author,a},{upi,[a]},{repair,[]},{down,[b]},{d,[{ps,[{a,b},{b,a}]},{nodes_up,[a]}]},{d2,[]}]"}, {b,"[{epoch,1174},{author,b},{upi,[b]},{repair,[a]},{down,[]},{d,[{ps,[]},{nodes_up,[a,b]}]},{d2,[]}]"}]}}}, or {bummer_NOT_DISJOINT,{[c,c,e], [{a,not_in_this_epoch}, {b,not_in_this_epoch}, {c,"[{epoch,1388},{author,c},{upi,[c]},{repair,[]},{down,[a,b,d,e]},{d,[{ps,[{a,b},{a,c},{c,a},{a,d},{d,a},{e,a},{c,b},{b,e},{e,b},{c,d},{e,c},{e,d}]},{nodes_up,[c]}]},{d2,[]}]"}, {d,not_in_this_epoch}, {e,"[{epoch,1388},{author,e},{upi,[e]},{repair,[c]},{down,[a,b,d]},{d,[{ps,[{a,b},{b,a},{a,c},{c,a},{a,d},{d,a},{a,e},{e,a},{b,c},{c,b},{b,d},{b,e},{e,b},{c,d},{d,c},{d,e},{e,d}]},{nodes_up,[c,e]}]},{d2,[]}]"}]}}},
2015-07-07 13:11:19 +00:00
PrivProjs = [{Name, begin
{ok, Ps8} = ?FLU_PC:get_all_projections(
FLU, private, infinity),
Ps9 = if length(Ps8) < MaxFiles ->
WIP: major fixups to the chmgr state transition checking (more below) So, the PULSE test is failing, which is good. However, I believe that the failures are all due to the model now being *too strict*. The model is now catching failures which are now benign, I think. {bummer_NOT_DISJOINT,{[a,b,b,c,d], [{a,not_in_this_epoch}, {b,not_in_this_epoch}, {c,"[{epoch,1546},{author,c},{upi,[c]},{repair,[b]},{down,[a,d]},{d,[{ps,[{a,c},{c,a},{a,d},{b,d},{c,d}]},{nodes_up,[b,c]}]},{d2,[]}]"}, {d,"[{epoch,1546},{author,d},{upi,[d]},{repair,[a,b]},{down,[c]},{d,[{ps,[{c,b},{d,c}]},{nodes_up,[a,b,d]}]},{d2,[]}]"}]}}}, In this and all other examples, the UPIs are disjoint but the repairs are not disjoint. I believe the model ought to be ignoring the repair list. {bummer_NOT_DISJOINT,{[a,a,b], [{a,"[{epoch,1174},{author,a},{upi,[a]},{repair,[]},{down,[b]},{d,[{ps,[{a,b},{b,a}]},{nodes_up,[a]}]},{d2,[]}]"}, {b,"[{epoch,1174},{author,b},{upi,[b]},{repair,[a]},{down,[]},{d,[{ps,[]},{nodes_up,[a,b]}]},{d2,[]}]"}]}}}, or {bummer_NOT_DISJOINT,{[c,c,e], [{a,not_in_this_epoch}, {b,not_in_this_epoch}, {c,"[{epoch,1388},{author,c},{upi,[c]},{repair,[]},{down,[a,b,d,e]},{d,[{ps,[{a,b},{a,c},{c,a},{a,d},{d,a},{e,a},{c,b},{b,e},{e,b},{c,d},{e,c},{e,d}]},{nodes_up,[c]}]},{d2,[]}]"}, {d,not_in_this_epoch}, {e,"[{epoch,1388},{author,e},{upi,[e]},{repair,[c]},{down,[a,b,d]},{d,[{ps,[{a,b},{b,a},{a,c},{c,a},{a,d},{d,a},{a,e},{e,a},{b,c},{c,b},{b,d},{b,e},{e,b},{c,d},{d,c},{d,e},{e,d}]},{nodes_up,[c,e]}]},{d2,[]}]"}]}}},
2015-07-07 13:11:19 +00:00
Ps8;
true ->
lists:nthtail(MaxFiles, Ps8)
WIP: major fixups to the chmgr state transition checking (more below) So, the PULSE test is failing, which is good. However, I believe that the failures are all due to the model now being *too strict*. The model is now catching failures which are now benign, I think. {bummer_NOT_DISJOINT,{[a,b,b,c,d], [{a,not_in_this_epoch}, {b,not_in_this_epoch}, {c,"[{epoch,1546},{author,c},{upi,[c]},{repair,[b]},{down,[a,d]},{d,[{ps,[{a,c},{c,a},{a,d},{b,d},{c,d}]},{nodes_up,[b,c]}]},{d2,[]}]"}, {d,"[{epoch,1546},{author,d},{upi,[d]},{repair,[a,b]},{down,[c]},{d,[{ps,[{c,b},{d,c}]},{nodes_up,[a,b,d]}]},{d2,[]}]"}]}}}, In this and all other examples, the UPIs are disjoint but the repairs are not disjoint. I believe the model ought to be ignoring the repair list. {bummer_NOT_DISJOINT,{[a,a,b], [{a,"[{epoch,1174},{author,a},{upi,[a]},{repair,[]},{down,[b]},{d,[{ps,[{a,b},{b,a}]},{nodes_up,[a]}]},{d2,[]}]"}, {b,"[{epoch,1174},{author,b},{upi,[b]},{repair,[a]},{down,[]},{d,[{ps,[]},{nodes_up,[a,b]}]},{d2,[]}]"}]}}}, or {bummer_NOT_DISJOINT,{[c,c,e], [{a,not_in_this_epoch}, {b,not_in_this_epoch}, {c,"[{epoch,1388},{author,c},{upi,[c]},{repair,[]},{down,[a,b,d,e]},{d,[{ps,[{a,b},{a,c},{c,a},{a,d},{d,a},{e,a},{c,b},{b,e},{e,b},{c,d},{e,c},{e,d}]},{nodes_up,[c]}]},{d2,[]}]"}, {d,not_in_this_epoch}, {e,"[{epoch,1388},{author,e},{upi,[e]},{repair,[c]},{down,[a,b,d]},{d,[{ps,[{a,b},{b,a},{a,c},{c,a},{a,d},{d,a},{a,e},{e,a},{b,c},{c,b},{b,d},{b,e},{e,b},{c,d},{d,c},{d,e},{e,d}]},{nodes_up,[c,e]}]},{d2,[]}]"}]}}},
2015-07-07 13:11:19 +00:00
end,
[P || P <- Ps9,
P#projection_v1.epoch_number /= 0]
end} || {Name, FLU} <- Namez],
try
[{FLU, true} = {FLU, ?MGR:projection_transitions_are_sane_retrospective(Psx, FLU)} ||
{FLU, Psx} <- PrivProjs]
catch _Err:_What ->
io:format(user, "PrivProjs ~p\n", [PrivProjs]),
exit({line, ?LINE, _Err, _What})
end,
io:format(user, "Yay!\n", []),
%% io:format(user, "\n\nEXITING!\n\n", []), timer:sleep(500), erlang:halt(0),
WIP: major fixups to the chmgr state transition checking (more below) So, the PULSE test is failing, which is good. However, I believe that the failures are all due to the model now being *too strict*. The model is now catching failures which are now benign, I think. {bummer_NOT_DISJOINT,{[a,b,b,c,d], [{a,not_in_this_epoch}, {b,not_in_this_epoch}, {c,"[{epoch,1546},{author,c},{upi,[c]},{repair,[b]},{down,[a,d]},{d,[{ps,[{a,c},{c,a},{a,d},{b,d},{c,d}]},{nodes_up,[b,c]}]},{d2,[]}]"}, {d,"[{epoch,1546},{author,d},{upi,[d]},{repair,[a,b]},{down,[c]},{d,[{ps,[{c,b},{d,c}]},{nodes_up,[a,b,d]}]},{d2,[]}]"}]}}}, In this and all other examples, the UPIs are disjoint but the repairs are not disjoint. I believe the model ought to be ignoring the repair list. {bummer_NOT_DISJOINT,{[a,a,b], [{a,"[{epoch,1174},{author,a},{upi,[a]},{repair,[]},{down,[b]},{d,[{ps,[{a,b},{b,a}]},{nodes_up,[a]}]},{d2,[]}]"}, {b,"[{epoch,1174},{author,b},{upi,[b]},{repair,[a]},{down,[]},{d,[{ps,[]},{nodes_up,[a,b]}]},{d2,[]}]"}]}}}, or {bummer_NOT_DISJOINT,{[c,c,e], [{a,not_in_this_epoch}, {b,not_in_this_epoch}, {c,"[{epoch,1388},{author,c},{upi,[c]},{repair,[]},{down,[a,b,d,e]},{d,[{ps,[{a,b},{a,c},{c,a},{a,d},{d,a},{e,a},{c,b},{b,e},{e,b},{c,d},{e,c},{e,d}]},{nodes_up,[c]}]},{d2,[]}]"}, {d,not_in_this_epoch}, {e,"[{epoch,1388},{author,e},{upi,[e]},{repair,[c]},{down,[a,b,d]},{d,[{ps,[{a,b},{b,a},{a,c},{c,a},{a,d},{d,a},{a,e},{e,a},{b,c},{c,b},{b,d},{b,e},{e,b},{c,d},{d,c},{d,e},{e,d}]},{nodes_up,[c,e]}]},{d2,[]}]"}]}}},
2015-07-07 13:11:19 +00:00
ReportXX = machi_chain_manager1_test:unanimous_report(Namez),
true = machi_chain_manager1_test:all_reports_are_disjoint(ReportXX),
io:format(user, "Yay for ReportXX!\n", []),
[begin
Privs = filelib:wildcard(Dir ++ "/projection/private/*"),
FilesToDel1 = lists:sublist(Privs,
max(0, length(Privs)-MaxFiles)),
[_ = file:delete(File) || File <- FilesToDel1],
Pubs = filelib:wildcard(Dir ++ "/projection/public/*"),
FilesToDel2 = lists:sublist(Pubs,
max(0, length(Pubs)-MaxFiles)),
[_ = file:delete(File) || File <- FilesToDel2]
end || Dir <- filelib:wildcard("/tmp/c/data*")],
2015-06-02 09:10:45 +00:00
timer:sleep(1250),
ok
end || {Partition, Count} <- PartitionCounts
],
io:format(user, "\nSET partitions = []\n", []),
io:format(user, "We should see convergence to 1 correct chain.\n", []),
machi_partition_simulator:no_partitions(),
2015-06-02 09:10:45 +00:00
[DoIt(50, 10, 50) || _ <- [1]],
true = private_projections_are_stable(Namez, DoIt),
io:format(user, "~s\n", [os:cmd("date")]),
%% We are stable now ... analyze it.
%% Create a report where at least one FLU has written a
%% private projection.
Report = machi_chain_manager1_test:unanimous_report(Namez),
%% ?D(Report),
%% Report is ordered by Epoch. For each private projection
%% written during any given epoch, confirm that all chain
%% members appear in only one unique chain, i.e., the sets of
%% unique chains are disjoint.
true = machi_chain_manager1_test:all_reports_are_disjoint(Report),
WIP: major fixups to the chmgr state transition checking (more below) So, the PULSE test is failing, which is good. However, I believe that the failures are all due to the model now being *too strict*. The model is now catching failures which are now benign, I think. {bummer_NOT_DISJOINT,{[a,b,b,c,d], [{a,not_in_this_epoch}, {b,not_in_this_epoch}, {c,"[{epoch,1546},{author,c},{upi,[c]},{repair,[b]},{down,[a,d]},{d,[{ps,[{a,c},{c,a},{a,d},{b,d},{c,d}]},{nodes_up,[b,c]}]},{d2,[]}]"}, {d,"[{epoch,1546},{author,d},{upi,[d]},{repair,[a,b]},{down,[c]},{d,[{ps,[{c,b},{d,c}]},{nodes_up,[a,b,d]}]},{d2,[]}]"}]}}}, In this and all other examples, the UPIs are disjoint but the repairs are not disjoint. I believe the model ought to be ignoring the repair list. {bummer_NOT_DISJOINT,{[a,a,b], [{a,"[{epoch,1174},{author,a},{upi,[a]},{repair,[]},{down,[b]},{d,[{ps,[{a,b},{b,a}]},{nodes_up,[a]}]},{d2,[]}]"}, {b,"[{epoch,1174},{author,b},{upi,[b]},{repair,[a]},{down,[]},{d,[{ps,[]},{nodes_up,[a,b]}]},{d2,[]}]"}]}}}, or {bummer_NOT_DISJOINT,{[c,c,e], [{a,not_in_this_epoch}, {b,not_in_this_epoch}, {c,"[{epoch,1388},{author,c},{upi,[c]},{repair,[]},{down,[a,b,d,e]},{d,[{ps,[{a,b},{a,c},{c,a},{a,d},{d,a},{e,a},{c,b},{b,e},{e,b},{c,d},{e,c},{e,d}]},{nodes_up,[c]}]},{d2,[]}]"}, {d,not_in_this_epoch}, {e,"[{epoch,1388},{author,e},{upi,[e]},{repair,[c]},{down,[a,b,d]},{d,[{ps,[{a,b},{b,a},{a,c},{c,a},{a,d},{d,a},{a,e},{e,a},{b,c},{c,b},{b,d},{b,e},{e,b},{c,d},{d,c},{d,e},{e,d}]},{nodes_up,[c,e]}]},{d2,[]}]"}]}}},
2015-07-07 13:11:19 +00:00
%% io:format(user, "\nLast Reports: ~p\n", [lists:nthtail(length(Report)-8,Report)]),
%% For each chain transition experienced by a particular FLU,
%% confirm that each state transition is OK.
PrivProjs = [{Name, begin
{ok, Ps9} = ?FLU_PC:get_all_projections(FLU,
private),
[P || P <- Ps9,
P#projection_v1.epoch_number /= 0]
end} || {Name, FLU} <- Namez],
try
[{FLU, true} = {FLU, ?MGR:projection_transitions_are_sane_retrospective(Psx, FLU)} ||
{FLU, Psx} <- PrivProjs],
io:format(user, "\nAll sanity checks pass, hooray!\n", [])
catch _Err:_What ->
io:format(user, "Report ~p\n", [Report]),
io:format(user, "PrivProjs ~p\n", [PrivProjs]),
exit({line, ?LINE, _Err, _What})
end,
%% ?D(R_Projs),
ok
2015-04-09 12:08:15 +00:00
catch
XX:YY ->
io:format(user, "BUMMER ~p ~p @ ~p\n",
[XX, YY, erlang:get_stacktrace()]),
exit({bummer,XX,YY})
after
[ok = ?MGR:stop(MgrPid) || {_, MgrPid} <- MgrNamez],
2015-04-09 12:08:15 +00:00
[ok = ?FLU_PC:quit(PPid) || {_, PPid} <- Namez],
2015-04-09 12:32:04 +00:00
[ok = machi_flu1:stop(FLUPid) || FLUPid <- FLU_pids],
ok = machi_partition_simulator:stop()
end.
2015-06-02 09:10:45 +00:00
%% Many of the static partition lists below have been problematic at one
%% time or another.....
%%
%% Uncomment *one* of the following make_partition_list() bodies.
make_partition_list(All_list) ->
_X_Ys1 = [[{X,Y}] || X <- All_list, Y <- All_list, X /= Y],
_X_Ys2 = [[{X,Y}, {A,B}] || X <- All_list, Y <- All_list, X /= Y,
A <- All_list, B <- All_list, A /= B,
X /= A],
_X_Ys3 = [[{X,Y}, {A,B}, {C,D}] || X <- All_list, Y <- All_list, X /= Y,
A <- All_list, B <- All_list, A /= B,
C <- All_list, D <- All_list, C /= D,
X /= A, X /= C, A /= C],
2015-07-07 06:02:29 +00:00
%% Concat = _X_Ys1,
%% Concat = _X_Ys2,
%% Concat = _X_Ys1 ++ _X_Ys2,
%% %% Concat = _X_Ys3,
%% Concat = _X_Ys1 ++ _X_Ys2 ++ _X_Ys3,
%% random_sort(lists:usort([lists:sort(L) || L <- Concat])).
2015-06-02 09:10:45 +00:00
%% [ [{a,b},{b,d},{c,b}],
%% [{a,b},{b,d},{c,b}, {a,b},{b,a},{a,c},{c,a},{a,d},{d,a}],
%% %% [{a,b},{b,d},{c,b}, {b,a},{a,b},{b,c},{c,b},{b,d},{d,b}],
%% [{a,b},{b,d},{c,b}, {c,a},{a,c},{c,b},{b,c},{c,d},{d,c}],
%% [{a,b},{b,d},{c,b}, {d,a},{a,d},{d,b},{b,d},{d,c},{c,d}] ].
%% [ [{a,b}, {b,c}],
%% [{a,b}, {c,b}] ].
2015-07-06 07:11:14 +00:00
%% [ [{a,b}, {b,c}] ]. %% hosed-not-equal @ 3 FLUs
2015-06-02 09:10:45 +00:00
2015-07-06 07:11:14 +00:00
%% [ [{b,d}] ].
%% [ [{a,b}], [], [{a,b}], [], [{a,b}] ].
2015-06-02 09:10:45 +00:00
2015-07-07 06:02:29 +00:00
%% [
%% %% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
%% %% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
%% %% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
%% %% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
%% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
%% [{b,a},{d,e}],
%% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], []
%% ].
2015-06-02 09:10:45 +00:00
%% [ [{a,b}, {b,a}] ].
[
[{c,b}, {c,a}],
[{b,c}, {b,a}]
].
%% [
%% [{a,b}], [],
%% [{b,a}, {b,c}], [],
%% [{c,b}, {c,a}, {d,c}], [],
%% [{c,b}, {c,a}], [],
%% [{b,a}, {c,a}], [],
%% [{a,b}, {c,b}], [],
%% [{b,c}, {a,c}]
%% ].
2015-06-02 09:10:45 +00:00
%% [ [{a,b},{b,c},{c,a}],
%% [{a,b}, {b,a}, {a,c},{c,a}] ].
2015-07-06 07:11:14 +00:00
%% [ [{a,b}, {c,b}],
%% [{a,b}, {b,c}] ].
2015-06-02 09:10:45 +00:00
%% [ [{a,b}, {b,c}, {c,d}],
%% [{a,b}, {b,c},{b,d}, {c,d}],
%% [{b,a}, {b,c}, {c,d}],
%% [{a,b}, {c,b}, {c,d}],
%% [{a,b}, {b,c}, {d,c}] ].
%% [
%% %% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
%% %% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
%% %% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
%% %% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
%% [{a,b}], [], [{a,b}], [], [{a,b}]
%% %% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
%% %% [{b,a},{d,e}],
%% %% [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], []
%% ].
2015-06-02 09:10:45 +00:00
%% [ [{a,b}, {b,c}, {c,d}, {d,e}],
%% [{b,a}, {b,c}, {c,d}, {d,e}],
%% [{a,b}, {c,b}, {c,d}, {d,e}],
%% [{a,b}, {b,c}, {d,c}, {d,e}],
%% [{a,b}, {b,c}, {c,d}, {e,d}] ].
2015-07-06 07:11:14 +00:00
%% [ [{c,a}] ]. %% TODO double-check for total repair stability at SET=[]!!
2015-06-02 09:10:45 +00:00
2015-07-06 07:11:14 +00:00
%% [ [{c,a}],
%% [{c,b}, {a, b}] ].
2015-06-02 09:10:45 +00:00
%% [ [{a,b},{b,a}, {a,c},{c,a}, {a,d},{d,a}],
%% [{a,b},{b,a}, {a,c},{c,a}, {a,d},{d,a}, {b,c}],
%% [{a,b},{b,a}, {a,c},{c,a}, {a,d},{d,a}, {c,d}] ].
%% [ [{a,b}, {a,b},{b,a},{a,c},{c,a},{a,d},{d,a}],
2015-06-02 09:10:45 +00:00
%% [{a,b}, {b,a},{a,b},{b,c},{c,b},{b,d},{d,b}],
%% [{a,b}],
2015-06-02 09:10:45 +00:00
%% [{a,b}, {c,a},{a,c},{c,b},{b,c},{c,d},{d,c}],
%% [{a,b}, {d,a},{a,d},{d,b},{b,d},{d,c},{c,d}] ].
todo_why_does_this_crash_sometimes(FLUName, FLU, PPPepoch) ->
try
{ok, _}=Res = ?FLU_PC:read_projection(FLU, public, PPPepoch),
Res
catch _:_ ->
io:format(user, "QQQ Whoa, it crashed this time for ~p at epoch ~p\n",
[FLUName, PPPepoch]),
timer:sleep(1000),
2015-06-02 09:10:45 +00:00
exit(still_a_problem),
?FLU_PC:read_projection(FLU, public, PPPepoch)
end.
private_projections_are_stable(Namez, PollFunc) ->
Private1 = [{Name, get_latest_inner_proj_summ(FLU)} || {Name,FLU} <- Namez],
[PollFunc(5, 1, 10) || _ <- lists:seq(1,2)],
Private2 = [{Name, get_latest_inner_proj_summ(FLU)} || {Name,FLU} <- Namez],
%% Is = [Inner_p || {_,_,_,_,Inner_p} <- Private1],
put(stable, lists:sort(Private1)),
%% We want either all true or all false (inner or not) ... except
%% that it isn't quite that simple. I've now witnessed a case
%% where the projections are stable but not everyone is
%% unanimously outer or unanimously inner!
%% Old partitions: [{a,b},{b,c},{c,a}]
%% result: all 3 had inner proj of [self]
%% New partitions: [{b,a},{c,b}]
%% Priv1 [{342,[c,a],[],[b],[],false},
%% {326,[b],[],[a,c],[],true},
%% {342,[c,a],[],[b],[],false}]
%% ... and it stays completely stable with these epoch #s.
%%
%% So, instead, if inner/outer status isn't unanimous, then we
%% should check to see if the sets of unique UPIs are disjoint.
%%
FLUs = [FLU || {FLU,_Pid} <- Namez],
U_UPI_Rs = lists:usort([UPI++Rep ||
{_Nm,{_Epoch,UPI,Rep,_Dn,_W,InnerP}} <- Private2]),
FLU_uses = [{Name, Epoch} ||
{Name,{Epoch,_UPI,Rep,_Dn,_W,InnerP}} <- Private2],
Witnesses = hd([Ws ||
{_Name,{_Epoch,_UPI,Rep,_Dn,Ws,InnerP}} <- Private2]),
HaveWitnesses_p = Witnesses /= [],
Unanimous_with_all_peers_p =
lists:all(fun({FLU, UsesEpoch}) ->
WhoInEpoch = [Name ||
{Name,{Epoch,_UPI,_Rep,_Dn,_W,I_}}<-Private2,
Epoch == UsesEpoch],
WhoInEpoch_s = ordsets:from_list(WhoInEpoch),
UPI_R_versions = [UPI++Rep ||
{_Name,{Epoch,UPI,Rep,_Dn,_W,I_}}<-Private2,
Epoch == UsesEpoch],
UPI_R_vers_s = ordsets:from_list(hd(UPI_R_versions)),
UPI_R_versions == [ [] ] % This FLU in minority partition
orelse
(length(lists:usort(UPI_R_versions)) == 1
andalso
ordsets:is_subset(UPI_R_vers_s, WhoInEpoch_s))
end, FLU_uses),
%% io:format(user, "\nPriv1 ~P agree ~p\n", [lists:sort(Private1), 14, Unanimous_with_all_peers_p]),
%%io:format(user, "U_UPI_Rs ~p\n", [U_UPI_Rs]),
%%io:format(user, "FLUs ~p\n", [FLUs]),
%%io:format(user, "Unanimous_with_all_peers_p ~p\n", [Unanimous_with_all_peers_p]),
Flat_U_UPI_Rs = lists:flatten(U_UPI_Rs),
Private1 == Private2 andalso
%% If not disjoint, then a flu will appear twice in flattented U_UPIs.
lists:sort(Flat_U_UPI_Rs) == lists:usort(Flat_U_UPI_Rs) andalso
%% Another property that we want is that for each participant
%% X mentioned in a UPI or Repairing list of some epoch E that
%% X is using the same epoch E.
%%
%% It's possible (in theory) for humming consensus to agree on
%% the membership of UPI+Repairing but arrive those lists at
%% different epoch numbers. Machi chain replication won't
%% work in that case: all participants need to be using the
%% same epoch (and csum)! (NOTE: We ignore epoch_csum here.)
Unanimous_with_all_peers_p.
2015-06-02 09:10:45 +00:00
get_latest_inner_proj_summ(FLU) ->
{ok, Proj} = ?FLU_PC:read_latest_projection(FLU, private),
#projection_v1{epoch_number=E, epoch_csum=CSum,
upi=UPI, repairing=Repairing,
witnesses=Witnesses, down=Down} =
2015-06-02 09:10:45 +00:00
machi_chain_manager1:inner_projection_or_self(Proj),
Inner_p = machi_chain_manager1:inner_projection_exists(Proj),
EpochID = {E, CSum},
{EpochID, UPI, Repairing, Down, Witnesses, Inner_p}.
2015-06-02 09:10:45 +00:00
random_sort(L) ->
random:seed(now()),
L1 = [{random:uniform(99999), X} || X <- L],
[X || {_, X} <- lists:sort(L1)].
2015-06-02 13:13:15 +00:00
-endif. % !PULSE
-endif. % TEST