From 0a4c0f963ea1aada716f253ae3368d24a9389b95 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 25 Aug 2015 19:12:23 +0900 Subject: [PATCH 01/51] Add failing test case for annotating private projections via dbg2 list --- .gitignore | 6 +++ test/machi_projection_store_test.erl | 60 ++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 test/machi_projection_store_test.erl diff --git a/.gitignore b/.gitignore index 5243bad..6ad1c94 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,9 @@ edoc # PB artifacts for Erlang include/machi_pb.hrl + +# Misc Scott cruft +*.patch +current_counterexample.eqc +foo* +typescript* diff --git a/test/machi_projection_store_test.erl b/test/machi_projection_store_test.erl new file mode 100644 index 0000000..96cafdf --- /dev/null +++ b/test/machi_projection_store_test.erl @@ -0,0 +1,60 @@ +%% ------------------------------------------------------------------- +%% +%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved. +%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. +%% +%% ------------------------------------------------------------------- + +-module(machi_projection_store_test). + +-ifdef(TEST). +-ifndef(PULSE). + +-compile(export_all). +-define(PS, machi_projection_store). + +-include("machi_projection.hrl"). 
+ +smoke_test() -> + {ok, SupPid} = machi_flu_sup:start_link(), + PortBase = 64820, + Dir = "./data.a", + Os = [{ignore_stability_time, true}, {active_mode, false}], + os:cmd("rm -rf " ++ Dir), + {ok,Yo}=machi_flu_psup:start_flu_package(a, PortBase, "./data.a", Os), + + try + P1 = machi_projection:new(1, a, [], [], [], [], []), + ok = ?PS:write(a_pstore, public, P1), + {error, written} = ?PS:write(a_pstore, public, P1), + + ok = ?PS:write(a_pstore, private, P1), + {error, written} = ?PS:write(a_pstore, private, P1), + P1b = P1#projection_v1{dbg2=[version_b]}, + ok = ?PS:write(a_pstore, private, P1b), + P1c = P1#projection_v1{dbg2=[version_c]}, + ok = ?PS:write(a_pstore, private, P1c), + {error, written} = ?PS:write(a_pstore, private, P1c), + + ok + after + machi_flu_psup:stop_flu_package(a), + exit(SupPid, normal), + timer:sleep(10) + end. + +-endif. % !PULSE +-endif. % TEST From 83f49472dbf957a6caf29006a5035f7166dfd653 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 25 Aug 2015 19:31:05 +0900 Subject: [PATCH 02/51] WIP: intermediate refactoring --- src/machi_projection_store.erl | 93 ++++++++++++++++------------ test/machi_projection_store_test.erl | 3 + 2 files changed, 58 insertions(+), 38 deletions(-) diff --git a/src/machi_projection_store.erl b/src/machi_projection_store.erl index b75197d..0cfb79e 100644 --- a/src/machi_projection_store.erl +++ b/src/machi_projection_store.erl @@ -267,56 +267,73 @@ do_proj_write(private=ProjType, #projection_v1{epoch_number=Epoch}=Proj, S) -> case S#state.max_public_epochid of {PublicEpoch, _} when PublicEpoch =< Epoch -> do_proj_write2(ProjType, Proj, S); - {PublicEpoch, _} -> + {_PublicEpoch, _} -> {{error, bad_arg}, S} end. 
-do_proj_write2(ProjType, #projection_v1{epoch_number=Epoch}=Proj, S) -> +do_proj_write2(ProjType, #projection_v1{epoch_csum=CSum}=Proj, S) -> + case (machi_projection:update_checksum(Proj))#projection_v1.epoch_csum of + CSum2 when CSum2 == CSum -> + do_proj_write3(ProjType, Proj, S); + _Else -> + io:format(user, "Oops ~s epoch ~w csum ~W expected ~W\n", [S#state.public_dir, Proj#projection_v1.epoch_number, _Else, 6, CSum, 6]), + {{error, bad_arg}, S} + end. + +do_proj_write3(ProjType, #projection_v1{epoch_number=Epoch, + epoch_csum=CSum}=Proj, S) -> %% TODO: We probably ought to check the projection checksum for sanity, eh? Dir = pick_path(ProjType, S), Path = filename:join(Dir, epoch2name(Epoch)), - case file:read_file_info(Path) of - {ok, _FI} -> + case file:read_file(Path) of + {ok, _Bin} when ProjType == public -> + {{error, written}, S}; + {ok, Bin} when ProjType == private -> + #projection_v1{epoch_number=CurEpoch, + epoch_csum=CurCSum} = _CurProj = binary_to_term(Bin), {{error, written}, S}; {error, enoent} -> - {ok, FH} = file:open(Path, [write, raw, binary]), - ok = file:write(FH, term_to_binary(Proj)), - ok = file:sync(FH), - ok = file:close(FH), - EffectiveProj = machi_chain_manager1:inner_projection_or_self(Proj), - EffectiveEpoch = EffectiveProj#projection_v1.epoch_number, - EpochId = {Epoch, Proj#projection_v1.epoch_csum}, - EffectiveEpochId = {EffectiveEpoch, EffectiveProj#projection_v1.epoch_csum}, - %% - NewS = if ProjType == public, - Epoch > element(1, S#state.max_public_epochid) -> - if Epoch == EffectiveEpoch -> - %% This is a regular projection, i.e., - %% does not have an inner proj. - update_wedge_state( - S#state.wedge_notify_pid, true, - EffectiveEpochId); - Epoch /= EffectiveEpoch -> - %% This projection has an inner proj. - %% The outer proj is flapping, so we do - %% not bother wedging. 
- ok - end, - S#state{max_public_epochid=EpochId}; - ProjType == private, - Epoch > element(1, S#state.max_private_epochid) -> - update_wedge_state( - S#state.wedge_notify_pid, false, - EffectiveEpochId), - S#state{max_private_epochid=EpochId}; - true -> - S - end, - {ok, NewS}; + do_proj_write4(ProjType, Proj, Path, Epoch, S); {error, Else} -> {{error, Else}, S} end. +do_proj_write4(ProjType, Proj, Path, Epoch, S) -> + {ok, FH} = file:open(Path, [write, raw, binary]), + ok = file:write(FH, term_to_binary(Proj)), + ok = file:sync(FH), + ok = file:close(FH), + EffectiveProj = machi_chain_manager1:inner_projection_or_self(Proj), + EffectiveEpoch = EffectiveProj#projection_v1.epoch_number, + EpochId = {Epoch, Proj#projection_v1.epoch_csum}, + EffectiveEpochId = {EffectiveEpoch, EffectiveProj#projection_v1.epoch_csum}, + %% + NewS = if ProjType == public, + Epoch > element(1, S#state.max_public_epochid) -> + if Epoch == EffectiveEpoch -> + %% This is a regular projection, i.e., + %% does not have an inner proj. + update_wedge_state( + S#state.wedge_notify_pid, true, + EffectiveEpochId); + Epoch /= EffectiveEpoch -> + %% This projection has an inner proj. + %% The outer proj is flapping, so we do + %% not bother wedging. + ok + end, + S#state{max_public_epochid=EpochId}; + ProjType == private, + Epoch > element(1, S#state.max_private_epochid) -> + update_wedge_state( + S#state.wedge_notify_pid, false, + EffectiveEpochId), + S#state{max_private_epochid=EpochId}; + true -> + S + end, + {ok, NewS}. + update_wedge_state(PidSpec, Boolean, {0,_}=EpochId) -> %% Epoch #0 is a special case: no projection has been written yet. 
%% However, given the way that machi_flu_psup starts the diff --git a/test/machi_projection_store_test.erl b/test/machi_projection_store_test.erl index 96cafdf..eca6245 100644 --- a/test/machi_projection_store_test.erl +++ b/test/machi_projection_store_test.erl @@ -41,6 +41,9 @@ smoke_test() -> ok = ?PS:write(a_pstore, public, P1), {error, written} = ?PS:write(a_pstore, public, P1), + Pbad = P1#projection_v1{epoch_number=99238}, % break checksum + {error, bad_arg} = ?PS:write(a_pstore, public, Pbad), + ok = ?PS:write(a_pstore, private, P1), {error, written} = ?PS:write(a_pstore, private, P1), P1b = P1#projection_v1{dbg2=[version_b]}, From c0ee323637c19efc6d15a1f939a0e3becfb86b79 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 25 Aug 2015 19:42:33 +0900 Subject: [PATCH 03/51] Our new unit test works, yay --- src/machi_projection_store.erl | 12 ++++++++++-- test/machi_projection_store_test.erl | 6 ++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/machi_projection_store.erl b/src/machi_projection_store.erl index 0cfb79e..c5cd371 100644 --- a/src/machi_projection_store.erl +++ b/src/machi_projection_store.erl @@ -276,7 +276,6 @@ do_proj_write2(ProjType, #projection_v1{epoch_csum=CSum}=Proj, S) -> CSum2 when CSum2 == CSum -> do_proj_write3(ProjType, Proj, S); _Else -> - io:format(user, "Oops ~s epoch ~w csum ~W expected ~W\n", [S#state.public_dir, Proj#projection_v1.epoch_number, _Else, 6, CSum, 6]), {{error, bad_arg}, S} end. @@ -291,7 +290,16 @@ do_proj_write3(ProjType, #projection_v1{epoch_number=Epoch, {ok, Bin} when ProjType == private -> #projection_v1{epoch_number=CurEpoch, epoch_csum=CurCSum} = _CurProj = binary_to_term(Bin), - {{error, written}, S}; + %% We've already checked that CSum is correct matches the + %% contents of this new projection version. 
If the epoch_csum + %% values match, and if we trust the value on disk (TODO paranoid + %% check that, also), then the only difference must be the dbg2 + %% list, which is ok. + if CurCSum == CSum -> + do_proj_write4(ProjType, Proj, Path, Epoch, S); + true -> + {{error, written}, S} + end; {error, enoent} -> do_proj_write4(ProjType, Proj, Path, Epoch, S); {error, Else} -> diff --git a/test/machi_projection_store_test.erl b/test/machi_projection_store_test.erl index eca6245..db5ffe8 100644 --- a/test/machi_projection_store_test.erl +++ b/test/machi_projection_store_test.erl @@ -45,12 +45,14 @@ smoke_test() -> {error, bad_arg} = ?PS:write(a_pstore, public, Pbad), ok = ?PS:write(a_pstore, private, P1), - {error, written} = ?PS:write(a_pstore, private, P1), + P1a = machi_projection:update_checksum(P1#projection_v1{dbg=[diff_yo]}), + {error, written} = ?PS:write(a_pstore, private, P1a), + P1b = P1#projection_v1{dbg2=[version_b]}, ok = ?PS:write(a_pstore, private, P1b), P1c = P1#projection_v1{dbg2=[version_c]}, ok = ?PS:write(a_pstore, private, P1c), - {error, written} = ?PS:write(a_pstore, private, P1c), + {error, written} = ?PS:write(a_pstore, private, P1a), ok after From c12231c7b62b6e405c389b39a0af7457e85e0127 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 25 Aug 2015 19:45:31 +0900 Subject: [PATCH 04/51] Fix other tests to accomodate new semantics --- test/machi_flu1_test.erl | 5 ++++- test/machi_proxy_flu1_client_test.erl | 12 ++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/test/machi_flu1_test.erl b/test/machi_flu1_test.erl index 7ad11f2..6420be5 100644 --- a/test/machi_flu1_test.erl +++ b/test/machi_flu1_test.erl @@ -186,7 +186,10 @@ flu_projection_common(Host, TcpPort, T) -> P_a = #p_srvr{name=a, address="localhost", port=4321}, P1 = machi_projection:new(1, a, [P_a], [], [a], [], []), ok = ?FLU_C:write_projection(Host, TcpPort, T, P1), - {error, written} = ?FLU_C:write_projection(Host, TcpPort, T, P1), + case 
?FLU_C:write_projection(Host, TcpPort, T, P1) of + {error, written} when T == public -> ok; + ok when T == private -> ok + end, {ok, P1} = ?FLU_C:read_projection(Host, TcpPort, T, 1), {ok, {1,_}} = ?FLU_C:get_latest_epochid(Host, TcpPort, T), {ok, P1} = ?FLU_C:read_latest_projection(Host, TcpPort, T), diff --git a/test/machi_proxy_flu1_client_test.erl b/test/machi_proxy_flu1_client_test.erl index aba5612..20faa3a 100644 --- a/test/machi_proxy_flu1_client_test.erl +++ b/test/machi_proxy_flu1_client_test.erl @@ -128,7 +128,8 @@ flu_restart_test() -> infinity), P_a = #p_srvr{name=a, address="localhost", port=6622}, P1 = machi_projection:new(1, a, [P_a], [], [a], [], []), - P1xx = P1#projection_v1{dbg2=["not exactly the same as P1!!!"]}, + P1xx = P1#projection_v1{dbg2=["dbg2 changes are ok"]}, + P1yy = P1#projection_v1{dbg=["not exactly the same as P1!!!"]}, EpochID = {P1#projection_v1.epoch_number, P1#projection_v1.epoch_csum}, ok = ?MUT:write_projection(Prox1, public, P1), @@ -202,12 +203,19 @@ flu_restart_test() -> (line) -> io:format("line ~p, ", [?LINE]); (stop) -> ?MUT:write_projection(Prox1, public, P1xx) end, - fun(run) -> {error, written} = + + fun(run) -> ok = %% P1xx is difference only in dbg2 ?MUT:write_projection(Prox1, private, P1xx), ok; (line) -> io:format("line ~p, ", [?LINE]); (stop) -> ?MUT:write_projection(Prox1, private, P1xx) end, + fun(run) -> {error, bad_arg} = % P1yy has got bad checksum + ?MUT:write_projection(Prox1, private, P1yy), + ok; + (line) -> io:format("line ~p, ", [?LINE]); + (stop) -> ?MUT:write_projection(Prox1, private, P1yy) + end, fun(run) -> {ok, [_]} = ?MUT:get_all_projections(Prox1, public), From e8f3ab381d49dd45c8d5f13b6ddbf5c1d373f7c1 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Wed, 26 Aug 2015 14:54:01 +0900 Subject: [PATCH 05/51] Add set_consistency_mode() to projection store API, use it --- src/machi_chain_manager1.erl | 19 +++++++++++++++++-- src/machi_projection_store.erl | 14 +++++++++++--- 
test/machi_projection_store_test.erl | 3 +++ 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 71f4216..317321c 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -232,12 +232,18 @@ init({MyName, InitMembersDict, MgrOpts}) -> random:seed(now()), init_remember_partition_hack(), Opt = fun(Key, Default) -> proplists:get_value(Key, MgrOpts, Default) end, - CMode = Opt(consistency_mode, ap_mode), InitWitness_list = Opt(witnesses, []), ZeroAll_list = [P#p_srvr.name || {_,P} <- orddict:to_list(InitMembersDict)], ZeroProj = make_none_projection(MyName, ZeroAll_list, InitWitness_list, InitMembersDict), ok = store_zeroth_projection_maybe(ZeroProj, MgrOpts), + CMode = Opt(consistency_mode, ap_mode), + case get_projection_store_regname(MgrOpts) of + undefined -> + ok; + PS -> + ok = set_consistency_mode(PS, CMode) + end, %% Using whatever is the largest epoch number in our local private %% store, this manager starts out using the "none" projection. If @@ -309,6 +315,7 @@ handle_call({set_chain_members, MembersDict, Witness_list}, _From, Witness_list /= [] -> cp_mode end, + ok = set_consistency_mode(machi_flu_psup:make_proj_supname(MyName), CMode), NewProj = machi_projection:update_checksum( OldProj#projection_v1{author_server=MyName, creation_time=now(), @@ -442,7 +449,7 @@ get_my_proj_boot_info(MgrOpts, DefaultDict, DefaultProj, ProjType) -> %% 0th epoch is already written, there's no problem. store_zeroth_projection_maybe(ZeroProj, MgrOpts) -> - case proplists:get_value(projection_store_registered_name, MgrOpts) of + case get_projection_store_regname(MgrOpts) of undefined -> ok; Store -> @@ -451,6 +458,14 @@ store_zeroth_projection_maybe(ZeroProj, MgrOpts) -> ok end. +get_projection_store_regname(MgrOpts) -> + proplists:get_value(projection_store_registered_name, MgrOpts). 
+ +set_consistency_mode(undefined, CMode) -> + ok; +set_consistency_mode(ProjStore, CMode) -> + machi_projection_store:set_consistency_mode(ProjStore, CMode). + set_active_timer(#ch_mgr{name=MyName, members_dict=MembersDict}=S) -> FLU_list = [P#p_srvr.name || {_,P} <- orddict:to_list(MembersDict)], %% Perturb the order a little bit, to avoid near-lock-step diff --git a/src/machi_projection_store.erl b/src/machi_projection_store.erl index c5cd371..dd3d5dc 100644 --- a/src/machi_projection_store.erl +++ b/src/machi_projection_store.erl @@ -59,7 +59,7 @@ get_all_projections/2, get_all_projections/3, list_all_projections/2, list_all_projections/3 ]). --export([set_wedge_notify_pid/2]). +-export([set_wedge_notify_pid/2, set_consistency_mode/2]). %% gen_server callbacks -export([init/1, handle_call/3, handle_cast/2, handle_info/2, @@ -72,7 +72,8 @@ private_dir = "" :: string(), wedge_notify_pid :: pid() | atom(), max_public_epochid = ?NO_EPOCH :: {-1 | non_neg_integer(), binary()}, - max_private_epochid = ?NO_EPOCH :: {-1 | non_neg_integer(), binary()} + max_private_epochid = ?NO_EPOCH :: {-1 | non_neg_integer(), binary()}, + consistency_mode=ap_mode :: 'ap_mode' | 'cp_mode' }). %% @doc Start a new projection store server. @@ -159,7 +160,12 @@ list_all_projections(PidSpec, ProjType, Timeout) g_call(PidSpec, {list_all_projections, ProjType}, Timeout). set_wedge_notify_pid(PidSpec, NotifyWedgeStateChanges) -> - gen_server:call(PidSpec, {set_wedge_notify_pid, NotifyWedgeStateChanges}). + gen_server:call(PidSpec, {set_wedge_notify_pid, NotifyWedgeStateChanges}, + infinity). + +set_consistency_mode(PidSpec, CMode) + when CMode == ap_mode; CMode == cp_mode -> + gen_server:call(PidSpec, {set_consistency_mode, CMode}, infinity). 
%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -224,6 +230,8 @@ handle_call({{list_all_projections, ProjType}, LC1}, _From, S) -> {reply, {{ok, find_all(Dir)}, LC2}, S}; handle_call({set_wedge_notify_pid, NotifyWedgeStateChanges}, _From, S) -> {reply, ok, S#state{wedge_notify_pid=NotifyWedgeStateChanges}}; +handle_call({set_consistency_mode, CMode}, _From, S) -> + {reply, ok, S#state{consistency_mode=CMode}}; handle_call(_Request, _From, S) -> Reply = {whaaaaaaaaaaaaazz, _Request}, {reply, Reply, S}. diff --git a/test/machi_projection_store_test.erl b/test/machi_projection_store_test.erl index db5ffe8..19a44d0 100644 --- a/test/machi_projection_store_test.erl +++ b/test/machi_projection_store_test.erl @@ -54,6 +54,9 @@ smoke_test() -> ok = ?PS:write(a_pstore, private, P1c), {error, written} = ?PS:write(a_pstore, private, P1a), + ok = ?PS:set_consistency_mode(a_pstore, ap_mode), + ok = ?PS:set_consistency_mode(a_pstore, cp_mode), + ok after machi_flu_psup:stop_flu_package(a), From 568e165f4f2525d27dcfccced976e3217ed28ea4 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Wed, 26 Aug 2015 15:51:14 +0900 Subject: [PATCH 06/51] Allow pstore -> FLU unwedge only in ap_mode, machi_cr_client_test broken (uses cp_mode) --- src/machi_projection_store.erl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/machi_projection_store.erl b/src/machi_projection_store.erl index dd3d5dc..7767565 100644 --- a/src/machi_projection_store.erl +++ b/src/machi_projection_store.erl @@ -314,7 +314,7 @@ do_proj_write3(ProjType, #projection_v1{epoch_number=Epoch, {{error, Else}, S} end. 
-do_proj_write4(ProjType, Proj, Path, Epoch, S) -> +do_proj_write4(ProjType, Proj, Path, Epoch, #state{consistency_mode=CMode}=S) -> {ok, FH} = file:open(Path, [write, raw, binary]), ok = file:write(FH, term_to_binary(Proj)), ok = file:sync(FH), @@ -340,11 +340,14 @@ do_proj_write4(ProjType, Proj, Path, Epoch, S) -> end, S#state{max_public_epochid=EpochId}; ProjType == private, + CMode == ap_mode, Epoch > element(1, S#state.max_private_epochid) -> update_wedge_state( S#state.wedge_notify_pid, false, EffectiveEpochId), S#state{max_private_epochid=EpochId}; + %% If ProjType == private and CMode == cp_mode, then + %% the unwedge action is not performed here! true -> S end, From 9222881689feb7b6bf1bffc4287d62456bab6ebc Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Wed, 26 Aug 2015 17:51:43 +0900 Subject: [PATCH 07/51] Oops, bugfixes --- src/machi_projection_store.erl | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/machi_projection_store.erl b/src/machi_projection_store.erl index 7767565..ac2ed0e 100644 --- a/src/machi_projection_store.erl +++ b/src/machi_projection_store.erl @@ -303,7 +303,7 @@ do_proj_write3(ProjType, #projection_v1{epoch_number=Epoch, %% values match, and if we trust the value on disk (TODO paranoid %% check that, also), then the only difference must be the dbg2 %% list, which is ok. 
- if CurCSum == CSum -> + if CurEpoch == Epoch, CurCSum == CSum -> do_proj_write4(ProjType, Proj, Path, Epoch, S); true -> {{error, written}, S} @@ -340,14 +340,17 @@ do_proj_write4(ProjType, Proj, Path, Epoch, #state{consistency_mode=CMode}=S) -> end, S#state{max_public_epochid=EpochId}; ProjType == private, - CMode == ap_mode, Epoch > element(1, S#state.max_private_epochid) -> - update_wedge_state( - S#state.wedge_notify_pid, false, - EffectiveEpochId), + if CMode == ap_mode -> + update_wedge_state( + S#state.wedge_notify_pid, false, + EffectiveEpochId); + true -> + %% If ProjType == private and CMode == cp_mode, then + %% the unwedge action is not performed here! + ok + end, S#state{max_private_epochid=EpochId}; - %% If ProjType == private and CMode == cp_mode, then - %% the unwedge action is not performed here! true -> S end, From 28335a1310c87212d946dc28bcfcebeaa80e8af0 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Wed, 26 Aug 2015 18:47:39 +0900 Subject: [PATCH 08/51] Add CP mode unwedge. All eunit tests are passing again. --- src/machi_chain_manager1.erl | 123 +++++++++++++++++++++++++-------- src/machi_projection.erl | 6 +- src/machi_projection_store.erl | 9 ++- test/machi_cr_client_test.erl | 5 +- 4 files changed, 111 insertions(+), 32 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 317321c..5476769 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -61,6 +61,7 @@ name :: pv1_server(), flap_limit :: non_neg_integer(), proj :: projection(), + proj_unanimous :: boolean(), %% timer :: 'undefined' | timer:tref(), ignore_timer :: boolean(), @@ -272,8 +273,7 @@ init({MyName, InitMembersDict, MgrOpts}) -> {network_islands, Opt(network_islands, [])}, {up_nodes, Opt(up_nodes, not_init_yet)}], ActiveP = Opt(active_mode, true), - S = #ch_mgr{name=MyName, - proj=Proj, + S = set_proj(#ch_mgr{name=MyName, %% TODO 2015-03-04: revisit, should this constant be bigger? 
%% Yes, this should be bigger, but it's a hack. There is %% no guarantee that all parties will advance to a minimum @@ -284,7 +284,7 @@ init({MyName, InitMembersDict, MgrOpts}) -> not_sanes=orddict:new(), consistency_mode=CMode, runenv=RunEnv, - opts=MgrOpts}, + opts=MgrOpts}, Proj), {_, S2} = do_set_chain_members_dict(MembersDict, S), S3 = if ActiveP == false -> S2; @@ -310,11 +310,7 @@ handle_call({set_chain_members, MembersDict, Witness_list}, _From, NewUPI = OldUPI -- MissingInNew, NewDown = All_list -- NewUPI, NewEpoch = OldEpoch + ?SET_CHAIN_MEMBERS_EPOCH_SKIP, - CMode = if Witness_list == [] -> - ap_mode; - Witness_list /= [] -> - cp_mode - end, + CMode = calc_consistency_mode(Witness_list), ok = set_consistency_mode(machi_flu_psup:make_proj_supname(MyName), CMode), NewProj = machi_projection:update_checksum( OldProj#projection_v1{author_server=MyName, @@ -329,8 +325,8 @@ handle_call({set_chain_members, MembersDict, Witness_list}, _From, members_dict=MembersDict}), %% Reset all flapping state. NewProj2 = NewProj#projection_v1{flap=make_flapping_i()}, - S3 = clear_flapping_state(S2#ch_mgr{proj=NewProj2, - proj_history=queue:new()}), + S3 = clear_flapping_state(set_proj(S2#ch_mgr{proj_history=queue:new()}, + NewProj2)), {_QQ, S4} = do_react_to_env(S3), {reply, Reply, S4}; handle_call({set_active, Boolean}, _From, #ch_mgr{timer=TRef}=S) -> @@ -461,7 +457,7 @@ store_zeroth_projection_maybe(ZeroProj, MgrOpts) -> get_projection_store_regname(MgrOpts) -> proplists:get_value(projection_store_registered_name, MgrOpts). -set_consistency_mode(undefined, CMode) -> +set_consistency_mode(undefined, _CMode) -> ok; set_consistency_mode(ProjStore, CMode) -> machi_projection_store:set_consistency_mode(ProjStore, CMode). 
@@ -540,6 +536,12 @@ read_latest_projection_call_only(ProjectionType, AllHosed, #projection_v1{all_members=All_list} = CurrentProj, All_queried_list = All_list -- AllHosed, + {Rs, S2} = read_latest_projection_call_only2(ProjectionType, + All_queried_list, S), + FLUsRs = lists:zip(All_queried_list, Rs), + {All_queried_list, FLUsRs, S2}. + +read_latest_projection_call_only2(ProjectionType, All_queried_list, S) -> {_UpNodes, Partitions, S2} = calc_up_nodes(S), DoIt = fun(Pid) -> case (?FLU_PC:read_latest_projection(Pid, ProjectionType, ?TO)) of @@ -547,12 +549,9 @@ read_latest_projection_call_only(ProjectionType, AllHosed, Else -> Else end end, - Rs = [perhaps_call_t(S, Partitions, FLU, fun(Pid) -> DoIt(Pid) end) || + Rs = [(catch perhaps_call_t(S, Partitions, FLU, fun(Pid) -> DoIt(Pid) end)) || FLU <- All_queried_list], - %% Rs = [perhaps_call_t(S, Partitions, FLU, fun(Pid) -> DoIt(Pid) end) || - %% FLU <- All_queried_list], - FLUsRs = lists:zip(All_queried_list, Rs), - {All_queried_list, FLUsRs, S2}. + {Rs, S2}. 
cl_read_latest_projection(ProjectionType, S) -> AllHosed = [], @@ -985,8 +984,10 @@ do_react_to_env(#ch_mgr{name=MyName, {{empty_members_dict, [], Epoch}, S}; true -> {_, S2} = do_set_chain_members_dict(NewMembersDict, S), + CMode = calc_consistency_mode(NewProj#projection_v1.witnesses), {{empty_members_dict, [], Epoch}, - S2#ch_mgr{proj=NewProj, members_dict=NewMembersDict}} + set_proj(S2#ch_mgr{members_dict=NewMembersDict, + consistency_mode=CMode}, NewProj)} end; do_react_to_env(S) -> %% The not_sanes manager counting dictionary is not strictly @@ -1033,13 +1034,15 @@ do_react_to_env(S) -> %% put(react, []), try - if S#ch_mgr.sane_transitions > 3 -> % TODO review this constant - %% ?V("Skr,~w,", [S#ch_mgr.name]), - react_to_env_A10(S#ch_mgr{not_sanes=orddict:new()}); - true -> - %% ?V("Sk,~w,~w,", [S#ch_mgr.name, S#ch_mgr.sane_transitions]), - react_to_env_A10(S) - end + S2 = if S#ch_mgr.sane_transitions > 3 -> % TODO review this constant + S#ch_mgr{not_sanes=orddict:new()}; + true -> + S + end, + %% When in CP mode, we call the poll function twice: once before + %% reacting & once after. This call is the 2nd. + {Res, S3} = react_to_env_A10(S2), + {Res, poll_private_proj_is_upi_unanimous(S3)} catch throw:{zerf,_}=_Throw -> Proj = S#ch_mgr.proj, @@ -1049,7 +1052,7 @@ io:format(user, "zerf ~p caught ~p\n", [S#ch_mgr.name, _Throw]), react_to_env_A10(S) -> ?REACT(a10), - react_to_env_A20(0, S). + react_to_env_A20(0, poll_private_proj_is_upi_unanimous(S)). react_to_env_A20(Retries, #ch_mgr{name=MyName}=S) -> ?REACT(a20), @@ -1150,7 +1153,7 @@ react_to_env_A29(Retries, P_latest, LatestUnanimousP, ReadExtra, %% io:format(user, "zerf_in @ A29: ~p: ~w\n", [MyName, machi_projection:make_summary(Zerf)]), P_current2 = Zerf#projection_v1{ flap=P_current#projection_v1.flap}, - %% io:format(user, "A29 ~w cur_flap ~W, ", [S#ch_mgr.name, P_current#projection_v1.flap, 8]), + %% Do not use the usual set_proj() wrapper here. 
react_to_env_A30(Retries, P_latest, LatestUnanimousP, ReadExtra, S#ch_mgr{proj=P_current2}); Zerf -> @@ -1629,6 +1632,7 @@ react_to_env_A49(_P_latest, FinalProps, #ch_mgr{name=MyName, members_dict=MembersDict} = P_current, P_none = make_none_projection(MyName, All_list, Witness_list, MembersDict), + %% Do not use the usual set_proj() wrapper here. react_to_env_A50(P_none, FinalProps, S#ch_mgr{proj=P_none}). react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) -> @@ -2031,8 +2035,8 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H, diversion_c120_verbose_goop(P_latest, S), ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}), {{now_using, FinalProps, P_latest#projection_v1.epoch_number}, - S#ch_mgr{proj=P_latest, proj_history=H2, - sane_transitions=Xtns + 1}}. + set_proj(S#ch_mgr{proj_history=H2, + sane_transitions=Xtns + 1}, P_latest)}. add_and_trunc_history(P_latest, H, MaxLength) -> H2 = if P_latest#projection_v1.epoch_number > 0 -> @@ -2599,6 +2603,61 @@ projection_transition_is_sane_except_si_epoch( stack, Trace} end. +poll_private_proj_is_upi_unanimous(#ch_mgr{consistency_mode=ap_mode} = S) -> + S; +poll_private_proj_is_upi_unanimous(#ch_mgr{consistency_mode=cp_mode, + proj_unanimous=true} = S) -> + S; +poll_private_proj_is_upi_unanimous(#ch_mgr{consistency_mode=cp_mode, + proj_unanimous=false, + proj=Proj} = S) -> + if Proj#projection_v1.upi == [] % Nobody to poll? + orelse + Proj#projection_v1.epoch_number == 0 -> % Skip polling for epoch 0? + S; + true -> + poll_private_proj_is_upi_unanimous_sleep(0, S) + end. + +poll_private_proj_is_upi_unanimous_sleep(Count, S) when Count > 2 -> + S; +poll_private_proj_is_upi_unanimous_sleep(Count, S) -> + timer:sleep((Count * Count) * 50), + case poll_private_proj_is_upi_unanimous3(S) of + #ch_mgr{proj_unanimous=true} = S2 -> + S2; + S2 -> + poll_private_proj_is_upi_unanimous_sleep(Count + 1, S2) + end. 
+ +poll_private_proj_is_upi_unanimous3(#ch_mgr{name=_MyName, proj=Proj0, + opts=MgrOpts} = S) -> + Proj = inner_projection_or_self(Proj0), + UPI = Proj#projection_v1.upi, + EpochID = machi_projection:make_epoch_id(Proj), + {Rs, S2} = read_latest_projection_call_only2(private, UPI, S), + Rs2 = [if is_record(P, projection_v1) -> + machi_projection:make_epoch_id(P); + true -> + P + end || #projection_v1{}=P <- Rs], + case lists:usort(Rs2) of + [EID] when EID == EpochID -> + Annotation = {private_proj_is_upi_unanimous, {EpochID, now()}}, + %% Careful, use the outer projection here! + NewDbg2 = [Annotation|Proj0#projection_v1.dbg2], + NewProj = Proj0#projection_v1{dbg2=NewDbg2}, + ProjStore = get_projection_store_regname(MgrOpts), + ok = machi_projection_store:write(ProjStore, private, NewProj), + %% Unwedge our FLU. + io:format(user, "\nUnwedge ~w @ ~W\n", [_MyName, EpochID, 7]), + {ok, NotifyPid} = machi_projection_store:get_wedge_notify_pid(ProjStore), + _ = machi_flu1:update_wedge_state(NotifyPid, false, EpochID), + S2#ch_mgr{proj_unanimous=true}; + _Else -> + S2 + end. + sleep_ranked_order(MinSleep, MaxSleep, FLU, FLU_list) -> USec = calc_sleep_ranked_order(MinSleep, MaxSleep, FLU, FLU_list), timer:sleep(USec), @@ -3255,3 +3314,11 @@ calc_magic_down([H|T], G) -> search_last_flap_counts(FLU, FlapCountsLast) -> proplists:get_value(FLU, FlapCountsLast, undefined). + +calc_consistency_mode(_Witness_list = []) -> + ap_mode; +calc_consistency_mode(_Witness_list) -> + cp_mode. + +set_proj(S, Proj) -> + S#ch_mgr{proj=Proj, proj_unanimous=false}. diff --git a/src/machi_projection.erl b/src/machi_projection.erl index 52f7dac..68d2998 100644 --- a/src/machi_projection.erl +++ b/src/machi_projection.erl @@ -31,7 +31,8 @@ compare/2, get_epoch_id/1, make_summary/1, - make_members_dict/1 + make_members_dict/1, + make_epoch_id/1 ]). %% @doc Create a new projection record. 
@@ -201,3 +202,6 @@ make_members_dict(Ps) -> exit({badarg, {make_members_dict, lists:filter(F_neither, Ps)}}) end end. + +make_epoch_id(#projection_v1{epoch_number=Epoch, epoch_csum=CSum}) -> + {Epoch, CSum}. diff --git a/src/machi_projection_store.erl b/src/machi_projection_store.erl index ac2ed0e..e5131ea 100644 --- a/src/machi_projection_store.erl +++ b/src/machi_projection_store.erl @@ -59,7 +59,8 @@ get_all_projections/2, get_all_projections/3, list_all_projections/2, list_all_projections/3 ]). --export([set_wedge_notify_pid/2, set_consistency_mode/2]). +-export([set_wedge_notify_pid/2, get_wedge_notify_pid/1, + set_consistency_mode/2]). %% gen_server callbacks -export([init/1, handle_call/3, handle_cast/2, handle_info/2, @@ -163,6 +164,10 @@ set_wedge_notify_pid(PidSpec, NotifyWedgeStateChanges) -> gen_server:call(PidSpec, {set_wedge_notify_pid, NotifyWedgeStateChanges}, infinity). +get_wedge_notify_pid(PidSpec) -> + gen_server:call(PidSpec, {get_wedge_notify_pid}, + infinity). + set_consistency_mode(PidSpec, CMode) when CMode == ap_mode; CMode == cp_mode -> gen_server:call(PidSpec, {set_consistency_mode, CMode}, infinity). @@ -230,6 +235,8 @@ handle_call({{list_all_projections, ProjType}, LC1}, _From, S) -> {reply, {{ok, find_all(Dir)}, LC2}, S}; handle_call({set_wedge_notify_pid, NotifyWedgeStateChanges}, _From, S) -> {reply, ok, S#state{wedge_notify_pid=NotifyWedgeStateChanges}}; +handle_call({get_wedge_notify_pid}, _From, S) -> + {reply, {ok, S#state.wedge_notify_pid}, S}; handle_call({set_consistency_mode, CMode}, _From, S) -> {reply, ok, S#state{consistency_mode=CMode}}; handle_call(_Request, _From, S) -> diff --git a/test/machi_cr_client_test.erl b/test/machi_cr_client_test.erl index dc8d9b0..e842755 100644 --- a/test/machi_cr_client_test.erl +++ b/test/machi_cr_client_test.erl @@ -205,7 +205,7 @@ witness_smoke_test2() -> %% Whew ... ok, now start some damn tests. 
{ok, C1} = machi_cr_client:start_link([P || {_,P}<-orddict:to_list(D)]), - machi_cr_client:append_chunk(C1, Prefix, Chunk1), + {ok, _} = machi_cr_client:append_chunk(C1, Prefix, Chunk1), {ok, {Off1,Size1,File1}} = machi_cr_client:append_chunk(C1, Prefix, Chunk1), Chunk1_badcs = {<>, Chunk1}, @@ -215,7 +215,8 @@ witness_smoke_test2() -> %% Stop 'b' and let the chain reset. ok = machi_flu_psup:stop_flu_package(b), - run_ticks([a_chmgr,c_chmgr]), + %% Run ticks enough times to force auto-unwedge of both a & c. + [run_ticks([a_chmgr,c_chmgr]) || _ <- [1,2,3,4] ], %% The chain should now be [a,c]. %% Let's wedge OurWitness and see what happens: timeout/partition. From 8a61a85ae0b18c5927eb919b3dcf30000bf27645 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Thu, 27 Aug 2015 16:19:22 +0900 Subject: [PATCH 09/51] WIP: rewrite make_zerf() to use new annotation scheme --- src/machi_chain_manager1.erl | 167 ++++++++++++++++------------------- 1 file changed, 74 insertions(+), 93 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 5476769..ed97dcb 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -3083,7 +3083,8 @@ full_majority_size(L) when is_list(L) -> make_zerf(#projection_v1{epoch_number=OldEpochNum, all_members=AllMembers, members_dict=MembersDict, - witnesses=OldWitness_list + witnesses=OldWitness_list, + flap=OldFlap } = _LastProj, #ch_mgr{name=MyName, consistency_mode=cp_mode, @@ -3096,108 +3097,88 @@ make_zerf(#projection_v1{epoch_number=OldEpochNum, throw({zerf, {not_enough_up, Up, AllMembers}}); true -> make_zerf2(OldEpochNum, Up, MajoritySize, MyName, - AllMembers, OldWitness_list, MembersDict, S) + AllMembers, OldWitness_list, MembersDict, OldFlap, S) end. 
-make_zerf2(OldEpochNum, Up, MajoritySize, MyName, AllMembers, OldWitness_list, MembersDict, S) -> +make_zerf2(OldEpochNum, Up, MajoritySize, MyName, AllMembers, OldWitness_list, + MembersDict, OldFlap, S) -> try - Epochs = lists:reverse( - lists:usort( - lists:flatten( - [begin - Proxy = proxy_pid(FLU, S), - {ok, Es} = ?FLU_PC:list_all_projections( - Proxy, private, ?TO*5), - [E || E <- Es] - end || FLU <- Up]))), - put(epochs, Epochs), - Relation = [], - Proj = zerf_find_last_common(Epochs, Relation, MajoritySize, Up, S), - Proj#projection_v1{flap=make_flapping_i()} + Proj = zerf_find_last_common(MajoritySize, Up, S), + Proj2 = + Proj#projection_v1{flap=OldFlap} + , io:format(user, "ZERF ~w\n", [machi_projection:make_summary(Proj2)]), + Proj2 catch throw:{zerf,no_common} -> - FirstEpoch_p = case get(epochs) of - [0] -> true; - [_, 0] -> true; - _ -> false - end, - if FirstEpoch_p -> - %% Epoch 0 special case: make the "all" projection. - %% calc_projection2() will then filter out any FLUs that - %% aren't currently up to create the first chain. If not - %% enough are up now, then it will fail to create a first - %% chain. - %% - %% If epoch 0 isn't the only epoch that we've looked at, - %% but we still couldn't find a common projection, then - %% we still need to default to the "all" projection and let - %% subsequent chain calculations do their calculations.... - P = make_all_projection(MyName, AllMembers, OldWitness_list, - MembersDict), - machi_projection:update_checksum( - P#projection_v1{epoch_number=OldEpochNum, - mode=cp_mode, - dbg2=[zerf_all]}); - true -> - %% Make it appear like nobody is up now: we'll have to - %% wait until the Up list changes so that - %% zerf_find_last_common() can confirm a common stable - %% last stable epoch. 
- - P = make_none_projection(MyName, AllMembers,OldWitness_list, - MembersDict), - machi_projection:update_checksum( - P#projection_v1{epoch_number=OldEpochNum, - mode=cp_mode, - dbg2=[zerf_none, {es, get(epochs)},{up,Up},{maj,MajoritySize}]}) - end; + %% Epoch 0 special case: make the "all" projection. + %% calc_projection2() will then filter out any FLUs that + %% aren't currently up to create the first chain. If not + %% enough are up now, then it will fail to create a first + %% chain. + %% + %% If epoch 0 isn't the only epoch that we've looked at, + %% but we still couldn't find a common projection, then + %% we still need to default to the "all" projection and let + %% subsequent chain calculations do their calculations.... + P = make_all_projection(MyName, AllMembers, OldWitness_list, + MembersDict), + P2 = + machi_projection:update_checksum( + P#projection_v1{epoch_number=OldEpochNum, + mode=cp_mode, + dbg2=[zerf_all]}), + io:format(user, "ZERF ~w\n", [machi_projection:make_summary(P2)]), + P2; _X:_Y -> throw({zerf, {damn_exception, Up, _X, _Y, erlang:get_stacktrace()}}) - after - erase(epochs) end. -zerf_find_last_common([], _Relation, _MajoritySize, _Up, _S) -> - throw({zerf, no_common}); -zerf_find_last_common(UnsearchedEpochs, Relation, MajoritySize, Up, S) -> - {NowEpochs, NextEpochs} = my_lists_split(5, UnsearchedEpochs), - Rel2 = lists:foldl( - fun({E, FLU}, Rel) -> - Proxy = proxy_pid(FLU, S), - case (catch ?FLU_PC:read_projection(Proxy, private, - E, ?TO)) of - {ok, Proj} -> - %% Sort order: we want inner = bigger. 
- CSum = Proj#projection_v1.epoch_csum, - OorI = case inner_projection_exists(Proj) of - true -> z_inner; - false -> a_outer - end, - K = {E, CSum, OorI, Proj#projection_v1{dbg2=[]}}, - Rel2 = case lists:keyfind(K, 1, Rel) of - false -> - [{K, [FLU]}|Rel]; - {K, OldV} -> - NewV = lists:usort([FLU|OldV]), - NewT = {K, NewV}, - lists:keyreplace(K, 1, Rel, - NewT) - end, - Rel2; - _ -> - Rel - end - end, Relation, lists:reverse([{E, FLU} || E <- NowEpochs, FLU <- Up])), - SortedRel = lists:reverse(lists:sort(Rel2)), - case [T || T={{_E, _CSum, _OorI, Proj}, WrittenFLUs} <- SortedRel, - ordsets:is_subset(ordsets:from_list(Proj#projection_v1.upi), - ordsets:from_list(WrittenFLUs)) - andalso - length(Proj#projection_v1.upi) >= MajoritySize] of +zerf_find_last_common(MajoritySize, Up, S) -> + case lists:reverse( + lists:sort( + lists:flatten( + [zerf_find_last_annotated(FLU,MajoritySize,S) || FLU <- Up]))) of [] -> - zerf_find_last_common(NextEpochs, Rel2, MajoritySize, Up, S); - [{{_E, _CSum, _OorI, Proj}, _WrittenFLUs}|_] -> - Proj + throw({zerf,no_common}); + [P|_] -> + %% TODO is this simple sort really good enough? + P + end. 
+ +zerf_find_last_annotated(FLU, MajoritySize, S) -> + Proxy = proxy_pid(FLU, S), + {ok, Epochs} = ?FLU_PC:list_all_projections(Proxy, private, 60*1000), + P = lists:foldl( + fun(Epoch, #projection_v1{}=Proj) -> + Proj; + (Epoch, Acc) -> + {ok, Proj} = ?FLU_PC:read_projection(Proxy, private, + Epoch, ?TO*10), + case proplists:get_value(private_proj_is_upi_unanimous, + Proj#projection_v1.dbg2) of + undefined -> + Acc; + {{ConfEpoch, ConfCSum}, _ConfTime} -> + Px = if ConfEpoch == Epoch -> + Proj; + true -> + Proj2 = inner_projection_or_self(Proj), + %% Sanity checking + ConfEpoch = Proj2#projection_v1.epoch_number, + ConfCSum = Proj2#projection_v1.epoch_csum, + Proj2 + end, + if length(Px#projection_v1.upi) >= MajoritySize -> + Px; + true -> + Acc + end + end + end, first_accumulator, Epochs), + if is_record(P, projection_v1) -> + P; + true -> + [] % lists:flatten() will destroy end. my_lists_split(N, L) -> From 65cd18939cee6d3f38f50db9562490593636c46a Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Thu, 27 Aug 2015 17:58:43 +0900 Subject: [PATCH 10/51] WIP: changes to annotation management --- src/machi_chain_manager1.erl | 110 ++++++++++++++++++++++++++--------- src/machi_flu1.erl | 2 +- 2 files changed, 85 insertions(+), 27 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index ed97dcb..f23999d 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -61,7 +61,7 @@ name :: pv1_server(), flap_limit :: non_neg_integer(), proj :: projection(), - proj_unanimous :: boolean(), + proj_unanimous :: 'false' | erlang:timestamp(), %% timer :: 'undefined' | timer:tref(), ignore_timer :: boolean(), @@ -70,7 +70,7 @@ flap_start=?NOT_FLAPPING_START :: {{'epk', integer()}, erlang:timestamp()}, flap_last_up=[] :: list(), - flap_last_up_change=now() :: erlang:now(), + flap_last_up_change=now() :: erlang:timestamp(), flap_counts_last=[] :: list(), not_sanes :: orddict:orddict(), sane_transitions = 0 :: 
non_neg_integer(), @@ -553,6 +553,18 @@ read_latest_projection_call_only2(ProjectionType, All_queried_list, S) -> FLU <- All_queried_list], {Rs, S2}. +read_projection_call_only2(ProjectionType, Epoch, All_queried_list, S) -> + {_UpNodes, Partitions, S2} = calc_up_nodes(S), + DoIt = fun(Pid) -> + case (?FLU_PC:read_projection(Pid, ProjectionType, Epoch, ?TO)) of + {ok, P} -> P; + Else -> Else + end + end, + Rs = [(catch perhaps_call_t(S, Partitions, FLU, fun(Pid) -> DoIt(Pid) end)) || + FLU <- All_queried_list], + {Rs, S2}. + cl_read_latest_projection(ProjectionType, S) -> AllHosed = [], cl_read_latest_projection(ProjectionType, AllHosed, S). @@ -808,6 +820,11 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg, P_none0 = make_none_projection( MyName, AllMembers, OldWitness_list, MembersDict), + Why = if NewUPI == [] -> + no_real_servers; + true -> + not_enough_witnesses + end, P_none1 = P_none0#projection_v1{ epoch_number=OldEpochNum + 1, dbg=[{none_projection,true}, @@ -821,7 +838,7 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg, {tent_upi, TentativeUPI}, {new_upi, NewUPI}, {up_witnesses, UpWitnesses}, - {not_enough_witnesses,true}]}, + {why_none, Why}]}, machi_projection:update_checksum(P_none1) end end; @@ -1965,11 +1982,28 @@ react_to_env_C103(#projection_v1{epoch_number=_Epoch_newprop} = _P_newprop, %% looping C100->C103->C100. react_to_env_C100(P_none, P_none, clear_flapping_state(S)). 
-react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) -> +react_to_env_C110(P_latest, #ch_mgr{name=MyName, proj=P_current, + proj_unanimous=ProjUnanimous} = S) -> ?REACT(c110), ?REACT({c110, ?LINE, [{latest_epoch,P_latest#projection_v1.epoch_number}]}), - Extra_todo = [{react,get(react)}], - P_latest2 = machi_projection:update_dbg2(P_latest, Extra_todo), + Extra1 = case inner_projection_exists(P_current) andalso + inner_projection_exists(P_latest) andalso + (machi_projection:get_epoch_id( + inner_projection_or_self(P_current)) == + machi_projection:get_epoch_id( + inner_projection_or_self(P_latest))) + andalso ProjUnanimous /= false of + true -> + EpochID = machi_projection:get_epoch_id( + inner_projection_or_self(P_latest)), + UnanimousTime = ProjUnanimous, + A = make_annotation(EpochID, UnanimousTime), + [A]; + false -> + [] + end, + Extra2 = [{react,get(react)}], + P_latest2 = machi_projection:update_dbg2(P_latest, Extra1 ++ Extra2), MyNamePid = proxy_pid(MyName, S), Goo = P_latest2#projection_v1.epoch_number, @@ -2034,6 +2068,7 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H, diversion_c120_verbose_goop(P_latest, S), ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}), +io:format(user, "C120: ~w wrote ~w ~W\n", [S#ch_mgr.name, P_latest#projection_v1.epoch_number, P_latest#projection_v1.epoch_csum, 6]), {{now_using, FinalProps, P_latest#projection_v1.epoch_number}, set_proj(S#ch_mgr{proj_history=H2, sane_transitions=Xtns + 1}, P_latest)}. 
@@ -2606,7 +2641,7 @@ projection_transition_is_sane_except_si_epoch( poll_private_proj_is_upi_unanimous(#ch_mgr{consistency_mode=ap_mode} = S) -> S; poll_private_proj_is_upi_unanimous(#ch_mgr{consistency_mode=cp_mode, - proj_unanimous=true} = S) -> + proj_unanimous={_,_,_}} = S) -> S; poll_private_proj_is_upi_unanimous(#ch_mgr{consistency_mode=cp_mode, proj_unanimous=false, @@ -2624,17 +2659,17 @@ poll_private_proj_is_upi_unanimous_sleep(Count, S) when Count > 2 -> poll_private_proj_is_upi_unanimous_sleep(Count, S) -> timer:sleep((Count * Count) * 50), case poll_private_proj_is_upi_unanimous3(S) of - #ch_mgr{proj_unanimous=true} = S2 -> - S2; + #ch_mgr{proj_unanimous=false} = S2 -> + poll_private_proj_is_upi_unanimous_sleep(Count + 1, S2); S2 -> - poll_private_proj_is_upi_unanimous_sleep(Count + 1, S2) + S2 end. -poll_private_proj_is_upi_unanimous3(#ch_mgr{name=_MyName, proj=Proj0, +poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current, opts=MgrOpts} = S) -> - Proj = inner_projection_or_self(Proj0), - UPI = Proj#projection_v1.upi, - EpochID = machi_projection:make_epoch_id(Proj), + Proj_ios = inner_projection_or_self(P_current), + UPI = Proj_ios#projection_v1.upi, + EpochID = machi_projection:make_epoch_id(Proj_ios), {Rs, S2} = read_latest_projection_call_only2(private, UPI, S), Rs2 = [if is_record(P, projection_v1) -> machi_projection:make_epoch_id(P); @@ -2643,21 +2678,41 @@ poll_private_proj_is_upi_unanimous3(#ch_mgr{name=_MyName, proj=Proj0, end || #projection_v1{}=P <- Rs], case lists:usort(Rs2) of [EID] when EID == EpochID -> - Annotation = {private_proj_is_upi_unanimous, {EpochID, now()}}, - %% Careful, use the outer projection here! 
- NewDbg2 = [Annotation|Proj0#projection_v1.dbg2], - NewProj = Proj0#projection_v1{dbg2=NewDbg2}, - ProjStore = get_projection_store_regname(MgrOpts), + Now = os:timestamp(), + Annotation = make_annotation(EpochID, Now), + NewDbg2 = [Annotation|P_current#projection_v1.dbg2], + NewProj = P_current#projection_v1{dbg2=NewDbg2}, + ProjStore = case get_projection_store_regname(MgrOpts) of + undefined -> + machi_flu_psup:make_proj_supname(MyName); + PStr -> + PStr + end, + [io:format(user, "whereis(~w) ~w, ", [X, whereis(X)]) || + X <- [a_pstore, a_pstore2]], +io:format(user, "POLL: ~w updates ~w ~W\n", [S#ch_mgr.name, NewProj#projection_v1.epoch_number, NewProj#projection_v1.epoch_csum, 6]), ok = machi_projection_store:write(ProjStore, private, NewProj), %% Unwedge our FLU. - io:format(user, "\nUnwedge ~w @ ~W\n", [_MyName, EpochID, 7]), + io:format(user, "\nUnwedge ~w @ ~W\n", [MyName, EpochID, 7]), {ok, NotifyPid} = machi_projection_store:get_wedge_notify_pid(ProjStore), _ = machi_flu1:update_wedge_state(NotifyPid, false, EpochID), - S2#ch_mgr{proj_unanimous=true}; + S2#ch_mgr{proj_unanimous=Now}; _Else -> +io:format(user, "poll by ~w: want ~W got ~W\n", [MyName, EpochID, 6, _Else, 8]), S2 end. +poll_read_private_projections(#projection_v1{inner=undefined, + epoch_number=Epoch, + upi=UPI}=_P_current, S) -> + read_projection_call_only2(private, Epoch, UPI, S); +poll_read_private_projections(#projection_v1{inner=_not_undefined, + upi=UPI}=_P_current, S) -> + %% For inner projections, we are (by definition) flapping, and the + %% outer epoch numbers are (by definition) unstable. However, any + %% observed use of the inner proj epoch # is what we need. + read_latest_projection_call_only2(private, UPI, S).
+ sleep_ranked_order(MinSleep, MaxSleep, FLU, FLU_list) -> USec = calc_sleep_ranked_order(MinSleep, MaxSleep, FLU, FLU_list), timer:sleep(USec), @@ -3104,8 +3159,7 @@ make_zerf2(OldEpochNum, Up, MajoritySize, MyName, AllMembers, OldWitness_list, MembersDict, OldFlap, S) -> try Proj = zerf_find_last_common(MajoritySize, Up, S), - Proj2 = - Proj#projection_v1{flap=OldFlap} + Proj2 = Proj#projection_v1{flap=OldFlap, dbg2=[]} , io:format(user, "ZERF ~w\n", [machi_projection:make_summary(Proj2)]), Proj2 catch @@ -3140,7 +3194,8 @@ zerf_find_last_common(MajoritySize, Up, S) -> [zerf_find_last_annotated(FLU,MajoritySize,S) || FLU <- Up]))) of [] -> throw({zerf,no_common}); - [P|_] -> + [P|_]=_TheList -> + io:format(user, "Zerf results: ~P\n", [ [machi_projection:make_summary(X) || X <- _TheList], 20]), %% TODO is this simple sort really good enough? P end. @@ -3149,7 +3204,7 @@ zerf_find_last_annotated(FLU, MajoritySize, S) -> Proxy = proxy_pid(FLU, S), {ok, Epochs} = ?FLU_PC:list_all_projections(Proxy, private, 60*1000), P = lists:foldl( - fun(Epoch, #projection_v1{}=Proj) -> + fun(_Epoch, #projection_v1{}=Proj) -> Proj; (Epoch, Acc) -> {ok, Proj} = ?FLU_PC:read_projection(Proxy, private, @@ -3174,7 +3229,7 @@ zerf_find_last_annotated(FLU, MajoritySize, S) -> Acc end end - end, first_accumulator, Epochs), + end, first_accumulator, lists:reverse(Epochs)), if is_record(P, projection_v1) -> P; true -> @@ -3303,3 +3358,6 @@ calc_consistency_mode(_Witness_list) -> set_proj(S, Proj) -> S#ch_mgr{proj=Proj, proj_unanimous=false}. + +make_annotation(EpochID, Time) -> + {private_proj_is_upi_unanimous, {EpochID, Time}}. diff --git a/src/machi_flu1.erl b/src/machi_flu1.erl index d876b57..fb433d2 100644 --- a/src/machi_flu1.erl +++ b/src/machi_flu1.erl @@ -837,7 +837,7 @@ make_listener_regname(BaseName) -> %% registers. make_projection_server_regname(BaseName) -> - list_to_atom(atom_to_list(BaseName) ++ "_pstore2"). + list_to_atom(atom_to_list(BaseName) ++ "_pstore"). 
%% @doc Encode `Offset + Size + TaggedCSum' into an `iolist()' type for %% internal storage by the FLU. From 12b74a52fdf84d84bd080c070106890655a33df4 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Thu, 27 Aug 2015 18:45:27 +0900 Subject: [PATCH 11/51] WIP: pre-dinner paranoid checkin --- src/machi_chain_manager1.erl | 6 ++---- src/machi_projection_store.erl | 13 +++---------- test/machi_chain_manager1_converge_demo.erl | 15 ++++++++------- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index f23999d..83e895e 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2672,7 +2672,7 @@ poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current, EpochID = machi_projection:make_epoch_id(Proj_ios), {Rs, S2} = read_latest_projection_call_only2(private, UPI, S), Rs2 = [if is_record(P, projection_v1) -> - machi_projection:make_epoch_id(P); + machi_projection:make_epoch_id(inner_projection_or_self(P)); true -> P end || #projection_v1{}=P <- Rs], @@ -2688,9 +2688,7 @@ poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current, PStr -> PStr end, - [io:format(user, "whereis(~w) ~w, ", [X, whereis(X)]) || - X <- [a_pstore, a_pstore2]], -io:format(user, "POLL: ~w updates ~w ~W\n", [S#ch_mgr.name, NewProj#projection_v1.epoch_number, NewProj#projection_v1.epoch_csum, 6]), +io:format(user, "POLL: ~w: ~w updates ~w ~W ~w\n", [S#ch_mgr.name, P_current#projection_v1.epoch_csum == (machi_projection:update_checksum(P_current))#projection_v1.epoch_csum, NewProj#projection_v1.epoch_number, NewProj#projection_v1.epoch_csum, 6, NewProj#projection_v1.epoch_csum == (machi_projection:update_checksum(NewProj))#projection_v1.epoch_csum]), ok = machi_projection_store:write(ProjStore, private, NewProj), %% Unwedge our FLU. 
io:format(user, "\nUnwedge ~w @ ~W\n", [MyName, EpochID, 7]), diff --git a/src/machi_projection_store.erl b/src/machi_projection_store.erl index e5131ea..0961d8f 100644 --- a/src/machi_projection_store.erl +++ b/src/machi_projection_store.erl @@ -276,22 +276,15 @@ do_proj_read(ProjType, Epoch, S_or_Dir) -> {{error, Else}, S_or_Dir} end. -do_proj_write(public=ProjType, Proj, S) -> - do_proj_write2(ProjType, Proj, S); -do_proj_write(private=ProjType, #projection_v1{epoch_number=Epoch}=Proj, S) -> - case S#state.max_public_epochid of - {PublicEpoch, _} when PublicEpoch =< Epoch -> - do_proj_write2(ProjType, Proj, S); - {_PublicEpoch, _} -> - {{error, bad_arg}, S} - end. +do_proj_write(ProjType, Proj, S) -> + do_proj_write2(ProjType, Proj, S). do_proj_write2(ProjType, #projection_v1{epoch_csum=CSum}=Proj, S) -> case (machi_projection:update_checksum(Proj))#projection_v1.epoch_csum of CSum2 when CSum2 == CSum -> do_proj_write3(ProjType, Proj, S); _Else -> - {{error, bad_arg}, S} + {{error, bad_arg_badddddddddddddddddddddddddd_csum, CSum, _Else}, S} end. do_proj_write3(ProjType, #projection_v1{epoch_number=Epoch, diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index fd3b660..63f2e2c 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -371,13 +371,14 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) -> make_partition_list(All_list) -> [ - [{b,c}], - [], - [{c,d}], - [], - [{d,e}], - [], - [{c,e}] + [{b,c}] + %% [{b,c}], + %% [], + %% [{c,d}], + %% [], + %% [{d,e}], + %% [], + %% [{c,e}] ]. 
%% _X_Ys1 = [[{X,Y}] || X <- All_list, Y <- All_list, X /= Y], From 0eaa008810f53bc08be704ea8c04900989565add Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Thu, 27 Aug 2015 20:27:24 +0900 Subject: [PATCH 12/51] Change checksum algorithm to exclude 'flap' also --- src/machi_projection.erl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/machi_projection.erl b/src/machi_projection.erl index 68d2998..9773fbb 100644 --- a/src/machi_projection.erl +++ b/src/machi_projection.erl @@ -111,8 +111,19 @@ new(EpochNum, MyName, [] = _MembersDict0, _Down_list, _UPI_list,_Repairing_list, %% @doc Update the checksum element of a projection record. update_checksum(P) -> + %% Fields that we ignore when calculating checksum: + %% * epoch_csum + %% * dbg2: humming consensus participants may modify this at will without + %% voiding the identity of the projection as a whole. + %% * flap: In some cases in CP mode, code upstream of C120 may have + %% updated the flapping information. That's OK enough: we aren't + %% going to violate chain replication safety rules (or + %% accidentally encourage someone else sometime later) by + %% replacing flapping information with our own local view at + %% this instant in time. CSum = crypto:hash(sha, term_to_binary(P#projection_v1{epoch_csum= <<>>, + flap=undefined, dbg2=[]})), P#projection_v1{epoch_csum=CSum}.
From efb89efb0d6e290fe3a4d94aad220e19e27337d6 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Thu, 27 Aug 2015 20:27:33 +0900 Subject: [PATCH 13/51] Reduce verbosity --- src/machi_chain_manager1.erl | 16 +++++++--------- src/machi_projection.erl | 5 +++-- src/machi_projection_store.erl | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 83e895e..d23c55b 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2066,9 +2066,8 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H, MaxLength = length(P_latest#projection_v1.all_members) * 1.5, H2 = add_and_trunc_history(P_latest, H, MaxLength), - diversion_c120_verbose_goop(P_latest, S), + %% diversion_c120_verbose_goop(P_latest, S), ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}), -io:format(user, "C120: ~w wrote ~w ~W\n", [S#ch_mgr.name, P_latest#projection_v1.epoch_number, P_latest#projection_v1.epoch_csum, 6]), {{now_using, FinalProps, P_latest#projection_v1.epoch_number}, set_proj(S#ch_mgr{proj_history=H2, sane_transitions=Xtns + 1}, P_latest)}. @@ -2688,15 +2687,15 @@ poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current, PStr -> PStr end, -io:format(user, "POLL: ~w: ~w updates ~w ~W ~w\n", [S#ch_mgr.name, P_current#projection_v1.epoch_csum == (machi_projection:update_checksum(P_current))#projection_v1.epoch_csum, NewProj#projection_v1.epoch_number, NewProj#projection_v1.epoch_csum, 6, NewProj#projection_v1.epoch_csum == (machi_projection:update_checksum(NewProj))#projection_v1.epoch_csum]), + io:format(user, "\nCONFIRM epoch ~w ~W upi ~w rep ~w by ~w\n", [NewProj#projection_v1.epoch_number, NewProj#projection_v1.epoch_csum, 6, NewProj#projection_v1.upi, NewProj#projection_v1.repairing, MyName]), ok = machi_projection_store:write(ProjStore, private, NewProj), %% Unwedge our FLU. 
- io:format(user, "\nUnwedge ~w @ ~W\n", [MyName, EpochID, 7]), {ok, NotifyPid} = machi_projection_store:get_wedge_notify_pid(ProjStore), _ = machi_flu1:update_wedge_state(NotifyPid, false, EpochID), S2#ch_mgr{proj_unanimous=Now}; _Else -> -io:format(user, "poll by ~w: want ~W got ~W\n", [MyName, EpochID, 6, _Else, 8]), + %% io:format(user, "poll by ~w: want ~W got ~W\n", + %% [MyName, EpochID, 6, _Else, 8]), S2 end. @@ -3157,8 +3156,8 @@ make_zerf2(OldEpochNum, Up, MajoritySize, MyName, AllMembers, OldWitness_list, MembersDict, OldFlap, S) -> try Proj = zerf_find_last_common(MajoritySize, Up, S), - Proj2 = Proj#projection_v1{flap=OldFlap, dbg2=[]} - , io:format(user, "ZERF ~w\n", [machi_projection:make_summary(Proj2)]), + Proj2 = Proj#projection_v1{flap=OldFlap, dbg2=[]}, + %% io:format(user, "ZERF ~w\n",[machi_projection:make_summary(Proj2)]), Proj2 catch throw:{zerf,no_common} -> @@ -3179,7 +3178,7 @@ make_zerf2(OldEpochNum, Up, MajoritySize, MyName, AllMembers, OldWitness_list, P#projection_v1{epoch_number=OldEpochNum, mode=cp_mode, dbg2=[zerf_all]}), - io:format(user, "ZERF ~w\n", [machi_projection:make_summary(P2)]), + %% io:format(user, "ZERF ~w\n",[machi_projection:make_summary(P2)]), P2; _X:_Y -> throw({zerf, {damn_exception, Up, _X, _Y, erlang:get_stacktrace()}}) @@ -3193,7 +3192,6 @@ zerf_find_last_common(MajoritySize, Up, S) -> [] -> throw({zerf,no_common}); [P|_]=_TheList -> - io:format(user, "Zerf results: ~P\n", [ [machi_projection:make_summary(X) || X <- _TheList], 20]), %% TODO is this simple sort really good enough? P end. diff --git a/src/machi_projection.erl b/src/machi_projection.erl index 9773fbb..a31fb12 100644 --- a/src/machi_projection.erl +++ b/src/machi_projection.erl @@ -158,6 +158,7 @@ get_epoch_id(#projection_v1{epoch_number=Epoch, epoch_csum=CSum}) -> %% @doc Create a proplist-style summary of a projection record. 
make_summary(#projection_v1{epoch_number=EpochNum, + epoch_csum= <<_CSum4:4/binary, _/binary>>, all_members=_All_list, mode=CMode, witnesses=Witness_list, @@ -173,8 +174,8 @@ make_summary(#projection_v1{epoch_number=EpochNum, true -> [] end, - [{epoch,EpochNum},{author,Author}, - {mode,CMode},{witnesses, Witness_list}, + [{epoch,EpochNum}, %% {csum,CSum4}, + {author,Author}, {mode,CMode},{witnesses, Witness_list}, {upi,UPI_list},{repair,Repairing_list},{down,Down_list}] ++ InnerInfo ++ [{flap, Flap}] ++ diff --git a/src/machi_projection_store.erl b/src/machi_projection_store.erl index 0961d8f..a4987db 100644 --- a/src/machi_projection_store.erl +++ b/src/machi_projection_store.erl @@ -284,7 +284,7 @@ do_proj_write2(ProjType, #projection_v1{epoch_csum=CSum}=Proj, S) -> CSum2 when CSum2 == CSum -> do_proj_write3(ProjType, Proj, S); _Else -> - {{error, bad_arg_badddddddddddddddddddddddddd_csum, CSum, _Else}, S} + {{error, bad_arg}, S} end. do_proj_write3(ProjType, #projection_v1{epoch_number=Epoch, From 93b9b948fcd60320d2a223098d47204d5b004948 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Thu, 27 Aug 2015 22:02:23 +0900 Subject: [PATCH 14/51] WIP: debugging, uff da --- src/machi_chain_manager1.erl | 11 +++++++++-- src/machi_projection.erl | 2 +- src/machi_projection_store.erl | 5 ++++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index d23c55b..874ab62 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1857,7 +1857,9 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest, ?REACT(c100), Sane = projection_transition_is_sane(P_current, P_latest, MyName), - if Sane == true -> ok; true -> ?V("~w-insane-~w-auth=~w:~w:~w:~w:~w:~w:~w-~p,", [?LINE, MyName, P_newprop#projection_v1.author_server, P_newprop#projection_v1.epoch_number, P_newprop#projection_v1.upi, P_newprop#projection_v1.repairing, 
(inner_projection_or_self(P_newprop))#projection_v1.epoch_number, (inner_projection_or_self(P_newprop))#projection_v1.upi, (inner_projection_or_self(P_newprop))#projection_v1.repairing, Sane]) end, %%% DELME!!! + QQ_current = lists:flatten(io_lib:format("~w:~w,~w/~w:~w,~w", [P_current#projection_v1.epoch_number, P_current#projection_v1.upi, P_current#projection_v1.repairing, (inner_projection_or_self(P_current))#projection_v1.epoch_number, (inner_projection_or_self(P_current))#projection_v1.upi, (inner_projection_or_self(P_current))#projection_v1.repairing])), + QQ_latest = lists:flatten(io_lib:format("~w:~w,~w/~w:~w,~w", [P_latest#projection_v1.epoch_number, P_latest#projection_v1.upi, P_latest#projection_v1.repairing, (inner_projection_or_self(P_latest))#projection_v1.epoch_number, (inner_projection_or_self(P_latest))#projection_v1.upi, (inner_projection_or_self(P_latest))#projection_v1.repairing])), + if Sane == true -> ok; true -> ?V("\n~w-insane-~w-auth=~w ~s -> ~s ~w\n", [?LINE, MyName, P_newprop#projection_v1.author_server, QQ_current, QQ_latest, Sane]) end, Flap_latest = if is_record(Flap_latest0, flap_i) -> Flap_latest0; true -> @@ -2687,7 +2689,12 @@ poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current, PStr -> PStr end, - io:format(user, "\nCONFIRM epoch ~w ~W upi ~w rep ~w by ~w\n", [NewProj#projection_v1.epoch_number, NewProj#projection_v1.epoch_csum, 6, NewProj#projection_v1.upi, NewProj#projection_v1.repairing, MyName]), + #projection_v1{epoch_number=_EpochRep, + epoch_csum= <<_CSumRep:4/binary, _/binary>>, + upi=_UPIRep, + repairing=_RepairingRep} = + inner_projection_or_self(NewProj), + io:format(user, "\nCONFIRM epoch ~w ~w upi ~w rep ~w by ~w ~w\n", [_EpochRep, _CSumRep, _UPIRep, _RepairingRep, MyName, if P_current#projection_v1.inner == undefined -> outer; true -> {inner,{outer,P_current#projection_v1.epoch_number}} end]), ok = machi_projection_store:write(ProjStore, private, NewProj), %% Unwedge our FLU. 
{ok, NotifyPid} = machi_projection_store:get_wedge_notify_pid(ProjStore), diff --git a/src/machi_projection.erl b/src/machi_projection.erl index a31fb12..08b20ab 100644 --- a/src/machi_projection.erl +++ b/src/machi_projection.erl @@ -174,7 +174,7 @@ make_summary(#projection_v1{epoch_number=EpochNum, true -> [] end, - [{epoch,EpochNum}, %% {csum,CSum4}, + [{epoch,EpochNum}, {csum,_CSum4}, {author,Author}, {mode,CMode},{witnesses, Witness_list}, {upi,UPI_list},{repair,Repairing_list},{down,Down_list}] ++ InnerInfo ++ diff --git a/src/machi_projection_store.erl b/src/machi_projection_store.erl index a4987db..126698d 100644 --- a/src/machi_projection_store.erl +++ b/src/machi_projection_store.erl @@ -306,7 +306,10 @@ do_proj_write3(ProjType, #projection_v1{epoch_number=Epoch, if CurEpoch == Epoch, CurCSum == CSum -> do_proj_write4(ProjType, Proj, Path, Epoch, S); true -> - {{error, written}, S} + io:format(user, "OUCH: on disk: ~w\n", [machi_projection:make_summary(binary_to_term(Bin))]), + io:format(user, "OUCH: clobber: ~w\n", [machi_projection:make_summary(Proj)]), + io:format(user, "OUCH: clobber: ~p\n", [Proj#projection_v1.dbg2]), + {{error, written, CurEpoch, Epoch, CurCSum, CSum}, S} end; {error, enoent} -> do_proj_write4(ProjType, Proj, Path, Epoch, S); From deb2cdee2cb08ee3038a42d3b2722c5325a6b6b6 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Thu, 27 Aug 2015 22:22:15 +0900 Subject: [PATCH 15/51] Bugfix: correct epoch number checking when inner proj --- src/machi_chain_manager1.erl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 874ab62..b009bd6 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2422,11 +2422,11 @@ projection_transition_is_sane(P1, P2, RelativeToServer, RetrospectiveP) -> if HasInner1 orelse HasInner2 -> Inner1 = inner_projection_or_self(P1), Inner2 = inner_projection_or_self(P2), - if HasInner1 andalso 
HasInner2 -> - %% In case of inner->inner transition, we must allow - %% the epoch number to remain constant. Thus, we - %% call the function that does not check for a - %% strictly-increasing epoch. + if HasInner1 orelse HasInner2 -> + %% In case of transition with inner projections, we + %% must allow the epoch number to remain constant. + %% Thus, we call the function that does not check for + %% a strictly-increasing epoch. ?RETURN2( projection_transition_is_sane_final_review(P1, P2, projection_transition_is_sane_except_si_epoch( From 8ca1ffdb132d9d700a887dd80b719845cfd72207 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 28 Aug 2015 01:55:31 +0900 Subject: [PATCH 16/51] WIP: bugfixes and lots of verbose goop added --- src/machi_chain_manager1.erl | 50 +++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index b009bd6..865f26f 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1162,17 +1162,18 @@ react_to_env_A29(Retries, P_latest, LatestUnanimousP, ReadExtra, if CMode == cp_mode, Epoch_latest > P_current#projection_v1.epoch_number, Author_latest /= MyName -> + put(yyy_hack, []), case make_zerf(P_current, S) of Zerf when is_record(Zerf, projection_v1) -> ?REACT({a29, ?LINE, [{zerf_filler, true}, {zerf_in, machi_projection:make_summary(Zerf)}]}), - %% io:format(user, "zerf_in @ A29: ~p: ~w\n", [MyName, machi_projection:make_summary(Zerf)]), + io:format(user, "zerf_in: A29: ~p: ~w\n\t~p\n", [MyName, machi_projection:make_summary(Zerf), get(yyy_hack)]), P_current2 = Zerf#projection_v1{ flap=P_current#projection_v1.flap}, - %% Do not use the usual set_proj() wrapper here. 
+ S2 = set_proj(S, P_current2), react_to_env_A30(Retries, P_latest, LatestUnanimousP, - ReadExtra, S#ch_mgr{proj=P_current2}); + ReadExtra, S2); Zerf -> {{{yo_todo_incomplete_fix_me_cp_mode, line, ?LINE, Zerf}}} end; @@ -1649,8 +1650,8 @@ react_to_env_A49(_P_latest, FinalProps, #ch_mgr{name=MyName, members_dict=MembersDict} = P_current, P_none = make_none_projection(MyName, All_list, Witness_list, MembersDict), - %% Do not use the usual set_proj() wrapper here. - react_to_env_A50(P_none, FinalProps, S#ch_mgr{proj=P_none}). +io:format(user, "Debug A49: ~w forced to none\n", [MyName]), + react_to_env_A50(P_none, FinalProps, set_proj(S, P_none)). react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) -> ?REACT(a50), @@ -2018,10 +2019,12 @@ react_to_env_C110(P_latest, #ch_mgr{name=MyName, proj=P_current, case {?FLU_PC:write_projection(MyNamePid, private, P_latest2,?TO*30),Goo} of {ok, Goo} -> ?REACT({c120, [{write, ok}]}), - perhaps_verbose_c110(P_latest2, S), %% We very intentionally do *not* pass P_latest2 forward: %% we must avoid bloating the dbg2 list! - react_to_env_C120(P_latest, [], S); + P_latest_perhaps_annotated = + machi_projection:update_dbg2(P_latest, Extra1), + perhaps_verbose_c110(P_latest_perhaps_annotated, S), + react_to_env_C120(P_latest_perhaps_annotated, [], S); {{error, bad_arg}, _Goo} -> ?REACT({c120, [{write, bad_arg}]}), @@ -2053,7 +2056,7 @@ react_to_env_C110(P_latest, #ch_mgr{name=MyName, proj=P_current, %% React to newer public write by restarting the iteration. 
react_to_env_A20(0, S); Else -> - Summ = machi_projection:make_summary(P_latest), + Summ = machi_projection:make_summary(P_latest2), io:format(user, "C110 error by ~w: ~w, ~w\n~p\n", [MyName, Else, Summ, get(react)]), error_logger:error_msg("C110 error by ~w: ~w, ~w, ~w\n", @@ -2070,9 +2073,16 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H, %% diversion_c120_verbose_goop(P_latest, S), ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}), - {{now_using, FinalProps, P_latest#projection_v1.epoch_number}, - set_proj(S#ch_mgr{proj_history=H2, - sane_transitions=Xtns + 1}, P_latest)}. + S2 = set_proj(S#ch_mgr{proj_history=H2, + sane_transitions=Xtns + 1}, P_latest), + S3 = case is_annotated(P_latest) of + false -> + S2; + {{_ConfEpoch, _ConfCSum}, ConfTime} -> +io:format(user, "\nCONFIRM debug C120 ~w was annotated ~W outer ~w\n", [S#ch_mgr.name, (inner_projection_or_self(P_latest))#projection_v1.epoch_number, 5, P_latest#projection_v1.epoch_number]), + S2#ch_mgr{proj_unanimous=ConfTime} + end, + {{now_using, FinalProps, P_latest#projection_v1.epoch_number}, S3}. 
add_and_trunc_history(P_latest, H, MaxLength) -> H2 = if P_latest#projection_v1.epoch_number > 0 -> @@ -3150,6 +3160,7 @@ make_zerf(#projection_v1{epoch_number=OldEpochNum, runenv=RunEnv1} = S) -> {Up, _Partitions, _RunEnv2} = calc_up_nodes(MyName, AllMembers, RunEnv1), + (catch put(yyy_hack, [{up,Up}|get(yyy_hack)])), MajoritySize = full_majority_size(AllMembers), case length(Up) >= MajoritySize of false -> @@ -3212,23 +3223,29 @@ zerf_find_last_annotated(FLU, MajoritySize, S) -> (Epoch, Acc) -> {ok, Proj} = ?FLU_PC:read_projection(Proxy, private, Epoch, ?TO*10), - case proplists:get_value(private_proj_is_upi_unanimous, - Proj#projection_v1.dbg2) of - undefined -> + case is_annotated(Proj) of + false -> + (catch put(yyy_hack, [{FLU, Epoch, not_annotated}|get(yyy_hack)])), Acc; {{ConfEpoch, ConfCSum}, _ConfTime} -> Px = if ConfEpoch == Epoch -> + (catch put(yyy_hack, [{FLU, Epoch, outer_ok}|get(yyy_hack)])), Proj; true -> + %% We only use Proj2 for sanity checking + %% here, do not return an inner! Proj2 = inner_projection_or_self(Proj), %% Sanity checking ConfEpoch = Proj2#projection_v1.epoch_number, ConfCSum = Proj2#projection_v1.epoch_csum, - Proj2 + (catch put(yyy_hack, [{FLU, Epoch, inner_ok_return_original_outerplusinner}|get(yyy_hack)])), + Proj end, if length(Px#projection_v1.upi) >= MajoritySize -> + (catch put(yyy_hack, [{FLU, Epoch, yay}|get(yyy_hack)])), Px; true -> + (catch put(yyy_hack, [{FLU, Epoch, skip}|get(yyy_hack)])), Acc end end @@ -3364,3 +3381,6 @@ set_proj(S, Proj) -> make_annotation(EpochID, Time) -> {private_proj_is_upi_unanimous, {EpochID, Time}}. + +is_annotated(#projection_v1{dbg2=Dbg2}) -> + proplists:get_value(private_proj_is_upi_unanimous, Dbg2, false). 
From 3dfe5c26771f3f4585780e98ecc5ea1ae5dbebd6 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 28 Aug 2015 18:37:11 +0900 Subject: [PATCH 17/51] WIP: fix annotation history on disk --- src/machi_chain_manager1.erl | 121 +++++++++++++------- src/machi_projection_store.erl | 1 + test/machi_chain_manager1_converge_demo.erl | 11 +- 3 files changed, 92 insertions(+), 41 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 865f26f..9b5c150 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1650,7 +1650,7 @@ react_to_env_A49(_P_latest, FinalProps, #ch_mgr{name=MyName, members_dict=MembersDict} = P_current, P_none = make_none_projection(MyName, All_list, Witness_list, MembersDict), -io:format(user, "Debug A49: ~w forced to none\n", [MyName]), +io:format(user, "Debug A49: ~w forced to none\n\n ~P", [MyName, get(react), 120]), react_to_env_A50(P_none, FinalProps, set_proj(S, P_none)). react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) -> @@ -1659,6 +1659,7 @@ react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) -> {latest_epoch, P_latest#projection_v1.epoch_number}, {final_props, FinalProps}]}), %% if S#ch_mgr.name == b; S#ch_mgr.name == c -> io:format(user, "A50: ~p: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, +%% io:format(user, "Debug A50: ~w P_current outer ~w ~w ~w\n", [S#ch_mgr.name, P_current#projection_v1.epoch_number,P_current#projection_v1.upi,P_current#projection_v1.repairing]), {{no_change, FinalProps, P_current#projection_v1.epoch_number}, S}. 
react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, @@ -1860,7 +1861,7 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest, Sane = projection_transition_is_sane(P_current, P_latest, MyName), QQ_current = lists:flatten(io_lib:format("~w:~w,~w/~w:~w,~w", [P_current#projection_v1.epoch_number, P_current#projection_v1.upi, P_current#projection_v1.repairing, (inner_projection_or_self(P_current))#projection_v1.epoch_number, (inner_projection_or_self(P_current))#projection_v1.upi, (inner_projection_or_self(P_current))#projection_v1.repairing])), QQ_latest = lists:flatten(io_lib:format("~w:~w,~w/~w:~w,~w", [P_latest#projection_v1.epoch_number, P_latest#projection_v1.upi, P_latest#projection_v1.repairing, (inner_projection_or_self(P_latest))#projection_v1.epoch_number, (inner_projection_or_self(P_latest))#projection_v1.upi, (inner_projection_or_self(P_latest))#projection_v1.repairing])), - if Sane == true -> ok; true -> ?V("\n~w-insane-~w-auth=~w ~s -> ~s ~w\n", [?LINE, MyName, P_newprop#projection_v1.author_server, QQ_current, QQ_latest, Sane]) end, + if Sane == true -> ok; true -> ?V("\n~w-insane-~w-auth=~w ~s -> ~s ~w\n ~p\n", [?LINE, MyName, P_newprop#projection_v1.author_server, QQ_current, QQ_latest, Sane, get(react)]) end, Flap_latest = if is_record(Flap_latest0, flap_i) -> Flap_latest0; true -> @@ -1875,7 +1876,9 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest, %% construction errors, checksum error, etc. case Sane of _ when P_current#projection_v1.epoch_number == 0 -> - %% Epoch == 0 is reserved for first-time, just booting conditions. + %% Epoch == 0 is reserved for first-time, just booting conditions + %% or for when we got stuck in an insane projection transition + %% and were forced to the none projection to recover. 
?REACT({c100, ?LINE, [first_write]}), if Sane == true -> ok; true -> ?V("~w-insane-~w-~w:~w:~w,", [?LINE, MyName, P_newprop#projection_v1.epoch_number, P_newprop#projection_v1.upi, P_newprop#projection_v1.repairing]) end, %%% DELME!!! react_to_env_C110(P_latest, S); @@ -2001,7 +2004,8 @@ react_to_env_C110(P_latest, #ch_mgr{name=MyName, proj=P_current, inner_projection_or_self(P_latest)), UnanimousTime = ProjUnanimous, A = make_annotation(EpochID, UnanimousTime), - [A]; + io:format(user, "\nCONFIRM debug C110 ~w annotates ~W outer ~w\n", [MyName, EpochID, 5, P_latest#projection_v1.epoch_number]), + [A, {annotated_by,c110}]; false -> [] end, @@ -2021,10 +2025,10 @@ react_to_env_C110(P_latest, #ch_mgr{name=MyName, proj=P_current, ?REACT({c120, [{write, ok}]}), %% We very intentionally do *not* pass P_latest2 forward: %% we must avoid bloating the dbg2 list! - P_latest_perhaps_annotated = + P_latest2_perhaps_annotated = machi_projection:update_dbg2(P_latest, Extra1), - perhaps_verbose_c110(P_latest_perhaps_annotated, S), - react_to_env_C120(P_latest_perhaps_annotated, [], S); + perhaps_verbose_c110(P_latest2_perhaps_annotated, S), + react_to_env_C120(P_latest2_perhaps_annotated, [], S); {{error, bad_arg}, _Goo} -> ?REACT({c120, [{write, bad_arg}]}), @@ -2177,6 +2181,14 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, LastUpChange0 end, LastUpChange_diff = timer:now_diff(now(), LastUpChange) / 1000000, + ?REACT({calculate_flaps,?LINE,[{flap_start,FlapStart}, + {flap_count,FlapCount}, + {flap_last_up,FlapLastUp}, + {flap_counts_last,FlapCountsLast}, + {my_unique_prop_count,MyUniquePropCount}, + {current_up,CurrentUp}, + {last_up_change,LastUpChange}, + {last_up_change_diff,LastUpChange_diff}]}), %% TODO: Do we want to try to use BestP below to short-circuit %% calculation if we notice that the best private epoch # from @@ -2246,12 +2258,12 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, {N, _} when N >= MinQueueLen, 
P_latest_flap_start /= ?NOT_FLAPPING_START -> ?REACT({calculate_flaps,?LINE, - [{manifesto_clause,2}, + [{manifesto_clause,{start,2}}, {latest_epoch, P_latest#projection_v1.epoch_number}, {latest_flap_count,P_latest_Flap#flap_i.flap_count}]}), true; {N, [_]} when N >= MinQueueLen -> - ?REACT({calculate_flaps,?LINE,[{manifesto_clause,1}]}), + ?REACT({calculate_flaps,?LINE,[{manifesto_clause,{start,1}}]}), true; {_N, _} -> ?REACT({calculate_flaps,?LINE,[]}), @@ -2272,7 +2284,7 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, false; AmFlappingNow_p andalso CurrentUp /= FlapLastUp -> - ?REACT({calculate_flaps,?LINE,[{manifesto_clause,1}]}), + ?REACT({calculate_flaps,?LINE,[{manifesto_clause,{leave,1}}]}), true; AmFlappingNow_p -> P_latest_LastStartTime = @@ -2292,7 +2304,7 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, P_latest_LastStartTime /= ?NOT_FLAPPING_START -> ?REACT({calculate_flaps,?LINE, - [{manifesto_clause,2}, + [{manifesto_clause,{leave,2}}, {p_latest, machi_projection:make_summary(P_latest)}, {curtime, Curtime}, {flap_counts_last, FlapCountsLast}, @@ -2653,6 +2665,9 @@ poll_private_proj_is_upi_unanimous(#ch_mgr{consistency_mode=ap_mode} = S) -> S; poll_private_proj_is_upi_unanimous(#ch_mgr{consistency_mode=cp_mode, proj_unanimous={_,_,_}} = S) -> + %% #ch_mgr{name=MyName, proj=Proj} = S, + %% io:format(user, "\nCONFIRM debug ~w skip poll for inner ~w outer ~w\n", + %% [MyName, (inner_projection_or_self(Proj))#projection_v1.epoch_number, Proj#projection_v1.epoch_number]), S; poll_private_proj_is_upi_unanimous(#ch_mgr{consistency_mode=cp_mode, proj_unanimous=false, @@ -2682,34 +2697,55 @@ poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current, UPI = Proj_ios#projection_v1.upi, EpochID = machi_projection:make_epoch_id(Proj_ios), {Rs, S2} = read_latest_projection_call_only2(private, UPI, S), - Rs2 = [if is_record(P, projection_v1) -> - 
machi_projection:make_epoch_id(inner_projection_or_self(P)); + Rs2 = [if is_record(R, projection_v1) -> + machi_projection:make_epoch_id(inner_projection_or_self(R)); true -> - P - end || #projection_v1{}=P <- Rs], + R % probably {error, unwritten} + end || R <- Rs], case lists:usort(Rs2) of [EID] when EID == EpochID -> - Now = os:timestamp(), - Annotation = make_annotation(EpochID, Now), - NewDbg2 = [Annotation|P_current#projection_v1.dbg2], - NewProj = P_current#projection_v1{dbg2=NewDbg2}, - ProjStore = case get_projection_store_regname(MgrOpts) of - undefined -> - machi_flu_psup:make_proj_supname(MyName); - PStr -> - PStr - end, - #projection_v1{epoch_number=_EpochRep, - epoch_csum= <<_CSumRep:4/binary, _/binary>>, - upi=_UPIRep, - repairing=_RepairingRep} = - inner_projection_or_self(NewProj), - io:format(user, "\nCONFIRM epoch ~w ~w upi ~w rep ~w by ~w ~w\n", [_EpochRep, _CSumRep, _UPIRep, _RepairingRep, MyName, if P_current#projection_v1.inner == undefined -> outer; true -> {inner,{outer,P_current#projection_v1.epoch_number}} end]), - ok = machi_projection_store:write(ProjStore, private, NewProj), - %% Unwedge our FLU. - {ok, NotifyPid} = machi_projection_store:get_wedge_notify_pid(ProjStore), - _ = machi_flu1:update_wedge_state(NotifyPid, false, EpochID), - S2#ch_mgr{proj_unanimous=Now}; + %% We have a debugging problem, alas. It would be really great + %% if we could preserve the dbg2 info that's in the current + %% projection that's on disk. However, the full dbg2 list + %% with 'react' trace data isn't in the #ch_mgr.proj copy of + %% the projection. So, go read it from the store. + %% + %% But of course there's another small problem. P_current could + %% be the result of make_zerf(), which helps us "fast forward" to + %% a newer CP mode projection. And so what we just read in the + %% 'Rs' at the top of this function may be for a new epoch that + %% we've never seen before and therefore doesn't exist in our + %% local private projection store. 
But if it came from + %% make_zerf(), by definition it must be annotated, so don't try + %% to proceed any further. + ProxyPid = proxy_pid(MyName, S), + OuterEpoch = P_current#projection_v1.epoch_number, + case ?FLU_PC:read_projection(ProxyPid, private, OuterEpoch) of + {ok, P_currentFull} -> + Now = os:timestamp(), + Annotation = make_annotation(EpochID, Now), + NewDbg2 = [Annotation|P_currentFull#projection_v1.dbg2], + NewProj = P_currentFull#projection_v1{dbg2=NewDbg2}, + ProjStore = case get_projection_store_regname(MgrOpts) of + undefined -> + machi_flu_psup:make_proj_supname(MyName); + PStr -> + PStr + end, + #projection_v1{epoch_number=_EpochRep, + epoch_csum= <<_CSumRep:4/binary,_/binary>>, + upi=_UPIRep, + repairing=_RepairingRep} = + inner_projection_or_self(NewProj), + io:format(user, "\nCONFIRM epoch ~w ~w upi ~w rep ~w by ~w ~w\n", [_EpochRep, _CSumRep, _UPIRep, _RepairingRep, MyName, if P_current#projection_v1.inner == undefined -> outer; true -> {inner,{outer,P_current#projection_v1.epoch_number}} end]), + ok = machi_projection_store:write(ProjStore, private, NewProj), + %% Unwedge our FLU. + {ok, NotifyPid} = machi_projection_store:get_wedge_notify_pid(ProjStore), + _ = machi_flu1:update_wedge_state(NotifyPid, false, EpochID), + S2#ch_mgr{proj_unanimous=Now}; + _ -> + S2 + end; _Else -> %% io:format(user, "poll by ~w: want ~W got ~W\n", %% [MyName, EpochID, 6, _Else, 8]), @@ -3308,14 +3344,18 @@ perhaps_verbose_c110(P_latest2, S) -> {_,_,C} = os:timestamp(), MSec = trunc(C / 1000), {HH,MM,SS} = time(), - P_latest2x = P_latest2#projection_v1{dbg2=[]}, % limit verbose len. + Dbg2X = lists:keydelete(react, 1, + P_latest2#projection_v1.dbg2) ++ + [{is_annotated,is_annotated(P_latest2)}], + P_latest2x = P_latest2#projection_v1{dbg2=Dbg2X}, % limit verbose len. 
case inner_projection_exists(P_latest2) of false -> Last2 = get(last_verbose), Summ2 = machi_projection:make_summary(P_latest2x), case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of - true when Summ2 /= Last2 -> + true -> + %% true when Summ2 /= Last2 -> put(last_verbose, Summ2), ?V("\n~2..0w:~2..0w:~2..0w.~3..0w ~p uses plain: ~w \n", [HH,MM,SS,MSec, S#ch_mgr.name, Summ2]); @@ -3325,11 +3365,12 @@ perhaps_verbose_c110(P_latest2, S) -> true -> Last2 = get(last_verbose), P_inner = inner_projection_or_self(P_latest2), - P_innerx = P_inner#projection_v1{dbg2=[]}, % limit verbose len. + P_innerx = P_inner#projection_v1{dbg2=Dbg2X}, % limit verbose len. Summ2 = machi_projection:make_summary(P_innerx), case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of - true when Summ2 /= Last2 -> + true -> + %% true when Summ2 /= Last2 -> put(last_verbose, Summ2), ?V("\n~2..0w:~2..0w:~2..0w.~3..0w ~p uses inner: ~w (outer ~w auth ~w flap ~w)\n", [HH,MM,SS,MSec, S#ch_mgr.name, Summ2, P_latest2#projection_v1.epoch_number, P_latest2#projection_v1.author_server, P_latest2#projection_v1.flap]); diff --git a/src/machi_projection_store.erl b/src/machi_projection_store.erl index 126698d..23f1281 100644 --- a/src/machi_projection_store.erl +++ b/src/machi_projection_store.erl @@ -309,6 +309,7 @@ do_proj_write3(ProjType, #projection_v1{epoch_number=Epoch, io:format(user, "OUCH: on disk: ~w\n", [machi_projection:make_summary(binary_to_term(Bin))]), io:format(user, "OUCH: clobber: ~w\n", [machi_projection:make_summary(Proj)]), io:format(user, "OUCH: clobber: ~p\n", [Proj#projection_v1.dbg2]), + %% {{error, written}, S} {{error, written, CurEpoch, Epoch, CurCSum, CSum}, S} end; {error, enoent} -> diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index 63f2e2c..99de9ba 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -239,6 +239,14 @@ 
convergence_demo_testfun(NumFLUs, MgrOpts0) -> %% machi_partition_simulator:reset_thresholds(10, 50), %% io:format(user, "\nLet loose the dogs of war!\n", []), + %% [DoIt(20, 0, 0) || _ <- lists:seq(1,9)], + io:format(user, "\nVariations of puppies and dogs of war!\n", []), + [begin + machi_partition_simulator:reset_thresholds(90, 90), + DoIt(7, 0, 0), + machi_partition_simulator:always_these_partitions([]), + DoIt(7, 0, 0) + end || _ <- lists:seq(1, 3)], machi_partition_simulator:always_these_partitions([]), io:format(user, "\nPuppies for everyone!\n", []), [DoIt(20, 0, 0) || _ <- lists:seq(1,9)], @@ -371,7 +379,8 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) -> make_partition_list(All_list) -> [ - [{b,c}] + [{b,c}], + [{a,c},{b,c}] %% [{b,c}], %% [], %% [{c,d}], From 18aac6e489011f845183dc957102bb0bc5470588 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 28 Aug 2015 18:39:18 +0900 Subject: [PATCH 18/51] WIP: undo AmFlappingNow_p condition added at commit 3dfe5c2 --- src/machi_chain_manager1.erl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 9b5c150..ddb772c 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2238,9 +2238,11 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, ?REACT({calculate_flaps, ?LINE, [{queue_len, queue:len(H)}, {uniques, UniqueProposalSummaries}]}), P_latest_Flap = get_raw_flapping_i(P_latest), - AmFlappingNow_p = not (FlapStart == ?NOT_FLAPPING_START) - andalso - length(UniqueProposalSummaries) == 1, + AmFlappingNow_p = not (FlapStart == ?NOT_FLAPPING_START), + %% TODO: revisit why I added this extra length() + %% condition back on commit 3dfe5c2. 
+ %% andalso + %% length(UniqueProposalSummaries) == 1, P_latest_flap_start = case P_latest_Flap of undefined -> ?NOT_FLAPPING_START; From 9edd91f48e6098fe1ee756fe822628c1e9ac1c76 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 28 Aug 2015 20:06:09 +0900 Subject: [PATCH 19/51] Bugfixes for a->b column transition & flap dampening --- src/machi_chain_manager1.erl | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index ddb772c..5f94727 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1491,6 +1491,8 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, LatestAuthorDownP = a40_latest_author_down(P_latest, P_newprop, S) andalso P_latest#projection_v1.author_server /= MyName, + P_latestStable = make_comparison_stable(P_latest), + P_currentStable = make_comparison_stable(P_current), ?REACT({a40, ?LINE, [{latest_author, P_latest#projection_v1.author_server}, {author_is_down_p, LatestAuthorDownP}, @@ -1518,7 +1520,7 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, andalso (P_latest#projection_v1.epoch_number < P_current#projection_v1.epoch_number orelse - P_latest /= P_current) -> + P_latestStable /= P_currentStable) -> ?REACT({a40, ?LINE, [{latest_epoch, P_latest#projection_v1.epoch_number}, {current_epoch, P_current#projection_v1.epoch_number}, @@ -1706,6 +1708,9 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, %% compound predicate below. I'm yanking it out now. TODO re-study? 
#projection_v1{upi=P_newprop_upi_ooi, repairing=P_newprop_repairing_ooi} = inner_projection_or_self(P_newprop), + CurrentZerfInStatus = proplists:get_value(make_zerf, + P_current#projection_v1.dbg2), + CurrentEpoch = P_current#projection_v1.epoch_number, EnoughAreFlapping_and_IamBad_p = %% Ignore inner_projection_exists(P_current): We might need to %% shut up quickly (adopting a new P_current can take a long @@ -1720,14 +1725,21 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, (not lists:member(MyName, P_newprop_upi_ooi++P_newprop_repairing_ooi)) andalso %% My down lists are the same, i.e., no state change to announce - P_current#projection_v1.down == P_newprop#projection_v1.down, + %% Or if P_current is a CP mode result of zerf_in & valid (epoch #), + %% then this down list comparison should be skipped. + ((P_current#projection_v1.down == P_newprop#projection_v1.down) + orelse + (CurrentZerfInStatus == CurrentEpoch)), ?REACT({b10, ?LINE, [{0,EnoughAreFlapping_and_IamBad_p}, {1,inner_projection_exists(P_current)}, {2,inner_projection_exists(P_latest)}, {3,inner_projection_exists(P_newprop)}, {4,MyUniquePropCount}, {5,{MyName, P_newprop_AllHosedPlus}}, - {6,UnanimousLatestInnerNotRelevant_p}]}), + %% {6,UnanimousLatestInnerNotRelevant_p}, + {7,P_current#projection_v1.down}, + {8,P_newprop#projection_v1.down}, + {9,{CurrentZerfInStatus,CurrentEpoch}}]}), if EnoughAreFlapping_and_IamBad_p -> ?REACT({b10, ?LINE, []}), @@ -3211,8 +3223,9 @@ make_zerf(#projection_v1{epoch_number=OldEpochNum, make_zerf2(OldEpochNum, Up, MajoritySize, MyName, AllMembers, OldWitness_list, MembersDict, OldFlap, S) -> try - Proj = zerf_find_last_common(MajoritySize, Up, S), - Proj2 = Proj#projection_v1{flap=OldFlap, dbg2=[]}, + #projection_v1{epoch_number=Epoch} = Proj = + zerf_find_last_common(MajoritySize, Up, S), + Proj2 = Proj#projection_v1{flap=OldFlap, dbg2=[{make_zerf,Epoch}]}, %% io:format(user, "ZERF ~w\n",[machi_projection:make_summary(Proj2)]), Proj2 catch @@ 
-3427,3 +3440,6 @@ make_annotation(EpochID, Time) -> is_annotated(#projection_v1{dbg2=Dbg2}) -> proplists:get_value(private_proj_is_upi_unanimous, Dbg2, false). + +make_comparison_stable(P) -> + P#projection_v1{flap=undefined, dbg2=[]}. From 403cb5b7a633bc8d92d897275b0351a2d3f9d8ec Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 28 Aug 2015 21:13:54 +0900 Subject: [PATCH 20/51] WIP: improvements, but now flapping inner epoch keeps increasing {sigh} --- src/machi_chain_manager1.erl | 40 +++++++++++++++++---- test/machi_chain_manager1_converge_demo.erl | 21 +++++------ 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 5f94727..2522be8 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -821,9 +821,9 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg, MyName, AllMembers, OldWitness_list, MembersDict), Why = if NewUPI == [] -> - no_real_servers; + "No real servers in old upi are available now"; true -> - not_enough_witnesses + "Not enough witnesses are available now" end, P_none1 = P_none0#projection_v1{ epoch_number=OldEpochNum + 1, @@ -1285,7 +1285,16 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, {move_from_inner, MoveFromInnerToNorm_p}], ?REACT({a30, ?LINE, ClauseInfo}), MoveToNorm_p = MoveFromInnerToNorm_p orelse Kicker_p, - if MoveToNorm_p, CMode == cp_mode -> + if MoveToNorm_p, + P_newprop10#projection_v1.upi == [], + CMode == cp_mode -> + %% Too much weird stuff may have hapened while we were suffering + %% the flapping/asymmetric partition ... but we are now proposing + %% the none projection. We're going to use it so that we can + %% unwedge ourselve into the glorious none projection. + ?REACT({a30, ?LINE, []}), + react_to_env_C100(P_newprop10, P_latest, S); + MoveToNorm_p, CMode == cp_mode -> %% Too much weird stuff may have hapened while we were suffering %% the flapping/asymmetric partition. 
Fall back to the none %% projection as if we're restarting. @@ -1652,7 +1661,6 @@ react_to_env_A49(_P_latest, FinalProps, #ch_mgr{name=MyName, members_dict=MembersDict} = P_current, P_none = make_none_projection(MyName, All_list, Witness_list, MembersDict), -io:format(user, "Debug A49: ~w forced to none\n\n ~P", [MyName, get(react), 120]), react_to_env_A50(P_none, FinalProps, set_proj(S, P_none)). react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) -> @@ -1660,7 +1668,8 @@ react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) -> ?REACT({a50, ?LINE, [{current_epoch, P_current#projection_v1.epoch_number}, {latest_epoch, P_latest#projection_v1.epoch_number}, {final_props, FinalProps}]}), - %% if S#ch_mgr.name == b; S#ch_mgr.name == c -> io:format(user, "A50: ~p: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, + V = case file:read_file("/tmp/moomoo") of {ok, _} -> true; _ -> false end, + if V andalso (S#ch_mgr.name == b orelse S#ch_mgr.name == c) -> io:format(user, "A50: ~p: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, %% io:format(user, "Debug A50: ~w P_current outer ~w ~w ~w\n", [S#ch_mgr.name, P_current#projection_v1.epoch_number,P_current#projection_v1.upi,P_current#projection_v1.repairing]), {{no_change, FinalProps, P_current#projection_v1.epoch_number}, S}. @@ -2282,7 +2291,10 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, {_N, _} -> ?REACT({calculate_flaps,?LINE,[]}), false - end, + end + andalso + %% If P_newprop is the none projection, do not start flapping. + P_newprop#projection_v1.upi /= [], LeaveFlapping_p = if LastUpChange_diff < 3.0 -> @@ -2297,6 +2309,10 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, ?REACT({calculate_flaps,?LINE,[]}), false; AmFlappingNow_p andalso + P_newprop#projection_v1.upi == [] -> + %% P_newprop is the none projection, stop flapping. 
+ true; + AmFlappingNow_p andalso CurrentUp /= FlapLastUp -> ?REACT({calculate_flaps,?LINE,[{manifesto_clause,{leave,1}}]}), true; @@ -3249,6 +3265,18 @@ make_zerf2(OldEpochNum, Up, MajoritySize, MyName, AllMembers, OldWitness_list, dbg2=[zerf_all]}), %% io:format(user, "ZERF ~w\n",[machi_projection:make_summary(P2)]), P2; + throw:{zerf,{not_enough_up,Up2,_All2}} -> + %% Make it appear like nobody is up now: we'll have to + %% wait until the Up list changes so that + %% zerf_find_last_common() can confirm a common stable + %% last stable epoch. + + P = make_none_projection(MyName, AllMembers, OldWitness_list, + MembersDict), + machi_projection:update_checksum( + P#projection_v1{epoch_number=OldEpochNum, + mode=cp_mode, + dbg2=[zerf_none, {up,Up2},{maj,MajoritySize}]}); _X:_Y -> throw({zerf, {damn_exception, Up, _X, _Y, erlang:get_stacktrace()}}) end. diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index 99de9ba..7552ea3 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -240,13 +240,13 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) -> %% machi_partition_simulator:reset_thresholds(10, 50), %% io:format(user, "\nLet loose the dogs of war!\n", []), %% [DoIt(20, 0, 0) || _ <- lists:seq(1,9)], - io:format(user, "\nVariations of puppies and dogs of war!\n", []), - [begin - machi_partition_simulator:reset_thresholds(90, 90), - DoIt(7, 0, 0), - machi_partition_simulator:always_these_partitions([]), - DoIt(7, 0, 0) - end || _ <- lists:seq(1, 3)], + %% %% io:format(user, "\nVariations of puppies and dogs of war!\n", []), + %% %% [begin + %% %% machi_partition_simulator:reset_thresholds(90, 90), + %% %% DoIt(7, 0, 0), + %% %% machi_partition_simulator:always_these_partitions([]), + %% %% DoIt(7, 0, 0) + %% %% end || _ <- lists:seq(1, 3)], machi_partition_simulator:always_these_partitions([]), io:format(user, "\nPuppies for everyone!\n", []), [DoIt(20, 0, 0) || _ 
<- lists:seq(1,9)], @@ -256,6 +256,7 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) -> MaxIters = NumFLUs * (NumFLUs + 1) * 6, [begin machi_partition_simulator:always_these_partitions(Partition), +if Partition==[{a,c},{b,c}] -> io:format(user, "\nSET SET SET debug, yo!\n", []), file:write_file("/tmp/moomoo", []); true -> ok end, io:format(user, "\nSET partitions = ~w (~w of ~w) at ~w\n", [Partition, Count, length(AllPs), time()]), true = lists:foldl( @@ -641,7 +642,7 @@ private_projections_are_stable(Namez, PollFunc) -> true end, - io:format(user, "\nPriv1 ~P\n1==2 ~w ap_disjoint ~w u_all_peers ~w cp_mode_agree ~w\n", [lists:sort(Private1), 20, Private1 == Private2, AP_mode_disjoint_test_p, Unanimous_with_all_peers_p, CP_mode_agree_test_p]), + io:format(user, "\nPriv1 ~p\nPriv2 ~p\n1==2 ~w ap_disjoint ~w u_all_peers ~w cp_mode_agree ~w\n", [lists:sort(Private1), lists:sort(Private2), Private1 == Private2, AP_mode_disjoint_test_p, Unanimous_with_all_peers_p, CP_mode_agree_test_p]), Private1 == Private2 andalso AP_mode_disjoint_test_p andalso ( @@ -661,12 +662,12 @@ private_projections_are_stable(Namez, PollFunc) -> get_latest_inner_proj_summ(FLU) -> {ok, Proj} = ?FLU_PC:read_latest_projection(FLU, private), - #projection_v1{epoch_number=E, epoch_csum=CSum, + #projection_v1{epoch_number=E, epoch_csum= <<CSum4:4/binary, _/binary>>, upi=UPI, repairing=Repairing, witnesses=Witnesses, down=Down} = machi_chain_manager1:inner_projection_or_self(Proj), Inner_p = machi_chain_manager1:inner_projection_exists(Proj), - EpochID = {E, CSum}, + EpochID = {E, CSum4}, {EpochID, UPI, Repairing, Down, Witnesses, Inner_p}. random_sort(L) -> From 582f9e5eab77e218041832f188f2ba375049381d Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 28 Aug 2015 23:08:38 +0900 Subject: [PATCH 21/51] Bugfix: fix effectively-none-projection transition to C100.
Still buggy --- src/machi_chain_manager1.erl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 2522be8..ef8677a 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1293,7 +1293,9 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, %% the none projection. We're going to use it so that we can %% unwedge ourselve into the glorious none projection. ?REACT({a30, ?LINE, []}), - react_to_env_C100(P_newprop10, P_latest, S); + %% TODO: It seems a bit crazy, but this duplicates part/much + %% of what state C103 does? Go to C103 instead? + react_to_env_C100(P_newprop10, P_newprop10, S); MoveToNorm_p, CMode == cp_mode -> %% Too much weird stuff may have hapened while we were suffering %% the flapping/asymmetric partition. Fall back to the none From af0ade98405784f87235e80ed3f30c29dd62a637 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 29 Aug 2015 12:32:30 +0900 Subject: [PATCH 22/51] Bugfix: projection checksum fix in A30 --- src/machi_chain_manager1.erl | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index ef8677a..051352e 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1226,20 +1226,21 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, {flap_limit, FlapLimit}]}), {P_newprop3, S3} end, - ?REACT({a30, ?LINE, [{newprop10, machi_projection:make_summary(P_newprop10)}]}), + P_newprop11 = machi_projection:update_checksum(P_newprop10), + ?REACT({a30, ?LINE, [{newprop11, machi_projection:make_summary(P_newprop11)}]}), %% Here's a more common reason for moving from inner projection to %% a normal projection: the old proj has an inner but the newprop %% does not. 
MoveFromInnerToNorm_p = case {inner_projection_exists(P_current), - inner_projection_exists(P_newprop10)} of + inner_projection_exists(P_newprop11)} of {true, false} -> true; {_, _} -> false end, %% If P_current says that we believe that we're currently flapping, - %% and if P_newprop10 says that we're no longer flapping, then we + %% and if P_newprop11 says that we're no longer flapping, then we %% really ought to stop flapping, right. %% %% Not quite so simple.... @@ -1286,7 +1287,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, ?REACT({a30, ?LINE, ClauseInfo}), MoveToNorm_p = MoveFromInnerToNorm_p orelse Kicker_p, if MoveToNorm_p, - P_newprop10#projection_v1.upi == [], + P_newprop11#projection_v1.upi == [], CMode == cp_mode -> %% Too much weird stuff may have hapened while we were suffering %% the flapping/asymmetric partition ... but we are now proposing @@ -1295,7 +1296,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, ?REACT({a30, ?LINE, []}), %% TODO: It seems a bit crazy, but this duplicates part/much %% of what state C103 does? Go to C103 instead? - react_to_env_C100(P_newprop10, P_newprop10, S); + react_to_env_C100(P_newprop11, P_newprop11, S); MoveToNorm_p, CMode == cp_mode -> %% Too much weird stuff may have hapened while we were suffering %% the flapping/asymmetric partition. Fall back to the none @@ -1305,7 +1306,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, MoveToNorm_p, CMode == ap_mode -> %% Move from inner projection to outer. P_inner2A = inner_projection_or_self(P_current), - ResetEpoch = P_newprop10#projection_v1.epoch_number, + ResetEpoch = P_newprop11#projection_v1.epoch_number, ResetAuthor = case P_current#projection_v1.upi of [] -> %% Drat, fall back to current's author. 
@@ -1345,7 +1346,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, react_to_env_A40(Retries, P_o, P_latest, LatestUnanimousP, S_o); true -> ?REACT({a30, ?LINE, []}), - react_to_env_A40(Retries, P_newprop10, P_latest, + react_to_env_A40(Retries, P_newprop11, P_latest, LatestUnanimousP, S10) end. From f21fcdd7be57f80e228b52dc2f379baa67051494 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 29 Aug 2015 13:13:23 +0900 Subject: [PATCH 23/51] Bugfix: none proj must flap, undo previous commits, which may cause mess later --- src/machi_chain_manager1.erl | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 051352e..b34d268 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2294,10 +2294,9 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, {_N, _} -> ?REACT({calculate_flaps,?LINE,[]}), false - end - andalso - %% If P_newprop is the none projection, do not start flapping. - P_newprop#projection_v1.upi /= [], + end, + %% TODO: 2015-08-29: Grr, we really need CP cases of none projection + %% flapping to propagate problem_with information. LeaveFlapping_p = if LastUpChange_diff < 3.0 -> @@ -2311,10 +2310,13 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, %% that intent. ?REACT({calculate_flaps,?LINE,[]}), false; - AmFlappingNow_p andalso - P_newprop#projection_v1.upi == [] -> - %% P_newprop is the none projection, stop flapping. - true; + %% TODO: 2015-08-29: Grr, we really need CP cases of none projection + %% flapping to propagate problem_with information. + %% AmFlappingNow_p andalso + %% P_newprop#projection_v1.upi == [] -> + %% %% P_newprop is the none projection, stop flapping. 
+ %% ?REACT({calculate_flaps,?LINE,[]}), + %% true; AmFlappingNow_p andalso CurrentUp /= FlapLastUp -> ?REACT({calculate_flaps,?LINE,[{manifesto_clause,{leave,1}}]}), From 6d9526b3796680181d1045e34f656f96d07f04fb Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 29 Aug 2015 13:13:31 +0900 Subject: [PATCH 24/51] Add more ?REACT() --- src/machi_chain_manager1.erl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index b34d268..4b904dc 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2378,11 +2378,16 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, HosedAnnotations)), AllHosed = lists:usort(HosedAnnotations ++ Magic), %%io:format(user, "ALLHOSED ~p: ~p ~w\n", [MyName, Magic, HosedAnnotations]), + ?REACT({calculate_flaps,?LINE,[{new_flap_count,NewFlapCount}, + {hosed_annotations,HosedAnnotations}, + {magic,Magic}, + {all_hosed,AllHosed}]}), AllHosed; not AmFlapping_p -> NewFlapCount = 0, NewFlapStart = ?NOT_FLAPPING_START, AllFlapCounts = [], + ?REACT({calculate_flaps,?LINE,[]}), AllHosed = [] end, @@ -2404,6 +2409,14 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, %% TODO: 2015-03-04: I'm growing increasingly suspicious of %% the 'runenv' variable that's threaded through all this code. %% It isn't doing what I'd originally intended. Fix it. 
+ ?REACT({calculate_flaps,?LINE,[{flapping_i,FlappingI}, + {am_flapping_p,AmFlapping_p}, + {ch_mgr_updates,follow}, + {flap_count,NewFlapCount}, + {flap_start,NewFlapStart}, + {flap_last_up,CurrentUp}, + {flap_last_up_change,LastUpChange}, + {flap_counts_last,AllFlapCounts}]}), S2 = S#ch_mgr{flap_count=NewFlapCount, flap_start=NewFlapStart, flap_last_up=CurrentUp, flap_last_up_change=LastUpChange, flap_counts_last=AllFlapCounts, From c9340a662dab89ba573d88cb0834453489c3c3ac Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 29 Aug 2015 15:06:57 +0900 Subject: [PATCH 25/51] Bugfix: force stable creation_time on inner none proj --- src/machi_chain_manager1.erl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 4b904dc..7345ec5 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -827,11 +827,12 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg, end, P_none1 = P_none0#projection_v1{ epoch_number=OldEpochNum + 1, + %% Stable creation time! 
+ creation_time={1,2,3}, dbg=[{none_projection,true}, {up0, Up0}, {up, Up}, {all_hosed, AllHosed}, - {oldepoch, OldEpochNum}, {oldupi, OldUPI_list}, {newupi, NewUPI_list}, {newupi3, NewUPI_list3}, From 85eb3567a3d2e6643a31337f763ff1646f4122a7 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 29 Aug 2015 15:57:23 +0900 Subject: [PATCH 26/51] Bugfix: convergence property for CP mode --- test/machi_chain_manager1_converge_demo.erl | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index 7552ea3..c860bf4 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -634,9 +634,16 @@ private_projections_are_stable(Namez, PollFunc) -> io:format(user, "Priv2: EID ~W e ~w u ~w\n", [EpochID, 7, ExpectedFLUs, UsingFLUs]), ordsets:is_subset(ordsets:from_list(ExpectedFLUs), ordsets:from_list(UsingFLUs)); - _Else -> - io:format(user, "Priv2: Else ~p\n", [_Else]), - false + [{1=_Count,_EpochID}|_] -> + %% Our list is sorted & reversed, so 1=_Count + %% is biggest. If everyone is using the none proj, + %% then we're OK. + Private2None = [X || {_,{_,[],[],_,_,_}}=X <- Private2], + Private2 == Private2None; + Else -> + %% This is bad: we have a count that's less than + %% FullMajority but greater than 1. 
+ throw({minority_error,Else,EpochIDs,private2,Private2}) end; CMode == ap_mode -> true From dc5ae4047a3466ab9994f04e67c05643265836c5 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 29 Aug 2015 18:01:13 +0900 Subject: [PATCH 27/51] Bugfix: react_to_env_A30 inner->norm fix, make_zerf() none proj derp fix --- src/machi_chain_manager1.erl | 59 +++++++++++++-------- test/machi_chain_manager1_converge_demo.erl | 1 - 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 7345ec5..8aa7434 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -830,6 +830,7 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg, %% Stable creation time! creation_time={1,2,3}, dbg=[{none_projection,true}, + {creation_time,os:timestamp()}, {up0, Up0}, {up, Up}, {all_hosed, AllHosed}, @@ -1297,13 +1298,31 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, ?REACT({a30, ?LINE, []}), %% TODO: It seems a bit crazy, but this duplicates part/much %% of what state C103 does? Go to C103 instead? - react_to_env_C100(P_newprop11, P_newprop11, S); + P_newprop12 = machi_projection:update_checksum( + P_newprop11#projection_v1{epoch_number=NewEpoch}), + react_to_env_C100(P_newprop12, P_newprop11, S); MoveToNorm_p, CMode == cp_mode -> %% Too much weird stuff may have hapened while we were suffering - %% the flapping/asymmetric partition. Fall back to the none - %% projection as if we're restarting. - ?REACT({a30, ?LINE, [{move_to_norm, MoveToNorm_p}]}), - react_to_env_A49(P_latest, [], S10); + %% the flapping/asymmetric partition. + %% The MoveToNorm_p calculation doesn't take all CP mode + %% behavior into account, so finish the job here. + %% + %% The make_zerf() function will annotate the dbg2 list with + %% {make_zerf,Epoch} where Epoch should equal the epoch_number. 
+ %% If annotated, then we have already passed through this if + %% clause in a prior iteration, and therefore we should go to A40 + %% now. If not annotated, go to A49 so that we *will* trigger a + %% make_zerf() on our next iteration. + case proplists:get_value(make_zerf, P_current#projection_v1.dbg2) of + Z_epoch when Z_epoch == P_current#projection_v1.epoch_number -> + ?REACT({a30, ?LINE, []}), + react_to_env_A40(Retries, P_newprop11, P_latest, + LatestUnanimousP, S10); + Z_epoch -> + ?REACT({a30, ?LINE, [{z_epoch,Z_epoch}]}), + %% Fall back to the none projection as if we're restarting. + react_to_env_A49(P_latest, [], S10) + end; MoveToNorm_p, CMode == ap_mode -> %% Move from inner projection to outer. P_inner2A = inner_projection_or_self(P_current), @@ -1672,9 +1691,6 @@ react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) -> ?REACT({a50, ?LINE, [{current_epoch, P_current#projection_v1.epoch_number}, {latest_epoch, P_latest#projection_v1.epoch_number}, {final_props, FinalProps}]}), - V = case file:read_file("/tmp/moomoo") of {ok, _} -> true; _ -> false end, - if V andalso (S#ch_mgr.name == b orelse S#ch_mgr.name == c) -> io:format(user, "A50: ~p: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, -%% io:format(user, "Debug A50: ~w P_current outer ~w ~w ~w\n", [S#ch_mgr.name, P_current#projection_v1.epoch_number,P_current#projection_v1.upi,P_current#projection_v1.repairing]), {{no_change, FinalProps, P_current#projection_v1.epoch_number}, S}. 
react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, @@ -1997,8 +2013,6 @@ react_to_env_C103(#projection_v1{epoch_number=_Epoch_newprop} = _P_newprop, #projection_v1{witnesses=Witness_list, members_dict=MembersDict} = P_current, P_none0 = make_none_projection(MyName, All_list, Witness_list, MembersDict), - %% P_none1 = P_none0#projection_v1{epoch_number=erlang:max(Epoch_newprop, - %% Epoch_latest), P_none1 = P_none0#projection_v1{epoch_number=Epoch_latest, flap=Flap, dbg=[{none_projection,true}]}, @@ -3249,7 +3263,18 @@ make_zerf(#projection_v1{epoch_number=OldEpochNum, MajoritySize = full_majority_size(AllMembers), case length(Up) >= MajoritySize of false -> - throw({zerf, {not_enough_up, Up, AllMembers}}); + %% Make it appear like nobody is up now: we'll have to + %% wait until the Up list changes so that + %% zerf_find_last_common() can confirm a common stable + %% last stable epoch. + + P = make_none_projection(MyName, AllMembers, OldWitness_list, + MembersDict), + machi_projection:update_checksum( + P#projection_v1{epoch_number=OldEpochNum, + mode=cp_mode, + flap=OldFlap, + dbg2=[zerf_none,{up,Up},{maj,MajoritySize}]}); true -> make_zerf2(OldEpochNum, Up, MajoritySize, MyName, AllMembers, OldWitness_list, MembersDict, OldFlap, S) @@ -3284,18 +3309,6 @@ make_zerf2(OldEpochNum, Up, MajoritySize, MyName, AllMembers, OldWitness_list, dbg2=[zerf_all]}), %% io:format(user, "ZERF ~w\n",[machi_projection:make_summary(P2)]), P2; - throw:{zerf,{not_enough_up,Up2,_All2}} -> - %% Make it appear like nobody is up now: we'll have to - %% wait until the Up list changes so that - %% zerf_find_last_common() can confirm a common stable - %% last stable epoch. - - P = make_none_projection(MyName, AllMembers, OldWitness_list, - MembersDict), - machi_projection:update_checksum( - P#projection_v1{epoch_number=OldEpochNum, - mode=cp_mode, - dbg2=[zerf_none, {up,Up2},{maj,MajoritySize}]}); _X:_Y -> throw({zerf, {damn_exception, Up, _X, _Y, erlang:get_stacktrace()}}) end. 
diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index c860bf4..89eddea 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -256,7 +256,6 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) -> MaxIters = NumFLUs * (NumFLUs + 1) * 6, [begin machi_partition_simulator:always_these_partitions(Partition), -if Partition==[{a,c},{b,c}] -> io:format(user, "\nSET SET SET debug, yo!\n", []), file:write_file("/tmp/moomoo", []); true -> ok end, io:format(user, "\nSET partitions = ~w (~w of ~w) at ~w\n", [Partition, Count, length(AllPs), time()]), true = lists:foldl( From 6b84cd6e6a07753e35ff6e347ad78fb4e4f6490f Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 29 Aug 2015 18:30:53 +0900 Subject: [PATCH 28/51] Reduce poll sleep time when running with partition simulator --- src/machi_chain_manager1.erl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 8aa7434..be72cd6 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2745,8 +2745,14 @@ poll_private_proj_is_upi_unanimous(#ch_mgr{consistency_mode=cp_mode, poll_private_proj_is_upi_unanimous_sleep(Count, S) when Count > 2 -> S; -poll_private_proj_is_upi_unanimous_sleep(Count, S) -> - timer:sleep((Count * Count) * 50), +poll_private_proj_is_upi_unanimous_sleep(Count, #ch_mgr{runenv=RunEnv}=S) -> + Denom = case proplists:get_value(use_partition_simulator, RunEnv, false) of + true -> + 20; + _ -> + 1 + end, + timer:sleep(((Count * Count) * 50) div Denom), case poll_private_proj_is_upi_unanimous3(S) of #ch_mgr{proj_unanimous=false} = S2 -> poll_private_proj_is_upi_unanimous_sleep(Count + 1, S2); From ee19a0856b368829ef3c939e31e93097ce9695a7 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 29 Aug 2015 19:59:46 +0900 Subject: [PATCH 29/51] WIP: justincase --- 
src/machi_chain_manager1.erl | 32 ++++++++++++++------- test/machi_chain_manager1_converge_demo.erl | 3 ++ 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index be72cd6..7c4de04 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -830,7 +830,6 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg, %% Stable creation time! creation_time={1,2,3}, dbg=[{none_projection,true}, - {creation_time,os:timestamp()}, {up0, Up0}, {up, Up}, {all_hosed, AllHosed}, @@ -840,7 +839,9 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg, {tent_upi, TentativeUPI}, {new_upi, NewUPI}, {up_witnesses, UpWitnesses}, - {why_none, Why}]}, + {why_none, Why}], + dbg2=[ + {creation_time,os:timestamp()}]}, machi_projection:update_checksum(P_none1) end end; @@ -1313,13 +1314,13 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, %% clause in a prior iteration, and therefore we should go to A40 %% now. If not annotated, go to A49 so that we *will* trigger a %% make_zerf() on our next iteration. - case proplists:get_value(make_zerf, P_current#projection_v1.dbg2) of - Z_epoch when Z_epoch == P_current#projection_v1.epoch_number -> + case has_make_zerf_annotation(P_current) of + true -> ?REACT({a30, ?LINE, []}), react_to_env_A40(Retries, P_newprop11, P_latest, LatestUnanimousP, S10); - Z_epoch -> - ?REACT({a30, ?LINE, [{z_epoch,Z_epoch}]}), + false -> + ?REACT({a30, ?LINE, []}), %% Fall back to the none projection as if we're restarting. 
react_to_env_A49(P_latest, [], S10) end; @@ -1691,6 +1692,10 @@ react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) -> ?REACT({a50, ?LINE, [{current_epoch, P_current#projection_v1.epoch_number}, {latest_epoch, P_latest#projection_v1.epoch_number}, {final_props, FinalProps}]}), + V = case file:read_file("/tmp/moomoo") of {ok, _} -> true; _ -> false end, + if V,S#ch_mgr.name == b -> io:format(user, "A50: ~p: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, + %% if V andalso (S#ch_mgr.name == b orelse S#ch_mgr.name == c) -> io:format(user, "A50: ~p: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, +%% io:format(user, "Debug A50: ~w P_current outer ~w ~w ~w\n", [S#ch_mgr.name, P_current#projection_v1.epoch_number,P_current#projection_v1.upi,P_current#projection_v1.repairing]), {{no_change, FinalProps, P_current#projection_v1.epoch_number}, S}. react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, @@ -1737,8 +1742,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, %% compound predicate below. I'm yanking it out now. TODO re-study? #projection_v1{upi=P_newprop_upi_ooi, repairing=P_newprop_repairing_ooi} = inner_projection_or_self(P_newprop), - CurrentZerfInStatus = proplists:get_value(make_zerf, - P_current#projection_v1.dbg2), + CurrentZerfInStatus_p = has_make_zerf_annotation(P_current), CurrentEpoch = P_current#projection_v1.epoch_number, EnoughAreFlapping_and_IamBad_p = %% Ignore inner_projection_exists(P_current): We might need to @@ -1758,7 +1762,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, %% then this down list comparison should be skipped. 
((P_current#projection_v1.down == P_newprop#projection_v1.down) orelse - (CurrentZerfInStatus == CurrentEpoch)), + CurrentZerfInStatus_p), ?REACT({b10, ?LINE, [{0,EnoughAreFlapping_and_IamBad_p}, {1,inner_projection_exists(P_current)}, {2,inner_projection_exists(P_latest)}, @@ -1768,7 +1772,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, %% {6,UnanimousLatestInnerNotRelevant_p}, {7,P_current#projection_v1.down}, {8,P_newprop#projection_v1.down}, - {9,{CurrentZerfInStatus,CurrentEpoch}}]}), + {9,{CurrentZerfInStatus_p,CurrentEpoch}}]}), if EnoughAreFlapping_and_IamBad_p -> ?REACT({b10, ?LINE, []}), @@ -3509,3 +3513,11 @@ is_annotated(#projection_v1{dbg2=Dbg2}) -> make_comparison_stable(P) -> P#projection_v1{flap=undefined, dbg2=[]}. + +has_make_zerf_annotation(P) -> + case proplists:get_value(make_zerf, P#projection_v1.dbg2) of + Z_epoch when Z_epoch == P#projection_v1.epoch_number -> + true; + _ -> + false + end. diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index 89eddea..6e2ee4d 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -254,8 +254,10 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) -> AllPs = make_partition_list(All_list), PartitionCounts = lists:zip(AllPs, lists:seq(1, length(AllPs))), MaxIters = NumFLUs * (NumFLUs + 1) * 6, +os:cmd("rm -f /tmp/moomoo"), [begin machi_partition_simulator:always_these_partitions(Partition), +%% if Partition==[] -> os:cmd("touch /tmp/moomoo"); true -> ok end, io:format(user, "\nSET partitions = ~w (~w of ~w) at ~w\n", [Partition, Count, length(AllPs), time()]), true = lists:foldl( @@ -316,6 +318,7 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) -> end || {Partition, Count} <- PartitionCounts ], +os:cmd("touch /tmp/moomoo"), io:format(user, "\nSET partitions = []\n", []), io:format(user, "We should see convergence to 1 correct chain.\n", []), machi_partition_simulator:no_partitions(), From 
94394d3429b56f3324137b16adb0e5bf2951f55f Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 29 Aug 2015 21:36:53 +0900 Subject: [PATCH 30/51] Bugfix: allow none proj to re-emerge from flapping (more) See comments added in this commit at A40. So far, I've been doing CP mode testing with a handful of (very useful) network partition combinations using: machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]). Next steps: * Expand number & types of partitions * Expand to chain lengths of 5 and beyond --- src/machi_chain_manager1.erl | 21 +++++++++++++++------ test/machi_chain_manager1_converge_demo.erl | 3 --- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 7c4de04..f19e04b 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1641,6 +1641,7 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, true -> ?REACT({a40, ?LINE, [true]}), + CurrentZerfInStatus_p = has_make_zerf_annotation(P_current), GoTo50_p = case inner_projection_exists(P_current) andalso inner_projection_exists(P_newprop) andalso @@ -1664,8 +1665,20 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, true end; false -> - ?REACT({a40, ?LINE, []}), - true + ?REACT({a40, ?LINE, [{currentzerfinstatus_p,CurrentZerfInStatus_p}]}), + if CurrentZerfInStatus_p andalso + P_newprop#projection_v1.upi /= [] -> + %% One scenario here: we are waking up after + %% a slumber with the none proj and need to + %% send P_newprop (which has non/empty UPI) + %% through the process to continue chain + %% recovery. 
+ ?REACT({a40, ?LINE, []}), + false; + true -> + ?REACT({a40, ?LINE, []}), + true + end end, if GoTo50_p -> ?REACT({a40, ?LINE, []}), @@ -1692,10 +1705,6 @@ react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) -> ?REACT({a50, ?LINE, [{current_epoch, P_current#projection_v1.epoch_number}, {latest_epoch, P_latest#projection_v1.epoch_number}, {final_props, FinalProps}]}), - V = case file:read_file("/tmp/moomoo") of {ok, _} -> true; _ -> false end, - if V,S#ch_mgr.name == b -> io:format(user, "A50: ~p: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, - %% if V andalso (S#ch_mgr.name == b orelse S#ch_mgr.name == c) -> io:format(user, "A50: ~p: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, -%% io:format(user, "Debug A50: ~w P_current outer ~w ~w ~w\n", [S#ch_mgr.name, P_current#projection_v1.epoch_number,P_current#projection_v1.upi,P_current#projection_v1.repairing]), {{no_change, FinalProps, P_current#projection_v1.epoch_number}, S}. react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index 6e2ee4d..89eddea 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -254,10 +254,8 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) -> AllPs = make_partition_list(All_list), PartitionCounts = lists:zip(AllPs, lists:seq(1, length(AllPs))), MaxIters = NumFLUs * (NumFLUs + 1) * 6, -os:cmd("rm -f /tmp/moomoo"), [begin machi_partition_simulator:always_these_partitions(Partition), -%% if Partition==[] -> os:cmd("touch /tmp/moomoo"); true -> ok end, io:format(user, "\nSET partitions = ~w (~w of ~w) at ~w\n", [Partition, Count, length(AllPs), time()]), true = lists:foldl( @@ -318,7 +316,6 @@ os:cmd("rm -f /tmp/moomoo"), end || {Partition, Count} <- PartitionCounts ], -os:cmd("touch /tmp/moomoo"), io:format(user, "\nSET partitions = []\n", []), io:format(user, "We should see convergence to 1 correct 
chain.\n", []), machi_partition_simulator:no_partitions(), From 5c8b255da9f7de96b688af2740c7c67efec1b700 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 29 Aug 2015 22:40:18 +0900 Subject: [PATCH 31/51] Bugfix: first new CP experiments with chain len=5 --- src/machi_chain_manager1.erl | 21 +++++++++++++++------ src/machi_projection.erl | 7 +++++++ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index f19e04b..a6462ee 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1171,7 +1171,7 @@ react_to_env_A29(Retries, P_latest, LatestUnanimousP, ReadExtra, ?REACT({a29, ?LINE, [{zerf_filler, true}, {zerf_in, machi_projection:make_summary(Zerf)}]}), - io:format(user, "zerf_in: A29: ~p: ~w\n\t~p\n", [MyName, machi_projection:make_summary(Zerf), get(yyy_hack)]), + %% io:format(user, "zerf_in: A29: ~p: ~w\n\t~p\n", [MyName, machi_projection:make_summary(Zerf), get(yyy_hack)]), P_current2 = Zerf#projection_v1{ flap=P_current#projection_v1.flap}, S2 = set_proj(S, P_current2), @@ -1422,9 +1422,18 @@ a30_make_inner_projection(P_current, P_newprop3, P_latest, Up, {upi_latest_i, UPI_latest_i}, {repairing_latest_i,Repairing_latest_i}]}), LatestSameEnough_p = - (UPI_latest_i ++ Repairing_latest_i) == - (UPI_current_x ++ Repairing_current_x) - andalso + %% Experiment: With chain length=5, this check is a pain, + %% e.g. when make_zerf() verifies last + %% history of [c,d,e] *and no inner*, and now + %% others have proposed *with an inner* with + %% [a/witness,d,e] and bigger epoch. So, the + %% experiment is that if we choose something + %% insane here, other steps will figure that + %% out and do something safe instead. 
+ %% + %% ({UPI_latest_i, Repairing_latest_i} == + %% {UPI_current_x, Repairing_current_x}) + %% andalso Epoch_latest_i >= P_current_ios#projection_v1.epoch_number, CurrentHasInner_and_LatestIsDisjoint_p = P_current_has_inner_p @@ -1777,8 +1786,8 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, {2,inner_projection_exists(P_latest)}, {3,inner_projection_exists(P_newprop)}, {4,MyUniquePropCount}, - {5,{MyName, P_newprop_AllHosedPlus}}, - %% {6,UnanimousLatestInnerNotRelevant_p}, + {5,S#ch_mgr.flap_count}, + {6,{MyName, P_newprop_AllHosedPlus}}, {7,P_current#projection_v1.down}, {8,P_newprop#projection_v1.down}, {9,{CurrentZerfInStatus_p,CurrentEpoch}}]}), diff --git a/src/machi_projection.erl b/src/machi_projection.erl index 08b20ab..10ded7c 100644 --- a/src/machi_projection.erl +++ b/src/machi_projection.erl @@ -121,8 +121,15 @@ update_checksum(P) -> %% accidentally encourage someone else sometime later) by %% replacing flapping information with our own local view at %% this instant in time. + %% * creation_time: With CP mode & inner projections, it's damn annoying + %% to have to copy this around 100% correctly. {sigh} + %% That's a negative state of the code. However, there + %% isn't a safety violation if the creation_time is + %% altered for any reason: it's there only for human + %% benefit for debugging. CSum = crypto:hash(sha, term_to_binary(P#projection_v1{epoch_csum= <<>>, + creation_time=undefined, flap=undefined, dbg2=[]})), P#projection_v1{epoch_csum=CSum}. 
From 53d865b24738da76f49b17ee6d3e4192f35b7454 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 29 Aug 2015 23:42:47 +0900 Subject: [PATCH 32/51] Bugfix: serious derp fix for A30's inner->outer --- src/machi_chain_manager1.erl | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index a6462ee..e1285b3 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1289,6 +1289,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, {move_from_inner, MoveFromInnerToNorm_p}], ?REACT({a30, ?LINE, ClauseInfo}), MoveToNorm_p = MoveFromInnerToNorm_p orelse Kicker_p, + CurrentHasZerf_p = has_make_zerf_annotation(P_current), if MoveToNorm_p, P_newprop11#projection_v1.upi == [], CMode == cp_mode -> @@ -1302,11 +1303,11 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, P_newprop12 = machi_projection:update_checksum( P_newprop11#projection_v1{epoch_number=NewEpoch}), react_to_env_C100(P_newprop12, P_newprop11, S); - MoveToNorm_p, CMode == cp_mode -> + MoveToNorm_p, + CMode == cp_mode, + not CurrentHasZerf_p -> %% Too much weird stuff may have hapened while we were suffering %% the flapping/asymmetric partition. - %% The MoveToNorm_p calculation doesn't take all CP mode - %% behavior into account, so finish the job here. %% %% The make_zerf() function will annotate the dbg2 list with %% {make_zerf,Epoch} where Epoch should equal the epoch_number. @@ -1314,17 +1315,11 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, %% clause in a prior iteration, and therefore we should go to A40 %% now. If not annotated, go to A49 so that we *will* trigger a %% make_zerf() on our next iteration. 
- case has_make_zerf_annotation(P_current) of - true -> - ?REACT({a30, ?LINE, []}), - react_to_env_A40(Retries, P_newprop11, P_latest, - LatestUnanimousP, S10); - false -> - ?REACT({a30, ?LINE, []}), - %% Fall back to the none projection as if we're restarting. - react_to_env_A49(P_latest, [], S10) - end; - MoveToNorm_p, CMode == ap_mode -> + + ?REACT({a30, ?LINE, []}), + %% Fall back to the none projection as if we're restarting. + react_to_env_A49(P_latest, [], S10); + MoveToNorm_p -> %% Move from inner projection to outer. P_inner2A = inner_projection_or_self(P_current), ResetEpoch = P_newprop11#projection_v1.epoch_number, From 764708f3efd5288c2bb9e30822238cc6eaa22116 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sun, 30 Aug 2015 00:03:51 +0900 Subject: [PATCH 33/51] Fix private_projections_are_stable() for long CP mode chains --- test/machi_chain_manager1_converge_demo.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index 89eddea..d0ad253 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -642,7 +642,7 @@ private_projections_are_stable(Namez, PollFunc) -> Else -> %% This is bad: we have a count that's less than %% FullMajority but greater than 1. 
- throw({minority_error,Else,EpochIDs,private2,Private2}) + false end; CMode == ap_mode -> true From a7db3a26c6664c52548734b2b37998402e942447 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sun, 30 Aug 2015 00:04:13 +0900 Subject: [PATCH 34/51] Bugfix: a30_make_inner_projection() compatible inner if not none proj --- src/machi_chain_manager1.erl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index e1285b3..290dc8e 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1429,6 +1429,8 @@ a30_make_inner_projection(P_current, P_newprop3, P_latest, Up, %% ({UPI_latest_i, Repairing_latest_i} == %% {UPI_current_x, Repairing_current_x}) %% andalso + UPI_latest_i /= [] % avoid hasty none proj jump + andalso Epoch_latest_i >= P_current_ios#projection_v1.epoch_number, CurrentHasInner_and_LatestIsDisjoint_p = P_current_has_inner_p From 4b83893047b107faa9bcf0071f8e76b78f55479b Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sun, 30 Aug 2015 00:50:03 +0900 Subject: [PATCH 35/51] Bugfix: minor flap count bookkeeping error --- src/machi_chain_manager1.erl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 290dc8e..2e443d4 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2266,8 +2266,7 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, [X || {_FLU, {{_FlEpk,FlTime}, _FlapCount}}=X <- RemoteTransFlapCounts1, FlTime /= ?NOT_FLAPPING], TempNewFlapCount = FlapCount + 1, - TempAllFlapCounts = lists:sort([{MyName, {FlapStart, TempNewFlapCount}}| - RemoteTransFlapCounts]), + TempAllFlapCounts = lists:sort([{MyName, FlapStart}|RemoteTransFlapCounts]), %% Sanity check. 
true = lists:all(fun({_FLU,{_EpkTime,_Count}}) -> true; (_) -> false From 771164b82fcc3976851c1543e4ac81813cd53ea1 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sun, 30 Aug 2015 00:50:23 +0900 Subject: [PATCH 36/51] Bugfix: Flapping manifesto, leaving #2: only if not me --- src/machi_chain_manager1.erl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 2e443d4..a510fd9 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2368,7 +2368,9 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, %% latest proj flapping & flapping last time ?REACT({calculate_flaps,?LINE,[]}), false; - {0=Curtime, 0} when P_latest_LastStartTime /= undefined, + {0=Curtime, 0} when P_latest#projection_v1.author_server + /= MyName, + P_latest_LastStartTime /= undefined, P_latest_LastStartTime /= ?NOT_FLAPPING_START -> ?REACT({calculate_flaps,?LINE, From 0dc53274d1490ff340a707ebbe5840e585f7e3eb Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sun, 30 Aug 2015 02:22:59 +0900 Subject: [PATCH 37/51] Get more aggressive about AllHosed+down nodes for inner proj --- src/machi_chain_manager1.erl | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index a510fd9..7233485 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1369,13 +1369,17 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, a30_make_inner_projection(P_current, P_newprop3, P_latest, Up, #ch_mgr{name=MyName, consistency_mode=CMode} = S) -> AllHosed = get_all_hosed(P_newprop3), + NewPropDown = P_newprop3#projection_v1.down, P_current_has_inner_p = inner_projection_exists(P_current), P_current_ios = inner_projection_or_self(P_current), + AllHosed_and_Down = lists:usort(AllHosed ++ NewPropDown), {P_i1, S_i, _Up} = 
calc_projection2(P_current_ios, - MyName, AllHosed, [], S), + MyName, AllHosed_and_Down, [], S), ?REACT({a30, ?LINE, [{raw_all_hosed,get_all_hosed(P_newprop3)}, {up, Up}, {all_hosed, AllHosed}, + {new_prop_down, NewPropDown}, + {all_hosed_and_down, AllHosed_and_Down}, {p_c_i, machi_projection:make_summary(P_current_ios)}, {p_i1, machi_projection:make_summary(P_i1)}]}), %% The inner projection will have a fake author, which @@ -1489,7 +1493,7 @@ a30_make_inner_projection(P_current, P_newprop3, P_latest, Up, P_newprop3#projection_v1.epoch_number; true -> P_oldinner = inner_projection_or_self(P_current), - ?REACT({a30xyzxyz, ?LINE, [P_oldinner#projection_v1.epoch_number + 1]}), + ?REACT({a30xyzxyz, ?LINE, [{incrementing_based_on,P_oldinner#projection_v1.epoch_number + 1}]}), FinalCreation = P_newprop3#projection_v1.creation_time, P_oldinner#projection_v1.epoch_number + 1 end, @@ -1766,7 +1770,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, (inner_projection_exists(P_latest) orelse inner_projection_exists(P_newprop)) andalso %% I have been flapping for a while - S#ch_mgr.flap_count > 100 andalso + S#ch_mgr.flap_count > 200 andalso %% I'm suspected of being bad lists:member(MyName, P_newprop_AllHosedPlus) andalso %% I'm not in the critical UPI or repairing lists @@ -2226,6 +2230,7 @@ react_to_env_C310(P_newprop, S) -> calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, #ch_mgr{name=MyName, proj_history=H, + consistency_mode=CMode, flap_start=FlapStart, flap_count=FlapCount, flap_last_up=FlapLastUp, flap_last_up_change=LastUpChange0, @@ -2414,6 +2419,8 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, AllHosed = lists:usort(HosedAnnotations ++ Magic), %%io:format(user, "ALLHOSED ~p: ~p ~w\n", [MyName, Magic, HosedAnnotations]), ?REACT({calculate_flaps,?LINE,[{new_flap_count,NewFlapCount}, + {bad_flus,BadFLUs}, + {hosed_t_u_ts,HosedTransUnionTs}, {hosed_annotations,HosedAnnotations}, {magic,Magic}, 
{all_hosed,AllHosed}]}), @@ -2456,8 +2463,22 @@ calculate_flaps(P_newprop, P_latest, _P_current, CurrentUp, _FlapLimit, flap_last_up=CurrentUp, flap_last_up_change=LastUpChange, flap_counts_last=AllFlapCounts, runenv=RunEnv1}, - {machi_projection:update_checksum(P_newprop#projection_v1{ - flap=FlappingI}), + + P_newprop2 = case proplists:get_value(MyName, AllHosed) of + true when CMode == cp_mode -> + %% Experiment: try none proj but keep the epoch #. + ?REACT({calculate_flaps,?LINE,[]}), + P_newprop#projection_v1{ + upi=[], repairing=[], + down=P_newprop#projection_v1.all_members}; + _ -> + ?REACT({calculate_flaps,?LINE,[]}), + P_newprop + end, + ?REACT({calculate_flaps,?LINE,[{zzz_1,P_newprop2#projection_v1.upi}, + {zzz_2,P_newprop2#projection_v1.repairing}, + {zzz_3,catch (P_newprop2#projection_v1.flap)#flap_i.all_hosed}]}), + {machi_projection:update_checksum(P_newprop2#projection_v1{flap=FlappingI}), if AmFlapping_p -> S2; true -> From ec2e7b566910e591b480df2913d919be98b6d559 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sun, 30 Aug 2015 16:08:14 +0900 Subject: [PATCH 38/51] Sunday experiment: all-but-remove A29, feels right but definitely not sure yet --- src/machi_chain_manager1.erl | 47 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 7233485..d4a9a7b 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1160,29 +1160,30 @@ react_to_env_A20(Retries, #ch_mgr{name=MyName}=S) -> react_to_env_A29(Retries, P_latest, LatestUnanimousP, ReadExtra, #ch_mgr{name=MyName, consistency_mode=CMode, proj=P_current} = S) -> - #projection_v1{epoch_number=Epoch_latest, - author_server=Author_latest} = P_latest, - if CMode == cp_mode, - Epoch_latest > P_current#projection_v1.epoch_number, - Author_latest /= MyName -> - put(yyy_hack, []), - case make_zerf(P_current, S) of - Zerf when is_record(Zerf, projection_v1) -> - 
?REACT({a29, ?LINE, - [{zerf_filler, true}, - {zerf_in, machi_projection:make_summary(Zerf)}]}), - %% io:format(user, "zerf_in: A29: ~p: ~w\n\t~p\n", [MyName, machi_projection:make_summary(Zerf), get(yyy_hack)]), - P_current2 = Zerf#projection_v1{ - flap=P_current#projection_v1.flap}, - S2 = set_proj(S, P_current2), - react_to_env_A30(Retries, P_latest, LatestUnanimousP, - ReadExtra, S2); - Zerf -> - {{{yo_todo_incomplete_fix_me_cp_mode, line, ?LINE, Zerf}}} - end; - true -> - react_to_env_A30(Retries, P_latest, LatestUnanimousP, ReadExtra, S) - end. + react_to_env_A30(Retries, P_latest, LatestUnanimousP, ReadExtra, S). + %% #projection_v1{epoch_number=Epoch_latest, + %% author_server=Author_latest} = P_latest, + %% if CMode == cp_mode, + %% Epoch_latest > P_current#projection_v1.epoch_number, + %% Author_latest /= MyName -> + %% put(yyy_hack, []), + %% case make_zerf(P_current, S) of + %% Zerf when is_record(Zerf, projection_v1) -> + %% ?REACT({a29, ?LINE, + %% [{zerf_filler, true}, + %% {zerf_in, machi_projection:make_summary(Zerf)}]}), + %% %% io:format(user, "zerf_in: A29: ~p: ~w\n\t~p\n", [MyName, machi_projection:make_summary(Zerf), get(yyy_hack)]), + %% P_current2 = Zerf#projection_v1{ + %% flap=P_current#projection_v1.flap}, + %% S2 = set_proj(S, P_current2), + %% react_to_env_A30(Retries, P_latest, LatestUnanimousP, + %% ReadExtra, S2); + %% Zerf -> + %% {{{yo_todo_incomplete_fix_me_cp_mode, line, ?LINE, Zerf}}} + %% end; + %% true -> + %% react_to_env_A30(Retries, P_latest, LatestUnanimousP, ReadExtra, S) + %% end. 
react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, #ch_mgr{name=MyName, proj=P_current, From 823b47bef349ef86d0f783138dfecdda5dcb6665 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sun, 30 Aug 2015 19:52:31 +0900 Subject: [PATCH 39/51] Bugfix: convergence property for CP mode, again --- test/machi_chain_manager1_converge_demo.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index d0ad253..f3f54c2 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -635,10 +635,10 @@ private_projections_are_stable(Namez, PollFunc) -> ordsets:from_list(UsingFLUs)); [{1=_Count,_EpochID}|_] -> %% Our list is sorted & reversed, so 1=_Count - %% is biggest. If everyone is using the none proj, + %% is biggest. If a majority is using the none proj, %% then we're OK. Private2None = [X || {_,{_,[],[],_,_,_}}=X <- Private2], - Private2 == Private2None; + length(Private2None) >= FullMajority; Else -> %% This is bad: we have a count that's less than %% FullMajority but greater than 1. From a449025e8b8c2fe49a8948ce7743624d7fe4404a Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sun, 30 Aug 2015 19:53:47 +0900 Subject: [PATCH 40/51] Bugfix: epoch handling around none proj: epoch 0 only at first bootstrap! 
--- src/machi_chain_manager1.erl | 43 +++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index d4a9a7b..7132eac 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -235,7 +235,7 @@ init({MyName, InitMembersDict, MgrOpts}) -> Opt = fun(Key, Default) -> proplists:get_value(Key, MgrOpts, Default) end, InitWitness_list = Opt(witnesses, []), ZeroAll_list = [P#p_srvr.name || {_,P} <- orddict:to_list(InitMembersDict)], - ZeroProj = make_none_projection(MyName, ZeroAll_list, + ZeroProj = make_none_projection(0, MyName, ZeroAll_list, InitWitness_list, InitMembersDict), ok = store_zeroth_projection_maybe(ZeroProj, MgrOpts), CMode = Opt(consistency_mode, ap_mode), @@ -262,9 +262,8 @@ init({MyName, InitMembersDict, MgrOpts}) -> get_my_private_proj_boot_info(MgrOpts, InitMembersDict, ZeroProj), #projection_v1{epoch_number=CurrentEpoch, all_members=All_list, witnesses=Witness_list} = Proj0, - Proj1 = make_none_projection(MyName, All_list, Witness_list, MembersDict), - Proj = machi_projection:update_checksum( - Proj1#projection_v1{epoch_number=CurrentEpoch}), + Proj = make_none_projection(CurrentEpoch, + MyName, All_list, Witness_list, MembersDict), RunEnv = [{seed, Opt(seed, now())}, {use_partition_simulator, Opt(use_partition_simulator, false)}, @@ -407,7 +406,7 @@ code_change(_OldVsn, S, _Extra) -> %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -make_none_projection(MyName, All_list, Witness_list, MembersDict) -> +make_none_projection(Epoch, MyName, All_list, Witness_list, MembersDict) -> Down_list = All_list, UPI_list = [], P = machi_projection:new(MyName, MembersDict, Down_list, UPI_list, [], []), @@ -416,7 +415,8 @@ make_none_projection(MyName, All_list, Witness_list, MembersDict) -> Witness_list /= [] -> cp_mode end, - machi_projection:update_checksum(P#projection_v1{mode=CMode, + 
machi_projection:update_checksum(P#projection_v1{epoch_number=Epoch, + mode=CMode, witnesses=Witness_list}). make_all_projection(MyName, All_list, Witness_list, MembersDict) -> @@ -587,7 +587,7 @@ rank_and_sort_projections_with_extra(All_queried_list, FLUsRs, ProjectionType, orelse length(UnwrittenRs) == length(FLUsRs) -> Witness_list = CurrentProj#projection_v1.witnesses, - NoneProj = make_none_projection(MyName, [], Witness_list, + NoneProj = make_none_projection(0, MyName, [], Witness_list, orddict:new()), Extra2 = [{all_members_replied, true}, {all_queried_list, All_queried_list}, @@ -708,7 +708,8 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg, NewUp = Up -- LastUp, Down = AllMembers -- Up, - ?REACT({calc,?LINE,[{old_upi, OldUPI_list}, + ?REACT({calc,?LINE,[{old_epoch,OldEpochNum}, + {old_upi, OldUPI_list}, {old_repairing,OldRepairing_list}, {last_up, LastUp}, {up0, Up0}, {all_hosed, AllHosed}, {up, Up}, {new_up, NewUp}, {down, Down}]}), @@ -818,6 +819,7 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg, true -> ?REACT({calc,?LINE,[]}), P_none0 = make_none_projection( + OldEpochNum + 1, MyName, AllMembers, OldWitness_list, MembersDict), Why = if NewUPI == [] -> @@ -826,7 +828,6 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg, "Not enough witnesses are available now" end, P_none1 = P_none0#projection_v1{ - epoch_number=OldEpochNum + 1, %% Stable creation time! creation_time={1,2,3}, dbg=[{none_projection,true}, @@ -1707,7 +1708,8 @@ react_to_env_A49(_P_latest, FinalProps, #ch_mgr{name=MyName, #projection_v1{all_members=All_list, witnesses=Witness_list, members_dict=MembersDict} = P_current, - P_none = make_none_projection(MyName, All_list, Witness_list, + #projection_v1{epoch_number=EpochCurrent} = P_current, + P_none = make_none_projection(EpochCurrent, MyName, All_list, Witness_list, MembersDict), react_to_env_A50(P_none, FinalProps, set_proj(S, P_none)). 
@@ -1716,6 +1718,7 @@ react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) -> ?REACT({a50, ?LINE, [{current_epoch, P_current#projection_v1.epoch_number}, {latest_epoch, P_latest#projection_v1.epoch_number}, {final_props, FinalProps}]}), + %% if S#ch_mgr.name == c -> io:format(user, "A50: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, {{no_change, FinalProps, P_current#projection_v1.epoch_number}, S}. react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, @@ -2036,9 +2039,9 @@ react_to_env_C103(#projection_v1{epoch_number=_Epoch_newprop} = _P_newprop, #ch_mgr{name=MyName, proj=P_current}=S) -> #projection_v1{witnesses=Witness_list, members_dict=MembersDict} = P_current, - P_none0 = make_none_projection(MyName, All_list, Witness_list, MembersDict), - P_none1 = P_none0#projection_v1{epoch_number=Epoch_latest, - flap=Flap, + P_none0 = make_none_projection(Epoch_latest, + MyName, All_list, Witness_list, MembersDict), + P_none1 = P_none0#projection_v1{flap=Flap, dbg=[{none_projection,true}]}, P_none = machi_projection:update_checksum(P_none1), %% Use it, darn it, because it's 100% safe. And exit flapping state. @@ -2616,7 +2619,7 @@ projection_transition_is_sane_final_review( {wtf, cmode1, CMode1, cmode2, CMode2}; projection_transition_is_sane_final_review( #projection_v1{mode=cp_mode, upi=UPI1}=_P1, - #projection_v1{mode=cp_mode, upi=UPI2, witnesses=Witness_list, dbg=Dbg2}=_P2, + #projection_v1{mode=cp_mode, upi=UPI2, witnesses=Witness_list, dbg=Dbg}=_P2, true) -> %% All earlier sanity checks has said that this transition is sane, but %% we also need to make certain that any CP mode transition preserves at @@ -2624,7 +2627,11 @@ projection_transition_is_sane_final_review( %% verified that the ordering of the FLUs within the UPI list is ok. 
UPI1_s = ordsets:from_list(UPI1 -- Witness_list), UPI2_s = ordsets:from_list(UPI2 -- Witness_list), - case proplists:get_value(zerf_backstop, Dbg2) of + catch ?REACT({projection_transition_is_sane_final_review, ?LINE, + [{upi1,UPI1}, {upi2,UPI2}, {witnesses,Witness_list}, + {zerf_backstop, proplists:get_value(zerf_backstop, Dbg)}, + {upi1_s,UPI1}, {upi2_s,UPI2}]}), + case proplists:get_value(zerf_backstop, Dbg) of true when UPI1 == [] -> ?RETURN2(true); _ when UPI2 == [] -> @@ -3316,11 +3323,11 @@ make_zerf(#projection_v1{epoch_number=OldEpochNum, %% zerf_find_last_common() can confirm a common stable %% last stable epoch. - P = make_none_projection(MyName, AllMembers, OldWitness_list, + P = make_none_projection(OldEpochNum, + MyName, AllMembers, OldWitness_list, MembersDict), machi_projection:update_checksum( - P#projection_v1{epoch_number=OldEpochNum, - mode=cp_mode, + P#projection_v1{mode=cp_mode, flap=OldFlap, dbg2=[zerf_none,{up,Up},{maj,MajoritySize}]}); true -> From 004c686c8cf632ef3b220ca367071c176bfe9109 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sun, 30 Aug 2015 20:39:58 +0900 Subject: [PATCH 41/51] WIP: remove make_zerf() from calc_projection(); add make_zerf() to resurrected A29. Status: broken, needs work --- src/machi_chain_manager1.erl | 99 +++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 46 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 7132eac..929f00e 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -306,8 +306,12 @@ handle_call({set_chain_members, MembersDict, Witness_list}, _From, %% config. 
All_list = [P#p_srvr.name || {_, P} <- orddict:to_list(MembersDict)], MissingInNew = OldAll_list -- All_list, - NewUPI = OldUPI -- MissingInNew, - NewDown = All_list -- NewUPI, + {NewUPI, NewDown} = if OldEpoch == 0 -> + {All_list, []}; + true -> + NUPI = OldUPI -- MissingInNew, + {NUPI, All_list -- NUPI} + end, NewEpoch = OldEpoch + ?SET_CHAIN_MEMBERS_EPOCH_SKIP, CMode = calc_consistency_mode(Witness_list), ok = set_consistency_mode(machi_flu_psup:make_proj_supname(MyName), CMode), @@ -648,39 +652,34 @@ do_read_repair(FLUsRs, _Extra, #ch_mgr{proj=CurrentProj} = S) -> calc_projection(S, RelativeToServer) -> calc_projection(S, RelativeToServer, []). -calc_projection(#ch_mgr{proj=LastProj, consistency_mode=CMode} = S, +calc_projection(#ch_mgr{name=MyName, proj=P_current, consistency_mode=CMode, + runenv=RunEnv}=S, RelativeToServer, AllHosed) -> Dbg = [], %% OldThreshold = proplists:get_value(old_threshold, RunEnv), %% NoPartitionThreshold = proplists:get_value(no_partition_threshold, RunEnv), if CMode == ap_mode -> - calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg, S); + calc_projection2(P_current, RelativeToServer, AllHosed, Dbg, S); CMode == cp_mode -> #projection_v1{epoch_number=OldEpochNum, all_members=AllMembers, upi=OldUPI_list - } = LastProj, + } = P_current, UPI_length_ok_p = length(OldUPI_list) >= full_majority_size(AllMembers), case {OldEpochNum, UPI_length_ok_p} of {0, _} -> - calc_projection2(LastProj, RelativeToServer, AllHosed, + calc_projection2(P_current, RelativeToServer, AllHosed, Dbg, S); {_, true} -> - calc_projection2(LastProj, RelativeToServer, AllHosed, + calc_projection2(P_current, RelativeToServer, AllHosed, Dbg, S); {_, false} -> - case make_zerf(LastProj, S) of - Zerf when is_record(Zerf, projection_v1) -> - ?REACT({calc,?LINE, - [{zerf_backstop, true}, - {zerf_in, machi_projection:make_summary(Zerf)}]}), - %% io:format(user, "zerf_in: ~p: ~w\n", [S#ch_mgr.name, machi_projection:make_summary(Zerf)]), - calc_projection2(Zerf, 
RelativeToServer, AllHosed, - [{zerf_backstop, true}]++Dbg, S); - Zerf -> - {{{yo_todo_incomplete_fix_me_cp_mode, OldEpochNum, OldUPI_list, Zerf}}} - end + io:format(user, "KEEP ~w current ~w ~w ~w\n", [MyName, P_current#projection_v1.epoch_number, P_current#projection_v1.upi, P_current#projection_v1.repairing]), + {Up, Partitions, RunEnv2} = calc_up_nodes( + MyName, AllMembers, RunEnv), + %% We can't improve on the current projection. + {P_current, S#ch_mgr{runenv=RunEnv2}, Up} end end. @@ -1161,30 +1160,38 @@ react_to_env_A20(Retries, #ch_mgr{name=MyName}=S) -> react_to_env_A29(Retries, P_latest, LatestUnanimousP, ReadExtra, #ch_mgr{name=MyName, consistency_mode=CMode, proj=P_current} = S) -> - react_to_env_A30(Retries, P_latest, LatestUnanimousP, ReadExtra, S). - %% #projection_v1{epoch_number=Epoch_latest, - %% author_server=Author_latest} = P_latest, - %% if CMode == cp_mode, - %% Epoch_latest > P_current#projection_v1.epoch_number, - %% Author_latest /= MyName -> - %% put(yyy_hack, []), - %% case make_zerf(P_current, S) of - %% Zerf when is_record(Zerf, projection_v1) -> - %% ?REACT({a29, ?LINE, - %% [{zerf_filler, true}, - %% {zerf_in, machi_projection:make_summary(Zerf)}]}), - %% %% io:format(user, "zerf_in: A29: ~p: ~w\n\t~p\n", [MyName, machi_projection:make_summary(Zerf), get(yyy_hack)]), - %% P_current2 = Zerf#projection_v1{ - %% flap=P_current#projection_v1.flap}, - %% S2 = set_proj(S, P_current2), - %% react_to_env_A30(Retries, P_latest, LatestUnanimousP, - %% ReadExtra, S2); - %% Zerf -> - %% {{{yo_todo_incomplete_fix_me_cp_mode, line, ?LINE, Zerf}}} - %% end; - %% true -> - %% react_to_env_A30(Retries, P_latest, LatestUnanimousP, ReadExtra, S) - %% end. 
+ #projection_v1{epoch_number=Epoch_latest, + author_server=Author_latest} = P_latest, + Trigger = if CMode == cp_mode, + Epoch_latest > P_current#projection_v1.epoch_number, + Author_latest /= MyName -> + true; + P_current#projection_v1.upi == [] -> + true; + true -> + false + end, + if Trigger -> + put(yyy_hack, []), + case make_zerf(P_current, S) of + Zerf when is_record(Zerf, projection_v1) -> + ?REACT({a29, ?LINE, + [{zerf_backstop, true}, + {zerf_in, machi_projection:make_summary(Zerf)}]}), + %% io:format(user, "zerf_in: A29: ~p: ~w\n\t~p\n", [MyName, machi_projection:make_summary(Zerf), get(yyy_hack)]), + #projection_v1{dbg=ZerfDbg} = Zerf, + P_current2 = Zerf#projection_v1{ + flap=P_current#projection_v1.flap, + dbg=[{zerf_backstop,true}|ZerfDbg]}, + S2 = set_proj(S, P_current2), + react_to_env_A30(Retries, P_latest, LatestUnanimousP, + ReadExtra, S2); + Zerf -> + {{{yo_todo_incomplete_fix_me_cp_mode, line, ?LINE, Zerf}}} + end; + true -> + react_to_env_A30(Retries, P_latest, LatestUnanimousP, ReadExtra, S) + end. 
react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, #ch_mgr{name=MyName, proj=P_current, @@ -2618,8 +2625,8 @@ projection_transition_is_sane_final_review( _) when CMode1 /= CMode2 -> {wtf, cmode1, CMode1, cmode2, CMode2}; projection_transition_is_sane_final_review( - #projection_v1{mode=cp_mode, upi=UPI1}=_P1, - #projection_v1{mode=cp_mode, upi=UPI2, witnesses=Witness_list, dbg=Dbg}=_P2, + #projection_v1{mode=cp_mode, upi=UPI1, dbg=P1_dbg}=_P1, + #projection_v1{mode=cp_mode, upi=UPI2, witnesses=Witness_list}=_P2, true) -> %% All earlier sanity checks has said that this transition is sane, but %% we also need to make certain that any CP mode transition preserves at @@ -2629,9 +2636,9 @@ projection_transition_is_sane_final_review( UPI2_s = ordsets:from_list(UPI2 -- Witness_list), catch ?REACT({projection_transition_is_sane_final_review, ?LINE, [{upi1,UPI1}, {upi2,UPI2}, {witnesses,Witness_list}, - {zerf_backstop, proplists:get_value(zerf_backstop, Dbg)}, + {zerf_backstop, proplists:get_value(zerf_backstop, P1_dbg)}, {upi1_s,UPI1}, {upi2_s,UPI2}]}), - case proplists:get_value(zerf_backstop, Dbg) of + case proplists:get_value(zerf_backstop, P1_dbg) of true when UPI1 == [] -> ?RETURN2(true); _ when UPI2 == [] -> From 5422dc45c22dc69a64536cacb73455179881c4a1 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Mon, 31 Aug 2015 14:44:05 +0900 Subject: [PATCH 42/51] Bugfix: derp in A29 revival --- src/machi_chain_manager1.erl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 929f00e..c01280d 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1166,8 +1166,6 @@ react_to_env_A29(Retries, P_latest, LatestUnanimousP, ReadExtra, Epoch_latest > P_current#projection_v1.epoch_number, Author_latest /= MyName -> true; - P_current#projection_v1.upi == [] -> - true; true -> false end, From c637939cc223040b0348d75fcb1d554bdcb9d9ed Mon Sep 17 00:00:00 2001 From: Scott 
Lystig Fritchie Date: Mon, 31 Aug 2015 15:21:17 +0900 Subject: [PATCH 43/51] Bugfix: A29 should trigger if EpochID (not Epoch# alone) differs --- src/machi_chain_manager1.erl | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index c01280d..37a830b 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -675,7 +675,6 @@ calc_projection(#ch_mgr{name=MyName, proj=P_current, consistency_mode=CMode, calc_projection2(P_current, RelativeToServer, AllHosed, Dbg, S); {_, false} -> - io:format(user, "KEEP ~w current ~w ~w ~w\n", [MyName, P_current#projection_v1.epoch_number, P_current#projection_v1.upi, P_current#projection_v1.repairing]), {Up, Partitions, RunEnv2} = calc_up_nodes( MyName, AllMembers, RunEnv), %% We can't improve on the current projection. @@ -1160,11 +1159,12 @@ react_to_env_A20(Retries, #ch_mgr{name=MyName}=S) -> react_to_env_A29(Retries, P_latest, LatestUnanimousP, ReadExtra, #ch_mgr{name=MyName, consistency_mode=CMode, proj=P_current} = S) -> - #projection_v1{epoch_number=Epoch_latest, - author_server=Author_latest} = P_latest, - Trigger = if CMode == cp_mode, - Epoch_latest > P_current#projection_v1.epoch_number, - Author_latest /= MyName -> + {Epoch_current,_} = EpochID_current = + machi_projection:get_epoch_id(P_current), + #projection_v1{author_server=Author_latest} = P_latest, + {Epoch_latest,_} = EpochID_latest = machi_projection:get_epoch_id(P_latest), + true = (Epoch_latest >= Epoch_current orelse Epoch_latest == 0), % sanity check + Trigger = if CMode == cp_mode, EpochID_latest /= EpochID_current -> true; true -> false @@ -1724,6 +1724,8 @@ react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) -> {latest_epoch, P_latest#projection_v1.epoch_number}, {final_props, FinalProps}]}), %% if S#ch_mgr.name == c -> io:format(user, "A50: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, + %% V = case 
file:read_file("/tmp/moomoo") of {ok,_} -> true; _ -> false end, + %% if V, S#ch_mgr.name == c -> io:format("C110: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, {{no_change, FinalProps, P_current#projection_v1.epoch_number}, S}. react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, @@ -3474,6 +3476,8 @@ diversion_c120_verbose_goop2(P_latest0, S) -> end. perhaps_verbose_c110(P_latest2, S) -> + %% V = case file:read_file("/tmp/moomoo") of {ok,_} -> true; _ -> false end, + %% if V, S#ch_mgr.name == c -> io:format("C110: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of true -> {_,_,C} = os:timestamp(), From a095e0cfc3049545f53586431bac7bd40efbca89 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Mon, 31 Aug 2015 15:40:19 +0900 Subject: [PATCH 44/51] Bugfix: ignore creation_time in make_comparison_stable() --- src/machi_chain_manager1.erl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 37a830b..59a7ba0 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1572,9 +1572,9 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, orelse P_latestStable /= P_currentStable) -> ?REACT({a40, ?LINE, - [{latest_epoch, P_latest#projection_v1.epoch_number}, - {current_epoch, P_current#projection_v1.epoch_number}, - {neq, P_latest /= P_current}]}), + [{latest, P_latestStable}, + {current, P_currentStable}, + {neq, P_latestStable /= P_currentStable}]}), %% Both of these cases are rare. Elsewhere, the code %% assumes that the local FLU's projection store is always @@ -3566,7 +3566,7 @@ is_annotated(#projection_v1{dbg2=Dbg2}) -> proplists:get_value(private_proj_is_upi_unanimous, Dbg2, false). make_comparison_stable(P) -> - P#projection_v1{flap=undefined, dbg2=[]}. + P#projection_v1{creation_time=undefined, flap=undefined, dbg2=[]}. 
has_make_zerf_annotation(P) -> case proplists:get_value(make_zerf, P#projection_v1.dbg2) of From bce225a20094284670caa231e66d3a55ebc0ccb8 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Mon, 31 Aug 2015 17:03:12 +0900 Subject: [PATCH 45/51] Bugfix: a30_make_inner_projection() ignore newprop down list if none proj --- src/machi_chain_manager1.erl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 59a7ba0..3efddef 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1376,7 +1376,12 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, a30_make_inner_projection(P_current, P_newprop3, P_latest, Up, #ch_mgr{name=MyName, consistency_mode=CMode} = S) -> AllHosed = get_all_hosed(P_newprop3), - NewPropDown = P_newprop3#projection_v1.down, + NewPropDown = if P_newprop3#projection_v1.upi == [] -> + %% This is a none proj, don't believe down list + []; + true -> + P_newprop3#projection_v1.down + end, P_current_has_inner_p = inner_projection_exists(P_current), P_current_ios = inner_projection_or_self(P_current), AllHosed_and_Down = lists:usort(AllHosed ++ NewPropDown), From 1e5d58b22df8bdaa78696acb8066d710f0ed980d Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Mon, 31 Aug 2015 17:57:37 +0900 Subject: [PATCH 46/51] Bugfix: more to ignore in make_basic_comparison_stable() --- src/machi_chain_manager1.erl | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 3efddef..916b3d1 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1163,13 +1163,28 @@ react_to_env_A29(Retries, P_latest, LatestUnanimousP, ReadExtra, machi_projection:get_epoch_id(P_current), #projection_v1{author_server=Author_latest} = P_latest, {Epoch_latest,_} = EpochID_latest = machi_projection:get_epoch_id(P_latest), - true = 
(Epoch_latest >= Epoch_current orelse Epoch_latest == 0), % sanity check Trigger = if CMode == cp_mode, EpochID_latest /= EpochID_current -> + ?REACT({a29, ?LINE, + [{epoch_id_latest,EpochID_latest}, + {epoch_id_current,EpochID_current}]}), true; true -> + ?REACT({a29, ?LINE, []}), false end, if Trigger -> + ?REACT({a29, ?LINE, + [{old_current, machi_projection:make_summary(P_current)}]}), + if Epoch_latest >= Epoch_current orelse Epoch_latest == 0 orelse + P_current#projection_v1.upi == [] -> + ok; % sanity check + true -> + exit({?MODULE,?LINE, + {epoch_latest,Epoch_latest}, + {epoch_current,Epoch_current}, + {latest,machi_projection:make_summary(P_latest)}, + {current,machi_projection:make_summary(P_current)}}) + end, put(yyy_hack, []), case make_zerf(P_current, S) of Zerf when is_record(Zerf, projection_v1) -> @@ -1188,6 +1203,7 @@ react_to_env_A29(Retries, P_latest, LatestUnanimousP, ReadExtra, {{{yo_todo_incomplete_fix_me_cp_mode, line, ?LINE, Zerf}}} end; true -> + ?REACT({a29, ?LINE, []}), react_to_env_A30(Retries, P_latest, LatestUnanimousP, ReadExtra, S) end. @@ -1546,8 +1562,8 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, LatestAuthorDownP = a40_latest_author_down(P_latest, P_newprop, S) andalso P_latest#projection_v1.author_server /= MyName, - P_latestStable = make_comparison_stable(P_latest), - P_currentStable = make_comparison_stable(P_current), + P_latestStable = make_basic_comparison_stable(P_latest), + P_currentStable = make_basic_comparison_stable(P_current), ?REACT({a40, ?LINE, [{latest_author, P_latest#projection_v1.author_server}, {author_is_down_p, LatestAuthorDownP}, @@ -3570,8 +3586,12 @@ make_annotation(EpochID, Time) -> is_annotated(#projection_v1{dbg2=Dbg2}) -> proplists:get_value(private_proj_is_upi_unanimous, Dbg2, false). -make_comparison_stable(P) -> - P#projection_v1{creation_time=undefined, flap=undefined, dbg2=[]}. 
+make_basic_comparison_stable(P) -> + P#projection_v1{creation_time=undefined, + flap=undefined, + dbg=[], + dbg2=[], + members_dict=[]}. has_make_zerf_annotation(P) -> case proplists:get_value(make_zerf, P#projection_v1.dbg2) of From e79265228ee969543bf4f633cf737d1fba6b9d50 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Mon, 31 Aug 2015 22:14:28 +0900 Subject: [PATCH 47/51] Bugfix: more correct for inner->outer sanity transition --- src/machi_chain_manager1.erl | 59 ++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 916b3d1..a95a673 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1164,17 +1164,15 @@ react_to_env_A29(Retries, P_latest, LatestUnanimousP, ReadExtra, #projection_v1{author_server=Author_latest} = P_latest, {Epoch_latest,_} = EpochID_latest = machi_projection:get_epoch_id(P_latest), Trigger = if CMode == cp_mode, EpochID_latest /= EpochID_current -> - ?REACT({a29, ?LINE, - [{epoch_id_latest,EpochID_latest}, - {epoch_id_current,EpochID_current}]}), true; true -> - ?REACT({a29, ?LINE, []}), false end, if Trigger -> ?REACT({a29, ?LINE, - [{old_current, machi_projection:make_summary(P_current)}]}), + [{epoch_id_latest,EpochID_latest}, + {epoch_id_current,EpochID_current}, + {old_current, machi_projection:make_summary(P_current)}]}), if Epoch_latest >= Epoch_current orelse Epoch_latest == 0 orelse P_current#projection_v1.upi == [] -> ok; % sanity check @@ -2572,14 +2570,14 @@ projection_transition_is_sane(P1, P2, RelativeToServer) -> projection_transition_is_sane(P1, P2, RelativeToServer, RetrospectiveP) -> put(myname, RelativeToServer), put(why2, []), + HasInner1 = inner_projection_exists(P1), + HasInner2 = inner_projection_exists(P2), + Inner1 = inner_projection_or_self(P1), + Inner2 = inner_projection_or_self(P2), case projection_transition_is_sane_with_si_epoch( P1, P2, RelativeToServer, 
RetrospectiveP) of true -> - HasInner1 = inner_projection_exists(P1), - HasInner2 = inner_projection_exists(P2), if HasInner1 orelse HasInner2 -> - Inner1 = inner_projection_or_self(P1), - Inner2 = inner_projection_or_self(P2), if HasInner1 orelse HasInner2 -> %% In case of transition with inner projections, we %% must allow the epoch number to remain constant. @@ -2590,6 +2588,7 @@ projection_transition_is_sane(P1, P2, RelativeToServer, RetrospectiveP) -> projection_transition_is_sane_except_si_epoch( Inner1, Inner2, RelativeToServer, RetrospectiveP))); true -> + exit(delete_this_inner_clause_impossible_with_two_identical_nested_if_clauses), ?RETURN2( projection_transition_is_sane_final_review(P1, P2, projection_transition_is_sane_with_si_epoch( @@ -2600,7 +2599,32 @@ projection_transition_is_sane(P1, P2, RelativeToServer, RetrospectiveP) -> ?RETURN2(true)) end; Else -> - ?RETURN2(Else) + if HasInner1 and (not HasInner2) -> + %% OK, imagine that we used to be flapping but now we've + %% stopped flapping. + %% + %% P1 = outer = upi=[a,d,e],repairing=[] epoch 870 + %% inner = upi=[a,e,d],repairing=[] epoch 605 + %% to + %% P2 = outer = upi=[a,e,d],repairing=[] epoch 875 + %% inner = undefined + %% + %% Everyone is using the inner projection [a,e,d],[], + %% everyone thinks that that is OK. It has been in use + %% for a while now. + %% + %% Now there's a new epoch, e875 that is saying that we + %% should transition from inner e605 [a,e,d],[] -> outer + %% e875 [a,e,d],[] This is SAFE! The UPI is the *same*. + %% + %% Verify this Inner1->P2 transition, including SI epoch + ?RETURN2( + projection_transition_is_sane_final_review(P1, P2, + projection_transition_is_sane_with_si_epoch( + Inner1, P2, RelativeToServer, RetrospectiveP))); + true -> + ?RETURN2(Else) + end end. projection_transition_is_sane_final_review( @@ -2774,6 +2798,21 @@ projection_transition_is_sane_except_si_epoch( %% We won't check the checksum of P1, but we will of P2. 
P2 = machi_projection:update_checksum(P2), + %% CP mode extra sanity checks + if CMode1 == cp_mode -> + Majority = full_majority_size(All_list2), + if length(UPI_list2) == 0 -> + ok; % none projection + length(UPI_list2) >= Majority -> + %% We have at least one non-witness + true = (length(UPI_list2 -- Witness_list2) > 0); + true -> + error({majority_not_met, UPI_list2}) + end; + CMode1 == ap_mode -> + ok + end, + %% Hooray, all basic properties of the projection's elements are %% not obviously bad. Now let's check if the UPI+Repairing->UPI %% transition is good. From 2e2f5f44c4a83793f93ac0374d0220f501d420f1 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 1 Sep 2015 00:51:12 +0900 Subject: [PATCH 48/51] Another tweak to private_projections_are_stable() --- test/machi_chain_manager1_converge_demo.erl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index f3f54c2..ccdaad6 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -552,9 +552,14 @@ todo_why_does_this_crash_sometimes(FLUName, FLU, PPPepoch) -> end. private_projections_are_stable(Namez, PollFunc) -> - Private1 = [{Name, get_latest_inner_proj_summ(FLU)} || {Name,FLU} <- Namez], + FilterNoneProj = fun({_EpochID,[],[],_Dn,_W,InnerP}) -> false; + (_) -> true + end, + Private1x = [{Name, get_latest_inner_proj_summ(FLU)} || {Name,FLU} <- Namez], + Private1 = [X || X={_,Proj} <- Private1x, FilterNoneProj(Proj)], [PollFunc(15, 1, 10) || _ <- lists:seq(1,6)], - Private2 = [{Name, get_latest_inner_proj_summ(FLU)} || {Name,FLU} <- Namez], + Private2x = [{Name, get_latest_inner_proj_summ(FLU)} || {Name,FLU} <- Namez], + Private2 = [X || X={_,Proj} <- Private2x, FilterNoneProj(Proj)], %% Is = [Inner_p || {_,_,_,_,Inner_p} <- Private1], put(stable, lists:sort(Private1)), %% We want either all true or all false (inner or not) ... 
except From 4378ef7b542d3cc0ad7ce8371af2b7f572f17031 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 1 Sep 2015 00:51:46 +0900 Subject: [PATCH 49/51] Bugfix: inner->outer proj @ A30 --- src/machi_chain_manager1.erl | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index a95a673..0653e7a 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1323,7 +1323,29 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, %% of what state C103 does? Go to C103 instead? P_newprop12 = machi_projection:update_checksum( P_newprop11#projection_v1{epoch_number=NewEpoch}), - react_to_env_C100(P_newprop12, P_newprop11, S); + + %% Move to C300 to avoid repeating the same none proj (and + %% multiple writes to the same private epoch that + %% concidentally are permitted because the projection is + %% exactly the same) + %% + %% The other problem in this execution is that there are a + %% couple of other parties that are not flapping because + %% they see this A30->C100 problem & repeat is + %% short-circuiting all of the flapping logic. If I + %% change A30->C100 to be A30->C300 instead, then I hope + %% that other effect will resolve itself correctly. 
+ + if P_latest#projection_v1.author_server == MyName, + P_latest#projection_v1.upi == [] -> + ?REACT({a30, ?LINE, []}), + io:format(user, "CONFIRM debug A30->C100 by ~w\n",[MyName]), + react_to_env_C100(P_newprop12, P_latest, S); + true -> + ?REACT({a30, ?LINE, []}), + io:format(user, "CONFIRM debug A30->C300 by ~w\n",[MyName]), + react_to_env_C300(P_newprop12, P_latest, S) + end; MoveToNorm_p, CMode == cp_mode, not CurrentHasZerf_p -> From 3c1026da28c14aac271a842d533f3924d92ac9ec Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 1 Sep 2015 22:10:45 +0900 Subject: [PATCH 50/51] WIP: too tired to continue tonight --- src/machi_chain_manager1.erl | 199 ++++++++++++++++++++++++----------- 1 file changed, 135 insertions(+), 64 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 0653e7a..9ad81b4 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -652,9 +652,12 @@ do_read_repair(FLUsRs, _Extra, #ch_mgr{proj=CurrentProj} = S) -> calc_projection(S, RelativeToServer) -> calc_projection(S, RelativeToServer, []). -calc_projection(#ch_mgr{name=MyName, proj=P_current, consistency_mode=CMode, +calc_projection(#ch_mgr{proj=P_current}=S, RelativeToServer, AllHosed) -> + calc_projection(S, RelativeToServer, AllHosed, P_current). 
+ +calc_projection(#ch_mgr{name=MyName, consistency_mode=CMode, runenv=RunEnv}=S, - RelativeToServer, AllHosed) -> + RelativeToServer, AllHosed, P_current) -> Dbg = [], %% OldThreshold = proplists:get_value(old_threshold, RunEnv), %% NoPartitionThreshold = proplists:get_value(no_partition_threshold, RunEnv), @@ -1191,33 +1194,34 @@ react_to_env_A29(Retries, P_latest, LatestUnanimousP, ReadExtra, {zerf_in, machi_projection:make_summary(Zerf)}]}), %% io:format(user, "zerf_in: A29: ~p: ~w\n\t~p\n", [MyName, machi_projection:make_summary(Zerf), get(yyy_hack)]), #projection_v1{dbg=ZerfDbg} = Zerf, - P_current2 = Zerf#projection_v1{ - flap=P_current#projection_v1.flap, - dbg=[{zerf_backstop,true}|ZerfDbg]}, - S2 = set_proj(S, P_current2), + P_current_calc = Zerf#projection_v1{ + flap=P_current#projection_v1.flap, + dbg=[{zerf_backstop,true}|ZerfDbg]}, react_to_env_A30(Retries, P_latest, LatestUnanimousP, - ReadExtra, S2); + P_current_calc, S); Zerf -> {{{yo_todo_incomplete_fix_me_cp_mode, line, ?LINE, Zerf}}} end; true -> ?REACT({a29, ?LINE, []}), - react_to_env_A30(Retries, P_latest, LatestUnanimousP, ReadExtra, S) + react_to_env_A30(Retries, P_latest, LatestUnanimousP, P_current, S) end. -react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, +react_to_env_A30(Retries, P_latest, LatestUnanimousP, P_current_calc, #ch_mgr{name=MyName, proj=P_current, consistency_mode=CMode, flap_limit=FlapLimit} = S) -> ?REACT(a30), %% case length(get(react)) of XX when XX > 500 -> io:format(user, "A30 ~w! 
~w: ~P\n", [MyName, XX, get(react), 300]), timer:sleep(500); _ -> ok end, - {P_newprop1, S2, Up} = calc_projection(S, MyName), + AllHosed = [], + {P_newprop1, S2, Up} = calc_projection(S, MyName, AllHosed, P_current_calc), ?REACT({a30, ?LINE, [{current, machi_projection:make_summary(S#ch_mgr.proj)}]}), + ?REACT({a30, ?LINE, [{calc_current, machi_projection:make_summary(P_current_calc)}]}), ?REACT({a30, ?LINE, [{newprop1, machi_projection:make_summary(P_newprop1)}]}), ?REACT({a30, ?LINE, [{latest, machi_projection:make_summary(P_latest)}]}), %% Are we flapping yet? - {P_newprop2, S3} = calculate_flaps(P_newprop1, P_latest, P_current, Up, - FlapLimit, S2), + {P_newprop2, S3} = calculate_flaps(P_newprop1, P_latest, P_current_calc, + Up, FlapLimit, S2), %% Move the epoch number up ... originally done in C300. #projection_v1{epoch_number=Epoch_newprop2}=P_newprop2, @@ -1238,7 +1242,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, {_FLU, {_EpkTime, FlapCount}} <- AFPs]) of [SmallestFC|_] when SmallestFC > ?MINIMUM_ALL_FLAP_LIMIT -> a30_make_inner_projection( - P_current, P_newprop3, P_latest, Up, S3); + P_current_calc, P_newprop3, P_latest, Up, S3); _ -> %% Not everyone is flapping enough. Or perhaps %% everyone was but we finally saw some new server X @@ -1257,7 +1261,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, %% a normal projection: the old proj has an inner but the newprop %% does not. MoveFromInnerToNorm_p = - case {inner_projection_exists(P_current), + case {inner_projection_exists(P_current_calc), inner_projection_exists(P_newprop11)} of {true, false} -> true; {_, _} -> false @@ -1286,7 +1290,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, %% Remember! P_current is this manager's private in-use projection. %% It is always less than or equal to P_latest's epoch! 
- Current_flap_counts = get_all_flap_counts(P_current), + Current_flap_counts = get_all_flap_counts(P_current_calc), Latest_authors_flap_count_current = proplists:get_value( Author_latest, Current_flap_counts), Latest_flap_counts = get_all_flap_counts(P_latest), @@ -1299,7 +1303,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, %% count to zero flap count. But ... do not kick out %% of our flapping mode locally if we do not have an %% inner projection. - inner_projection_exists(P_current); + inner_projection_exists(P_current_calc); {_, _} -> false end, @@ -1310,7 +1314,7 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, {move_from_inner, MoveFromInnerToNorm_p}], ?REACT({a30, ?LINE, ClauseInfo}), MoveToNorm_p = MoveFromInnerToNorm_p orelse Kicker_p, - CurrentHasZerf_p = has_make_zerf_annotation(P_current), + CurrentHasZerf_p = has_make_zerf_annotation(P_current_calc), if MoveToNorm_p, P_newprop11#projection_v1.upi == [], CMode == cp_mode -> @@ -1360,18 +1364,17 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, %% make_zerf() on our next iteration. ?REACT({a30, ?LINE, []}), - %% Fall back to the none projection as if we're restarting. react_to_env_A49(P_latest, [], S10); MoveToNorm_p -> %% Move from inner projection to outer. - P_inner2A = inner_projection_or_self(P_current), + P_inner2A = inner_projection_or_self(P_current_calc), ResetEpoch = P_newprop11#projection_v1.epoch_number, - ResetAuthor = case P_current#projection_v1.upi of + ResetAuthor = case P_current_calc#projection_v1.upi of [] -> %% Drat, fall back to current's author. 
- P_current#projection_v1.author_server; + P_current_calc#projection_v1.author_server; _ -> - lists:last(P_current#projection_v1.upi) + lists:last(P_current_calc#projection_v1.upi) end, ClauseInfo2 = [{move_from_inner_to_outer, true}, {old_author, P_inner2A#projection_v1.author_server}, @@ -1400,8 +1403,11 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, %% altered significantly. Use calc_projection() to find out what %% nodes are down *now* (as best as we can tell right now). {P_o, S_o, _Up2} = calc_projection2(P_inner2B, MyName, [], [], S10), + ReactI2 = [{inner2po,machi_projection:make_summary(P_o)}], + ?REACT({a30, ?LINE, ReactI2}), %% NOTE: We are intentionally clearing flap info by not %% carrying it forwarding in the new projection. + %% TODO 2015-09-01: revisit clearing flapping state here? react_to_env_A40(Retries, P_o, P_latest, LatestUnanimousP, S_o); true -> ?REACT({a30, ?LINE, []}), @@ -1410,7 +1416,8 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, _ReadExtra, end. a30_make_inner_projection(P_current, P_newprop3, P_latest, Up, - #ch_mgr{name=MyName, consistency_mode=CMode} = S) -> + #ch_mgr{name=MyName, consistency_mode=CMode, + proj=P_current_real} = S) -> AllHosed = get_all_hosed(P_newprop3), NewPropDown = if P_newprop3#projection_v1.upi == [] -> %% This is a none proj, don't believe down list @@ -1465,24 +1472,31 @@ a30_make_inner_projection(P_current, P_newprop3, P_latest, Up, #projection_v1{epoch_number=Epoch_latest_i, upi=UPI_latest_i, repairing=Repairing_latest_i} = P_latest_i, + CurrentRealEpochCheck_p = + case inner_projection_exists(P_current_real) of + false -> + %% We're definitely going to suggest making + %% outer->inner transition. 
+ Epoch_latest_i >= P_current_real#projection_v1.epoch_number + andalso + Epoch_latest_i >= P_current#projection_v1.epoch_number; + true -> + true + end, ?REACT({a30, ?LINE, [{epoch_latest_i, Epoch_latest_i}, {upi_latest_i, UPI_latest_i}, + {current_real_epoch_check, + CurrentRealEpochCheck_p}, + {x1,inner_projection_exists(P_current_real)}, + {x2,Epoch_latest_i}, + {x3,P_current_real#projection_v1.epoch_number}, + {x4,P_current#projection_v1.epoch_number}, {repairing_latest_i,Repairing_latest_i}]}), LatestSameEnough_p = - %% Experiment: With chain length=5, this check is a pain, - %% e.g. when make_zerf() verifies last - %% history of [c,d,e] *and no inner*, and now - %% others have proposed *with an inner* with - %% [a/witness,d,e] and bigger epoch. So, the - %% experiment is that if we choose something - %% insane here, other steps will figure that - %% out and do something safe instead. - %% - %% ({UPI_latest_i, Repairing_latest_i} == - %% {UPI_current_x, Repairing_current_x}) - %% andalso UPI_latest_i /= [] % avoid hasty none proj jump andalso + CurrentRealEpochCheck_p + andalso Epoch_latest_i >= P_current_ios#projection_v1.epoch_number, CurrentHasInner_and_LatestIsDisjoint_p = P_current_has_inner_p @@ -1490,10 +1504,14 @@ a30_make_inner_projection(P_current, P_newprop3, P_latest, Up, ordsets:is_disjoint( ordsets:from_list(UPI_current_x ++ Repairing_current_x), ordsets:from_list(UPI_latest_i ++ Repairing_latest_i)), + ?REACT({a30, ?LINE, + [{latest_same_enough,LatestSameEnough_p}, + {current_has_inner_p,P_current_has_inner_p}, + {current_hialid,CurrentHasInner_and_LatestIsDisjoint_p}]}), if LatestSameEnough_p -> ?REACT({a30, ?LINE, []}), case P_current_has_inner_p andalso - (UPI_current_x /= P_i3#projection_v1.upi orelse + (UPI_current_x /= P_i3#projection_v1.upi orelse Repairing_current_x /= P_i3#projection_v1.repairing) of true -> @@ -1505,10 +1523,12 @@ a30_make_inner_projection(P_current, P_newprop3, P_latest, Up, false -> P_latest_i end; - 
CurrentHasInner_and_LatestIsDisjoint_p -> + CurrentHasInner_and_LatestIsDisjoint_p + andalso + CurrentRealEpochCheck_p -> ?REACT({a30, ?LINE, []}), P_current_ios; - true -> + true -> ?REACT({a30, ?LINE, []}), false end; @@ -1534,13 +1554,13 @@ a30_make_inner_projection(P_current, P_newprop3, P_latest, Up, {P_newprop4, S_i}; true -> FinalInnerEpoch = - case inner_projection_exists(P_current) of + case inner_projection_exists(P_current_real) of false -> ?REACT({a30xyzxyz, ?LINE, [P_newprop3#projection_v1.epoch_number]}), FinalCreation = P_newprop3#projection_v1.creation_time, P_newprop3#projection_v1.epoch_number; true -> - P_oldinner = inner_projection_or_self(P_current), + P_oldinner = inner_projection_or_self(P_current_real), ?REACT({a30xyzxyz, ?LINE, [{incrementing_based_on,P_oldinner#projection_v1.epoch_number + 1}]}), FinalCreation = P_newprop3#projection_v1.creation_time, P_oldinner#projection_v1.epoch_number + 1 @@ -1748,16 +1768,26 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP, end end. -react_to_env_A49(_P_latest, FinalProps, #ch_mgr{name=MyName, - proj=P_current} = S) -> +react_to_env_A49(P_latest, FinalProps, #ch_mgr{consistency_mode=cp_mode, + name=MyName, + proj=P_current} = S) -> ?REACT(a49), - #projection_v1{all_members=All_list, - witnesses=Witness_list, - members_dict=MembersDict} = P_current, - #projection_v1{epoch_number=EpochCurrent} = P_current, - P_none = make_none_projection(EpochCurrent, MyName, All_list, Witness_list, - MembersDict), - react_to_env_A50(P_none, FinalProps, set_proj(S, P_none)). + %% Using the none projection as our new P_current does *not* work: + %% if we forget what P_current is, then we risk not being able to + %% detect an insane chain transition or else risk a false positive + %% insane check. + %% + %% Instead, we will create an implicit annotation in P_current + %% that will force A29 to always use the projection from + %% make_zerf() as the basis for our next transition calculations. 
+ %% In this wacky case, we break the checksum on P_current so that + %% A29's epoch_id comparison will always be unequal and thus + %% always trigger make_zerf(). + Dbg = P_current#projection_v1.dbg, + P_current2 = P_current#projection_v1{epoch_csum= <<"broken">>, + dbg=[{zerf_backstop,true}, + {zerf_in,a49}|Dbg]}, + react_to_env_A50(P_latest, FinalProps, set_proj(S, P_current2)). react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) -> ?REACT(a50), @@ -1765,8 +1795,8 @@ react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) -> {latest_epoch, P_latest#projection_v1.epoch_number}, {final_props, FinalProps}]}), %% if S#ch_mgr.name == c -> io:format(user, "A50: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, - %% V = case file:read_file("/tmp/moomoo") of {ok,_} -> true; _ -> false end, - %% if V, S#ch_mgr.name == c -> io:format("C110: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, + V = case file:read_file("/tmp/moomoo."++atom_to_list(S#ch_mgr.name)) of {ok,_} -> true; _ -> false end, + if V -> io:format(user, "A50: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, {{no_change, FinalProps, P_current#projection_v1.epoch_number}, S}. react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, @@ -1857,12 +1887,7 @@ react_to_env_B10(Retries, P_newprop, P_latest, LatestUnanimousP, if CMode == ap_mode -> react_to_env_A50(P_latest, FinalProps, S); CMode == cp_mode -> - %% Be more harsh, stop iterating by A49 so that when we - %% resume we will have a much small opinion about the - %% world. - %% WHOOPS, doesn't allow convergence in simple cases, - %% needs more work!!!!!!!!!!!!!!!! Monday evening!!!! - %% react_to_env_A49(P_latest, FinalProps, S) + %% Don't use A49, previous experiments failed, check git. 
react_to_env_A50(P_latest, FinalProps, S) end; @@ -1977,7 +2002,7 @@ react_to_env_C100(P_newprop, #projection_v1{author_server=Author_latest, Sane = projection_transition_is_sane(P_current, P_latest, MyName), QQ_current = lists:flatten(io_lib:format("~w:~w,~w/~w:~w,~w", [P_current#projection_v1.epoch_number, P_current#projection_v1.upi, P_current#projection_v1.repairing, (inner_projection_or_self(P_current))#projection_v1.epoch_number, (inner_projection_or_self(P_current))#projection_v1.upi, (inner_projection_or_self(P_current))#projection_v1.repairing])), QQ_latest = lists:flatten(io_lib:format("~w:~w,~w/~w:~w,~w", [P_latest#projection_v1.epoch_number, P_latest#projection_v1.upi, P_latest#projection_v1.repairing, (inner_projection_or_self(P_latest))#projection_v1.epoch_number, (inner_projection_or_self(P_latest))#projection_v1.upi, (inner_projection_or_self(P_latest))#projection_v1.repairing])), - if Sane == true -> ok; true -> ?V("\n~w-insane-~w-auth=~w ~s -> ~s ~w\n ~p\n", [?LINE, MyName, P_newprop#projection_v1.author_server, QQ_current, QQ_latest, Sane, get(react)]) end, + if Sane == true -> ok; true -> ?V("\n~w-insane-~w-auth=~w ~s -> ~s ~w\n ~p\n ~p\n", [?LINE, MyName, P_newprop#projection_v1.author_server, QQ_current, QQ_latest, Sane, get(why2), get(react)]) end, Flap_latest = if is_record(Flap_latest0, flap_i) -> Flap_latest0; true -> @@ -2067,7 +2092,8 @@ react_to_env_C100_inner(Author_latest, NotSanesDict0, MyName, case get({zzz_quiet, P_latest#projection_v1.epoch_number}) of undefined -> ?V("YOYO-cp-mode,~w,current=~w,",[MyName, machi_projection:make_summary((S#ch_mgr.proj))]); _ -> ok end, put({zzz_quiet, P_latest#projection_v1.epoch_number}, true), react_to_env_A49(P_latest, [], S2); - N when N > ?TOO_FREQUENT_BREAKER -> + N when CMode == ap_mode, + N > ?TOO_FREQUENT_BREAKER -> ?V("\n\nYOYO ~w breaking the cycle of:\n current: ~w\n new : ~w\n", [MyName, machi_projection:make_summary(S#ch_mgr.proj), machi_projection:make_summary(P_latest)]), 
?REACT({c100, ?LINE, [{not_sanes_author_count, N}]}), react_to_env_C103(P_newprop, P_latest, S2); @@ -2200,6 +2226,8 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H, io:format(user, "\nCONFIRM debug C120 ~w was annotated ~W outer ~w\n", [S#ch_mgr.name, (inner_projection_or_self(P_latest))#projection_v1.epoch_number, 5, P_latest#projection_v1.epoch_number]), S2#ch_mgr{proj_unanimous=ConfTime} end, + V = case file:read_file("/tmp/moomoo."++atom_to_list(S#ch_mgr.name)) of {ok,_} -> true; _ -> false end, + if V -> io:format("C120: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, {{now_using, FinalProps, P_latest#projection_v1.epoch_number}, S3}. add_and_trunc_history(P_latest, H, MaxLength) -> @@ -2592,6 +2620,7 @@ projection_transition_is_sane(P1, P2, RelativeToServer) -> projection_transition_is_sane(P1, P2, RelativeToServer, RetrospectiveP) -> put(myname, RelativeToServer), put(why2, []), + CMode = P2#projection_v1.mode, HasInner1 = inner_projection_exists(P1), HasInner2 = inner_projection_exists(P2), Inner1 = inner_projection_or_self(P1), @@ -2621,7 +2650,8 @@ projection_transition_is_sane(P1, P2, RelativeToServer, RetrospectiveP) -> ?RETURN2(true)) end; Else -> - if HasInner1 and (not HasInner2) -> + if CMode == cp_mode, + HasInner1 and (not HasInner2) -> %% OK, imagine that we used to be flapping but now we've %% stopped flapping. %% @@ -2644,6 +2674,34 @@ projection_transition_is_sane(P1, P2, RelativeToServer, RetrospectiveP) -> projection_transition_is_sane_final_review(P1, P2, projection_transition_is_sane_with_si_epoch( Inner1, P2, RelativeToServer, RetrospectiveP))); + CMode == cp_mode, + (not HasInner1) and HasInner2 -> + %% OK, imagine that we are entering flapping mode. 
+ %% + %% P1 = outer = upi=[a,d,e],repairing=[c] epoch 298 + %% inner = undefined + %% to + %% P2 = outer = upi=[d,e,c],repairing=[] epoch 684 + %% inner = upi=[a,d,e],repairing=[c] epoch 163 + %% + %% We have been unstable for a long time: 684-298 is a + %% lot of churn. Our internal sense of what the outer + %% projection should look like is screwed up. Someone + %% thinks that there was a repair of c that finished in + %% the outer projection, during the churn between 298 and + %% 684, but we didn't adopt that change to the change. + %% Perhaps we were asleep? + %% + %% Based on our last view of the world at 298, we are + %% keeping that same view *and* we've decided to start + %% flapping, hence the inner projection. Make certain + %% that that transition is ok relative to ourself, and + %% let the other safety checks built into humming + %% consensus & CP mode management take care of the rest. + ?RETURN2( + projection_transition_is_sane_final_review(P1, P2, + projection_transition_is_sane_with_si_epoch( + P1, Inner2, RelativeToServer, RetrospectiveP))); true -> ?RETURN2(Else) end @@ -2696,7 +2754,7 @@ projection_transition_is_sane_final_review( #projection_v1{mode=cp_mode, upi=UPI2, witnesses=Witness_list}=_P2, true) -> %% All earlier sanity checks has said that this transition is sane, but - %% we also need to make certain that any CP mode transition preserves at + %% we also need to make certain that any CP mode transition preserves at %% least one non-witness server in the UPI list. Earlier checks have %% verified that the ordering of the FLUs within the UPI list is ok. UPI1_s = ordsets:from_list(UPI1 -- Witness_list), @@ -2707,6 +2765,20 @@ projection_transition_is_sane_final_review( {upi1_s,UPI1}, {upi2_s,UPI2}]}), case proplists:get_value(zerf_backstop, P1_dbg) of true when UPI1 == [] -> + %% CAUTION, this is a dangerous case. 
If the old projection, P1, + %% has a 'zerf_backstop' annotation, then when this function + %% returns true, we are (in effect) saying, "We trust you." What + %% if we called make_zerf() a year ago because we took a 1 year + %% nap?? How can we trust this? + %% + %% The answer is: this is not our last safety enforcement for CP + %% mode, fortunately. We are going from the none projection to a + %% quorum majority projection, *and* we will not unwedge ourself + %% until we can verify that all UPI members of the chain are + %% unanimous for this epoch. So if we took a 1 year nap already, + %% or if we take a one year right now and delay writing our + %% private projection for 1 year, then if we disagree with the + %% quorum majority, we simply won't be able to unwedge. ?RETURN2(true); _ when UPI2 == [] -> %% We're down to the none projection to wedge ourself. That's ok. @@ -3257,7 +3329,8 @@ simple_chain_state_transition_is_sane(_Author1, UPI1, Repair1, Author2, UPI2) -> author1,_Author1, upi1,UPI1, repair1,Repair1, author2,Author2, upi2,UPI2, keepsdels,KeepsDels, orders,Orders, numKeeps,NumKeeps, - numOrders,NumOrders, answer1,Answer1}]}), + numOrders,NumOrders, answer1,Answer1}, + {why2, get(why2)}]}), if not Answer1 -> ?RETURN2(Answer1); true -> @@ -3558,8 +3631,6 @@ diversion_c120_verbose_goop2(P_latest0, S) -> end. 
perhaps_verbose_c110(P_latest2, S) -> - %% V = case file:read_file("/tmp/moomoo") of {ok,_} -> true; _ -> false end, - %% if V, S#ch_mgr.name == c -> io:format("C110: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of true -> {_,_,C} = os:timestamp(), From 42aeecd9dbf3882330f2a7091de900f85f3769a1 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 4 Sep 2015 15:23:48 +0900 Subject: [PATCH 51/51] Fix machi_projection_store_test error --- src/machi_projection_store.erl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/machi_projection_store.erl b/src/machi_projection_store.erl index 23f1281..3fbf9b5 100644 --- a/src/machi_projection_store.erl +++ b/src/machi_projection_store.erl @@ -306,11 +306,11 @@ do_proj_write3(ProjType, #projection_v1{epoch_number=Epoch, if CurEpoch == Epoch, CurCSum == CSum -> do_proj_write4(ProjType, Proj, Path, Epoch, S); true -> - io:format(user, "OUCH: on disk: ~w\n", [machi_projection:make_summary(binary_to_term(Bin))]), - io:format(user, "OUCH: clobber: ~w\n", [machi_projection:make_summary(Proj)]), - io:format(user, "OUCH: clobber: ~p\n", [Proj#projection_v1.dbg2]), - %% {{error, written}, S} - {{error, written, CurEpoch, Epoch, CurCSum, CSum}, S} + %% io:format(user, "OUCH: on disk: ~w\n", [machi_projection:make_summary(binary_to_term(Bin))]), + %% io:format(user, "OUCH: clobber: ~w\n", [machi_projection:make_summary(Proj)]), + %% io:format(user, "OUCH: clobber: ~p\n", [Proj#projection_v1.dbg2]), + %% {{error, written, CurEpoch, Epoch, CurCSum, CSum}, S} + {{error, written}, S} end; {error, enoent} -> do_proj_write4(ProjType, Proj, Path, Epoch, S);