2015-04-06 05:16:20 +00:00
%% -------------------------------------------------------------------
%%
%% Machi: a small village of replicated files
%%
%% Copyright (c) 2014-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
2015-04-08 05:24:07 +00:00
%% @doc The Machi chain manager, Guardian of all things related to
%% Chain Replication state, status, and data replica safety.
%%
%% The Chain Manager is responsible for managing the state of Machi's
%% "Chain Replication" state. This role is roughly analogous to the
%% "Riak Core" application inside of Riak, which takes care of
%% coordinating replica placement and replica repair.
%%
%% For each primitive data server in the cluster, a Machi FLU, there
%% is a Chain Manager process that manages its FLU's role within the
%% Machi cluster's Chain Replication scheme. Each Chain Manager
%% process executes locally and independently to manage the
%% distributed state of a single Machi Chain Replication chain.
%%
%% Machi's Chain Manager process performs similar tasks as Riak Core's
%% claimant. However, Machi has several active Chain Manager
%% processes, one per FLU server, instead of a single active process
%% like Core's claimant. Each Chain Manager process acts
%% independently; each is constrained so that it will reach consensus
%% via independent computation & action.
2015-04-06 05:16:20 +00:00
- module ( machi_chain_manager1 ) .
%% TODO: I am going to sever the connection between the flowchart and the
%% code. That diagram is really valuable, but it also takes a long time
%% to make any kind of edit; the process is too slow. This is a todo
%% item a reminder that the flowchart is important documentation and
%% must be brought back into sync with the code soon.
- behaviour ( gen_server ) .
- include ( " machi_projection.hrl " ) .
- include ( " machi_chain_manager.hrl " ) .
2015-07-16 07:01:53 +00:00
- include ( " machi_verbose.hrl " ) .
2015-04-06 05:16:20 +00:00
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
- define ( NOT_FLAPPING_START , { { epk , - 1 } , ? NOT_FLAPPING } ) .
2015-04-06 05:16:20 +00:00
2015-05-11 10:50:13 +00:00
- record ( ch_mgr , {
name : : pv1_server ( ) ,
flap_limit : : non_neg_integer ( ) ,
proj : : projection ( ) ,
2015-08-27 08:58:43 +00:00
proj_unanimous : : 'false' | erlang : timestamp ( ) ,
2015-05-11 10:50:13 +00:00
%%
timer : : 'undefined' | timer : tref ( ) ,
ignore_timer : : boolean ( ) ,
proj_history : : queue : queue ( ) ,
2015-08-20 08:32:46 +00:00
flap_count = 0 : : non_neg_integer ( ) , % I am flapping if > 0.
2015-08-18 11:49:36 +00:00
flap_start = ? NOT_FLAPPING_START
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
: : { { 'epk' , integer ( ) } , erlang : timestamp ( ) } ,
2015-08-18 11:49:36 +00:00
flap_last_up = [ ] : : list ( ) ,
2015-08-27 08:58:43 +00:00
flap_last_up_change = now ( ) : : erlang : timestamp ( ) ,
2015-08-18 11:49:36 +00:00
flap_counts_last = [ ] : : list ( ) ,
2015-07-18 15:43:10 +00:00
not_sanes : : orddict : orddict ( ) ,
2015-07-20 05:04:25 +00:00
sane_transitions = 0 : : non_neg_integer ( ) ,
2015-08-05 07:05:03 +00:00
consistency_mode : : 'ap_mode' | 'cp_mode' ,
2015-05-11 10:50:13 +00:00
repair_worker : : 'undefined' | pid ( ) ,
repair_start : : 'undefined' | erlang : timestamp ( ) ,
repair_final_status : : 'undefined' | term ( ) ,
runenv : : list ( ) , %proplist()
opts : : list ( ) , %proplist()
members_dict : : p_srvr_dict ( ) ,
proxies_dict : : orddict : orddict ( )
} ) .
2015-04-06 11:07:39 +00:00
- define ( FLU_PC , machi_proxy_flu1_client ) .
2015-04-09 05:44:58 +00:00
- define ( TO , ( 2 * 1000 ) ) . % default timeout
2015-04-06 11:07:39 +00:00
2015-04-06 05:16:20 +00:00
%% Keep a history of our flowchart execution in the process dictionary.
- define ( REACT ( T ) , put ( react , [ T | get ( react ) ] ) ) .
2015-05-11 10:50:13 +00:00
%% Define the period of private projection stability before we'll
%% start repair.
- ifdef ( TEST ) .
2015-05-16 08:39:58 +00:00
- define ( REPAIR_START_STABILITY_TIME , 3 ) .
2015-05-11 10:50:13 +00:00
- else . % TEST
2015-05-16 08:39:58 +00:00
- define ( REPAIR_START_STABILITY_TIME , 10 ) .
2015-05-11 10:50:13 +00:00
- endif . % TEST
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
%% Magic constant for looping "too frequently" breaker. TODO revisit & revise.
- define ( TOO_FREQUENT_BREAKER , 10 ) .
2015-07-05 05:52:50 +00:00
- define ( RETURN2 ( X ) , begin ( catch put ( why2 , [ ? LINE | get ( why2 ) ] ) ) , X end ) .
2015-07-03 14:17:34 +00:00
2015-08-13 05:21:31 +00:00
%% This rank is used if a majority quorum is not available.
- define ( RANK_CP_MINORITY_QUORUM , - 99 ) .
2015-08-14 08:05:16 +00:00
%% Amount of epoch number skip-ahead for set_chain_members call
- define ( SET_CHAIN_MEMBERS_EPOCH_SKIP , 1111 ) .
2015-08-18 11:49:36 +00:00
%% Minimum guideline for considering a remote to be flapping
- define ( MINIMUM_ALL_FLAP_LIMIT , 10 ) .
2015-04-06 05:16:20 +00:00
%% API
2015-05-01 15:33:49 +00:00
- export ( [ start_link / 2 , start_link / 3 , stop / 1 , ping / 1 ,
2015-07-21 09:43:59 +00:00
set_chain_members / 2 , set_chain_members / 3 , set_active / 2 ,
2015-07-03 07:18:40 +00:00
trigger_react_to_env / 1 ] ) .
2015-04-06 05:16:20 +00:00
- export ( [ init / 1 , handle_call / 3 , handle_cast / 2 , handle_info / 2 ,
terminate / 2 , code_change / 3 ] ) .
2015-05-08 09:17:41 +00:00
- export ( [ make_chmgr_regname / 1 , projection_transitions_are_sane / 2 ,
2015-07-03 10:21:41 +00:00
inner_projection_exists / 1 , inner_projection_or_self / 1 ,
simple_chain_state_transition_is_sane / 3 ,
2015-07-03 13:05:35 +00:00
simple_chain_state_transition_is_sane / 5 ,
2015-07-03 10:21:41 +00:00
chain_state_transition_is_sane / 5 ] ) .
%% Exports so that EDoc docs generated for these internal funcs.
- export ( [ mk / 3 ] ) .
2015-04-06 05:16:20 +00:00
2015-08-25 09:43:55 +00:00
%% Exports for developer/debugging
- export ( [ scan_dir / 4 , strip_dbg2 / 1 ,
get_ps / 2 , has_not_sane / 2 , all_hosed_history / 2 ] ) .
2015-04-06 05:16:20 +00:00
- ifdef ( TEST ) .
- export ( [ test_calc_projection / 2 ,
2015-04-09 05:44:58 +00:00
test_write_public_projection / 2 ,
2015-04-06 05:16:20 +00:00
test_read_latest_public_projection / 2 ,
get_all_hosed / 1 ] ) .
- ifdef ( EQC ) .
- include_lib ( " eqc/include/eqc.hrl " ) .
- endif .
- ifdef ( PULSE ) .
- compile ( { parse_transform , pulse_instrument } ) .
2015-07-16 07:25:38 +00:00
- include_lib ( " pulse_otp/include/pulse_otp.hrl " ) .
2015-04-06 05:16:20 +00:00
- endif .
- include_lib ( " eunit/include/eunit.hrl " ) .
- compile ( export_all ) .
- endif . %TEST
2015-04-09 08:13:38 +00:00
start_link ( MyName , MembersDict ) - >
start_link ( MyName , MembersDict , [ ] ) .
2015-04-06 05:16:20 +00:00
2015-04-09 08:13:38 +00:00
start_link ( MyName , MembersDict , MgrOpts ) - >
2015-05-02 07:59:28 +00:00
gen_server : start_link ( { local , make_chmgr_regname ( MyName ) } , ? MODULE ,
2015-04-30 08:28:43 +00:00
{ MyName , MembersDict , MgrOpts } , [ ] ) .
2015-04-06 05:16:20 +00:00
stop ( Pid ) - >
gen_server : call ( Pid , { stop } , infinity ) .
ping ( Pid ) - >
gen_server : call ( Pid , { ping } , infinity ) .
2015-05-07 09:39:39 +00:00
%% @doc Set chain members list.
%%
%% NOTE: This implementation is a bit brittle, in that an author with
%% higher rank may try to re-suggest the old membership list if it
%% races with an author of lower rank. For now, we suggest calling
%% set_chain_members() first on the author of highest rank and finish
%% with lowest rank, i.e. name z* first, name a* last.
2015-05-01 15:33:49 +00:00
set_chain_members ( Pid , MembersDict ) - >
2015-07-21 09:43:59 +00:00
set_chain_members ( Pid , MembersDict , [ ] ) .
set_chain_members ( Pid , MembersDict , Witness_list ) - >
case lists : all ( fun ( Witness ) - > orddict : is_key ( Witness , MembersDict ) end ,
Witness_list ) of
true - >
Cmd = { set_chain_members , MembersDict , Witness_list } ,
gen_server : call ( Pid , Cmd , infinity ) ;
false - >
{ error , bad_arg }
end .
2015-05-01 15:33:49 +00:00
2015-05-02 07:59:28 +00:00
set_active ( Pid , Boolean ) when Boolean == true ; Boolean == false - >
gen_server : call ( Pid , { set_active , Boolean } , infinity ) .
2015-07-03 07:18:40 +00:00
trigger_react_to_env ( Pid ) - >
gen_server : call ( Pid , { trigger_react_to_env } , infinity ) .
2015-04-06 05:16:20 +00:00
- ifdef ( TEST ) .
%% Test/debugging code only.
2015-04-09 05:44:58 +00:00
test_write_public_projection ( Pid , Proj ) - >
gen_server : call ( Pid , { test_write_public_projection , Proj } , infinity ) .
2015-04-06 05:16:20 +00:00
%% Calculate a projection and return it to us.
%% If KeepRunenvP is true, the server will retain its change in its
%% runtime environment, e.g., changes in simulated network partitions.
test_calc_projection ( Pid , KeepRunenvP ) - >
gen_server : call ( Pid , { test_calc_projection , KeepRunenvP } , infinity ) .
test_read_latest_public_projection ( Pid , ReadRepairP ) - >
gen_server : call ( Pid , { test_read_latest_public_projection , ReadRepairP } ,
infinity ) .
- endif . % TEST
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2015-05-01 05:51:42 +00:00
%% Bootstrapping is a hassle ... when when isn't it?
%%
%% If InitMembersDict == [], then we don't know anything about the chain
%% that we'll be participating in. We'll have to wait for directions from
%% our sysadmin later.
%%
%% If InitMembersDict /= [], then we do know what chain we're
%% participating in. It's probably test code, since that's about the
%% only time that we know so much at init() time.
%%
%% In either case, we'll try to create & store an epoch 0 projection
%% and store it to both projections stores. This is tricky if
%% InitMembersDict == [] because InitMembersDict usually contains the
%% #p_svrv records that we need to *write* to the projection store,
%% even our own private store! For test code, we get the store
%% manager's pid in MgrOpts and use direct gen_server calls to the
%% local projection store.
init ( { MyName , InitMembersDict , MgrOpts } ) - >
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
random : seed ( now ( ) ) ,
2015-05-01 15:33:49 +00:00
init_remember_partition_hack ( ) ,
2015-08-11 06:24:26 +00:00
Opt = fun ( Key , Default ) - > proplists : get_value ( Key , MgrOpts , Default ) end ,
InitWitness_list = Opt ( witnesses , [ ] ) ,
2015-05-01 05:51:42 +00:00
ZeroAll_list = [ P #p_srvr.name | | { _ , P } < - orddict : to_list ( InitMembersDict ) ] ,
2015-08-30 10:53:47 +00:00
ZeroProj = make_none_projection ( 0 , MyName , ZeroAll_list ,
2015-08-11 06:24:26 +00:00
InitWitness_list , InitMembersDict ) ,
2015-05-01 05:51:42 +00:00
ok = store_zeroth_projection_maybe ( ZeroProj , MgrOpts ) ,
2015-08-26 05:54:01 +00:00
CMode = Opt ( consistency_mode , ap_mode ) ,
case get_projection_store_regname ( MgrOpts ) of
undefined - >
ok ;
PS - >
ok = set_consistency_mode ( PS , CMode )
end ,
2015-05-01 05:51:42 +00:00
2015-08-06 10:24:14 +00:00
%% Using whatever is the largest epoch number in our local private
%% store, this manager starts out using the "none" projection. If
%% other members of the chain are running, then we'll simply adopt
%% whatever they're using as a basis for our next suggested
%% projection.
%%
%% If we're in CP mode, we have to be very careful about who we
%% choose to be UPI members when we (or anyone else) restarts.
%% However, that choice is *not* made here: it is made later
%% during our first humming consensus iteration. When we start
%% with the none projection, we're make a safe choice before
%% wading into the deep waters.
2015-08-06 08:48:22 +00:00
{ MembersDict , Proj0 } =
2015-05-06 02:41:04 +00:00
get_my_private_proj_boot_info ( MgrOpts , InitMembersDict , ZeroProj ) ,
2015-08-06 10:24:14 +00:00
#projection_v1 { epoch_number = CurrentEpoch ,
all_members = All_list , witnesses = Witness_list } = Proj0 ,
2015-08-30 10:53:47 +00:00
Proj = make_none_projection ( CurrentEpoch ,
MyName , All_list , Witness_list , MembersDict ) ,
2015-08-06 08:48:22 +00:00
2015-04-09 08:13:38 +00:00
RunEnv = [ { seed , Opt ( seed , now ( ) ) } ,
2015-05-08 04:40:44 +00:00
{ use_partition_simulator , Opt ( use_partition_simulator , false ) } ,
2015-07-20 07:25:42 +00:00
{ simulate_repair , Opt ( simulate_repair , true ) } ,
2015-04-09 08:13:38 +00:00
{ network_partitions , Opt ( network_partitions , [ ] ) } ,
{ network_islands , Opt ( network_islands , [ ] ) } ,
{ up_nodes , Opt ( up_nodes , not_init_yet ) } ] ,
2015-05-06 02:41:04 +00:00
ActiveP = Opt ( active_mode , true ) ,
2015-08-26 09:47:39 +00:00
S = set_proj ( #ch_mgr { name = MyName ,
2015-04-06 05:16:20 +00:00
%% TODO 2015-03-04: revisit, should this constant be bigger?
%% Yes, this should be bigger, but it's a hack. There is
%% no guarantee that all parties will advance to a minimum
%% flap awareness in the amount of time that this mgr will.
2015-08-18 11:49:36 +00:00
flap_limit = length ( All_list ) + 3 ,
2015-04-09 08:13:38 +00:00
timer = 'undefined' ,
proj_history = queue : new ( ) ,
2015-07-18 15:43:10 +00:00
not_sanes = orddict : new ( ) ,
2015-08-05 07:05:03 +00:00
consistency_mode = CMode ,
2015-04-06 05:16:20 +00:00
runenv = RunEnv ,
2015-08-26 09:47:39 +00:00
opts = MgrOpts } , Proj ) ,
2015-05-07 08:52:16 +00:00
{ _ , S2 } = do_set_chain_members_dict ( MembersDict , S ) ,
2015-05-01 15:33:49 +00:00
S3 = if ActiveP == false - >
S2 ;
2015-04-09 08:13:38 +00:00
ActiveP == true - >
2015-05-01 15:33:49 +00:00
set_active_timer ( S2 )
2015-04-09 08:13:38 +00:00
end ,
2015-05-01 15:33:49 +00:00
{ ok , S3 } .
2015-04-06 05:16:20 +00:00
2015-04-09 03:16:58 +00:00
handle_call ( { ping } , _ From , S ) - >
{ reply , pong , S } ;
2015-07-21 09:43:59 +00:00
handle_call ( { set_chain_members , MembersDict , Witness_list } , _ From ,
2015-05-07 08:52:16 +00:00
#ch_mgr { name = MyName ,
proj = #projection_v1 { all_members = OldAll_list ,
epoch_number = OldEpoch ,
upi = OldUPI } = OldProj } = S ) - >
{ Reply , S2 } = do_set_chain_members_dict ( MembersDict , S ) ,
%% TODO: should there be any additional sanity checks? Right now,
%% if someone does something bad, then do_react_to_env() will
%% crash, which will crash us, and we'll restart in a sane & old
%% config.
All_list = [ P #p_srvr.name | | { _ , P } < - orddict : to_list ( MembersDict ) ] ,
MissingInNew = OldAll_list -- All_list ,
2015-08-30 11:39:58 +00:00
{ NewUPI , NewDown } = if OldEpoch == 0 - >
{ All_list , [ ] } ;
true - >
NUPI = OldUPI -- MissingInNew ,
{ NUPI , All_list -- NUPI }
end ,
2015-08-14 08:05:16 +00:00
NewEpoch = OldEpoch + ? SET_CHAIN_MEMBERS_EPOCH_SKIP ,
2015-08-26 09:47:39 +00:00
CMode = calc_consistency_mode ( Witness_list ) ,
2015-08-26 05:54:01 +00:00
ok = set_consistency_mode ( machi_flu_psup : make_proj_supname ( MyName ) , CMode ) ,
2015-05-07 08:52:16 +00:00
NewProj = machi_projection : update_checksum (
OldProj #projection_v1 { author_server = MyName ,
creation_time = now ( ) ,
2015-08-13 15:12:13 +00:00
mode = CMode ,
2015-05-07 08:52:16 +00:00
epoch_number = NewEpoch ,
all_members = All_list ,
2015-07-21 09:43:59 +00:00
witnesses = Witness_list ,
2015-05-07 08:52:16 +00:00
upi = NewUPI ,
repairing = [ ] ,
down = NewDown ,
members_dict = MembersDict } ) ,
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
%% Reset all flapping state.
2015-06-02 11:32:52 +00:00
NewProj2 = NewProj #projection_v1 { flap = make_flapping_i ( ) } ,
2015-08-26 09:47:39 +00:00
S3 = clear_flapping_state ( set_proj ( S2 #ch_mgr { proj_history = queue : new ( ) } ,
NewProj2 ) ) ,
2015-05-07 08:52:16 +00:00
{ _ QQ , S4 } = do_react_to_env ( S3 ) ,
{ reply , Reply , S4 } ;
2015-05-02 07:59:28 +00:00
handle_call ( { set_active , Boolean } , _ From , #ch_mgr { timer = TRef } = S ) - >
case { Boolean , TRef } of
{ true , undefined } - >
S2 = set_active_timer ( S ) ,
{ reply , ok , S2 } ;
{ false , _ } - >
( catch timer : cancel ( TRef ) ) ,
{ reply , ok , S #ch_mgr { timer = undefined } } ;
_ - >
{ reply , error , S }
end ;
2015-04-09 03:16:58 +00:00
handle_call ( { stop } , _ From , S ) - >
{ stop , normal , ok , S } ;
2015-04-06 05:16:20 +00:00
handle_call ( { test_calc_projection , KeepRunenvP } , _ From ,
#ch_mgr { name = MyName } = S ) - >
RelativeToServer = MyName ,
2015-08-11 06:24:26 +00:00
{ P , S2 , _ Up } = calc_projection ( S , RelativeToServer ) ,
2015-04-06 05:16:20 +00:00
{ reply , { ok , P } , if KeepRunenvP - > S2 ;
true - > S
end } ;
2015-04-09 05:44:58 +00:00
handle_call ( { test_write_public_projection , Proj } , _ From , S ) - >
{ Res , S2 } = do_cl_write_public_proj ( Proj , S ) ,
{ reply , Res , S2 } ;
2015-04-06 05:16:20 +00:00
handle_call ( { test_read_latest_public_projection , ReadRepairP } , _ From , S ) - >
{ Perhaps , Val , ExtraInfo , S2 } =
do_cl_read_latest_public_projection ( ReadRepairP , S ) ,
Res = { Perhaps , Val , ExtraInfo } ,
{ reply , Res , S2 } ;
2015-07-03 07:18:40 +00:00
handle_call ( { trigger_react_to_env } = Call , _ From , S ) - >
2015-07-02 19:30:05 +00:00
gobble_calls ( Call ) ,
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
{ TODOtodo , S2 } = do_react_to_env ( S ) ,
2015-04-06 05:16:20 +00:00
{ reply , TODOtodo , S2 } ;
handle_call ( _ Call , _ From , S ) - >
2015-08-06 08:48:22 +00:00
io : format ( user , " \n Bad call to ~p : ~p \n " , [ S #ch_mgr.name , _ Call ] ) ,
2015-04-06 05:16:20 +00:00
{ reply , whaaaaaaaaaa , S } .
handle_cast ( _ Cast , S ) - >
? D ( { cast_whaaaaaaaaaaa , _ Cast } ) ,
{ noreply , S } .
2015-05-07 08:52:16 +00:00
handle_info ( tick_check_environment , #ch_mgr { ignore_timer = true } = S ) - >
{ noreply , S } ;
handle_info ( tick_check_environment , S ) - >
2015-05-11 10:50:13 +00:00
{ { _ Delta , Props , _ Epoch } , S1 } = do_react_to_env ( S ) ,
S2 = sanitize_repair_state ( S1 ) ,
S3 = perhaps_start_repair ( S2 ) ,
2015-05-06 02:41:04 +00:00
case proplists : get_value ( throttle_seconds , Props ) of
N when is_integer ( N ) , N > 0 - >
2015-05-07 08:52:16 +00:00
%% We are flapping. Set ignore_timer=true and schedule a
%% reminder to stop ignoring. This slows down the rate of
2015-07-02 19:30:05 +00:00
%% flapping.
2015-05-07 08:52:16 +00:00
erlang : send_after ( N * 1000 , self ( ) , stop_ignoring_timer ) ,
2015-05-11 10:50:13 +00:00
{ noreply , S3 #ch_mgr { ignore_timer = true } } ;
2015-05-06 02:41:04 +00:00
_ - >
2015-05-11 10:50:13 +00:00
{ noreply , S3 }
2015-05-07 08:52:16 +00:00
end ;
handle_info ( stop_ignoring_timer , S ) - >
{ noreply , S #ch_mgr { ignore_timer = false } } ;
2015-05-11 10:50:13 +00:00
handle_info ( { 'DOWN' , _ Ref , process , Worker , Res } ,
#ch_mgr { repair_worker = Worker } = S ) - >
{ noreply , S #ch_mgr { ignore_timer = false ,
repair_worker = undefined ,
repair_final_status = Res } } ;
2015-04-06 05:16:20 +00:00
handle_info ( Msg , S ) - >
2015-04-30 08:28:43 +00:00
case get ( todo_bummer ) of undefined - > io : format ( " TODO: got ~p \n " , [ Msg ] ) ;
_ - > ok
end ,
put ( todo_bummer , true ) ,
2015-04-06 05:16:20 +00:00
{ noreply , S } .
terminate ( _ Reason , _ S ) - >
ok .
code_change ( _ OldVsn , S , _ Extra ) - >
{ ok , S } .
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2015-08-30 10:53:47 +00:00
make_none_projection ( Epoch , MyName , All_list , Witness_list , MembersDict ) - >
2015-04-14 06:30:24 +00:00
Down_list = All_list ,
UPI_list = [ ] ,
2015-08-05 03:53:20 +00:00
P = machi_projection : new ( MyName , MembersDict , Down_list , UPI_list , [ ] , [ ] ) ,
2015-08-13 15:12:13 +00:00
CMode = if Witness_list == [ ] - >
ap_mode ;
Witness_list / = [ ] - >
cp_mode
end ,
2015-08-30 10:53:47 +00:00
machi_projection : update_checksum ( P #projection_v1 { epoch_number = Epoch ,
mode = CMode ,
2015-08-13 15:12:13 +00:00
witnesses = Witness_list } ) .
2015-04-14 06:30:24 +00:00
2015-08-12 08:53:39 +00:00
make_all_projection ( MyName , All_list , Witness_list , MembersDict ) - >
Down_list = [ ] ,
UPI_list = All_list ,
P = machi_projection : new ( MyName , MembersDict , Down_list , UPI_list , [ ] , [ ] ) ,
machi_projection : update_checksum ( P #projection_v1 { witnesses = Witness_list } ) .
2015-05-06 02:41:04 +00:00
get_my_private_proj_boot_info ( MgrOpts , DefaultDict , DefaultProj ) - >
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
get_my_proj_boot_info ( MgrOpts , DefaultDict , DefaultProj , private ) .
get_my_public_proj_boot_info ( MgrOpts , DefaultDict , DefaultProj ) - >
get_my_proj_boot_info ( MgrOpts , DefaultDict , DefaultProj , public ) .
get_my_proj_boot_info ( MgrOpts , DefaultDict , DefaultProj , ProjType ) - >
2015-05-01 05:51:42 +00:00
case proplists : get_value ( projection_store_registered_name , MgrOpts ) of
undefined - >
2015-05-06 02:41:04 +00:00
{ DefaultDict , DefaultProj } ;
2015-05-01 05:51:42 +00:00
Store - >
{ ok , P } = machi_projection_store : read_latest_projection ( Store ,
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
ProjType ) ,
2015-05-06 02:41:04 +00:00
{ P #projection_v1.members_dict , P }
2015-05-01 05:51:42 +00:00
end .
%% Write the epoch 0 projection store, to assist bootstrapping. If the
%% 0th epoch is already written, there's no problem.
store_zeroth_projection_maybe ( ZeroProj , MgrOpts ) - >
2015-08-26 05:54:01 +00:00
case get_projection_store_regname ( MgrOpts ) of
2015-05-01 05:51:42 +00:00
undefined - >
ok ;
Store - >
_ = machi_projection_store : write ( Store , public , ZeroProj ) ,
_ = machi_projection_store : write ( Store , private , ZeroProj ) ,
ok
end .
2015-08-26 05:54:01 +00:00
get_projection_store_regname ( MgrOpts ) - >
proplists : get_value ( projection_store_registered_name , MgrOpts ) .
2015-08-26 09:47:39 +00:00
set_consistency_mode ( undefined , _ CMode ) - >
2015-08-26 05:54:01 +00:00
ok ;
set_consistency_mode ( ProjStore , CMode ) - >
machi_projection_store : set_consistency_mode ( ProjStore , CMode ) .
2015-04-09 08:13:38 +00:00
set_active_timer ( #ch_mgr { name = MyName , members_dict = MembersDict } = S ) - >
2015-04-09 12:08:15 +00:00
FLU_list = [ P #p_srvr.name | | { _ , P } < - orddict : to_list ( MembersDict ) ] ,
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
%% Perturb the order a little bit, to avoid near-lock-step
%% operations every few ticks.
MSec = calc_sleep_ranked_order ( 400 , 1500 , MyName , FLU_list ) +
random : uniform ( 100 ) ,
{ ok , TRef } = timer : send_interval ( MSec , tick_check_environment ) ,
2015-04-09 08:13:38 +00:00
S #ch_mgr { timer = TRef } .
2015-04-06 05:16:20 +00:00
2015-04-09 05:44:58 +00:00
do_cl_write_public_proj ( Proj , S ) - >
2015-04-06 05:16:20 +00:00
#projection_v1 { epoch_number = Epoch } = Proj ,
2015-04-09 05:44:58 +00:00
cl_write_public_proj ( Epoch , Proj , S ) .
2015-04-06 05:16:20 +00:00
cl_write_public_proj ( Epoch , Proj , S ) - >
cl_write_public_proj ( Epoch , Proj , false , S ) .
cl_write_public_proj_skip_local_error ( Epoch , Proj , S ) - >
cl_write_public_proj ( Epoch , Proj , true , S ) .
cl_write_public_proj ( Epoch , Proj , SkipLocalWriteErrorP , S ) - >
%% Write to local public projection store first, and if it succeeds,
%% then write to all remote public projection stores.
cl_write_public_proj_local ( Epoch , Proj , SkipLocalWriteErrorP , S ) .
cl_write_public_proj_local ( Epoch , Proj , SkipLocalWriteErrorP ,
2015-04-09 08:13:38 +00:00
#ch_mgr { name = MyName } = S ) - >
2015-04-06 05:16:20 +00:00
{ _ UpNodes , Partitions , S2 } = calc_up_nodes ( S ) ,
Res0 = perhaps_call_t (
2015-04-09 08:13:38 +00:00
S , Partitions , MyName ,
2015-04-09 05:44:58 +00:00
fun ( Pid ) - > ? FLU_PC : write_projection ( Pid , public , Proj , ? TO ) end ) ,
2015-04-06 05:16:20 +00:00
Continue = fun ( ) - >
2015-04-09 08:13:38 +00:00
FLUs = Proj #projection_v1.all_members -- [ MyName ] ,
2015-04-06 05:16:20 +00:00
cl_write_public_proj_remote ( FLUs , Partitions , Epoch , Proj , S )
end ,
case Res0 of
ok - >
{ XX , SS } = Continue ( ) ,
{ { local_write_result , ok , XX } , SS } ;
Else when SkipLocalWriteErrorP - >
{ XX , SS } = Continue ( ) ,
{ { local_write_result , Else , XX } , SS } ;
2015-05-01 15:33:49 +00:00
Else - >
2015-04-06 05:16:20 +00:00
{ Else , S2 }
end .
2015-04-09 05:44:58 +00:00
cl_write_public_proj_remote ( FLUs , Partitions , _ Epoch , Proj , S ) - >
2015-04-06 05:16:20 +00:00
%% We're going to be very care-free about this write because we'll rely
%% on the read side to do any read repair.
2015-04-09 05:44:58 +00:00
DoIt = fun ( Pid ) - > ? FLU_PC : write_projection ( Pid , public , Proj , ? TO ) end ,
Rs = [ { FLU , perhaps_call_t ( S , Partitions , FLU , fun ( Pid ) - > DoIt ( Pid ) end ) } | |
2015-04-06 05:16:20 +00:00
FLU < - FLUs ] ,
{ { remote_write_results , Rs } , S } .
do_cl_read_latest_public_projection ( ReadRepairP ,
2015-05-02 07:59:28 +00:00
#ch_mgr { proj = Proj1 } = S ) - >
2015-04-06 05:16:20 +00:00
_ Epoch1 = Proj1 #projection_v1.epoch_number ,
case cl_read_latest_projection ( public , S ) of
{ needs_repair , FLUsRs , Extra , S3 } - >
if not ReadRepairP - >
2015-04-30 14:16:08 +00:00
{ not_unanimous , todoxyz , [ { unanimous_flus , [ ] } ,
{ results , FLUsRs } | Extra ] , S3 } ;
2015-04-06 05:16:20 +00:00
true - >
{ _ Status , S4 } = do_read_repair ( FLUsRs , Extra , S3 ) ,
do_cl_read_latest_public_projection ( ReadRepairP , S4 )
end ;
2015-05-02 07:59:28 +00:00
{ _ UnanimousTag , _ Proj2 , _ Extra , _ S3 } = Else - >
Else
2015-04-06 05:16:20 +00:00
end .
read_latest_projection_call_only ( ProjectionType , AllHosed ,
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
#ch_mgr { proj = CurrentProj } = S ) - >
2015-04-06 05:16:20 +00:00
#projection_v1 { all_members = All_list } = CurrentProj ,
All_queried_list = All_list -- AllHosed ,
2015-08-26 09:47:39 +00:00
{ Rs , S2 } = read_latest_projection_call_only2 ( ProjectionType ,
All_queried_list , S ) ,
FLUsRs = lists : zip ( All_queried_list , Rs ) ,
{ All_queried_list , FLUsRs , S2 } .
read_latest_projection_call_only2 ( ProjectionType , All_queried_list , S ) - >
2015-04-06 05:16:20 +00:00
{ _ UpNodes , Partitions , S2 } = calc_up_nodes ( S ) ,
2015-04-09 05:44:58 +00:00
DoIt = fun ( Pid ) - >
2015-05-01 15:33:49 +00:00
case ( ? FLU_PC : read_latest_projection ( Pid , ProjectionType , ? TO ) ) of
2015-04-06 05:16:20 +00:00
{ ok , P } - > P ;
Else - > Else
end
end ,
2015-08-26 09:47:39 +00:00
Rs = [ ( catch perhaps_call_t ( S , Partitions , FLU , fun ( Pid ) - > DoIt ( Pid ) end ) ) | |
2015-04-06 05:16:20 +00:00
FLU < - All_queried_list ] ,
2015-08-26 09:47:39 +00:00
{ Rs , S2 } .
2015-04-06 05:16:20 +00:00
2015-08-27 08:58:43 +00:00
read_projection_call_only2 ( ProjectionType , Epoch , All_queried_list , S ) - >
{ _ UpNodes , Partitions , S2 } = calc_up_nodes ( S ) ,
DoIt = fun ( Pid ) - >
case ( ? FLU_PC : read_projection ( Pid , ProjectionType , Epoch , ? TO ) ) of
{ ok , P } - > P ;
Else - > Else
end
end ,
Rs = [ ( catch perhaps_call_t ( S , Partitions , FLU , fun ( Pid ) - > DoIt ( Pid ) end ) ) | |
FLU < - All_queried_list ] ,
{ Rs , S2 } .
2015-04-06 05:16:20 +00:00
cl_read_latest_projection ( ProjectionType , S ) - >
AllHosed = [ ] ,
cl_read_latest_projection ( ProjectionType , AllHosed , S ) .
cl_read_latest_projection ( ProjectionType , AllHosed , S ) - >
{ All_queried_list , FLUsRs , S2 } =
read_latest_projection_call_only ( ProjectionType , AllHosed , S ) ,
2015-05-18 08:32:22 +00:00
rank_and_sort_projections_with_extra ( All_queried_list , FLUsRs ,
ProjectionType , S2 ) .
2015-04-06 05:16:20 +00:00
2015-05-18 08:32:22 +00:00
rank_and_sort_projections_with_extra ( All_queried_list , FLUsRs , ProjectionType ,
2015-04-30 14:16:08 +00:00
#ch_mgr { name = MyName , proj = CurrentProj } = S ) - >
2015-05-18 08:32:22 +00:00
UnwrittenRs = [ x | | { _ , { error , not_written } } < - FLUsRs ] ,
2015-04-06 05:16:20 +00:00
Ps = [ Proj | | { _ FLU , Proj } < - FLUsRs , is_record ( Proj , projection_v1 ) ] ,
BadAnswerFLUs = [ FLU | | { FLU , Answer } < - FLUsRs ,
not is_record ( Answer , projection_v1 ) ] ,
if All_queried_list == [ ]
orelse
length ( UnwrittenRs ) == length ( FLUsRs ) - >
2015-08-05 06:50:32 +00:00
Witness_list = CurrentProj #projection_v1.witnesses ,
2015-08-30 10:53:47 +00:00
NoneProj = make_none_projection ( 0 , MyName , [ ] , Witness_list ,
2015-08-05 06:50:32 +00:00
orddict : new ( ) ) ,
2015-04-30 14:16:08 +00:00
Extra2 = [ { all_members_replied , true } ,
{ all_queried_list , All_queried_list } ,
{ flus_rs , FLUsRs } ,
{ unanimous_flus , [ ] } ,
{ not_unanimous_flus , [ ] } ,
{ bad_answer_flus , BadAnswerFLUs } ,
{ not_unanimous_answers , [ ] } ,
{ trans_all_hosed , [ ] } ,
{ trans_all_flap_counts , [ ] } ] ,
{ not_unanimous , NoneProj , Extra2 , S } ;
2015-05-18 08:32:22 +00:00
ProjectionType == public , UnwrittenRs / = [ ] - >
2015-04-06 05:16:20 +00:00
{ needs_repair , FLUsRs , [ flarfus ] , S } ;
true - >
[ { _ Rank , BestProj } | _ ] = rank_and_sort_projections ( Ps , CurrentProj ) ,
NotBestPs = [ Proj | | Proj < - Ps , Proj / = BestProj ] ,
UnanimousTag = if NotBestPs == [ ] - > unanimous ;
true - > not_unanimous
end ,
Extra = [ { all_members_replied , length ( FLUsRs ) == length ( All_queried_list ) } ] ,
Best_FLUs = [ FLU | | { FLU , Projx } < - FLUsRs , Projx == BestProj ] ,
TransAllHosed = lists : usort (
lists : flatten ( [ get_all_hosed ( P ) | | P < - Ps ] ) ) ,
AllFlapCounts = merge_flap_counts ( [ get_all_flap_counts ( P ) | |
P < - Ps ] ) ,
Extra2 = [ { all_queried_list , All_queried_list } ,
{ flus_rs , FLUsRs } ,
{ unanimous_flus , Best_FLUs } ,
{ not_unanimous_flus , All_queried_list --
( Best_FLUs ++ BadAnswerFLUs ) } ,
{ bad_answer_flus , BadAnswerFLUs } ,
{ not_unanimous_answers , NotBestPs } ,
{ trans_all_hosed , TransAllHosed } ,
{ trans_all_flap_counts , AllFlapCounts } | Extra ] ,
{ UnanimousTag , BestProj , Extra2 , S }
end .
do_read_repair ( FLUsRs , _ Extra , #ch_mgr { proj = CurrentProj } = S ) - >
2015-05-18 08:32:22 +00:00
Unwrittens = [ x | | { _ FLU , { error , not_written } } < - FLUsRs ] ,
2015-04-06 05:16:20 +00:00
Ps = [ Proj | | { _ FLU , Proj } < - FLUsRs , is_record ( Proj , projection_v1 ) ] ,
if Unwrittens == [ ] orelse Ps == [ ] - >
{ nothing_to_do , S } ;
true - >
%% We have at least one unwritten and also at least one proj.
%% Pick the best one, then spam it everywhere.
[ { _ Rank , BestProj } | _ ] = rank_and_sort_projections ( Ps , CurrentProj ) ,
Epoch = BestProj #projection_v1.epoch_number ,
%% We're doing repair, so use the flavor that will
%% continue to all others even if there is an
%% error_written on the local FLU.
{ _ DontCare , _ S2 } = Res = cl_write_public_proj_skip_local_error (
Epoch , BestProj , S ) ,
Res
end .
calc_projection ( S , RelativeToServer ) - >
calc_projection ( S , RelativeToServer , [ ] ) .
2015-09-01 13:10:45 +00:00
calc_projection ( #ch_mgr { proj = P_current } = S , RelativeToServer , AllHosed ) - >
calc_projection ( S , RelativeToServer , AllHosed , P_current ) .
calc_projection ( #ch_mgr { name = MyName , consistency_mode = CMode ,
2015-08-30 11:39:58 +00:00
runenv = RunEnv } = S ,
2015-09-01 13:10:45 +00:00
RelativeToServer , AllHosed , P_current ) - >
2015-04-06 05:16:20 +00:00
Dbg = [ ] ,
2015-08-25 09:43:55 +00:00
%% OldThreshold = proplists:get_value(old_threshold, RunEnv),
%% NoPartitionThreshold = proplists:get_value(no_partition_threshold, RunEnv),
2015-08-12 08:53:39 +00:00
if CMode == ap_mode - >
2015-08-30 11:39:58 +00:00
calc_projection2 ( P_current , RelativeToServer , AllHosed , Dbg , S ) ;
2015-08-12 08:53:39 +00:00
CMode == cp_mode - >
#projection_v1 { epoch_number = OldEpochNum ,
all_members = AllMembers ,
2015-08-25 09:43:55 +00:00
upi = OldUPI_list
2015-08-30 11:39:58 +00:00
} = P_current ,
2015-08-12 08:53:39 +00:00
UPI_length_ok_p =
length ( OldUPI_list ) > = full_majority_size ( AllMembers ) ,
case { OldEpochNum , UPI_length_ok_p } of
{ 0 , _ } - >
2015-08-30 11:39:58 +00:00
calc_projection2 ( P_current , RelativeToServer , AllHosed ,
2015-08-12 08:53:39 +00:00
Dbg , S ) ;
{ _ , true } - >
2015-08-30 11:39:58 +00:00
calc_projection2 ( P_current , RelativeToServer , AllHosed ,
2015-08-12 08:53:39 +00:00
Dbg , S ) ;
{ _ , false } - >
2015-08-30 11:39:58 +00:00
{ Up , Partitions , RunEnv2 } = calc_up_nodes (
MyName , AllMembers , RunEnv ) ,
%% We can't improve on the current projection.
{ P_current , S #ch_mgr { runenv = RunEnv2 } , Up }
2015-08-12 08:53:39 +00:00
end
end .
2015-04-06 05:16:20 +00:00
2015-04-10 12:59:56 +00:00
%% AllHosed: FLUs that we must treat as if they are down, e.g., we are
%% in a flapping situation and wish to ignore FLUs that we
%% believe are bad-behaving causes of our flapping.
2015-04-06 05:16:20 +00:00
2015-08-12 08:53:39 +00:00
calc_projection2 ( LastProj , RelativeToServer , AllHosed , Dbg ,
#ch_mgr { name = MyName ,
proj = CurrentProj ,
consistency_mode = CMode ,
runenv = RunEnv1 ,
repair_final_status = RepairFS } = S ) - >
2015-04-06 05:16:20 +00:00
#projection_v1 { epoch_number = OldEpochNum ,
2015-04-09 08:13:38 +00:00
members_dict = MembersDict ,
2015-07-30 20:12:08 +00:00
witnesses = OldWitness_list ,
2015-04-09 08:13:38 +00:00
upi = OldUPI_list ,
2015-08-25 09:43:55 +00:00
repairing = OldRepairing_list
2015-04-09 08:13:38 +00:00
} = LastProj ,
2015-04-06 05:16:20 +00:00
LastUp = lists : usort ( OldUPI_list ++ OldRepairing_list ) ,
AllMembers = ( S #ch_mgr.proj ) #projection_v1.all_members ,
{ Up0 , Partitions , RunEnv2 } = calc_up_nodes ( MyName ,
AllMembers , RunEnv1 ) ,
Up = Up0 -- AllHosed ,
NewUp = Up -- LastUp ,
Down = AllMembers -- Up ,
2015-08-30 10:53:47 +00:00
? REACT ( { calc , ? LINE , [ { old_epoch , OldEpochNum } ,
{ old_upi , OldUPI_list } ,
2015-08-14 13:28:50 +00:00
{ old_repairing , OldRepairing_list } ,
{ last_up , LastUp } , { up0 , Up0 } , { all_hosed , AllHosed } ,
2015-08-14 10:30:05 +00:00
{ up , Up } , { new_up , NewUp } , { down , Down } ] } ) ,
2015-04-06 05:16:20 +00:00
2015-08-05 06:50:32 +00:00
NewUPI_list =
[ X | | X < - OldUPI_list , lists : member ( X , Up ) andalso
not lists : member ( X , OldWitness_list ) ] ,
2015-08-22 12:27:01 +00:00
%% If we are not flapping (AllHosed /= [], which is a good enough proxy),
%% then we do our repair checks based on the inner projection only. There
%% is no value in doing repairs during flapping.
RepChk_Proj = if AllHosed == [ ] - >
CurrentProj ;
true - >
inner_projection_or_self ( CurrentProj )
end ,
RepChk_LastInUPI = case RepChk_Proj #projection_v1.upi of
2015-07-19 04:32:55 +00:00
[ ] - > does_not_exist_because_upi_is_empty ;
2015-08-22 12:27:01 +00:00
[ _ | _ ] - > lists : last ( RepChk_Proj #projection_v1.upi )
2015-07-19 04:32:55 +00:00
end ,
2015-08-05 06:50:32 +00:00
Repairing_list2 = [ X | | X < - OldRepairing_list ,
lists : member ( X , Up ) ,
not lists : member ( X , OldWitness_list ) ] ,
2015-05-08 06:36:53 +00:00
Simulator_p = proplists : get_value ( use_partition_simulator , RunEnv2 , false ) ,
2015-07-20 07:25:42 +00:00
SimRepair_p = proplists : get_value ( simulate_repair , RunEnv2 , true ) ,
2015-04-06 05:16:20 +00:00
{ NewUPI_list3 , Repairing_list3 , RunEnv3 } =
2015-08-14 10:30:36 +00:00
case { NewUp -- OldWitness_list , Repairing_list2 } of
2015-04-06 05:16:20 +00:00
{ [ ] , [ ] } - >
2015-08-14 08:05:16 +00:00
D_foo = [ d_foo1 ] ,
2015-04-06 05:16:20 +00:00
{ NewUPI_list , [ ] , RunEnv2 } ;
2015-08-22 12:27:01 +00:00
{ [ ] , [ H | T ] } when RelativeToServer == RepChk_LastInUPI - >
2015-05-08 06:36:53 +00:00
%% The author is tail of the UPI list. Let's see if
2015-04-06 05:16:20 +00:00
%% *everyone* in the UPI+repairing lists are using our
%% projection. This is to simulate a requirement that repair
%% a real repair process cannot take place until the chain is
%% stable, i.e. everyone is in the same epoch.
2015-04-10 12:59:56 +00:00
%% TODO create a real API call for fetching this info?
SameEpoch_p = check_latest_private_projections_same_epoch (
2015-06-04 05:31:58 +00:00
NewUPI_list ++ Repairing_list2 ,
2015-08-22 12:27:01 +00:00
RepChk_Proj , Partitions , S ) ,
2015-07-20 07:25:42 +00:00
if Simulator_p andalso SimRepair_p andalso
2015-08-22 12:27:01 +00:00
SameEpoch_p andalso RelativeToServer == RepChk_LastInUPI - >
2015-05-07 08:52:16 +00:00
D_foo = [ { repair_airquote_done , { we_agree , ( S #ch_mgr.proj ) #projection_v1.epoch_number } } ] ,
2015-08-25 08:01:14 +00:00
if CMode == cp_mode - > timer : sleep ( 567 ) ; true - > ok end ,
2015-05-08 06:36:53 +00:00
{ NewUPI_list ++ [ H ] , T , RunEnv2 } ;
2015-07-20 07:25:42 +00:00
not ( Simulator_p andalso SimRepair_p )
2015-05-16 08:39:58 +00:00
andalso
RepairFS == { repair_final_status , ok } - >
D_foo = [ { repair_done , { repair_final_status , ok , ( S #ch_mgr.proj ) #projection_v1.epoch_number } } ] ,
{ NewUPI_list ++ Repairing_list2 , [ ] , RunEnv2 } ;
2015-05-08 06:36:53 +00:00
true - >
2015-08-14 08:05:16 +00:00
D_foo = [ d_foo2 ] ,
2015-05-08 06:36:53 +00:00
{ NewUPI_list , OldRepairing_list , RunEnv2 }
2015-04-06 05:16:20 +00:00
end ;
2015-08-14 10:30:36 +00:00
{ _ ABC , _ XYZ } - >
2015-08-14 08:05:16 +00:00
D_foo = [ d_foo3 , { new_upi_list , NewUPI_list } , { new_up , NewUp } , { repairing_list3 , OldRepairing_list } ] ,
2015-04-06 05:16:20 +00:00
{ NewUPI_list , OldRepairing_list , RunEnv2 }
end ,
2015-08-14 08:05:16 +00:00
? REACT ( { calc , ? LINE ,
[ { newupi_list3 , NewUPI_list3 } , { repairing_list3 , Repairing_list3 } ] } ) ,
2015-04-06 05:16:20 +00:00
Repairing_list4 = case NewUp of
[ ] - > Repairing_list3 ;
NewUp - > Repairing_list3 ++ NewUp
end ,
2015-08-05 06:50:32 +00:00
Repairing_list5 = ( Repairing_list4 -- Down ) -- OldWitness_list ,
2015-04-06 05:16:20 +00:00
TentativeUPI = NewUPI_list3 ,
TentativeRepairing = Repairing_list5 ,
2015-08-14 08:05:16 +00:00
? REACT ( { calc , ? LINE , [ { tent , TentativeUPI } , { tent_rep , TentativeRepairing } ] } ) ,
2015-04-06 05:16:20 +00:00
{ NewUPI , NewRepairing } =
2015-08-14 08:05:16 +00:00
if ( CMode == ap_mode
orelse
( CMode == cp_mode andalso OldEpochNum == 0 ) )
andalso
TentativeUPI == [ ] andalso TentativeRepairing / = [ ] - >
2015-08-06 06:21:44 +00:00
%% UPI is empty (not including witnesses), so grab
2015-08-05 06:50:32 +00:00
%% the first from the repairing list and make it the
%% only non-witness in the UPI.
2015-04-06 05:16:20 +00:00
[ FirstRepairing | TailRepairing ] = TentativeRepairing ,
2015-08-06 06:21:44 +00:00
{ [ FirstRepairing ] , TailRepairing } ;
2015-04-06 05:16:20 +00:00
true - >
{ TentativeUPI , TentativeRepairing }
end ,
2015-08-14 08:05:16 +00:00
? REACT ( { calc , ? LINE , [ { new_upi , NewUPI } , { new_rep , NewRepairing } ] } ) ,
2015-04-06 05:16:20 +00:00
2015-08-05 06:50:32 +00:00
P = machi_projection : new ( OldEpochNum + 1 ,
MyName , MembersDict , Down , NewUPI , NewRepairing ,
D_foo ++
Dbg ++ [ { ps , Partitions } , { nodes_up , Up } ] ) ,
2015-08-06 06:22:04 +00:00
P2 = if CMode == cp_mode - >
UpWitnesses = [ W | | W < - Up , lists : member ( W , OldWitness_list ) ] ,
2015-08-12 08:53:39 +00:00
Majority = full_majority_size ( AllMembers ) ,
2015-08-06 08:48:22 +00:00
SoFar = length ( NewUPI ) ,
if SoFar > = Majority - >
2015-08-14 08:05:16 +00:00
? REACT ( { calc , ? LINE , [ ] } ) ,
2015-08-06 08:48:22 +00:00
P ;
true - >
Need = Majority - SoFar ,
UpWitnesses = [ W | | W < - Up ,
lists : member ( W , OldWitness_list ) ] ,
if length ( UpWitnesses ) > = Need - >
Ws = lists : sublist ( UpWitnesses , Need ) ,
2015-08-14 08:05:16 +00:00
? REACT ( { calc , ? LINE , [ { ws , Ws } ] } ) ,
2015-08-06 08:48:22 +00:00
machi_projection : update_checksum (
P #projection_v1 { upi = Ws ++ NewUPI } ) ;
true - >
2015-08-14 08:05:16 +00:00
? REACT ( { calc , ? LINE , [ ] } ) ,
2015-08-06 08:48:22 +00:00
P_none0 = make_none_projection (
2015-08-30 10:53:47 +00:00
OldEpochNum + 1 ,
2015-08-06 08:48:22 +00:00
MyName , AllMembers , OldWitness_list ,
MembersDict ) ,
2015-08-27 08:58:43 +00:00
Why = if NewUPI == [ ] - >
2015-08-28 12:13:54 +00:00
" No real servers in old upi are available now " ;
2015-08-27 08:58:43 +00:00
true - >
2015-08-28 12:13:54 +00:00
" Not enough witnesses are available now "
2015-08-27 08:58:43 +00:00
end ,
2015-08-06 08:48:22 +00:00
P_none1 = P_none0 #projection_v1 {
2015-08-29 06:06:57 +00:00
%% Stable creation time!
creation_time = { 1 , 2 , 3 } ,
2015-08-06 08:48:22 +00:00
dbg = [ { none_projection , true } ,
2015-08-14 08:05:16 +00:00
{ up0 , Up0 } ,
2015-08-13 12:24:56 +00:00
{ up , Up } ,
{ all_hosed , AllHosed } ,
2015-08-14 13:28:50 +00:00
{ oldupi , OldUPI_list } ,
{ newupi , NewUPI_list } ,
{ newupi3 , NewUPI_list3 } ,
{ tent_upi , TentativeUPI } ,
{ new_upi , NewUPI } ,
{ up_witnesses , UpWitnesses } ,
2015-08-29 10:59:46 +00:00
{ why_none , Why } ] ,
dbg2 = [
{ creation_time , os : timestamp ( ) } ] } ,
2015-08-06 08:48:22 +00:00
machi_projection : update_checksum ( P_none1 )
end
end ;
2015-08-06 06:22:04 +00:00
CMode == ap_mode - >
2015-08-14 08:05:16 +00:00
? REACT ( { calc , ? LINE , [ ] } ) ,
2015-08-06 06:22:04 +00:00
P
end ,
P3 = machi_projection : update_checksum (
2015-08-13 15:12:13 +00:00
P2 #projection_v1 { mode = CMode , witnesses = OldWitness_list } ) ,
2015-08-14 13:28:50 +00:00
? REACT ( { calc , ? LINE , [ machi_projection : make_summary ( P3 ) ] } ) ,
2015-08-11 06:24:26 +00:00
{ P3 , S #ch_mgr { runenv = RunEnv3 } , Up } .
2015-04-06 05:16:20 +00:00
2015-04-10 12:59:56 +00:00
check_latest_private_projections_same_epoch ( FLUs , MyProj , Partitions , S ) - >
2015-08-22 12:27:01 +00:00
#projection_v1 { epoch_number = MyEpoch , epoch_csum = MyCSum } = MyProj ,
2015-06-04 05:31:58 +00:00
%% NOTE: The caller must provide us with the FLUs list for all
%% FLUs that must be up & available right now. So any
%% failure of perhaps_call_t() means that we must return
%% false.
2015-04-06 05:16:20 +00:00
FoldFun = fun ( _ FLU , false ) - >
false ;
( FLU , true ) - >
2015-04-09 05:44:58 +00:00
F = fun ( Pid ) - >
? FLU_PC : read_latest_projection ( Pid , private , ? TO )
2015-04-06 05:16:20 +00:00
end ,
case perhaps_call_t ( S , Partitions , FLU , F ) of
2015-08-22 12:27:01 +00:00
{ ok , RPJ } - >
#projection_v1 { epoch_number = RemoteEpoch ,
epoch_csum = RemoteCSum } =
inner_projection_or_self ( RPJ ) ,
if MyEpoch == RemoteEpoch ,
MyCSum == RemoteCSum - >
2015-04-06 05:16:20 +00:00
true ;
true - >
false
end ;
2015-08-22 12:27:01 +00:00
_ Else - >
2015-04-06 05:16:20 +00:00
false
end
end ,
lists : foldl ( FoldFun , true , FLUs ) .
calc_up_nodes ( #ch_mgr { name = MyName , proj = Proj , runenv = RunEnv1 } = S ) - >
AllMembers = Proj #projection_v1.all_members ,
{ UpNodes , Partitions , RunEnv2 } =
calc_up_nodes ( MyName , AllMembers , RunEnv1 ) ,
{ UpNodes , Partitions , S #ch_mgr { runenv = RunEnv2 } } .
calc_up_nodes ( MyName , AllMembers , RunEnv1 ) - >
2015-05-01 15:33:49 +00:00
case proplists : get_value ( use_partition_simulator , RunEnv1 ) of
true - >
calc_up_nodes_sim ( MyName , AllMembers , RunEnv1 ) ;
false - >
{ AllMembers -- get ( remember_partition_hack ) , [ ] , RunEnv1 }
end .
calc_up_nodes_sim ( MyName , AllMembers , RunEnv1 ) - >
{ Partitions2 , Islands2 } = machi_partition_simulator : get ( AllMembers ) ,
catch ? REACT ( { calc_up_nodes , ? LINE , [ { partitions , Partitions2 } ,
{ islands , Islands2 } ] } ) ,
2015-04-06 05:16:20 +00:00
UpNodes = lists : sort (
[ Node | | Node < - AllMembers ,
not lists : member ( { MyName , Node } , Partitions2 ) ,
not lists : member ( { Node , MyName } , Partitions2 ) ] ) ,
RunEnv2 = replace ( RunEnv1 ,
[ { network_partitions , Partitions2 } ,
{ network_islands , Islands2 } ,
{ up_nodes , UpNodes } ] ) ,
2015-07-15 02:25:06 +00:00
catch ? REACT ( { calc_up_nodes , ? LINE , [ { partitions , Partitions2 } ,
{ islands , Islands2 } ,
{ up_nodes , UpNodes } ] } ) ,
2015-04-06 05:16:20 +00:00
{ UpNodes , Partitions2 , RunEnv2 } .
replace ( PropList , Items ) - >
2015-05-07 08:52:16 +00:00
Tmp = Items ++ PropList ,
[ { K , proplists : get_value ( K , Tmp ) } | | K < - proplists : get_keys ( Tmp ) ] .
2015-04-06 05:16:20 +00:00
2015-04-09 08:13:38 +00:00
rank_and_sort_projections ( [ ] , CurrentProj ) - >
rank_projections ( [ CurrentProj ] , CurrentProj ) ;
2015-04-06 05:16:20 +00:00
rank_and_sort_projections ( Ps , CurrentProj ) - >
Epoch = lists : max ( [ Proj #projection_v1.epoch_number | | Proj < - Ps ] ) ,
MaxPs = [ Proj | | Proj < - Ps ,
Proj #projection_v1.epoch_number == Epoch ] ,
%% Sort with highest rank first (custom sort)
lists : sort ( fun ( { RankA , _ } , { RankB , _ } ) - > RankA > RankB end ,
rank_projections ( MaxPs , CurrentProj ) ) .
%% Caller must ensure all Projs are of the same epoch number.
%% If the caller gives us projections with different epochs, we assume
%% that the caller is doing an OK thing.
2015-04-14 06:30:24 +00:00
%%
%% TODO: This implementation currently gives higher rank to the last
%% member of All_list, which is typically/always/TODO-CLARIFY
%% sorted. That's fine, but there's a source of unnecessary
%% churn: during repair, we assume that the head of the chain is
%% the coordinator of the repair. So any time that the head
%% makes a repair-related transition, that projection may get
%% quickly replaced by an identical projection that merely has
%% higher rank because it's authored by a higher-ranked member.
%% Worst case, for chain len=4:
%% E+0: author=a, upi=[a], repairing=[b,c,d]
%% E+1: author=b, upi=[a], repairing=[b,c,d] (**)
%% E+2: author=c, upi=[a], repairing=[b,c,d] (**)
%% E+3: author=d, upi=[a], repairing=[b,c,d] (**)
%% E+4: author=a, upi=[a,b], repairing=[c,d]
%% E+5: author=b, upi=[a,b], repairing=[c,d] (**)
%% E+6: author=c, upi=[a,b], repairing=[c,d] (**)
%% E+7: author=d, upi=[a,b], repairing=[c,d] (**)
2015-05-20 02:11:54 +00:00
%% E+... 6 more (**) epochs when c & d finish their repairs.
2015-04-14 06:30:24 +00:00
%% Ideally, the "(**)" epochs are avoidable churn.
%% Perhaps this means that we should change the responsibility
%% for repair management to the highest ranking member of the
%% UPI_list?
2015-04-14 07:17:49 +00:00
%% TODO Hrrrmmmmm ... what about the TODO comment in A40's A40a clause?
%% That could perhaps resolve this same problem in a better way?
2015-04-06 05:16:20 +00:00
rank_projections ( Projs , CurrentProj ) - >
#projection_v1 { all_members = All_list } = CurrentProj ,
MemberRank = orddict : from_list (
lists : zip ( All_list , lists : seq ( 1 , length ( All_list ) ) ) ) ,
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
N = ? MAX_CHAIN_LENGTH + 1 ,
2015-04-06 05:16:20 +00:00
[ { rank_projection ( Proj , MemberRank , N ) , Proj } | | Proj < - Projs ] .
rank_projection ( #projection_v1 { upi = [ ] } , _ MemberRank , _ N ) - >
2015-08-13 05:21:31 +00:00
? RANK_CP_MINORITY_QUORUM ;
2015-07-06 14:56:29 +00:00
rank_projection ( #projection_v1 { author_server = _ Author ,
2015-08-05 06:50:32 +00:00
witnesses = Witness_list ,
2015-04-09 08:13:38 +00:00
upi = UPI_list ,
2015-07-06 14:56:29 +00:00
repairing = Repairing_list } , _ MemberRank , N ) - >
2015-07-06 07:12:15 +00:00
AuthorRank = 0 ,
2015-08-06 08:48:22 +00:00
UPI_witn = [ X | | X < - UPI_list , lists : member ( X , Witness_list ) ] ,
UPI_full = [ X | | X < - UPI_list , not lists : member ( X , Witness_list ) ] ,
2015-08-05 06:50:32 +00:00
case UPI_list -- Witness_list of
[ ] - >
2015-08-13 05:21:31 +00:00
? RANK_CP_MINORITY_QUORUM ;
2015-08-05 06:50:32 +00:00
_ - >
AuthorRank +
2015-08-06 08:48:22 +00:00
( N * length ( Repairing_list ) ) +
( N * N * length ( UPI_witn ) ) +
( N * N * N * length ( UPI_full ) )
2015-08-05 06:50:32 +00:00
end .
2015-04-06 05:16:20 +00:00
2015-05-07 08:52:16 +00:00
do_set_chain_members_dict ( MembersDict , #ch_mgr { proxies_dict = OldProxiesDict } = S ) - >
2015-05-17 14:48:05 +00:00
_ = ? FLU_PC : stop_proxies ( OldProxiesDict ) ,
ProxiesDict = ? FLU_PC : start_proxies ( MembersDict ) ,
2015-05-06 02:41:04 +00:00
{ ok , S #ch_mgr { members_dict = MembersDict ,
2015-05-17 14:48:05 +00:00
proxies_dict = ProxiesDict } } .
2015-05-01 15:33:49 +00:00
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
do_react_to_env ( #ch_mgr { name = MyName ,
proj = #projection_v1 { epoch_number = Epoch ,
members_dict = [ ] = OldDict } = OldProj ,
opts = Opts } = S ) - >
%% Read from our local *public* projection store. If some other
%% chain member has written something there, and if we are a
%% member of that chain, then we'll adopt that projection and then
%% start actively humming in that chain.
{ NewMembersDict , NewProj } =
get_my_public_proj_boot_info ( Opts , OldDict , OldProj ) ,
case orddict : is_key ( MyName , NewMembersDict ) of
false - >
{ { empty_members_dict , [ ] , Epoch } , S } ;
true - >
{ _ , S2 } = do_set_chain_members_dict ( NewMembersDict , S ) ,
2015-08-26 09:47:39 +00:00
CMode = calc_consistency_mode ( NewProj #projection_v1.witnesses ) ,
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
{ { empty_members_dict , [ ] , Epoch } ,
2015-08-26 09:47:39 +00:00
set_proj ( S2 #ch_mgr { members_dict = NewMembersDict ,
consistency_mode = CMode } , NewProj ) }
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
end ;
2015-04-06 05:16:20 +00:00
do_react_to_env ( S ) - >
2015-07-18 15:43:10 +00:00
%% The not_sanes manager counting dictionary is not strictly
%% limited to flapping scenarios. (Though the mechanism first
%% started as a way to deal with rare flapping scenarios.)
%%
%% I believe that the problem cannot happen in real life, but it can
%% happen in simulated environments, especially if the simulation for
%% repair can be approximately infinitely fast.
%%
%% For example:
%% P_current: epoch=1135, UPI=[b,e,a], Repairing=[c,d], author=e
%%
%% Now a partition happens, a & b are on an island, c & d & e on
%% the other island.
%%
%% P_newprop: epoch=1136, UPI=[e,c], Repairing=[d], author=e
%%
%% Why does e think that this is feasible? Well, the old UPI was
%% [b,e,a], and we know that a & b are partitioned away from e.
%% Therefore e chooses the best UPI, [e]. However, the simulator
%% now also says, hey, there are nodes in the repairing list, so
%% let's simulate a repair ... and the repair goes infinitely
%% quickly ...and the epoch is stable during the repair period
%% (i.e., both e/repairer and c/repairee remained in the same
%% epoch 1135) ... so e decides that the simulated repair is
%% "finished" and it's time to add the repairee to the tail of the
%% UPI ... so that's why 1136's UPI=[e,c].
%%
%% I'll try to add a condition to the simulated repair to try to
%% make slightly fewer assumptions in a row. However, I believe
%% it's a good idea to keep this too-many-not_sane-transition-
%% attempts counter very generic (i.e., not specific for flapping
%% as it once was).
%%
2015-07-20 05:04:25 +00:00
%% The not_sanes counter dict should be reset when we have had at
%% least 3 state transitions that did not have a not_sane
%% suggested projection transition or whenever we fall back to the
%% none_projection.
%%
%% We'll probably implement a very simple counter that may/will be
%% *inaccurate* by at most one -- so any reset test should ignore
%% counter values of 0 & 1.
%%
2015-04-06 05:16:20 +00:00
put ( react , [ ] ) ,
2015-08-12 08:53:39 +00:00
try
2015-08-26 09:47:39 +00:00
S2 = if S #ch_mgr.sane_transitions > 3 - > % TODO review this constant
S #ch_mgr { not_sanes = orddict : new ( ) } ;
true - >
S
end ,
%% When in CP mode, we call the poll function twice: once before
%% reacting & once after. This call is the 2nd.
{ Res , S3 } = react_to_env_A10 ( S2 ) ,
{ Res , poll_private_proj_is_upi_unanimous ( S3 ) }
2015-08-12 08:53:39 +00:00
catch
2015-08-13 05:21:31 +00:00
throw : { zerf , _ } = _ Throw - >
2015-08-12 08:53:39 +00:00
Proj = S #ch_mgr.proj ,
2015-08-18 11:49:36 +00:00
io : format ( user , " zerf ~p caught ~p \n " , [ S #ch_mgr.name , _ Throw ] ) ,
2015-08-12 08:53:39 +00:00
{ { no_change , [ ] , Proj #projection_v1.epoch_number } , S }
2015-07-20 05:04:25 +00:00
end .
2015-04-06 05:16:20 +00:00
react_to_env_A10 ( S ) - >
? REACT ( a10 ) ,
2015-08-26 09:47:39 +00:00
react_to_env_A20 ( 0 , poll_private_proj_is_upi_unanimous ( S ) ) .
2015-04-06 05:16:20 +00:00
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
react_to_env_A20 ( Retries , #ch_mgr { name = MyName } = S ) - >
2015-04-06 05:16:20 +00:00
? REACT ( a20 ) ,
2015-05-01 15:33:49 +00:00
init_remember_partition_hack ( ) ,
2015-04-06 05:16:20 +00:00
{ UnanimousTag , P_latest , ReadExtra , S2 } =
do_cl_read_latest_public_projection ( true , S ) ,
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
LastComplaint = get ( rogue_server_epoch ) ,
case orddict : is_key ( P_latest #projection_v1.author_server ,
S #ch_mgr.members_dict ) of
false when P_latest #projection_v1.epoch_number / = LastComplaint - >
put ( rogue_server_epoch , P_latest #projection_v1.epoch_number ) ,
Rogue = P_latest #projection_v1.author_server ,
2015-05-17 10:00:51 +00:00
error_logger : info_msg ( " Chain manager ~w found latest public "
" projection ~w has author ~w not a member "
" of our members list ~w . Please check "
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
" chain membership on this "
2015-05-17 10:00:51 +00:00
" rogue chain manager ~w . \n " ,
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
[ S #ch_mgr.name ,
P_latest #projection_v1.epoch_number ,
Rogue ,
[ K | | { K , _ } < - orddict : to_list ( S #ch_mgr.members_dict ) ] ,
Rogue ] ) ;
_ - >
ok
end ,
case lists : member ( MyName , P_latest #projection_v1.all_members ) of
2015-05-11 10:00:21 +00:00
false when P_latest #projection_v1.epoch_number / = LastComplaint ,
P_latest #projection_v1.all_members / = [ ] - >
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
put ( rogue_server_epoch , P_latest #projection_v1.epoch_number ) ,
error_logger : info_msg ( " Chain manager ~p found latest public "
" projection ~p has author ~p has a "
" members list ~p that does not include me. \n " ,
[ S #ch_mgr.name ,
P_latest #projection_v1.epoch_number ,
P_latest #projection_v1.author_server ,
P_latest #projection_v1.all_members ] ) ;
_ - >
ok
end ,
2015-04-06 05:16:20 +00:00
%% The UnanimousTag isn't quite sufficient for our needs. We need
%% to determine if *all* of the UPI+Repairing FLUs are members of
2015-05-01 15:33:49 +00:00
%% the unanimous server replies. All Repairing FLUs should be up
%% now (because if they aren't then they cannot be repairing), so
%% all Repairing FLUs have no non-race excuse not to be in UnanimousFLUs.
2015-04-06 05:16:20 +00:00
UnanimousFLUs = lists : sort ( proplists : get_value ( unanimous_flus , ReadExtra ) ) ,
UPI_Repairing_FLUs = lists : sort ( P_latest #projection_v1.upi ++
P_latest #projection_v1.repairing ) ,
2015-07-10 06:27:11 +00:00
All_UPI_Repairing_were_unanimous =
ordsets : is_subset ( ordsets : from_list ( UPI_Repairing_FLUs ) ,
ordsets : from_list ( UnanimousFLUs ) ) ,
2015-07-10 06:04:50 +00:00
NotUnanimousFLUs = lists : sort ( proplists : get_value ( not_unanimous_flus ,
ReadExtra , [ xxx ] ) ) ,
NotUnanimousPs = lists : sort ( proplists : get_value ( not_unanimous_answers ,
ReadExtra , [ xxx ] ) ) ,
NotUnanimousSumms = [ machi_projection : make_summary (
P #projection_v1 { dbg2 = [ omitted ] } ) | |
P < - NotUnanimousPs ,
is_record ( P , projection_v1 ) ] ,
BadAnswerFLUs = lists : sort ( proplists : get_value ( bad_answer_flus , ReadExtra ) ) ,
2015-07-10 06:27:11 +00:00
? REACT ( { a20 , ? LINE , [ { upi_repairing , UPI_Repairing_FLUs } ,
{ unanimous_flus , UnanimousFLUs } ,
2015-07-10 06:04:50 +00:00
{ all_upi_repairing_were_unanimous , All_UPI_Repairing_were_unanimous } ,
{ not_unanimous_flus , NotUnanimousFLUs } ,
{ not_unanimous_answers , NotUnanimousSumms } ,
{ bad_answer_flus , BadAnswerFLUs }
] } ) ,
2015-04-06 05:16:20 +00:00
LatestUnanimousP =
if UnanimousTag == unanimous
andalso
All_UPI_Repairing_were_unanimous - >
? REACT ( { a20 , ? LINE } ) ,
true ;
UnanimousTag == unanimous - >
2015-07-10 06:04:50 +00:00
? REACT ( { a20 , ? LINE } ) ,
2015-04-06 05:16:20 +00:00
false ;
UnanimousTag == not_unanimous - >
? REACT ( { a20 , ? LINE } ) ,
false ;
true - >
exit ( { badbad , UnanimousTag } )
end ,
2015-08-20 14:04:27 +00:00
react_to_env_A29 ( Retries , P_latest , LatestUnanimousP , ReadExtra , S2 ) .
2015-08-24 12:54:30 +00:00
react_to_env_A29 ( Retries , P_latest , LatestUnanimousP , ReadExtra ,
#ch_mgr { name = MyName , consistency_mode = CMode ,
proj = P_current } = S ) - >
2015-08-31 06:21:17 +00:00
{ Epoch_current , _ } = EpochID_current =
machi_projection : get_epoch_id ( P_current ) ,
#projection_v1 { author_server = Author_latest } = P_latest ,
{ Epoch_latest , _ } = EpochID_latest = machi_projection : get_epoch_id ( P_latest ) ,
Trigger = if CMode == cp_mode , EpochID_latest / = EpochID_current - >
2015-08-30 11:39:58 +00:00
true ;
true - >
false
end ,
if Trigger - >
2015-08-31 08:57:37 +00:00
? REACT ( { a29 , ? LINE ,
2015-08-31 13:14:28 +00:00
[ { epoch_id_latest , EpochID_latest } ,
{ epoch_id_current , EpochID_current } ,
{ old_current , machi_projection : make_summary ( P_current ) } ] } ) ,
2015-08-31 08:57:37 +00:00
if Epoch_latest > = Epoch_current orelse Epoch_latest == 0 orelse
P_current #projection_v1.upi == [ ] - >
ok ; % sanity check
true - >
exit ( { ? MODULE , ? LINE ,
{ epoch_latest , Epoch_latest } ,
{ epoch_current , Epoch_current } ,
{ latest , machi_projection : make_summary ( P_latest ) } ,
{ current , machi_projection : make_summary ( P_current ) } } )
end ,
2015-08-30 11:39:58 +00:00
put ( yyy_hack , [ ] ) ,
case make_zerf ( P_current , S ) of
Zerf when is_record ( Zerf , projection_v1 ) - >
? REACT ( { a29 , ? LINE ,
[ { zerf_backstop , true } ,
{ zerf_in , machi_projection : make_summary ( Zerf ) } ] } ) ,
%% io:format(user, "zerf_in: A29: ~p: ~w\n\t~p\n", [MyName, machi_projection:make_summary(Zerf), get(yyy_hack)]),
#projection_v1 { dbg = ZerfDbg } = Zerf ,
2015-09-01 13:10:45 +00:00
P_current_calc = Zerf #projection_v1 {
flap = P_current #projection_v1.flap ,
dbg = [ { zerf_backstop , true } | ZerfDbg ] } ,
2015-08-30 11:39:58 +00:00
react_to_env_A30 ( Retries , P_latest , LatestUnanimousP ,
2015-09-01 13:10:45 +00:00
P_current_calc , S ) ;
2015-08-30 11:39:58 +00:00
Zerf - >
{ { { yo_todo_incomplete_fix_me_cp_mode , line , ? LINE , Zerf } } }
end ;
true - >
2015-08-31 08:57:37 +00:00
? REACT ( { a29 , ? LINE , [ ] } ) ,
2015-09-01 13:10:45 +00:00
react_to_env_A30 ( Retries , P_latest , LatestUnanimousP , P_current , S )
2015-08-30 11:39:58 +00:00
end .
2015-04-06 05:16:20 +00:00
2015-09-01 13:10:45 +00:00
react_to_env_A30 ( Retries , P_latest , LatestUnanimousP , P_current_calc ,
2015-04-06 05:16:20 +00:00
#ch_mgr { name = MyName , proj = P_current ,
2015-08-11 06:24:26 +00:00
consistency_mode = CMode , flap_limit = FlapLimit } = S ) - >
2015-04-06 05:16:20 +00:00
? REACT ( a30 ) ,
2015-08-20 14:04:27 +00:00
%% case length(get(react)) of XX when XX > 500 -> io:format(user, "A30 ~w! ~w: ~P\n", [MyName, XX, get(react), 300]), timer:sleep(500); _ -> ok end,
2015-09-01 13:10:45 +00:00
AllHosed = [ ] ,
{ P_newprop1 , S2 , Up } = calc_projection ( S , MyName , AllHosed , P_current_calc ) ,
2015-04-14 06:30:24 +00:00
? REACT ( { a30 , ? LINE , [ { current , machi_projection : make_summary ( S #ch_mgr.proj ) } ] } ) ,
2015-09-01 13:10:45 +00:00
? REACT ( { a30 , ? LINE , [ { calc_current , machi_projection : make_summary ( P_current_calc ) } ] } ) ,
2015-04-09 08:13:38 +00:00
? REACT ( { a30 , ? LINE , [ { newprop1 , machi_projection : make_summary ( P_newprop1 ) } ] } ) ,
2015-07-05 05:52:50 +00:00
? REACT ( { a30 , ? LINE , [ { latest , machi_projection : make_summary ( P_latest ) } ] } ) ,
2015-04-06 05:16:20 +00:00
%% Are we flapping yet?
2015-09-01 13:10:45 +00:00
{ P_newprop2 , S3 } = calculate_flaps ( P_newprop1 , P_latest , P_current_calc ,
Up , FlapLimit , S2 ) ,
2015-04-06 05:16:20 +00:00
%% Move the epoch number up ... originally done in C300.
#projection_v1 { epoch_number = Epoch_newprop2 } = P_newprop2 ,
2015-04-13 15:54:38 +00:00
#projection_v1 { epoch_number = Epoch_latest ,
author_server = Author_latest } = P_latest ,
2015-04-06 05:16:20 +00:00
NewEpoch = erlang : max ( Epoch_newprop2 , Epoch_latest ) + 1 ,
P_newprop3 = P_newprop2 #projection_v1 { epoch_number = NewEpoch } ,
2015-04-09 08:13:38 +00:00
? REACT ( { a30 , ? LINE , [ { newprop3 , machi_projection : make_summary ( P_newprop3 ) } ] } ) ,
2015-04-06 05:16:20 +00:00
{ P_newprop10 , S10 } =
case get_flap_count ( P_newprop3 ) of
2015-08-18 11:49:36 +00:00
{ _ , P_newprop3_flap_count } when P_newprop3_flap_count > = FlapLimit - >
%% I'm flapping. Perhaps make an inner projection?
? REACT ( { a30 , ? LINE , [ { newprop3_flap_count , P_newprop3_flap_count } ,
{ flap_limit , FlapLimit } ] } ) ,
AFPs = get_all_flap_counts ( P_newprop3 ) ,
case lists : sort ( [ FlapCount | |
{ _ FLU , { _ EpkTime , FlapCount } } < - AFPs ] ) of
[ SmallestFC | _ ] when SmallestFC > ? MINIMUM_ALL_FLAP_LIMIT - >
a30_make_inner_projection (
2015-09-01 13:10:45 +00:00
P_current_calc , P_newprop3 , P_latest , Up , S3 ) ;
2015-08-18 11:49:36 +00:00
_ - >
%% Not everyone is flapping enough. Or perhaps
%% everyone was but we finally saw some new server X
%% where X's flap count isn't big enough.
{ P_newprop3 , S3 }
end ;
Bugfix for rare race between just-finished repair and flapping ending
Fix for today: We are going to game the system. We know that
C100 is going to be checking authorship relative to P_current's
UPI's tail. Therefore, we're just going to set it here.
Why??? Because we have been using this projection safely for
the entire flapping period! ... The only other way I see is to
allow C100 to carve out an exception if the repair finished
PLUS author_server check fails PLUS if we came from here, but
that feels a bit fragile to me: if some code factoring happens
in projection_transition_is_saneprojection_transition_is_sane()
or elsewhere that causes the author_server check to be
something-other-than-the-final-thing-checked, then such a
refactoring would likely cause an even harder bug to find &
fix. Conditions tested: 5 FLUs plus alternating partitions of:
[
[{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
[{b,a},{d,e}],
[{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], []
].
2015-07-06 16:29:37 +00:00
{ _ , P_newprop3_flap_count } - >
? REACT ( { a30 , ? LINE , [ { newprop3_flap_count , P_newprop3_flap_count } ,
{ flap_limit , FlapLimit } ] } ) ,
2015-04-06 05:16:20 +00:00
{ P_newprop3 , S3 }
end ,
2015-08-29 03:32:30 +00:00
P_newprop11 = machi_projection : update_checksum ( P_newprop10 ) ,
? REACT ( { a30 , ? LINE , [ { newprop11 , machi_projection : make_summary ( P_newprop11 ) } ] } ) ,
2015-04-14 07:17:49 +00:00
%% Here's a more common reason for moving from inner projection to
%% a normal projection: the old proj has an inner but the newprop
%% does not.
MoveFromInnerToNorm_p =
2015-09-01 13:10:45 +00:00
case { inner_projection_exists ( P_current_calc ) ,
2015-08-29 03:32:30 +00:00
inner_projection_exists ( P_newprop11 ) } of
2015-04-14 07:17:49 +00:00
{ true , false } - > true ;
{ _ , _ } - > false
end ,
%% If P_current says that we believe that we're currently flapping,
2015-08-29 03:32:30 +00:00
%% and if P_newprop11 says that we're no longer flapping, then we
2015-04-14 07:17:49 +00:00
%% really ought to stop flapping, right.
2015-04-13 15:54:38 +00:00
%%
2015-04-14 07:17:49 +00:00
%% Not quite so simple....
2015-04-13 15:54:38 +00:00
%%
%% AAAAH, right. The case I'm dealing with right now is an asymmetric
%% partition in a 4 member chain that affects all_hosed=[a,b,c] but
%% member D is *NOT* noticing anything different in the current scheme:
%% {inner_projection_exists(current), inner_projection_exists(new)}
%% is {true, true}.
%% Yes, that hypothesis is confirmed by time-honored io:format() tracing.
%%
%% So, we need something to kick a silly member like 'd' out of its
2015-04-14 07:17:49 +00:00
%% rut of am-still-flapping. So, let's try this:
%% If we see a P_latest from author != MyName, and if P_latest's
%% author's flap count is now 0 (latest!), but that same member's
2015-04-13 15:54:38 +00:00
%% flap count in P_current is non-zero, then we assume that author
2015-04-14 07:17:49 +00:00
%% has moved out of flapping state and that therefore we ought to do
%% the same.
2015-04-13 15:54:38 +00:00
%% Remember! P_current is this manager's private in-use projection.
2015-04-14 07:17:49 +00:00
%% It is always less than or equal to P_latest's epoch!
2015-09-01 13:10:45 +00:00
Current_flap_counts = get_all_flap_counts ( P_current_calc ) ,
2015-04-13 15:54:38 +00:00
Latest_authors_flap_count_current = proplists : get_value (
Author_latest , Current_flap_counts ) ,
Latest_flap_counts = get_all_flap_counts ( P_latest ) ,
Latest_authors_flap_count_latest = proplists : get_value (
Author_latest , Latest_flap_counts ) ,
Kicker_p = case { Latest_authors_flap_count_current ,
Latest_authors_flap_count_latest } of
{ NotUndef , undefined } when NotUndef / = undefined - >
2015-07-10 11:25:44 +00:00
%% OK, someone else has switched from non-zero flap
%% count to zero flap count. But ... do not kick out
%% of our flapping mode locally if we do not have an
%% inner projection.
2015-09-01 13:10:45 +00:00
inner_projection_exists ( P_current_calc ) ;
2015-04-13 15:54:38 +00:00
{ _ , _ } - >
false
end ,
2015-07-14 08:16:23 +00:00
ClauseInfo = [ { inner_kicker , Kicker_p } ,
{ inner_kicker2 , { Latest_authors_flap_count_current ,
Latest_authors_flap_count_latest } } ,
{ move_from_inner , MoveFromInnerToNorm_p } ] ,
? REACT ( { a30 , ? LINE , ClauseInfo } ) ,
2015-08-24 10:04:26 +00:00
MoveToNorm_p = MoveFromInnerToNorm_p orelse Kicker_p ,
2015-09-01 13:10:45 +00:00
CurrentHasZerf_p = has_make_zerf_annotation ( P_current_calc ) ,
2015-08-28 12:13:54 +00:00
if MoveToNorm_p ,
2015-08-29 03:32:30 +00:00
P_newprop11 #projection_v1.upi == [ ] ,
2015-08-28 12:13:54 +00:00
CMode == cp_mode - >
%% Too much weird stuff may have hapened while we were suffering
%% the flapping/asymmetric partition ... but we are now proposing
%% the none projection. We're going to use it so that we can
%% unwedge ourselve into the glorious none projection.
? REACT ( { a30 , ? LINE , [ ] } ) ,
2015-08-28 14:08:38 +00:00
%% TODO: It seems a bit crazy, but this duplicates part/much
%% of what state C103 does? Go to C103 instead?
2015-08-29 09:01:13 +00:00
P_newprop12 = machi_projection : update_checksum (
P_newprop11 #projection_v1 { epoch_number = NewEpoch } ) ,
2015-08-31 15:51:46 +00:00
%% Move to C300 to avoid repeating the same none proj (and
%% multiple writes to the same private epoch that
%% concidentally are permitted because the projection is
%% exactly the same)
%%
%% The other problem in this execution is that there are a
%% couple of other parties that are not flapping because
%% they see this A30->C100 problem & repeat is
%% short-circuiting all of the flapping logic. If I
%% change A30->C100 to be A30->C300 instead, then I hope
%% that other effect will resolve itself correctly.
if P_latest #projection_v1.author_server == MyName ,
P_latest #projection_v1.upi == [ ] - >
? REACT ( { a30 , ? LINE , [ ] } ) ,
io : format ( user , " CONFIRM debug A30->C100 by ~w \n " , [ MyName ] ) ,
react_to_env_C100 ( P_newprop12 , P_latest , S ) ;
true - >
? REACT ( { a30 , ? LINE , [ ] } ) ,
io : format ( user , " CONFIRM debug A30->C300 by ~w \n " , [ MyName ] ) ,
react_to_env_C300 ( P_newprop12 , P_latest , S )
end ;
2015-08-29 14:42:47 +00:00
MoveToNorm_p ,
CMode == cp_mode ,
not CurrentHasZerf_p - >
2015-08-24 10:04:26 +00:00
%% Too much weird stuff may have hapened while we were suffering
2015-08-29 09:01:13 +00:00
%% the flapping/asymmetric partition.
%%
%% The make_zerf() function will annotate the dbg2 list with
%% {make_zerf,Epoch} where Epoch should equal the epoch_number.
%% If annotated, then we have already passed through this if
%% clause in a prior iteration, and therefore we should go to A40
%% now. If not annotated, go to A49 so that we *will* trigger a
%% make_zerf() on our next iteration.
2015-08-29 14:42:47 +00:00
? REACT ( { a30 , ? LINE , [ ] } ) ,
react_to_env_A49 ( P_latest , [ ] , S10 ) ;
MoveToNorm_p - >
2015-07-10 11:08:57 +00:00
%% Move from inner projection to outer.
2015-09-01 13:10:45 +00:00
P_inner2A = inner_projection_or_self ( P_current_calc ) ,
2015-08-29 03:32:30 +00:00
ResetEpoch = P_newprop11 #projection_v1.epoch_number ,
2015-09-01 13:10:45 +00:00
ResetAuthor = case P_current_calc #projection_v1.upi of
Bugfix for rare race between just-finished repair and flapping ending
Fix for today: We are going to game the system. We know that
C100 is going to be checking authorship relative to P_current's
UPI's tail. Therefore, we're just going to set it here.
Why??? Because we have been using this projection safely for
the entire flapping period! ... The only other way I see is to
allow C100 to carve out an exception if the repair finished
PLUS author_server check fails PLUS if we came from here, but
that feels a bit fragile to me: if some code factoring happens
in projection_transition_is_saneprojection_transition_is_sane()
or elsewhere that causes the author_server check to be
something-other-than-the-final-thing-checked, then such a
refactoring would likely cause an even harder bug to find &
fix. Conditions tested: 5 FLUs plus alternating partitions of:
[
[{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
[{b,a},{d,e}],
[{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], []
].
2015-07-06 16:29:37 +00:00
[ ] - >
%% Drat, fall back to current's author.
2015-09-01 13:10:45 +00:00
P_current_calc #projection_v1.author_server ;
Bugfix for rare race between just-finished repair and flapping ending
Fix for today: We are going to game the system. We know that
C100 is going to be checking authorship relative to P_current's
UPI's tail. Therefore, we're just going to set it here.
Why??? Because we have been using this projection safely for
the entire flapping period! ... The only other way I see is to
allow C100 to carve out an exception if the repair finished
PLUS author_server check fails PLUS if we came from here, but
that feels a bit fragile to me: if some code factoring happens
in projection_transition_is_saneprojection_transition_is_sane()
or elsewhere that causes the author_server check to be
something-other-than-the-final-thing-checked, then such a
refactoring would likely cause an even harder bug to find &
fix. Conditions tested: 5 FLUs plus alternating partitions of:
[
[{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
[{b,a},{d,e}],
[{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], []
].
2015-07-06 16:29:37 +00:00
_ - >
2015-09-01 13:10:45 +00:00
lists : last ( P_current_calc #projection_v1.upi )
Bugfix for rare race between just-finished repair and flapping ending
Fix for today: We are going to game the system. We know that
C100 is going to be checking authorship relative to P_current's
UPI's tail. Therefore, we're just going to set it here.
Why??? Because we have been using this projection safely for
the entire flapping period! ... The only other way I see is to
allow C100 to carve out an exception if the repair finished
PLUS author_server check fails PLUS if we came from here, but
that feels a bit fragile to me: if some code factoring happens
in projection_transition_is_saneprojection_transition_is_sane()
or elsewhere that causes the author_server check to be
something-other-than-the-final-thing-checked, then such a
refactoring would likely cause an even harder bug to find &
fix. Conditions tested: 5 FLUs plus alternating partitions of:
[
[{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
[{b,a},{d,e}],
[{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], []
].
2015-07-06 16:29:37 +00:00
end ,
ClauseInfo2 = [ { move_from_inner_to_outer , true } ,
{ old_author , P_inner2A #projection_v1.author_server } ,
2015-07-06 14:56:29 +00:00
{ reset_author , ResetAuthor } ,
{ reset_epoch , ResetEpoch } ] ,
2015-04-14 06:30:24 +00:00
P_inner2B =
Bugfix for rare race between just-finished repair and flapping ending
Fix for today: We are going to game the system. We know that
C100 is going to be checking authorship relative to P_current's
UPI's tail. Therefore, we're just going to set it here.
Why??? Because we have been using this projection safely for
the entire flapping period! ... The only other way I see is to
allow C100 to carve out an exception if the repair finished
PLUS author_server check fails PLUS if we came from here, but
that feels a bit fragile to me: if some code factoring happens
in projection_transition_is_saneprojection_transition_is_sane()
or elsewhere that causes the author_server check to be
something-other-than-the-final-thing-checked, then such a
refactoring would likely cause an even harder bug to find &
fix. Conditions tested: 5 FLUs plus alternating partitions of:
[
[{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
[{b,a},{d,e}],
[{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], []
].
2015-07-06 16:29:37 +00:00
machi_projection : update_checksum (
P_inner2A #projection_v1 { epoch_number = ResetEpoch ,
author_server = ResetAuthor ,
dbg = ClauseInfo ++ ClauseInfo2 } ) ,
2015-07-06 14:56:29 +00:00
ReactI = [ { inner2b , machi_projection : make_summary ( P_inner2B ) } ] ,
? REACT ( { a30 , ? LINE , ReactI } ) ,
2015-07-10 11:08:57 +00:00
%% In the past, we've tried:
%% react_to_env_C100(P_inner2B, P_latest, S);
%%
%% But we *know* that direct transition is racy/buggy: if
%% P_latest UPIs are not unanimous, then we run the risk of
%% non-disjoint UPIs; state B10 exists for a reason!
%%
%% So, we're going to use P_inner2B as our new proposal and run
%% it through the regular system, as we did prior to 2015-04-14.
2015-07-10 12:11:34 +00:00
%%
%% OK, but we need to avoid a possible infinite loop by trying to
%% use the inner projection as-is. Because we're moving from
%% inner to outer projections, the partition situation has
%% altered significantly. Use calc_projection() to find out what
%% nodes are down *now* (as best as we can tell right now).
2015-08-12 08:53:39 +00:00
{ P_o , S_o , _ Up2 } = calc_projection2 ( P_inner2B , MyName , [ ] , [ ] , S10 ) ,
2015-09-01 13:10:45 +00:00
ReactI2 = [ { inner2po , machi_projection : make_summary ( P_o ) } ] ,
? REACT ( { a30 , ? LINE , ReactI2 } ) ,
2015-08-18 11:49:36 +00:00
%% NOTE: We are intentionally clearing flap info by not
%% carrying it forwarding in the new projection.
2015-09-01 13:10:45 +00:00
%% TODO 2015-09-01: revisit clearing flapping state here?
2015-07-10 12:11:34 +00:00
react_to_env_A40 ( Retries , P_o , P_latest , LatestUnanimousP , S_o ) ;
2015-04-14 06:30:24 +00:00
true - >
2015-04-14 09:19:08 +00:00
? REACT ( { a30 , ? LINE , [ ] } ) ,
2015-08-29 03:32:30 +00:00
react_to_env_A40 ( Retries , P_newprop11 , P_latest ,
2015-04-14 06:30:24 +00:00
LatestUnanimousP , S10 )
end .
2015-04-06 05:16:20 +00:00
2015-08-18 11:49:36 +00:00
a30_make_inner_projection ( P_current , P_newprop3 , P_latest , Up ,
2015-09-01 13:10:45 +00:00
#ch_mgr { name = MyName , consistency_mode = CMode ,
proj = P_current_real } = S ) - >
2015-08-18 11:49:36 +00:00
AllHosed = get_all_hosed ( P_newprop3 ) ,
2015-08-31 08:03:12 +00:00
NewPropDown = if P_newprop3 #projection_v1.upi == [ ] - >
%% This is a none proj, don't believe down list
[ ] ;
true - >
P_newprop3 #projection_v1.down
end ,
2015-08-20 08:32:46 +00:00
P_current_has_inner_p = inner_projection_exists ( P_current ) ,
P_current_ios = inner_projection_or_self ( P_current ) ,
2015-08-29 17:22:59 +00:00
AllHosed_and_Down = lists : usort ( AllHosed ++ NewPropDown ) ,
2015-08-20 08:32:46 +00:00
{ P_i1 , S_i , _ Up } = calc_projection2 ( P_current_ios ,
2015-08-29 17:22:59 +00:00
MyName , AllHosed_and_Down , [ ] , S ) ,
2015-08-18 11:49:36 +00:00
? REACT ( { a30 , ? LINE , [ { raw_all_hosed , get_all_hosed ( P_newprop3 ) } ,
{ up , Up } ,
{ all_hosed , AllHosed } ,
2015-08-29 17:22:59 +00:00
{ new_prop_down , NewPropDown } ,
{ all_hosed_and_down , AllHosed_and_Down } ,
2015-08-20 08:32:46 +00:00
{ p_c_i , machi_projection : make_summary ( P_current_ios ) } ,
{ p_i1 , machi_projection : make_summary ( P_i1 ) } ] } ) ,
2015-08-18 11:49:36 +00:00
%% The inner projection will have a fake author, which
%% everyone will agree is the largest UPI member's
%% name.
BiggestUPIMember =
2015-08-20 08:32:46 +00:00
if P_i1 #projection_v1.upi == [ ] - >
2015-08-18 11:49:36 +00:00
%% Oops, ok, fall back to author
2015-08-20 08:32:46 +00:00
P_i1 #projection_v1.author_server ;
2015-08-18 11:49:36 +00:00
true - >
2015-08-20 08:32:46 +00:00
lists : last ( lists : sort ( P_i1 #projection_v1.upi ) )
2015-08-18 11:49:36 +00:00
end ,
2015-08-20 08:32:46 +00:00
P_i2 = P_i1 #projection_v1 { author_server = BiggestUPIMember } ,
P_i3 = case lists : member ( MyName , AllHosed )
andalso
CMode == ap_mode
of
false - >
P_i2 ;
true - >
%% Fall back to a safe AP mode chain: just me.
P_i2 #projection_v1 {
upi = [ MyName ] ,
repairing = [ ] ,
down = P_i2 #projection_v1.all_members
-- [ MyName ] }
end ,
HasCompatibleInner =
WIP: refactoring inner handling, but ... (more)
There are a couple of weird things in the snippet below (AP mode):
22:32:58.209 b uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,115},{1439,904777,11627}} 29 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},29}},{c,{{{epk,121},{1439,904777,134392}},26}}])
22:32:58.224 c uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,121},{1439,904777,134392}} 28 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},28}}])
CONFIRM by epoch inner 136 <<103,64,252,...>> at [b,c] []
Priv1 [{a,{{132,<<"Cï|ÿzKX:Á"...>>},[a],[c],[b],[],false}},
{b,{{127,<<185,139,3,2,96,189,...>>},[b,c],[],[a],[],false}},
{c,{{133,<<145,71,223,6,177,...>>},[b,c],[a],[],[],false}}] agree false
Pubs: [{a,136},{b,136},{c,136}]
DoIt,
1. Both the "uses inner" messages and also the "CONFIRM by epoch inner 136"
show that B & C are using the same inner projection.
However, the 'Priv1' output shows b & c on different epochs, 127 & 133.
Weird.
2. I've added an infinite loop, probably in this commit. :-(
2015-08-18 13:35:57 +00:00
case inner_projection_exists ( P_latest ) of
2015-08-18 11:49:36 +00:00
true - >
WIP: refactoring inner handling, but ... (more)
There are a couple of weird things in the snippet below (AP mode):
22:32:58.209 b uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,115},{1439,904777,11627}} 29 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},29}},{c,{{{epk,121},{1439,904777,134392}},26}}])
22:32:58.224 c uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,121},{1439,904777,134392}} 28 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},28}}])
CONFIRM by epoch inner 136 <<103,64,252,...>> at [b,c] []
Priv1 [{a,{{132,<<"Cï|ÿzKX:Á"...>>},[a],[c],[b],[],false}},
{b,{{127,<<185,139,3,2,96,189,...>>},[b,c],[],[a],[],false}},
{c,{{133,<<145,71,223,6,177,...>>},[b,c],[a],[],[],false}}] agree false
Pubs: [{a,136},{b,136},{c,136}]
DoIt,
1. Both the "uses inner" messages and also the "CONFIRM by epoch inner 136"
show that B & C are using the same inner projection.
However, the 'Priv1' output shows b & c on different epochs, 127 & 133.
Weird.
2. I've added an infinite loop, probably in this commit. :-(
2015-08-18 13:35:57 +00:00
P_latest_i = inner_projection_or_self ( P_latest ) ,
2015-08-20 08:32:46 +00:00
#projection_v1 { epoch_number = _ _ _ Epoch_current_x ,
upi = UPI_current_x ,
repairing = Repairing_current_x } = P_current_ios ,
WIP: refactoring inner handling, but ... (more)
There are a couple of weird things in the snippet below (AP mode):
22:32:58.209 b uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,115},{1439,904777,11627}} 29 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},29}},{c,{{{epk,121},{1439,904777,134392}},26}}])
22:32:58.224 c uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,121},{1439,904777,134392}} 28 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},28}}])
CONFIRM by epoch inner 136 <<103,64,252,...>> at [b,c] []
Priv1 [{a,{{132,<<"Cï|ÿzKX:Á"...>>},[a],[c],[b],[],false}},
{b,{{127,<<185,139,3,2,96,189,...>>},[b,c],[],[a],[],false}},
{c,{{133,<<145,71,223,6,177,...>>},[b,c],[a],[],[],false}}] agree false
Pubs: [{a,136},{b,136},{c,136}]
DoIt,
1. Both the "uses inner" messages and also the "CONFIRM by epoch inner 136"
show that B & C are using the same inner projection.
However, the 'Priv1' output shows b & c on different epochs, 127 & 133.
Weird.
2. I've added an infinite loop, probably in this commit. :-(
2015-08-18 13:35:57 +00:00
#projection_v1 { epoch_number = Epoch_latest_i ,
upi = UPI_latest_i ,
repairing = Repairing_latest_i } = P_latest_i ,
2015-09-01 13:10:45 +00:00
CurrentRealEpochCheck_p =
case inner_projection_exists ( P_current_real ) of
false - >
%% We're definitely going to suggest making
%% outer->inner transition.
Epoch_latest_i > = P_current_real #projection_v1.epoch_number
andalso
Epoch_latest_i > = P_current #projection_v1.epoch_number ;
true - >
true
end ,
WIP: refactoring inner handling, but ... (more)
There are a couple of weird things in the snippet below (AP mode):
22:32:58.209 b uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,115},{1439,904777,11627}} 29 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},29}},{c,{{{epk,121},{1439,904777,134392}},26}}])
22:32:58.224 c uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,121},{1439,904777,134392}} 28 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},28}}])
CONFIRM by epoch inner 136 <<103,64,252,...>> at [b,c] []
Priv1 [{a,{{132,<<"Cï|ÿzKX:Á"...>>},[a],[c],[b],[],false}},
{b,{{127,<<185,139,3,2,96,189,...>>},[b,c],[],[a],[],false}},
{c,{{133,<<145,71,223,6,177,...>>},[b,c],[a],[],[],false}}] agree false
Pubs: [{a,136},{b,136},{c,136}]
DoIt,
1. Both the "uses inner" messages and also the "CONFIRM by epoch inner 136"
show that B & C are using the same inner projection.
However, the 'Priv1' output shows b & c on different epochs, 127 & 133.
Weird.
2. I've added an infinite loop, probably in this commit. :-(
2015-08-18 13:35:57 +00:00
? REACT ( { a30 , ? LINE , [ { epoch_latest_i , Epoch_latest_i } ,
{ upi_latest_i , UPI_latest_i } ,
2015-09-01 13:10:45 +00:00
{ current_real_epoch_check ,
CurrentRealEpochCheck_p } ,
{ x1 , inner_projection_exists ( P_current_real ) } ,
{ x2 , Epoch_latest_i } ,
{ x3 , P_current_real #projection_v1.epoch_number } ,
{ x4 , P_current #projection_v1.epoch_number } ,
2015-08-20 09:47:50 +00:00
{ repairing_latest_i , Repairing_latest_i } ] } ) ,
2015-08-23 07:43:15 +00:00
LatestSameEnough_p =
2015-08-29 15:04:13 +00:00
UPI_latest_i / = [ ] % avoid hasty none proj jump
andalso
2015-09-01 13:10:45 +00:00
CurrentRealEpochCheck_p
andalso
2015-08-20 08:32:46 +00:00
Epoch_latest_i > = P_current_ios #projection_v1.epoch_number ,
2015-08-20 09:47:50 +00:00
CurrentHasInner_and_LatestIsDisjoint_p =
2015-08-20 08:32:46 +00:00
P_current_has_inner_p
andalso
ordsets : is_disjoint (
ordsets : from_list ( UPI_current_x ++ Repairing_current_x ) ,
ordsets : from_list ( UPI_latest_i ++ Repairing_latest_i ) ) ,
2015-09-01 13:10:45 +00:00
? REACT ( { a30 , ? LINE ,
[ { latest_same_enough , LatestSameEnough_p } ,
{ current_has_inner_p , P_current_has_inner_p } ,
{ current_hialid , CurrentHasInner_and_LatestIsDisjoint_p } ] } ) ,
2015-08-23 07:43:15 +00:00
if LatestSameEnough_p - >
2015-08-20 08:32:46 +00:00
? REACT ( { a30 , ? LINE , [ ] } ) ,
2015-08-22 12:27:01 +00:00
case P_current_has_inner_p andalso
2015-09-01 13:10:45 +00:00
( UPI_current_x / = P_i3 #projection_v1.upi orelse
2015-08-22 12:27:01 +00:00
Repairing_current_x / = P_i3 #projection_v1.repairing )
of
true - >
%% Current proj is inner *and* our new
%% proposed inner proj differs substantially
%% from the current. Don't use latest or
%% current.
false ;
false - >
P_latest_i
end ;
2015-09-01 13:10:45 +00:00
CurrentHasInner_and_LatestIsDisjoint_p
andalso
CurrentRealEpochCheck_p - >
WIP: refactoring inner handling, but ... (more)
There are a couple of weird things in the snippet below (AP mode):
22:32:58.209 b uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,115},{1439,904777,11627}} 29 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},29}},{c,{{{epk,121},{1439,904777,134392}},26}}])
22:32:58.224 c uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,121},{1439,904777,134392}} 28 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},28}}])
CONFIRM by epoch inner 136 <<103,64,252,...>> at [b,c] []
Priv1 [{a,{{132,<<"Cï|ÿzKX:Á"...>>},[a],[c],[b],[],false}},
{b,{{127,<<185,139,3,2,96,189,...>>},[b,c],[],[a],[],false}},
{c,{{133,<<145,71,223,6,177,...>>},[b,c],[a],[],[],false}}] agree false
Pubs: [{a,136},{b,136},{c,136}]
DoIt,
1. Both the "uses inner" messages and also the "CONFIRM by epoch inner 136"
show that B & C are using the same inner projection.
However, the 'Priv1' output shows b & c on different epochs, 127 & 133.
Weird.
2. I've added an infinite loop, probably in this commit. :-(
2015-08-18 13:35:57 +00:00
? REACT ( { a30 , ? LINE , [ ] } ) ,
2015-08-20 08:32:46 +00:00
P_current_ios ;
2015-09-01 13:10:45 +00:00
true - >
WIP: refactoring inner handling, but ... (more)
There are a couple of weird things in the snippet below (AP mode):
22:32:58.209 b uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,115},{1439,904777,11627}} 29 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},29}},{c,{{{epk,121},{1439,904777,134392}},26}}])
22:32:58.224 c uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,121},{1439,904777,134392}} 28 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},28}}])
CONFIRM by epoch inner 136 <<103,64,252,...>> at [b,c] []
Priv1 [{a,{{132,<<"Cï|ÿzKX:Á"...>>},[a],[c],[b],[],false}},
{b,{{127,<<185,139,3,2,96,189,...>>},[b,c],[],[a],[],false}},
{c,{{133,<<145,71,223,6,177,...>>},[b,c],[a],[],[],false}}] agree false
Pubs: [{a,136},{b,136},{c,136}]
DoIt,
1. Both the "uses inner" messages and also the "CONFIRM by epoch inner 136"
show that B & C are using the same inner projection.
However, the 'Priv1' output shows b & c on different epochs, 127 & 133.
Weird.
2. I've added an infinite loop, probably in this commit. :-(
2015-08-18 13:35:57 +00:00
? REACT ( { a30 , ? LINE , [ ] } ) ,
false
end ;
false - >
2015-08-20 08:32:46 +00:00
#projection_v1 { upi = UPI_i3 ,
repairing = Repairing_i3 } = P_i3 ,
if P_current_has_inner_p ,
UPI_i3 == P_current_ios #projection_v1.upi ,
Repairing_i3 == P_current_ios #projection_v1.repairing - >
? REACT ( { a30 , ? LINE , [ ] } ) ,
P_current_ios ;
true - >
? REACT ( { a30 , ? LINE , [ ] } ) ,
false
end
2015-08-18 11:49:36 +00:00
end ,
2015-08-20 08:32:46 +00:00
if HasCompatibleInner / = false - >
2015-08-20 09:47:50 +00:00
? REACT ( { a30 , ? LINE ,
[ { inner_summary ,
machi_projection : make_summary ( HasCompatibleInner ) } ] } ) ,
2015-08-20 08:32:46 +00:00
P_newprop4 = machi_projection : update_checksum (
2015-08-20 09:47:50 +00:00
P_newprop3 #projection_v1 { inner = HasCompatibleInner } ) ,
2015-08-20 08:32:46 +00:00
{ P_newprop4 , S_i } ;
WIP: refactoring inner handling, but ... (more)
There are a couple of weird things in the snippet below (AP mode):
22:32:58.209 b uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,115},{1439,904777,11627}} 29 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},29}},{c,{{{epk,121},{1439,904777,134392}},26}}])
22:32:58.224 c uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,121},{1439,904777,134392}} 28 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},28}}])
CONFIRM by epoch inner 136 <<103,64,252,...>> at [b,c] []
Priv1 [{a,{{132,<<"Cï|ÿzKX:Á"...>>},[a],[c],[b],[],false}},
{b,{{127,<<185,139,3,2,96,189,...>>},[b,c],[],[a],[],false}},
{c,{{133,<<145,71,223,6,177,...>>},[b,c],[a],[],[],false}}] agree false
Pubs: [{a,136},{b,136},{c,136}]
DoIt,
1. Both the "uses inner" messages and also the "CONFIRM by epoch inner 136"
show that B & C are using the same inner projection.
However, the 'Priv1' output shows b & c on different epochs, 127 & 133.
Weird.
2. I've added an infinite loop, probably in this commit. :-(
2015-08-18 13:35:57 +00:00
true - >
FinalInnerEpoch =
2015-09-01 13:10:45 +00:00
case inner_projection_exists ( P_current_real ) of
WIP: refactoring inner handling, but ... (more)
There are a couple of weird things in the snippet below (AP mode):
22:32:58.209 b uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,115},{1439,904777,11627}} 29 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},29}},{c,{{{epk,121},{1439,904777,134392}},26}}])
22:32:58.224 c uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,121},{1439,904777,134392}} 28 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},28}}])
CONFIRM by epoch inner 136 <<103,64,252,...>> at [b,c] []
Priv1 [{a,{{132,<<"Cï|ÿzKX:Á"...>>},[a],[c],[b],[],false}},
{b,{{127,<<185,139,3,2,96,189,...>>},[b,c],[],[a],[],false}},
{c,{{133,<<145,71,223,6,177,...>>},[b,c],[a],[],[],false}}] agree false
Pubs: [{a,136},{b,136},{c,136}]
DoIt,
1. Both the "uses inner" messages and also the "CONFIRM by epoch inner 136"
show that B & C are using the same inner projection.
However, the 'Priv1' output shows b & c on different epochs, 127 & 133.
Weird.
2. I've added an infinite loop, probably in this commit. :-(
2015-08-18 13:35:57 +00:00
false - >
? REACT ( { a30xyzxyz , ? LINE , [ P_newprop3 #projection_v1.epoch_number ] } ) ,
FinalCreation = P_newprop3 #projection_v1.creation_time ,
P_newprop3 #projection_v1.epoch_number ;
true - >
2015-09-01 13:10:45 +00:00
P_oldinner = inner_projection_or_self ( P_current_real ) ,
2015-08-29 17:22:59 +00:00
? REACT ( { a30xyzxyz , ? LINE , [ { incrementing_based_on , P_oldinner #projection_v1.epoch_number + 1 } ] } ) ,
2015-08-20 08:32:46 +00:00
FinalCreation = P_newprop3 #projection_v1.creation_time ,
P_oldinner #projection_v1.epoch_number + 1
WIP: refactoring inner handling, but ... (more)
There are a couple of weird things in the snippet below (AP mode):
22:32:58.209 b uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,115},{1439,904777,11627}} 29 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},29}},{c,{{{epk,121},{1439,904777,134392}},26}}])
22:32:58.224 c uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,121},{1439,904777,134392}} 28 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},28}}])
CONFIRM by epoch inner 136 <<103,64,252,...>> at [b,c] []
Priv1 [{a,{{132,<<"Cï|ÿzKX:Á"...>>},[a],[c],[b],[],false}},
{b,{{127,<<185,139,3,2,96,189,...>>},[b,c],[],[a],[],false}},
{c,{{133,<<145,71,223,6,177,...>>},[b,c],[a],[],[],false}}] agree false
Pubs: [{a,136},{b,136},{c,136}]
DoIt,
1. Both the "uses inner" messages and also the "CONFIRM by epoch inner 136"
show that B & C are using the same inner projection.
However, the 'Priv1' output shows b & c on different epochs, 127 & 133.
Weird.
2. I've added an infinite loop, probably in this commit. :-(
2015-08-18 13:35:57 +00:00
end ,
%% TODO: When we implement the real chain repair function, we
%% need to keep in mind that an inner projection with
%% up nodes > 1, repair is required there! In the
%% current simulator, repair is not simulated and
%% finished (and then growing the UPI list). Fix.
2015-08-20 08:32:46 +00:00
P_i4 = machi_projection : update_checksum (
P_i3 #projection_v1 { epoch_number = FinalInnerEpoch ,
creation_time = FinalCreation } ) ,
WIP: refactoring inner handling, but ... (more)
There are a couple of weird things in the snippet below (AP mode):
22:32:58.209 b uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,115},{1439,904777,11627}} 29 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},29}},{c,{{{epk,121},{1439,904777,134392}},26}}])
22:32:58.224 c uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,121},{1439,904777,134392}} 28 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},28}}])
CONFIRM by epoch inner 136 <<103,64,252,...>> at [b,c] []
Priv1 [{a,{{132,<<"Cï|ÿzKX:Á"...>>},[a],[c],[b],[],false}},
{b,{{127,<<185,139,3,2,96,189,...>>},[b,c],[],[a],[],false}},
{c,{{133,<<145,71,223,6,177,...>>},[b,c],[a],[],[],false}}] agree false
Pubs: [{a,136},{b,136},{c,136}]
DoIt,
1. Both the "uses inner" messages and also the "CONFIRM by epoch inner 136"
show that B & C are using the same inner projection.
However, the 'Priv1' output shows b & c on different epochs, 127 & 133.
Weird.
2. I've added an infinite loop, probably in this commit. :-(
2015-08-18 13:35:57 +00:00
? REACT ( { a30 , ? LINE , [ { inner_summary ,
2015-08-20 08:32:46 +00:00
machi_projection : make_summary ( P_i4 ) } ] } ) ,
WIP: refactoring inner handling, but ... (more)
There are a couple of weird things in the snippet below (AP mode):
22:32:58.209 b uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,115},{1439,904777,11627}} 29 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},29}},{c,{{{epk,121},{1439,904777,134392}},26}}])
22:32:58.224 c uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,121},{1439,904777,134392}} 28 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},28}}])
CONFIRM by epoch inner 136 <<103,64,252,...>> at [b,c] []
Priv1 [{a,{{132,<<"Cï|ÿzKX:Á"...>>},[a],[c],[b],[],false}},
{b,{{127,<<185,139,3,2,96,189,...>>},[b,c],[],[a],[],false}},
{c,{{133,<<145,71,223,6,177,...>>},[b,c],[a],[],[],false}}] agree false
Pubs: [{a,136},{b,136},{c,136}]
DoIt,
1. Both the "uses inner" messages and also the "CONFIRM by epoch inner 136"
show that B & C are using the same inner projection.
However, the 'Priv1' output shows b & c on different epochs, 127 & 133.
Weird.
2. I've added an infinite loop, probably in this commit. :-(
2015-08-18 13:35:57 +00:00
%% Put it all together.
P_newprop4 = machi_projection : update_checksum (
2015-08-20 08:32:46 +00:00
P_newprop3 #projection_v1 { inner = P_i4 } ) ,
2015-08-18 11:49:36 +00:00
{ P_newprop4 , S_i }
end .
2015-08-11 06:24:26 +00:00
a40_latest_author_down ( #projection_v1 { author_server = LatestAuthor } = _ P_latest ,
#projection_v1 { upi = [ ] , repairing = [ ] ,
all_members = AllMembers } = _ P_newprop ,
#ch_mgr { name = MyName , runenv = RunEnv } ) - >
%% P_newprop is the none projection. P_newprop's down list is
%% bogus, we cannot use it here.
{ Up , _ Partitions , _ RunEnv2 } = calc_up_nodes ( MyName , AllMembers , RunEnv ) ,
? REACT ( { a40 , ? LINE , [ { latest_author , LatestAuthor } , { up , Up } ] } ) ,
2015-08-12 08:36:13 +00:00
not lists : member ( LatestAuthor , Up ) ;
2015-08-11 06:24:26 +00:00
a40_latest_author_down ( #projection_v1 { author_server = LatestAuthor } = _ P_latest ,
#projection_v1 { down = NewPropDown } = _ P_newprop , _ S ) - >
lists : member ( LatestAuthor , NewPropDown ) .
2015-04-06 05:16:20 +00:00
react_to_env_A40 ( Retries , P_newprop , P_latest , LatestUnanimousP ,
2015-08-25 09:43:55 +00:00
#ch_mgr { name = MyName , proj = P_current } = S ) - >
2015-04-06 05:16:20 +00:00
? REACT ( a40 ) ,
[ { Rank_newprop , _ } ] = rank_projections ( [ P_newprop ] , P_current ) ,
[ { Rank_latest , _ } ] = rank_projections ( [ P_latest ] , P_current ) ,
2015-08-11 06:24:26 +00:00
LatestAuthorDownP = a40_latest_author_down ( P_latest , P_newprop , S )
andalso
P_latest #projection_v1.author_server / = MyName ,
2015-08-31 08:57:37 +00:00
P_latestStable = make_basic_comparison_stable ( P_latest ) ,
P_currentStable = make_basic_comparison_stable ( P_current ) ,
2015-08-12 08:53:39 +00:00
? REACT ( { a40 , ? LINE ,
[ { latest_author , P_latest #projection_v1.author_server } ,
2015-08-20 08:32:46 +00:00
{ author_is_down_p , LatestAuthorDownP } ,
{ latest_flap , P_latest #projection_v1.flap } ,
{ newprop_flap , P_newprop #projection_v1.flap } ] } ) ,
2015-04-06 05:16:20 +00:00
if
2015-04-10 12:59:56 +00:00
%% Epoch == 0 is reserved for first-time, just booting conditions.
2015-08-13 05:21:31 +00:00
( Rank_newprop > 0 orelse ( Rank_newprop == ? RANK_CP_MINORITY_QUORUM ) )
2015-08-05 06:50:32 +00:00
andalso
( ( P_current #projection_v1.epoch_number > 0
andalso
P_latest #projection_v1.epoch_number > P_current #projection_v1.epoch_number )
orelse
not LatestUnanimousP ) - >
2015-04-06 05:16:20 +00:00
? REACT ( { a40 , ? LINE ,
[ { latest_epoch , P_latest #projection_v1.epoch_number } ,
{ current_epoch , P_current #projection_v1.epoch_number } ,
{ latest_unanimous_p , LatestUnanimousP } ] } ) ,
react_to_env_B10 ( Retries , P_newprop , P_latest , LatestUnanimousP ,
Rank_newprop , Rank_latest , S ) ;
2015-08-05 06:50:32 +00:00
Rank_newprop > 0
andalso
( P_latest #projection_v1.epoch_number < P_current #projection_v1.epoch_number
orelse
2015-08-28 11:06:09 +00:00
P_latestStable / = P_currentStable ) - >
2015-04-06 05:16:20 +00:00
? REACT ( { a40 , ? LINE ,
2015-08-31 06:40:19 +00:00
[ { latest , P_latestStable } ,
{ current , P_currentStable } ,
{ neq , P_latestStable / = P_currentStable } ] } ) ,
2015-04-06 05:16:20 +00:00
%% Both of these cases are rare. Elsewhere, the code
%% assumes that the local FLU's projection store is always
%% available, so reads & writes to it aren't going to fail
%% willy-nilly. If that assumption is true, then we can
%% reason as follows:
%%
%% a. If we can always read from the local FLU projection
%% store, then the 1st clause isn't possible because
%% P_latest's epoch # must be at least as large as
%% P_current's epoch #
%%
%% b. If P_latest /= P_current, then there can't be a
%% unanimous reply for P_latest, so the earlier 'if'
%% clause would be triggered and so we could never reach
%% this clause.
%%
%% I'm keeping this 'if' clause just in case the local FLU
%% projection store assumption changes.
react_to_env_B10 ( Retries , P_newprop , P_latest , LatestUnanimousP ,
Rank_newprop , Rank_latest , S ) ;
%% A40a (see flowchart)
Rank_newprop > Rank_latest - >
2015-04-14 07:17:49 +00:00
? REACT ( { a40 , ? LINE ,
2015-04-06 05:16:20 +00:00
[ { rank_latest , Rank_latest } ,
{ rank_newprop , Rank_newprop } ,
{ latest_author , P_latest #projection_v1.author_server } ] } ) ,
%% TODO: There may be an "improvement" here. If we're the
%% highest-ranking FLU in the all_members list, then if we make a
%% projection where our UPI list is the same as P_latest's, and
%% our repairing list is the same as P_latest's, then it may not
%% be necessary to write our projection: it doesn't "improve"
%% anything UPI-wise or repairing-wise. But it isn't clear to me
%% if it's 100% correct to "improve" here and skip writing
%% P_newprop, yet.
react_to_env_C300 ( P_newprop , P_latest , S ) ;
%% A40b (see flowchart)
2015-08-05 06:50:32 +00:00
Rank_newprop > 0
andalso
2015-04-06 05:16:20 +00:00
P_latest #projection_v1.author_server == MyName
andalso
( P_newprop #projection_v1.upi / = P_latest #projection_v1.upi
orelse
P_newprop #projection_v1.repairing / = P_latest #projection_v1.repairing ) - >
? REACT ( { a40 , ? LINE ,
[ { latest_author , P_latest #projection_v1.author_server } ,
{ newprop_upi , P_newprop #projection_v1.upi } ,
{ latest_upi , P_latest #projection_v1.upi } ,
{ newprop_repairing , P_newprop #projection_v1.repairing } ,
{ latest_repairing , P_latest #projection_v1.repairing } ] } ) ,
react_to_env_C300 ( P_newprop , P_latest , S ) ;
%% A40c (see flowchart)
LatestAuthorDownP - >
? REACT ( { a40 , ? LINE ,
[ { latest_author , P_latest #projection_v1.author_server } ,
{ author_is_down_p , LatestAuthorDownP } ] } ) ,
%% TODO: I believe that membership in the
%% P_newprop#projection_v1.down is not sufficient for long
%% chains. Rather, we ought to be using a full broadcast
%% gossip of server up status.
%%
%% Imagine 5 servers in an "Olympic Rings" style
%% overlapping network paritition, where ring1 = upper
%% leftmost and ring5 = upper rightmost. It's both
%% possible and desirable for ring5's projection to be
%% seen (public) by ring1. Ring5's projection's rank is
%% definitely higher than ring1's proposed projection's
%% rank ... but we're in a crazy netsplit where:
%% * if we accept ring5's proj: only one functioning chain
%% ([ring4,ring5] but stable
%% * if we accept ring1's proj: two functioning chains
%% ([ring1,ring2] and [ring4,ring5] indepependently)
%% but unstable: we're probably going to flap back & forth?!
react_to_env_C300 ( P_newprop , P_latest , S ) ;
true - >
? REACT ( { a40 , ? LINE , [ true ] } ) ,
2015-08-29 12:36:53 +00:00
CurrentZerfInStatus_p = has_make_zerf_annotation ( P_current ) ,
2015-08-22 12:27:01 +00:00
GoTo50_p =
case inner_projection_exists ( P_current ) andalso
inner_projection_exists ( P_newprop ) andalso
inner_projection_exists ( P_latest ) of
true - >
%% All three projections are flapping ... do we have a
%% new projection (probably due to repair) that is
%% worth suggesting via C300?
#projection_v1 { epoch_number = Epoch_currenti } =
inner_projection_or_self ( P_current ) ,
#projection_v1 { epoch_number = Epoch_newpropi } =
inner_projection_or_self ( P_newprop ) ,
2015-08-23 06:46:57 +00:00
? REACT ( { a40 , ? LINE , [ { epoch_currenti , Epoch_currenti } ,
2015-08-22 12:27:01 +00:00
{ epoch_newpropi , Epoch_newpropi } ] } ) ,
if Epoch_currenti > Epoch_newpropi - >
%% Inner has a newer epoch, don't go to A50.
2015-08-23 06:46:57 +00:00
? REACT ( { a40 , ? LINE , [ ] } ) ,
2015-08-22 12:27:01 +00:00
false ;
true - >
2015-08-23 06:46:57 +00:00
? REACT ( { a40 , ? LINE , [ ] } ) ,
2015-08-22 12:27:01 +00:00
true
end ;
false - >
2015-08-29 12:36:53 +00:00
? REACT ( { a40 , ? LINE , [ { currentzerfinstatus_p , CurrentZerfInStatus_p } ] } ) ,
if CurrentZerfInStatus_p andalso
P_newprop #projection_v1.upi / = [ ] - >
%% One scenario here: we are waking up after
%% a slumber with the none proj and need to
%% send P_newprop (which has non/empty UPI)
%% through the process to continue chain
%% recovery.
? REACT ( { a40 , ? LINE , [ ] } ) ,
false ;
true - >
? REACT ( { a40 , ? LINE , [ ] } ) ,
true
end
2015-08-22 12:27:01 +00:00
end ,
if GoTo50_p - >
2015-08-23 06:46:57 +00:00
? REACT ( { a40 , ? LINE , [ ] } ) ,
2015-08-22 12:27:01 +00:00
FinalProps = [ { throttle_seconds , 0 } ] ,
react_to_env_A50 ( P_latest , FinalProps , S ) ;
true - >
2015-08-23 06:46:57 +00:00
? REACT ( { a40 , ? LINE , [ ] } ) ,
2015-08-22 12:27:01 +00:00
react_to_env_C300 ( P_newprop , P_latest , S )
end
2015-04-06 05:16:20 +00:00
end .
2015-09-01 13:10:45 +00:00
react_to_env_A49 ( P_latest , FinalProps , #ch_mgr { consistency_mode = cp_mode ,
name = MyName ,
proj = P_current } = S ) - >
2015-08-24 10:04:26 +00:00
? REACT ( a49 ) ,
2015-09-01 13:10:45 +00:00
%% Using the none projection as our new P_current does *not* work:
%% if we forget what P_current is, then we risk not being able to
%% detect an insane chain transition or else risk a false positive
%% insane check.
%%
%% Instead, we will create an implicit annotation in P_current
%% that will force A29 to always use the projection from
%% make_zerf() as the basis for our next transition calculations.
%% In this wacky case, we break the checksum on P_current so that
%% A29's epoch_id comparison will always be unequal and thus
%% always trigger make_zerf().
Dbg = P_current #projection_v1.dbg ,
P_current2 = P_current #projection_v1 { epoch_csum = < < " broken " > > ,
dbg = [ { zerf_backstop , true } ,
{ zerf_in , a49 } | Dbg ] } ,
react_to_env_A50 ( P_latest , FinalProps , set_proj ( S , P_current2 ) ) .
2015-08-24 10:04:26 +00:00
2015-08-13 09:43:41 +00:00
react_to_env_A50 ( P_latest , FinalProps , #ch_mgr { proj = P_current } = S ) - >
2015-04-06 05:16:20 +00:00
? REACT ( a50 ) ,
2015-08-13 09:43:41 +00:00
? REACT ( { a50 , ? LINE , [ { current_epoch , P_current #projection_v1.epoch_number } ,
{ latest_epoch , P_latest #projection_v1.epoch_number } ,
2015-04-10 12:59:56 +00:00
{ final_props , FinalProps } ] } ) ,
2015-08-30 10:53:47 +00:00
%% if S#ch_mgr.name == c -> io:format(user, "A50: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end,
2015-09-01 13:10:45 +00:00
V = case file : read_file ( " /tmp/moomoo. " ++ atom_to_list ( S #ch_mgr.name ) ) of { ok , _ } - > true ; _ - > false end ,
if V - > io : format ( user , " A50: ~w : ~p \n " , [ S #ch_mgr.name , get ( react ) ] ) ; true - > ok end ,
2015-08-13 09:43:41 +00:00
{ { no_change , FinalProps , P_current #projection_v1.epoch_number } , S } .
2015-04-06 05:16:20 +00:00
react_to_env_B10 ( Retries , P_newprop , P_latest , LatestUnanimousP ,
Rank_newprop , Rank_latest ,
2015-08-24 11:38:54 +00:00
#ch_mgr { name = MyName , consistency_mode = CMode ,
flap_limit = FlapLimit , proj = P_current } = S ) - >
2015-04-06 05:16:20 +00:00
? REACT ( b10 ) ,
2015-08-20 08:32:46 +00:00
{ P_newprop_flap_time , P_newprop_flap_count } = get_flap_count ( P_newprop ) ,
? REACT ( { b10 , ? LINE , [ { newprop_epoch , P_newprop #projection_v1.epoch_number } ,
{ newprop_flap_time , P_newprop_flap_time } ,
{ newprop_flap_count , P_newprop_flap_count } ] } ) ,
2015-04-14 09:19:08 +00:00
UnanimousLatestInnerNotRelevant_p =
case inner_projection_exists ( P_latest ) of
true when P_latest #projection_v1.author_server / = MyName - >
2015-08-17 11:14:29 +00:00
#projection_v1 { down = Down_inner ,
epoch_number = EpochLatest_i } = inner_projection_or_self (
P_latest ) ,
2015-04-14 09:19:08 +00:00
case lists : member ( MyName , Down_inner ) of
true - >
%% Some foreign author's inner projection thinks that
%% I'm down. Silly! We ought to ignore this one.
? REACT ( { b10 , ? LINE , [ { down_inner , Down_inner } ] } ) ,
true ;
false - >
2015-08-17 11:14:29 +00:00
#projection_v1 { epoch_number = Epoch_current } =
inner_projection_or_self ( P_current ) ,
Latest_GTE_Epoch_p = EpochLatest_i > = Epoch_current ,
? REACT ( { b10 , ? LINE , [ { down_inner , Down_inner } ,
{ latest_GTE_epoch , Latest_GTE_Epoch_p } ] } ) ,
not Latest_GTE_Epoch_p
2015-04-14 09:19:08 +00:00
end ;
_ Else_u - >
false
end ,
2015-08-23 11:00:19 +00:00
#flap_i { all_hosed = P_newprop_AllHosed ,
my_unique_prop_count = MyUniquePropCount } =
case P_newprop #projection_v1.flap of undefined - > make_flapping_i ( ) ;
Flap - > Flap
end ,
2015-08-23 11:47:43 +00:00
P_newprop_AllHosedPlus =
lists : flatten ( [ [ X , Y ] | | { X , problem_with , Y } < - P_newprop_AllHosed ] ) ,
2015-08-24 10:04:26 +00:00
%% Commit 66cafe06 added UnanimousLatestInnerNotRelevant_p to the
%% compound predicate below. I'm yanking it out now. TODO re-study?
#projection_v1 { upi = P_newprop_upi_ooi , repairing = P_newprop_repairing_ooi } =
inner_projection_or_self ( P_newprop ) ,
2015-08-29 10:59:46 +00:00
CurrentZerfInStatus_p = has_make_zerf_annotation ( P_current ) ,
2015-08-28 11:06:09 +00:00
CurrentEpoch = P_current #projection_v1.epoch_number ,
2015-08-24 10:04:26 +00:00
EnoughAreFlapping_and_IamBad_p =
%% Ignore inner_projection_exists(P_current): We might need to
%% shut up quickly (adopting a new P_current can take a long
%% time).
( inner_projection_exists ( P_latest ) orelse
inner_projection_exists ( P_newprop ) ) andalso
2015-08-25 08:01:14 +00:00
%% I have been flapping for a while
2015-08-29 17:22:59 +00:00
S #ch_mgr.flap_count > 200 andalso
2015-08-24 10:04:26 +00:00
%% I'm suspected of being bad
2015-08-23 11:47:43 +00:00
lists : member ( MyName , P_newprop_AllHosedPlus ) andalso
2015-08-24 10:04:26 +00:00
%% I'm not in the critical UPI or repairing lists
( not lists : member ( MyName , P_newprop_upi_ooi ++ P_newprop_repairing_ooi ) )
andalso
%% My down lists are the same, i.e., no state change to announce
2015-08-28 11:06:09 +00:00
%% Or if P_current is a CP mode result of zerf_in & valid (epoch #),
%% then this down list comparison should be skipped.
( ( P_current #projection_v1.down == P_newprop #projection_v1.down )
orelse
2015-08-29 10:59:46 +00:00
CurrentZerfInStatus_p ) ,
2015-08-24 10:04:26 +00:00
? REACT ( { b10 , ? LINE , [ { 0 , EnoughAreFlapping_and_IamBad_p } ,
{ 1 , inner_projection_exists ( P_current ) } ,
{ 2 , inner_projection_exists ( P_latest ) } ,
{ 3 , inner_projection_exists ( P_newprop ) } ,
{ 4 , MyUniquePropCount } ,
2015-08-29 13:40:18 +00:00
{ 5 , S #ch_mgr.flap_count } ,
{ 6 , { MyName , P_newprop_AllHosedPlus } } ,
2015-08-28 11:06:09 +00:00
{ 7 , P_current #projection_v1.down } ,
{ 8 , P_newprop #projection_v1.down } ,
2015-08-29 10:59:46 +00:00
{ 9 , { CurrentZerfInStatus_p , CurrentEpoch } } ] } ) ,
2015-04-06 05:16:20 +00:00
if
2015-08-24 10:04:26 +00:00
EnoughAreFlapping_and_IamBad_p - >
2015-08-23 11:00:19 +00:00
? REACT ( { b10 , ? LINE , [ ] } ) ,
%% There's outer flapping happening *and* we ourselves are
%% definitely flapping (flapping manifesto, starting clause 1)
%% ... and also we are a member of the all_hosed club. So, we
%% should shut up and let someone else do the proposing.
2015-08-24 11:38:54 +00:00
FinalProps = [ { muting_myself , true } ,
{ all_hosed , P_newprop_AllHosed } ] ,
2015-08-23 11:47:43 +00:00
%% io:format(user, "B10: ~w shut up, latest e=~w/Inner=~w, ", [MyName, P_latest#projection_v1.epoch_number, (inner_projection_or_self(P_latest))#projection_v1.epoch_number]),
2015-08-24 11:38:54 +00:00
if CMode == ap_mode - >
react_to_env_A50 ( P_latest , FinalProps , S ) ;
CMode == cp_mode - >
2015-09-01 13:10:45 +00:00
%% Don't use A49, previous experiments failed, check git.
2015-08-24 11:38:54 +00:00
react_to_env_A50 ( P_latest , FinalProps , S )
end ;
2015-08-23 11:00:19 +00:00
2015-04-14 09:19:08 +00:00
LatestUnanimousP
andalso
UnanimousLatestInnerNotRelevant_p - >
? REACT ( { b10 , ? LINE , [ ] } ) ,
%% Do not go to C100, because we want to ignore this latest
%% proposal. Write ours instead via C300.
react_to_env_C300 ( P_newprop , P_latest , S ) ;
2015-04-06 05:16:20 +00:00
LatestUnanimousP - >
2015-04-14 07:17:49 +00:00
? REACT ( { b10 , ? LINE ,
[ { latest_unanimous_p , LatestUnanimousP } ,
{ latest_epoch , P_latest #projection_v1.epoch_number } ,
{ latest_author , P_latest #projection_v1.author_server } ,
{ newprop_epoch , P_newprop #projection_v1.epoch_number } ,
{ newprop_author , P_newprop #projection_v1.author_server }
] } ) ,
2015-04-06 05:16:20 +00:00
react_to_env_C100 ( P_newprop , P_latest , S ) ;
P_newprop_flap_count > = FlapLimit - >
%% I am flapping ... what else do I do?
? REACT ( { b10 , ? LINE , [ i_am_flapping ,
{ newprop_flap_count , P_newprop_flap_count } ,
{ flap_limit , FlapLimit } ] } ) ,
2015-06-15 03:41:16 +00:00
case proplists : get_value ( private_write_verbose , S #ch_mgr.opts ) of
true - >
2015-07-20 07:25:42 +00:00
ok ; %% ?V("{FLAP: ~w flaps ~w}! ", [S#ch_mgr.name, P_newprop_flap_count]);
2015-06-15 03:41:16 +00:00
_ - >
ok
end ,
2015-07-15 03:44:56 +00:00
%% MEANWHILE, we have learned some things about this
%% algorithm in the past many months. With the introduction
%% of the "inner projection" concept, we know that the inner
%% projection may be stable but the "outer" projection will
%% continue to be flappy for as long as there's an
%% asymmetric network partition somewhere. We now know that
%% that flappiness is OK and that the only problem with it
%% is that it needs to be slowed down so that we don't have
%% zillions of public projection proposals written every
%% second.
%%
%% It doesn't matter if the FlapLimit count mechanism
%% doesn't give an accurate sense of global flapping state.
%% FlapLimit is enough to be able to tell us to slow down.
%% We already know that I'm flapping. We need to
%% signal to the rest of the world that I'm writing
%% and flapping and churning, so we cannot always
%% go to A50 from here.
%%
%% If we do go to A50, then recommend that we poll less
%% frequently.
{ X , S2 } = gimme_random_uniform ( 100 , S ) ,
if X < 80 - >
? REACT ( { b10 , ? LINE , [ flap_stop ] } ) ,
ThrottleTime = if P_newprop_flap_count < 500 - > 1 ;
P_newprop_flap_count < 1000 - > 5 ;
P_newprop_flap_count < 5000 - > 10 ;
true - > 30
end ,
FinalProps = [ { my_flap_limit , FlapLimit } ,
{ throttle_seconds , ThrottleTime } ] ,
react_to_env_A50 ( P_latest , FinalProps , S2 ) ;
true - >
%% It is our moral imperative to write so that
%% the flap cycle continues enough times so that
%% everyone notices then eventually falls into
%% consensus.
2015-07-15 09:42:59 +00:00
react_to_env_C300 ( P_newprop , P_latest , S2 )
2015-04-06 05:16:20 +00:00
end ;
Retries > 2 - >
? REACT ( { b10 , ? LINE , [ { retries , Retries } ] } ) ,
%% The author of P_latest is too slow or crashed.
%% Let's try to write P_newprop and see what happens!
react_to_env_C300 ( P_newprop , P_latest , S ) ;
Rank_latest > = Rank_newprop
andalso
P_latest #projection_v1.author_server / = MyName - >
? REACT ( { b10 , ? LINE ,
[ { rank_latest , Rank_latest } ,
{ rank_newprop , Rank_newprop } ,
{ latest_author , P_latest #projection_v1.author_server } ] } ) ,
2015-04-14 09:19:08 +00:00
%% TODO: Is a UnanimousLatestInnerNotRelevant_p test needed in this clause???
%% Give the author of P_latest an opportunity to write a
2015-04-06 05:16:20 +00:00
%% new projection in a new epoch to resolve this mixed
%% opinion.
react_to_env_C200 ( Retries , P_latest , S ) ;
true - >
? REACT ( { b10 , ? LINE } ) ,
2015-04-13 15:54:38 +00:00
? REACT ( { b10 , ? LINE , [ { retries , Retries } , { rank_latest , Rank_latest } , { rank_newprop , Rank_newprop } , { latest_author , P_latest #projection_v1.author_server } ] } ) , % TODO debug delete me!
2015-04-06 05:16:20 +00:00
%% P_newprop is best, so let's write it.
react_to_env_C300 ( P_newprop , P_latest , S )
end .
2015-07-15 08:23:17 +00:00
react_to_env_C100 ( P_newprop , #projection_v1 { author_server = Author_latest ,
2015-07-15 12:58:21 +00:00
flap = Flap_latest0 } = P_latest ,
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
#ch_mgr { name = MyName , proj = P_current ,
2015-07-18 15:43:10 +00:00
not_sanes = NotSanesDict0 } = S ) - >
2015-04-06 05:16:20 +00:00
? REACT ( c100 ) ,
2015-04-10 13:41:22 +00:00
2015-07-15 08:23:17 +00:00
Sane = projection_transition_is_sane ( P_current , P_latest , MyName ) ,
2015-08-27 13:02:23 +00:00
QQ_current = lists : flatten ( io_lib : format ( " ~w : ~w , ~w / ~w : ~w , ~w " , [ P_current #projection_v1.epoch_number , P_current #projection_v1.upi , P_current #projection_v1.repairing , ( inner_projection_or_self ( P_current ) ) #projection_v1.epoch_number , ( inner_projection_or_self ( P_current ) ) #projection_v1.upi , ( inner_projection_or_self ( P_current ) ) #projection_v1.repairing ] ) ) ,
QQ_latest = lists : flatten ( io_lib : format ( " ~w : ~w , ~w / ~w : ~w , ~w " , [ P_latest #projection_v1.epoch_number , P_latest #projection_v1.upi , P_latest #projection_v1.repairing , ( inner_projection_or_self ( P_latest ) ) #projection_v1.epoch_number , ( inner_projection_or_self ( P_latest ) ) #projection_v1.upi , ( inner_projection_or_self ( P_latest ) ) #projection_v1.repairing ] ) ) ,
2015-09-01 13:10:45 +00:00
if Sane == true - > ok ; true - > ? V ( " \n ~w -insane- ~w -auth= ~w ~s -> ~s ~w \n ~p \n ~p \n " , [ ? LINE , MyName , P_newprop #projection_v1.author_server , QQ_current , QQ_latest , Sane , get ( why2 ) , get ( react ) ] ) end ,
2015-07-15 12:58:21 +00:00
Flap_latest = if is_record ( Flap_latest0 , flap_i ) - >
Flap_latest0 ;
true - >
2015-08-18 11:49:36 +00:00
not_a_flap_i_record
2015-07-15 12:58:21 +00:00
end ,
? REACT ( { c100 , ? LINE , [ zoo , { me , MyName } , { author_latest , Author_latest } ,
2015-08-18 11:49:36 +00:00
{ flap_latest , Flap_latest } ] } ) ,
2015-07-15 08:23:17 +00:00
2015-07-15 03:44:56 +00:00
%% Note: The value of `Sane' may be `true', `false', or `term() /= true'.
%% The error value `false' is reserved for chain order violations.
%% Any other non-true value can be used for projection structure
%% construction errors, checksum error, etc.
2015-07-04 06:52:44 +00:00
case Sane of
2015-04-10 13:41:22 +00:00
_ when P_current #projection_v1.epoch_number == 0 - >
2015-08-28 09:37:11 +00:00
%% Epoch == 0 is reserved for first-time, just booting conditions
%% or for when we got stuck in an insane projection transition
%% and were forced to the none projection to recover.
2015-04-09 08:47:43 +00:00
? REACT ( { c100 , ? LINE , [ first_write ] } ) ,
2015-08-24 10:04:26 +00:00
if Sane == true - > ok ; true - > ? V ( " ~w -insane- ~w - ~w : ~w : ~w , " , [ ? LINE , MyName , P_newprop #projection_v1.epoch_number , P_newprop #projection_v1.upi , P_newprop #projection_v1.repairing ] ) end , %%% DELME!!!
2015-04-09 08:47:43 +00:00
react_to_env_C110 ( P_latest , S ) ;
2015-06-15 08:22:02 +00:00
true - >
2015-04-06 05:16:20 +00:00
? REACT ( { c100 , ? LINE , [ sane ] } ) ,
2015-08-24 10:04:26 +00:00
if Sane == true - > ok ; true - > ? V ( " ~w -insane- ~w - ~w : ~w : ~w @ ~w , " , [ ? LINE , MyName , P_newprop #projection_v1.epoch_number , P_newprop #projection_v1.upi , P_newprop #projection_v1.repairing , ? LINE ] ) end , %%% DELME!!!
2015-04-06 05:16:20 +00:00
react_to_env_C110 ( P_latest , S ) ;
2015-07-20 05:04:25 +00:00
DoctorSays - >
? REACT ( { c100 , ? LINE , [ { not_sane , DoctorSays } ] } ) ,
2015-07-15 08:23:17 +00:00
%% This is a fun case. We had just enough asymmetric partition
%% to cause the chain to fragment into two *incompatible* and
%% *overlapping membership* chains, but the chain fragmentation
%% happened "quickly" enough so that by the time everyone's flap
%% counters hit the flap_limit, the asymmetric partition has
%% disappeared ... we'd be stuck in a flapping state forever (or
%% until the partition situation changes again, which might be a
%% very long time).
%%
%% Alas, this case took a long time to find in model checking
%% zillions of asymmetric partitions. Our solution is a bit
%% harsh: we fall back to the "none projection" and let the chain
%% reassemble from there. Hopefully this case is quite rare,
%% since asymmetric partitions (we assume) are pretty rare?
2015-07-15 12:58:21 +00:00
%%
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
%% Examples of overlapping membership insanity (at same instant):
%% Key: {author, suggested UPI, suggested Reparing}
2015-07-15 12:58:21 +00:00
%%
%% {a,[a,b],[c,d,e]},
%% {b,[a,b],[c,d,e]},
%% {c,[e,b],[a,c,d]},
%% {d,[a,b],[c,d,e]},
%% {e,[e,b],[a,c,d]},
%% OR
%% [{a,[c,e],[a,b,d]},
%% {b,[e,a,b,c,d],[]},
%% {c,[c,e],[a,b,d]},
%% {d,[c,e],[a,b,d]},
%% {e,[c,e],[a,b,d]}]
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
%%
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-18 15:43:10 +00:00
%%
%% See also: comment in do_react_to_env() about
%% non-flapping-scenario that can also cause us to want to
%% collapse to the none_projection to break a
%% livelock/infinite loop.
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
react_to_env_C100_inner ( Author_latest , NotSanesDict0 , MyName ,
2015-07-18 15:43:10 +00:00
P_newprop , P_latest , S )
2015-04-06 05:16:20 +00:00
end .
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
react_to_env_C100_inner ( Author_latest , NotSanesDict0 , MyName ,
2015-08-24 10:04:26 +00:00
P_newprop , P_latest ,
#ch_mgr { consistency_mode = CMode } = S ) - >
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
NotSanesDict = orddict : update_counter ( Author_latest , 1 , NotSanesDict0 ) ,
2015-07-20 05:04:25 +00:00
S2 = S #ch_mgr { not_sanes = NotSanesDict , sane_transitions = 0 } ,
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
case orddict : fetch ( Author_latest , NotSanesDict ) of
2015-08-24 10:04:26 +00:00
N when CMode == cp_mode - >
? V ( " YOYO-cp-mode, ~w , ~w , ~w , " , [ MyName , P_latest #projection_v1.epoch_number , N ] ) ,
? REACT ( { c100 , ? LINE , [ { cmode , CMode } ,
{ not_sanes_author_count , N } ] } ) ,
2015-08-24 12:54:30 +00:00
case get ( { zzz_quiet , P_latest #projection_v1.epoch_number } ) of undefined - > ? V ( " YOYO-cp-mode, ~w ,current= ~w , " , [ MyName , machi_projection : make_summary ( ( S #ch_mgr.proj ) ) ] ) ; _ - > ok end ,
put ( { zzz_quiet , P_latest #projection_v1.epoch_number } , true ) ,
2015-08-24 10:04:26 +00:00
react_to_env_A49 ( P_latest , [ ] , S2 ) ;
2015-09-01 13:10:45 +00:00
N when CMode == ap_mode ,
N > ? TOO_FREQUENT_BREAKER - >
2015-08-22 12:27:01 +00:00
? V ( " \n \n YOYO ~w breaking the cycle of: \n current: ~w \n new : ~w \n " , [ MyName , machi_projection : make_summary ( S #ch_mgr.proj ) , machi_projection : make_summary ( P_latest ) ] ) ,
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
? REACT ( { c100 , ? LINE , [ { not_sanes_author_count , N } ] } ) ,
2015-08-22 12:40:21 +00:00
react_to_env_C103 ( P_newprop , P_latest , S2 ) ;
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
N - >
2015-07-18 15:43:10 +00:00
? V ( " YOYO, ~w , ~w , ~w , " , [ MyName , P_latest #projection_v1.epoch_number , N ] ) ,
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
? REACT ( { c100 , ? LINE , [ { not_sanes_author_count , N } ] } ) ,
2015-07-18 15:43:10 +00:00
%% P_latest is not sane.
%% By process of elimination, P_newprop is best,
%% so let's write it.
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
react_to_env_C300 ( P_newprop , P_latest , S2 )
end .
2015-08-25 09:43:55 +00:00
react_to_env_C103 ( #projection_v1 { epoch_number = _ Epoch_newprop } = _ P_newprop ,
2015-08-22 12:40:21 +00:00
#projection_v1 { epoch_number = Epoch_latest ,
2015-07-20 05:04:25 +00:00
all_members = All_list ,
2015-08-25 09:43:55 +00:00
flap = Flap } = _ P_latest ,
2015-07-20 05:04:25 +00:00
#ch_mgr { name = MyName , proj = P_current } = S ) - >
2015-08-24 10:04:26 +00:00
#projection_v1 { witnesses = Witness_list ,
members_dict = MembersDict } = P_current ,
2015-08-30 10:53:47 +00:00
P_none0 = make_none_projection ( Epoch_latest ,
MyName , All_list , Witness_list , MembersDict ) ,
P_none1 = P_none0 #projection_v1 { flap = Flap ,
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
dbg = [ { none_projection , true } ] } ,
2015-07-15 08:23:17 +00:00
P_none = machi_projection : update_checksum ( P_none1 ) ,
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
%% Use it, darn it, because it's 100% safe. And exit flapping state.
2015-07-20 05:04:25 +00:00
? REACT ( { c103 , ? LINE ,
[ { current_epoch , P_current #projection_v1.epoch_number } ,
2015-08-24 10:04:26 +00:00
{ none_projection_epoch , P_none #projection_v1.epoch_number } ] } ) ,
2015-07-20 05:04:25 +00:00
%% Reset the not_sanes count dictionary here, or else an already
%% ?TOO_FREQUENT_BREAKER count for an author might prevent a
%% transition from C100_inner()->C300, which can lead to infinite
%% looping C100->C103->C100.
2015-07-21 09:43:59 +00:00
react_to_env_C100 ( P_none , P_none , clear_flapping_state ( S ) ) .
2015-07-15 08:23:17 +00:00
2015-08-27 08:58:43 +00:00
react_to_env_C110 ( P_latest , #ch_mgr { name = MyName , proj = P_current ,
proj_unanimous = ProjUnanimous } = S ) - >
2015-04-06 05:16:20 +00:00
? REACT ( c110 ) ,
2015-08-24 10:04:26 +00:00
? REACT ( { c110 , ? LINE , [ { latest_epoch , P_latest #projection_v1.epoch_number } ] } ) ,
2015-08-27 08:58:43 +00:00
Extra1 = case inner_projection_exists ( P_current ) andalso
inner_projection_exists ( P_latest ) andalso
( machi_projection : get_epoch_id (
inner_projection_or_self ( P_current ) ) ==
machi_projection : get_epoch_id (
inner_projection_or_self ( P_latest ) ) )
andalso ProjUnanimous / = false of
true - >
EpochID = machi_projection : get_epoch_id (
inner_projection_or_self ( P_latest ) ) ,
UnanimousTime = ProjUnanimous ,
A = make_annotation ( EpochID , UnanimousTime ) ,
2015-08-28 09:37:11 +00:00
io : format ( user , " \n CONFIRM debug C110 ~w annotates ~W outer ~w \n " , [ MyName , EpochID , 5 , P_latest #projection_v1.epoch_number ] ) ,
[ A , { annotated_by , c110 } ] ;
2015-08-27 08:58:43 +00:00
false - >
[ ]
end ,
Extra2 = [ { react , get ( react ) } ] ,
P_latest2 = machi_projection : update_dbg2 ( P_latest , Extra1 ++ Extra2 ) ,
2015-04-06 05:16:20 +00:00
2015-04-09 08:13:38 +00:00
MyNamePid = proxy_pid ( MyName , S ) ,
2015-07-02 19:30:05 +00:00
Goo = P_latest2 #projection_v1.epoch_number ,
2015-05-01 15:33:49 +00:00
%% This is the local projection store. Use a larger timeout, so
%% that things locally are pretty horrible if we're killed by a
%% timeout exception.
2015-05-06 02:41:04 +00:00
Goo = P_latest2 #projection_v1.epoch_number ,
2015-07-16 07:01:53 +00:00
%% ?V("HEE110 ~w ~w ~w\n", [S#ch_mgr.name, self(), lists:reverse(get(react))]),
2015-05-06 02:41:04 +00:00
2015-07-17 07:20:54 +00:00
case { ? FLU_PC : write_projection ( MyNamePid , private , P_latest2 , ? TO * 30 ) , Goo } of
{ ok , Goo } - >
2015-08-13 09:45:15 +00:00
? REACT ( { c120 , [ { write , ok } ] } ) ,
2015-08-13 10:10:48 +00:00
%% We very intentionally do *not* pass P_latest2 forward:
%% we must avoid bloating the dbg2 list!
2015-08-28 09:37:11 +00:00
P_latest2_perhaps_annotated =
2015-08-27 16:55:31 +00:00
machi_projection : update_dbg2 ( P_latest , Extra1 ) ,
2015-08-28 09:37:11 +00:00
perhaps_verbose_c110 ( P_latest2_perhaps_annotated , S ) ,
react_to_env_C120 ( P_latest2_perhaps_annotated , [ ] , S ) ;
2015-08-13 09:45:15 +00:00
{ { error , bad_arg } , _ Goo } - >
? REACT ( { c120 , [ { write , bad_arg } ] } ) ,
2015-08-13 10:10:48 +00:00
%% This bad_arg error is the result of an implicit pre-condition
%% failure that is now built-in to the projection store: when
%% writing a private projection, return {error, bad_arg} if there
%% the store contains a *public* projection with a higher epoch
%% number.
%%
%% In the context of AP mode, this is harmless: we avoid a bit of
%% extra work by adopting P_latest now.
%%
%% In the context of CP mode, this pre-condition failure is very
%% important: it signals to us that the world is changing (or
%% trying to change), and it is vital that we avoid doing
%% something based on stale info.
%%
%% Worst case: our humming consensus round was executing very
%% quickly until the point immediately before writing our private
%% projection above: immediately before the private proj write,
%% we go to sleep for 10 days. When we wake up after such a long
%% sleep, we would definitely notice the last projections made by
%% everyone, but we would miss the intermediate *history* of
%% chain changes over those 10 days. In CP mode it's vital that
%% we don't miss any of that history while we're running (i.e.,
%% here in this func) or when we're restarting after a
%% shutdown/crash.
%%
%% React to newer public write by restarting the iteration.
2015-08-13 09:45:15 +00:00
react_to_env_A20 ( 0 , S ) ;
2015-07-17 07:20:54 +00:00
Else - >
2015-08-27 16:55:31 +00:00
Summ = machi_projection : make_summary ( P_latest2 ) ,
2015-08-13 09:45:15 +00:00
io : format ( user , " C110 error by ~w : ~w , ~w \n ~p \n " ,
2015-07-17 07:20:54 +00:00
[ MyName , Else , Summ , get ( react ) ] ) ,
2015-08-13 09:45:15 +00:00
error_logger : error_msg ( " C110 error by ~w : ~w , ~w , ~w \n " ,
2015-07-17 07:20:54 +00:00
[ MyName , Else , Summ , get ( react ) ] ) ,
exit ( { c110_failure , MyName , Else , Summ } )
2015-08-13 09:45:15 +00:00
end .
2015-04-06 05:16:20 +00:00
2015-07-20 05:04:25 +00:00
react_to_env_C120 ( P_latest , FinalProps , #ch_mgr { proj_history = H ,
sane_transitions = Xtns } = S ) - >
2015-04-06 05:16:20 +00:00
? REACT ( c120 ) ,
2015-08-23 08:50:25 +00:00
%% TODO: revisit this constant?
2015-08-23 11:47:43 +00:00
MaxLength = length ( P_latest #projection_v1.all_members ) * 1 . 5 ,
2015-08-23 08:50:25 +00:00
H2 = add_and_trunc_history ( P_latest , H , MaxLength ) ,
2015-04-06 05:16:20 +00:00
2015-08-27 11:27:33 +00:00
%% diversion_c120_verbose_goop(P_latest, S),
2015-04-09 08:13:38 +00:00
? REACT ( { c120 , [ { latest , machi_projection : make_summary ( P_latest ) } ] } ) ,
2015-08-27 16:55:31 +00:00
S2 = set_proj ( S #ch_mgr { proj_history = H2 ,
sane_transitions = Xtns + 1 } , P_latest ) ,
S3 = case is_annotated ( P_latest ) of
false - >
S2 ;
{ { _ ConfEpoch , _ ConfCSum } , ConfTime } - >
io : format ( user , " \n CONFIRM debug C120 ~w was annotated ~W outer ~w \n " , [ S #ch_mgr.name , ( inner_projection_or_self ( P_latest ) ) #projection_v1.epoch_number , 5 , P_latest #projection_v1.epoch_number ] ) ,
S2 #ch_mgr { proj_unanimous = ConfTime }
end ,
2015-09-01 13:10:45 +00:00
V = case file : read_file ( " /tmp/moomoo. " ++ atom_to_list ( S #ch_mgr.name ) ) of { ok , _ } - > true ; _ - > false end ,
if V - > io : format ( " C120: ~w : ~p \n " , [ S #ch_mgr.name , get ( react ) ] ) ; true - > ok end ,
2015-08-27 16:55:31 +00:00
{ { now_using , FinalProps , P_latest #projection_v1.epoch_number } , S3 } .
2015-08-23 08:50:25 +00:00
add_and_trunc_history ( P_latest , H , MaxLength ) - >
H2 = if P_latest #projection_v1.epoch_number > 0 - >
queue : in ( P_latest , H ) ;
true - >
H
end ,
case queue : len ( H2 ) of
X when X > MaxLength - >
{ _ V , Hxx } = queue : out ( H2 ) ,
Hxx ;
_ - >
H2
end .
2015-04-06 05:16:20 +00:00
react_to_env_C200 ( Retries , P_latest , S ) - >
? REACT ( c200 ) ,
try
2015-07-02 19:30:05 +00:00
AuthorProxyPid = proxy_pid ( P_latest #projection_v1.author_server , S ) ,
? FLU_PC : kick_projection_reaction ( AuthorProxyPid , [ ] )
2015-04-06 05:16:20 +00:00
catch _ Type : _ Err - >
2015-07-16 07:01:53 +00:00
%% ?V("TODO: tell_author_yo is broken: ~p ~p\n",
2015-04-06 05:16:20 +00:00
%% [_Type, _Err]),
ok
end ,
react_to_env_C210 ( Retries , S ) .
react_to_env_C210 ( Retries , #ch_mgr { name = MyName , proj = Proj } = S ) - >
? REACT ( c210 ) ,
sleep_ranked_order ( 10 , 100 , MyName , Proj #projection_v1.all_members ) ,
react_to_env_C220 ( Retries , S ) .
react_to_env_C220 ( Retries , S ) - >
? REACT ( c220 ) ,
react_to_env_A20 ( Retries + 1 , S ) .
react_to_env_C300 ( #projection_v1 { epoch_number = _ Epoch_newprop } = P_newprop ,
#projection_v1 { epoch_number = _ Epoch_latest } = _ P_latest , S ) - >
? REACT ( c300 ) ,
2015-04-09 08:13:38 +00:00
react_to_env_C310 ( machi_projection : update_checksum ( P_newprop ) , S ) .
2015-04-06 05:16:20 +00:00
react_to_env_C310 ( P_newprop , S ) - >
? REACT ( c310 ) ,
Epoch = P_newprop #projection_v1.epoch_number ,
{ WriteRes , S2 } = cl_write_public_proj_skip_local_error ( Epoch , P_newprop , S ) ,
? REACT ( { c310 , ? LINE ,
2015-04-09 08:13:38 +00:00
[ { newprop , machi_projection : make_summary ( P_newprop ) } ,
2015-04-06 05:16:20 +00:00
{ write_result , WriteRes } ] } ) ,
react_to_env_A10 ( S2 ) .
2015-08-18 11:49:36 +00:00
%% calculate_flaps() ... Create the background/internal state regarding our
%% own flapping state and what we know about the flapping state of our peers.
%% Other functions will consume this data and alter our suggested projection
%% changes if/when needed later.
%%
%% The flapping manifesto:
%%
%% I will enter flapping mode if:
%%
%% 1. My last F adopted private projections+P_newprop have the same
%% UPI+Repairing list. (If I am a direct victim of asymmetric
%% partition, this is my only behavior.)
%%
%% 2. I observe a latest public projection by a flapping author.
%%
%% I will leave flapping mode if:
%%
%% 1. My view of up FLUs changes. (As a direct observer, this is a
%% will (indirectly) trigger a change of UPI+Repairing that I will
%% suggest.) Alas, there is no guarantee that I will win enough
%% races to have my single new public projection truly observed by
%% anyone else. We will rely on fate and retrying @ new epochs to
%% get our state changed noticed by someone later.
%%
%% 2. I observe a latest public projection E by author X such that author
%% X is marked as not flapping *and* I believe that X is flapping at
%% some earlier epoch E-delta.
calculate_flaps ( P_newprop , P_latest , _ P_current , CurrentUp , _ FlapLimit ,
2015-08-23 11:47:43 +00:00
#ch_mgr { name = MyName , proj_history = H ,
2015-08-29 17:22:59 +00:00
consistency_mode = CMode ,
2015-08-23 08:50:25 +00:00
flap_start = FlapStart ,
2015-08-18 11:49:36 +00:00
flap_count = FlapCount , flap_last_up = FlapLastUp ,
2015-08-24 11:38:54 +00:00
flap_last_up_change = LastUpChange0 ,
2015-08-18 11:49:36 +00:00
flap_counts_last = FlapCountsLast ,
runenv = RunEnv1 } = S ) - >
2015-08-23 08:50:25 +00:00
UniqueProposalSummaries = make_unique_proposal_summaries ( H , P_newprop ) ,
2015-08-23 11:00:19 +00:00
MyUniquePropCount = length ( UniqueProposalSummaries ) ,
2015-08-24 11:38:54 +00:00
LastUpChange = if CurrentUp / = FlapLastUp - >
now ( ) ;
true - >
LastUpChange0
end ,
LastUpChange_diff = timer : now_diff ( now ( ) , LastUpChange ) / 1000000 ,
2015-08-28 09:37:11 +00:00
? REACT ( { calculate_flaps , ? LINE , [ { flap_start , FlapStart } ,
{ flap_count , FlapCount } ,
{ flap_last_up , FlapLastUp } ,
{ flap_counts_last , FlapCountsLast } ,
{ my_unique_prop_count , MyUniquePropCount } ,
{ current_up , CurrentUp } ,
{ last_up_change , LastUpChange } ,
{ last_up_change_diff , LastUpChange_diff } ] } ) ,
2015-04-06 05:16:20 +00:00
2015-08-25 09:43:55 +00:00
%% TODO: Do we want to try to use BestP below to short-circuit
%% calculation if we notice that the best private epoch # from
%% somewhere has advanced?
{ _ WhateverUnanimous , _ BestP , Props , _ S } =
2015-04-06 05:16:20 +00:00
cl_read_latest_projection ( private , S ) ,
HosedTransUnion = proplists : get_value ( trans_all_hosed , Props ) ,
TransFlapCounts0 = proplists : get_value ( trans_all_flap_counts , Props ) ,
2015-08-14 13:28:50 +00:00
%% NOTE: bad_answer_flus are due to timeout or some other network
2015-04-06 05:16:20 +00:00
%% glitch, i.e., anything other than {ok, P::projection()}
%% response from machi_flu0:proj_read_latest().
BadFLUs = proplists : get_value ( bad_answer_flus , Props ) ,
RemoteTransFlapCounts1 = lists : keydelete ( MyName , 1 , TransFlapCounts0 ) ,
RemoteTransFlapCounts =
[ X | | { _ FLU , { { _ FlEpk , FlTime } , _ FlapCount } } = X < - RemoteTransFlapCounts1 ,
FlTime / = ? NOT_FLAPPING ] ,
2015-08-18 11:49:36 +00:00
TempNewFlapCount = FlapCount + 1 ,
2015-08-29 15:50:03 +00:00
TempAllFlapCounts = lists : sort ( [ { MyName , FlapStart } | RemoteTransFlapCounts ] ) ,
2015-04-06 05:16:20 +00:00
%% Sanity check.
2015-08-18 11:49:36 +00:00
true = lists : all ( fun ( { _ FLU , { _ EpkTime , _ Count } } ) - > true ;
( _ ) - > false
end , TempAllFlapCounts ) ,
2015-04-06 05:16:20 +00:00
%% H is the bounded history of all of this manager's private
%% projection store writes. If we've proposed the *same*
2015-08-18 11:49:36 +00:00
%% UPI+Repairing combo for the entire length of our
2015-04-06 05:16:20 +00:00
%% bounded size of H, then we're flapping.
%%
%% If we're flapping, then we use our own flap counter and that of
%% all of our peer managers to see if we've all got flap counters
%% that exceed the flap_limit. If that global condition appears
%% true, then we "blow the circuit breaker" by stopping our
%% participation in the flapping store (via the shortcut to A50).
%%
%% We reset our flap counter on any of several conditions:
%%
%% 1. If our bounded history H contains more than one proposal,
%% then by definition we are not flapping.
%% 2. If a remote manager is flapping and has re-started a new
%% flapping episode.
%% 3. If one of the remote managers that we saw earlier has
%% stopped flapping.
2015-08-18 11:49:36 +00:00
? REACT ( { calculate_flaps , ? LINE , [ { queue_len , queue : len ( H ) } ,
{ uniques , UniqueProposalSummaries } ] } ) ,
P_latest_Flap = get_raw_flapping_i ( P_latest ) ,
2015-08-28 09:39:18 +00:00
AmFlappingNow_p = not ( FlapStart == ? NOT_FLAPPING_START ) ,
%% TODO: revisit why I added this extra length()
%% condition back on commit 3dfe5c2.
%% andalso
%% length(UniqueProposalSummaries) == 1,
2015-08-22 05:50:10 +00:00
P_latest_flap_start = case P_latest_Flap of
undefined - >
? NOT_FLAPPING_START ;
_ - >
element ( 1 , P_latest_Flap #flap_i.flap_count )
end ,
2015-08-25 08:01:14 +00:00
MinQueueLen = 3 ,
2015-08-20 08:32:46 +00:00
StartFlapping_p =
2015-08-18 11:49:36 +00:00
case { queue : len ( H ) , UniqueProposalSummaries } of
2015-08-20 08:32:46 +00:00
_ when AmFlappingNow_p - >
2015-08-25 08:01:14 +00:00
? REACT ( { calculate_flaps , ? LINE , [ { flap_count , FlapCount } ,
{ flap_start , FlapStart } ] } ) ,
2015-08-20 08:32:46 +00:00
%% I'm already flapping, therefore don't start again.
false ;
2015-08-24 10:04:26 +00:00
{ N , _ } when N > = MinQueueLen ,
2015-08-22 05:50:10 +00:00
P_latest_flap_start / = ? NOT_FLAPPING_START - >
? REACT ( { calculate_flaps , ? LINE ,
2015-08-28 09:37:11 +00:00
[ { manifesto_clause , { start , 2 } } ,
2015-08-22 05:50:10 +00:00
{ latest_epoch , P_latest #projection_v1.epoch_number } ,
{ latest_flap_count , P_latest_Flap #flap_i.flap_count } ] } ) ,
2015-08-18 11:49:36 +00:00
true ;
2015-08-24 10:04:26 +00:00
{ N , [ _ ] } when N > = MinQueueLen - >
2015-08-28 09:37:11 +00:00
? REACT ( { calculate_flaps , ? LINE , [ { manifesto_clause , { start , 1 } } ] } ) ,
2015-08-18 11:49:36 +00:00
true ;
{ _ N , _ } - >
? REACT ( { calculate_flaps , ? LINE , [ ] } ) ,
false
2015-08-29 04:13:23 +00:00
end ,
%% TODO: 2015-08-29: Grr, we really need CP cases of none projection
%% flapping to propagate problem_with information.
2015-08-18 11:49:36 +00:00
LeaveFlapping_p =
2015-08-20 08:32:46 +00:00
if
2015-08-24 11:38:54 +00:00
LastUpChange_diff < 3 . 0 - >
%% If the last time we saw a hard change in up/down status
%% was less than this time ago, then we do not flap. Give
%% the status change some time to propagate.
2015-08-25 08:01:14 +00:00
? REACT ( { calculate_flaps , ? LINE , [ { time_diff , LastUpChange_diff } ] } ) ,
2015-08-24 11:38:54 +00:00
true ;
2015-08-20 08:32:46 +00:00
StartFlapping_p - >
%% If we're starting flapping on this iteration, don't ignore
%% that intent.
2015-08-25 08:01:14 +00:00
? REACT ( { calculate_flaps , ? LINE , [ ] } ) ,
2015-08-20 08:32:46 +00:00
false ;
2015-08-29 04:13:23 +00:00
%% TODO: 2015-08-29: Grr, we really need CP cases of none projection
%% flapping to propagate problem_with information.
%% AmFlappingNow_p andalso
%% P_newprop#projection_v1.upi == [] ->
%% %% P_newprop is the none projection, stop flapping.
%% ?REACT({calculate_flaps,?LINE,[]}),
%% true;
2015-08-28 12:13:54 +00:00
AmFlappingNow_p andalso
2015-08-24 11:38:54 +00:00
CurrentUp / = FlapLastUp - >
2015-08-28 09:37:11 +00:00
? REACT ( { calculate_flaps , ? LINE , [ { manifesto_clause , { leave , 1 } } ] } ) ,
2015-08-18 11:49:36 +00:00
true ;
2015-08-20 08:32:46 +00:00
AmFlappingNow_p - >
2015-08-18 11:49:36 +00:00
P_latest_LastStartTime =
2015-08-20 08:32:46 +00:00
search_last_flap_counts (
P_latest #projection_v1.author_server , FlapCountsLast ) ,
2015-08-18 11:49:36 +00:00
case get_flap_count ( P_latest ) of
{ ? NOT_FLAPPING_START , _ Count }
when P_latest_LastStartTime == undefined - >
2015-08-20 08:32:46 +00:00
%% latest proj not flapping & not flapping last time
2015-08-18 11:49:36 +00:00
? REACT ( { calculate_flaps , ? LINE , [ ] } ) ,
false ;
2015-08-20 08:32:46 +00:00
{ Curtime , _ Count } when Curtime == P_latest_LastStartTime - >
%% latest proj flapping & flapping last time
2015-08-18 11:49:36 +00:00
? REACT ( { calculate_flaps , ? LINE , [ ] } ) ,
false ;
2015-08-29 15:50:23 +00:00
{ 0 = Curtime , 0 } when P_latest #projection_v1.author_server
/ = MyName ,
P_latest_LastStartTime / = undefined ,
2015-08-20 08:32:46 +00:00
P_latest_LastStartTime / = ? NOT_FLAPPING_START - >
? REACT ( { calculate_flaps , ? LINE ,
2015-08-28 09:37:11 +00:00
[ { manifesto_clause , { leave , 2 } } ,
2015-08-20 08:32:46 +00:00
{ p_latest , machi_projection : make_summary ( P_latest ) } ,
{ curtime , Curtime } ,
{ flap_counts_last , FlapCountsLast } ,
{ laststart_time , P_latest_LastStartTime } ] } ) ,
%% latest proj not flapping & flapping last time:
%% this author
2015-08-18 11:49:36 +00:00
true ;
_ - >
2015-08-20 08:32:46 +00:00
? REACT ( { calculate_flaps , ? LINE , [ ] } ) ,
2015-08-18 11:49:36 +00:00
false
2015-08-20 08:32:46 +00:00
end ;
true - >
? REACT ( { calculate_flaps , ? LINE , [ ] } ) ,
false
2015-08-18 11:49:36 +00:00
end ,
2015-08-25 08:01:14 +00:00
%% if true -> io:format(user, "CALC_FLAP: ~w: flapping_now ~w start ~w leave ~w latest-epoch ~w: ~w\n", [MyName, AmFlappingNow_p, StartFlapping_p, LeaveFlapping_p, P_latest#projection_v1.epoch_number, [X || X={calculate_flaps,_,_} <- lists:sublist(get(react), 3)]]); true -> ok end,
2015-08-24 11:38:54 +00:00
%% if LeaveFlapping_p andalso (AmFlappingNow_p orelse StartFlapping_p) -> io:format(user, "CALC_FLAP: ~w: flapping_now ~w start ~w leave ~w latest-epoch ~w: ~w\n", [MyName, AmFlappingNow_p, StartFlapping_p, LeaveFlapping_p, P_latest#projection_v1.epoch_number, [X || X={calculate_flaps,_,_} <- lists:sublist(get(react), 3)]]); true -> ok end,
2015-08-18 11:49:36 +00:00
AmFlapping_p = if LeaveFlapping_p - > false ;
2015-08-20 08:32:46 +00:00
true - > AmFlappingNow_p orelse StartFlapping_p
2015-08-18 11:49:36 +00:00
end ,
2015-08-22 05:50:10 +00:00
if AmFlapping_p andalso not LeaveFlapping_p - >
2015-08-18 11:49:36 +00:00
NewFlapCount = TempNewFlapCount ,
2015-04-06 05:16:20 +00:00
if element ( 2 , FlapStart ) == ? NOT_FLAPPING - >
NewFlapStart = { { epk , P_newprop #projection_v1.epoch_number } , now ( ) } ;
true - >
NewFlapStart = FlapStart
end ,
AllFlapCounts = TempAllFlapCounts ,
2015-08-14 13:29:20 +00:00
HosedTransUnionTs = [ T | | T < - HosedTransUnion , is_tuple ( T ) ] ,
2015-08-11 06:24:26 +00:00
AnnotatedBadFLUs = [ { MyName , problem_with , FLU } | | FLU < - BadFLUs ] ,
2015-08-14 13:29:20 +00:00
HosedAnnotations = lists : usort ( HosedTransUnionTs ++ AnnotatedBadFLUs ) ,
Magic = lists : sort (
digraph_magic ( P_newprop #projection_v1.all_members ,
HosedAnnotations ) ) ,
AllHosed = lists : usort ( HosedAnnotations ++ Magic ) ,
2015-08-18 11:49:36 +00:00
%%io:format(user, "ALLHOSED ~p: ~p ~w\n", [MyName, Magic, HosedAnnotations]),
2015-08-29 04:13:31 +00:00
? REACT ( { calculate_flaps , ? LINE , [ { new_flap_count , NewFlapCount } ,
2015-08-29 17:22:59 +00:00
{ bad_flus , BadFLUs } ,
{ hosed_t_u_ts , HosedTransUnionTs } ,
2015-08-29 04:13:31 +00:00
{ hosed_annotations , HosedAnnotations } ,
{ magic , Magic } ,
{ all_hosed , AllHosed } ] } ) ,
2015-08-14 13:29:20 +00:00
AllHosed ;
2015-08-18 11:49:36 +00:00
not AmFlapping_p - >
NewFlapCount = 0 ,
WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
%% 'agree' and we can do something different" strategy before,
%% and it didn't work then. Silly me. Distributed systems
%% lesson #823: do not forget the past. In a situation created
%% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
%% completely unfairly. So a & c were the only authors ever to
%% suceessfully write a suggested projection to a public store.
%% Oops.
%%
%% So, we're going to keep track in #ch_mgr state for the number
%% of times that this insane judgement has happened.
2015-07-17 05:51:39 +00:00
NewFlapStart = ? NOT_FLAPPING_START ,
2015-04-06 05:16:20 +00:00
AllFlapCounts = [ ] ,
2015-08-29 04:13:31 +00:00
? REACT ( { calculate_flaps , ? LINE , [ ] } ) ,
2015-04-06 05:16:20 +00:00
AllHosed = [ ]
end ,
2015-08-20 08:32:46 +00:00
AllFlapCounts_with_my_new =
[ { MyName , NewFlapStart } | lists : keydelete ( MyName , 1 , AllFlapCounts ) ] ,
2015-08-18 11:49:36 +00:00
FlappingI = make_flapping_i ( NewFlapStart , NewFlapCount , AllHosed ,
2015-08-23 11:00:19 +00:00
AllFlapCounts_with_my_new , MyUniquePropCount ) ,
2015-04-14 07:17:49 +00:00
%% NOTE: Just because we increment flaps here, there's no correlation
%% to successful public proj store writes! For example,
2015-04-06 05:16:20 +00:00
%% if we loop through states C2xx a few times, we would incr
%% flaps each time ... but the C2xx path doesn't write a new
2015-04-14 07:17:49 +00:00
%% proposal to everyone's public proj stores. Similarly,
%% if we go through to C300, we will *try* to write to all public
%% stores, but the C3xx path doesn't care if all of those write
%% attempts *fail*. Our flap count is a rough heuristic only, and
%% a large local flaps count gives no concrete guarantee that any
%% communication has been successful with any other part of the
%% cluster.
2015-06-02 11:32:52 +00:00
%% TODO: 2015-03-04: I'm growing increasingly suspicious of
%% the 'runenv' variable that's threaded through all this code.
%% It isn't doing what I'd originally intended. Fix it.
2015-08-29 04:13:31 +00:00
? REACT ( { calculate_flaps , ? LINE , [ { flapping_i , FlappingI } ,
{ am_flapping_p , AmFlapping_p } ,
{ ch_mgr_updates , follow } ,
{ flap_count , NewFlapCount } ,
{ flap_start , NewFlapStart } ,
{ flap_last_up , CurrentUp } ,
{ flap_last_up_change , LastUpChange } ,
{ flap_counts_last , AllFlapCounts } ] } ) ,
2015-08-18 11:49:36 +00:00
S2 = S #ch_mgr { flap_count = NewFlapCount , flap_start = NewFlapStart ,
2015-08-24 11:38:54 +00:00
flap_last_up = CurrentUp , flap_last_up_change = LastUpChange ,
flap_counts_last = AllFlapCounts ,
2015-08-18 11:49:36 +00:00
runenv = RunEnv1 } ,
2015-08-29 17:22:59 +00:00
P_newprop2 = case proplists : get_value ( MyName , AllHosed ) of
true when CMode == cp_mode - >
%% Experiment: try none proj but keep the epoch #.
? REACT ( { calculate_flaps , ? LINE , [ ] } ) ,
P_newprop #projection_v1 {
upi = [ ] , repairing = [ ] ,
down = P_newprop #projection_v1.all_members } ;
_ - >
? REACT ( { calculate_flaps , ? LINE , [ ] } ) ,
P_newprop
end ,
? REACT ( { calculate_flaps , ? LINE , [ { zzz_1 , P_newprop2 #projection_v1.upi } ,
{ zzz_2 , P_newprop2 #projection_v1.repairing } ,
{ zzz_3 , catch ( P_newprop2 #projection_v1.flap ) #flap_i.all_hosed } ] } ) ,
{ machi_projection : update_checksum ( P_newprop2 #projection_v1 { flap = FlappingI } ) ,
2015-08-18 11:49:36 +00:00
if AmFlapping_p - >
S2 ;
true - >
2015-08-22 12:27:01 +00:00
clear_most_flapping_state ( S2 )
2015-08-18 11:49:36 +00:00
end } .
2015-04-06 05:16:20 +00:00
2015-08-23 08:50:25 +00:00
make_unique_proposal_summaries ( H , P_newprop ) - >
HistoryPs = queue : to_list ( H ) ,
Ps = HistoryPs ++ [ P_newprop ] ,
lists : usort ( [ { P #projection_v1.upi ,
P #projection_v1.repairing } | |
P < - Ps ] ) .
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
make_flapping_i ( ) - >
2015-08-23 11:00:19 +00:00
make_flapping_i ( ? NOT_FLAPPING_START , 0 , [ ] , [ ] , 0 ) .
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
2015-08-23 11:00:19 +00:00
make_flapping_i ( NewFlapStart , NewFlapCount , AllHosed , AllFlapCounts ,
MyUniquePropCount ) - >
2015-08-18 11:49:36 +00:00
#flap_i { flap_count = { NewFlapStart , NewFlapCount } ,
2015-06-02 11:32:52 +00:00
all_hosed = AllHosed ,
2015-08-23 11:00:19 +00:00
all_flap_counts = lists : sort ( AllFlapCounts ) ,
my_unique_prop_count = MyUniquePropCount } .
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
2015-04-06 05:16:20 +00:00
projection_transitions_are_sane ( Ps , RelativeToServer ) - >
projection_transitions_are_sane ( Ps , RelativeToServer , false ) .
- ifdef ( TEST ) .
projection_transitions_are_sane_retrospective ( Ps , RelativeToServer ) - >
projection_transitions_are_sane ( Ps , RelativeToServer , true ) .
- endif . % TEST
projection_transitions_are_sane ( [ ] , _ RelativeToServer , _ RetrospectiveP ) - >
true ;
projection_transitions_are_sane ( [ _ ] , _ RelativeToServer , _ RetrospectiveP ) - >
true ;
projection_transitions_are_sane ( [ P1 , P2 | T ] , RelativeToServer , RetrospectiveP ) - >
case projection_transition_is_sane ( P1 , P2 , RelativeToServer ,
RetrospectiveP ) of
true - >
projection_transitions_are_sane ( [ P2 | T ] , RelativeToServer ,
RetrospectiveP ) ;
Else - >
Else
end .
- ifdef ( TEST ) .
projection_transition_is_sane_retrospective ( P1 , P2 , RelativeToServer ) - >
projection_transition_is_sane ( P1 , P2 , RelativeToServer , true ) .
- endif . % TEST
2015-07-04 06:52:44 +00:00
projection_transition_is_sane ( P1 , P2 , RelativeToServer ) - >
projection_transition_is_sane ( P1 , P2 , RelativeToServer , false ) .
2015-07-15 03:44:56 +00:00
%% @doc Check if a projection transition is sane & safe.
%%
%% NOTE: The return value convention is `true' for sane/safe and
%% `term() /= true' for any unsafe/insane value.
2015-07-04 06:52:44 +00:00
projection_transition_is_sane ( P1 , P2 , RelativeToServer , RetrospectiveP ) - >
2015-08-13 15:12:13 +00:00
put ( myname , RelativeToServer ) ,
2015-07-05 05:52:50 +00:00
put ( why2 , [ ] ) ,
2015-09-01 13:10:45 +00:00
CMode = P2 #projection_v1.mode ,
2015-08-31 13:14:28 +00:00
HasInner1 = inner_projection_exists ( P1 ) ,
HasInner2 = inner_projection_exists ( P2 ) ,
Inner1 = inner_projection_or_self ( P1 ) ,
Inner2 = inner_projection_or_self ( P2 ) ,
2015-07-04 06:52:44 +00:00
case projection_transition_is_sane_with_si_epoch (
P1 , P2 , RelativeToServer , RetrospectiveP ) of
true - >
if HasInner1 orelse HasInner2 - >
2015-08-27 13:22:15 +00:00
if HasInner1 orelse HasInner2 - >
%% In case of transition with inner projections, we
%% must allow the epoch number to remain constant.
%% Thus, we call the function that does not check for
%% a strictly-increasing epoch.
2015-07-05 05:52:50 +00:00
? RETURN2 (
projection_transition_is_sane_final_review ( P1 , P2 ,
projection_transition_is_sane_except_si_epoch (
Inner1 , Inner2 , RelativeToServer , RetrospectiveP ) ) ) ;
true - >
2015-08-31 13:14:28 +00:00
exit ( delete_this_inner_clause_impossible_with_two_identical_nested_if_clauses ) ,
2015-07-05 05:52:50 +00:00
? RETURN2 (
projection_transition_is_sane_final_review ( P1 , P2 ,
projection_transition_is_sane_with_si_epoch (
Inner1 , Inner2 , RelativeToServer , RetrospectiveP ) ) )
2015-07-04 06:52:44 +00:00
end ;
true - >
2015-08-13 15:12:13 +00:00
projection_transition_is_sane_final_review ( P1 , P2 ,
? RETURN2 ( true ) )
2015-07-04 06:52:44 +00:00
end ;
Else - >
2015-09-01 13:10:45 +00:00
if CMode == cp_mode ,
HasInner1 and ( not HasInner2 ) - >
2015-08-31 13:14:28 +00:00
%% OK, imagine that we used to be flapping but now we've
%% stopped flapping.
%%
%% P1 = outer = upi=[a,d,e],repairing=[] epoch 870
%% inner = upi=[a,e,d],repairing=[] epoch 605
%% to
%% P2 = outer = upi=[a,e,d],repairing=[] epoch 875
%% inner = undefined
%%
%% Everyone is using the inner projection [a,e,d],[],
%% everyone thinks that that is OK. It has been in use
%% for a while now.
%%
%% Now there's a new epoch, e875 that is saying that we
%% should transition from inner e605 [a,e,d],[] -> outer
%% e875 [a,e,d],[] This is SAFE! The UPI is the *same*.
%%
%% Verify this Inner1->P2 transition, including SI epoch
? RETURN2 (
projection_transition_is_sane_final_review ( P1 , P2 ,
projection_transition_is_sane_with_si_epoch (
Inner1 , P2 , RelativeToServer , RetrospectiveP ) ) ) ;
2015-09-01 13:10:45 +00:00
CMode == cp_mode ,
( not HasInner1 ) and HasInner2 - >
%% OK, imagine that we are entering flapping mode.
%%
%% P1 = outer = upi=[a,d,e],repairing=[c] epoch 298
%% inner = undefined
%% to
%% P2 = outer = upi=[d,e,c],repairing=[] epoch 684
%% inner = upi=[a,d,e],repairing=[c] epoch 163
%%
%% We have been unstable for a long time: 684-298 is a
%% lot of churn. Our internal sense of what the outer
%% projection should look like is screwed up. Someone
%% thinks that there was a repair of c that finished in
%% the outer projection, during the churn between 298 and
%% 684, but we didn't adopt that change to the change.
%% Perhaps we were asleep?
%%
%% Based on our last view of the world at 298, we are
%% keeping that same view *and* we've decided to start
%% flapping, hence the inner projection. Make certain
%% that that transition is ok relative to ourself, and
%% let the other safety checks built into humming
%% consensus & CP mode management take care of the rest.
? RETURN2 (
projection_transition_is_sane_final_review ( P1 , P2 ,
projection_transition_is_sane_with_si_epoch (
P1 , Inner2 , RelativeToServer , RetrospectiveP ) ) ) ;
2015-08-31 13:14:28 +00:00
true - >
? RETURN2 ( Else )
end
2015-07-04 06:52:44 +00:00
end .
2015-08-24 10:04:26 +00:00
projection_transition_is_sane_final_review (
P1 , P2 , { expected_author2 , UPI1_tail , _ } = Else ) - >
2015-07-05 05:52:50 +00:00
%% Reminder: P1 & P2 are outer projections
%%
%% We have a small problem for state transition sanity checking in the
%% case where we are flapping *and* a repair has finished. One of the
%% sanity checks in simple_chain_state_transition_is_sane(() is that
%% the author of P2 in this case must be the tail of P1's UPI: i.e.,
%% it's the tail's responsibility to perform repair, therefore the tail
%% must damn well be the author of any transition that says a repair
%% finished successfully.
%%
%% The problem is that author_server of the inner projection does not
%% reflect the actual author! See the comment with the text
%% "The inner projection will have a fake author" in react_to_env_A30().
%%
%% So, there's a special return value that tells us to try to check for
%% the correct authorship here.
2015-08-22 12:27:01 +00:00
P1HasInner_p = inner_projection_exists ( P1 ) ,
P2HasInner_p = inner_projection_exists ( P2 ) ,
P1_LastInnerUPI = case ( inner_projection_or_self ( P1 ) ) #projection_v1.upi of
P1InnerUPI = [ _ | _ ] when P1HasInner_p - >
lists : last ( P1InnerUPI ) ;
_ - >
no_such_author
end ,
if P1HasInner_p , P2HasInner_p - >
if UPI1_tail == P1_LastInnerUPI - >
? RETURN2 ( true ) ;
true - >
? RETURN2 ( Else )
end ;
UPI1_tail == P2 #projection_v1.author_server - >
2015-07-05 05:52:50 +00:00
? RETURN2 ( true ) ;
true - >
2015-08-22 12:27:01 +00:00
? RETURN2 ( { gazzuknkgazzuknk , Else , gazzuknk } )
2015-07-05 05:52:50 +00:00
end ;
2015-08-13 15:12:13 +00:00
projection_transition_is_sane_final_review (
#projection_v1 { mode = CMode1 } = _ P1 ,
#projection_v1 { mode = CMode2 } = _ P2 ,
_ ) when CMode1 / = CMode2 - >
{ wtf , cmode1 , CMode1 , cmode2 , CMode2 } ;
projection_transition_is_sane_final_review (
2015-08-30 11:39:58 +00:00
#projection_v1 { mode = cp_mode , upi = UPI1 , dbg = P1_dbg } = _ P1 ,
#projection_v1 { mode = cp_mode , upi = UPI2 , witnesses = Witness_list } = _ P2 ,
2015-08-13 15:12:13 +00:00
true ) - >
%% All earlier sanity checks has said that this transition is sane, but
2015-09-01 13:10:45 +00:00
%% we also need to make certain that any CP mode transition preserves at
2015-08-13 15:12:13 +00:00
%% least one non-witness server in the UPI list. Earlier checks have
%% verified that the ordering of the FLUs within the UPI list is ok.
UPI1_s = ordsets : from_list ( UPI1 -- Witness_list ) ,
UPI2_s = ordsets : from_list ( UPI2 -- Witness_list ) ,
2015-08-30 10:53:47 +00:00
catch ? REACT ( { projection_transition_is_sane_final_review , ? LINE ,
[ { upi1 , UPI1 } , { upi2 , UPI2 } , { witnesses , Witness_list } ,
2015-08-30 11:39:58 +00:00
{ zerf_backstop , proplists : get_value ( zerf_backstop , P1_dbg ) } ,
2015-08-30 10:53:47 +00:00
{ upi1_s , UPI1 } , { upi2_s , UPI2 } ] } ) ,
2015-08-30 11:39:58 +00:00
case proplists : get_value ( zerf_backstop , P1_dbg ) of
2015-08-13 15:12:13 +00:00
true when UPI1 == [ ] - >
2015-09-01 13:10:45 +00:00
%% CAUTION, this is a dangerous case. If the old projection, P1,
%% has a 'zerf_backstop' annotation, then when this function
%% returns true, we are (in effect) saying, "We trust you." What
%% if we called make_zerf() a year ago because we took a 1 year
%% nap?? How can we trust this?
%%
%% The answer is: this is not our last safety enforcement for CP
%% mode, fortunately. We are going from the none projection to a
%% quorum majority projection, *and* we will not unwedge ourself
%% until we can verify that all UPI members of the chain are
%% unanimous for this epoch. So if we took a 1 year nap already,
%% or if we take a one year right now and delay writing our
%% private projection for 1 year, then if we disagree with the
%% quorum majority, we simply won't be able to unwedge.
2015-08-13 15:12:13 +00:00
? RETURN2 ( true ) ;
_ when UPI2 == [ ] - >
%% We're down to the none projection to wedge ourself. That's ok.
? RETURN2 ( true ) ;
_ - >
? RETURN2 ( not ordsets : is_disjoint ( UPI1_s , UPI2_s ) )
end ;
2015-07-05 05:52:50 +00:00
projection_transition_is_sane_final_review ( _ P1 , _ P2 , Else ) - >
? RETURN2 ( Else ) .
2015-07-15 03:44:56 +00:00
%% @doc Check if a projection transition is sane & safe with a
%% strictly increasing epoch number.
%%
%% NOTE: The return value convention is `true' for sane/safe and
%% `term() /= true' for any unsafe/insane value.
2015-07-04 06:52:44 +00:00
projection_transition_is_sane_with_si_epoch (
2015-07-04 05:57:38 +00:00
#projection_v1 { epoch_number = Epoch1 } = P1 ,
#projection_v1 { epoch_number = Epoch2 } = P2 ,
RelativeToServer , RetrospectiveP ) - >
2015-07-04 06:52:44 +00:00
case projection_transition_is_sane_except_si_epoch (
2015-07-04 05:57:38 +00:00
P1 , P2 , RelativeToServer , RetrospectiveP ) of
true - >
2015-07-04 06:52:44 +00:00
%% Must be a strictly increasing epoch.
2015-07-15 03:44:56 +00:00
case Epoch2 > Epoch1 of
true - >
? RETURN2 ( true ) ;
false - >
? RETURN2 ( { epoch_not_si , Epoch2 , 'not_gt' , Epoch1 } )
end ;
2015-07-04 05:57:38 +00:00
Else - >
2015-07-05 05:52:50 +00:00
? RETURN2 ( Else )
2015-07-04 05:57:38 +00:00
end .
2015-07-15 03:44:56 +00:00
%% @doc Check if a projection transition is sane & safe with the
%% exception of a strictly increasing epoch number (equality is ok).
%%
%% NOTE: The return value convention is `true' for sane/safe and
%% `term() /= true' for any unsafe/insane value.
2015-07-04 06:52:44 +00:00
projection_transition_is_sane_except_si_epoch (
2015-04-06 05:16:20 +00:00
#projection_v1 { epoch_number = Epoch1 ,
2015-08-06 08:48:22 +00:00
epoch_csum = CSum1 ,
creation_time = CreationTime1 ,
mode = CMode1 ,
author_server = AuthorServer1 ,
all_members = All_list1 ,
witnesses = Witness_list1 ,
down = Down_list1 ,
upi = UPI_list1 ,
repairing = Repairing_list1 ,
dbg = Dbg1 } = P1 ,
2015-04-06 05:16:20 +00:00
#projection_v1 { epoch_number = Epoch2 ,
2015-08-06 08:48:22 +00:00
epoch_csum = CSum2 ,
creation_time = CreationTime2 ,
mode = CMode2 ,
author_server = AuthorServer2 ,
all_members = All_list2 ,
witnesses = Witness_list2 ,
down = Down_list2 ,
upi = UPI_list2 ,
repairing = Repairing_list2 ,
dbg = Dbg2 } = P2 ,
2015-07-03 15:13:13 +00:00
RelativeToServer , _ _ TODO_RetrospectiveP ) - >
2015-07-05 05:52:50 +00:00
? RETURN2 ( undefined ) ,
2015-04-06 05:16:20 +00:00
try
%% General notes:
%%
%% I'm making no attempt to be "efficient" here. All of these data
WIP: 1st part of moving old chain state transtion code to new
Ha, famous last words, amirite?
%% The chain sequence/order checks at the bottom of this function aren't
%% as easy-to-read as they ought to be. However, I'm moderately confident
%% that it isn't buggy. TODO: refactor them for clarity.
So, now machi_chain_manager1:projection_transition_is_sane() is using
newer, far less buggy code to make sanity decisions.
TODO: Add support for Retrospective mode. TODO is it really needed?
Examples of how the old code sucks and the new code sucks less.
138> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxxx..x.xxxxxx..x.x....x..xx........................................................Failed! After 69 tests.
[a,b,c]
{c,[a,b,c],[c,b],b,[b,a],[b,a,c]}
Old_res ([335,192,166,160,153,139]): true
New_res: false (why line [1936])
Shrinking xxxxxxxxxxxx.xxxxxxx.xxx.xxxxxxxxxxxxxxxxx(3 times)
[a,b,c]
%% {Author1,UPI1, Repair1,Author2,UPI2, Repair2} %%
{c, [a,b,c],[], a, [b,a],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: we've swapped order of a & b, which is bad.
139> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxx..x...xx..........xxx..x..............x......x............................................(x10)...(x1)........Failed! After 120 tests.
[b,c,a]
{c,[c,a],[c],a,[a,b],[b,a]}
Old_res ([335,192,185,160,153,123]): true
New_res: false (why line [1936])
Shrinking xx.xxxxxx.x.xxxxxxxx.xxxxxxxxxxx(4 times)
[b,a,c]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{a, [c], [], c, [c,b],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: b wasn't repairing in the previous state.
150> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxx....x...xxxxx..xx.....x.......xxx..x.......xxx...................x................x......(x10).....(x1)........xFailed! After 130 tests.
[c,a,b]
{b,[c],[b,a,c],c,[c,a,b],[b]}
Old_res ([335,214,185,160,153,147]): true
New_res: false (why line [1936])
Shrinking xxxx.x.xxx.xxxxxxx.xxxxxxxxx(4 times)
[c,b,a]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{c, [c], [a,b], c, [c,b,a],[]}
Old_res ([335,328,185,160,153,111]): true
New_res: false (why line [1981,1679])
false
Old code is wrong: a & b were repairing but UPI2 has a & b in the wrong order.
2015-07-03 15:01:54 +00:00
%% structures are small, and the funcs aren't called zillions of times per
2015-04-06 05:16:20 +00:00
%% second.
2015-08-06 08:48:22 +00:00
CMode1 = CMode2 ,
2015-04-06 05:16:20 +00:00
true = is_integer ( Epoch1 ) andalso is_integer ( Epoch2 ) ,
true = is_binary ( CSum1 ) andalso is_binary ( CSum2 ) ,
{ _ , _ , _ } = CreationTime1 ,
{ _ , _ , _ } = CreationTime2 ,
2015-04-14 07:17:49 +00:00
true = is_atom ( AuthorServer1 ) andalso is_atom ( AuthorServer2 ) , % todo type may change?
2015-04-06 05:16:20 +00:00
true = is_list ( All_list1 ) andalso is_list ( All_list2 ) ,
2015-08-06 08:48:22 +00:00
true = is_list ( Witness_list1 ) andalso is_list ( Witness_list2 ) ,
2015-04-06 05:16:20 +00:00
true = is_list ( Down_list1 ) andalso is_list ( Down_list2 ) ,
true = is_list ( UPI_list1 ) andalso is_list ( UPI_list2 ) ,
true = is_list ( Repairing_list1 ) andalso is_list ( Repairing_list2 ) ,
true = is_list ( Dbg1 ) andalso is_list ( Dbg2 ) ,
2015-07-04 06:52:44 +00:00
%% Don't check for strictly increasing epoch here: that's the job of
%% projection_transition_is_sane_with_si_epoch().
true = Epoch2 > = Epoch1 ,
2015-04-06 05:16:20 +00:00
%% No duplicates
2015-08-06 08:48:22 +00:00
true = lists : sort ( Witness_list2 ) == lists : usort ( Witness_list2 ) ,
2015-04-06 05:16:20 +00:00
true = lists : sort ( Down_list2 ) == lists : usort ( Down_list2 ) ,
true = lists : sort ( UPI_list2 ) == lists : usort ( UPI_list2 ) ,
true = lists : sort ( Repairing_list2 ) == lists : usort ( Repairing_list2 ) ,
%% Disjoint-ness
2015-07-04 06:52:44 +00:00
All_list1 = All_list2 , % todo will probably change
2015-08-06 08:48:22 +00:00
%% true = lists:sort(All_list2) == lists:sort(Down_list2 ++ UPI_list2 ++
%% Repairing_list2),
[ ] = [ X | | X < - Witness_list2 , not lists : member ( X , All_list2 ) ] ,
2015-04-06 05:16:20 +00:00
[ ] = [ X | | X < - Down_list2 , not lists : member ( X , All_list2 ) ] ,
[ ] = [ X | | X < - UPI_list2 , not lists : member ( X , All_list2 ) ] ,
[ ] = [ X | | X < - Repairing_list2 , not lists : member ( X , All_list2 ) ] ,
DownS2 = sets : from_list ( Down_list2 ) ,
UPIS2 = sets : from_list ( UPI_list2 ) ,
RepairingS2 = sets : from_list ( Repairing_list2 ) ,
true = sets : is_disjoint ( DownS2 , UPIS2 ) ,
true = sets : is_disjoint ( DownS2 , RepairingS2 ) ,
true = sets : is_disjoint ( UPIS2 , RepairingS2 ) ,
2015-07-15 03:44:56 +00:00
%% We won't check the checksum of P1, but we will of P2.
P2 = machi_projection : update_checksum ( P2 ) ,
2015-04-06 05:16:20 +00:00
2015-08-31 13:14:28 +00:00
%% CP mode extra sanity checks
if CMode1 == cp_mode - >
Majority = full_majority_size ( All_list2 ) ,
if length ( UPI_list2 ) == 0 - >
ok ; % none projection
length ( UPI_list2 ) > = Majority - >
%% We have at least one non-witness
true = ( length ( UPI_list2 -- Witness_list2 ) > 0 ) ;
true - >
error ( { majority_not_met , UPI_list2 } )
end ;
CMode1 == ap_mode - >
ok
end ,
2015-07-05 05:52:50 +00:00
%% Hooray, all basic properties of the projection's elements are
%% not obviously bad. Now let's check if the UPI+Repairing->UPI
%% transition is good.
2015-08-06 08:48:22 +00:00
%%
%% NOTE: chain_state_transition_is_sane() only cares about strong
%% consistency violations and (because witness servers don't store
%% any data) doesn't care about witness servers. So we remove all
%% witnesses from the UPI lists before calling
%% chain_state_transition_is_sane()
UPI_list1w = UPI_list1 -- Witness_list1 ,
UPI_list2w = UPI_list2 -- Witness_list2 ,
WIP: 1st part of moving old chain state transtion code to new
Ha, famous last words, amirite?
%% The chain sequence/order checks at the bottom of this function aren't
%% as easy-to-read as they ought to be. However, I'm moderately confident
%% that it isn't buggy. TODO: refactor them for clarity.
So, now machi_chain_manager1:projection_transition_is_sane() is using
newer, far less buggy code to make sanity decisions.
TODO: Add support for Retrospective mode. TODO is it really needed?
Examples of how the old code sucks and the new code sucks less.
138> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxxx..x.xxxxxx..x.x....x..xx........................................................Failed! After 69 tests.
[a,b,c]
{c,[a,b,c],[c,b],b,[b,a],[b,a,c]}
Old_res ([335,192,166,160,153,139]): true
New_res: false (why line [1936])
Shrinking xxxxxxxxxxxx.xxxxxxx.xxx.xxxxxxxxxxxxxxxxx(3 times)
[a,b,c]
%% {Author1,UPI1, Repair1,Author2,UPI2, Repair2} %%
{c, [a,b,c],[], a, [b,a],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: we've swapped order of a & b, which is bad.
139> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxx..x...xx..........xxx..x..............x......x............................................(x10)...(x1)........Failed! After 120 tests.
[b,c,a]
{c,[c,a],[c],a,[a,b],[b,a]}
Old_res ([335,192,185,160,153,123]): true
New_res: false (why line [1936])
Shrinking xx.xxxxxx.x.xxxxxxxx.xxxxxxxxxxx(4 times)
[b,a,c]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{a, [c], [], c, [c,b],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: b wasn't repairing in the previous state.
150> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxx....x...xxxxx..xx.....x.......xxx..x.......xxx...................x................x......(x10).....(x1)........xFailed! After 130 tests.
[c,a,b]
{b,[c],[b,a,c],c,[c,a,b],[b]}
Old_res ([335,214,185,160,153,147]): true
New_res: false (why line [1936])
Shrinking xxxx.x.xxx.xxxxxxx.xxxxxxxxx(4 times)
[c,b,a]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{c, [c], [a,b], c, [c,b,a],[]}
Old_res ([335,328,185,160,153,111]): true
New_res: false (why line [1981,1679])
false
Old code is wrong: a & b were repairing but UPI2 has a & b in the wrong order.
2015-07-03 15:01:54 +00:00
? RETURN2 (
2015-08-06 08:48:22 +00:00
chain_state_transition_is_sane ( AuthorServer1 , UPI_list1w , Repairing_list1 ,
AuthorServer2 , UPI_list2w ) )
2015-04-06 05:16:20 +00:00
catch
_ Type : _ Err - >
WIP: 1st part of moving old chain state transtion code to new
Ha, famous last words, amirite?
%% The chain sequence/order checks at the bottom of this function aren't
%% as easy-to-read as they ought to be. However, I'm moderately confident
%% that it isn't buggy. TODO: refactor them for clarity.
So, now machi_chain_manager1:projection_transition_is_sane() is using
newer, far less buggy code to make sanity decisions.
TODO: Add support for Retrospective mode. TODO is it really needed?
Examples of how the old code sucks and the new code sucks less.
138> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxxx..x.xxxxxx..x.x....x..xx........................................................Failed! After 69 tests.
[a,b,c]
{c,[a,b,c],[c,b],b,[b,a],[b,a,c]}
Old_res ([335,192,166,160,153,139]): true
New_res: false (why line [1936])
Shrinking xxxxxxxxxxxx.xxxxxxx.xxx.xxxxxxxxxxxxxxxxx(3 times)
[a,b,c]
%% {Author1,UPI1, Repair1,Author2,UPI2, Repair2} %%
{c, [a,b,c],[], a, [b,a],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: we've swapped order of a & b, which is bad.
139> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxx..x...xx..........xxx..x..............x......x............................................(x10)...(x1)........Failed! After 120 tests.
[b,c,a]
{c,[c,a],[c],a,[a,b],[b,a]}
Old_res ([335,192,185,160,153,123]): true
New_res: false (why line [1936])
Shrinking xx.xxxxxx.x.xxxxxxxx.xxxxxxxxxxx(4 times)
[b,a,c]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{a, [c], [], c, [c,b],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: b wasn't repairing in the previous state.
150> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxx....x...xxxxx..xx.....x.......xxx..x.......xxx...................x................x......(x10).....(x1)........xFailed! After 130 tests.
[c,a,b]
{b,[c],[b,a,c],c,[c,a,b],[b]}
Old_res ([335,214,185,160,153,147]): true
New_res: false (why line [1936])
Shrinking xxxx.x.xxx.xxxxxxx.xxxxxxxxx(4 times)
[c,b,a]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{c, [c], [a,b], c, [c,b,a],[]}
Old_res ([335,328,185,160,153,111]): true
New_res: false (why line [1981,1679])
false
Old code is wrong: a & b were repairing but UPI2 has a & b in the wrong order.
2015-07-03 15:01:54 +00:00
? RETURN2 ( oops ) ,
2015-04-09 08:13:38 +00:00
S1 = machi_projection : make_summary ( P1 ) ,
S2 = machi_projection : make_summary ( P2 ) ,
2015-04-06 05:16:20 +00:00
Trace = erlang : get_stacktrace ( ) ,
2015-07-15 03:44:56 +00:00
%% There are basic data structure checks only, do not return `false'
%% here.
2015-04-06 05:16:20 +00:00
{ err , _ Type , _ Err , from , S1 , to , S2 , relative_to , RelativeToServer ,
2015-08-18 11:49:36 +00:00
react , get ( react ) ,
2015-04-06 05:16:20 +00:00
stack , Trace }
end .
2015-08-26 09:47:39 +00:00
poll_private_proj_is_upi_unanimous ( #ch_mgr { consistency_mode = ap_mode } = S ) - >
S ;
poll_private_proj_is_upi_unanimous ( #ch_mgr { consistency_mode = cp_mode ,
2015-08-27 08:58:43 +00:00
proj_unanimous = { _ , _ , _ } } = S ) - >
2015-08-28 09:37:11 +00:00
%% #ch_mgr{name=MyName, proj=Proj} = S,
%% io:format(user, "\nCONFIRM debug ~w skip poll for inner ~w outer ~w\n",
%% [MyName, (inner_projection_or_self(Proj))#projection_v1.epoch_number, Proj#projection_v1.epoch_number]),
2015-08-26 09:47:39 +00:00
S ;
poll_private_proj_is_upi_unanimous ( #ch_mgr { consistency_mode = cp_mode ,
proj_unanimous = false ,
proj = Proj } = S ) - >
if Proj #projection_v1.upi == [ ] % Nobody to poll?
orelse
Proj #projection_v1.epoch_number == 0 - > % Skip polling for epoch 0?
S ;
true - >
poll_private_proj_is_upi_unanimous_sleep ( 0 , S )
end .
poll_private_proj_is_upi_unanimous_sleep ( Count , S ) when Count > 2 - >
S ;
2015-08-29 09:30:53 +00:00
poll_private_proj_is_upi_unanimous_sleep ( Count , #ch_mgr { runenv = RunEnv } = S ) - >
Denom = case proplists : get_value ( use_partition_simulator , RunEnv , false ) of
true - >
20 ;
_ - >
1
end ,
timer : sleep ( ( ( Count * Count ) * 50 ) div Denom ) ,
2015-08-26 09:47:39 +00:00
case poll_private_proj_is_upi_unanimous3 ( S ) of
2015-08-27 08:58:43 +00:00
#ch_mgr { proj_unanimous = false } = S2 - >
poll_private_proj_is_upi_unanimous_sleep ( Count + 1 , S2 ) ;
2015-08-26 09:47:39 +00:00
S2 - >
2015-08-27 08:58:43 +00:00
S2
2015-08-26 09:47:39 +00:00
end .
2015-08-27 08:58:43 +00:00
poll_private_proj_is_upi_unanimous3 ( #ch_mgr { name = MyName , proj = P_current ,
2015-08-26 09:47:39 +00:00
opts = MgrOpts } = S ) - >
2015-08-27 08:58:43 +00:00
Proj_ios = inner_projection_or_self ( P_current ) ,
UPI = Proj_ios #projection_v1.upi ,
EpochID = machi_projection : make_epoch_id ( Proj_ios ) ,
2015-08-26 09:47:39 +00:00
{ Rs , S2 } = read_latest_projection_call_only2 ( private , UPI , S ) ,
2015-08-28 09:37:11 +00:00
Rs2 = [ if is_record ( R , projection_v1 ) - >
machi_projection : make_epoch_id ( inner_projection_or_self ( R ) ) ;
2015-08-26 09:47:39 +00:00
true - >
2015-08-28 09:37:11 +00:00
R % probably {error, unwritten}
end | | R < - Rs ] ,
2015-08-26 09:47:39 +00:00
case lists : usort ( Rs2 ) of
[ EID ] when EID == EpochID - >
2015-08-28 09:37:11 +00:00
%% We have a debugging problem, alas. It would be really great
%% if we could preserve the dbg2 info that's in the current
%% projection that's on disk. However, the full dbg2 list
%% with 'react' trace data isn't in the #ch_mgr.proj copy of
%% the projection. So, go read it from the store.
%%
%% But of course there's another small problem. P_current could
%% be the result of make_zerf(), which helps us "fast forward" to
%% a newer CP mode projection. And so what we just read in the
%% 'Rs' at the top of this function may be for a new epoch that
%% we've never seen before and therefore doesn't exist in our
%% local private projection store. But if it came from
%% make_zerf(), by definition it must be annotated, so don't try
%% to proceed any further.
ProxyPid = proxy_pid ( MyName , S ) ,
OuterEpoch = P_current #projection_v1.epoch_number ,
case ? FLU_PC : read_projection ( ProxyPid , private , OuterEpoch ) of
{ ok , P_currentFull } - >
Now = os : timestamp ( ) ,
Annotation = make_annotation ( EpochID , Now ) ,
NewDbg2 = [ Annotation | P_currentFull #projection_v1.dbg2 ] ,
NewProj = P_currentFull #projection_v1 { dbg2 = NewDbg2 } ,
ProjStore = case get_projection_store_regname ( MgrOpts ) of
undefined - >
machi_flu_psup : make_proj_supname ( MyName ) ;
PStr - >
PStr
end ,
#projection_v1 { epoch_number = _ EpochRep ,
epoch_csum = < < _ CSumRep : 4 / binary , _ / binary > > ,
upi = _ UPIRep ,
repairing = _ RepairingRep } =
inner_projection_or_self ( NewProj ) ,
io : format ( user , " \n CONFIRM epoch ~w ~w upi ~w rep ~w by ~w ~w \n " , [ _ EpochRep , _ CSumRep , _ UPIRep , _ RepairingRep , MyName , if P_current #projection_v1.inner == undefined - > outer ; true - > { inner , { outer , P_current #projection_v1.epoch_number } } end ] ) ,
ok = machi_projection_store : write ( ProjStore , private , NewProj ) ,
%% Unwedge our FLU.
{ ok , NotifyPid } = machi_projection_store : get_wedge_notify_pid ( ProjStore ) ,
_ = machi_flu1 : update_wedge_state ( NotifyPid , false , EpochID ) ,
S2 #ch_mgr { proj_unanimous = Now } ;
_ - >
S2
end ;
2015-08-26 09:47:39 +00:00
_ Else - >
2015-08-27 11:27:33 +00:00
%% io:format(user, "poll by ~w: want ~W got ~W\n",
%% [MyName, EpochID, 6, _Else, 8]),
2015-08-26 09:47:39 +00:00
S2
end .
2015-08-27 08:58:43 +00:00
poll_read_private_projections ( #projection_v1 { inner = undefined ,
epoch_number = Epoch ,
upi = UPI } = _ P_current , S ) - >
read_projection_call_only2 ( private , Epoch , UPI , S ) ;
poll_read_private_projections ( #projection_v1 { inner = _ not_undefined ,
upi = UPI } = _ P_current , S ) - >
%% For inner projections, we are (by definition) flapping, and the
%% outer epoch numbers are (by definition) unstable. However, any
%% observed use of the the inner proj epoch # is what we need.
read_latest_projection_call_only2 ( private , UPI , S ) .
2015-04-06 05:16:20 +00:00
sleep_ranked_order ( MinSleep , MaxSleep , FLU , FLU_list ) - >
2015-04-09 08:13:38 +00:00
USec = calc_sleep_ranked_order ( MinSleep , MaxSleep , FLU , FLU_list ) ,
timer : sleep ( USec ) ,
USec .
calc_sleep_ranked_order ( MinSleep , MaxSleep , FLU , FLU_list ) - >
Chain manager bug fixes & enhancment (more...)
* Set max length of a chain at -define(MAX_CHAIN_LENGTH, 64).
* Perturb tick sleep time of each manager
* If a chain manager L has zero members in its chain, and then its local
public projection store (authored by some remote author R) has a projection
that contains L, then adopt R's projection and start humming consensus.
* Handle "cross-talk" across projection stores, when chain membership
is changed administratively, e.g. chain was [a,b,c] then changed to merely
[a], but that change only happens on a. Servers b & c continue to use
stale projections and scribble their projection suggestions to a, causing
it to flap.
What's really cool about the flapping handling is that it *works*. I
wasn't thinking about this scenario when designing the flapping logic, but
it's really nifty that this extra scenario causes a to flap and then a's
inner projection remains stable, yay!
* Add complaints when "cross-talk" is observed.
* Fix flapping sleep time throttle.
* Fix bug in the machi_projection_store.erl's bookkeeping of the
max epoch number when flapping.
2015-05-11 09:41:45 +00:00
Front = lists : takewhile ( fun ( X ) - > X / = FLU end ,
lists : reverse ( lists : sort ( FLU_list ) ) ) ,
2015-05-06 02:41:04 +00:00
Index = length ( Front ) ,
2015-04-06 05:16:20 +00:00
NumNodes = length ( FLU_list ) ,
2015-04-30 08:28:43 +00:00
SleepChunk = if NumNodes == 0 - > 0 ;
2015-05-06 02:41:04 +00:00
true - > ( MaxSleep - MinSleep ) div NumNodes
2015-04-30 08:28:43 +00:00
end ,
2015-05-06 02:41:04 +00:00
MinSleep + ( SleepChunk * Index ) .
2015-04-06 05:16:20 +00:00
2015-06-02 11:32:52 +00:00
get_raw_flapping_i ( #projection_v1 { flap = F } ) - >
F .
2015-04-06 05:16:20 +00:00
get_flap_count ( P ) - >
2015-06-02 11:32:52 +00:00
case get_raw_flapping_i ( P ) of undefined - > { 0 , 0 } ;
F - > F #flap_i.flap_count
end .
2015-04-06 05:16:20 +00:00
get_all_flap_counts ( P ) - >
2015-06-02 11:32:52 +00:00
case get_raw_flapping_i ( P ) of undefined - > [ ] ;
F - > F #flap_i.all_flap_counts
end .
2015-04-06 05:16:20 +00:00
get_all_hosed ( P ) when is_record ( P , projection_v1 ) - >
2015-06-02 11:32:52 +00:00
case get_raw_flapping_i ( P ) of undefined - > [ ] ;
F - > F #flap_i.all_hosed
end .
2015-04-06 05:16:20 +00:00
merge_flap_counts ( FlapCounts ) - >
merge_flap_counts ( FlapCounts , orddict : new ( ) ) .
merge_flap_counts ( [ ] , D ) - >
orddict : to_list ( D ) ;
merge_flap_counts ( [ FlapCount | Rest ] , D1 ) - >
2015-08-18 11:49:36 +00:00
%% We know that FlapCount is list({Actor, {{_epk,FlapStartTime},NumFlapCount}}).
2015-04-06 05:16:20 +00:00
D2 = orddict : from_list ( FlapCount ) ,
2015-04-14 07:17:49 +00:00
D2 = orddict : from_list ( FlapCount ) ,
2015-04-06 05:16:20 +00:00
%% If the FlapStartTimes are identical, then pick the bigger flap count.
%% If the FlapStartTimes differ, then pick the larger start time tuple.
D3 = orddict : merge ( fun ( _ Key , { { _ , T1 } , NF1 } = V1 , { { _ , T2 } , NF2 } = V2 )
when T1 == T2 - >
if NF1 > NF2 - >
V1 ;
true - >
V2
end ;
( _ Key , { { _ , T1 } , _ NF1 } = V1 , { { _ , T2 } , _ NF2 } = V2 ) - >
if T1 > T2 - >
V1 ;
true - >
V2
end ;
( _ Key , V1 , V2 ) - >
exit ( { bad_merge_2tuples , mod , ? MODULE , line , ? LINE ,
_ Key , V1 , V2 } )
end , D1 , D2 ) ,
merge_flap_counts ( Rest , D3 ) .
2015-04-09 05:44:58 +00:00
proxy_pid ( Name , #ch_mgr { proxies_dict = ProxiesDict } ) - >
orddict : fetch ( Name , ProxiesDict ) .
2015-04-10 12:59:56 +00:00
gimme_random_uniform ( N , S ) - >
RunEnv1 = S #ch_mgr.runenv ,
Seed1 = proplists : get_value ( seed , RunEnv1 ) ,
{ X , Seed2 } = random : uniform_s ( N , Seed1 ) ,
RunEnv2 = [ { seed , Seed2 } | lists : keydelete ( seed , 1 , RunEnv1 ) ] ,
{ X , S #ch_mgr { runenv = RunEnv2 } } .
2015-06-02 11:55:18 +00:00
inner_projection_exists ( #projection_v1 { inner = undefined } ) - >
false ;
inner_projection_exists ( #projection_v1 { inner = _ } ) - >
true .
2015-04-10 12:59:56 +00:00
inner_projection_or_self ( P ) - >
2015-06-02 11:55:18 +00:00
case inner_projection_exists ( P ) of
false - >
2015-04-10 12:59:56 +00:00
P ;
2015-06-02 11:55:18 +00:00
true - >
P #projection_v1.inner
2015-04-10 12:59:56 +00:00
end .
2015-05-02 07:59:28 +00:00
make_chmgr_regname ( A ) when is_atom ( A ) - >
2015-04-30 08:28:43 +00:00
list_to_atom ( atom_to_list ( A ) ++ " _chmgr " ) ;
2015-05-02 07:59:28 +00:00
make_chmgr_regname ( B ) when is_binary ( B ) - >
2015-04-30 08:28:43 +00:00
list_to_atom ( binary_to_list ( B ) ++ " _chmgr " ) .
2015-07-03 07:18:40 +00:00
gobble_calls ( StaticCall ) - >
2015-07-02 19:30:05 +00:00
receive
2015-07-03 07:18:40 +00:00
{ '$gen_call' , From , { trigger_react_to_env } } - >
2015-07-02 19:30:05 +00:00
gen_server : reply ( From , todo_overload ) ,
2015-07-03 07:18:40 +00:00
gobble_calls ( StaticCall )
2015-07-02 19:30:05 +00:00
after 1 - > % after 0 angers pulse.
ok
end .
2015-04-06 05:16:20 +00:00
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2015-07-20 07:25:42 +00:00
perhaps_start_repair ( #ch_mgr { name = MyName ,
2015-08-05 07:05:03 +00:00
consistency_mode = CMode ,
2015-07-20 07:25:42 +00:00
repair_worker = undefined ,
proj = P_current } = S ) - >
case inner_projection_or_self ( P_current ) of
#projection_v1 { creation_time = Start ,
upi = [ _ | _ ] = UPI ,
repairing = [ _ | _ ] } - >
RepairId = { MyName , os : timestamp ( ) } ,
RepairOpts = [ { repair_mode , repair } , verbose , { repair_id , RepairId } ] ,
%% RepairOpts = [{repair_mode, check}, verbose],
2015-08-05 07:05:03 +00:00
RepairFun = fun ( ) - > do_repair ( S , RepairOpts , CMode ) end ,
2015-07-20 07:25:42 +00:00
LastUPI = lists : last ( UPI ) ,
IgnoreStabilityTime_p = proplists : get_value ( ignore_stability_time ,
S #ch_mgr.opts , false ) ,
case timer : now_diff ( os : timestamp ( ) , Start ) div 1000000 of
N when MyName == LastUPI andalso
( IgnoreStabilityTime_p orelse
N > = ? REPAIR_START_STABILITY_TIME ) - >
{ WorkerPid , _ Ref } = spawn_monitor ( RepairFun ) ,
S #ch_mgr { repair_worker = WorkerPid ,
repair_start = os : timestamp ( ) ,
repair_final_status = undefined } ;
_ - >
S
end ;
2015-05-11 10:50:13 +00:00
_ - >
S
end ;
perhaps_start_repair ( S ) - >
S .
2015-07-20 07:25:42 +00:00
do_repair ( #ch_mgr { name = MyName ,
2015-08-05 06:50:32 +00:00
proj = #projection_v1 { witnesses = Witness_list ,
upi = UPI0 ,
2015-07-20 07:25:42 +00:00
repairing = [ _ | _ ] = Repairing ,
members_dict = MembersDict } } = S ,
2015-08-05 07:05:03 +00:00
Opts , RepairMode ) - >
2015-05-12 14:37:20 +00:00
ETS = ets : new ( repair_stats , [ private , set ] ) ,
ETS_T_Keys = [ t_in_files , t_in_chunks , t_in_bytes ,
t_out_files , t_out_chunks , t_out_bytes ,
t_bad_chunks , t_elapsed_seconds ] ,
[ ets : insert ( ETS , { K , 0 } ) | | K < - ETS_T_Keys ] ,
2015-07-20 07:25:42 +00:00
{ ok , MyProj } = ? FLU_PC : read_latest_projection ( proxy_pid ( MyName , S ) ,
private ) ,
MyEpochID = machi_projection : get_epoch_id ( MyProj ) ,
RepairEpochIDs = [ case ? FLU_PC : read_latest_projection ( proxy_pid ( Rep , S ) ,
private ) of
{ ok , Proj } - >
machi_projection : get_epoch_id ( Proj ) ;
_ - >
unknown
end | | Rep < - Repairing ] ,
case lists : usort ( RepairEpochIDs ) of
[ MyEpochID ] - >
T1 = os : timestamp ( ) ,
RepairId = proplists : get_value ( repair_id , Opts , id1 ) ,
error_logger : info_msg (
" Repair start: tail ~p of ~p -> ~p , ~p ID ~w \n " ,
2015-08-05 06:50:32 +00:00
[ MyName , UPI0 , Repairing , RepairMode , RepairId ] ) ,
2015-07-20 07:25:42 +00:00
2015-08-05 06:50:32 +00:00
UPI = UPI0 -- Witness_list ,
2015-08-05 07:05:03 +00:00
Res = machi_chain_repair : repair ( RepairMode , MyName , Repairing , UPI ,
2015-07-20 07:25:42 +00:00
MembersDict , ETS , Opts ) ,
T2 = os : timestamp ( ) ,
Elapsed = ( timer : now_diff ( T2 , T1 ) div 1000 ) / 1000 ,
ets : insert ( ETS , { t_elapsed_seconds , Elapsed } ) ,
Summary = case Res of ok - > " success " ;
2015-05-12 14:37:20 +00:00
_ - > " FAILURE "
2015-07-20 07:25:42 +00:00
end ,
Stats = [ { K , ets : lookup_element ( ETS , K , 2 ) } | | K < - ETS_T_Keys ] ,
error_logger : info_msg (
" Repair ~s : tail ~p of ~p finished ~p repair ID ~w : "
" ~p \n Stats ~p \n " ,
2015-08-05 06:50:32 +00:00
[ Summary , MyName , UPI0 , RepairMode , RepairId ,
2015-07-20 07:25:42 +00:00
Res , Stats ] ) ,
ets : delete ( ETS ) ,
exit ( { repair_final_status , Res } ) ;
_ - >
exit ( not_all_in_same_epoch )
end .
2015-05-12 14:37:20 +00:00
2015-05-13 08:58:54 +00:00
sanitize_repair_state ( #ch_mgr { repair_final_status = Res ,
proj = #projection_v1 { upi = [ _ | _ ] } } = S )
2015-05-11 10:50:13 +00:00
when Res / = undefined - >
S #ch_mgr { repair_worker = undefined , repair_start = undefined ,
repair_final_status = undefined } ;
sanitize_repair_state ( S ) - >
S .
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2015-04-06 05:16:20 +00:00
perhaps_call_t ( S , Partitions , FLU , DoIt ) - >
try
perhaps_call ( S , Partitions , FLU , DoIt )
catch
exit : timeout - >
2015-07-14 08:17:14 +00:00
remember_partition_hack ( FLU ) ,
2015-05-01 15:33:49 +00:00
{ error , partition } ;
exit : { timeout , _ } - >
2015-07-14 08:17:14 +00:00
remember_partition_hack ( FLU ) ,
2015-05-01 15:33:49 +00:00
{ error , partition }
2015-04-06 05:16:20 +00:00
end .
2015-04-09 08:13:38 +00:00
perhaps_call ( #ch_mgr { name = MyName } = S , Partitions , FLU , DoIt ) - >
2015-04-09 05:44:58 +00:00
ProxyPid = proxy_pid ( FLU , S ) ,
2015-04-09 08:13:38 +00:00
RemoteFLU_p = FLU / = MyName ,
2015-05-01 15:33:49 +00:00
erase ( bad_sock ) ,
2015-04-06 05:16:20 +00:00
case RemoteFLU_p andalso lists : member ( { MyName , FLU } , Partitions ) of
false - >
2015-04-09 05:44:58 +00:00
Res = DoIt ( ProxyPid ) ,
2015-05-01 15:33:49 +00:00
if Res == { error , partition } - >
remember_partition_hack ( FLU ) ;
true - >
ok
end ,
2015-04-06 05:16:20 +00:00
case RemoteFLU_p andalso lists : member ( { FLU , MyName } , Partitions ) of
false - >
Res ;
_ - >
2015-04-09 08:13:38 +00:00
( catch put ( react , [ { timeout2 , me , MyName , to , FLU , RemoteFLU_p , Partitions } | get ( react ) ] ) ) ,
2015-04-06 05:16:20 +00:00
exit ( timeout )
end ;
_ - >
2015-04-09 08:13:38 +00:00
( catch put ( react , [ { timeout1 , me , MyName , to , FLU , RemoteFLU_p , Partitions } | get ( react ) ] ) ) ,
2015-04-06 05:16:20 +00:00
exit ( timeout )
end .
2015-05-01 15:33:49 +00:00
init_remember_partition_hack ( ) - >
put ( remember_partition_hack , [ ] ) .
remember_partition_hack ( FLU ) - >
put ( remember_partition_hack , [ FLU | get ( remember_partition_hack ) ] ) .
2015-04-06 05:16:20 +00:00
2015-05-01 15:33:49 +00:00
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2015-07-03 10:21:41 +00:00
%% @doc A simple technique for checking chain state transition safety.
%%
%% Math tells us that any change state `UPI1' plus `Repair1' to state
%% `UPI2' is OK as long as `UPI2' is a concatenation of some
%% order-preserving combination from `UPI1' with some order-preserving
%% combination from `Repair1'.
%%
%% ```
%% Good_UPI2s = [ X ++ Y || X <- machi_util:ordered_combinations(UPI1),
%% Y <- machi_util:ordered_combinations(Repair1)]'''
%%
%% Rather than creating that list and then checking if `UPI2' is in
%% it, we try a `diff'-like technique to check for basic state
%% transition safety. See docs for {@link mk/3} for more detail.
%%
%% ```
%% 2> machi_chain_manager1:mk([a,b], [], [a]).
%% {[keep,del],[]} %% good transition
%% 3> machi_chain_manager1:mk([a,b], [], [b,a]).
%% {[del,keep],[]} %% bad transition: too few 'keep' for UPI2's length 2
%% 4> machi_chain_manager1:mk([a,b], [c,d,e], [a,d]).
%% {[keep,del],[2]} %% good transition
%% 5> machi_chain_manager1:mk([a,b], [c,d,e], [a,bogus]).
%% {[keep,del],[error]} %% bad transition: 'bogus' not in Repair1'''
simple_chain_state_transition_is_sane ( UPI1 , Repair1 , UPI2 ) - >
2015-07-05 05:52:50 +00:00
? RETURN2 ( simple_chain_state_transition_is_sane ( undefined , UPI1 , Repair1 ,
undefined , UPI2 ) ) .
2015-07-03 13:05:35 +00:00
2015-07-15 03:44:56 +00:00
%% @doc Simple check if a projection transition is sane & safe: we assume
%% that the caller has checked basic projection data structure contents.
%%
%% NOTE: The return value convention is `true' for sane/safe and
%% `term() /= true' for any unsafe/insane value.
2015-07-03 15:13:13 +00:00
simple_chain_state_transition_is_sane ( _ Author1 , UPI1 , Repair1 , Author2 , UPI2 ) - >
2015-07-03 10:21:41 +00:00
{ KeepsDels , Orders } = mk ( UPI1 , Repair1 , UPI2 ) ,
NumKeeps = length ( [ x | | keep < - KeepsDels ] ) ,
NumOrders = length ( Orders ) ,
2015-07-15 03:44:56 +00:00
NoErrorInOrders = ( false == lists : member ( error , Orders ) ) ,
OrdersOK = ( Orders == lists : sort ( Orders ) ) ,
UPI2LengthOK = ( length ( UPI2 ) == NumKeeps + NumOrders ) ,
Answer1 = NoErrorInOrders andalso OrdersOK andalso UPI2LengthOK ,
catch ? REACT ( { simple , ? LINE ,
[ { sane , answer1 , Answer1 ,
author1 , _ Author1 , upi1 , UPI1 , repair1 , Repair1 ,
author2 , Author2 , upi2 , UPI2 ,
keepsdels , KeepsDels , orders , Orders , numKeeps , NumKeeps ,
2015-09-01 13:10:45 +00:00
numOrders , NumOrders , answer1 , Answer1 } ,
{ why2 , get ( why2 ) } ] } ) ,
2015-07-03 13:05:35 +00:00
if not Answer1 - >
2015-07-03 14:17:34 +00:00
? RETURN2 ( Answer1 ) ;
2015-07-03 13:05:35 +00:00
true - >
WIP: 1st part of moving old chain state transtion code to new
Ha, famous last words, amirite?
%% The chain sequence/order checks at the bottom of this function aren't
%% as easy-to-read as they ought to be. However, I'm moderately confident
%% that it isn't buggy. TODO: refactor them for clarity.
So, now machi_chain_manager1:projection_transition_is_sane() is using
newer, far less buggy code to make sanity decisions.
TODO: Add support for Retrospective mode. TODO is it really needed?
Examples of how the old code sucks and the new code sucks less.
138> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxxx..x.xxxxxx..x.x....x..xx........................................................Failed! After 69 tests.
[a,b,c]
{c,[a,b,c],[c,b],b,[b,a],[b,a,c]}
Old_res ([335,192,166,160,153,139]): true
New_res: false (why line [1936])
Shrinking xxxxxxxxxxxx.xxxxxxx.xxx.xxxxxxxxxxxxxxxxx(3 times)
[a,b,c]
%% {Author1,UPI1, Repair1,Author2,UPI2, Repair2} %%
{c, [a,b,c],[], a, [b,a],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: we've swapped order of a & b, which is bad.
139> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxx..x...xx..........xxx..x..............x......x............................................(x10)...(x1)........Failed! After 120 tests.
[b,c,a]
{c,[c,a],[c],a,[a,b],[b,a]}
Old_res ([335,192,185,160,153,123]): true
New_res: false (why line [1936])
Shrinking xx.xxxxxx.x.xxxxxxxx.xxxxxxxxxxx(4 times)
[b,a,c]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{a, [c], [], c, [c,b],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: b wasn't repairing in the previous state.
150> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxx....x...xxxxx..xx.....x.......xxx..x.......xxx...................x................x......(x10).....(x1)........xFailed! After 130 tests.
[c,a,b]
{b,[c],[b,a,c],c,[c,a,b],[b]}
Old_res ([335,214,185,160,153,147]): true
New_res: false (why line [1936])
Shrinking xxxx.x.xxx.xxxxxxx.xxxxxxxxx(4 times)
[c,b,a]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{c, [c], [a,b], c, [c,b,a],[]}
Old_res ([335,328,185,160,153,111]): true
New_res: false (why line [1981,1679])
false
Old code is wrong: a & b were repairing but UPI2 has a & b in the wrong order.
2015-07-03 15:01:54 +00:00
if Orders == [ ] - >
%% No repairing have joined UPI2. Keep original answer.
2015-07-03 14:17:34 +00:00
? RETURN2 ( Answer1 ) ;
2015-07-03 13:05:35 +00:00
Author2 == undefined - >
WIP: 1st part of moving old chain state transtion code to new
Ha, famous last words, amirite?
%% The chain sequence/order checks at the bottom of this function aren't
%% as easy-to-read as they ought to be. However, I'm moderately confident
%% that it isn't buggy. TODO: refactor them for clarity.
So, now machi_chain_manager1:projection_transition_is_sane() is using
newer, far less buggy code to make sanity decisions.
TODO: Add support for Retrospective mode. TODO is it really needed?
Examples of how the old code sucks and the new code sucks less.
138> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxxx..x.xxxxxx..x.x....x..xx........................................................Failed! After 69 tests.
[a,b,c]
{c,[a,b,c],[c,b],b,[b,a],[b,a,c]}
Old_res ([335,192,166,160,153,139]): true
New_res: false (why line [1936])
Shrinking xxxxxxxxxxxx.xxxxxxx.xxx.xxxxxxxxxxxxxxxxx(3 times)
[a,b,c]
%% {Author1,UPI1, Repair1,Author2,UPI2, Repair2} %%
{c, [a,b,c],[], a, [b,a],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: we've swapped order of a & b, which is bad.
139> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxx..x...xx..........xxx..x..............x......x............................................(x10)...(x1)........Failed! After 120 tests.
[b,c,a]
{c,[c,a],[c],a,[a,b],[b,a]}
Old_res ([335,192,185,160,153,123]): true
New_res: false (why line [1936])
Shrinking xx.xxxxxx.x.xxxxxxxx.xxxxxxxxxxx(4 times)
[b,a,c]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{a, [c], [], c, [c,b],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: b wasn't repairing in the previous state.
150> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxx....x...xxxxx..xx.....x.......xxx..x.......xxx...................x................x......(x10).....(x1)........xFailed! After 130 tests.
[c,a,b]
{b,[c],[b,a,c],c,[c,a,b],[b]}
Old_res ([335,214,185,160,153,147]): true
New_res: false (why line [1936])
Shrinking xxxx.x.xxx.xxxxxxx.xxxxxxxxx(4 times)
[c,b,a]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{c, [c], [a,b], c, [c,b,a],[]}
Old_res ([335,328,185,160,153,111]): true
New_res: false (why line [1981,1679])
false
Old code is wrong: a & b were repairing but UPI2 has a & b in the wrong order.
2015-07-03 15:01:54 +00:00
%% At least one Repairing1 element is now in UPI2.
%% We need Author2 to make better decision. Go
%% with what we know, silly caller for not giving
%% us what we need.
2015-07-03 14:17:34 +00:00
? RETURN2 ( Answer1 ) ;
2015-07-03 13:05:35 +00:00
Author2 / = undefined - >
WIP: 1st part of moving old chain state transtion code to new
Ha, famous last words, amirite?
%% The chain sequence/order checks at the bottom of this function aren't
%% as easy-to-read as they ought to be. However, I'm moderately confident
%% that it isn't buggy. TODO: refactor them for clarity.
So, now machi_chain_manager1:projection_transition_is_sane() is using
newer, far less buggy code to make sanity decisions.
TODO: Add support for Retrospective mode. TODO is it really needed?
Examples of how the old code sucks and the new code sucks less.
138> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxxx..x.xxxxxx..x.x....x..xx........................................................Failed! After 69 tests.
[a,b,c]
{c,[a,b,c],[c,b],b,[b,a],[b,a,c]}
Old_res ([335,192,166,160,153,139]): true
New_res: false (why line [1936])
Shrinking xxxxxxxxxxxx.xxxxxxx.xxx.xxxxxxxxxxxxxxxxx(3 times)
[a,b,c]
%% {Author1,UPI1, Repair1,Author2,UPI2, Repair2} %%
{c, [a,b,c],[], a, [b,a],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: we've swapped order of a & b, which is bad.
139> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxx..x...xx..........xxx..x..............x......x............................................(x10)...(x1)........Failed! After 120 tests.
[b,c,a]
{c,[c,a],[c],a,[a,b],[b,a]}
Old_res ([335,192,185,160,153,123]): true
New_res: false (why line [1936])
Shrinking xx.xxxxxx.x.xxxxxxxx.xxxxxxxxxxx(4 times)
[b,a,c]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{a, [c], [], c, [c,b],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: b wasn't repairing in the previous state.
150> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxx....x...xxxxx..xx.....x.......xxx..x.......xxx...................x................x......(x10).....(x1)........xFailed! After 130 tests.
[c,a,b]
{b,[c],[b,a,c],c,[c,a,b],[b]}
Old_res ([335,214,185,160,153,147]): true
New_res: false (why line [1936])
Shrinking xxxx.x.xxx.xxxxxxx.xxxxxxxxx(4 times)
[c,b,a]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{c, [c], [a,b], c, [c,b,a],[]}
Old_res ([335,328,185,160,153,111]): true
New_res: false (why line [1981,1679])
false
Old code is wrong: a & b were repairing but UPI2 has a & b in the wrong order.
2015-07-03 15:01:54 +00:00
%% At least one Repairing1 element is now in UPI2.
%% We permit only the tail to author such a UPI2.
2015-07-03 13:05:35 +00:00
case catch ( lists : last ( UPI1 ) ) of
UPI1_tail when UPI1_tail == Author2 - >
2015-07-03 14:17:34 +00:00
? RETURN2 ( true ) ;
2015-07-05 05:52:50 +00:00
UPI1_tail - >
2015-08-24 10:04:26 +00:00
? RETURN2 ( { expected_author2 , UPI1_tail ,
[ { upi1 , UPI1 } ,
{ repair1 , Repair1 } ,
{ author2 , Author2 } ,
{ upi2 , UPI2 } ] } )
2015-07-03 13:05:35 +00:00
end
end
end .
2015-07-03 10:21:41 +00:00
2015-07-15 03:44:56 +00:00
%% @doc Check if a projection transition is sane & safe: we assume
%% that the caller has checked basic projection data structure contents.
%%
%% NOTE: The return value convention is `true' for sane/safe and `term() /=
%% true' for any unsafe/insane value. This function (and its callee
%% functions) are the only functions (throughout all of the chain state
%% transition sanity checking functions) that is allowed to return `false'.
2015-07-03 10:21:41 +00:00
chain_state_transition_is_sane ( Author1 , UPI1 , Repair1 , Author2 , UPI2 ) - >
ToSelfOnly_p = if UPI2 == [ Author2 ] - > true ;
true - > false
end ,
2015-07-03 13:05:35 +00:00
Disjoint_UPIs = ordsets : is_disjoint ( ordsets : from_list ( UPI1 ) ,
ordsets : from_list ( UPI2 ) ) ,
WIP: 1st part of moving old chain state transtion code to new
Ha, famous last words, amirite?
%% The chain sequence/order checks at the bottom of this function aren't
%% as easy-to-read as they ought to be. However, I'm moderately confident
%% that it isn't buggy. TODO: refactor them for clarity.
So, now machi_chain_manager1:projection_transition_is_sane() is using
newer, far less buggy code to make sanity decisions.
TODO: Add support for Retrospective mode. TODO is it really needed?
Examples of how the old code sucks and the new code sucks less.
138> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxxx..x.xxxxxx..x.x....x..xx........................................................Failed! After 69 tests.
[a,b,c]
{c,[a,b,c],[c,b],b,[b,a],[b,a,c]}
Old_res ([335,192,166,160,153,139]): true
New_res: false (why line [1936])
Shrinking xxxxxxxxxxxx.xxxxxxx.xxx.xxxxxxxxxxxxxxxxx(3 times)
[a,b,c]
%% {Author1,UPI1, Repair1,Author2,UPI2, Repair2} %%
{c, [a,b,c],[], a, [b,a],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: we've swapped order of a & b, which is bad.
139> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxx..x...xx..........xxx..x..............x......x............................................(x10)...(x1)........Failed! After 120 tests.
[b,c,a]
{c,[c,a],[c],a,[a,b],[b,a]}
Old_res ([335,192,185,160,153,123]): true
New_res: false (why line [1936])
Shrinking xx.xxxxxx.x.xxxxxxxx.xxxxxxxxxxx(4 times)
[b,a,c]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{a, [c], [], c, [c,b],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: b wasn't repairing in the previous state.
150> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxx....x...xxxxx..xx.....x.......xxx..x.......xxx...................x................x......(x10).....(x1)........xFailed! After 130 tests.
[c,a,b]
{b,[c],[b,a,c],c,[c,a,b],[b]}
Old_res ([335,214,185,160,153,147]): true
New_res: false (why line [1936])
Shrinking xxxx.x.xxx.xxxxxxx.xxxxxxxxx(4 times)
[c,b,a]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{c, [c], [a,b], c, [c,b,a],[]}
Old_res ([335,328,185,160,153,111]): true
New_res: false (why line [1981,1679])
false
Old code is wrong: a & b were repairing but UPI2 has a & b in the wrong order.
2015-07-03 15:01:54 +00:00
%% This if statement contains the only exceptions that we make to
%% the judgement of simple_chain_state_transition_is_sane().
2015-07-03 10:21:41 +00:00
if ToSelfOnly_p - >
WIP: 1st part of moving old chain state transtion code to new
Ha, famous last words, amirite?
%% The chain sequence/order checks at the bottom of this function aren't
%% as easy-to-read as they ought to be. However, I'm moderately confident
%% that it isn't buggy. TODO: refactor them for clarity.
So, now machi_chain_manager1:projection_transition_is_sane() is using
newer, far less buggy code to make sanity decisions.
TODO: Add support for Retrospective mode. TODO is it really needed?
Examples of how the old code sucks and the new code sucks less.
138> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxxx..x.xxxxxx..x.x....x..xx........................................................Failed! After 69 tests.
[a,b,c]
{c,[a,b,c],[c,b],b,[b,a],[b,a,c]}
Old_res ([335,192,166,160,153,139]): true
New_res: false (why line [1936])
Shrinking xxxxxxxxxxxx.xxxxxxx.xxx.xxxxxxxxxxxxxxxxx(3 times)
[a,b,c]
%% {Author1,UPI1, Repair1,Author2,UPI2, Repair2} %%
{c, [a,b,c],[], a, [b,a],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: we've swapped order of a & b, which is bad.
139> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxx..x...xx..........xxx..x..............x......x............................................(x10)...(x1)........Failed! After 120 tests.
[b,c,a]
{c,[c,a],[c],a,[a,b],[b,a]}
Old_res ([335,192,185,160,153,123]): true
New_res: false (why line [1936])
Shrinking xx.xxxxxx.x.xxxxxxxx.xxxxxxxxxxx(4 times)
[b,a,c]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{a, [c], [], c, [c,b],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: b wasn't repairing in the previous state.
150> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxx....x...xxxxx..xx.....x.......xxx..x.......xxx...................x................x......(x10).....(x1)........xFailed! After 130 tests.
[c,a,b]
{b,[c],[b,a,c],c,[c,a,b],[b]}
Old_res ([335,214,185,160,153,147]): true
New_res: false (why line [1936])
Shrinking xxxx.x.xxx.xxxxxxx.xxxxxxxxx(4 times)
[c,b,a]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{c, [c], [a,b], c, [c,b,a],[]}
Old_res ([335,328,185,160,153,111]): true
New_res: false (why line [1981,1679])
false
Old code is wrong: a & b were repairing but UPI2 has a & b in the wrong order.
2015-07-03 15:01:54 +00:00
%% The transition is to UPI2=[Author2].
2015-07-04 06:52:44 +00:00
%% For AP mode, this transition is always safe (though not
%% always optimal for highest availability).
2015-07-03 14:17:34 +00:00
? RETURN2 ( true ) ;
2015-07-03 13:05:35 +00:00
Disjoint_UPIs - >
WIP: 1st part of moving old chain state transtion code to new
Ha, famous last words, amirite?
%% The chain sequence/order checks at the bottom of this function aren't
%% as easy-to-read as they ought to be. However, I'm moderately confident
%% that it isn't buggy. TODO: refactor them for clarity.
So, now machi_chain_manager1:projection_transition_is_sane() is using
newer, far less buggy code to make sanity decisions.
TODO: Add support for Retrospective mode. TODO is it really needed?
Examples of how the old code sucks and the new code sucks less.
138> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxxx..x.xxxxxx..x.x....x..xx........................................................Failed! After 69 tests.
[a,b,c]
{c,[a,b,c],[c,b],b,[b,a],[b,a,c]}
Old_res ([335,192,166,160,153,139]): true
New_res: false (why line [1936])
Shrinking xxxxxxxxxxxx.xxxxxxx.xxx.xxxxxxxxxxxxxxxxx(3 times)
[a,b,c]
%% {Author1,UPI1, Repair1,Author2,UPI2, Repair2} %%
{c, [a,b,c],[], a, [b,a],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: we've swapped order of a & b, which is bad.
139> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxx..x...xx..........xxx..x..............x......x............................................(x10)...(x1)........Failed! After 120 tests.
[b,c,a]
{c,[c,a],[c],a,[a,b],[b,a]}
Old_res ([335,192,185,160,153,123]): true
New_res: false (why line [1936])
Shrinking xx.xxxxxx.x.xxxxxxxx.xxxxxxxxxxx(4 times)
[b,a,c]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{a, [c], [], c, [c,b],[]}
Old_res ([338,185,160,153,147]): true
New_res: false (why line [1936])
false
Old code is wrong: b wasn't repairing in the previous state.
150> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
xxxxxxxxxxx....x...xxxxx..xx.....x.......xxx..x.......xxx...................x................x......(x10).....(x1)........xFailed! After 130 tests.
[c,a,b]
{b,[c],[b,a,c],c,[c,a,b],[b]}
Old_res ([335,214,185,160,153,147]): true
New_res: false (why line [1936])
Shrinking xxxx.x.xxx.xxxxxxx.xxxxxxxxx(4 times)
[c,b,a]
%% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
{c, [c], [a,b], c, [c,b,a],[]}
Old_res ([335,328,185,160,153,111]): true
New_res: false (why line [1981,1679])
false
Old code is wrong: a & b were repairing but UPI2 has a & b in the wrong order.
2015-07-03 15:01:54 +00:00
%% The transition from UPI1 -> UPI2 where the two are
%% disjoint/no FLUs in common.
2015-07-04 06:52:44 +00:00
%% For AP mode, this transition is always safe (though not
%% always optimal for highest availability).
2015-07-03 14:17:34 +00:00
? RETURN2 ( true ) ;
2015-07-03 10:21:41 +00:00
true - >
2015-07-05 05:52:50 +00:00
? RETURN2 (
simple_chain_state_transition_is_sane ( Author1 , UPI1 , Repair1 ,
Author2 , UPI2 ) )
2015-07-03 10:21:41 +00:00
end .
%% @doc Create a 2-tuple that describes how `UPI1' + `Repair1' are
%% transformed into `UPI2' in a chain state change.
%%
%% The 1st part of the 2-tuple is a list of `keep' and `del' instructions,
%% relative to the items in UPI1 and whether they are present (`keep') or
%% absent (`del') in `UPI2'.
%%
%% The 2nd part of the 2-tuple is `list(non_neg_integer()|error)' that
%% describes the relative order of items in `Repair1' that appear in
%% `UPI2'. The `error' atom is used to denote items not present in
%% `Repair1'.
mk ( UPI1 , Repair1 , UPI2 ) - >
mk ( UPI1 , Repair1 , UPI2 , [ ] ) .
mk ( [ X | UPI1 ] , Repair1 , [ X | UPI2 ] , Acc ) - >
mk ( UPI1 , Repair1 , UPI2 , [ keep | Acc ] ) ;
mk ( [ X | UPI1 ] , Repair1 , UPI2 , Acc ) - >
mk ( UPI1 , Repair1 , UPI2 -- [ X ] , [ del | Acc ] ) ;
mk ( [ ] , [ ] , [ ] , Acc ) - >
{ lists : reverse ( Acc ) , [ ] } ;
mk ( [ ] , Repair1 , UPI2 , Acc ) - >
{ lists : reverse ( Acc ) , machi_util : mk_order ( UPI2 , Repair1 ) } .
2015-07-15 12:58:21 +00:00
scan_dir ( Dir , FileFilterFun , FoldEachFun , FoldEachAcc ) - >
Files = filelib : wildcard ( Dir ++ " /* " ) ,
Xs = [ binary_to_term ( element ( 2 , file : read_file ( File ) ) ) | | File < - Files ] ,
Xs2 = FileFilterFun ( Xs ) ,
lists : foldl ( FoldEachFun , FoldEachAcc , Xs2 ) .
get_ps ( #projection_v1 { epoch_number = Epoch , dbg = Dbg } , Acc ) - >
[ { Epoch , proplists : get_value ( ps , Dbg , [ ] ) } | Acc ] .
strip_dbg2 ( P ) - >
P #projection_v1 { dbg2 = [ stripped ] } .
has_not_sane ( #projection_v1 { epoch_number = Epoch , dbg2 = Dbg2 } , Acc ) - >
Reacts = proplists : get_value ( react , Dbg2 , [ ] ) ,
case [ X | | { _ State , _ Line , [ not_sane | _ ] } = X < - Reacts ] of
[ ] - >
Acc ;
Xs - >
[ { Epoch , Xs } | Acc ]
end .
all_hosed_history ( #projection_v1 { epoch_number = _ Epoch , flap = Flap } ,
{ OldAllHosed , Acc } ) - >
AllHosed = if Flap == undefined - >
[ ] ;
true - >
Flap #flap_i.all_hosed
end ,
if AllHosed == OldAllHosed - >
{ OldAllHosed , Acc } ;
true - >
{ AllHosed , [ AllHosed | Acc ] }
end .
2015-07-21 09:43:59 +00:00
clear_flapping_state ( S ) - >
2015-08-22 12:27:01 +00:00
S2 = clear_most_flapping_state ( S ) ,
2015-08-25 09:43:55 +00:00
S2 #ch_mgr { not_sanes = orddict : new ( ) } .
2015-08-22 12:27:01 +00:00
clear_most_flapping_state ( S ) - >
2015-08-18 11:49:36 +00:00
S #ch_mgr { flap_count = 0 ,
2015-07-21 09:43:59 +00:00
flap_start = ? NOT_FLAPPING_START ,
2015-08-18 11:49:36 +00:00
%% Do not clear flap_last_up.
2015-08-22 12:27:01 +00:00
flap_counts_last = [ ] } .
2015-08-06 08:48:22 +00:00
2015-08-12 08:53:39 +00:00
full_majority_size ( N ) when is_integer ( N ) - >
( N div 2 ) + 1 ;
full_majority_size ( L ) when is_list ( L ) - >
full_majority_size ( length ( L ) ) .
make_zerf ( #projection_v1 { epoch_number = OldEpochNum ,
all_members = AllMembers ,
members_dict = MembersDict ,
2015-08-27 07:19:22 +00:00
witnesses = OldWitness_list ,
flap = OldFlap
2015-08-12 08:53:39 +00:00
} = _ LastProj ,
#ch_mgr { name = MyName ,
consistency_mode = cp_mode ,
runenv = RunEnv1 } = S ) - >
{ Up , _ Partitions , _ RunEnv2 } = calc_up_nodes ( MyName ,
AllMembers , RunEnv1 ) ,
2015-08-27 16:55:31 +00:00
( catch put ( yyy_hack , [ { up , Up } | get ( yyy_hack ) ] ) ) ,
2015-08-12 08:53:39 +00:00
MajoritySize = full_majority_size ( AllMembers ) ,
case length ( Up ) > = MajoritySize of
false - >
2015-08-29 09:01:13 +00:00
%% Make it appear like nobody is up now: we'll have to
%% wait until the Up list changes so that
%% zerf_find_last_common() can confirm a common stable
%% last stable epoch.
2015-08-30 10:53:47 +00:00
P = make_none_projection ( OldEpochNum ,
MyName , AllMembers , OldWitness_list ,
2015-08-29 09:01:13 +00:00
MembersDict ) ,
machi_projection : update_checksum (
2015-08-30 10:53:47 +00:00
P #projection_v1 { mode = cp_mode ,
2015-08-29 09:01:13 +00:00
flap = OldFlap ,
dbg2 = [ zerf_none , { up , Up } , { maj , MajoritySize } ] } ) ;
2015-08-12 08:53:39 +00:00
true - >
make_zerf2 ( OldEpochNum , Up , MajoritySize , MyName ,
2015-08-27 07:19:22 +00:00
AllMembers , OldWitness_list , MembersDict , OldFlap , S )
2015-08-12 08:53:39 +00:00
end .
2015-08-27 07:19:22 +00:00
make_zerf2 ( OldEpochNum , Up , MajoritySize , MyName , AllMembers , OldWitness_list ,
MembersDict , OldFlap , S ) - >
2015-08-12 08:53:39 +00:00
try
2015-08-28 11:06:09 +00:00
#projection_v1 { epoch_number = Epoch } = Proj =
zerf_find_last_common ( MajoritySize , Up , S ) ,
Proj2 = Proj #projection_v1 { flap = OldFlap , dbg2 = [ { make_zerf , Epoch } ] } ,
2015-08-27 11:27:33 +00:00
%% io:format(user, "ZERF ~w\n",[machi_projection:make_summary(Proj2)]),
2015-08-27 07:19:22 +00:00
Proj2
2015-08-12 08:53:39 +00:00
catch
throw : { zerf , no_common } - >
2015-08-27 07:19:22 +00:00
%% Epoch 0 special case: make the "all" projection.
%% calc_projection2() will then filter out any FLUs that
%% aren't currently up to create the first chain. If not
%% enough are up now, then it will fail to create a first
%% chain.
%%
%% If epoch 0 isn't the only epoch that we've looked at,
%% but we still couldn't find a common projection, then
%% we still need to default to the "all" projection and let
%% subsequent chain calculations do their calculations....
P = make_all_projection ( MyName , AllMembers , OldWitness_list ,
MembersDict ) ,
P2 =
machi_projection : update_checksum (
P #projection_v1 { epoch_number = OldEpochNum ,
mode = cp_mode ,
dbg2 = [ zerf_all ] } ) ,
2015-08-27 11:27:33 +00:00
%% io:format(user, "ZERF ~w\n",[machi_projection:make_summary(P2)]),
2015-08-27 07:19:22 +00:00
P2 ;
2015-08-12 08:53:39 +00:00
_ X : _ Y - >
throw ( { zerf , { damn_exception , Up , _ X , _ Y , erlang : get_stacktrace ( ) } } )
end .
2015-08-27 07:19:22 +00:00
zerf_find_last_common ( MajoritySize , Up , S ) - >
case lists : reverse (
lists : sort (
lists : flatten (
[ zerf_find_last_annotated ( FLU , MajoritySize , S ) | | FLU < - Up ] ) ) ) of
2015-08-12 08:53:39 +00:00
[ ] - >
2015-08-27 07:19:22 +00:00
throw ( { zerf , no_common } ) ;
2015-08-27 08:58:43 +00:00
[ P | _ ] = _ TheList - >
2015-08-27 07:19:22 +00:00
%% TODO is this simple sort really good enough?
P
end .
zerf_find_last_annotated ( FLU , MajoritySize , S ) - >
Proxy = proxy_pid ( FLU , S ) ,
{ ok , Epochs } = ? FLU_PC : list_all_projections ( Proxy , private , 60 * 1000 ) ,
P = lists : foldl (
2015-08-27 08:58:43 +00:00
fun ( _ Epoch , #projection_v1 { } = Proj ) - >
2015-08-27 07:19:22 +00:00
Proj ;
( Epoch , Acc ) - >
{ ok , Proj } = ? FLU_PC : read_projection ( Proxy , private ,
Epoch , ? TO * 10 ) ,
2015-08-27 16:55:31 +00:00
case is_annotated ( Proj ) of
false - >
( catch put ( yyy_hack , [ { FLU , Epoch , not_annotated } | get ( yyy_hack ) ] ) ) ,
2015-08-27 07:19:22 +00:00
Acc ;
{ { ConfEpoch , ConfCSum } , _ ConfTime } - >
Px = if ConfEpoch == Epoch - >
2015-08-27 16:55:31 +00:00
( catch put ( yyy_hack , [ { FLU , Epoch , outer_ok } | get ( yyy_hack ) ] ) ) ,
2015-08-27 07:19:22 +00:00
Proj ;
true - >
2015-08-27 16:55:31 +00:00
%% We only use Proj2 for sanity checking
%% here, do not return an inner!
2015-08-27 07:19:22 +00:00
Proj2 = inner_projection_or_self ( Proj ) ,
%% Sanity checking
ConfEpoch = Proj2 #projection_v1.epoch_number ,
ConfCSum = Proj2 #projection_v1.epoch_csum ,
2015-08-27 16:55:31 +00:00
( catch put ( yyy_hack , [ { FLU , Epoch , inner_ok_return_original_outerplusinner } | get ( yyy_hack ) ] ) ) ,
Proj
2015-08-27 07:19:22 +00:00
end ,
if length ( Px #projection_v1.upi ) > = MajoritySize - >
2015-08-27 16:55:31 +00:00
( catch put ( yyy_hack , [ { FLU , Epoch , yay } | get ( yyy_hack ) ] ) ) ,
2015-08-27 07:19:22 +00:00
Px ;
true - >
2015-08-27 16:55:31 +00:00
( catch put ( yyy_hack , [ { FLU , Epoch , skip } | get ( yyy_hack ) ] ) ) ,
2015-08-27 07:19:22 +00:00
Acc
end
end
2015-08-27 08:58:43 +00:00
end , first_accumulator , lists : reverse ( Epochs ) ) ,
2015-08-27 07:19:22 +00:00
if is_record ( P , projection_v1 ) - >
P ;
true - >
[ ] % lists:flatten() will destroy
2015-08-12 08:53:39 +00:00
end .
my_lists_split ( N , L ) - >
try
lists : split ( N , L )
catch
error : badarg - >
{ L , [ ] }
end .
2015-08-13 10:10:48 +00:00
2015-08-25 09:43:55 +00:00
diversion_c120_verbose_goop ( #projection_v1 { upi = [ ] , repairing = [ ] } , _ S ) - >
End-to-end chain state checking is still broken (more)
If we use verbose output from:
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]).
And use:
tail -f typescript_file | egrep --line-buffered 'SET|attempted|CONFIRM'
... then we can clearly see a chain safety violation when moving from
epoch 81 -> 83. I need to add more smarts to the safety checking,
both at the individual transition sanity check and at the converge_demo
overall rolling sanity check.
Key to output: CONFIRM by epoch {num} {csum} at {UPI} {Repairing}
SET # of FLUs = 3 members [a,b,c]).
CONFIRM by epoch 1 <<96,161,96,...>> at [a,b] [c]
CONFIRM by epoch 5 <<134,243,175,...>> at [b,c] []
CONFIRM by epoch 7 <<207,93,225,...>> at [b,c] []
CONFIRM by epoch 47 <<60,142,248,...>> at [b,c] []
SET partitions = [{c,b},{c,a}] (1 of 2) at {22,3,34}
CONFIRM by epoch 81 <<223,58,184,...>> at [a,b] []
SET partitions = [{b,c},{b,a}] (2 of 2) at {22,3,38}
CONFIRM by epoch 83 <<33,208,224,...>> at [a,c] []
SET partitions = []
CONFIRM by epoch 85 <<173,179,149,...>> at [a,c] [b]
2015-08-13 13:05:08 +00:00
ok ;
diversion_c120_verbose_goop ( Proj , S ) - >
case proplists : get_value ( private_write_verbose , S #ch_mgr.opts ) of
true - >
diversion_c120_verbose_goop2 ( Proj , S ) ;
_ - >
ok
end .
2015-08-18 11:49:36 +00:00
diversion_c120_verbose_goop2 ( P_latest0 , S ) - >
End-to-end chain state checking is still broken (more)
If we use verbose output from:
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]).
And use:
tail -f typescript_file | egrep --line-buffered 'SET|attempted|CONFIRM'
... then we can clearly see a chain safety violation when moving from
epoch 81 -> 83. I need to add more smarts to the safety checking,
both at the individual transition sanity check and at the converge_demo
overall rolling sanity check.
Key to output: CONFIRM by epoch {num} {csum} at {UPI} {Repairing}
SET # of FLUs = 3 members [a,b,c]).
CONFIRM by epoch 1 <<96,161,96,...>> at [a,b] [c]
CONFIRM by epoch 5 <<134,243,175,...>> at [b,c] []
CONFIRM by epoch 7 <<207,93,225,...>> at [b,c] []
CONFIRM by epoch 47 <<60,142,248,...>> at [b,c] []
SET partitions = [{c,b},{c,a}] (1 of 2) at {22,3,34}
CONFIRM by epoch 81 <<223,58,184,...>> at [a,b] []
SET partitions = [{b,c},{b,a}] (2 of 2) at {22,3,38}
CONFIRM by epoch 83 <<33,208,224,...>> at [a,c] []
SET partitions = []
CONFIRM by epoch 85 <<173,179,149,...>> at [a,c] [b]
2015-08-13 13:05:08 +00:00
P_latest = machi_projection : update_checksum ( P_latest0 #projection_v1 { dbg2 = [ ] } ) ,
2015-08-18 11:49:36 +00:00
Type = case inner_projection_exists ( P_latest ) of true - > " inner " ;
_ - > " outer "
end ,
#projection_v1 { epoch_number = Epoch , epoch_csum = CSum , upi = UPI ,
repairing = Repairing } = inner_projection_or_self ( P_latest ) ,
End-to-end chain state checking is still broken (more)
If we use verbose output from:
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]).
And use:
tail -f typescript_file | egrep --line-buffered 'SET|attempted|CONFIRM'
... then we can clearly see a chain safety violation when moving from
epoch 81 -> 83. I need to add more smarts to the safety checking,
both at the individual transition sanity check and at the converge_demo
overall rolling sanity check.
Key to output: CONFIRM by epoch {num} {csum} at {UPI} {Repairing}
SET # of FLUs = 3 members [a,b,c]).
CONFIRM by epoch 1 <<96,161,96,...>> at [a,b] [c]
CONFIRM by epoch 5 <<134,243,175,...>> at [b,c] []
CONFIRM by epoch 7 <<207,93,225,...>> at [b,c] []
CONFIRM by epoch 47 <<60,142,248,...>> at [b,c] []
SET partitions = [{c,b},{c,a}] (1 of 2) at {22,3,34}
CONFIRM by epoch 81 <<223,58,184,...>> at [a,b] []
SET partitions = [{b,c},{b,a}] (2 of 2) at {22,3,38}
CONFIRM by epoch 83 <<33,208,224,...>> at [a,c] []
SET partitions = []
CONFIRM by epoch 85 <<173,179,149,...>> at [a,c] [b]
2015-08-13 13:05:08 +00:00
UPI_Rs = UPI ++ Repairing ,
R = [ try
true = ( UPI_Rs / = [ ] ) ,
Proxy = proxy_pid ( FLU , S ) ,
{ ok , P } = ? FLU_PC : read_projection ( Proxy , private , Epoch ) ,
case machi_projection : update_checksum ( P #projection_v1 { dbg2 = [ ] } ) of
X when X == P_latest - >
FLU ;
_ - >
nope
end
catch _ : _ - >
definitely_not
end | | FLU < - UPI_Rs ] ,
if R == UPI_Rs - >
2015-08-18 11:49:36 +00:00
io : format ( user , " \n CONFIRM by epoch ~s ~p ~W at ~p ~p \n " ,
[ Type , Epoch , CSum , 4 , UPI , Repairing ] ) ;
End-to-end chain state checking is still broken (more)
If we use verbose output from:
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]).
And use:
tail -f typescript_file | egrep --line-buffered 'SET|attempted|CONFIRM'
... then we can clearly see a chain safety violation when moving from
epoch 81 -> 83. I need to add more smarts to the safety checking,
both at the individual transition sanity check and at the converge_demo
overall rolling sanity check.
Key to output: CONFIRM by epoch {num} {csum} at {UPI} {Repairing}
SET # of FLUs = 3 members [a,b,c]).
CONFIRM by epoch 1 <<96,161,96,...>> at [a,b] [c]
CONFIRM by epoch 5 <<134,243,175,...>> at [b,c] []
CONFIRM by epoch 7 <<207,93,225,...>> at [b,c] []
CONFIRM by epoch 47 <<60,142,248,...>> at [b,c] []
SET partitions = [{c,b},{c,a}] (1 of 2) at {22,3,34}
CONFIRM by epoch 81 <<223,58,184,...>> at [a,b] []
SET partitions = [{b,c},{b,a}] (2 of 2) at {22,3,38}
CONFIRM by epoch 83 <<33,208,224,...>> at [a,c] []
SET partitions = []
CONFIRM by epoch 85 <<173,179,149,...>> at [a,c] [b]
2015-08-13 13:05:08 +00:00
true - >
ok
end .
2015-08-13 09:45:15 +00:00
perhaps_verbose_c110 ( P_latest2 , S ) - >
case proplists : get_value ( private_write_verbose , S #ch_mgr.opts ) of
true - >
{ _ , _ , C } = os : timestamp ( ) ,
MSec = trunc ( C / 1000 ) ,
{ HH , MM , SS } = time ( ) ,
2015-08-28 09:37:11 +00:00
Dbg2X = lists : keydelete ( react , 1 ,
P_latest2 #projection_v1.dbg2 ) ++
[ { is_annotated , is_annotated ( P_latest2 ) } ] ,
P_latest2x = P_latest2 #projection_v1 { dbg2 = Dbg2X } , % limit verbose len.
2015-08-13 09:45:15 +00:00
case inner_projection_exists ( P_latest2 ) of
false - >
Last2 = get ( last_verbose ) ,
Summ2 = machi_projection : make_summary ( P_latest2x ) ,
case proplists : get_value ( private_write_verbose ,
S #ch_mgr.opts ) of
2015-08-28 09:37:11 +00:00
true - >
%% true when Summ2 /= Last2 ->
2015-08-13 09:45:15 +00:00
put ( last_verbose , Summ2 ) ,
2015-08-22 14:30:30 +00:00
? V ( " \n ~2..0w : ~2..0w : ~2..0w . ~3..0w ~p uses plain: ~w \n " ,
[ HH , MM , SS , MSec , S #ch_mgr.name , Summ2 ] ) ;
2015-08-13 09:45:15 +00:00
_ - >
ok
end ;
true - >
Last2 = get ( last_verbose ) ,
P_inner = inner_projection_or_self ( P_latest2 ) ,
2015-08-28 09:37:11 +00:00
P_innerx = P_inner #projection_v1 { dbg2 = Dbg2X } , % limit verbose len.
2015-08-13 09:45:15 +00:00
Summ2 = machi_projection : make_summary ( P_innerx ) ,
case proplists : get_value ( private_write_verbose ,
S #ch_mgr.opts ) of
2015-08-28 09:37:11 +00:00
true - >
%% true when Summ2 /= Last2 ->
2015-08-13 09:45:15 +00:00
put ( last_verbose , Summ2 ) ,
2015-08-25 08:01:14 +00:00
? V ( " \n ~2..0w : ~2..0w : ~2..0w . ~3..0w ~p uses inner: ~w (outer ~w auth ~w flap ~w ) \n " ,
[ HH , MM , SS , MSec , S #ch_mgr.name , Summ2 , P_latest2 #projection_v1.epoch_number , P_latest2 #projection_v1.author_server , P_latest2 #projection_v1.flap ] ) ;
2015-08-13 09:45:15 +00:00
_ - >
ok
end
end ;
_ - >
ok
end .
2015-08-14 13:29:20 +00:00
digraph_magic ( All_list , HosedAnnotations ) - >
G = digraph : new ( ) ,
[ digraph : add_vertex ( G , V ) | | V < - All_list ] ,
[ digraph : add_edge ( G , V1 , V2 ) | | { V1 , problem_with , V2 } < - HosedAnnotations ] ,
calc_magic_down ( lists : sort ( digraph : vertices ( G ) ) , G ) .
calc_magic_down ( [ ] , G ) - >
digraph : delete ( G ) ,
[ ] ;
calc_magic_down ( [ H | T ] , G ) - >
case digraph : in_degree ( G , H ) of
0 - >
calc_magic_down ( T , G ) ;
1 - >
Neighbors = digraph : in_neighbours ( G , H ) ,
case [ V | | V < - Neighbors , digraph : in_degree ( G , V ) == 1 ] of
[ AlsoOne | _ ] - >
%% TODO: be smarter here about the choice of which is down.
[ H | calc_magic_down ( T -- [ AlsoOne ] , G ) ] ;
[ ] - >
%% H is "on the end", e.g. 1-2-1, so it's OK.
calc_magic_down ( T , G )
end ;
N when N > 1 - >
[ H | calc_magic_down ( T , G ) ]
end .
2015-08-18 11:49:36 +00:00
2015-08-20 08:32:46 +00:00
search_last_flap_counts ( FLU , FlapCountsLast ) - >
proplists : get_value ( FLU , FlapCountsLast , undefined ) .
2015-08-26 09:47:39 +00:00
calc_consistency_mode ( _ Witness_list = [ ] ) - >
ap_mode ;
calc_consistency_mode ( _ Witness_list ) - >
cp_mode .
set_proj ( S , Proj ) - >
S #ch_mgr { proj = Proj , proj_unanimous = false } .
2015-08-27 08:58:43 +00:00
make_annotation ( EpochID , Time ) - >
{ private_proj_is_upi_unanimous , { EpochID , Time } } .
2015-08-27 16:55:31 +00:00
is_annotated ( #projection_v1 { dbg2 = Dbg2 } ) - >
proplists : get_value ( private_proj_is_upi_unanimous , Dbg2 , false ) .
2015-08-28 11:06:09 +00:00
2015-08-31 08:57:37 +00:00
make_basic_comparison_stable ( P ) - >
P #projection_v1 { creation_time = undefined ,
flap = undefined ,
dbg = [ ] ,
dbg2 = [ ] ,
members_dict = [ ] } .
2015-08-29 10:59:46 +00:00
has_make_zerf_annotation ( P ) - >
case proplists : get_value ( make_zerf , P #projection_v1.dbg2 ) of
Z_epoch when Z_epoch == P #projection_v1.epoch_number - >
true ;
_ - >
false
end .