From ed56a2c6cfb688b3051298bfcd3de4c4af938514 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 19 Feb 2016 14:49:24 +0900 Subject: [PATCH 01/24] Fix 'ranch' app dependency upon re-start w/FLUs configured ... and allow direct start by machi_sup for EUnit tests. --- src/machi.app.src | 4 ++-- src/machi_sup.erl | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/machi.app.src b/src/machi.app.src index c26154f..a9f96f0 100644 --- a/src/machi.app.src +++ b/src/machi.app.src @@ -1,7 +1,7 @@ {application, machi, [ {description, "A village of write-once files."}, - {vsn, "0.0.0"}, - {applications, [kernel, stdlib, crypto, cluster_info]}, + {vsn, "0.0.1"}, + {applications, [kernel, stdlib, crypto, cluster_info, ranch]}, {mod,{machi_app,[]}}, {registered, []}, {env, [ diff --git a/src/machi_sup.erl b/src/machi_sup.erl index 6cf7695..f7ddd10 100644 --- a/src/machi_sup.erl +++ b/src/machi_sup.erl @@ -65,5 +65,11 @@ init([]) -> LifecycleMgr = {machi_lifecycle_mgr, {machi_lifecycle_mgr, start_link, []}, Restart, Shutdown, worker, []}, - - {ok, {SupFlags, [ServerSup, RanchSup, LifecycleMgr]}}. + RunningApps = [A || {A,_D,_V} <- application:which_applications()], + Specs = case lists:member(ranch, RunningApps) of + true -> + [ServerSup, LifecycleMgr]; + false -> + [ServerSup, RanchSup, LifecycleMgr] + end, + {ok, {SupFlags, Specs}}. 
From affad6b1d31ccf0d37c7c84792dfb7f650b2c490 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 19 Feb 2016 14:56:53 +0900 Subject: [PATCH 02/24] Specify short timeout to ?FLU_PC:kick_projection_reaction() call --- src/machi_chain_manager1.erl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index bdc142d..1436f44 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2094,11 +2094,10 @@ react_to_env_C200(Retries, P_latest, S) -> ?REACT(c200), try AuthorProxyPid = proxy_pid(P_latest#projection_v1.author_server, S), - ?FLU_PC:kick_projection_reaction(AuthorProxyPid, []) + %% This is just advisory, we don't need a sync reply. + ?FLU_PC:kick_projection_reaction(AuthorProxyPid, [], 100) catch _Type:_Err -> - %% ?V("TODO: tell_author_yo is broken: ~p ~p\n", - %% [_Type, _Err]), - ok + ok end, react_to_env_C210(Retries, S). From d5c3da78fb56666a40126c3212f1901b24173d41 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 19 Feb 2016 15:29:17 +0900 Subject: [PATCH 03/24] Change 'COMMIT epoch' logging & chain mgr options --- rel/files/app.config | 4 ++ src/machi_chain_manager1.erl | 42 ++++++++++----------- test/machi_chain_manager1_converge_demo.erl | 1 + 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/rel/files/app.config b/rel/files/app.config index eb330f3..a2c55ee 100644 --- a/rel/files/app.config +++ b/rel/files/app.config @@ -16,6 +16,10 @@ %% Default = 10 %% {metadata_manager_count, 2}, + %% Default options for chain manager processes. 
+ %% {chain_manager_opts, [{private_write_verbose,true}, + %% {private_write_verbose_confirm,true}]}, + %% Platform vars (mirror of reltool packaging) {platform_data_dir, "{{platform_data_dir}}"}, {platform_etc_dir, "{{platform_etc_dir}}"}, diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 1436f44..4c826a1 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -234,11 +234,13 @@ test_read_latest_public_projection(Pid, ReadRepairP) -> %% manager's pid in MgrOpts and use direct gen_server calls to the %% local projection store. -init({MyName, InitMembersDict, MgrOpts}) -> +init({MyName, InitMembersDict, MgrOpts0}) -> put(ttt, [?LINE]), _ = random:seed(now()), init_remember_down_list(), + MgrOpts = MgrOpts0 ++ application:get_env(machi, chain_manager_opts, []), Opt = fun(Key, Default) -> proplists:get_value(Key, MgrOpts, Default) end, + InitWitness_list = Opt(witnesses, []), ZeroAll_list = [P#p_srvr.name || {_,P} <- orddict:to_list(InitMembersDict)], ZeroProj = make_none_projection(0, MyName, ZeroAll_list, @@ -2060,7 +2062,6 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H, ?REACT(c120), H2 = add_and_trunc_history(P_latest, H, ?MAX_HISTORY_LENGTH), - %% diversion_c120_verbose_goop(P_latest, S), ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}), S2 = set_proj(S#ch_mgr{proj_history=H2, sane_transitions=Xtns + 1}, P_latest), @@ -2487,9 +2488,9 @@ poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current} = S) -> upi=_UPIRep, repairing=_RepairingRep} = NewProj, ok = machi_projection_store:write(ProjStore, private, NewProj), - case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of + case proplists:get_value(private_write_verbose_confirm, S#ch_mgr.opts) of true -> - io:format(user, "\n~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), _EpochRep, _CSumRep, _UPIRep, _RepairingRep, MyName]); + error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w 
rep ~w by ~w\n", [_EpochRep, _CSumRep, _UPIRep, _RepairingRep, MyName]); _ -> ok end, @@ -2965,34 +2966,33 @@ zerf_find_last_annotated(FLU, MajoritySize, S) -> [] % lists:flatten() will destroy end. -perhaps_verbose_c111(P_latest2, S) -> - case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of - true -> +perhaps_verbose_c111(P_latest2, #ch_mgr{name=MyName, opts=Opts}=S) -> + PrivWriteVerb = proplists:get_value(private_write_verbose, Opts, false), + PrivWriteVerbCONFIRM = proplists:get_value(private_write_verbose_confirm, Opts, false), + if PrivWriteVerb orelse PrivWriteVerbCONFIRM -> Dbg2X = lists:keydelete(react, 1, P_latest2#projection_v1.dbg2) ++ [{is_annotated,is_annotated(P_latest2)}], P_latest2x = P_latest2#projection_v1{dbg2=Dbg2X}, % limit verbose len. Last2 = get(last_verbose), Summ2 = machi_projection:make_summary(P_latest2x), - if P_latest2#projection_v1.upi == [], - (S#ch_mgr.proj)#projection_v1.upi /= [] -> - <> = - P_latest2#projection_v1.epoch_csum, - io:format(user, "~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), (S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, S#ch_mgr.name]); + if PrivWriteVerb, Summ2 /= Last2 -> + put(last_verbose, Summ2), + ?V("\n~s ~p uses plain: ~w \n", + [machi_util:pretty_time(), MyName, Summ2]); true -> ok end, - case proplists:get_value(private_write_verbose, - S#ch_mgr.opts) of - %% case true of - true when Summ2 /= Last2 -> - put(last_verbose, Summ2), - ?V("\n~s ~p uses plain: ~w \n", - [machi_util:pretty_time(), S#ch_mgr.name, Summ2]); - _ -> + if PrivWriteVerbCONFIRM, + P_latest2#projection_v1.upi == [], + (S#ch_mgr.proj)#projection_v1.upi /= [] -> + <> = + P_latest2#projection_v1.epoch_csum, + error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [(S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, S#ch_mgr.name]); + true -> ok end; - _ 
-> + true -> ok end. diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index cee7a78..4ef1cfa 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -134,6 +134,7 @@ Press control-c to interrupt the test....". %% convergence_demo_testfun(3). -define(DEFAULT_MGR_OPTS, [{private_write_verbose, false}, + {private_write_verbose_confirm, true}, {active_mode,false}, {use_partition_simulator, true}]). From 0f543b4c4d805caa988029e7f4a6d44b5f3a4594 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 19 Feb 2016 16:30:18 +0900 Subject: [PATCH 04/24] Add author_server to CONFIRM messages --- src/machi_chain_manager1.erl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 4c826a1..65e6a69 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2485,12 +2485,13 @@ poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current} = S) -> ProjStore = get_projection_store_pid_or_regname(S), #projection_v1{epoch_number=_EpochRep, epoch_csum= <<_CSumRep:4/binary,_/binary>>, + author_server=AuthRep, upi=_UPIRep, repairing=_RepairingRep} = NewProj, ok = machi_projection_store:write(ProjStore, private, NewProj), case proplists:get_value(private_write_verbose_confirm, S#ch_mgr.opts) of true -> - error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [_EpochRep, _CSumRep, _UPIRep, _RepairingRep, MyName]); + error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w rep ~w auth ~w by ~w\n", [_EpochRep, _CSumRep, _UPIRep, _RepairingRep, AuthRep, MyName]); _ -> ok end, @@ -2988,7 +2989,7 @@ perhaps_verbose_c111(P_latest2, #ch_mgr{name=MyName, opts=Opts}=S) -> (S#ch_mgr.proj)#projection_v1.upi /= [] -> <> = P_latest2#projection_v1.epoch_csum, - error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [(S#ch_mgr.proj)#projection_v1.epoch_number, 
CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, S#ch_mgr.name]); + error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w rep ~w auth ~w by ~w\n", [(S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, P_latest2#projection_v1.author_server, S#ch_mgr.name]); true -> ok end; From 2e46d199c8e540f8d0678650734d14dd527d11df Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 19 Feb 2016 16:38:43 +0900 Subject: [PATCH 05/24] Export csum_tag() type --- src/machi_dt.erl | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/machi_dt.erl b/src/machi_dt.erl index 6a57e86..0af3bb4 100644 --- a/src/machi_dt.erl +++ b/src/machi_dt.erl @@ -32,6 +32,12 @@ -type chunk_summary() :: {file_offset(), chunk_size(), chunk_bin(), chunk_cstrm()}. -type chunk_pos() :: {file_offset(), chunk_size(), file_name_s()}. -type chunk_size() :: non_neg_integer(). + +%% Tags that stand for how that checksum was generated. See +%% machi_util:make_tagged_csum/{1,2} for further documentation and +%% implementation. +-type csum_tag() :: none | client_sha | server_sha | server_regen_sha. + -type error_general() :: 'bad_arg' | 'wedged' | 'bad_checksum'. -type epoch_csum() :: binary(). -type epoch_num() :: -1 | non_neg_integer(). @@ -53,11 +59,6 @@ -type read_opts() :: #read_opts{}. -type read_opts_x() :: 'undefined' | 'noopt' | 'none' | #read_opts{}. -%% Tags that stand for how that checksum was generated. See -%% machi_util:make_tagged_csum/{1,2} for further documentation and -%% implementation. --type csum_tag() :: none | client_sha | server_sha | server_regen_sha. 
- -export_type([ append_opts/0, chunk/0, @@ -68,6 +69,7 @@ chunk_summary/0, chunk_pos/0, chunk_size/0, + csum_tag/0, error_general/0, epoch_csum/0, epoch_num/0, From 53ce6d89dd9febabf9e32c760b8d5aa80bcd2d93 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 19 Feb 2016 18:02:56 +0900 Subject: [PATCH 06/24] Add verbose() option to machi_fitness --- src/machi_fitness.erl | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/machi_fitness.erl b/src/machi_fitness.erl index 2b54244..70af62a 100644 --- a/src/machi_fitness.erl +++ b/src/machi_fitness.erl @@ -108,6 +108,7 @@ handle_call({update_local_down_list, Down, MembersDict}, _From, #state{my_flu_name=MyFluName, pending_map=OldMap, local_down=OldDown, members_dict=OldMembersDict, admin_down=AdminDown}=S) -> + verbose("FITNESS: ~w has down suspect ~w\n", [MyFluName, Down]), NewMap = store_in_map(OldMap, MyFluName, erlang:now(), Down, AdminDown, [props_yo]), S2 = if Down == OldDown, MembersDict == OldMembersDict -> @@ -119,13 +120,17 @@ handle_call({update_local_down_list, Down, MembersDict}, _From, end, {reply, ok, S2#state{local_down=Down}}; handle_call({add_admin_down, DownFLU, DownProps}, _From, - #state{local_down=OldDown, admin_down=AdminDown}=S) -> + #state{my_flu_name=MyFluName, + local_down=OldDown, admin_down=AdminDown}=S) -> + verbose("FITNESS: ~w add admin down ~w\n", [MyFluName, DownFLU]), NewAdminDown = [{DownFLU,DownProps}|lists:keydelete(DownFLU, 1, AdminDown)], S3 = finish_admin_down(erlang:now(), OldDown, NewAdminDown, [props_yo], S), {reply, ok, S3}; handle_call({delete_admin_down, DownFLU}, _From, - #state{local_down=OldDown, admin_down=AdminDown}=S) -> + #state{my_flu_name=MyFluName, + local_down=OldDown, admin_down=AdminDown}=S) -> + verbose("FITNESS: ~w delete admin down ~w\n", [MyFluName, DownFLU]), NewAdminDown = lists:keydelete(DownFLU, 1, AdminDown), S3 = finish_admin_down(erlang:now(), OldDown, NewAdminDown, [props_yo], S), @@ 
-143,7 +148,8 @@ handle_call(_Request, _From, S) -> handle_cast(_Msg, S) -> {noreply, S}. -handle_info({adjust_down_list, FLU}, #state{active_unfit=ActiveUnfit}=S) -> +handle_info({adjust_down_list, FLU}, #state{my_flu_name=MyFluName, + active_unfit=ActiveUnfit}=S) -> NewUnfit = make_unfit_list(S), Added_to_new = NewUnfit -- ActiveUnfit, Dropped_from_new = ActiveUnfit -- NewUnfit, @@ -184,9 +190,11 @@ handle_info({adjust_down_list, FLU}, #state{active_unfit=ActiveUnfit}=S) -> {true, true} -> error({bad, ?MODULE, ?LINE, FLU, ActiveUnfit, NewUnfit}); {true, false} -> - {noreply, S#state{active_unfit=lists:usort(ActiveUnfit ++ [FLU])}}; + NewActive = wrap_active(MyFluName,lists:usort(ActiveUnfit++[FLU])), + {noreply, S#state{active_unfit=NewActive}}; {false, true} -> - {noreply, S#state{active_unfit=ActiveUnfit -- [FLU]}}; + NewActive = wrap_active(MyFluName,ActiveUnfit--[FLU]), + {noreply, S#state{active_unfit=NewActive}}; {false, false} -> {noreply, S} end; @@ -424,6 +432,18 @@ map_value(Map) -> map_merge(Map1, Map2) -> ?MAP:merge(Map1, Map2). +wrap_active(MyFluName, L) -> + verbose("FITNESS: ~w has new down list ~w\n", [MyFluName, L]), + L. + +verbose(Fmt, Args) -> + case application:get_env(machi, fitness_verbose) of + {ok, true} -> + error_logger:info_msg(Fmt, Args); + _ -> + ok + end. + -ifdef(TEST). 
dt_understanding_test() -> From 1d8bc198918d277b63140042d8fe11775548267e Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Mon, 22 Feb 2016 16:48:02 +0900 Subject: [PATCH 07/24] Fix repair-is-finished-but-message-not-consumed DoS during peer SIGSTOP --- src/machi_chain_manager1.erl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 65e6a69..18d92a3 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -390,6 +390,7 @@ handle_cast(_Cast, S) -> handle_info(tick_check_environment, #ch_mgr{ignore_timer=true}=S) -> {noreply, S}; handle_info(tick_check_environment, S) -> + gobble_ticks(), {{_Delta, Props, _Epoch}, S1} = do_react_to_env(S), S2 = sanitize_repair_state(S1), S3 = perhaps_start_repair(S2), @@ -2538,6 +2539,14 @@ gobble_calls(StaticCall) -> ok end. +gobble_ticks() -> + receive + tick_check_environment -> + gobble_ticks() + after 0 -> + ok + end. + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% perhaps_start_repair(#ch_mgr{name=MyName, From c02a0bed70fef037b49aa3b6f6144dd22384c04c Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Mon, 22 Feb 2016 17:03:50 +0900 Subject: [PATCH 08/24] Change 'uses' verbose message to error_logger:info --- src/machi_chain_manager1.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 18d92a3..4ab5a55 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2988,8 +2988,8 @@ perhaps_verbose_c111(P_latest2, #ch_mgr{name=MyName, opts=Opts}=S) -> Summ2 = machi_projection:make_summary(P_latest2x), if PrivWriteVerb, Summ2 /= Last2 -> put(last_verbose, Summ2), - ?V("\n~s ~p uses plain: ~w \n", - [machi_util:pretty_time(), MyName, Summ2]); + error_logger:info_msg("~p uses plain: ~w \n", + [MyName, Summ2]); true -> ok end, From 34f8632f194241cffac31b9dcde9aef031cbf6ac Mon Sep 17 00:00:00 2001 From: Scott 
Lystig Fritchie Date: Tue, 23 Feb 2016 15:06:33 +0900 Subject: [PATCH 09/24] Add ranch startup to machi_chain_manager1_converge_demo --- test/machi_chain_manager1_converge_demo.erl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index 4ef1cfa..c1299cd 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -151,7 +151,8 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) -> %% Faster test startup, commented: io:format(user, short_doc(), []), %% Faster test startup, commented: timer:sleep(3000), - application:start(sasl), + Apps = [sasl, ranch], + [application:start(App) || App <- Apps], MgrOpts = MgrOpts0 ++ ?DEFAULT_MGR_OPTS, TcpPort = proplists:get_value(port_base, MgrOpts, 62877), @@ -394,7 +395,8 @@ timer:sleep(1234), exit(SupPid, normal), ok = machi_partition_simulator:stop(), [ok = ?FLU_PC:quit(PPid) || {_, PPid} <- Namez], - machi_util:wait_for_death(SupPid, 100) + machi_util:wait_for_death(SupPid, 100), + [application:start(App) || App <- lists:reverse(Apps)] end. %% Many of the static partition lists below have been problematic at one From a27425147dabf10e22d4453b37a678916b40f5cd Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 23 Feb 2016 15:07:16 +0900 Subject: [PATCH 10/24] Re-add a flapping check, but also take advantage of confirmed accepted epoch --- src/machi_chain_manager1.erl | 39 +++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 4ab5a55..15b82ab 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -92,8 +92,11 @@ -define(REPAIR_START_STABILITY_TIME, 10). -endif. % TEST -%% Magic constant for looping "too frequently" breaker. TODO revisit & revise. --define(TOO_FREQUENT_BREAKER, 10). 
+%% Maximum length of the history of adopted projections (via C120). +-define(MAX_HISTORY_LENGTH, 8). + +%% Magic constant for looping "too frequently" breaker. +-define(TOO_FREQUENT_BREAKER, (?MAX_HISTORY_LENGTH+5)). -define(RETURN2(X), begin (catch put(why2, [?LINE|get(why2)])), X end). @@ -103,9 +106,6 @@ %% Amount of epoch number skip-ahead for set_chain_members call -define(SET_CHAIN_MEMBERS_EPOCH_SKIP, 1111). -%% Maximum length of the history of adopted projections (via C120). --define(MAX_HISTORY_LENGTH, 30). - %% API -export([start_link/2, start_link/3, stop/1, ping/1, set_chain_members/2, set_chain_members/6, set_active/2, @@ -463,7 +463,7 @@ get_my_proj_boot_info(MgrOpts, DefaultDict, DefaultProj, ProjType) -> {DefaultDict, DefaultProj}; Store -> {ok, P} = machi_projection_store:read_latest_projection(Store, - ProjType), + ProjType, 7789), {P#projection_v1.members_dict, P} end. @@ -840,7 +840,10 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg, D_foo=[{repair_done, {repair_final_status, ok, (S#ch_mgr.proj)#projection_v1.epoch_number}}], {NewUPI_list ++ Repairing_list2, [], RunEnv2}; true -> - D_foo=[d_foo2], + D_foo=[d_foo2, {sim_p,Simulator_p}, + {simr_p,SimRepair_p}, {same_epoch,SameEpoch_p}, + {rel_to,RelativeToServer}, + {repch,RepChk_LastInUPI}, {repair_fs,RepairFS}], {NewUPI_list, OldRepairing_list, RunEnv2} end; {_ABC, _XYZ} -> @@ -1977,7 +1980,7 @@ react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) -> %% In contrast to the public projection store writes, Humming Consensus %% doesn't care about the status of writes to the public store: it's %% always relying only on successful reads of the public store. 
- case {?FLU_PC:write_projection(MyStorePid, private, P_latest2,?TO*30),Goo} of + case {?FLU_PC:write_projection(MyStorePid, private, P_latest2,?TO*30+66),Goo} of {ok, Goo} -> ?REACT({c110, [{write, ok}]}), react_to_env_C111(P_latest, P_latest2, Extra1, MyStorePid, S); @@ -2070,20 +2073,21 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H, false -> S2; {{_ConfEpoch, _ConfCSum}, ConfTime} -> - io:format(user, "\nCONFIRM debug C120 ~w was annotated ~w\n", [S#ch_mgr.name, P_latest#projection_v1.epoch_number]), + P_latestEpoch = P_latest#projection_v1.epoch_number, + io:format(user, "\nCONFIRM debug C120 ~w was annotated ~w\n", [S#ch_mgr.name, P_latestEpoch]), S2#ch_mgr{proj_unanimous=ConfTime} end, V = case file:read_file("/tmp/moomoo."++atom_to_list(S#ch_mgr.name)) of {ok,_} -> true; _ -> false end, if V -> io:format("C120: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, {{now_using, FinalProps, P_latest#projection_v1.epoch_number}, S3}. -add_and_trunc_history(P_latest, H, MaxLength) -> +add_and_trunc_history(#projection_v1{epoch_number=0}, H, _MaxLength) -> + H; +add_and_trunc_history(#projection_v1{} = P_latest, H, MaxLength) -> Latest_U_R = {P_latest#projection_v1.upi, P_latest#projection_v1.repairing}, - H2 = if P_latest#projection_v1.epoch_number > 0 -> - queue:in(Latest_U_R, H); - true -> - H - end, + add_and_trunc_history(Latest_U_R, H, MaxLength); +add_and_trunc_history(Item, H, MaxLength) -> + H2 = queue:in(Item, H), case queue:len(H2) of X when X > MaxLength -> {_V, Hxx} = queue:out(H2), @@ -2499,7 +2503,10 @@ poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current} = S) -> %% Unwedge our FLU. 
{ok, NotifyPid} = machi_projection_store:get_wedge_notify_pid(ProjStore), _ = machi_flu1:update_wedge_state(NotifyPid, false, EpochID), - S2#ch_mgr{proj_unanimous=Now}; + #ch_mgr{proj_history=H} = S2, + H2 = add_and_trunc_history({confirm, Epoch}, H, + ?MAX_HISTORY_LENGTH), + S2#ch_mgr{proj_unanimous=Now, proj_history=H2}; _ -> S2 end; From 11921d82bf3a2b5a225247adcc03c3c6eb047c1d Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 23 Feb 2016 17:30:30 +0900 Subject: [PATCH 11/24] WIP: start of demo doc --- .gitignore | 1 + Makefile | 33 ++++++- README.md | 11 ++- ...erge_demo.md => humming_consensus_demo.md} | 97 ++++++++++++++----- rel/gen_dev | 16 +++ rel/vars.config | 3 + rel/vars/dev_vars.config.src | 48 +++++++++ 7 files changed, 182 insertions(+), 27 deletions(-) rename doc/{machi_chain_manager1_converge_demo.md => humming_consensus_demo.md} (76%) create mode 100755 rel/gen_dev create mode 100644 rel/vars/dev_vars.config.src diff --git a/.gitignore b/.gitignore index 3af54ff..ef440c4 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ prototype/chain-manager/patch.* .eqc-info .eunit deps +dev erl_crash.dump .concrete/DEV_MODE .rebar diff --git a/Makefile b/Makefile index 7ff19ed..01b1e99 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ endif OVERLAY_VARS ?= EUNIT_OPTS = -v -.PHONY: rel deps package pkgclean edoc +.PHONY: rel stagedevrel deps package pkgclean edoc all: deps compile @@ -57,6 +57,37 @@ relclean: stage : rel $(foreach dep,$(wildcard deps/*), rm -rf rel/$(REPO)/lib/$(shell basename $(dep))* && ln -sf $(abspath $(dep)) rel/$(REPO)/lib;) +## +## Developer targets +## +## devN - Make a dev build for node N +## stagedevN - Make a stage dev build for node N (symlink libraries) +## devrel - Make a dev build for 1..$DEVNODES +## stagedevrel Make a stagedev build for 1..$DEVNODES +## +## Example, make a 68 node devrel cluster +## make stagedevrel DEVNODES=68 + +.PHONY : stagedevrel devrel +DEVNODES ?= 3 + +# 'seq' is not available 
on all *BSD, so using an alternate in awk +SEQ = $(shell awk 'BEGIN { for (i = 1; i < '$(DEVNODES)'; i++) printf("%i ", i); print i ;exit(0);}') + +$(eval stagedevrel : $(foreach n,$(SEQ),stagedev$(n))) +$(eval devrel : $(foreach n,$(SEQ),dev$(n))) + +dev% : all + mkdir -p dev + rel/gen_dev $@ rel/vars/dev_vars.config.src rel/vars/$@_vars.config + (cd rel && ../rebar generate target_dir=../dev/$@ overlay_vars=vars/$@_vars.config) + +stagedev% : dev% + $(foreach dep,$(wildcard deps/*), rm -rf dev/$^/lib/$(shell basename $(dep))* && ln -sf $(abspath $(dep)) dev/$^/lib;) + +devclean: clean + rm -rf dev + DIALYZER_APPS = kernel stdlib sasl erts ssl compiler eunit crypto public_key syntax_tools PLT = $(HOME)/.machi_dialyzer_plt diff --git a/README.md b/README.md index 28f77d2..37db1e0 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,9 @@ Humming Consensus" is available online now. * [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf) * [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q) +See later in this document for how to run the Humming Consensus demos, +including the network partition simulator. + ## 3. Development status summary @@ -99,10 +102,10 @@ Mid-December 2015: work is underway. * The Erlang language client implementation of the high-level protocol flavor is brittle (e.g., little error handling yet). -If you would like to run the network partition simulator -mentioned in the Ricon 2015 presentation about Humming Consensus, -please see the -[partition simulator convergence test doc.](./doc/machi_chain_manager1_converge_demo.md) +If you would like to run the Humming Consensus code (with or without +the network partition simulator) as described in the RICON 2015 +presentation, please see the +[Humming Consensus demo doc.](./doc/humming_consensus_demo.md). 
If you'd like to work on a protocol such as Thrift, UBF, msgpack over UDP, or some other protocol, let us know by diff --git a/doc/machi_chain_manager1_converge_demo.md b/doc/humming_consensus_demo.md similarity index 76% rename from doc/machi_chain_manager1_converge_demo.md rename to doc/humming_consensus_demo.md index 2844bfa..eb66ebe 100644 --- a/doc/machi_chain_manager1_converge_demo.md +++ b/doc/humming_consensus_demo.md @@ -1,6 +1,75 @@ +# Table of contents + +* [Hand-on experiments with Machi and Humming Consensus](#hands-on) +* [Using the network partition simulator and convergence demo test code](#partition-simulator) + + +# Hand-on experiments with Machi and Humming Consensus + + +## Prerequisites + +1. Machi requires a OS X, FreeBSD, Linux, or Solaris machine. +2. You'll need the `git` source management utility. +3. You'll need the Erlang/OTP 17 runtime environment. Please don't + use earlier or later versions until we have a chance to fix the + compilation warnings that versions R16B and 18 will trigger. + +For `git` and the Erlang runtime, please use your OS-specific +package manager to install these. If your package manager doesn't +have Erlang/OTP version 17 available, then we recommend using the +[precompiled packages available at Erlang Solutions](https://www.erlang-solutions.com/resources/download.html). + +All of the commands that should be run at your login shell (e.g. Bash, +c-shell) can be cut-and-pasted from this document directly to your +login shell prompt. + + +## Clone and compile the code + +Clone the Machi source repo and compile the source and test code. Run +the following commands at your login shell: + + cd /tmp + git clone https://github.com/basho/machi.git + cd machi + git checkout master + make + +Then run the unit test suite. This may take up to two minutes or so +to finish. + + make test + +At the end, the test suite should report that all tests passed. 
+ +If you had a test failure, a likely cause may be a limit on the number +of file descriptors available to your user process. (Recent releases +of OS X have a limit of 1024 file descriptors, which may be too low.) +The output of `ulimit -n` will tell you your file descriptor limit. + +## Running three Machi instances on a single machine + +Run the following command: + + make stagedevrel + +This will create a directory structure like this: + + |-dev1-|... stand-alone Machi app directories + |-dev-|-dev2-|... stand-alone Machi app directories + |-dev3-|... stand-alone Machi app directories + + # Using the network partition simulator and convergence demo test code +This is the demo code mentioned in the presentation that Scott Lystig +Fritchie gave at the +[RICON 2015 conference](http://ricon.io). +* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf) +* [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q) + ## A complete example of all input and output If you don't have an Erlang/OTP 17 runtime environment available, @@ -15,31 +84,15 @@ To help interpret the output of the test, please skip ahead to the ## Prerequisites -1. You'll need the `git` source management -2. You'll need the Erlang/OTP 17 runtime environment. Please don't - use earlier or later versions until we have a chance to fix the - compilation warnings that versions R16B and 18 will trigger. - -All of the commands that should be run at your login shell (e.g. Bash, -c-shell) can be cut-and-pasted from this document directly to your -login shell prompt. +If you don't have `git` and/or the Erlang 17 runtime system available +on your OS X, FreeBSD, Linux, or Solaris machine, please take a look +at the [Prerequisites section](#prerequisites) first. When you have +installed the prerequisite software, please return back here. ## Clone and compile the code -Clone the Machi source repo and compile the source and test code. 
Run -the following commands at your login shell: - - cd /tmp - git clone https://github.com/basho/machi.git - cd machi - git checkout master - make - -Then run the unit test suite. This may take up to two minutes or so -to finish. Most of the tests will be silent; please be patient until -the tests finish. - - make test +Please briefly visit the [Clone and compile the code](#clone-compile) +section. When finished, please return back here. ## Run an interactive Erlang CLI shell diff --git a/rel/gen_dev b/rel/gen_dev new file mode 100755 index 0000000..1b8ce1b --- /dev/null +++ b/rel/gen_dev @@ -0,0 +1,16 @@ +#! /bin/sh +# +# Example usage: gen_dev dev4 vars.src vars +# +# Generate an overlay config for devNNN from vars.src and write to vars +# + +NAME=$1 +TEMPLATE=$2 +VARFILE=$3 + +NODE="$NAME@127.0.0.1" + +echo "Generating $NAME - node='$NODE'" +sed -e "s/@NODE@/$NODE/" \ + < $TEMPLATE > $VARFILE diff --git a/rel/vars.config b/rel/vars.config index 06b3aa0..b1bb405 100644 --- a/rel/vars.config +++ b/rel/vars.config @@ -1,6 +1,9 @@ %% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*- %% ex: ft=erlang ts=4 sw=4 et +%% NOTE: When modifying this file, also keep its near cousin +%% config file rel/vars/dev_vars.config.src in sync! + %% Platform-specific installation paths {platform_bin_dir, "./bin"}. {platform_data_dir, "./data"}. diff --git a/rel/vars/dev_vars.config.src b/rel/vars/dev_vars.config.src new file mode 100644 index 0000000..a5a3828 --- /dev/null +++ b/rel/vars/dev_vars.config.src @@ -0,0 +1,48 @@ +%% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*- +%% ex: ft=erlang ts=4 sw=4 et + +%% NOTE: When modifying this file, also keep its near cousin +%% config file rel/vars/dev_vars.config.src in sync! + +%% Platform-specific installation paths +{platform_bin_dir, "./bin"}. +{platform_data_dir, "./data"}. +{platform_etc_dir, "./etc"}. +{platform_lib_dir, "./lib"}. +{platform_log_dir, "./log"}. 
+ +%% +%% etc/app.config +%% +{sasl_error_log, "{{platform_log_dir}}/sasl-error.log"}. +{sasl_log_dir, "{{platform_log_dir}}/sasl"}. + +%% lager +{console_log_default, file}. + +%% +%% etc/vm.args +%% +{node, "@NODE@"}. +{crash_dump, "{{platform_log_dir}}/erl_crash.dump"}. + +%% +%% bin/machi +%% +{runner_script_dir, "\`cd \\`dirname $0\\` 1>/dev/null && /bin/pwd\`"}. +{runner_base_dir, "{{runner_script_dir}}/.."}. +{runner_etc_dir, "$RUNNER_BASE_DIR/etc"}. +{runner_log_dir, "$RUNNER_BASE_DIR/log"}. +{runner_lib_dir, "$RUNNER_BASE_DIR/lib"}. +{runner_patch_dir, "$RUNNER_BASE_DIR/lib/basho-patches"}. +{pipe_dir, "/tmp/$RUNNER_BASE_DIR/"}. +{runner_user, ""}. +{runner_wait_process, "machi_flu_sup"}. +{runner_ulimit_warn, 65536}. + +%% +%% cuttlefish +%% +{cuttlefish, ""}. % blank = off +{cuttlefish_conf, "machi.conf"}. + From 6c03f5c1a69342612de5c9be2da819b86a919c07 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Wed, 24 Feb 2016 15:08:41 +0900 Subject: [PATCH 12/24] Split out docs dev-clone-compile.md and dev-prerequisites.md --- README.md | 9 ++++--- doc/dev-clone-compile.md | 28 +++++++++++++++++++++ doc/dev-prerequisites.md | 18 ++++++++++++++ doc/humming_consensus_demo.md | 46 +++++++++-------------------------- 4 files changed, 63 insertions(+), 38 deletions(-) create mode 100644 doc/dev-clone-compile.md create mode 100644 doc/dev-prerequisites.md diff --git a/README.md b/README.md index 37db1e0..95e24a1 100644 --- a/README.md +++ b/README.md @@ -137,10 +137,13 @@ X. The only known limitations for using R16 are minor type specification difference between R16 and 17, but we strongly suggest continuing development using version 17. -We also assume that you have the standard UNIX/Linux developers -tool chain for C and C++ applications. Specifically, we assume `make` -is available. The utility used to compile the Machi source code, +We also assume that you have the standard UNIX/Linux developer +tool chain for C and C++ applications. 
Also, we assume +that Git and GNU Make are available. +The utility used to compile the Machi source code, `rebar`, is pre-compiled and included in the repo. +For more details, please see the +[Machi development environment prerequisites doc](./doc/dev-prerequisites.md). Machi has a dependency on the [ELevelDB](https://github.com/basho/eleveldb) library. ELevelDB only diff --git a/doc/dev-clone-compile.md b/doc/dev-clone-compile.md new file mode 100644 index 0000000..9795bb3 --- /dev/null +++ b/doc/dev-clone-compile.md @@ -0,0 +1,28 @@ +# Clone and compile Machi + +Clone the Machi source repo and compile the source and test code. Run +the following commands at your login shell: + + cd /tmp + git clone https://github.com/basho/machi.git + cd machi + git checkout master + make # or 'gmake' if GNU make uses an alternate name + +Then run the unit test suite. This may take up to two minutes or so +to finish. + + make test + +At the end, the test suite should report that all tests passed. + + [... many lines omitted ...] + module 'event_logger' + module 'chain_mgr_legacy' + ======================================================= + All 90 tests passed. + +If you had a test failure, a likely cause may be a limit on the number +of file descriptors available to your user process. (Recent releases +of OS X have a limit of 1024 file descriptors, which may be too low.) +The output of `ulimit -n` will tell you your file descriptor limit. diff --git a/doc/dev-prerequisites.md b/doc/dev-prerequisites.md new file mode 100644 index 0000000..b3987ad --- /dev/null +++ b/doc/dev-prerequisites.md @@ -0,0 +1,18 @@ +## Machi developer environment prerequisites + +1. Machi requires an OS X, FreeBSD, Linux, or Solaris machine is a + standard developer environment for C and C++ applications. +2. You'll need the `git` source management utility. +3. You'll need the Erlang/OTP 17 runtime environment.
Please don't + use earlier or later versions until we have a chance to fix the + compilation warnings that versions R16B and 18 will trigger. + +For `git` and the Erlang runtime, please use your OS-specific +package manager to install these. If your package manager doesn't +have Erlang/OTP version 17 available, then we recommend using the +[precompiled packages available at Erlang Solutions](https://www.erlang-solutions.com/resources/download.html). + +Also, please verify that you have enough file descriptors available to +your user processes. The output of `ulimit -n` should report at least +4,000 file descriptors available. If your limit is lower (a frequent +problem for OS X users), please increase it to at least 4,000. diff --git a/doc/humming_consensus_demo.md b/doc/humming_consensus_demo.md index eb66ebe..1f0f3c5 100644 --- a/doc/humming_consensus_demo.md +++ b/doc/humming_consensus_demo.md @@ -7,50 +7,26 @@ # Hand-on experiments with Machi and Humming Consensus - ## Prerequisites -1. Machi requires a OS X, FreeBSD, Linux, or Solaris machine. -2. You'll need the `git` source management utility. -3. You'll need the Erlang/OTP 17 runtime environment. Please don't - use earlier or later versions until we have a chance to fix the - compilation warnings that versions R16B and 18 will trigger. - -For `git` and the Erlang runtime, please use your OS-specific -package manager to install these. If your package manager doesn't -have Erlang/OTP version 17 available, then we recommend using the -[precompiled packages available at Erlang Solutions](https://www.erlang-solutions.com/resources/download.html). - -All of the commands that should be run at your login shell (e.g. Bash, -c-shell) can be cut-and-pasted from this document directly to your -login shell prompt. +Please refer to the +[Machi development environment prerequisites doc](./doc/dev-prerequisites.md) +for Machi developer environment prerequisites. 
## Clone and compile the code -Clone the Machi source repo and compile the source and test code. Run -the following commands at your login shell: - - cd /tmp - git clone https://github.com/basho/machi.git - cd machi - git checkout master - make - -Then run the unit test suite. This may take up to two minutes or so -to finish. - - make test - -At the end, the test suite should report that all tests passed. - -If you had a test failure, a likely cause may be a limit on the number -of file descriptors available to your user process. (Recent releases -of OS X have a limit of 1024 file descriptors, which may be too slow.) -The output of the `limit -n` will tell you your file descriptor limit. +Please see the +[Machi 'clone and compile' doc](./doc/dev-clone-compile.md) +for the short list of steps required to fetch the Machi source code +from GitHub and to compile & test Machi. ## Running three Machi instances on a single machine +All of the commands that should be run at your login shell (e.g. Bash, +c-shell) can be cut-and-pasted from this document directly to your +login shell prompt. 
+ Run the following command: make stagedevrel From bdf47da10cbce8812ba2f779ebd153b0305a2b71 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Wed, 24 Feb 2016 15:11:35 +0900 Subject: [PATCH 13/24] oops fix doc links --- .gitignore | 1 + README.md | 2 +- doc/humming_consensus_demo.md | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index ef440c4..c6a0bf2 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ prototype/chain-manager/patch.* deps dev erl_crash.dump +eqc .concrete/DEV_MODE .rebar edoc diff --git a/README.md b/README.md index 95e24a1..6e86eb9 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Machi: a robust & reliable, distributed, highly available, large file store +# Machi: a distributed, decentralized blob/large file store [Travis-CI](http://travis-ci.org/basho/machi) :: ![Travis-CI](https://secure.travis-ci.org/basho/machi.png) diff --git a/doc/humming_consensus_demo.md b/doc/humming_consensus_demo.md index 1f0f3c5..7d01fe7 100644 --- a/doc/humming_consensus_demo.md +++ b/doc/humming_consensus_demo.md @@ -10,14 +10,14 @@ ## Prerequisites Please refer to the -[Machi development environment prerequisites doc](./doc/dev-prerequisites.md) +[Machi development environment prerequisites doc](./dev-prerequisites.md) for Machi developer environment prerequisites. ## Clone and compile the code Please see the -[Machi 'clone and compile' doc](./doc/dev-clone-compile.md) +[Machi 'clone and compile' doc](./dev-clone-compile.md) for the short list of steps required to fetch the Machi source code from GitHub and to compile & test Machi. 
From a3fbe2c8bbc3af192d91512bf1b3cd14967072db Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Thu, 25 Feb 2016 17:00:05 +0900 Subject: [PATCH 14/24] WIP: demo script writing, derp, need a shell script to simplify --- doc/dev-clone-compile.md | 4 +- doc/dev-prerequisites.md | 15 +++-- ...nsus_demo.md => humming-consensus-demo.md} | 66 ++++++++++++++++++- priv/quick-admin-examples/demo-000 | 7 ++ rel/reltool.config | 1 + 5 files changed, 83 insertions(+), 10 deletions(-) rename doc/{humming_consensus_demo.md => humming-consensus-demo.md} (78%) create mode 100644 priv/quick-admin-examples/demo-000 diff --git a/doc/dev-clone-compile.md b/doc/dev-clone-compile.md index 9795bb3..3ba78e1 100644 --- a/doc/dev-clone-compile.md +++ b/doc/dev-clone-compile.md @@ -14,7 +14,9 @@ to finish. make test -At the end, the test suite should report that all tests passed. +At the end, the test suite should report that all tests passed. The +actual number of tests shown in the "All `X` tests passed" line may be +different than the example below. [... many lines omitted ...] module 'event_logger' diff --git a/doc/dev-prerequisites.md b/doc/dev-prerequisites.md index b3987ad..8fa5b7a 100644 --- a/doc/dev-prerequisites.md +++ b/doc/dev-prerequisites.md @@ -1,15 +1,18 @@ ## Machi developer environment prerequisites -1. Machi requires an OS X, FreeBSD, Linux, or Solaris machine is a - standard developer environment for C and C++ applications. +1. Machi requires a 64-bit variant of UNIX: OS X, FreeBSD, Linux, or + Solaris machine with a standard developer environment for C and C++ + applications (64-bit versions). 2. You'll need the `git` source management utility. -3. You'll need the Erlang/OTP 17 runtime environment. Please don't - use earlier or later versions until we have a chance to fix the - compilation warnings that versions R16B and 18 will trigger. +3. You'll need the 64-bit Erlang/OTP 17 runtime environment.
Please + don't use earlier or later versions until we have a chance to fix + the compilation warnings that versions R16B and 18 will trigger. + Also, please verify that you are not using a 32-bit Erlang/OTP + runtime package. For `git` and the Erlang runtime, please use your OS-specific package manager to install these. If your package manager doesn't -have Erlang/OTP version 17 available, then we recommend using the +have 64-bit Erlang/OTP version 17 available, then we recommend using the [precompiled packages available at Erlang Solutions](https://www.erlang-solutions.com/resources/download.html). Also, please verify that you have enough file descriptors available to diff --git a/doc/humming_consensus_demo.md b/doc/humming-consensus-demo.md similarity index 78% rename from doc/humming_consensus_demo.md rename to doc/humming-consensus-demo.md index 7d01fe7..e881de8 100644 --- a/doc/humming_consensus_demo.md +++ b/doc/humming-consensus-demo.md @@ -33,10 +33,70 @@ Run the following command: This will create a directory structure like this: - |-dev1-|... stand-alone Machi app directories - |-dev-|-dev2-|... stand-alone Machi app directories - |-dev3-|... stand-alone Machi app directories + |-dev1-|... stand-alone Machi app + subdirectories + |-dev-|-dev2-|... stand-alone Machi app + directories + |-dev3-|... stand-alone Machi app + directories +Each of the `dev/dev1`, `dev/dev2`, and `dev/dev3` are stand-alone +application instances of Machi and can be run independently of each +other on the same machine. This demo will use all three. + +The lifecycle management utilities for Machi are a bit immature, +currently. They assume that each Machi server runs on a host with a +unique hostname -- there is no flexibility built-in yet to easily run +multiple Machi instances on the same machine. To continue with the +demo, we need to use `sudo` or `su` to obtain superuser privileges to +edit the `/etc/hosts` file. 
+ +Please add the following line to `/etc/hosts`, using this command: + + sudo sh -c 'echo "127.0.0.1 machi1 machi2 machi3" >> /etc/hosts' + +Then please verify that all three new hostnames for the localhost +network interface are working correctly: + + ping -c 1 machi1 ; ping -c 1 machi2 ; ping -c 1 machi3 + +If that worked, then we're ready for the next step: starting our three +Machi app instances on this machine, then configure a single chain to +to experiment with. + +Run the following commands to start the three Machi app instances and +use the `machi ping` command to verify that all three are running. + + sh -c 'for i in 1 2 3; do ./dev/dev$i/bin/machi start; done + sh -c 'for i in 1 2 3; do ./dev/dev$i/bin/machi ping; done + +The output from the `ping` commands should be: + + pong + pong + pong + +Next, use the following to configure a single chain: + + sh -c 'for i in 1 2 3; do ./dev/dev$i/bin/machi-admin + +The results should be: + + Result: ok + Result: ok + Result: ok + +We have now created a single replica chain, called `c1`, that has +three file servers participating in the chain. Thanks to the +hostnames that we added to `/etc/hosts`, all are using the localhost +network interface. + + | App instance | Hostname | FLU name | TCP port | + | directory | | | number | + |--------------+----------+----------+----------| + | dev1 | machi1 | flu1 | 20401 | + | dev2 | machi2 | flu2 | 20402 | + | dev3 | machi3 | flu3 | 20403 | + +The log files for each application instance can be found + # Using the network partition simulator and convergence demo test code diff --git a/priv/quick-admin-examples/demo-000 b/priv/quick-admin-examples/demo-000 new file mode 100644 index 0000000..301f348 --- /dev/null +++ b/priv/quick-admin-examples/demo-000 @@ -0,0 +1,7 @@ +{host, "machi1", []}. +{host, "machi2", []}. +{host, "machi3", []}. +{flu,f1,"machi1",20401,[]}. +{flu,f2,"machi2",20402,[]}. +{flu,f3,"machi3",20403,[]}. +{chain,c1,[f1,f2,f3],[]}. 
diff --git a/rel/reltool.config b/rel/reltool.config index 33df951..eb015be 100644 --- a/rel/reltool.config +++ b/rel/reltool.config @@ -106,6 +106,7 @@ {copy, "../priv/quick-admin-examples/000", "priv/quick-admin-examples"}, {copy, "../priv/quick-admin-examples/001", "priv/quick-admin-examples"}, {copy, "../priv/quick-admin-examples/002", "priv/quick-admin-examples"}, + {copy, "../priv/quick-admin-examples/demo-000", "priv/quick-admin-examples/demo-000"}, {mkdir, "lib/basho-patches"} %% {copy, "../apps/machi/ebin/etop_txt.beam", "lib/basho-patches"} From f433e84fab2e8ac210b996d264c120d4561b5d0f Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Thu, 25 Feb 2016 17:52:40 +0900 Subject: [PATCH 15/24] Add 'stability_time' env var for repair --- src/machi_chain_manager1.erl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 15b82ab..66b0163 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -2569,12 +2569,13 @@ perhaps_start_repair(#ch_mgr{name=MyName, %% RepairOpts = [{repair_mode, check}, verbose], RepairFun = fun() -> do_repair(S, RepairOpts, CMode) end, LastUPI = lists:last(UPI), + StabilityTime = application:get_env(machi, stability_time, ?REPAIR_START_STABILITY_TIME), IgnoreStabilityTime_p = proplists:get_value(ignore_stability_time, S#ch_mgr.opts, false), case timer:now_diff(os:timestamp(), Start) div 1000000 of N when MyName == LastUPI andalso (IgnoreStabilityTime_p orelse - N >= ?REPAIR_START_STABILITY_TIME) -> + N >= StabilityTime) -> {WorkerPid, _Ref} = spawn_monitor(RepairFun), S#ch_mgr{repair_worker=WorkerPid, repair_start=os:timestamp(), From 4cb166368a1b05f181c9e963d5faf96d1c4cb56b Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Thu, 25 Feb 2016 18:10:11 +0900 Subject: [PATCH 16/24] priv/humming-consensus-demo.setup.sh debugged, all appears to work --- doc/humming-consensus-demo.md | 38 ++++++++++------------------------- 1 
file changed, 11 insertions(+), 27 deletions(-) diff --git a/doc/humming-consensus-demo.md b/doc/humming-consensus-demo.md index e881de8..bf141cb 100644 --- a/doc/humming-consensus-demo.md +++ b/doc/humming-consensus-demo.md @@ -52,36 +52,20 @@ Please add the following line to `/etc/hosts`, using this command: sudo sh -c 'echo "127.0.0.1 machi1 machi2 machi3" >> /etc/hosts' -Then please verify that all three new hostnames for the localhost -network interface are working correctly: +Next, we will use a shell script to finish setting up our cluster. It +will do the following for us: - ping -c 1 machi1 ; ping -c 1 machi2 ; ping -c 1 machi3 +* Verify that the new line that was added to `/etc/hosts` is correct. +* Modify the `etc/app.config` files to configure the Humming Consensus + chain manager's actions logged to the `log/console.log` file. +* Start the three application instances. +* Verify that the three instances are running correctly. +* Configure a single chain, with one FLU server per application + instance. -If that worked, then we're ready for the next step: starting our three -Machi app instances on this machine, then configure a single chain to -to experiment with. +Please run this script using this command: -Run the following commands to start the three Machi app instances and -use the `machi ping` command to verify that all three are running. - - sh -c 'for i in 1 2 3; do ./dev/dev$i/bin/machi start; done - sh -c 'for i in 1 2 3; do ./dev/dev$i/bin/machi ping; done - -The output from the `ping` commands should be: - - pong - pong - pong - -Next, use the following to configure a single chain: - - sh -c 'for i in 1 2 3; do ./dev/dev$i/bin/machi-admin - -The results should be: - - Result: ok - Result: ok - Result: ok + ./priv/humming-consensus-demo.setup.sh We have now created a single replica chain, called `c1`, that has three file servers participating in the chain. 
Thanks to the From 184a54ebbd5ae4ef46bec3a2d506757a2ae94872 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 26 Feb 2016 15:46:17 +0900 Subject: [PATCH 17/24] Change ?HYOGE blob size from 1GB -> 75MB to reduce RAM required for eunit tests --- test/machi_file_proxy_test.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/machi_file_proxy_test.erl b/test/machi_file_proxy_test.erl index 605abe7..10e16bf 100644 --- a/test/machi_file_proxy_test.erl +++ b/test/machi_file_proxy_test.erl @@ -38,7 +38,7 @@ clean_up_data_dir(DataDir) -> -ifndef(PULSE). -define(TESTDIR, "./t"). --define(HYOOGE, 1 * 1024 * 1024 * 1024). % 1 long GB +-define(HYOOGE, 75 * 1024 * 1024). % 75 MBytes random_binary_single() -> %% OK, I guess it's not that random... From fc46cd1b25f0673b7cd7528577e472547b2eeb72 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 26 Feb 2016 17:32:51 +0900 Subject: [PATCH 18/24] WIP: Vagrant --- doc/dev-prerequisites.md | 17 ++++ doc/humming-consensus-demo.md | 9 ++- priv/humming-consensus-demo.setup.sh | 56 +++++++++++++ .../Vagrantfile | 81 +++++++++++++++++++ 4 files changed, 161 insertions(+), 2 deletions(-) create mode 100755 priv/humming-consensus-demo.setup.sh create mode 100644 priv/humming-consensus-demo.vagrant/Vagrantfile diff --git a/doc/dev-prerequisites.md b/doc/dev-prerequisites.md index 8fa5b7a..66afd41 100644 --- a/doc/dev-prerequisites.md +++ b/doc/dev-prerequisites.md @@ -19,3 +19,20 @@ Also, please verify that you have enough file descriptors available to your user processes. The output of `ulimit -n` should report at least 4,000 file descriptors available. If your limit is lower (a frequent problem for OS X users), please increase it to at least 4,000. + +# Using Vagrant to set up a developer environment for Machi + +The Machi source directory contains a `Vagrantfile` for creating an +Ubuntu Linux-based virtual machine for compiling and running Machi. 
+This file is in the +[$SRC_TOP/priv/humming-consensus-demo.vagrant](../priv/humming-consensus-demo.vagrant) +directory. + +If used as-is, the virtual machine specification is modest. + +* 1 virtual CPU +* 512MB virtual memory +* 768MB swap space +* 79GB sparse virtual disk image. After installing prerequisites and + compiling Machi, the root file system uses approximately 2.7 GBytes. + diff --git a/doc/humming-consensus-demo.md b/doc/humming-consensus-demo.md index bf141cb..198bc55 100644 --- a/doc/humming-consensus-demo.md +++ b/doc/humming-consensus-demo.md @@ -13,6 +13,11 @@ Please refer to the [Machi development environment prerequisites doc](./dev-prerequisites.md) for Machi developer environment prerequisites. +If you do not have an Erlang/OTP runtime system available, but you do +have [the Vagrant virtual machine](https://www.vagrantup.com/) manager +available, then please refer to the instructions in the prerequisites +doc for using Vagrant. + ## Clone and compile the code @@ -72,8 +77,8 @@ three file servers participating in the chain. Thanks to the hostnames that we added to `/etc/hosts`, all are using the localhost network interface. - | App instance | Hostname | FLU name | TCP port | - | directory | | | number | + | App instance | Pseudo | FLU name | TCP port | + | directory | Hostname | | number | |--------------+----------+----------+----------| | dev1 | machi1 | flu1 | 20401 | | dev2 | machi2 | flu2 | 20402 | diff --git a/priv/humming-consensus-demo.setup.sh b/priv/humming-consensus-demo.setup.sh new file mode 100755 index 0000000..dc57731 --- /dev/null +++ b/priv/humming-consensus-demo.setup.sh @@ -0,0 +1,56 @@ +#!/bin/sh + +echo "Step: Verify that the required entries in /etc/hosts are present" +for i in 1 2 3; do + grep machi$i /etc/hosts | egrep -s '^127.0.0.1' > /dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "" + echo "'grep -s machi$i' failed. Aborting, sorry." + exit 1 + fi + ping -c 1 machi$i > /dev/null 2>&1 + if [ $? 
-ne 0 ]; then + echo "" + echo "Ping attempt on host machi$i failed. Aborting." + echo "" + ping -c 1 machi$i + exit 1 + fi +done + +echo "Step: add a verbose logging option to app.config" +for i in 1 2 3; do + ed ./dev/dev$i/etc/app.config <<EOF > /dev/null 2>&1 +/verbose_confirm +a +{chain_manager_opts, [{private_write_verbose_confirm,true}]}, +{stability_time, 1}, +. +w +q +EOF +done + +echo "Step: start three three Machi application instances" +for i in 1 2 3; do + ./dev/dev$i/bin/machi start + ./dev/dev$i/bin/machi ping + if [ $? -ne 0 ]; then + echo "Sorry, a 'ping' check for instance dev$i failed. Aborting." + exit 1 + fi +done + +echo "Step: configure one chain to start a Humming Consensus group with three members" + +# Note: $CWD of each Machi proc is two levels below the source code root dir. +LIFECYCLE000=../../priv/quick-admin-examples/demo-000 +for i in 3 2 1; do + ./dev/dev$i/bin/machi-admin quick-admin-apply $LIFECYCLE000 machi$i + if [ $? -ne 0 ]; then + echo "Sorry, 'machi-admin quick-admin-apply failed' on machi$i. Aborting." + exit 1 + fi +done + +exit 0 diff --git a/priv/humming-consensus-demo.vagrant/Vagrantfile b/priv/humming-consensus-demo.vagrant/Vagrantfile new file mode 100644 index 0000000..8fe04a3 --- /dev/null +++ b/priv/humming-consensus-demo.vagrant/Vagrantfile @@ -0,0 +1,81 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +# All Vagrant configuration is done below. The "2" in Vagrant.configure +# configures the configuration version (we support older styles for +# backwards compatibility). Please don't change it unless you know what +# you're doing. +Vagrant.configure(2) do |config| + # The most common configuration options are documented and commented below. + # For a complete reference, please see the online documentation at + # https://docs.vagrantup.com. + + # Every Vagrant development environment requires a box. You can search for + # boxes at https://atlas.hashicorp.com/search.
+ # If this Vagrant box has not been downloaded before (e.g. using "vagrant box add"), + # then Vagrant will automatically download the VM image from HashiCorp. + config.vm.box = "hashicorp/precise64" + + # Disable automatic box update checking. If you disable this, then + # boxes will only be checked for updates when the user runs + # `vagrant box outdated`. This is not recommended. + # config.vm.box_check_update = false + + # Create a forwarded port mapping which allows access to a specific port + # within the machine from a port on the host machine. In the example below, + # accessing "localhost:8080" will access port 80 on the guest machine. + # config.vm.network "forwarded_port", guest: 80, host: 8080 + + # Create a private network, which allows host-only access to the machine + # using a specific IP. + # config.vm.network "private_network", ip: "192.168.33.10" + + # Create a public network, which generally matched to bridged network. + # Bridged networks make the machine appear as another physical device on + # your network. + # config.vm.network "public_network" + + # Share an additional folder to the guest VM. The first argument is + # the path on the host to the actual folder. The second argument is + # the path on the guest to mount the folder. And the optional third + # argument is a set of non-required options. + # config.vm.synced_folder "../data", "/vagrant_data" + + # Provider-specific configuration so you can fine-tune various + # backing providers for Vagrant. These expose provider-specific options. + # Example for VirtualBox: + # + config.vm.provider "virtualbox" do |vb| + # Display the VirtualBox GUI when booting the machine + # vb.gui = true + + # Customize the amount of memory on the VM: + vb.memory = "512" + end + # + # View the documentation for the provider you are using for more + # information on available options. + + # Define a Vagrant Push strategy for pushing to Atlas. 
Other push strategies + # such as FTP and Heroku are also available. See the documentation at + # https://docs.vagrantup.com/v2/push/atlas.html for more information. + # config.push.define "atlas" do |push| + # push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME" + # end + + # Enable provisioning with a shell script. Additional provisioners such as + # Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the + # documentation for more information about their specific syntax and use. + config.vm.provision "shell", inline: <<-SHELL + sudo apt-get install -y git + git clone https://github.com/slfritchie/slf-configurator.git + chown -R vagrant ./slf-configurator + (cd slf-configurator ; sudo sh -x ./ALL.sh) + echo 'export PATH=${PATH}:/usr/local/erlang/17.5/bin' >> ~vagrant/.bashrc + export PATH=${PATH}:/usr/local/erlang/17.5/bin + + git clone https://github.com/basho/machi.git + (cd machi ; git checkout master ; make test 2>&1 | tee RUNLOG.0) + chown -R vagrant ./machi + SHELL +end From 84f522f865df75b238ca40d4d9862ed502093af1 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 27 Feb 2016 00:05:29 +0900 Subject: [PATCH 19/24] WIP: Vagrant --- doc/humming-consensus-demo.md | 4 +++- priv/humming-consensus-demo.vagrant/Vagrantfile | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/humming-consensus-demo.md b/doc/humming-consensus-demo.md index 198bc55..01e10d1 100644 --- a/doc/humming-consensus-demo.md +++ b/doc/humming-consensus-demo.md @@ -84,7 +84,9 @@ network interface. | dev2 | machi2 | flu2 | 20402 | | dev3 | machi3 | flu3 | 20403 | -The log files for each application instance can be found +The log files for each application instance can be found in the +`./dev/devN/log/console.log` file, where the `N` is the instance +number: 1, 2, or 3. 
# Using the network partition simulator and convergence demo test code diff --git a/priv/humming-consensus-demo.vagrant/Vagrantfile b/priv/humming-consensus-demo.vagrant/Vagrantfile index 8fe04a3..187341b 100644 --- a/priv/humming-consensus-demo.vagrant/Vagrantfile +++ b/priv/humming-consensus-demo.vagrant/Vagrantfile @@ -15,6 +15,11 @@ Vagrant.configure(2) do |config| # If this Vagrant box has not been downloaded before (e.g. using "vagrant box add"), # then Vagrant will automatically download the VM image from HashiCorp. config.vm.box = "hashicorp/precise64" + # If using a FreeBSD box, Bash may not be installed. + # Use the config.ssh.shell setting to specify an alternate shell. + # Note, however, that any code in the 'config.vm.provision' section + # would then have to use this shell's syntax! + # config.ssh.shell = "/bin/csh -l" # Disable automatic box update checking. If you disable this, then # boxes will only be checked for updates when the user runs @@ -67,15 +72,22 @@ Vagrant.configure(2) do |config| # Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the # documentation for more information about their specific syntax and use. 
config.vm.provision "shell", inline: <<-SHELL - sudo apt-get install -y git + # Install prerequsites + # Support here for FreeBSD is experimental + apt-get update ; sudo apt-get install -y git sudo rsync ; # Ubuntu Linux + env ASSUME_ALWAYS_YES=yes pkg install -f git sudo rsync ; # FreeBSD 10 + + # Install dependent packages, using slf-configurator git clone https://github.com/slfritchie/slf-configurator.git chown -R vagrant ./slf-configurator (cd slf-configurator ; sudo sh -x ./ALL.sh) echo 'export PATH=${PATH}:/usr/local/erlang/17.5/bin' >> ~vagrant/.bashrc export PATH=${PATH}:/usr/local/erlang/17.5/bin + ## echo 'set path = ( $path /usr/local/erlang/17.5/bin )' >> ~vagrant/.cshrc + ## setenv PATH /usr/local/erlang/17.5/bin:$PATH git clone https://github.com/basho/machi.git - (cd machi ; git checkout master ; make test 2>&1 | tee RUNLOG.0) + (cd machi ; git checkout master ; make test ) chown -R vagrant ./machi SHELL end From 16153a5d31bb5d6bee6ec6ea5092bc69e820f209 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Sat, 27 Feb 2016 01:56:16 +0900 Subject: [PATCH 20/24] Fix deps building problem, silly --- priv/humming-consensus-demo.vagrant/Vagrantfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/priv/humming-consensus-demo.vagrant/Vagrantfile b/priv/humming-consensus-demo.vagrant/Vagrantfile index 187341b..ce0474d 100644 --- a/priv/humming-consensus-demo.vagrant/Vagrantfile +++ b/priv/humming-consensus-demo.vagrant/Vagrantfile @@ -87,7 +87,7 @@ Vagrant.configure(2) do |config| ## setenv PATH /usr/local/erlang/17.5/bin:$PATH git clone https://github.com/basho/machi.git - (cd machi ; git checkout master ; make test ) + (cd machi ; git checkout master ; make && make test ) chown -R vagrant ./machi SHELL end From 4e5c16f5e2d3d8865a182976d4cb6f7d754f44d4 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Wed, 9 Mar 2016 10:30:23 -0800 Subject: [PATCH 21/24] WIP --- doc/humming-consensus-demo.md | 57 
+++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/doc/humming-consensus-demo.md b/doc/humming-consensus-demo.md index 01e10d1..d00637c 100644 --- a/doc/humming-consensus-demo.md +++ b/doc/humming-consensus-demo.md @@ -72,6 +72,20 @@ Please run this script using this command: ./priv/humming-consensus-demo.setup.sh +If the output looks like this (and exits with status zero), then the +script was successful. + + Step: Verify that the required entries in /etc/hosts are present + Step: add a verbose logging option to app.config + Step: start three three Machi application instances + pong + pong + pong + Step: configure one chain to start a Humming Consensus group with three members + Result: ok + Result: ok + Result: ok + We have now created a single replica chain, called `c1`, that has three file servers participating in the chain. Thanks to the hostnames that we added to `/etc/hosts`, all are using the localhost @@ -88,6 +102,49 @@ The log files for each application instance can be found in the `./dev/devN/log/console.log` file, where the `N` is the instance number: 1, 2, or 3. +## Understanding the chain manager's log file output + +After running the `./priv/humming-consensus-demo.setup.sh` script, +let's look at the last few lines of the `./dev/dev1/log/console.log` +log file for Erlang VM process #1. 
+ + 2016-03-09 10:16:35.676 [info] <0.105.0>@machi_lifecycle_mgr:process_pending_flu:422 Started FLU f1 with supervisor pid <0.128.0> + 2016-03-09 10:16:35.676 [info] <0.105.0>@machi_lifecycle_mgr:move_to_flu_config:540 Creating FLU config file f1 + 2016-03-09 10:16:35.790 [info] <0.105.0>@machi_lifecycle_mgr:bootstrap_chain2:312 Configured chain c1 via FLU f1 to mode=ap_mode all=[f1,f2,f3] witnesses=[] + 2016-03-09 10:16:35.790 [info] <0.105.0>@machi_lifecycle_mgr:move_to_chain_config:546 Creating chain config file c1 + 2016-03-09 10:16:44.139 [info] <0.132.0> CONFIRM epoch 1141 <<155,42,7,221>> upi [] rep [] auth f1 by f1 + 2016-03-09 10:16:44.271 [info] <0.132.0> CONFIRM epoch 1148 <<57,213,154,16>> upi [f1] rep [] auth f1 by f1 + 2016-03-09 10:16:44.864 [info] <0.132.0> CONFIRM epoch 1151 <<239,29,39,70>> upi [f1] rep [f3] auth f1 by f1 + 2016-03-09 10:16:45.235 [info] <0.132.0> CONFIRM epoch 1152 <<173,17,66,225>> upi [f2] rep [f1,f3] auth f2 by f1 + 2016-03-09 10:16:47.343 [info] <0.132.0> CONFIRM epoch 1154 <<154,231,224,149>> upi [f2,f1,f3] rep [] auth f2 by f1 + +Let's pick apart some of these lines. + +* `Started FLU f1 with supervisor pid <0.128.0>` ; This VM, #1, + started a FLU (Machi data server) with the name `f1`. In the Erlang + process supervisor hierarchy, the process ID of the top supervisor + is `<0.128.0>`. +* `Configured chain c1 via FLU f1 to mode=ap_mode all=[f1,f2,f3] witnesses=[]` + A bootstrap configuration for a chain named `c1` has been created. + * The FLUs/data servers that are eligible for participation in the + chain have names `f1`, `f2`, and `f3`. + * The chain will operate in eventual consistency mode (`ap_mode`) + * The witness server list is empty. Witness servers are never used + in eventual consistency mode. +* `CONFIRM epoch 1141 <<155,42,7,221>> upi [] rep [] auth f1 by f1` + * All participants in epoch 1141 are unanimous in adopting epoch + 1141's projection. 
All active membership lists are empty, so + there is no functional chain replication yet, at least as far as + server `f1` knows. + * The epoch's abbreviated checksum is `<<155,42,7,221>>`. + * The UPI list, i.e., the replicas whose data is 100% in sync, is + `[]`, the empty list. (UPI = Update Propagation Invariant) + * The list of servers that are under data repair (`rep`) is also + empty, `[]`. + * This projection was authored by server `f1`. + * The log message was generated by server `f1`. + + # Using the network partition simulator and convergence demo test code From cd166361aa61b156fbe344dea134ecc5aa67bfc6 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Wed, 9 Mar 2016 10:48:00 -0800 Subject: [PATCH 22/24] WIP --- doc/humming-consensus-demo.md | 53 +++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/doc/humming-consensus-demo.md b/doc/humming-consensus-demo.md index d00637c..5ef8e8d 100644 --- a/doc/humming-consensus-demo.md +++ b/doc/humming-consensus-demo.md @@ -118,14 +118,18 @@ log file for Erlang VM process #1. 2016-03-09 10:16:45.235 [info] <0.132.0> CONFIRM epoch 1152 <<173,17,66,225>> upi [f2] rep [f1,f3] auth f2 by f1 2016-03-09 10:16:47.343 [info] <0.132.0> CONFIRM epoch 1154 <<154,231,224,149>> upi [f2,f1,f3] rep [] auth f2 by f1 -Let's pick apart some of these lines. +Let's pick apart some of these lines. We have started all three +servers at about the same time. We see some race conditions happen, +and some jostling and readjustment happens pretty quickly in the first +few seconds. -* `Started FLU f1 with supervisor pid <0.128.0>` ; This VM, #1, +* `Started FLU f1 with supervisor pid <0.128.0>` + * This VM, #1, started a FLU (Machi data server) with the name `f1`. In the Erlang process supervisor hierarchy, the process ID of the top supervisor is `<0.128.0>`. 
* `Configured chain c1 via FLU f1 to mode=ap_mode all=[f1,f2,f3] witnesses=[]` - A bootstrap configuration for a chain named `c1` has been created. + * A bootstrap configuration for a chain named `c1` has been created. * The FLUs/data servers that are eligible for participation in the chain have names `f1`, `f2`, and `f3`. * The chain will operate in eventual consistency mode (`ap_mode`) @@ -143,6 +147,49 @@ Let's pick apart some of these lines. empty, `[]`. * This projection was authored by server `f1`. * The log message was generated by server `f1`. +* `CONFIRM epoch 1148 <<57,213,154,16>> upi [f1] rep [] auth f1 by f1` + * Now the server `f1` has created a chain of length 1, `[f1]`. + * Chain repair/file re-sync is not required when the UPI server list + changes from length 0 -> 1. +* `CONFIRM epoch 1151 <<239,29,39,70>> upi [f1] rep [f3] auth f1 by f1` + * Server `f1` has noticed that server `f3` is alive. Apparently it + has not yet noticed that server `f2` is also running. + * Server `f3` is in the repair list. +* `CONFIRM epoch 1152 <<173,17,66,225>> upi [f2] rep [f1,f3] auth f2 by f1` + * Server `f2` is apparently now aware that all three servers are running. + * The previous configuration used by `f2` was `upi [f2]`, i.e., `f2` + was running in a chain of one. `f2` noticed that `f1` and `f3` + were now available and has started adding them to the chain. + * All new servers are always added to the tail of the chain. + * In eventual consistency mode, a UPI change like this is OK. + * When performing a read, a client must read from both the tail of the + UPI list and also from all repairing servers. + * When performing a write, the client writes to both the UPI + server list and also the repairing list, in that order. + * Server `f2` will trigger file repair/re-sync shortly. + * The waiting time for starting repair has been configured to be + extremely short, 1 second. The default waiting time is 10 + seconds, in case Humming Consensus remains unstable. 
+* `CONFIRM epoch 1154 <<154,231,224,149>> upi [f2,f1,f3] rep [] auth f2 by f1` + * File repair/re-sync has finished. All file data on all servers + are now in sync. + * The UPI/in-sync part of the chain is now `[f2,f1,f3]`, and there + are no servers under repair. + +## Let's create some failures + +Here are some suggestions for creating failures. + +* Use the `./dev/devN/bin/machi stop` and ``./dev/devN/bin/machi start` + commands to stop & start VM #`N`. +* Stop a VM abnormally by using `kill`. The OS process name to look + for is `beam.smp`. +* Suspend and resume a VM, using the `SIGSTOP` and `SIGCONT` signals. + * E.g. `kill -STOP 9823` and `kill -CONT 9823` + +The network partition simulator is not (yet) available when running +Machi in this mode. Please see the next section for instructions on +how to use the partition simulator. From 96c46ec5aa0610c566814c591a1c82022852fa09 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Wed, 9 Mar 2016 10:53:12 -0800 Subject: [PATCH 23/24] Add explanation for the 'CONFIRM' log messages --- doc/humming-consensus-demo.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/humming-consensus-demo.md b/doc/humming-consensus-demo.md index 5ef8e8d..ffed8bb 100644 --- a/doc/humming-consensus-demo.md +++ b/doc/humming-consensus-demo.md @@ -160,12 +160,15 @@ few seconds. * The previous configuration used by `f2` was `upi [f2]`, i.e., `f2` was running in a chain of one. `f2` noticed that `f1` and `f3` were now available and has started adding them to the chain. - * All new servers are always added to the tail of the chain. + * All new servers are always added to the tail of the chain in the + repair list. * In eventual consistency mode, a UPI change like this is OK. * When performing a read, a client must read from both the tail of the UPI list and also from all repairing servers. * When performing a write, the client writes to both the UPI server list and also the repairing list, in that order. 
+ * I.e., the client concatenates both lists, + `UPI ++ Repairing`, for its chain configuration for the write. * Server `f2` will trigger file repair/re-sync shortly. * The waiting time for starting repair has been configured to be extremely short, 1 second. The default waiting time is 10 @@ -180,7 +183,7 @@ few seconds. Here are some suggestions for creating failures. -* Use the `./dev/devN/bin/machi stop` and ``./dev/devN/bin/machi start` +* Use the `./dev/devN/bin/machi stop` and `./dev/devN/bin/machi start` commands to stop & start VM #`N`. * Stop a VM abnormally by using `kill`. The OS process name to look for is `beam.smp`. From 6b000f6e7c32cfd4730af972f4aa8e23013a1950 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Wed, 9 Mar 2016 11:14:43 -0800 Subject: [PATCH 24/24] Ignore +rel/vars/dev*vars.config --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c6a0bf2..063a61d 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ include/machi_pb.hrl # Release packaging rel/machi +rel/vars/dev*vars.config # Misc Scott cruft *.patch