Merge branch 'slf/hc-demo-env'

2016-03-09 11:16:35 -08:00 · 2016-03-09 11:16:35 -08:00 · 6cddfcf988
commit 6cddfcf988
parent c2e9a83372 6b000f6e7c
22 changed files with 824 additions and 253 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,7 +2,9 @@ prototype/chain-manager/patch.*
 .eqc-info
 .eunit
 deps
+dev
 erl_crash.dump
+eqc
 .concrete/DEV_MODE
 .rebar
 edoc
@ -20,6 +22,7 @@ include/machi_pb.hrl

 # Release packaging
 rel/machi
+rel/vars/dev*vars.config

 # Misc Scott cruft
 *.patch
--- a/Makefile
+++ b/Makefile
@ -10,7 +10,7 @@ endif
 OVERLAY_VARS    ?=
 EUNIT_OPTS       = -v

-.PHONY: rel deps package pkgclean edoc
+.PHONY: rel stagedevrel deps package pkgclean edoc

 all: deps compile

@ -57,6 +57,37 @@ relclean:
 stage : rel
 	$(foreach dep,$(wildcard deps/*), rm -rf rel/$(REPO)/lib/$(shell basename $(dep))* && ln -sf $(abspath $(dep)) rel/$(REPO)/lib;)

+##
+## Developer targets
+##
+##  devN - Make a dev build for node N
+##  stagedevN - Make a stage dev build for node N (symlink libraries)
+##  devrel - Make a dev build for 1..$DEVNODES
+##  stagedevrel - Make a stagedev build for 1..$DEVNODES
+##
+##  Example, make a 68 node devrel cluster
+##    make stagedevrel DEVNODES=68
+
+.PHONY : stagedevrel devrel
+# Number of dev nodes to build; may be overridden on the make command line.
+DEVNODES ?= 3
+
+# 'seq' is not available on all *BSD, so using an alternate in awk
+# SEQ expands to the space-separated list "1 2 ... $(DEVNODES)": the loop
+# prints 1..DEVNODES-1 and the trailing 'print i' emits the final number.
+SEQ = $(shell awk 'BEGIN { for (i = 1; i < '$(DEVNODES)'; i++) printf("%i ", i); print i ;exit(0);}')
+
+# Generate the aggregate rules: stagedevrel depends on stagedev1..stagedevN
+# and devrel depends on dev1..devN, for every N in $(SEQ).
+$(eval stagedevrel : $(foreach n,$(SEQ),stagedev$(n)))
+$(eval devrel : $(foreach n,$(SEQ),dev$(n)))
+
+# Build one dev node.  rel/gen_dev is given the target name, the template
+# rel/vars/dev_vars.config.src and the output path rel/vars/devN_vars.config
+# (presumably it fills in the per-node values -- confirm in rel/gen_dev);
+# rebar then generates the release into dev/devN using those overlay vars.
+dev% : all
+	mkdir -p dev
+	rel/gen_dev $@ rel/vars/dev_vars.config.src rel/vars/$@_vars.config
+	(cd rel && ../rebar generate target_dir=../dev/$@ overlay_vars=vars/$@_vars.config)
+
+# After devN is built, replace each dependency's directory under
+# dev/devN/lib with a symlink to the source tree's deps/<dep>.
+# $^ is the single prerequisite, i.e. the matching devN target name.
+stagedev% : dev%
+	  $(foreach dep,$(wildcard deps/*), rm -rf dev/$^/lib/$(shell basename $(dep))* && ln -sf $(abspath $(dep)) dev/$^/lib;)
+
+# Run the regular 'clean' target, then remove every dev build.
+devclean: clean
+	rm -rf dev
+
 DIALYZER_APPS = kernel stdlib sasl erts ssl compiler eunit crypto public_key syntax_tools
 PLT = $(HOME)/.machi_dialyzer_plt

--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-# Machi: a robust & reliable, distributed, highly available, large file store
+# Machi: a distributed, decentralized blob/large file store

  [Travis-CI](http://travis-ci.org/basho/machi) :: ![Travis-CI](https://secure.travis-ci.org/basho/machi.png)

@ -64,6 +64,9 @@ Humming Consensus" is available online now.
 * [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf)
 * [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q)

+See later in this document for how to run the Humming Consensus demos,
+including the network partition simulator.
+
 <a name="sec3">
 ## 3. Development status summary

@ -99,10 +102,10 @@ Mid-December 2015: work is underway.
        * The Erlang language client implementation of the high-level
          protocol flavor is brittle (e.g., little error handling yet).

-If you would like to run the network partition simulator
-mentioned in the Ricon 2015 presentation about Humming Consensus,
-please see the
-[partition simulator convergence test doc.](./doc/machi_chain_manager1_converge_demo.md)
+If you would like to run the Humming Consensus code (with or without
+the network partition simulator) as described in the RICON 2015
+presentation, please see the
+[Humming Consensus demo doc](./doc/humming-consensus-demo.md).

 If you'd like to work on a protocol such as Thrift, UBF,
 msgpack over UDP, or some other protocol, let us know by
@ -134,10 +137,13 @@ X.  The only known limitations for using R16 are minor type
 specification difference between R16 and 17, but we strongly suggest
 continuing development using version 17.

-We also assume that you have the standard UNIX/Linux developers
-tool chain for C and C++ applications.  Specifically, we assume `make`
-is available.  The utility used to compile the Machi source code,
+We also assume that you have the standard UNIX/Linux developer
+tool chain for C and C++ applications.  Also, we assume
+that Git and GNU Make are available.
+The utility used to compile the Machi source code,
 `rebar`, is pre-compiled and included in the repo.
+For more details, please see the
+[Machi development environment prerequisites doc](./doc/dev-prerequisites.md).

 Machi has a dependency on the
 [ELevelDB](https://github.com/basho/eleveldb) library.  ELevelDB only
--- a/doc/dev-clone-compile.md
+++ b/doc/dev-clone-compile.md
@ -0,0 +1,30 @@
+# Clone and compile Machi
+
+Clone the Machi source repo and compile the source and test code.  Run
+the following commands at your login shell:
+
+    cd /tmp
+    git clone https://github.com/basho/machi.git
+    cd machi
+    git checkout master
+    make          # or 'gmake' if GNU make uses an alternate name
+
+Then run the unit test suite.  This may take up to two minutes or so
+to finish.
+
+    make test
+
+At the end, the test suite should report that all tests passed.  The
+actual number of tests shown in the "All `X` tests passed" line may be
+different than the example below.
+
+    [... many lines omitted ...]
+    module 'event_logger'
+    module 'chain_mgr_legacy'
+    =======================================================
+      All 90 tests passed.
+
+If you had a test failure, a likely cause may be a limit on the number
+of file descriptors available to your user process.  (Recent releases
+of OS X have a limit of 1024 file descriptors, which may be too low.)
+The output of the `ulimit -n` command will tell you your file descriptor limit.
--- a/doc/dev-prerequisites.md
+++ b/doc/dev-prerequisites.md
@ -0,0 +1,38 @@
+## Machi developer environment prerequisites
+
+1. Machi requires a 64-bit variant of UNIX: an OS X, FreeBSD, Linux, or
+   Solaris machine with a standard developer environment for C and C++
+   applications (64-bit versions).
+2. You'll need the `git` source management utility.
+3. You'll need the 64-bit Erlang/OTP 17 runtime environment.  Please
+   don't use earlier or later versions until we have a chance to fix
+   the compilation warnings that versions R16B and 18 will trigger.
+   Also, please verify that you are not using a 32-bit Erlang/OTP
+   runtime package.
+
+For `git` and the Erlang runtime, please use your OS-specific
+package manager to install these.  If your package manager doesn't
+have 64-bit Erlang/OTP version 17 available, then we recommend using the
+[precompiled packages available at Erlang Solutions](https://www.erlang-solutions.com/resources/download.html).
+
+Also, please verify that you have enough file descriptors available to
+your user processes.  The output of `ulimit -n` should report at least
+4,000 file descriptors available.  If your limit is lower (a frequent
+problem for OS X users), please increase it to at least 4,000.
+
+# Using Vagrant to set up a developer environment for Machi
+
+The Machi source directory contains a `Vagrantfile` for creating an
+Ubuntu Linux-based virtual machine for compiling and running Machi.
+This file is in the
+[$SRC_TOP/priv/humming-consensus-demo.vagrant](../priv/humming-consensus-demo.vagrant)
+directory.
+
+If used as-is, the virtual machine specification is modest.
+
+* 1 virtual CPU
+* 512MB virtual memory
+* 768MB swap space
+* 79GB sparse virtual disk image.  After installing prerequisites and
+  compiling Machi, the root file system uses approximately 2.7 GBytes.
+
--- a/doc/humming-consensus-demo.md
+++ b/doc/humming-consensus-demo.md
@ -0,0 +1,372 @@
+
+# Table of contents
+
+* [Hands-on experiments with Machi and Humming Consensus](#hands-on)
+* [Using the network partition simulator and convergence demo test code](#partition-simulator)
+
+<a name="hands-on">
+# Hands-on experiments with Machi and Humming Consensus
+
+## Prerequisites
+
+Please refer to the
+[Machi development environment prerequisites doc](./dev-prerequisites.md)
+for Machi developer environment prerequisites.
+
+If you do not have an Erlang/OTP runtime system available, but you do
+have [the Vagrant virtual machine](https://www.vagrantup.com/) manager
+available, then please refer to the instructions in the prerequisites
+doc for using Vagrant.
+
+<a name="clone-compile">
+## Clone and compile the code
+
+Please see the
+[Machi 'clone and compile' doc](./dev-clone-compile.md)
+for the short list of steps required to fetch the Machi source code
+from GitHub and to compile &amp; test Machi.
+
+## Running three Machi instances on a single machine
+
+All of the commands that should be run at your login shell (e.g. Bash,
+c-shell) can be cut-and-pasted from this document directly to your
+login shell prompt.
+
+Run the following command:
+
+    make stagedevrel
+
+This will create a directory structure like this:
+          
+          |-dev1-|... stand-alone Machi app + subdirectories
+    |-dev-|-dev2-|... stand-alone Machi app + directories
+          |-dev3-|... stand-alone Machi app + directories
+    
+Each of the `dev/dev1`, `dev/dev2`, and `dev/dev3` are stand-alone
+application instances of Machi and can be run independently of each
+other on the same machine.  This demo will use all three.
+
+The lifecycle management utilities for Machi are a bit immature,
+currently.  They assume that each Machi server runs on a host with a
+unique hostname -- there is no flexibility built-in yet to easily run
+multiple Machi instances on the same machine.  To continue with the
+demo, we need to use `sudo` or `su` to obtain superuser privileges to
+edit the `/etc/hosts` file.
+
+Please add the following line to `/etc/hosts`, using this command:
+
+    sudo sh -c 'echo "127.0.0.1 machi1 machi2 machi3" >> /etc/hosts'
+
+Next, we will use a shell script to finish setting up our cluster.  It
+will do the following for us:
+
+* Verify that the new line that was added to `/etc/hosts` is correct.
+* Modify the `etc/app.config` files so that the Humming Consensus
+  chain manager's actions are logged to the `log/console.log` file.
+* Start the three application instances.
+* Verify that the three instances are running correctly.
+* Configure a single chain, with one FLU server per application
+  instance.
+
+Please run this script using this command:
+
+    ./priv/humming-consensus-demo.setup.sh
+
+If the output looks like this (and exits with status zero), then the
+script was successful.
+
+    Step: Verify that the required entries in /etc/hosts are present
+    Step: add a verbose logging option to app.config
+    Step: start three three Machi application instances
+    pong
+    pong
+    pong
+    Step: configure one chain to start a Humming Consensus group with three members
+    Result: ok
+    Result: ok
+    Result: ok
+
+We have now created a single replica chain, called `c1`, that has
+three file servers participating in the chain.  Thanks to the
+hostnames that we added to `/etc/hosts`, all are using the localhost
+network interface.
+
+    | App instance | Pseudo   | FLU name | TCP port |
+    | directory    | Hostname |          |   number |
+    |--------------+----------+----------+----------|
+    | dev1         | machi1   | f1       |    20401 |
+    | dev2         | machi2   | f2       |    20402 |
+    | dev3         | machi3   | f3       |    20403 |
+
+The log files for each application instance can be found in the
+`./dev/devN/log/console.log` file, where the `N` is the instance
+number: 1, 2, or 3.
+
+## Understanding the chain manager's log file output
+
+After running the `./priv/humming-consensus-demo.setup.sh` script,
+let's look at the last few lines of the `./dev/dev1/log/console.log`
+log file for Erlang VM process #1.
+
+    2016-03-09 10:16:35.676 [info] <0.105.0>@machi_lifecycle_mgr:process_pending_flu:422 Started FLU f1 with supervisor pid <0.128.0>
+    2016-03-09 10:16:35.676 [info] <0.105.0>@machi_lifecycle_mgr:move_to_flu_config:540 Creating FLU config file f1
+    2016-03-09 10:16:35.790 [info] <0.105.0>@machi_lifecycle_mgr:bootstrap_chain2:312 Configured chain c1 via FLU f1 to mode=ap_mode all=[f1,f2,f3] witnesses=[]
+    2016-03-09 10:16:35.790 [info] <0.105.0>@machi_lifecycle_mgr:move_to_chain_config:546 Creating chain config file c1
+    2016-03-09 10:16:44.139 [info] <0.132.0> CONFIRM epoch 1141 <<155,42,7,221>> upi [] rep [] auth f1 by f1
+    2016-03-09 10:16:44.271 [info] <0.132.0> CONFIRM epoch 1148 <<57,213,154,16>> upi [f1] rep [] auth f1 by f1
+    2016-03-09 10:16:44.864 [info] <0.132.0> CONFIRM epoch 1151 <<239,29,39,70>> upi [f1] rep [f3] auth f1 by f1
+    2016-03-09 10:16:45.235 [info] <0.132.0> CONFIRM epoch 1152 <<173,17,66,225>> upi [f2] rep [f1,f3] auth f2 by f1
+    2016-03-09 10:16:47.343 [info] <0.132.0> CONFIRM epoch 1154 <<154,231,224,149>> upi [f2,f1,f3] rep [] auth f2 by f1
+
+Let's pick apart some of these lines.  We have started all three
+servers at about the same time.  We see some race conditions happen,
+and some jostling and readjustment happens pretty quickly in the first
+few seconds.
+
+* `Started FLU f1 with supervisor pid <0.128.0>`
+  * This VM, #1,
+  started a FLU (Machi data server) with the name `f1`.  In the Erlang
+  process supervisor hierarchy, the process ID of the top supervisor
+  is `<0.128.0>`.
+* `Configured chain c1 via FLU f1 to mode=ap_mode all=[f1,f2,f3] witnesses=[]`
+  * A bootstrap configuration for a chain named `c1` has been created.
+  * The FLUs/data servers that are eligible for participation in the
+    chain have names `f1`, `f2`, and `f3`.
+  * The chain will operate in eventual consistency mode (`ap_mode`)
+  * The witness server list is empty.  Witness servers are never used
+    in eventual consistency mode.
+* `CONFIRM epoch 1141 <<155,42,7,221>> upi [] rep [] auth f1 by f1`
+  * All participants in epoch 1141 are unanimous in adopting epoch
+    1141's projection.  All active membership lists are empty, so
+    there is no functional chain replication yet, at least as far as
+    server `f1` knows
+  * The epoch's abbreviated checksum is `<<155,42,7,221>>`.
+  * The UPI list, i.e. the replicas whose data is 100% in sync, is
+    `[]`, the empty list.  (UPI = Update Propagation Invariant)
+  * The list of servers that are under data repair (`rep`) is also
+    empty, `[]`.
+  * This projection was authored by server `f1`.
+  * The log message was generated by server `f1`.
+* `CONFIRM epoch 1148 <<57,213,154,16>> upi [f1] rep [] auth f1 by f1`
+  * Now the server `f1` has created a chain of length 1, `[f1]`.
+  * Chain repair/file re-sync is not required when the UPI server list
+    changes from length 0 -> 1.
+* `CONFIRM epoch 1151 <<239,29,39,70>> upi [f1] rep [f3] auth f1 by f1`
+  * Server `f1` has noticed that server `f3` is alive.  Apparently it
+    has not yet noticed that server `f2` is also running.
+  * Server `f3` is in the repair list.
+* `CONFIRM epoch 1152 <<173,17,66,225>> upi [f2] rep [f1,f3] auth f2 by f1`
+  * Server `f2` is apparently now aware that all three servers are running.
+  * The previous configuration used by `f2` was `upi [f2]`, i.e., `f2`
+    was running in a chain of one.  `f2` noticed that `f1` and `f3`
+    were now available and has started adding them to the chain.
+  * All new servers are always added to the tail of the chain in the
+    repair list.
+  * In eventual consistency mode, a UPI change like this is OK.
+    * When performing a read, a client must read from both the tail of
+      the UPI list and also from all repairing servers.
+    * When performing a write, the client writes to both the UPI
+      server list and also the repairing list, in that order.
+      * I.e., the client concatenates both lists,
+      `UPI ++ Repairing`, for its chain configuration for the write.
+  * Server `f2` will trigger file repair/re-sync shortly.
+    * The waiting time for starting repair has been configured to be
+      extremely short, 1 second.  The default waiting time is 10
+      seconds, in case Humming Consensus remains unstable.
+* `CONFIRM epoch 1154 <<154,231,224,149>> upi [f2,f1,f3] rep [] auth f2 by f1`
+  * File repair/re-sync has finished.  All file data on all servers
+    are now in sync.
+  * The UPI/in-sync part of the chain is now `[f2,f1,f3]`, and there
+    are no servers under repair.
+
+## Let's create some failures
+
+Here are some suggestions for creating failures.
+
+* Use the `./dev/devN/bin/machi stop` and `./dev/devN/bin/machi start`
+  commands to stop & start VM #`N`.
+* Stop a VM abnormally by using `kill`.  The OS process name to look
+  for is `beam.smp`.
+* Suspend and resume a VM, using the `SIGSTOP` and `SIGCONT` signals.
+  * E.g. `kill -STOP 9823` and `kill -CONT 9823`
+
+The network partition simulator is not (yet) available when running
+Machi in this mode.  Please see the next section for instructions on
+how to use the partition simulator.
+
+
+<a name="partition-simulator">
+# Using the network partition simulator and convergence demo test code
+
+This is the demo code mentioned in the presentation that Scott Lystig
+Fritchie gave at the
+[RICON 2015 conference](http://ricon.io).
+* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf)
+* [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q)
+
+## A complete example of all input and output
+
+If you don't have an Erlang/OTP 17 runtime environment available,
+please see this file for full input and output of a strong consistency
+length=3 chain test:
+https://gist.github.com/slfritchie/8352efc88cc18e62c72c
+This file contains all commands input and all simulator output from a
+sample run of the simulator.
+
+To help interpret the output of the test, please skip ahead to the
+"The test output is very verbose" section.
+
+## Prerequisites
+
+If you don't have `git` and/or the Erlang 17 runtime system available
+on your OS X, FreeBSD, Linux, or Solaris machine, please take a look
+at the [Prerequisites section](#prerequisites) first.  When you have
+installed the prerequisite software, please return here.
+
+## Clone and compile the code
+
+Please briefly visit the [Clone and compile the code](#clone-compile)
+section.  When finished, please return back here.
+
+## Run an interactive Erlang CLI shell
+
+Run the following command at your login shell:
+
+    erl -pz .eunit ebin deps/*/ebin
+
+If you are using Erlang/OTP version 17, you should see some CLI output
+that looks like this:
+
+    Erlang/OTP 17 [erts-6.4] [source] [64-bit] [smp:8:8] [async-threads:10] [hipe] [kernel-poll:false] [dtrace]
+    
+    Eshell V6.4  (abort with ^G)
+    1>
+
+## The test output is very verbose ... what are the important parts?
+
+The output of the Erlang command
+`machi_chain_manager1_converge_demo:help()` will display the following
+guide to the output of the tests.
+
+    A visualization of the convergence behavior of the chain self-management
+    algorithm for Machi.
+    
+      1. Set up some server and chain manager pairs.
+      2. Create a number of different network partition scenarios, where
+         (simulated) partitions may be symmetric or asymmetric.  Then stop changing
+         the partitions and keep the simulated network stable (and perhaps broken).
+      3. Run a number of iterations of the algorithm in parallel by poking each
+         of the manager processes on a random'ish basis.
+      4. Afterward, fetch the chain transition changes made by each FLU and
+         verify that no transition was unsafe.
+    
+    During the iteration periods, the following is a cheatsheet for the output.
+    See the internal source for interpreting the rest of the output.
+    
+        'SET partitions = '
+    
+            A pair-wise list of actors which cannot send messages.  The
+            list is uni-directional.  If there are three servers (a,b,c),
+            and if the partitions list is '[{a,b},{b,c}]' then all
+            messages from a->b and b->c will be dropped, but any other
+            sender->recipient messages will be delivered successfully.
+    
+        'x uses:'
+    
+            The FLU x has made an internal state transition and is using
+            this epoch's projection as operating chain configuration.  The
+            rest of the line is a summary of the projection.
+    
+        'CONFIRM epoch {N}'
+    
+            This message confirms that all of the servers listed in the
+            UPI and repairing lists of the projection at epoch {N} have
+            agreed to use this projection because they all have written
+            this projection to their respective private projection stores.
+            The chain is now usable by/available to all clients.
+    
+        'Sweet, private projections are stable'
+    
+            This report announces that this iteration of the test cycle
+            has passed successfully.  The report that follows briefly
+            summarizes the latest private projection used by each
+            participating server.  For example, when in strong consistency
+            mode with 'a' as a witness and 'b' and 'c' as real servers:
+    
+            %% Legend:
+            %% server name, epoch ID, UPI list, repairing list, down list, ...
+            %%                         ... witness list, 'false' (a constant value)
+    
+            [{a,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}},
+             {b,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}]
+    
+            Both servers 'a' and 'b' agree on epoch 1116 with epoch ID
+            {1116,<<23,143,246,55>>} where UPI=[a,b], repairing=[],
+            down=[c], and witnesses=[a].
+    
+            Server 'c' is not shown because 'c' has wedged itself OOS (out
+            of service) by configuring a chain length of zero.
+    
+            If no servers are listed in the report (i.e. only '[]' is
+            displayed), then all servers have wedged themselves OOS, and
+            the chain is unavailable.
+    
+        'DoIt,' 
+    
+            This marks a group of tick events which trigger the manager
+            processes to evaluate their environment and perhaps make a
+            state transition.
+    
+    A long chain of 'DoIt,DoIt,DoIt,' means that the chain state has
+    (probably) settled to a stable configuration, which is the goal of the
+    algorithm.
+    
+    Press control-c to interrupt the test....
+
+## Run a test in eventual consistency mode
+
+Run the following command at the Erlang CLI prompt:
+
+    machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}]).
+
+The first argument, `3`, is the number of servers to participate in
+the chain.  Please note:
+
+* Chain lengths as short as 1 or 2 are valid, but the results are a
+  bit boring.
+* Chain lengths as long as 7 or 9 can be used, but they may
+  suffer from longer periods of churn/instability before all chain
+  managers reach agreement via humming consensus.  (It is future work
+  to shorten the worst of the unstable churn latencies.)
+* In eventual consistency mode, chain lengths may be even numbers,
+  e.g. 2, 4, or 6.
+* The simulator will choose partition events from the permutations of
+  all 1, 2, and 3 node partition pairs.  The total runtime will
+  increase *dramatically* with chain length.
+    * Chain length 2: about 3 partition cases
+    * Chain length 3: about 35 partition cases
+    * Chain length 4: about 230 partition cases
+    * Chain length 5: about 1100 partition cases
+
+## Run a test in strong consistency mode (with witnesses):
+
+*NOTE:* Due to a bug in the test code, please do not try to run the
+ convergence test in strong consistency mode and also without the
+ correct minority number of witness servers!  If in doubt, please run
+ the commands shown below exactly.
+
+Run the following command at the Erlang CLI prompt:
+
+    machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]).
+
+The first argument, `3`, is the number of servers to participate in
+the chain.  Chain lengths as long as 7 or 9 can be used, but they may
+suffer from longer periods of churn/instability before all chain
+managers reach agreement via humming consensus.
+
+Due to the bug mentioned above, please use the following
+commands when running with chain lengths of 5 or 7, respectively.
+
+    machi_chain_manager1_converge_demo:t(5, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b]}]).
+    machi_chain_manager1_converge_demo:t(7, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b,c]}]).
+
--- a/doc/machi_chain_manager1_converge_demo.md
+++ b/doc/machi_chain_manager1_converge_demo.md
@ -1,185 +0,0 @@
-
-# Using the network partition simulator and convergence demo test code
-
-## A complete example of all input and output
-
-If you don't have an Erlang/OTP 17 runtime environment available,
-please see this file for full input and output of a strong consistency
-length=3 chain test:
-https://gist.github.com/slfritchie/8352efc88cc18e62c72c
-This file contains all commands input and all simulator output from a
-sample run of the simulator.
-
-To help interpret the output of the test, please skip ahead to the
-"The test output is very verbose" section.
-
-## Prerequisites
-
-1. You'll need the `git` source management 
-2. You'll need the Erlang/OTP 17 runtime environment.  Please don't
-   use earlier or later versions until we have a chance to fix the
-   compilation warnings that versions R16B and 18 will trigger.
-
-All of the commands that should be run at your login shell (e.g. Bash,
-c-shell) can be cut-and-pasted from this document directly to your
-login shell prompt.
-
-## Clone and compile the code
-
-Clone the Machi source repo and compile the source and test code.  Run
-the following commands at your login shell:
-
-    cd /tmp
-    git clone https://github.com/basho/machi.git
-    cd machi
-    git checkout master
-    make
-
-Then run the unit test suite.  This may take up to two minutes or so
-to finish.  Most of the tests will be silent; please be patient until
-the tests finish.
-
-    make test
-
-## Run an interactive Erlang CLI shell
-
-Run the following command at your login shell:
-
-    erl -pz .eunit ebin deps/*/ebin
-
-If you are using Erlang/OTP version 17, you should see some CLI output
-that looks like this:
-
-    Erlang/OTP 17 [erts-6.4] [source] [64-bit] [smp:8:8] [async-threads:10] [hipe] [kernel-poll:false] [dtrace]
-    
-    Eshell V6.4  (abort with ^G)
-    1>
-
-## The test output is very verbose ... what are the important parts?
-
-The output of the Erlang command
-`machi_chain_manager1_converge_demo:help()` will display the following
-guide to the output of the tests.
-
-    A visualization of the convergence behavior of the chain self-management
-    algorithm for Machi.
-    
-      1. Set up some server and chain manager pairs.
-      2. Create a number of different network partition scenarios, where
-         (simulated) partitions may be symmetric or asymmetric.  Then stop changing
-         the partitions and keep the simulated network stable (and perhaps broken).
-      3. Run a number of iterations of the algorithm in parallel by poking each
-         of the manager processes on a random'ish basis.
-      4. Afterward, fetch the chain transition changes made by each FLU and
-         verify that no transition was unsafe.
-    
-    During the iteration periods, the following is a cheatsheet for the output.
-    See the internal source for interpreting the rest of the output.
-    
-        'SET partitions = '
-    
-            A pair-wise list of actors which cannot send messages.  The
-            list is uni-directional.  If there are three servers (a,b,c),
-            and if the partitions list is '[{a,b},{b,c}]' then all
-            messages from a->b and b->c will be dropped, but any other
-            sender->recipient messages will be delivered successfully.
-    
-        'x uses:'
-    
-            The FLU x has made an internal state transition and is using
-            this epoch's projection as operating chain configuration.  The
-            rest of the line is a summary of the projection.
-    
-        'CONFIRM epoch {N}'
-    
-            This message confirms that all of the servers listed in the
-            UPI and repairing lists of the projection at epoch {N} have
-            agreed to use this projection because they all have written
-            this projection to their respective private projection stores.
-            The chain is now usable by/available to all clients.
-    
-        'Sweet, private projections are stable'
-    
-            This report announces that this iteration of the test cycle
-            has passed successfully.  The report that follows briefly
-            summarizes the latest private projection used by each
-            participating server.  For example, when in strong consistency
-            mode with 'a' as a witness and 'b' and 'c' as real servers:
-    
-            %% Legend:
-            %% server name, epoch ID, UPI list, repairing list, down list, ...
-            %%                         ... witness list, 'false' (a constant value)
-    
-            [{a,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}},
-             {b,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}]
-    
-            Both servers 'a' and 'b' agree on epoch 1116 with epoch ID
-            {1116,<<23,143,246,55>>} where UPI=[a,b], repairing=[],
-            down=[c], and witnesses=[a].
-    
-            Server 'c' is not shown because 'c' has wedged itself OOS (out
-            of service) by configuring a chain length of zero.
-    
-            If no servers are listed in the report (i.e. only '[]' is
-            displayed), then all servers have wedged themselves OOS, and
-            the chain is unavailable.
-    
-        'DoIt,' 
-    
-            This marks a group of tick events which trigger the manager
-            processes to evaluate their environment and perhaps make a
-            state transition.
-    
-    A long chain of 'DoIt,DoIt,DoIt,' means that the chain state has
-    (probably) settled to a stable configuration, which is the goal of the
-    algorithm.
-    
-    Press control-c to interrupt the test....".
-
-## Run a test in eventual consistency mode
-
-Run the following command at the Erlang CLI prompt:
-
-    machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}]).
-
-The first argument, `3`, is the number of servers to participate in
-the chain.  Please note:
-
-* Chain lengths as short as 1 or 2 are valid, but the results are a
-  bit boring.
-* Chain lengths as long as 7 or 9 can be used, but they may
-  suffer from longer periods of churn/instability before all chain
-  managers reach agreement via humming consensus.  (It is future work
-  to shorten the worst of the unstable churn latencies.)
-* In eventual consistency mode, chain lengths may be even numbers,
-  e.g. 2, 4, or 6.
-* The simulator will choose partition events from the permutations of
-  all 1, 2, and 3 node partition pairs.  The total runtime will
-  increase *dramatically* with chain length.
-    * Chain length 2: about 3 partition cases
-    * Chain length 3: about 35 partition cases
-    * Chain length 4: about 230 partition cases
-    * Chain length 5: about 1100 partition cases
-
-## Run a test in strong consistency mode (with witnesses):
-
-*NOTE:* Due to a bug in the test code, please do not try to run the
- convergence test in strong consistency mode and also without the
- correct minority number of witness servers!  If in doubt, please run
- the commands shown below exactly.
-
-Run the following command at the Erlang CLI prompt:
-
-    machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]).
-
-The first argument, `3`, is the number of servers to participate in
-the chain.  Chain lengths as long as 7 or 9 can be used, but they may
-suffer from longer periods of churn/instability before all chain
-managers reach agreement via humming consensus.
-
-Due to the bug mentioned above, please use the following
-commands when running with chain lengths of 5 or 7, respectively.
-
-    machi_chain_manager1_converge_demo:t(5, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b]}]).
-    machi_chain_manager1_converge_demo:t(7, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b,c]}]).
-
--- a/priv/humming-consensus-demo.setup.sh
+++ b/priv/humming-consensus-demo.setup.sh
@ -0,0 +1,56 @@
+#!/bin/sh
+#
+# Prepare and start the three-instance Humming Consensus demo.
+#
+# The script uses the relative paths ./dev/dev1 .. ./dev/dev3, so it
+# expects to be run from the directory that contains ./dev.  It exits
+# with status 1 when one of its explicit checks fails, 0 otherwise.
+
+echo "Step: Verify that the required entries in /etc/hosts are present"
+for i in 1 2 3; do
+    # 'grep -E' replaces the obsolescent 'egrep'; the dots are escaped so
+    # that only the literal loopback address 127.0.0.1 matches.
+    grep machi$i /etc/hosts | grep -E -s '^127\.0\.0\.1' > /dev/null 2>&1
+    if [ $? -ne 0 ]; then
+        echo ""
+        echo "'grep -s machi$i' failed. Aborting, sorry."
+        exit 1
+    fi
+    ping -c 1 machi$i > /dev/null 2>&1
+    if [ $? -ne 0 ]; then
+        echo ""
+        echo "Ping attempt on host machi$i failed. Aborting."
+        echo ""
+        # Repeat the ping without redirection so the user sees the error.
+        ping -c 1 machi$i
+        exit 1
+    fi
+done
+
+echo "Step: add a verbose logging option to app.config"
+for i in 1 2 3; do
+    APPCONFIG=./dev/dev$i/etc/app.config
+    # Skip a file that already carries an active (uncommented)
+    # chain_manager_opts entry, so that re-running this script does not
+    # append duplicate settings.
+    if grep '^[[:space:]]*{chain_manager_opts' $APPCONFIG > /dev/null 2>&1; then
+        continue
+    fi
+    # Append the two settings after the first line that matches
+    # "verbose_confirm".
+    ed $APPCONFIG <<EOF > /dev/null 2>&1
+/verbose_confirm
+a
+{chain_manager_opts, [{private_write_verbose_confirm,true}]},
+{stability_time, 1},
+.
+w
+q
+EOF
+done
+
+echo "Step: start three Machi application instances"
+for i in 1 2 3; do
+    ./dev/dev$i/bin/machi start
+    ./dev/dev$i/bin/machi ping
+    if [ $? -ne 0 ]; then
+        echo "Sorry, a 'ping' check for instance dev$i failed. Aborting."
+        exit 1
+    fi
+done
+
+echo "Step: configure one chain to start a Humming Consensus group with three members"
+
+# Note: $CWD of each Machi proc is two levels below the source code root dir.
+LIFECYCLE000=../../priv/quick-admin-examples/demo-000
+for i in 3 2 1; do
+    ./dev/dev$i/bin/machi-admin quick-admin-apply $LIFECYCLE000 machi$i
+    if [ $? -ne 0 ]; then
+        echo "Sorry, 'machi-admin quick-admin-apply' failed on machi$i. Aborting."
+        exit 1
+    fi
+done
+
+exit 0
--- a/priv/humming-consensus-demo.vagrant/Vagrantfile
+++ b/priv/humming-consensus-demo.vagrant/Vagrantfile
@ -0,0 +1,93 @@
+# -*- mode: ruby -*-
+# vi: set ft=ruby :
+
+# All Vagrant configuration is done below. The "2" in Vagrant.configure
+# configures the configuration version (we support older styles for
+# backwards compatibility). Please don't change it unless you know what
+# you're doing.
+Vagrant.configure(2) do |config|
+  # The most common configuration options are documented and commented below.
+  # For a complete reference, please see the online documentation at
+  # https://docs.vagrantup.com.
+
+  # Every Vagrant development environment requires a box. You can search for
+  # boxes at https://atlas.hashicorp.com/search.
+  # If this Vagrant box has not been downloaded before (e.g. using "vagrant box add"),
+  # then Vagrant will automatically download the VM image from HashiCorp.
+  config.vm.box = "hashicorp/precise64"
+  # If using a FreeBSD box, Bash may not be installed.
+  # Use the config.ssh.shell setting to specify an alternate shell.
+  # Note, however, that any code in the 'config.vm.provision' section
+  # would then have to use this shell's syntax!
+  #   config.ssh.shell = "/bin/csh -l"
+
+  # Disable automatic box update checking. If you disable this, then
+  # boxes will only be checked for updates when the user runs
+  # `vagrant box outdated`. This is not recommended.
+  # config.vm.box_check_update = false
+
+  # Create a forwarded port mapping which allows access to a specific port
+  # within the machine from a port on the host machine. In the example below,
+  # accessing "localhost:8080" will access port 80 on the guest machine.
+  # config.vm.network "forwarded_port", guest: 80, host: 8080
+
+  # Create a private network, which allows host-only access to the machine
+  # using a specific IP.
+  # config.vm.network "private_network", ip: "192.168.33.10"
+
+  # Create a public network, which is generally matched to a bridged network.
+  # Bridged networks make the machine appear as another physical device on
+  # your network.
+  # config.vm.network "public_network"
+
+  # Share an additional folder to the guest VM. The first argument is
+  # the path on the host to the actual folder. The second argument is
+  # the path on the guest to mount the folder. And the optional third
+  # argument is a set of non-required options.
+  # config.vm.synced_folder "../data", "/vagrant_data"
+
+  # Provider-specific configuration so you can fine-tune various
+  # backing providers for Vagrant. These expose provider-specific options.
+  # Example for VirtualBox:
+  #
+  config.vm.provider "virtualbox" do |vb|
+    # Display the VirtualBox GUI when booting the machine
+    # vb.gui = true
+ 
+    # Customize the amount of memory on the VM:
+    vb.memory = "512"
+  end
+  #
+  # View the documentation for the provider you are using for more
+  # information on available options.
+
+  # Define a Vagrant Push strategy for pushing to Atlas. Other push strategies
+  # such as FTP and Heroku are also available. See the documentation at
+  # https://docs.vagrantup.com/v2/push/atlas.html for more information.
+  # config.push.define "atlas" do |push|
+  #   push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME"
+  # end
+
+  # Enable provisioning with a shell script. Additional provisioners such as
+  # Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the
+  # documentation for more information about their specific syntax and use.
+  config.vm.provision "shell", inline: <<-SHELL
+    # Install prerequisites
+    # Support here for FreeBSD is experimental
+    apt-get update ; sudo apt-get install -y git sudo rsync ; # Ubuntu Linux
+    env ASSUME_ALWAYS_YES=yes pkg install -f git sudo rsync ; # FreeBSD 10
+
+    # Install dependent packages, using slf-configurator
+    git clone https://github.com/slfritchie/slf-configurator.git
+    chown -R vagrant ./slf-configurator
+    (cd slf-configurator ; sudo sh -x ./ALL.sh)
+    echo 'export PATH=${PATH}:/usr/local/erlang/17.5/bin' >> ~vagrant/.bashrc
+    export PATH=${PATH}:/usr/local/erlang/17.5/bin
+    ## echo 'set path = ( $path  /usr/local/erlang/17.5/bin )' >> ~vagrant/.cshrc
+    ## setenv PATH /usr/local/erlang/17.5/bin:$PATH
+
+    git clone https://github.com/basho/machi.git
+    (cd machi ; git checkout master ; make && make test )
+    chown -R vagrant ./machi
+  SHELL
+end
--- a/priv/quick-admin-examples/demo-000
+++ b/priv/quick-admin-examples/demo-000
@ -0,0 +1,7 @@
+{host, "machi1", []}.
+{host, "machi2", []}.
+{host, "machi3", []}.
+{flu,f1,"machi1",20401,[]}.
+{flu,f2,"machi2",20402,[]}.
+{flu,f3,"machi3",20403,[]}.
+{chain,c1,[f1,f2,f3],[]}.
--- a/rel/files/app.config
+++ b/rel/files/app.config
@ -16,6 +16,10 @@
          %% Default = 10
          %% {metadata_manager_count, 2},

+          %% Default options for chain manager processes.
+          %% {chain_manager_opts, [{private_write_verbose,true},
+          %%                       {private_write_verbose_confirm,true}]},
+
          %% Platform vars (mirror of reltool packaging)
          {platform_data_dir, "{{platform_data_dir}}"},
          {platform_etc_dir,  "{{platform_etc_dir}}"},
--- a/rel/gen_dev
+++ b/rel/gen_dev
@ -0,0 +1,16 @@
+#! /bin/sh
+#
+# Example usage: gen_dev dev4 vars.src vars
+#
+# Generate an overlay config for devNNN from vars.src and write to vars
+#
+
+NAME=$1
+TEMPLATE=$2
+VARFILE=$3
+
+NODE="$NAME@127.0.0.1"
+
+echo "Generating $NAME - node='$NODE'"
+sed -e "s/@NODE@/$NODE/" \
+    < $TEMPLATE > $VARFILE
--- a/rel/reltool.config
+++ b/rel/reltool.config
@ -106,6 +106,7 @@
           {copy, "../priv/quick-admin-examples/000", "priv/quick-admin-examples"},
           {copy, "../priv/quick-admin-examples/001", "priv/quick-admin-examples"},
           {copy, "../priv/quick-admin-examples/002", "priv/quick-admin-examples"},
+           {copy, "../priv/quick-admin-examples/demo-000", "priv/quick-admin-examples/demo-000"},

           {mkdir, "lib/basho-patches"}
           %% {copy, "../apps/machi/ebin/etop_txt.beam", "lib/basho-patches"}
--- a/rel/vars.config
+++ b/rel/vars.config
@ -1,6 +1,9 @@
 %% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*-
 %% ex: ft=erlang ts=4 sw=4 et

+%% NOTE: When modifying this file, also keep its near cousin
+%%       config file rel/vars/dev_vars.config.src in sync!
+
 %% Platform-specific installation paths
 {platform_bin_dir,  "./bin"}.
 {platform_data_dir, "./data"}.
--- a/rel/vars/dev_vars.config.src
+++ b/rel/vars/dev_vars.config.src
@ -0,0 +1,48 @@
+%% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*-
+%% ex: ft=erlang ts=4 sw=4 et
+
+%% NOTE: When modifying this file, also keep its near cousin
+%%       config file rel/vars.config in sync!
+
+%% Platform-specific installation paths
+{platform_bin_dir,  "./bin"}.
+{platform_data_dir, "./data"}.
+{platform_etc_dir,  "./etc"}.
+{platform_lib_dir,  "./lib"}.
+{platform_log_dir,  "./log"}.
+
+%%
+%% etc/app.config
+%%
+{sasl_error_log,    "{{platform_log_dir}}/sasl-error.log"}.
+{sasl_log_dir,      "{{platform_log_dir}}/sasl"}.
+
+%% lager
+{console_log_default, file}.
+
+%%
+%% etc/vm.args
+%%
+{node,         "@NODE@"}.
+{crash_dump,   "{{platform_log_dir}}/erl_crash.dump"}.
+
+%%
+%% bin/machi
+%%
+{runner_script_dir,  "\`cd \\`dirname $0\\` 1>/dev/null && /bin/pwd\`"}.
+{runner_base_dir,    "{{runner_script_dir}}/.."}.
+{runner_etc_dir,     "$RUNNER_BASE_DIR/etc"}.
+{runner_log_dir,     "$RUNNER_BASE_DIR/log"}.
+{runner_lib_dir,     "$RUNNER_BASE_DIR/lib"}.
+{runner_patch_dir,   "$RUNNER_BASE_DIR/lib/basho-patches"}.
+{pipe_dir,           "/tmp/$RUNNER_BASE_DIR/"}.
+{runner_user,        ""}.
+{runner_wait_process, "machi_flu_sup"}.
+{runner_ulimit_warn, 65536}.
+
+%%
+%% cuttlefish
+%%
+{cuttlefish,         ""}. % blank = off
+{cuttlefish_conf,    "machi.conf"}.
+
--- a/src/machi.app.src
+++ b/src/machi.app.src
@ -1,7 +1,7 @@
 {application, machi, [
    {description, "A village of write-once files."},
-    {vsn, "0.0.0"},
-    {applications, [kernel, stdlib, crypto, cluster_info]},
+    {vsn, "0.0.1"},
+    {applications, [kernel, stdlib, crypto, cluster_info, ranch]},
    {mod,{machi_app,[]}},
    {registered, []},
    {env, [
--- a/src/machi_chain_manager1.erl
+++ b/src/machi_chain_manager1.erl
@ -92,8 +92,11 @@
 -define(REPAIR_START_STABILITY_TIME, 10).
 -endif. % TEST

-%% Magic constant for looping "too frequently" breaker.  TODO revisit & revise.
-define(TOO_FREQUENT_BREAKER, 10).
+%% Maximum length of the history of adopted projections (via C120).
+-define(MAX_HISTORY_LENGTH, 8).
+
+%% Magic constant for looping "too frequently" breaker.
+-define(TOO_FREQUENT_BREAKER, (?MAX_HISTORY_LENGTH+5)).

 -define(RETURN2(X), begin (catch put(why2, [?LINE|get(why2)])), X end).

@ -103,9 +106,6 @@
 %% Amount of epoch number skip-ahead for set_chain_members call
 -define(SET_CHAIN_MEMBERS_EPOCH_SKIP, 1111).

-%% Maximum length of the history of adopted projections (via C120).
-define(MAX_HISTORY_LENGTH, 30).
-
 %% API
 -export([start_link/2, start_link/3, stop/1, ping/1,
         set_chain_members/2, set_chain_members/6, set_active/2,
@ -234,11 +234,13 @@ test_read_latest_public_projection(Pid, ReadRepairP) ->
 %% manager's pid in MgrOpts and use direct gen_server calls to the
 %% local projection store.

-init({MyName, InitMembersDict, MgrOpts}) ->
+init({MyName, InitMembersDict, MgrOpts0}) ->
    put(ttt, [?LINE]),
    _ = random:seed(now()),
    init_remember_down_list(),
+    MgrOpts = MgrOpts0 ++ application:get_env(machi, chain_manager_opts, []),
    Opt = fun(Key, Default) -> proplists:get_value(Key, MgrOpts, Default) end,
+
    InitWitness_list = Opt(witnesses, []),
    ZeroAll_list = [P#p_srvr.name || {_,P} <- orddict:to_list(InitMembersDict)],
    ZeroProj = make_none_projection(0, MyName, ZeroAll_list,
@ -388,6 +390,7 @@ handle_cast(_Cast, S) ->
 handle_info(tick_check_environment, #ch_mgr{ignore_timer=true}=S) ->
    {noreply, S};
 handle_info(tick_check_environment, S) ->
+    gobble_ticks(),
    {{_Delta, Props, _Epoch}, S1} = do_react_to_env(S),
    S2 = sanitize_repair_state(S1),
    S3 = perhaps_start_repair(S2),
@ -460,7 +463,7 @@ get_my_proj_boot_info(MgrOpts, DefaultDict, DefaultProj, ProjType) ->
            {DefaultDict, DefaultProj};
        Store ->
            {ok, P} = machi_projection_store:read_latest_projection(Store,
-                                                                    ProjType),
+                                                                    ProjType, 7789),
            {P#projection_v1.members_dict, P}
    end.

@ -837,7 +840,10 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg,
                        D_foo=[{repair_done, {repair_final_status, ok, (S#ch_mgr.proj)#projection_v1.epoch_number}}],
                        {NewUPI_list ++ Repairing_list2, [], RunEnv2};
                   true ->
-                        D_foo=[d_foo2],
+                        D_foo=[d_foo2, {sim_p,Simulator_p},
+                               {simr_p,SimRepair_p}, {same_epoch,SameEpoch_p},
+                               {rel_to,RelativeToServer},
+                               {repch,RepChk_LastInUPI}, {repair_fs,RepairFS}],
                        {NewUPI_list, OldRepairing_list, RunEnv2}
                end;
            {_ABC, _XYZ} ->
@ -1974,7 +1980,7 @@ react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) ->
    %% In contrast to the public projection store writes, Humming Consensus
    %% doesn't care about the status of writes to the public store: it's
    %% always relying only on successful reads of the public store.
-    case {?FLU_PC:write_projection(MyStorePid, private, P_latest2,?TO*30),Goo} of
+    case {?FLU_PC:write_projection(MyStorePid, private, P_latest2,?TO*30+66),Goo} of
        {ok, Goo} ->
            ?REACT({c110, [{write, ok}]}),
            react_to_env_C111(P_latest, P_latest2, Extra1, MyStorePid, S);
@ -2060,7 +2066,6 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H,
    ?REACT(c120),
    H2   = add_and_trunc_history(P_latest, H, ?MAX_HISTORY_LENGTH),

-    %% diversion_c120_verbose_goop(P_latest, S),
    ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}),
    S2 = set_proj(S#ch_mgr{proj_history=H2,
                           sane_transitions=Xtns + 1}, P_latest),
@ -2068,20 +2073,21 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H,
             false ->
                 S2;
             {{_ConfEpoch, _ConfCSum}, ConfTime} ->
-                 io:format(user, "\nCONFIRM debug C120 ~w was annotated ~w\n", [S#ch_mgr.name, P_latest#projection_v1.epoch_number]),
+                 P_latestEpoch = P_latest#projection_v1.epoch_number,
+                 io:format(user, "\nCONFIRM debug C120 ~w was annotated ~w\n", [S#ch_mgr.name, P_latestEpoch]),
                 S2#ch_mgr{proj_unanimous=ConfTime}
         end,
    V = case file:read_file("/tmp/moomoo."++atom_to_list(S#ch_mgr.name)) of {ok,_} -> true; _ -> false end,
    if V -> io:format("C120: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end,
    {{now_using, FinalProps, P_latest#projection_v1.epoch_number}, S3}.

-add_and_trunc_history(P_latest, H, MaxLength) ->
+add_and_trunc_history(#projection_v1{epoch_number=0}, H, _MaxLength) ->
+    H;
+add_and_trunc_history(#projection_v1{} = P_latest, H, MaxLength) ->
    Latest_U_R = {P_latest#projection_v1.upi, P_latest#projection_v1.repairing},
-    H2 = if P_latest#projection_v1.epoch_number > 0 ->
-                 queue:in(Latest_U_R, H);
-            true ->
-                 H
-         end,
+    add_and_trunc_history(Latest_U_R, H, MaxLength);
+add_and_trunc_history(Item, H, MaxLength) ->
+    H2 = queue:in(Item, H),
    case queue:len(H2) of
        X when X > MaxLength ->
            {_V, Hxx} = queue:out(H2),
@ -2094,11 +2100,10 @@ react_to_env_C200(Retries, P_latest, S) ->
    ?REACT(c200),
    try
        AuthorProxyPid = proxy_pid(P_latest#projection_v1.author_server, S),
-        ?FLU_PC:kick_projection_reaction(AuthorProxyPid, [])
+        %% This is just advisory, we don't need a sync reply.
+        ?FLU_PC:kick_projection_reaction(AuthorProxyPid, [], 100)
    catch _Type:_Err ->
-            %% ?V("TODO: tell_author_yo is broken: ~p ~p\n",
-            %%           [_Type, _Err]),
-            ok
+        ok
    end,
    react_to_env_C210(Retries, S).

@ -2485,19 +2490,23 @@ poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current} = S) ->
                    ProjStore = get_projection_store_pid_or_regname(S),
                    #projection_v1{epoch_number=_EpochRep,
                                   epoch_csum= <<_CSumRep:4/binary,_/binary>>,
+                                   author_server=AuthRep,
                                   upi=_UPIRep,
                                   repairing=_RepairingRep} = NewProj,
                    ok = machi_projection_store:write(ProjStore, private, NewProj),
-                    case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of
+                    case proplists:get_value(private_write_verbose_confirm, S#ch_mgr.opts) of
                        true ->
-                            io:format(user, "\n~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), _EpochRep, _CSumRep, _UPIRep, _RepairingRep, MyName]);
+                            error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w rep ~w auth ~w by ~w\n", [_EpochRep, _CSumRep, _UPIRep, _RepairingRep, AuthRep, MyName]);
                        _ ->
                            ok
                    end,
                    %% Unwedge our FLU.
                    {ok, NotifyPid} = machi_projection_store:get_wedge_notify_pid(ProjStore),
                    _ = machi_flu1:update_wedge_state(NotifyPid, false, EpochID),
-                    S2#ch_mgr{proj_unanimous=Now};
+                    #ch_mgr{proj_history=H} = S2,
+                    H2 = add_and_trunc_history({confirm, Epoch}, H,
+                                               ?MAX_HISTORY_LENGTH),
+                    S2#ch_mgr{proj_unanimous=Now, proj_history=H2};
                _ ->
                    S2
            end;
@ -2537,6 +2546,14 @@ gobble_calls(StaticCall) ->
            ok
    end.

+%% Discard every tick_check_environment message that is already waiting
+%% in this process's mailbox.  The zero timeout means the call never
+%% blocks: it returns ok as soon as no such message is queued.
+gobble_ticks() ->
+    receive
+        tick_check_environment -> gobble_ticks()
+    after 0 -> ok
+    end.
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

 perhaps_start_repair(#ch_mgr{name=MyName,
@ -2552,12 +2569,13 @@ perhaps_start_repair(#ch_mgr{name=MyName,
            %% RepairOpts = [{repair_mode, check}, verbose],
            RepairFun = fun() -> do_repair(S, RepairOpts, CMode) end,
            LastUPI = lists:last(UPI),
+            StabilityTime = application:get_env(machi, stability_time, ?REPAIR_START_STABILITY_TIME),
            IgnoreStabilityTime_p = proplists:get_value(ignore_stability_time,
                                                        S#ch_mgr.opts, false),
            case timer:now_diff(os:timestamp(), Start) div 1000000 of
                N when MyName == LastUPI andalso
                       (IgnoreStabilityTime_p orelse
-                        N >= ?REPAIR_START_STABILITY_TIME) ->
+                        N >= StabilityTime) ->
                    {WorkerPid, _Ref} = spawn_monitor(RepairFun),
                    S#ch_mgr{repair_worker=WorkerPid,
                             repair_start=os:timestamp(),
@ -2966,34 +2984,33 @@ zerf_find_last_annotated(FLU, MajoritySize, S) ->
            []                                  % lists:flatten() will destroy
    end.

-perhaps_verbose_c111(P_latest2, S) ->
-    case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of
-        true ->
+perhaps_verbose_c111(P_latest2, #ch_mgr{name=MyName, opts=Opts}=S) ->
+    PrivWriteVerb = proplists:get_value(private_write_verbose, Opts, false),
+    PrivWriteVerbCONFIRM = proplists:get_value(private_write_verbose_confirm, Opts, false),
+    if PrivWriteVerb orelse PrivWriteVerbCONFIRM ->
            Dbg2X = lists:keydelete(react, 1,
                                    P_latest2#projection_v1.dbg2) ++
                [{is_annotated,is_annotated(P_latest2)}],
            P_latest2x = P_latest2#projection_v1{dbg2=Dbg2X}, % limit verbose len.
            Last2 = get(last_verbose),
            Summ2 = machi_projection:make_summary(P_latest2x),
-            if P_latest2#projection_v1.upi == [],
-               (S#ch_mgr.proj)#projection_v1.upi /= [] ->
-                    <<CSumRep:4/binary,_/binary>> =
-                                          P_latest2#projection_v1.epoch_csum,
-                    io:format(user, "~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), (S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, S#ch_mgr.name]);
+            if PrivWriteVerb, Summ2 /= Last2 ->
+                    put(last_verbose, Summ2),
+                    error_logger:info_msg("~p uses plain: ~w \n",
+                       [MyName, Summ2]);
               true ->
                    ok
            end,
-            case proplists:get_value(private_write_verbose,
-                                     S#ch_mgr.opts) of
-            %% case true of
-                true when Summ2 /= Last2 ->
-                    put(last_verbose, Summ2),
-                    ?V("\n~s ~p uses plain: ~w \n",
-                       [machi_util:pretty_time(), S#ch_mgr.name, Summ2]);
-                _ ->
+            if PrivWriteVerbCONFIRM,
+               P_latest2#projection_v1.upi == [],
+               (S#ch_mgr.proj)#projection_v1.upi /= [] ->
+                    <<CSumRep:4/binary,_/binary>> =
+                                          P_latest2#projection_v1.epoch_csum,
+                    error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w rep ~w auth ~w by ~w\n", [(S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, P_latest2#projection_v1.author_server, S#ch_mgr.name]);
+               true ->
                    ok
            end;
-        _ ->
+       true ->
            ok
    end.

--- a/src/machi_dt.erl
+++ b/src/machi_dt.erl
@ -32,6 +32,12 @@
 -type chunk_summary() :: {file_offset(), chunk_size(), chunk_bin(), chunk_cstrm()}.
 -type chunk_pos()   :: {file_offset(), chunk_size(), file_name_s()}.
 -type chunk_size()  :: non_neg_integer().
+
+%% Tags that stand for how that checksum was generated. See
+%% machi_util:make_tagged_csum/{1,2} for further documentation and
+%% implementation.
+-type csum_tag()    :: none | client_sha | server_sha | server_regen_sha.
+
 -type error_general() :: 'bad_arg' | 'wedged' | 'bad_checksum'.
 -type epoch_csum()  :: binary().
 -type epoch_num()   :: -1 | non_neg_integer().
@ -53,11 +59,6 @@
 -type read_opts()   :: #read_opts{}.
 -type read_opts_x() :: 'undefined' | 'noopt' | 'none' | #read_opts{}.

-%% Tags that stand for how that checksum was generated. See
-%% machi_util:make_tagged_csum/{1,2} for further documentation and
-%% implementation.
-type csum_tag()    :: none | client_sha | server_sha | server_regen_sha.
-
 -export_type([
              append_opts/0,
              chunk/0,
@ -68,6 +69,7 @@
              chunk_summary/0,
              chunk_pos/0,
              chunk_size/0,
+              csum_tag/0,
              error_general/0,
              epoch_csum/0,
              epoch_num/0,
--- a/src/machi_fitness.erl
+++ b/src/machi_fitness.erl
@ -108,6 +108,7 @@ handle_call({update_local_down_list, Down, MembersDict}, _From,
            #state{my_flu_name=MyFluName, pending_map=OldMap,
                   local_down=OldDown, members_dict=OldMembersDict,
                   admin_down=AdminDown}=S) ->
+    verbose("FITNESS: ~w has down suspect ~w\n", [MyFluName, Down]),
    NewMap = store_in_map(OldMap, MyFluName, erlang:now(), Down,
                          AdminDown, [props_yo]),
    S2 = if Down == OldDown, MembersDict == OldMembersDict ->
@ -119,13 +120,17 @@ handle_call({update_local_down_list, Down, MembersDict}, _From,
         end,
    {reply, ok, S2#state{local_down=Down}};
 handle_call({add_admin_down, DownFLU, DownProps}, _From,
-            #state{local_down=OldDown, admin_down=AdminDown}=S) ->
+            #state{my_flu_name=MyFluName,
+                   local_down=OldDown, admin_down=AdminDown}=S) ->
+    verbose("FITNESS: ~w add admin down ~w\n", [MyFluName, DownFLU]),
    NewAdminDown = [{DownFLU,DownProps}|lists:keydelete(DownFLU, 1, AdminDown)],
    S3 = finish_admin_down(erlang:now(), OldDown, NewAdminDown,
                           [props_yo], S),
    {reply, ok, S3};
 handle_call({delete_admin_down, DownFLU}, _From,
-            #state{local_down=OldDown, admin_down=AdminDown}=S) ->
+            #state{my_flu_name=MyFluName,
+                   local_down=OldDown, admin_down=AdminDown}=S) ->
+    verbose("FITNESS: ~w delete admin down ~w\n", [MyFluName, DownFLU]),
    NewAdminDown = lists:keydelete(DownFLU, 1, AdminDown),
    S3 = finish_admin_down(erlang:now(), OldDown, NewAdminDown,
                           [props_yo], S),
@ -143,7 +148,8 @@ handle_call(_Request, _From, S) ->
 handle_cast(_Msg, S) ->
    {noreply, S}.

-handle_info({adjust_down_list, FLU}, #state{active_unfit=ActiveUnfit}=S) ->
+handle_info({adjust_down_list, FLU}, #state{my_flu_name=MyFluName,
+                                            active_unfit=ActiveUnfit}=S) ->
    NewUnfit = make_unfit_list(S),
    Added_to_new     = NewUnfit -- ActiveUnfit,
    Dropped_from_new = ActiveUnfit -- NewUnfit,
@ -184,9 +190,11 @@ handle_info({adjust_down_list, FLU}, #state{active_unfit=ActiveUnfit}=S) ->
        {true, true} ->
            error({bad, ?MODULE, ?LINE, FLU, ActiveUnfit, NewUnfit});
        {true, false} ->
-            {noreply, S#state{active_unfit=lists:usort(ActiveUnfit ++ [FLU])}};
+            NewActive = wrap_active(MyFluName,lists:usort(ActiveUnfit++[FLU])),
+            {noreply, S#state{active_unfit=NewActive}};
        {false, true} ->
-            {noreply, S#state{active_unfit=ActiveUnfit -- [FLU]}};
+            NewActive = wrap_active(MyFluName,ActiveUnfit--[FLU]),
+            {noreply, S#state{active_unfit=NewActive}};
        {false, false} ->
            {noreply, S}
    end;
@ -424,6 +432,18 @@ map_value(Map) ->
 map_merge(Map1, Map2) ->
    ?MAP:merge(Map1, Map2).

+%% Report the new down list on behalf of MyFluName through verbose/2,
+%% then hand the list back unchanged so that the call can wrap the
+%% expression that computes it.
+wrap_active(MyFluName, DownList) ->
+    _ = verbose("FITNESS: ~w has new down list ~w\n", [MyFluName, DownList]),
+    DownList.
+
+%% Log Fmt/Args through error_logger:info_msg/2, but only when the machi
+%% application environment variable fitness_verbose is exactly true;
+%% otherwise do nothing and return ok.
+verbose(Fmt, Args) ->
+    Enabled = application:get_env(machi, fitness_verbose),
+    if Enabled =:= {ok, true} ->
+            error_logger:info_msg(Fmt, Args);
+       true ->
+            ok
+    end.
+
 -ifdef(TEST).

 dt_understanding_test() ->
--- a/src/machi_sup.erl
+++ b/src/machi_sup.erl
@ -65,5 +65,11 @@ init([]) ->
    LifecycleMgr =
        {machi_lifecycle_mgr, {machi_lifecycle_mgr, start_link, []},
         Restart, Shutdown, worker, []},
-
-    {ok, {SupFlags, [ServerSup, RanchSup, LifecycleMgr]}}.
+    RunningApps = [A || {A,_D,_V} <- application:which_applications()],
+    Specs = case lists:member(ranch, RunningApps) of
+                true ->
+                    [ServerSup, LifecycleMgr];
+                false ->
+                    [ServerSup, RanchSup, LifecycleMgr]
+            end,
+    {ok, {SupFlags, Specs}}.
--- a/test/machi_chain_manager1_converge_demo.erl
+++ b/test/machi_chain_manager1_converge_demo.erl
@ -134,6 +134,7 @@ Press control-c to interrupt the test....".
 %%     convergence_demo_testfun(3).

 -define(DEFAULT_MGR_OPTS, [{private_write_verbose, false},
+                           {private_write_verbose_confirm, true},
                           {active_mode,false},
                           {use_partition_simulator, true}]).

@ -150,7 +151,8 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) ->
    %% Faster test startup, commented: io:format(user, short_doc(), []),
    %% Faster test startup, commented: timer:sleep(3000),

-    application:start(sasl),
+    Apps = [sasl, ranch],
+    [application:start(App) || App <- Apps],

    MgrOpts = MgrOpts0 ++ ?DEFAULT_MGR_OPTS,
    TcpPort = proplists:get_value(port_base, MgrOpts, 62877),
@ -393,7 +395,8 @@ timer:sleep(1234),
        exit(SupPid, normal),
        ok = machi_partition_simulator:stop(),
        [ok = ?FLU_PC:quit(PPid) || {_, PPid} <- Namez],
-        machi_util:wait_for_death(SupPid, 100)
+        machi_util:wait_for_death(SupPid, 100),
+        [application:start(App) || App <- lists:reverse(Apps)]
    end.

 %% Many of the static partition lists below have been problematic at one
--- a/test/machi_file_proxy_test.erl
+++ b/test/machi_file_proxy_test.erl
@ -38,7 +38,7 @@ clean_up_data_dir(DataDir) ->
 -ifndef(PULSE).

 -define(TESTDIR, "./t").
-define(HYOOGE, 1 * 1024 * 1024 * 1024). % 1 long GB
+-define(HYOOGE, 75 * 1024 * 1024). % 75 MBytes

 random_binary_single() ->
    %% OK, I guess it's not that random...