From 69244691f457049959cc2a1be46b9506ae1acf08 Mon Sep 17 00:00:00 2001
From: Scott Lystig Fritchie <slfritchie@snookles.com>
Date: Wed, 20 May 2015 11:11:54 +0900
Subject: [PATCH] Such wonder when one *reads* the docs...

---
 src/machi_app.erl               |   5 +-
 src/machi_chain_manager1.erl    |   2 +-
 src/machi_chain_repair.erl      | 114 ++++++++++++++++----------------
 src/machi_cr_client.erl         |  74 +++++++++++++++++++--
 src/machi_flu1.erl              |  17 ++++-
 src/machi_flu1_client.erl       |  28 ++++++++
 src/machi_flu_psup.erl          |  36 ++++++++++
 src/machi_flu_sup.erl           |   3 +
 src/machi_projection_store.erl  |   6 +-
 src/machi_proxy_flu1_client.erl |   5 +-
 src/machi_sup.erl               |   3 +
 11 files changed, 221 insertions(+), 72 deletions(-)

diff --git a/src/machi_app.erl b/src/machi_app.erl
index 2701f60..d23718e 100644
--- a/src/machi_app.erl
+++ b/src/machi_app.erl
@@ -18,7 +18,10 @@
 %%
 %% -------------------------------------------------------------------
 
-%% @doc Top-level supervisor for the Machi application.
+%% @doc Start the top-level supervisor for the Machi application.
+%%
+%% See {@link machi_flu_psup} for an illustration of the entire Machi
+%% application process structure.
 
 -module(machi_app).
 
diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl
index 6e48978..572bd8f 100644
--- a/src/machi_chain_manager1.erl
+++ b/src/machi_chain_manager1.erl
@@ -722,7 +722,7 @@ rank_and_sort_projections(Ps, CurrentProj) ->
 %%          E+5: author=b, upi=[a,b], repairing=[c,d] (**)
 %%          E+6: author=c, upi=[a,b], repairing=[c,d] (**)
 %%          E+7: author=d, upi=[a,b], repairing=[c,d] (**)
-%%          E+... 6 more (**) epochs when c & d finish their respective repairs.
+%%          E+... 6 more (**) epochs when c &amp; d finish their repairs.
 %%       Ideally, the "(**)" epochs are avoidable churn.
 %%       Perhaps this means that we should change the responsibility
 %%       for repair management to the highest ranking member of the
diff --git a/src/machi_chain_repair.erl b/src/machi_chain_repair.erl
index b890d80..fbb36ff 100644
--- a/src/machi_chain_repair.erl
+++ b/src/machi_chain_repair.erl
@@ -18,22 +18,65 @@
 %%
 %% -------------------------------------------------------------------
 
-%% @doc Erlang API for the Machi FLU TCP protocol version 1, with a
-%% proxy-process style API for hiding messy details such as TCP
-%% connection/disconnection with the remote Machi server.
+%% @doc Perform "chain repair", i.e., resynchronization of Machi file
+%% contents and metadata as servers are (re-)added to the chain.
 %%
-%% Machi is intentionally avoiding using distributed Erlang for
-%% Machi's communication.  This design decision makes Erlang-side code
-%% more difficult &amp; complex, but it's the price to pay for some
-%% language independence.  Later in Machi's life cycle, we need to
-%% (re-)implement some components in a non-Erlang/BEAM-based language.
+%% The implementation here is a very basic one, and is probably a bit
+%% slower than the original "demo day" implementation at
+%% [https://github.com/basho/machi/blob/master/prototype/demo-day-hack/file0_repair_server.escript]
 %%
-%% This module implements a "man in the middle" proxy between the
-%% Erlang client and Machi server (which is on the "far side" of a TCP
-%% connection to somewhere).  This proxy process will always execute
-%% on the same Erlang node as the Erlang client that uses it.  The
-%% proxy is intended to be a stable, long-lived process that survives
-%% TCP communication problems with the remote server.
+%% It's so easy to bikeshed this into a 1 year programming exercise.
+%%
+%% General TODO note: There are a lot of areas for exploiting parallelism here.
+%% I've set the bikeshed aside for now, but "make repair faster" has a
+%% lot of room for exploiting concurrency, overlapping reads &amp; writes,
+%% etc etc.  There are also lots of different trade-offs to make with
+%% regard to RAM use vs. disk use.
+%%
+%% There's no reason why repair can't be done:
+%%
+%% <ol>
+%% <li> Repair in parallel across multiple repairees ... Optimization.
+%% </li>
+%% <li> Repair multiple byte ranges concurrently ... Optimization.
+%% </li>
+%% <li> Use bigger chunks than the client originally used to write the file
+%%    ... Optimization ... but it would be the easiest to implement, e.g. use
+%%    constant-sized 4MB chunks.  Unfortunately, it would also destroy
+%%    the ability to verify here that the chunk checksums are correct
+%%    *and* also propagate the correct checksum metadata to the
+%%    destination FLU.
+%%
+%%    As an additional optimization, add a bit of #2 to start the next
+%%    read while the current write is still in progress.
+%% </li>
+%% <li> The current method centralizes the "smarts" required to compare
+%%    checksum differences ... move some computation to each FLU, then use
+%%    a Merkle- or other-compression-style scheme to reduce the amount of
+%%    data sent across a network.
+%% </li>
+%% </ol>
+%%
+%% Most/all of this could be executed in parallel on each FLU relative to
+%% its own files.  Then, in another TODO option, perhaps build a Merkle tree
+%% or other summary of the local files and send that data structure to the
+%% repair coordinator.
+%%
+%% Also, as another TODO note, repair_both_present() in the
+%% prototype/demo-day code uses an optimization of calculating the MD5
+%% checksum of the chunk checksum data as it arrives, and if the two MD5s
+%% match, then we consider the two files in sync.  If there isn't a match,
+%% then we sort the lines and try another MD5, and if they match, then we're
+%% in sync.  In theory, that's lower overhead than the procedure used here.
+%%
+%% NOTE that one reason I chose the "directives list" method is to have an
+%% option, later, of choosing to repair a subset of repairee FLUs if there
+%% is a big discrepancy between out of sync files: e.g., if FLU x has N
+%% bytes out of sync but FLU y has 50N bytes out of sync, then it's likely
+%% better to repair x only so that x can return to the UPI list quickly.
+%% Also, in the event that all repairees are roughly comparably out of sync,
+%% then the repair network traffic can be minimized by reading each chunk
+%% only once.
 
 -module(machi_chain_repair).
 
@@ -139,49 +182,6 @@ get_file_lists(Proxy, FLU_name, D) ->
                            dict:append(File, {FLU_name, Size}, Dict)
                 end, D, Res).
 
-%% Wow, it's so easy to bikeshed this into a 1 year programming exercise.
-%%
-%% TODO: There are a lot of areas for exploiting parallelism here.
-%% I've set the bikeshed aside for now, but "make repair faster" has a
-%% lot of room for exploiting concurrency, overlapping reads & writes,
-%% etc etc.  There are also lots of different trade-offs to make with
-%% regard to RAM use vs. disk use.
-%%
-%% TODO: There's no reason why repair can't be done 1).in parallel
-%% across multiple repairees, and/or 2). with multiple byte ranges in
-%% the same file, and/or 3). with bigger chunks.
-%%
-%% 1. Optimization
-%% 2. Optimization
-%% 3. Optimization, but it would be the easiest to implement, e.g. use
-%%    constant-sized 4MB chunks.  Unfortuntely, it would also destroy
-%%    the ability to verify here that the chunk checksums are correct
-%%    *and* also propagate the correct checksum metadata to the
-%%    destination FLU.
-%%    As an additional optimization, add a bit of #2 to start the next
-%%    read while the current write is still in progress.
-%%
-%% Most/all of this could be executed in parallel on each FLU relative to
-%% its own files.  Then, in another TODO option, perhaps build a Merkle tree
-%% or other summary of the local files & send that data structure to the
-%% repair coordinator.
-%%
-%% Also, as another TODO note, repair_both_present() in the
-%% prototype/demo-day code uses an optimization of calculating the MD5
-%% checksum of the chunk checksum data as it arrives, and if the two MD5s
-%% match, then we consider the two files in sync.  If there isn't a match,
-%% then we sort the lines and try another MD5, and if they match, then we're
-%% in sync.  In theory, that's lower overhead than the procedure used here.
-%%
-%% NOTE that one reason I chose the "directives list" method is to have an
-%% option, later, of choosing to repair a subset of repairee FLUs if there
-%% is a big discrepency between out of sync files: e.g., if FLU x has N
-%% bytes out of sync but FLU y has 50N bytes out of sync, then it's likely
-%% better to repair x only so that x can return to the UPI list quickly.
-%% Also, in the event that all repairees are roughly comparably out of sync,
-%% then the repair network traffic can be minimized by reading each chunk
-%% only once.
-
 make_repair_compare_fun(SrcFLU) ->
     fun({{Offset_X, _Sz_a, _Cs_a, FLU_a}, _N_a},
         {{Offset_X, _Sz_b, _CS_b, FLU_b}, _N_b}) ->
diff --git a/src/machi_cr_client.erl b/src/machi_cr_client.erl
index e6c145d..a0e0324 100644
--- a/src/machi_cr_client.erl
+++ b/src/machi_cr_client.erl
@@ -21,6 +21,65 @@
 %% @doc Erlang API for the Machi client-implemented Chain Replication
 %% (CORFU-style) protocol.
 %%
+%% See also the docs for {@link machi_flu1_client} for additional
+%% details on data types and operation descriptions.
+%%
+%% The API here is much simpler than the {@link machi_flu1_client} or
+%% {@link machi_proxy_flu1_client} APIs.  This module's API is a
+%% proposed simple-but-complete form for clients who are not
+%% interested in being an active participant in a Machi cluster and in
+%% taking on the responsibility for Machi internals, i.e., client-side
+%% Chain Replication, client-side read repair, client-side tracking of
+%% internal Machi epoch &amp; projection changes, etc.
+%%
+%% This client is implemented as a long-lived Erlang process using
+%% `gen_server'-style OTP code practice.  A naive client can expect
+%% that this process will manage all transient TCP session
+%% disconnections and Machi chain reconfigurations.  This client's
+%% efforts are best-effort and can require some time to retry
+%% operations in certain failure cases, i.e., up to several seconds
+%% during a Machi projection &amp; epoch change when a new server is
+%% added to the chain.
+%%
+%% Doc TODO: Once this API stabilizes, add all relevant data type details
+%% to the EDoc here.
+%%
+%%
+%% === Missing API features ===
+%%
+%% So far, there is one missing client API feature that ought to be
+%% added to Machi in the near future: more flexible checksum
+%% management.
+%%
+%% Add a `source' annotation to all checksums to indicate where the
+%% checksum was calculated.  For example,
+%%
+%% <ul>
+%%
+%% <li> Calculated by client that performed the original chunk append,
+%% </li>
+%%
+%% <li> Calculated by the 1st Machi server to receive an
+%%      un-checksummed append request
+%% </li>
+%%
+%% <li> Re-calculated by Machi to manage fewer checksums of blocks of
+%%      data larger than the original client-specified chunks.
+%% </li>
+%% </ul>
+%%
+%% Client-side checksums would be the "strongest" type of
+%% checksum, meaning that any data corruption (of the original
+%% data and/or of the checksum itself) can be detected after the
+%% client-side calculation.  There are too many horror stories on
+%% The Net about IP PDUs that are corrupted but unnoticed due to
+%% weak TCP checksums, buggy hardware, buggy OS drivers, etc.
+%% Checksum versioning is also desirable if/when the current checksum
+%% implementation changes from SHA-1 to something else.
+%%
+%%
+%% === Implementation notes ===
+%%
 %% The major operation processing is implemented in a state machine-like
 %% manner.  Before attempting an operation `X', there's an initial
 %% operation `pre-X' that takes care of updating the epoch id,
@@ -74,6 +133,7 @@
 
 -define(FLU_PC, machi_proxy_flu1_client).
 -define(TIMEOUT, 2*1000).
+-define(DEFAULT_TIMEOUT, 10*1000).
 -define(MAX_RUNTIME, 8*1000).
 
 -record(state, {
@@ -95,7 +155,7 @@ start_link(P_srvr_list) ->
 %% with `Prefix'.
 
 append_chunk(PidSpec, Prefix, Chunk) ->
-    append_chunk(PidSpec, Prefix, Chunk, infinity).
+    append_chunk(PidSpec, Prefix, Chunk, ?DEFAULT_TIMEOUT).
 
 %% @doc Append a chunk (binary- or iolist-style) of data to a file
 %% with `Prefix'.
@@ -108,7 +168,7 @@ append_chunk(PidSpec, Prefix, Chunk, Timeout) ->
 
 append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra)
   when is_integer(ChunkExtra), ChunkExtra >= 0 ->
-    append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra, infinity).
+    append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra, ?DEFAULT_TIMEOUT).
 
 %% @doc Append a chunk (binary- or iolist-style) of data to a file
 %% with `Prefix'.
@@ -118,10 +178,10 @@ append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra, Timeout) ->
                                     Chunk, ChunkExtra}},
                     Timeout).
 
-%% %% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
+%% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
 
 read_chunk(PidSpec, File, Offset, Size) ->
-    read_chunk(PidSpec, File, Offset, Size, infinity).
+    read_chunk(PidSpec, File, Offset, Size, ?DEFAULT_TIMEOUT).
 
 %% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
 
@@ -132,7 +192,7 @@ read_chunk(PidSpec, File, Offset, Size, Timeout) ->
 %% @doc Fetch the list of chunk checksums for `File'.
 
 checksum_list(PidSpec, File) ->
-    checksum_list(PidSpec, File, infinity).
+    checksum_list(PidSpec, File, ?DEFAULT_TIMEOUT).
 
 %% @doc Fetch the list of chunk checksums for `File'.
 
@@ -143,7 +203,7 @@ checksum_list(PidSpec, File, Timeout) ->
 %% @doc Fetch the list of all files on the remote FLU.
 
 list_files(PidSpec) ->
-    list_files(PidSpec, infinity).
+    list_files(PidSpec, ?DEFAULT_TIMEOUT).
 
 %% @doc Fetch the list of all files on the remote FLU.
 
@@ -155,7 +215,7 @@ list_files(PidSpec, Timeout) ->
 %% proxy process.
 
 quit(PidSpec) ->
-    gen_server:call(PidSpec, quit, infinity).
+    gen_server:call(PidSpec, quit, ?DEFAULT_TIMEOUT).
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%
 
diff --git a/src/machi_flu1.erl b/src/machi_flu1.erl
index 0106c37..09c4982 100644
--- a/src/machi_flu1.erl
+++ b/src/machi_flu1.erl
@@ -26,7 +26,18 @@
 %% primitive file server process vs. the larger Machi design of a FLU
 %% as a sequencer + file server + chain manager group of processes.
 %%
-%% For the moment, this module also implements a rudimentary TCP-based
+%% The FLU is named after the CORFU server "FLU" or "FLash Unit" server.
+%%
+%% === Protocol origins ===
+%%
+%% The protocol implemented here is an artisanal, hand-crafted, silly
+%% thing that was very quick to put together for a "demo day" proof of
+%% concept.  It will almost certainly be replaced with something else,
+%% both in terms of wire format and better code separation of
+%% serialization/deserialization vs. network transport management,
+%% etc.
+%%
+%% For the moment, this module implements a rudimentary TCP-based
 %% protocol as the sole supported access method to the server,
 %% sequencer, and projection store.  Conceptually, those three
 %% services are independent and ought to have their own protocols.  As
@@ -35,7 +46,7 @@
 %% detection, it is very convenient that all three FLU-related
 %% services are accessed using the same single TCP port.
 %%
-%% The FLU is named after the CORFU server "FLU" or "FLash Unit" server.
+%% === TODO items ===
 %%
 %% TODO There is a major missing feature in this FLU implementation:
 %% there is no "write-once" enforcement for any position in a Machi
@@ -53,7 +64,7 @@
 %% replication/chain repair.
 %%
 %% TODO Section 4.2 ("The Sequencer") says that the sequencer must
-%% change its file assignments to new & unique names whenever we move
+%% change its file assignments to new &amp; unique names whenever we move
 %% to wedge state.  This is not yet implemented.  In the current
 %% Erlang process scheme (which will probably be changing soon), a
 %% simple implementation would stop all existing processes that are
diff --git a/src/machi_flu1_client.erl b/src/machi_flu1_client.erl
index 14f18a0..d1a0653 100644
--- a/src/machi_flu1_client.erl
+++ b/src/machi_flu1_client.erl
@@ -19,6 +19,34 @@
 %% -------------------------------------------------------------------
 
 %% @doc Erlang API for the Machi FLU TCP protocol version 1.
+%%
+%% This client API handles low-level PDU serialization/deserialization
+%% and low-level TCP session management, e.g. open, receive, write,
+%% close.  The API for higher-level session management and Machi state
+%% management can be found in {@link machi_proxy_flu1_client} and
+%% {@link machi_cr_client}.
+%%
+%% TODO This EDoc was written first, and the EDoc and also `-type' and
+%% `-spec' definitions for {@link machi_proxy_flu1_client} and {@link
+%% machi_cr_client} must be improved.
+%%
+%% === Protocol origins ===
+%%
+%% The protocol implemented here is an artisanal, hand-crafted, silly
+%% thing that was very quick to put together for a "demo day" proof of
+%% concept.  It will almost certainly be replaced with something else,
+%% both in terms of wire format and better code separation of
+%% serialization/deserialization vs. network transport management,
+%% etc.
+%%
+%% For the moment, this module implements a rudimentary TCP-based
+%% protocol as the sole supported access method to the server,
+%% sequencer, and projection store.  Conceptually, those three
+%% services are independent and ought to have their own protocols.  As
+%% a practical matter, there is no need for wire protocol
+%% compatibility.  Furthermore, from the perspective of failure
+%% detection, it is very convenient that all three FLU-related
+%% services are accessed using the same single TCP port.
 
 -module(machi_flu1_client).
 
diff --git a/src/machi_flu_psup.erl b/src/machi_flu_psup.erl
index a4fe0ad..f7bbf87 100644
--- a/src/machi_flu_psup.erl
+++ b/src/machi_flu_psup.erl
@@ -20,6 +20,42 @@
 
 %% @doc Supervisor for Machi FLU servers and their related support
 %% servers.
+%%
+%% Our parent supervisor, {@link machi_flu_sup}, is responsible for
+%% managing FLUs as a single entity.  However, the actual
+%% implementation of a FLU includes three major Erlang processes (not
+%% including support/worker procs): the FLU itself, the FLU's
+%% projection store, and the FLU's local chain manager.  This
+%% supervisor is responsible for managing those three major services
+%% as a single "package", to be started &amp; stopped together.
+%%
+%% The illustration below shows the OTP process supervision tree for
+%% the Machi application.  Two FLUs are running, called `a' and `b'.
+%% The chain is configured for a third FLU, `c', which is not running
+%% at this time.
+%%
+%% <img src="/machi/{@docRoot}/images/supervisor-2flus.png"></img>
+%%
+%% <ul>
+%% <li> The FLU process itself is named `a'.
+%% </li>
+%% <li> The projection store process is named `a_pstore'.
+%% </li>
+%% <li> The chain manager process is named `a_chmgr'.  The three
+%%      linked subprocesses are long-lived {@link
+%%      machi_proxy_flu1_client} processes for communicating to all
+%%      chain participants' projection stores (including the local
+%%      store `a_pstore').
+%% </li>
+%% <li> A fourth major process, `a_listener', is responsible for
+%%      listening on a TCP socket and creating new connections.
+%%      Currently, each listener has two processes handling incoming
+%%      requests, one from each chain manager proxy.
+%% </li>
+%% <li> Note that the sub-supervisor parent of `a' and `a_listener' does
+%%      not have a registered name.
+%% </li>
+%% </ul>
 
 -module(machi_flu_psup).
 
diff --git a/src/machi_flu_sup.erl b/src/machi_flu_sup.erl
index 51efd87..5082b55 100644
--- a/src/machi_flu_sup.erl
+++ b/src/machi_flu_sup.erl
@@ -20,6 +20,9 @@
 
 %% @doc Supervisor for Machi FLU servers and their related support
 %% servers.
+%%
+%% See {@link machi_flu_psup} for an illustration of the entire Machi
+%% application process structure.
 
 -module(machi_flu_sup).
 
diff --git a/src/machi_projection_store.erl b/src/machi_projection_store.erl
index 8818588..4eed3ed 100644
--- a/src/machi_projection_store.erl
+++ b/src/machi_projection_store.erl
@@ -22,11 +22,13 @@
 %%
 %% This API is gen_server-style message passing, intended for use
 %% within a single Erlang node to glue together the projection store
-%% server with the node-local process that implements Machi's TCP
+%% server with the node-local process that implements Machi's FLU
 %% client access protocol (on the "server side" of the TCP connection).
 %%
 %% All Machi client access to the projection store SHOULD NOT use this
-%% module's API.
+%% module's API.  Instead, clients should access it indirectly via {@link
+%% machi_cr_client}, {@link machi_proxy_flu1_client}, or {@link
+%% machi_flu1_client}.
 %%
 %% The projection store is implemented by an Erlang/OTP `gen_server'
 %% process that is associated with each FLU.  Conceptually, the
diff --git a/src/machi_proxy_flu1_client.erl b/src/machi_proxy_flu1_client.erl
index 3d0100e..1fc5af5 100644
--- a/src/machi_proxy_flu1_client.erl
+++ b/src/machi_proxy_flu1_client.erl
@@ -25,7 +25,7 @@
 %% Machi is intentionally avoiding using distributed Erlang for
 %% Machi's communication.  This design decision makes Erlang-side code
 %% more difficult &amp; complex, but it's the price to pay for some
-%% language independence.  Later in Machi's life cycle, we need to
+%% language independence.  Later in Machi's life cycle, we may (?) need to
 %% (re-)implement some components in a non-Erlang/BEAM-based language.
 %%
 %% This module implements a "man in the middle" proxy between the
@@ -34,6 +34,9 @@
 %% on the same Erlang node as the Erlang client that uses it.  The
 %% proxy is intended to be a stable, long-lived process that survives
 %% TCP communication problems with the remote server.
+%%
+%% For a higher level interface, see {@link machi_cr_client}.
+%% For a lower level interface, see {@link machi_flu1_client}.
 
 -module(machi_proxy_flu1_client).
 
diff --git a/src/machi_sup.erl b/src/machi_sup.erl
index 31fcc9b..5ffe918 100644
--- a/src/machi_sup.erl
+++ b/src/machi_sup.erl
@@ -19,6 +19,9 @@
 %% -------------------------------------------------------------------
 
 %% @doc Top Machi application supervisor.
+%%
+%% See {@link machi_flu_psup} for an illustration of the entire Machi
+%% application process structure.
 
 -module(machi_sup).