From 69244691f457049959cc2a1be46b9506ae1acf08 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Wed, 20 May 2015 11:11:54 +0900 Subject: [PATCH] Such wonder when one *reads* the docs... --- src/machi_app.erl | 5 +- src/machi_chain_manager1.erl | 2 +- src/machi_chain_repair.erl | 114 ++++++++++++++++---------------- src/machi_cr_client.erl | 74 +++++++++++++++++++-- src/machi_flu1.erl | 17 ++++- src/machi_flu1_client.erl | 28 ++++++++ src/machi_flu_psup.erl | 36 ++++++++++ src/machi_flu_sup.erl | 3 + src/machi_projection_store.erl | 6 +- src/machi_proxy_flu1_client.erl | 5 +- src/machi_sup.erl | 3 + 11 files changed, 221 insertions(+), 72 deletions(-) diff --git a/src/machi_app.erl b/src/machi_app.erl index 2701f60..d23718e 100644 --- a/src/machi_app.erl +++ b/src/machi_app.erl @@ -18,7 +18,10 @@ %% %% ------------------------------------------------------------------- -%% @doc Top-level supervisor for the Machi application. +%% @doc Start the top-level supervisor for the Machi application. +%% +%% See {@link machi_flu_psup} for an illustration of the entire Machi +%% application process structure. -module(machi_app). diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 6e48978..572bd8f 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -722,7 +722,7 @@ rank_and_sort_projections(Ps, CurrentProj) -> %% E+5: author=b, upi=[a,b], repairing=[c,d] (**) %% E+6: author=c, upi=[a,b], repairing=[c,d] (**) %% E+7: author=d, upi=[a,b], repairing=[c,d] (**) -%% E+... 6 more (**) epochs when c & d finish their respective repairs. +%% E+... 6 more (**) epochs when c & d finish their repairs. %% Ideally, the "(**)" epochs are avoidable churn. 
%% Perhaps this means that we should change the responsibility %% for repair management to the highest ranking member of the diff --git a/src/machi_chain_repair.erl b/src/machi_chain_repair.erl index b890d80..fbb36ff 100644 --- a/src/machi_chain_repair.erl +++ b/src/machi_chain_repair.erl @@ -18,22 +18,65 @@ %% %% ------------------------------------------------------------------- -%% @doc Erlang API for the Machi FLU TCP protocol version 1, with a -%% proxy-process style API for hiding messy details such as TCP -%% connection/disconnection with the remote Machi server. +%% @doc Perform "chain repair", i.e., resynchronization of Machi file +%% contents and metadata as servers are (re-)added to the chain. %% -%% Machi is intentionally avoiding using distributed Erlang for -%% Machi's communication. This design decision makes Erlang-side code -%% more difficult & complex, but it's the price to pay for some -%% language independence. Later in Machi's life cycle, we need to -%% (re-)implement some components in a non-Erlang/BEAM-based language. +%% The implementation here is a very basic one, and is probably a bit +%% slower than the original "demo day" implementation at +%% [https://github.com/basho/machi/blob/master/prototype/demo-day-hack/file0_repair_server.escript] %% -%% This module implements a "man in the middle" proxy between the -%% Erlang client and Machi server (which is on the "far side" of a TCP -%% connection to somewhere). This proxy process will always execute -%% on the same Erlang node as the Erlang client that uses it. The -%% proxy is intended to be a stable, long-lived process that survives -%% TCP communication problems with the remote server. +%% It's so easy to bikeshed this into a 1 year programming exercise. +%% +%% General TODO note: There are a lot of areas for exploiting parallelism here. 
+%% I've set the bikeshed aside for now, but "make repair faster" has a +%% lot of room for exploiting concurrency, overlapping reads & writes, +%% etc etc. There are also lots of different trade-offs to make with +%% regard to RAM use vs. disk use. +%% +%% There's no reason why repair can't be done: +%% +%%
    +%%
  1. Repair in parallel across multiple repairees ... Optimization. +%%
  2. +%%
  3. Repair multiple byte ranges concurrently ... Optimization. +%%
  4. +%%
  5. Use bigger chunks than the client originally used to write the file +%% ... Optimization ... but it would be the easiest to implement, e.g. use +%% constant-sized 4MB chunks. Unfortunately, it would also destroy +%% the ability to verify here that the chunk checksums are correct +%% *and* also propagate the correct checksum metadata to the +%% destination FLU. +%% +%% As an additional optimization, add a bit of #2 to start the next +%% read while the current write is still in progress. +%%
  6. +%%
  7. The current method centralizes the "smarts" required to compare +%% checksum differences ... move some computation to each FLU, then use +%% a Merkle- or other-compression-style scheme to reduce the amount of +%% data sent across a network. +%%
  8. +%%
+%% +%% Most/all of this could be executed in parallel on each FLU relative to +%% its own files. Then, in another TODO option, perhaps build a Merkle tree +%% or other summary of the local files and send that data structure to the +%% repair coordinator. +%% +%% Also, as another TODO note, repair_both_present() in the +%% prototype/demo-day code uses an optimization of calculating the MD5 +%% checksum of the chunk checksum data as it arrives, and if the two MD5s +%% match, then we consider the two files in sync. If there isn't a match, +%% then we sort the lines and try another MD5, and if they match, then we're +%% in sync. In theory, that's lower overhead than the procedure used here. +%% +%% NOTE that one reason I chose the "directives list" method is to have an +%% option, later, of choosing to repair a subset of repairee FLUs if there +%% is a big discrepancy between out of sync files: e.g., if FLU x has N +%% bytes out of sync but FLU y has 50N bytes out of sync, then it's likely +%% better to repair x only so that x can return to the UPI list quickly. +%% Also, in the event that all repairees are roughly comparably out of sync, +%% then the repair network traffic can be minimized by reading each chunk +%% only once. -module(machi_chain_repair). @@ -139,49 +182,6 @@ get_file_lists(Proxy, FLU_name, D) -> dict:append(File, {FLU_name, Size}, Dict) end, D, Res). -%% Wow, it's so easy to bikeshed this into a 1 year programming exercise. -%% -%% TODO: There are a lot of areas for exploiting parallelism here. -%% I've set the bikeshed aside for now, but "make repair faster" has a -%% lot of room for exploiting concurrency, overlapping reads & writes, -%% etc etc. There are also lots of different trade-offs to make with -%% regard to RAM use vs. disk use. -%% -%% TODO: There's no reason why repair can't be done 1).in parallel -%% across multiple repairees, and/or 2). with multiple byte ranges in -%% the same file, and/or 3). with bigger chunks. -%% -%% 1.
Optimization -%% 2. Optimization -%% 3. Optimization, but it would be the easiest to implement, e.g. use -%% constant-sized 4MB chunks. Unfortuntely, it would also destroy -%% the ability to verify here that the chunk checksums are correct -%% *and* also propagate the correct checksum metadata to the -%% destination FLU. -%% As an additional optimization, add a bit of #2 to start the next -%% read while the current write is still in progress. -%% -%% Most/all of this could be executed in parallel on each FLU relative to -%% its own files. Then, in another TODO option, perhaps build a Merkle tree -%% or other summary of the local files & send that data structure to the -%% repair coordinator. -%% -%% Also, as another TODO note, repair_both_present() in the -%% prototype/demo-day code uses an optimization of calculating the MD5 -%% checksum of the chunk checksum data as it arrives, and if the two MD5s -%% match, then we consider the two files in sync. If there isn't a match, -%% then we sort the lines and try another MD5, and if they match, then we're -%% in sync. In theory, that's lower overhead than the procedure used here. -%% -%% NOTE that one reason I chose the "directives list" method is to have an -%% option, later, of choosing to repair a subset of repairee FLUs if there -%% is a big discrepency between out of sync files: e.g., if FLU x has N -%% bytes out of sync but FLU y has 50N bytes out of sync, then it's likely -%% better to repair x only so that x can return to the UPI list quickly. -%% Also, in the event that all repairees are roughly comparably out of sync, -%% then the repair network traffic can be minimized by reading each chunk -%% only once. 
- make_repair_compare_fun(SrcFLU) -> fun({{Offset_X, _Sz_a, _Cs_a, FLU_a}, _N_a}, {{Offset_X, _Sz_b, _CS_b, FLU_b}, _N_b}) -> diff --git a/src/machi_cr_client.erl b/src/machi_cr_client.erl index e6c145d..a0e0324 100644 --- a/src/machi_cr_client.erl +++ b/src/machi_cr_client.erl @@ -21,6 +21,65 @@ %% @doc Erlang API for the Machi client-implemented Chain Replication %% (CORFU-style) protocol. %% +%% See also the docs for {@link machi_flu1_client} for additional +%% details on data types and operation descriptions. +%% +%% The API here is much simpler than the {@link machi_flu1_client} or +%% {@link machi_proxy_flu1_client} APIs. This module's API is a +%% proposed simple-but-complete form for clients who are not +%% interested in being an active participant in a Machi cluster and to +%% have the responsibility for Machi internals, i.e., client-side +%% Chain Replication, client-side read repair, client-side tracking of +%% internal Machi epoch & projection changes, etc. +%% +%% This client is implemented as a long-lived Erlang process using +%% `gen_server'-style OTP code practice. A naive client can expect +%% that this process will manage all transient TCP session +%% disconnections and Machi chain reconfigurations. This client's +%% efforts are best-effort and can require some time to retry +%% operations in certain failure cases, i.e., up to several seconds +%% during a Machi projection & epoch change when a new server is +%% added to the chain. +%% +%% Doc TODO: Once this API stabilizes, add all relevant data type details +%% to the EDoc here. +%% +%% +%% === Missing API features === +%% +%% So far, there is one missing client API feature that ought to be +%% added to Machi in the near future: more flexible checksum +%% management. +%% +%% Add a `source' annotation to all checksums to indicate where the +%% checksum was calculated. 
For example, +%% +%% +%% +%% Client-side checksums would be the "strongest" type of +%% checksum, meaning that any data corruption (of the original +%% data and/or of the checksum itself) can be detected after the +%% client-side calculation. There are too many horror stories on +%% The Net about IP PDUs that are corrupted but unnoticed due to +%% weak TCP checksums, buggy hardware, buggy OS drivers, etc. +%% Checksum versioning is also desirable if/when the current checksum +%% implementation changes from SHA-1 to something else. +%% +%% +%% === Implementation notes === +%% %% The major operation processing is implemented in a state machine-like %% manner. Before attempting an operation `X', there's an initial %% operation `pre-X' that takes care of updating the epoch id, @@ -74,6 +133,7 @@ -define(FLU_PC, machi_proxy_flu1_client). -define(TIMEOUT, 2*1000). +-define(DEFAULT_TIMEOUT, 10*1000). -define(MAX_RUNTIME, 8*1000). -record(state, { @@ -95,7 +155,7 @@ start_link(P_srvr_list) -> %% with `Prefix'. append_chunk(PidSpec, Prefix, Chunk) -> - append_chunk(PidSpec, Prefix, Chunk, infinity). + append_chunk(PidSpec, Prefix, Chunk, ?DEFAULT_TIMEOUT). %% @doc Append a chunk (binary- or iolist-style) of data to a file %% with `Prefix'. @@ -108,7 +168,7 @@ append_chunk(PidSpec, Prefix, Chunk, Timeout) -> append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra) when is_integer(ChunkExtra), ChunkExtra >= 0 -> - append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra, infinity). + append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra, ?DEFAULT_TIMEOUT). %% @doc Append a chunk (binary- or iolist-style) of data to a file %% with `Prefix'. @@ -118,10 +178,10 @@ append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra, Timeout) -> Chunk, ChunkExtra}}, Timeout). -%% %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. +%% @doc Read a chunk of data of size `Size' from `File' at `Offset'. 
read_chunk(PidSpec, File, Offset, Size) -> - read_chunk(PidSpec, File, Offset, Size, infinity). + read_chunk(PidSpec, File, Offset, Size, ?DEFAULT_TIMEOUT). %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. @@ -132,7 +192,7 @@ read_chunk(PidSpec, File, Offset, Size, Timeout) -> %% @doc Fetch the list of chunk checksums for `File'. checksum_list(PidSpec, File) -> - checksum_list(PidSpec, File, infinity). + checksum_list(PidSpec, File, ?DEFAULT_TIMEOUT). %% @doc Fetch the list of chunk checksums for `File'. @@ -143,7 +203,7 @@ checksum_list(PidSpec, File, Timeout) -> %% @doc Fetch the list of all files on the remote FLU. list_files(PidSpec) -> - list_files(PidSpec, infinity). + list_files(PidSpec, ?DEFAULT_TIMEOUT). %% @doc Fetch the list of all files on the remote FLU. @@ -155,7 +215,7 @@ list_files(PidSpec, Timeout) -> %% proxy process. quit(PidSpec) -> - gen_server:call(PidSpec, quit, infinity). + gen_server:call(PidSpec, quit, ?DEFAULT_TIMEOUT). %%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/src/machi_flu1.erl b/src/machi_flu1.erl index 0106c37..09c4982 100644 --- a/src/machi_flu1.erl +++ b/src/machi_flu1.erl @@ -26,7 +26,18 @@ %% primitive file server process vs. the larger Machi design of a FLU %% as a sequencer + file server + chain manager group of processes. %% -%% For the moment, this module also implements a rudimentary TCP-based +%% The FLU is named after the CORFU server "FLU" or "FLash Unit" server. +%% +%% === Protocol origins === +%% +%% The protocol implemented here is an artisanal, hand-crafted, silly +%% thing that was very quick to put together for a "demo day" proof of +%% concept. It will almost certainly be replaced with something else, +%% both in terms of wire format and better code separation of +%% serialization/deserialization vs. network transport management, +%% etc. 
+%% +%% For the moment, this module implements a rudimentary TCP-based %% protocol as the sole supported access method to the server, %% sequencer, and projection store. Conceptually, those three %% services are independent and ought to have their own protocols. As @@ -35,7 +46,7 @@ %% detection, it is very convenient that all three FLU-related %% services are accessed using the same single TCP port. %% -%% The FLU is named after the CORFU server "FLU" or "FLash Unit" server. +%% === TODO items === %% %% TODO There is a major missing feature in this FLU implementation: %% there is no "write-once" enforcement for any position in a Machi @@ -53,7 +64,7 @@ %% replication/chain repair. %% %% TODO Section 4.2 ("The Sequencer") says that the sequencer must -%% change its file assignments to new & unique names whenever we move +%% change its file assignments to new & unique names whenever we move %% to wedge state. This is not yet implemented. In the current %% Erlang process scheme (which will probably be changing soon), a %% simple implementation would stop all existing processes that are diff --git a/src/machi_flu1_client.erl b/src/machi_flu1_client.erl index 14f18a0..d1a0653 100644 --- a/src/machi_flu1_client.erl +++ b/src/machi_flu1_client.erl @@ -19,6 +19,34 @@ %% ------------------------------------------------------------------- %% @doc Erlang API for the Machi FLU TCP protocol version 1. +%% +%% This client API handles low-level PDU serialization/deserialization +%% and low-level TCP session management, e.g. open, receive, write, +%% close. The API for higher-level session management and Machi state +%% management can be found in {@link machi_proxy_flu1_client} and +%% {@link machi_cr_client}. +%% +%% TODO This EDoc was written first, and the EDoc and also `-type' and +%% `-spec' definitions for {@link machi_proxy_flu1_client} and {@link +%% machi_cr_client} must be improved. 
+%% +%% === Protocol origins === +%% +%% The protocol implemented here is an artisanal, hand-crafted, silly +%% thing that was very quick to put together for a "demo day" proof of +%% concept. It will almost certainly be replaced with something else, +%% both in terms of wire format and better code separation of +%% serialization/deserialization vs. network transport management, +%% etc. +%% +%% For the moment, this module implements a rudimentary TCP-based +%% protocol as the sole supported access method to the server, +%% sequencer, and projection store. Conceptually, those three +%% services are independent and ought to have their own protocols. As +%% a practical matter, there is no need for wire protocol +%% compatibility. Furthermore, from the perspective of failure +%% detection, it is very convenient that all three FLU-related +%% services are accessed using the same single TCP port. -module(machi_flu1_client). diff --git a/src/machi_flu_psup.erl b/src/machi_flu_psup.erl index a4fe0ad..f7bbf87 100644 --- a/src/machi_flu_psup.erl +++ b/src/machi_flu_psup.erl @@ -20,6 +20,42 @@ %% @doc Supervisor for Machi FLU servers and their related support %% servers. +%% +%% Our parent supervisor, {@link machi_flu_sup}, is responsible for +%% managing FLUs as a single entity. However, the actual +%% implementation of a FLU includes three major Erlang processes (not +%% including support/worker procs): the FLU itself, the FLU's +%% projection store, and the FLU's local chain manager. This +%% supervisor is responsible for managing those three major services +%% as a single "package", to be started & stopped together. +%% +%% The illustration below shows the OTP process supervision tree for +%% the Machi application. Two FLUs are running, called `a' and `b'. +%% The chain is configured for a third FLU, `c', which is not running +%% at this time. +%% +%% +%% +%% -module(machi_flu_psup). 
diff --git a/src/machi_flu_sup.erl b/src/machi_flu_sup.erl index 51efd87..5082b55 100644 --- a/src/machi_flu_sup.erl +++ b/src/machi_flu_sup.erl @@ -20,6 +20,9 @@ %% @doc Supervisor for Machi FLU servers and their related support %% servers. +%% +%% See {@link machi_flu_psup} for an illustration of the entire Machi +%% application process structure. -module(machi_flu_sup). diff --git a/src/machi_projection_store.erl b/src/machi_projection_store.erl index 8818588..4eed3ed 100644 --- a/src/machi_projection_store.erl +++ b/src/machi_projection_store.erl @@ -22,11 +22,13 @@ %% %% This API is gen_server-style message passing, intended for use %% within a single Erlang node to glue together the projection store -%% server with the node-local process that implements Machi's TCP +%% server with the node-local process that implements Machi's FLU %% client access protocol (on the "server side" of the TCP connection). %% %% All Machi client access to the projection store SHOULD NOT use this -%% module's API. +%% module's API. Instead, clients should access indirectly via {@link +%% machi_cr_client}, {@link machi_proxy_flu1_client}, or {@link +%% machi_flu1_client}. %% %% The projection store is implemented by an Erlang/OTP `gen_server' %% process that is associated with each FLU. Conceptually, the diff --git a/src/machi_proxy_flu1_client.erl b/src/machi_proxy_flu1_client.erl index 3d0100e..1fc5af5 100644 --- a/src/machi_proxy_flu1_client.erl +++ b/src/machi_proxy_flu1_client.erl @@ -25,7 +25,7 @@ %% Machi is intentionally avoiding using distributed Erlang for %% Machi's communication. This design decision makes Erlang-side code %% more difficult & complex, but it's the price to pay for some -%% language independence. Later in Machi's life cycle, we need to +%% language independence. Later in Machi's life cycle, we may (?) need to %% (re-)implement some components in a non-Erlang/BEAM-based language. 
%% %% This module implements a "man in the middle" proxy between the @@ -34,6 +34,9 @@ %% on the same Erlang node as the Erlang client that uses it. The %% proxy is intended to be a stable, long-lived process that survives %% TCP communication problems with the remote server. +%% +%% For a higher level interface, see {@link machi_cr_client}. +%% For a lower level interface, see {@link machi_flu1_client}. -module(machi_proxy_flu1_client). diff --git a/src/machi_sup.erl b/src/machi_sup.erl index 31fcc9b..5ffe918 100644 --- a/src/machi_sup.erl +++ b/src/machi_sup.erl @@ -19,6 +19,9 @@ %% ------------------------------------------------------------------- %% @doc Top Machi application supervisor. +%% +%% See {@link machi_flu_psup} for an illustration of the entire Machi +%% application process structure. -module(machi_sup).