Type up Friday's edits
parent: 62d3dadf98
commit: 60dfff0c86
3 changed files with 219 additions and 187 deletions
@@ -191,11 +191,11 @@ newpath 467 -238 moveto 467 -265 lineto stroke
newpath 552 -238 moveto 552 -265 lineto stroke
newpath 42 -251 moveto 382 -251 lineto stroke
newpath 382 -251 moveto 372 -257 lineto stroke
(write "foo.seq_a.009" offset=447 <<...123...>> epoch=13) dup stringwidth
(write "foo.seq_a.009" offset=447 <<123 bytes...>> epoch=13) dup stringwidth
1.000000 1.000000 1.000000 setrgbcolor
pop dup newpath 62 -249 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
pop dup newpath 51 -249 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
0.000000 0.000000 0.000000 setrgbcolor
62 -249 moveto show
51 -249 moveto show
newpath 42 -265 moveto 42 -292 lineto stroke
newpath 127 -265 moveto 127 -292 lineto stroke
newpath 212 -265 moveto 212 -292 lineto stroke
@@ -219,11 +219,11 @@ newpath 467 -292 moveto 467 -319 lineto stroke
newpath 552 -292 moveto 552 -319 lineto stroke
newpath 42 -305 moveto 467 -305 lineto stroke
newpath 467 -305 moveto 457 -311 lineto stroke
(write "foo.seq_a.009" offset=447 <<...123...>> epoch=13) dup stringwidth
(write "foo.seq_a.009" offset=447 <<123 bytes...>> epoch=13) dup stringwidth
1.000000 1.000000 1.000000 setrgbcolor
pop dup newpath 105 -303 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
pop dup newpath 94 -303 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
0.000000 0.000000 0.000000 setrgbcolor
105 -303 moveto show
94 -303 moveto show
newpath 42 -319 moveto 42 -346 lineto stroke
newpath 127 -319 moveto 127 -346 lineto stroke
newpath 212 -319 moveto 212 -346 lineto stroke
@@ -247,11 +247,11 @@ newpath 467 -346 moveto 467 -373 lineto stroke
newpath 552 -346 moveto 552 -373 lineto stroke
newpath 42 -359 moveto 552 -359 lineto stroke
newpath 552 -359 moveto 542 -365 lineto stroke
(write "foo.seq_a.009" offset=447 <<...123...>> epoch=13) dup stringwidth
(write "foo.seq_a.009" offset=447 <<123 bytes...>> epoch=13) dup stringwidth
1.000000 1.000000 1.000000 setrgbcolor
pop dup newpath 147 -357 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
pop dup newpath 136 -357 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
0.000000 0.000000 0.000000 setrgbcolor
147 -357 moveto show
136 -357 moveto show
newpath 42 -373 moveto 42 -400 lineto stroke
newpath 127 -373 moveto 127 -400 lineto stroke
newpath 212 -373 moveto 212 -400 lineto stroke

@@ -105,11 +105,11 @@ newpath 467 -76 moveto 467 -103 lineto stroke
newpath 552 -76 moveto 552 -103 lineto stroke
newpath 42 -89 moveto 382 -89 lineto stroke
newpath 382 -89 moveto 372 -95 lineto stroke
(write prefix="foo" <<...123...>> epoch=12) dup stringwidth
(append prefix="foo" <<123 bytes...>> epoch=12) dup stringwidth
1.000000 1.000000 1.000000 setrgbcolor
pop dup newpath 104 -87 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
pop dup newpath 85 -87 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
0.000000 0.000000 0.000000 setrgbcolor
104 -87 moveto show
85 -87 moveto show
newpath 42 -103 moveto 42 -130 lineto stroke
newpath 127 -103 moveto 127 -130 lineto stroke
newpath 212 -103 moveto 212 -130 lineto stroke
@@ -163,11 +163,11 @@ newpath 467 -184 moveto 467 -211 lineto stroke
newpath 552 -184 moveto 552 -211 lineto stroke
newpath 42 -197 moveto 382 -197 lineto stroke
newpath 382 -197 moveto 372 -203 lineto stroke
(write prefix="foo" <<...123...>> epoch=13) dup stringwidth
(append prefix="foo" <<123 bytes...>> epoch=13) dup stringwidth
1.000000 1.000000 1.000000 setrgbcolor
pop dup newpath 104 -195 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
pop dup newpath 85 -195 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
0.000000 0.000000 0.000000 setrgbcolor
104 -195 moveto show
85 -195 moveto show
newpath 42 -211 moveto 42 -238 lineto stroke
newpath 127 -211 moveto 127 -238 lineto stroke
newpath 212 -211 moveto 212 -238 lineto stroke
@@ -224,17 +224,13 @@ newpath 297 -292 moveto 297 -319 lineto stroke
newpath 382 -292 moveto 382 -319 lineto stroke
newpath 467 -292 moveto 467 -319 lineto stroke
newpath 552 -292 moveto 552 -319 lineto stroke
(FLU_A writes to local storage @ "foo.seq_a.009" offset=447) dup stringwidth
newpath 382 -305 85 13 270 90 ellipse stroke
newpath 382 -311 moveto 392 -317 lineto stroke
(write "foo.seq_a.009" offset=447 <<123 bytes...>> epoch=13) dup stringwidth
1.000000 1.000000 1.000000 setrgbcolor
pop dup newpath 138 -308 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
pop dup newpath 58 -303 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
0.000000 0.000000 0.000000 setrgbcolor
138 -308 moveto show
[2] 0 setdash
newpath 21 -305 moveto 136 -305 lineto stroke
[] 0 setdash
[2] 0 setdash
newpath 459 -305 moveto 574 -305 lineto stroke
[] 0 setdash
58 -303 moveto show
newpath 42 -319 moveto 42 -346 lineto stroke
newpath 127 -319 moveto 127 -346 lineto stroke
newpath 212 -319 moveto 212 -346 lineto stroke
@@ -244,11 +240,11 @@ newpath 467 -319 moveto 467 -346 lineto stroke
newpath 552 -319 moveto 552 -346 lineto stroke
newpath 382 -332 moveto 467 -332 lineto stroke
newpath 467 -332 moveto 457 -338 lineto stroke
(write "foo.seq_a.009" offset=447 <<...123...>> epoch=13) dup stringwidth
(write "foo.seq_a.009" offset=447 <<123 bytes...>> epoch=13) dup stringwidth
1.000000 1.000000 1.000000 setrgbcolor
pop dup newpath 275 -330 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
pop dup newpath 264 -330 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
0.000000 0.000000 0.000000 setrgbcolor
275 -330 moveto show
264 -330 moveto show
newpath 42 -346 moveto 42 -373 lineto stroke
newpath 127 -346 moveto 127 -373 lineto stroke
newpath 212 -346 moveto 212 -373 lineto stroke
@@ -258,11 +254,11 @@ newpath 467 -346 moveto 467 -373 lineto stroke
newpath 552 -346 moveto 552 -373 lineto stroke
newpath 467 -359 moveto 552 -359 lineto stroke
newpath 552 -359 moveto 542 -365 lineto stroke
(write "foo.seq_a.009" offset=447 <<...123...>> epoch=13) dup stringwidth
(write "foo.seq_a.009" offset=447 <<123 bytes...>> epoch=13) dup stringwidth
1.000000 1.000000 1.000000 setrgbcolor
pop dup newpath 295 -357 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
pop dup newpath 273 -357 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
0.000000 0.000000 0.000000 setrgbcolor
295 -357 moveto show
273 -357 moveto show
newpath 42 -373 moveto 42 -400 lineto stroke
newpath 127 -373 moveto 127 -400 lineto stroke
newpath 212 -373 moveto 212 -400 lineto stroke
@@ -302,16 +298,16 @@ newpath 297 -427 moveto 297 -454 lineto stroke
newpath 382 -427 moveto 382 -454 lineto stroke
newpath 467 -427 moveto 467 -454 lineto stroke
newpath 552 -427 moveto 552 -454 lineto stroke
(If, instead, FLU_C has an error...) dup stringwidth
(If, in an alternate scenario, FLU_C has an error...) dup stringwidth
1.000000 1.000000 1.000000 setrgbcolor
pop dup newpath 210 -443 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
pop dup newpath 167 -443 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
0.000000 0.000000 0.000000 setrgbcolor
210 -443 moveto show
167 -443 moveto show
[2] 0 setdash
newpath 21 -440 moveto 208 -440 lineto stroke
newpath 21 -440 moveto 165 -440 lineto stroke
[] 0 setdash
[2] 0 setdash
newpath 386 -440 moveto 574 -440 lineto stroke
newpath 429 -440 moveto 574 -440 lineto stroke
[] 0 setdash
newpath 42 -454 moveto 42 -481 lineto stroke
newpath 127 -454 moveto 127 -481 lineto stroke
@@ -336,14 +332,14 @@ newpath 297 -481 moveto 297 -508 lineto stroke
newpath 382 -481 moveto 382 -508 lineto stroke
newpath 467 -481 moveto 467 -508 lineto stroke
newpath 552 -481 moveto 552 -508 lineto stroke
(Repair is now the client's responsibility \("slow path"\).) dup stringwidth
(... then repair becomes the client's responsibility \("slow path"\).) dup stringwidth
1.000000 1.000000 1.000000 setrgbcolor
pop dup newpath 158 -497 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
pop dup newpath 133 -497 moveto 0 rlineto 0 11 rlineto neg 0 rlineto closepath fill
0.000000 0.000000 0.000000 setrgbcolor
158 -497 moveto show
133 -497 moveto show
[2] 0 setdash
newpath 21 -494 moveto 156 -494 lineto stroke
newpath 21 -494 moveto 131 -494 lineto stroke
[] 0 setdash
[2] 0 setdash
newpath 439 -494 moveto 574 -494 lineto stroke
newpath 464 -494 moveto 574 -494 lineto stroke
[] 0 setdash

@@ -23,8 +23,8 @@
\copyrightdata{978-1-nnnn-nnnn-n/yy/mm}
\doi{nnnnnnn.nnnnnnn}

\titlebanner{Draft \#0, April 2014}
\preprintfooter{Draft \#0, April 2014}
\titlebanner{Draft \#1, April 2014}
\preprintfooter{Draft \#1, April 2014}

\title{Machi: an immutable file store}
\subtitle{High level design \& strawman implementation suggestions \\
@@ -76,10 +76,9 @@ document.
\par
\hfill{--- Fred Hebert, {\tt @mononcqc}}
\end{quotation}
\subsection{Name}
\subsection{Origin of the name ``Machi''}
\label{sub:name}

This file store will be called ``Machi''.
``Machi'' is a Japanese word for
``village'' or ``small town''. A village is a rather self-contained
thing, but it is small, not like a city.
@@ -95,15 +94,15 @@ built out of a single village.

Machi is a client-server system. All servers in a Machi cluster store
identical copies/replicas of all files, preferably large files.
\begin{itemize}
\item This puts an effective limit on the size of a Machi cluster.
For example, five servers will replicate all files
for an effective replication $N$ factor of 5.
\item Any mechanism to distribute files across a subset of Machi
servers is outside the scope of Machi and of this design.
\end{itemize}
This puts an effective limit on the size of a Machi cluster.
For example, five servers will replicate all files
for an effective replication $N$ factor of 5.

``Large file'' is intended to mean hundreds of MBytes or more
Any mechanism to distribute files across a subset of Machi
servers is outside the scope of Machi and of this design.

Machi's design assumes that it stores mostly large files.
``Large file'' means hundreds of MBytes or more
per file. The design ``sweet spot'' targets about
1 GByte/file and/or managing up to a few million files in a
single cluster. The maximum size of a single Machi file is
@@ -112,26 +111,15 @@ practical estimate is 2Tbytes or less but may be larger.

Machi files are write-once, read-many data structures; the label
``append-only'' is mostly correct. However, to be 100\% truthful,
the bytes of a Machi file can be written in any order.
the bytes of a Machi file can be written temporally in any order.

Machi files are always named by the server; Machi clients have no
direct control of the name assigned by a Machi server. Machi servers
specify the file name and byte offset to all client write requests.
determine the file name and byte offset to all client write requests.
(Machi clients may advise servers with a desired file name prefix.)

Machi is not a Hadoop file system (HDFS) replacement.
%% \begin{itemize}
% \item
There is no mechanism for writing Machi files to a subset of
available storage servers: all servers in a Machi cluster store
identical copies/replicas of all files.
% \item
However, Machi is intended to play very nicely with a layer above it,
where that layer {\em does} handle file scattering and on-the-fly
file migration across servers and all of the nice things that
HDFS, Riak CS, and similar systems can do.

Robust and reliable means that Machi will not lose data until a
Machi shall be a
robust and reliable system. Machi will not lose data until a
fundamental assumption has been violated, e.g., all servers have
crashed permanently. Machi's file replication algorithms can provide
strong or eventual consistency and are provably correct. Our only
@@ -153,6 +141,18 @@ incomplete writes may happen long after the client has finished or
even crashed. In effect, Machi will provide clients with
``at least once'' behavior for writes.

Machi is not a Hadoop file system (HDFS) replacement.
%% \begin{itemize}
% \item
There is no mechanism for writing Machi files to a subset of
available storage servers: all servers in a Machi cluster store
identical copies/replicas of all files.
% \item
However, Machi is intended to play very nicely with a layer above it,
where that layer {\em does} handle file scattering and on-the-fly
file migration across servers and all of the nice things that
HDFS, Riak CS, and similar systems can do.

\subsection{Defining a Machi file}

A Machi ``file'' is an undifferentiated, one-dimensional array of
@@ -167,10 +167,11 @@ shows the basic shape of the service.
\begin{figure}
\begin{itemize}
\item Append bytes $B$ to a file with name prefix {\tt "foo"}.
\item Read $N$ bytes from offset $O$ from file $F$.
\item Write bytes $B$ to offset $O$ of file $F$.
\item Read $N$ bytes from offset $O$ of file $F$.
\item List files: name, size, etc.
\end{itemize}
\caption{Full (?) list of file API operations}
\caption{Nearly complete list of file API operations}
\label{fig:example-client-API}
\end{figure}

@@ -180,12 +181,12 @@ order of 4 KBytes or 16 KBytes.)

\begin{figure}
\begin{enumerate}
\item Client1: Write 1 byte at offset 0.
\item Client1: Read 1 byte at offset 0.
\item Client2: Write 1 byte at offset 2.
\item Client2: Read 1 byte at offset 2.
\item Client3: (an intermittently slow client) Write 1 byte at offset 1.
\item Client3: Read 1 byte at offset 1.
\item Client1: Write 1 byte at offset 0 of file $F$.
% \item Client1: Read 1 byte at offset 0 of file $F$.
\item Client2: Write 1 byte at offset 2 of file $F$.
% \item Client2: Read 1 byte at offset 2 of file $F$.
\item Client3: (an intermittently slow client) Write 1 byte at offset 1 of file $F$.
% \item Client3: Read 1 byte at offset 1 of file $F$.
\end{enumerate}
\caption{Example of temporally out-of-order file append sequence that
is valid within a Machi cluster.}
@@ -262,7 +263,7 @@ Bit-rot can and will happen. To guard against bit-rot on disk, strong
\begin{itemize}
\item Client-calculated checksums of appended data
\item Whole-file checksums, calculated by Machi servers for internal
sanity checking. See \ref{sub:detecting-corrupted} for
sanity checking. See Section~\ref{sub:detecting-corrupted} for
commentary on how this may not be feasible.
\item Any other place that makes sense for the paranoid.
\end{itemize}
@@ -284,10 +285,8 @@ the per-append checksums described in Section~\ref{sub:bit-rot}
\begin{itemize}
\item File metadata is strictly append-only.
\item File metadata is always eventually consistent.
\item A complete history of all metadata updates is maintained for
each file.
\item Temporal order of metadata entries is not preserved.
\item Multiple histories for a file may be merged at any time.
\item Multiple metadata stores for a file may be merged at any time.
\begin{itemize}
\item If a client requires idempotency, then the property list
should contain all information required to identify multiple
@@ -298,6 +297,9 @@ the per-append checksums described in Section~\ref{sub:bit-rot}
\end{itemize}
\end{itemize}

{\bf NOTE:} It isn't yet clear how much support early versions of
Machi will need for file metadata features.

\subsubsection{File replica management via Chain Replication}
\label{sub:chain-replication}

@@ -313,7 +315,7 @@ restrictions:
\begin{enumerate}
\item All writes are strictly performed by servers that are arranged
in a single order, known as the ``chain order'', beginning at the
chain's head.
chain's head and ending at the chain's tail.
\item All strongly consistent reads are performed only by the tail of
the chain, i.e., the last server in the chain order.
\item Inconsistent reads may be performed by any single server in the
@@ -321,10 +323,10 @@ restrictions:
\end{enumerate}

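The three restrictions above are mechanical enough to express directly. As a minimal illustrative sketch (module and function names invented here, not taken from Machi or this commit), an operation-routing function over a chain-ordered list of FLUs might look like this in Erlang:

-module(chain_rules_sketch).
-export([who_serves/2]).

%% Sketch only: route an operation per the Chain Replication
%% restrictions enumerated above. A chain is a list of FLUs in
%% chain order, head first.
who_serves(write, [Head | _]) ->
    Head;                               %% all writes begin at the head
who_serves(consistent_read, Chain) ->
    lists:last(Chain);                  %% strong reads go to the tail only
who_serves(inconsistent_read, Chain) ->
    %% inconsistent reads may use any single server in the chain
    lists:nth(rand:uniform(length(Chain)), Chain).
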
Machi contains enough Chain Replication implementation to maintain its
chain state, file data integrity, and file metadata eventual
chain state, strict file data integrity, and file metadata eventual
consistency. See also Section~\ref{sub:self-management}.

The first version of Machi would use a single chain for managing all
The first version of Machi will use a single chain for managing all
files in the cluster. If the system is quiescent,
then all chain members store the same data: all
Machi servers will all store identical files. Later versions of Machi
@@ -365,6 +367,8 @@ of poor health will automatically reconfigure the Machi cluster
to avoid data loss and to provide maximum availability.
For example, if a server $S$ crashes and later
restarts, Machi will automatically bring the data on $S$ back to full sync.
This service will be provided by the ``chain manager'', which is
described in \cite{machi-chain-manager-design}.

Machi will provide an administration API for managing Machi servers, e.g.,
cluster membership, file integrity and checksum verification, etc.
@@ -407,16 +411,6 @@ considered out-of-scope for Machi.
burden of physical separation of each coded piece (i.e., ``rack
awareness'') someone/something else's problem.

Why would would someone wish to run a Machi cluster with only one
server (i.e., chain length of one) rather than using the FLU service
(Section~\ref{sub:flu}) by itself? One answer is that data
migration is much easier with all of Machi than with only the FLU
server. To migrate all files from FLU $F_a$ to FLU $F_b$, the administrator
merely needs to add $F_b$ to the end of $F_a$'s chain. When the data
repair is finished, we know that $F_b$ stores full replicas of all of
$F_a$'s data. The administrator removes $F_a$ from the chain, and the
data migration is finished.

\section{Architecture: base components and ideas}

This section presents the major architectural components. They are:
@@ -427,19 +421,19 @@ This section presents the major architectural components. They are:
\item The Sequencer: assigns a unique file name + offset to each file
append request.
(Section \ref{sub:sequencer})
\item The Projection Store: a write-once key-value blob store, used by
Machi for storing projections.
(Section \ref{sub:proj-store})
\item The chain manager: monitors the health of the
chain and calculates new projections when failure is detected.
(Section \ref{sub:chain-manager})
\item The Projection Store: a write-once key-value blob store, used by
Machi's chain manager for storing projections.
(Section \ref{sub:proj-store})
\end{itemize}

Also presented here are the major concepts used by Machi components:
\begin{itemize}
\item The Projection: the data structure that describes the current
state of the Machi chain.
and is stored in the write-once Projection Store.
Projections are stored in the write-once Projection Store.
(Section \ref{sub:projection})
\item The Projection Epoch Number (a.k.a.~The Epoch): Each projection
is numbered with an epoch.
@@ -464,7 +458,7 @@ The basic idea of the FLU is borrowed from CORFU. The base CORFU
data server is called a ``flash unit''. For Machi, the equivalent
server is nicknamed a FLU, a ``FiLe replica Unit''. A FLU is
responsible for maintaining a single replica/copy of each file
(and its associated metadata) stored in a Machi cluster
(and its associated metadata) stored in a Machi cluster.

The FLU's API is very simple: see Figure~\ref{fig:flu-api} for its
data types and operations. This description is not 100\% complete but
@@ -484,9 +478,12 @@ is sufficient for discussion purposes.
error_bad_checksum | error_unavailable.
-type m_name() :: binary().
-type m_offset() :: non_neg_integer().
-type m_prefix() :: binary().
-type m_rerror() :: m_err_r() | m_generr().
-type m_werror() :: m_generr() | m_err_w().

-spec append(m_prefix(), m_bytes(), m_epoch()) -> {ok, m_name(), m_offset()} |
             m_werror().
-spec fill(m_name(), m_offset(), integer(), m_epoch()) -> ok | m_fill_err() |
             m_werror().
-spec list_files() -> {ok, [m_file_info()]} | m_generr().
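As a minimal illustrative sketch of driving this API from a client (the module name, the rpc/1 stub, and the reply values below are invented for the example; only the types and specs above come from the figure):

-module(flu_api_sketch).
-export([append_example/0]).

%% Invented stub transport: pretend the head FLU accepted the append
%% and replied with a server-chosen file name and offset.
rpc({append, _Prefix, _Bytes, _Epoch}) ->
    {ok, <<"foo.seq_a.009">>, 447}.

%% Append 123 bytes under prefix <<"foo">> at epoch 13. Note that the
%% server, not the client, chooses the file name and offset.
append_example() ->
    Bytes = binary:copy(<<"x">>, 123),
    {ok, File, Offset} = rpc({append, <<"foo">>, Bytes, 13}),
    {File, Offset}.
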
@@ -511,7 +508,7 @@ Transitions between these states are strictly ordered.
See Section~\ref{sub:assume-append-only} for state transitions and
the restrictions related to those transitions.

The FLU also keeps track of the projection number (number and checksum
The FLU also keeps track of the projection epoch number (number and checksum
both, see also Section~\ref{sub:flu-divergence}) of the last modification to a
file. This projection number is used for quick comparisons during
repair (Section~\ref{sec:repair}) to determine if files are in sync or
@@ -525,7 +522,7 @@ In Machi, the type signature of {\tt
of the projection's contents. This checksum is used in cases where
Machi is configured to run in ``AP mode'', which allows a running Machi
cluster to fragment into multiple running sub-clusters during network
partitions. Each sub-cluster can choose a projection number
partitions. Each sub-cluster can choose a projection epoch number
$P_{side}$ for its side of the cluster.

After the partition is
@@ -568,7 +565,7 @@ used to continue:
\item If the client's write has been successful on at least the head
FLU in the chain, then the client may continue to use the old
location. The client is now performing read repair of this location in
the new epoch. (The client may have to add a ``read repair'' option
the new epoch. (The client may be required to add a ``read repair'' option
to its requests to bypass the FLU's usual enforcement of the
location's epoch.)
\item If the client's write to the head FLU has not started yet, or if
@@ -577,6 +574,13 @@ used to continue:
request a new assignment from the sequencer.
\end{itemize}

If the client eventually wishes to write a contiguous chunk of $Y$
bytes, but only $X$ bytes ($X < Y$) are available right now, the
client may make a sequencer request for the larger $Y$ byte range
immediately. The client then uses this file~+~byte range assignment
to write the $X$ bytes now and all of the remaining $Y-X$ bytes at
some later time.

\subsubsection{Divergence from CORFU}
\label{sub:sequencer-divergence}

@@ -602,15 +606,19 @@ that generates unique file names is sufficient.
\subsection{The Projection Store}
\label{sub:proj-store}

Each FLU maintains a key-value store for the purpose of storing
Each FLU maintains a key-value store of write-once registers
for the purpose of storing
projections. Reads \& writes to this store are provided by the FLU
administration API. The projection store runs on each server that
provides FLU service, for two reasons of convenience. First, the
provides FLU service, for several reasons. First, the
projection data structure
need not include extra server names to identify projection
store servers or their locations.
Second, writes to the projection store require
notification to a FLU of the projection update anyway.
Third, certain kinds of writes to the projection store indicate
changes in cluster status which require prompt changes of state inside
of the FLU (e.g., entering wedge state).

The store's basic operation set is simple: get, put, get largest key
(and optionally its value), and list all keys.
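A minimal Erlang sketch of the store's write-once register semantics (module name invented for illustration; a plain map stands in for the store):

-module(proj_store_sketch).
-export([new/0, put/3, get/2, max_key/1]).

%% Sketch only: the projection store as a map of write-once registers.
new() -> #{}.

%% A second write to the same key must fail with error_written.
put(Key, Val, Store) ->
    case maps:is_key(Key, Store) of
        true  -> {error_written, Store};
        false -> {ok, maps:put(Key, Val, Store)}
    end.

get(Key, Store) ->
    maps:find(Key, Store).    %% {ok, Val} | error

%% ``Get largest key'', the store's other basic operation.
max_key(Store) ->
    case maps:keys(Store) of
        []   -> error_unwritten;
        Keys -> lists:max(Keys)
    end.
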
@@ -627,7 +635,7 @@ The projection store's data types are:

As a write-once register, any attempt to write a key $K$ when the
local store already has a value written for $K$ will always fail
with an {\tt error\_written} error.
with an {\tt error\_written} status.

Any write of a key whose value is larger than the FLU's current
projection number will move the FLU to the wedged state
@@ -636,17 +644,21 @@ projection number will move the FLU to the wedged state
The contents of the projection blob store are maintained by neither
Chain Replication techniques nor any other server-side technique. All
replication and read repair is done only by the projection store
client. Astute readers may theorize that race conditions exist in
clients. Astute readers may theorize that race conditions exist in
such management; see Section~\ref{sec:projections} for details and
restrictions that make it practical.

\subsection{The chain manager}
\label{sub:chain-manager}

Each FLU runs an administration agent that is responsible for
monitoring the health of the entire Machi cluster. If a change of
state is noticed (via measurement) or is requested (via the
administration API), zero or more actions may be taken:
Each FLU runs an administration agent, the chain manager, that is
responsible for monitoring the health of the entire Machi cluster.
Each chain manager instance is fully autonomous and communicates with
other chain managers indirectly via writes and reads to its peers'
projection stores.

If a change of state is noticed (via measurement) or is requested (via
the administration API), one or more actions may be taken:

\begin{itemize}
\item Enter wedge state (Section~\ref{sub:wedge}).
@@ -703,6 +715,8 @@ Pseudo-code for the projection's definition is shown in
Figure~\ref{fig:projection}. To summarize the major components:

\begin{itemize}
\item {\tt epoch\_number} and {\tt epoch\_csum} The epoch number and
projection checksum are unique identifiers for this projection.
\item {\tt creation\_time} Wall-clock time, useful for humans and
general debugging effort.
\item {\tt author\_server} Name of the server that calculated the projection.
@@ -730,13 +744,14 @@ Figure~\ref{fig:projection}. To summarize the major components:
Most Machi protocol actions are tagged with the actor's best knowledge
of the current epoch. However, Machi does not have a single/master
coordinator for making configuration changes. Instead, change is
performed in a fully asynchronous manner. During a cluster
performed in a fully asynchronous manner by
each local chain manager. During a cluster
configuration change, some servers will use the old projection number,
$P_p$, whereas others know of a newer projection, $P_{p+x}$ where $x>0$.

When a protocol operation with $P_p$ arrives at an actor who knows
$P_{p+x}$, the response must be {\tt error\_bad\_epoch}. This is a signal
that the actor using $P_p$ is indeed out-of-date and that a newer
When a protocol operation with $P_{p-x}$ arrives at an actor who knows
$P_p$, the response must be {\tt error\_bad\_epoch}. This is a signal
that the actor using $P_{p-x}$ is indeed out-of-date and that a newer
projection must be found and used.

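A minimal Erlang sketch of this epoch guard (module and names invented for illustration):

-module(epoch_guard_sketch).
-export([check_epoch/2]).

%% Sketch only: an operation tagged with an older epoch than the
%% server's current epoch is rejected; a newer epoch wedges the
%% server (see ``The Wedge'' below).
check_epoch(OpEpoch, MyEpoch) when OpEpoch < MyEpoch ->
    error_bad_epoch;   %% caller must find and use a newer projection
check_epoch(OpEpoch, MyEpoch) when OpEpoch > MyEpoch ->
    wedged;            %% server stops serving new requests
check_epoch(_OpEpoch, _MyEpoch) ->
    ok.
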
\subsection{The Wedge}
@@ -744,12 +759,12 @@ projection must be found and used.

If a FLU server is using a projection $P_p$ and receives a protocol
message that mentions a newer projection $P_{p+x}$ that is larger than its
current projection value, then it must enter ``wedge'' state and stop
current projection value, then it enters ``wedge'' state and stops
processing all new requests. The server remains in wedge state until
a new projection (with a larger/higher epoch number) is discovered and
appropriately acted upon.
In the Windows Azure storage system \cite{was}, this state is called
the ``sealed'' state.
(In the Windows Azure storage system \cite{was}, this state is called
the ``sealed'' state.)

\subsection{``AP Mode'' and ``CP Mode''}
\label{sub:ap-cp-mode}
@@ -764,14 +779,14 @@ sufficient for an ``AP Mode'' Machi service. In AP Mode, all mutations
to any file on any side of a network partition are guaranteed to use
unique locations (file names and/or byte offsets). When network
partitions are healed, all files can be merged together
(while considering the file format detail discussed in
the footnote of Section~\ref{ssec:just-rsync-it}) in any order
(while considering the details discussed in
Section~\ref{ssec:just-rsync-it}) in any order
without conflict.

``CP mode'' will be extensively covered in other documents. In summary,
to support ``CP mode'', we believe that the chain manager
service proposed here can guarantee strong consistency
at all times.
``CP mode'' will be extensively covered in~\cite{machi-chain-manager-design}.
In summary, to support ``CP mode'', we believe that the chain manager
service proposed by~\cite{machi-chain-manager-design} can guarantee
strong consistency at all times.

\section{Sketches of single operations}
\label{sec:sketches}
@@ -791,8 +806,8 @@ at all times.

To write/append atomically a single sequence/hunk of bytes to a file,
here's the sequence of steps required.
See Figure~\ref{fig:append-flow} for a diagram showing an example
append; the same example is also shown in
See Figure~\ref{fig:append-flow} for a diagram that illustrates this
example; the same example is also shown in
Figure~\ref{fig:append-flowMSC} using MSC style (message sequence chart).
In
this case, the first FLU contacted has a newer projection epoch,
@@ -807,21 +822,26 @@ prefixes $Pref1$ and $Pref2$ where $Pref1 \ne Pref2$, then the two byte
sequences will definitely be written to different files. If
$Pref1 = Pref2$,
then the sequencer may choose the same file for both (but no
guarantee of how ``close together'' the two requests might be).
guarantee of how ``close together'' the two requests might be time-wise).

\item (cacheable) Find the list of Machi member servers. This step is
only needed at client initialization time or when all Machi members
are down/unavailable. This step is out of scope of Machi, i.e., found
via another source: local configuration file, DNS, LDAP, Riak KV, ZooKeeper,
carrier pigeon, etc.
carrier pigeon, papyrus, etc.

\item (cacheable) Find the current projection number and projection data
structure by fetching it from one of the Machi FLU servers'
projection store services. This info
may be cached and reused for as long as Machi server requests do not
may be cached and reused for as long as Machi API operations do not
result in {\tt error\_bad\_epoch}.

\item Client sends a sequencer op to the sequencer process on the head of
\item Client sends a sequencer op\footnote{The {\tt append()} API
operation is performed by the server as if it were two different API
operations in sequence: {\tt sequence()} and {\tt write()}. The {\tt
append()} operation is provided as an optimization to reduce latency
by reducing messages sent \& received by a client.}
to the sequencer process on the head of
the Machi chain (as defined by the projection data structure):
{\tt \{sequence\_req, Filename\_Prefix, Number\_of\_Bytes\}}. The reply
includes {\tt \{Full\_Filename, Offset\}}.
@@ -838,15 +858,18 @@ successful. The client now knows the full Machi file name and byte
offset, so that future attempts to read the data can do so by file
name and offset.

\item Upon any non-{\tt ok} reply from a FLU server, {\em the client must
consider the entire append operation a failure}. If the client
\item Upon any non-{\tt ok} reply from a FLU server, the client must
either perform read repair or else consider the entire append
operation a failure.
If the client
wishes, it may retry the append operation using a new location
assignment from the sequencer or, if permitted by Machi restrictions,
perform read repair on the original location. If this read repair is
fully successful, then the client may consider the append operation
successful.

\item If a FLU server $FLU$ is unavailable, notify another up/available
\item (optional)
If a FLU server $FLU$ is unavailable, notify another up/available
chain member that $FLU$ appears unavailable. This info may be used by
the chain manager service to change projections. If the client
wishes, it may retry the append op or perhaps wait until a new projection is
@@ -855,15 +878,6 @@ available.

\item If any FLU server reports {\tt error\_written}, then either of two
things has happened:
\begin{itemize}
\item The appending client $C_w$ was too slow when attempting to write
to the head of the chain.
Another client, $C_r$, attempted a read, noticed that the tail's value was
unwritten and noticed that the head's value was also unwritten.
Then $C_r$ initiated a ``fill'' operation to write junk into
this offset of
the file. The fill operation succeeded, and now the slow
appending client $C_w$ discovers that it was too slow via the
{\tt error\_written} response.
\item The appending client $C_w$ was too slow after at least one
successful write.
Client $C_r$ attempted a read, noticed the partial write, and
@@ -871,14 +885,21 @@ things has happened:
replicas to verify that the repaired data matches its write
attempt -- in all cases, the values written by $C_w$ and $C_r$ are
identical.
\item The appending client $C_w$ was too slow when attempting to write
to the head of the chain.
Another client, $C_r$, attempted a read.
$C_r$ observes that the tail's value was
unwritten and observes that the head's value was also unwritten.
Then $C_r$ initiated a ``fill'' operation to write junk into
this offset of
the file. The fill operation succeeded, and now the slow
appending client $C_w$ discovers that it was too slow via the
{\tt error\_written} response.
\end{itemize}

\end{enumerate}

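A minimal Erlang sketch of the happy path through the append steps above (the transport stub flu_call/2, the module name, and the reply values are invented; the message shape follows the {\tt \{sequence\_req, ...\}} description above):

-module(append_flow_sketch).
-export([append/3]).

%% Sketch only. Step 1: ask the sequencer on the chain head for a
%% file name + offset assignment. Step 2: write the same bytes to
%% every FLU in chain order, head to tail.
append(Chain = [Head | _], Prefix, Bytes) ->
    {ok, File, Offset} =
        flu_call(Head, {sequence_req, Prefix, byte_size(Bytes)}),
    case write_chain(Chain, File, Offset, Bytes) of
        ok    -> {ok, File, Offset};
        Error -> Error    %% e.g. error_written, error_bad_epoch
    end.

write_chain([], _File, _Offset, _Bytes) ->
    ok;
write_chain([FLU | Rest], File, Offset, Bytes) ->
    case flu_call(FLU, {write, File, Offset, Bytes}) of
        ok    -> write_chain(Rest, File, Offset, Bytes);
        Error -> Error
    end.

%% Invented stub transport so the sketch is self-contained.
flu_call(_FLU, {sequence_req, Prefix, _Len}) ->
    {ok, <<Prefix/binary, ".seq_a.009">>, 447};
flu_call(_FLU, {write, _File, _Offset, _Bytes}) ->
    ok.
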
\subsection{TODO: Single operation: reading a chunk of bytes from a file}
\label{sec:sketch-read}

\section{Projections: calculation, then storage, then (perhaps) use}
\section{Projections: calculation, storage, then use}
\label{sec:projections}

Machi uses a ``projection'' to determine how its Chain Replication replicas
@@ -909,7 +930,7 @@ included in any production-quality implementation.

\subsection{When to trigger read repair of single values}

Assume now that some client $X$ wishes to fetch a datum that's managed
Assume that some client $X$ wishes to fetch a datum that's managed
by Chain Replication. Client $X$ must discover the chain's
configuration for that datum, then send its read request to the tail
replica of the chain, $R_{tail}$.
@@ -941,14 +962,14 @@ A read from any other server in the chain will also yield {\tt

A read from any other server in the chain may yield {\tt
error\_unwritten} or may find written data. (In this scenario, the
head server has written data; we don't know the state of the middle
head server has written data, but we don't know the state of the middle
and tail server(s).) The client ought to perform read repair of this
data. (See also, scenario \#4 below.)

During read repair, the client's write operations may race with the
original writer's operations. However, both the original writer and
the repairing client are always writing the same data. Therefore,
data corruption by conflicting client writes is not possible.
data corruption by concurrent client writes is not possible.

\paragraph{Scenario 3: A client $X_w$ has received a sequencer's
assignment for this
@@ -1031,19 +1052,19 @@ method is nearly sufficient enough for Machi's eventual consistency
mode of operation. There's only one small problem that {\tt rsync}
cannot handle by itself: handling late writes to a file. It is
possible that the same file could contain the following pattern of
written and unwritten data:
written and unwritten data on two different replicas $A$ and $B$:

\begin{itemize}
\item Server $A$: $x$ bytes written, $y$ bytes unwritten
\item Server $B$: $x$ bytes unwritten, $y$ bytes written
\end{itemize}

If {\tt rsync} is uses as-is to replicate this file, then one of the
two written sections will overwritten by NUL bytes. Obviously, we
If {\tt rsync} is used as-is to replicate this file, then one of the
two written sections will be lost, i.e., overwritten by NUL bytes. Obviously, we
don't want this kind of data loss. However, we already have a
requirement that Machi file servers must enforce write-once behavior
on all file byte ranges. The same data used to maintain written and
unwritten state can be used to merge file state so that both the $x$
on all file byte ranges. The same metadata used to maintain written and
unwritten state can be used to merge file state safely so that both the $x$
and $y$ byte ranges will be correct after repair.

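A minimal Erlang sketch of that merge (illustration only; it assumes both replicas record the same range boundaries, modeled as {\tt \{Offset, Len, written|unwritten\}} tuples):

-module(rsync_merge_sketch).
-export([merge/2]).

%% Sketch only: merge the written/unwritten state of one file as seen
%% by two replicas. Written ranges win; by Machi's write-once
%% guarantee, a range written on both replicas holds identical bytes,
%% so taking either copy is safe.
merge(ReplicaA, ReplicaB) ->
    [merge_one(RA, RB) || {RA, RB} <- lists:zip(ReplicaA, ReplicaB)].

merge_one({Off, Len, written},   {Off, Len, _})         -> {Off, Len, written};
merge_one({Off, Len, _},         {Off, Len, written})   -> {Off, Len, written};
merge_one({Off, Len, unwritten}, {Off, Len, unwritten}) -> {Off, Len, unwritten}.

%% Example: merge([{0,447,written}, {447,123,unwritten}],
%%                [{0,447,unwritten}, {447,123,written}])
%% yields both byte ranges marked written, with no data loss.
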
\subsubsection{The larger problem with ``Just `rsync' it!''}
@@ -1053,8 +1074,9 @@ Machi written chunk boundaries as described above. A larger
administration problem still remains: this informal method cannot tell
you exactly when you are in danger of data loss or when data loss has
actually happened. If we maintain the Update Propagation Invariant
(as argued in \cite{machi-chain-manager-design},
then we always know exactly when data loss is immanent or has happened.
(as argued in \cite{machi-chain-manager-design}),
then we always know exactly when data loss is imminent or has
probably happened.

\section{On-disk storage and file corruption detection}
\label{sec:on-disk}
@@ -1064,9 +1086,13 @@ as efficiently as possible, and make it easy to detect and fix file
corruption.

FLUs have a lot of flexibility to implement their on-disk data formats in
whatever manner allows them to be safe and fast. Any format that
whatever manner allows them to be safe and fast. Any scheme that
allows safe management of file names, per-file data chunks, and
per-data-chunk metadata is sufficient.
\footnote{The proof-of-concept implementation at GitHub in the {\tt
prototype/demo-day} directory uses two files in the local file
system per Machi file: one for Machi file data and one for
checksum metadata.}

\subsection{First draft/strawman proposal for on-disk data format}
\label{sub:on-disk-data-format}
@@ -1199,34 +1225,27 @@ example, for chain $[F_a, F_b, F_c]$ and a 100\% read-only workload,
FLUs $F_a$ and $F_b$ will be completely idle, and FLU $F_c$ must
handle all of the workload.

CORFU suggests a strategy of rotating the chain every so often, e.g.,
rotating the chain members every 10K or 20K pages or so. In this
manner, then, the head and tail roles would rotate in a deterministic
way and balance the workload evenly.\footnote{If we ignore cases of
small numbers of extremely ``hot''/frequently-accessed pages.}

The same scheme could be applied pretty easily to the Machi projection
data structure. For example, using a rotation ``stripe'' of 1 MByte, then
any write where the offset $O \textit{ div } 1024^2 = 0$ would use chain
variation $[F_a, F_b, F_c]$, and $O \textit{ div } 1024^2 = 1$, would use chain
variation $[F_b, F_c, F_a]$, and so on. Some use cases, if the first
1 MByte of a file were always ``hot'', then this simple scheme would be
insufficient.

Other more complicated striping solutions can be applied.\footnote{It
may not be worth discussing any of them here, but SLF has several
ideas of how to do it.} All have the problem of ``tearing'' a byte
range write into two pieces, if that byte range falls on either size
of a stripe boundary, e.g., $\{1024^2 - 1, 1024^2 + 1\}$. It feels
like the cost of a few torn writes (relative to the entire file size)
should be fairly low? And in cases like CORFU where the stripe size
is an exact multiple of the page size, then torn writes cannot happen
\ldots and it is likely that the CORFU use case is the one most likely
to requite this kind of load balancing.
Because all bytes of a Machi file are immutable, the extra
synchronization between servers as suggested by \cite{cr-craq} is not
needed.
Machi's use of write-once registers makes any server choice correct.
The implementation is
therefore free to make any load balancing choice for read operations,
as long as the read repair protocol is honored.

\section{Integration strategy with Riak Core and other distributed systems}
\label{sec:integration}

We have repeatedly stated that load balancing/sharding files across
multiple Machi clusters is out of scope of this document. This
section ignores that warning and explores a couple of extremely simple
methods to implement a cluster-of-Machi-clusters. Note that the
method sketched in Section~\ref{sub:integration-random-slicing} has
been implemented in the Machi proof-of-concept implementation at
GitHub in the {\tt prototype/demo-day} directory.

\subsection{Assumptions}

We assume that any technique is able to perform extremely basic
parsing of the file names that Machi sequencers create. The example
shown in Section~\ref{sub:sequencer-divergence} depicts a client write
@@ -1276,8 +1295,9 @@ co-invented at about the same time that Hibari
\cite{cr-theory-and-practice} implemented it.

The data structure to describe a Random Slicing scheme is pretty
small, about 100 KBytes in a conveninet but space-inefficient
representation in Erlang. A pure function with domain of Machi file
small, about 100 KBytes in a convenient but space-inefficient
representation in Erlang for a few hundred chains.
A pure function implementation with domain of Machi file
name plus Random Slicing map and range of all available Machi clusters
is straightforward.

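A minimal Erlang sketch of such a pure function (the hashing scheme and names are invented for illustration; a Random Slicing map is modeled here as break points over the unit interval):

-module(random_slicing_sketch).
-export([map/2]).

%% Sketch only: MAP(FileName, RSMap) -> Cluster. The Random Slicing
%% map is modeled as {UpperBound, Cluster} break points over [0,1);
%% the file name hashes to a float, and the first break point at or
%% above the hash picks the cluster.
map(FileName, RSMap) ->
    Hash = erlang:phash2(FileName, 1 bsl 20) / (1 bsl 20),
    pick(Hash, RSMap).

pick(Hash, [{UpperBound, Cluster} | _]) when Hash < UpperBound ->
    Cluster;
pick(Hash, [_ | Rest]) ->
    pick(Hash, Rest).

%% Example: map(<<"foo.seq_a.009">>, [{0.5, cluster1}, {1.0, cluster2}]).
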
@@ -1303,24 +1323,33 @@ latency. The generalization of the move/relocate algorithm above is:
\begin{enumerate}
\item For each $RSM_j$ mapping for the ``new'' location map list,
query the Machi cluster $MAP(F_{prefix}, RSM_j)$ and take the
first {\tt \{ok,\ldots\}} response.
first {\tt \{ok,\ldots\}} response. If no results are found, then \ldots
\item For each $RSM_i$ mapping for the ``old'' location map list,
query the Machi cluster $MAP(F_{prefix}, RSM_i)$ and take the
first {\tt \{ok,\ldots\}} response.
first {\tt \{ok,\ldots\}} response. If no results are found, then \ldots
\item To deal with races when moving files and then removing them from
the ``old'' locations, perform step \#1 again to look in the new
location(s).
\item If the data is not found at this stage, then the data does not exist.
\end{enumerate}

\subsubsection{Problems with the ``simplest scheme''}

The major drawback to the ``simplest schemes'' sketched above is a
problem of uneven file distributions across the cluster-of-clusters.
The risk of this imbalance is directly proportional to the risk of
clients that make poor prefix choices. The worst case is if all
clients always request the same prefix. Research for effective,
well-balancing file prefix choices is an area for future work.

\section{Recommended reading \& related work}

A big reason for the large size of this document is that it includes a
lot of background information.
Basho people tend to be busy, and sitting down to
People tend to be busy, and sitting down to
read 4--6 research papers to get familiar with a topic \ldots doesn't
happen very quickly. We recommend you read the papers mentioned in
this section and in the ``References'' at the end, but if our job is
this section and in the ``References'' section, but if our job is
done well enough, it isn't necessary.

Familiarity with the CAP Theorem, the concepts \& semantics \&
@@ -1334,7 +1363,7 @@ The replication protocol for Machi is based almost entirely on the CORFU
ordered log protocol \cite{corfu1}. If the reader is familiar with
the content of this paper, understanding the implementation details of
Machi will be easy. The longer paper \cite{corfu2} goes into much
more detail -- developers are strongly recommended to read this paper
more detail --- Machi developers are strongly recommended to read this paper
also.

CORFU is, in turn, a very close cousin of the Paxos distributed
@@ -1442,6 +1471,12 @@ Manageability, availability and performance in Porcupine: a highly scalable, clu
7th ACM Symposium on Operating System Principles (SOSP’99).
{\tt http://homes.cs.washington.edu/\%7Elevy/ porcupine.pdf}

\bibitem{cr-craq}
Jeff Terrace and Michael J.~Freedman.
Object Storage on CRAQ.
In Usenix ATC 2009.
{\tt https://www.usenix.org/legacy/event/usenix09/ tech/full\_papers/terrace/terrace.pdf}

\bibitem{chain-replication}
van Renesse, Robbert et al.
Chain Replication for Supporting High Throughput and Availability.
@@ -1479,8 +1514,9 @@ Design \& Implementation (OSDI'04) - Volume 6, 2004.
\includegraphics{append-flow2}
}
\caption{MSC diagram: append 123 bytes onto a file with prefix {\tt
"foo"}, using FLU$\rightarrow$FLU direct communication in original
Chain Replication's messaging pattern. In error-free cases and with
"foo"}, using the {\tt append()} API function and also
using FLU$\rightarrow$FLU direct communication (i.e., the original
Chain Replication's messaging pattern). In error-free cases and with
a correct cached projection, the number of network messages is $N+1$
where $N$ is chain length.}
\label{fig:append-flow2MSC}