diff --git a/.gitignore b/.gitignore
index 80da416..0f6b627 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ rel/machi
 current_counterexample.eqc
 foo*
 typescript*
+*.swp
diff --git a/.travis.yml b/.travis.yml
index 6c8a6c7..7bb3465 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,4 +4,3 @@ notifications:
 script: "priv/test-for-gh-pr.sh"
 otp_release:
   - 17.5
-  - 18.1
diff --git a/Makefile b/Makefile
index 8cf5072..a93a383 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,6 @@ ifeq ($(REBAR),)
 REBAR = $(BASE_DIR)/rebar
 endif
 OVERLAY_VARS ?=
-EUNIT_OPTS = -v
 .PHONY: rel deps package pkgclean edoc
@@ -54,7 +53,7 @@ relclean:
 stage : rel
 	$(foreach dep,$(wildcard deps/*), rm -rf rel/$(REPO)/lib/$(shell basename $(dep))* && ln -sf $(abspath $(dep)) rel/$(REPO)/lib;)
-DIALYZER_APPS = kernel stdlib sasl erts ssl compiler eunit crypto public_key syntax_tools
+DIALYZER_APPS = kernel stdlib sasl erts ssl compiler eunit crypto
 PLT = $(HOME)/.machi_dialyzer_plt
 include tools.mk
diff --git a/README.md b/README.md
index d802080..8cd8354 100644
--- a/README.md
+++ b/README.md
@@ -28,16 +28,13 @@ doc](./doc/high-level-machi.pdf) for further references.)
 ## Status: mid-October 2015: work is underway
-* The chain manager is ready for both eventual consistency use ("available
- mode") and strong consistency use ("consistent mode"). Both modes use a new
+* The chain manager is ready for both eventual consistency use ("AP
+ mode") and strong consistency use ("CP mode"). Both modes use a new
 consensus technique, Humming Consensus.
 * Scott will be [speaking about Humming Consensus](http://ricon.io/agenda/#managing-chain-replication-metadata-with-humming-consensus)
 at the [Ricon 2015 conference] (http://ricon.io) in San Francisco, CA, USA
 on Thursday, November 5th, 2015.
- * If you would like to run the network partition simulator
- mentioned in that Ricon presentation, please see the
- [partition simulator convergence test doc.](./doc/machi_chain_manager1_converge_demo.md)
 * Implementation of the file repair process for strong consistency
 is still in progress.
diff --git a/dialyzer.ignore-warnings b/dialyzer.ignore-warnings
index a5ee352..db73995 100644
--- a/dialyzer.ignore-warnings
+++ b/dialyzer.ignore-warnings
@@ -3,10 +3,8 @@ machi_pb.erl:0:
 ##################################################
 ######## Specific types #####################
 ##################################################
-Unknown types: basho_bench_config:get/2
 machi_partition_simulator:get/1
- hamcrest:matchspec/0
 ##################################################
 ######## Specific messages #####################
 ##################################################
diff --git a/doc/high-level-chain-mgr.pdf b/doc/high-level-chain-mgr.pdf
index 7276f57..aa70095 100644
Binary files a/doc/high-level-chain-mgr.pdf and b/doc/high-level-chain-mgr.pdf differ
diff --git a/doc/machi_chain_manager1_converge_demo.md b/doc/machi_chain_manager1_converge_demo.md
deleted file mode 100644
index 2844bfa..0000000
--- a/doc/machi_chain_manager1_converge_demo.md
+++ /dev/null
@@ -1,185 +0,0 @@
-
-# Using the network partition simulator and convergence demo test code
-
-## A complete example of all input and output
-
-If you don't have an Erlang/OTP 17 runtime environment available,
-please see this file for full input and output of a strong consistency
-length=3 chain test:
-https://gist.github.com/slfritchie/8352efc88cc18e62c72c
-This file contains all commands input and all simulator output from a
-sample run of the simulator.
- -To help interpret the output of the test, please skip ahead to the -"The test output is very verbose" section. - -## Prerequisites - -1. You'll need the `git` source management -2. You'll need the Erlang/OTP 17 runtime environment. Please don't - use earlier or later versions until we have a chance to fix the - compilation warnings that versions R16B and 18 will trigger. - -All of the commands that should be run at your login shell (e.g. Bash, -c-shell) can be cut-and-pasted from this document directly to your -login shell prompt. - -## Clone and compile the code - -Clone the Machi source repo and compile the source and test code. Run -the following commands at your login shell: - - cd /tmp - git clone https://github.com/basho/machi.git - cd machi - git checkout master - make - -Then run the unit test suite. This may take up to two minutes or so -to finish. Most of the tests will be silent; please be patient until -the tests finish. - - make test - -## Run an interactive Erlang CLI shell - -Run the following command at your login shell: - - erl -pz .eunit ebin deps/*/ebin - -If you are using Erlang/OTP version 17, you should see some CLI output -that looks like this: - - Erlang/OTP 17 [erts-6.4] [source] [64-bit] [smp:8:8] [async-threads:10] [hipe] [kernel-poll:false] [dtrace] - - Eshell V6.4 (abort with ^G) - 1> - -## The test output is very verbose ... what are the important parts? - -The output of the Erlang command -`machi_chain_manager1_converge_demo:help()` will display the following -guide to the output of the tests. - - A visualization of the convergence behavior of the chain self-management - algorithm for Machi. - - 1. Set up some server and chain manager pairs. - 2. Create a number of different network partition scenarios, where - (simulated) partitions may be symmetric or asymmetric. Then stop changing - the partitions and keep the simulated network stable (and perhaps broken). - 3. Run a number of iterations of the algorithm in parallel by poking each - of the manager processes on a random'ish basis. - 4. Afterward, fetch the chain transition changes made by each FLU and - verify that no transition was unsafe. - - During the iteration periods, the following is a cheatsheet for the output. - See the internal source for interpreting the rest of the output. - - 'SET partitions = ' - - A pair-wise list of actors which cannot send messages. The - list is uni-directional. If there are three servers (a,b,c), - and if the partitions list is '[{a,b},{b,c}]' then all - messages from a->b and b->c will be dropped, but any other - sender->recipient messages will be delivered successfully. - - 'x uses:' - - The FLU x has made an internal state transition and is using - this epoch's projection as operating chain configuration. The - rest of the line is a summary of the projection. - - 'CONFIRM epoch {N}' - - This message confirms that all of the servers listed in the - UPI and repairing lists of the projection at epoch {N} have - agreed to use this projection because they all have written - this projection to their respective private projection stores. - The chain is now usable by/available to all clients. - - 'Sweet, private projections are stable' - - This report announces that this iteration of the test cycle - has passed successfully. The report that follows briefly - summarizes the latest private projection used by each - participating server. 
For example, when in strong consistency - mode with 'a' as a witness and 'b' and 'c' as real servers: - - %% Legend: - %% server name, epoch ID, UPI list, repairing list, down list, ... - %% ... witness list, 'false' (a constant value) - - [{a,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}, - {b,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}] - - Both servers 'a' and 'b' agree on epoch 1116 with epoch ID - {1116,<<23,143,246,55>>} where UPI=[a,b], repairing=[], - down=[c], and witnesses=[a]. - - Server 'c' is not shown because 'c' has wedged itself OOS (out - of service) by configuring a chain length of zero. - - If no servers are listed in the report (i.e. only '[]' is - displayed), then all servers have wedged themselves OOS, and - the chain is unavailable. - - 'DoIt,' - - This marks a group of tick events which trigger the manager - processes to evaluate their environment and perhaps make a - state transition. - - A long chain of 'DoIt,DoIt,DoIt,' means that the chain state has - (probably) settled to a stable configuration, which is the goal of the - algorithm. - - Press control-c to interrupt the test....". - -## Run a test in eventual consistency mode - -Run the following command at the Erlang CLI prompt: - - machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}]). - -The first argument, `3`, is the number of servers to participate in -the chain. Please note: - -* Chain lengths as short as 1 or 2 are valid, but the results are a - bit boring. -* Chain lengths as long as 7 or 9 can be used, but they may - suffer from longer periods of churn/instability before all chain - managers reach agreement via humming consensus. (It is future work - to shorten the worst of the unstable churn latencies.) -* In eventual consistency mode, chain lengths may be even numbers, - e.g. 2, 4, or 6. -* The simulator will choose partition events from the permutations of - all 1, 2, and 3 node partition pairs. The total runtime will - increase *dramatically* with chain length. - * Chain length 2: about 3 partition cases - * Chain length 3: about 35 partition cases - * Chain length 4: about 230 partition cases - * Chain length 5: about 1100 partition cases - -## Run a test in strong consistency mode (with witnesses): - -*NOTE:* Due to a bug in the test code, please do not try to run the - convergence test in strong consistency mode and also without the - correct minority number of witness servers! If in doubt, please run - the commands shown below exactly. - -Run the following command at the Erlang CLI prompt: - - machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]). - -The first argument, `3`, is the number of servers to participate in -the chain. Chain lengths as long as 7 or 9 can be used, but they may -suffer from longer periods of churn/instability before all chain -managers reach agreement via humming consensus. - -Due to the bug mentioned above, please use the following -commands when running with chain lengths of 5 or 7, respectively. - - machi_chain_manager1_converge_demo:t(5, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b]}]). - machi_chain_manager1_converge_demo:t(7, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b,c]}]). 
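As a quick illustration of the private-projection report legend quoted above, the sketch below is a hypothetical Erlang module (not part of Machi) that pattern-matches one entry of the "private projections are stable" report and prints its fields:

    -module(report_legend).
    -export([describe/1]).

    %% One report entry, per the legend above:
    %% {ServerName, {EpochID, UPI, Repairing, Down, Witnesses, false}}
    describe({Server, {{Epoch, Csum}, UPI, Repairing, Down, Witnesses, false}}) ->
        io:format("~w: epoch ~w csum ~w upi=~w repairing=~w down=~w witnesses=~w~n",
                  [Server, Epoch, Csum, UPI, Repairing, Down, Witnesses]).

For example, `report_legend:describe({a,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}).` prints a one-line summary of server `a`'s view at epoch 1116.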
- diff --git a/doc/src.high-level/high-level-chain-mgr.tex b/doc/src.high-level/high-level-chain-mgr.tex index f139862..b6e35a6 100644 --- a/doc/src.high-level/high-level-chain-mgr.tex +++ b/doc/src.high-level/high-level-chain-mgr.tex @@ -23,8 +23,8 @@ \copyrightdata{978-1-nnnn-nnnn-n/yy/mm} \doi{nnnnnnn.nnnnnnn} -\titlebanner{Draft \#0.92, October 2015} -\preprintfooter{Draft \#0.92, October 2015} +\titlebanner{Draft \#0.91, June 2015} +\preprintfooter{Draft \#0.91, June 2015} \title{Chain Replication metadata management in Machi, an immutable file store} @@ -50,23 +50,19 @@ For an overview of the design of the larger Machi system, please see TODO Fix, after all of the recent changes to this document. Machi is an immutable file store, now in active development by Basho -Japan KK. Machi uses Chain Replication\footnote{Chain +Japan KK. Machi uses Chain Replication to maintain strong consistency +of file updates to all replica servers in a Machi cluster. Chain Replication is a variation of primary/backup replication where the order of updates between the primary server and each of the backup -servers is strictly ordered into a single ``chain''.} -to maintain strong consistency -of file updates to all replica servers in a Machi cluster. - -This document describes the Machi chain manager, the component -responsible for managing Chain Replication metadata state. -Management of -chain metadata, e.g., ``What is the current order of +servers is strictly ordered into a single ``chain''. Management of +Chain Replication's metadata, e.g., ``What is the current order of servers in the chain?'', remains an open research problem. The current state of the art for Chain Replication metadata management -relies on an external oracle (e.g., based on ZooKeeper) or the Elastic -Replication \cite{elastic-chain-replication} algorithm. +relies on an external oracle (e.g., ZooKeeper) or the Elastic +Replication algorithm. -The chain +This document describes the Machi chain manager, the component +responsible for managing Chain Replication metadata state. The chain manager uses a new technique, based on a variation of CORFU, called ``humming consensus''. Humming consensus does not require active participation by all or even @@ -93,18 +89,20 @@ to perform these management tasks. Chain metadata state and state management tasks include: \begin{itemize} +\item Preserving data integrity of all metadata and data stored within + the chain. Data loss is not an option. \item Preserving stable knowledge of chain membership (i.e. all nodes in - the chain, regardless of operational status). We expect that a systems - administrator will make all ``permanent'' decisions about + the chain, regardless of operational status). A systems + administrator is expected to make ``permanent'' decisions about chain membership. \item Using passive and/or active techniques to track operational - state/status, e.g., up, down, restarting, full data sync in progress, partial - data sync in progress, etc. + state/status, e.g., up, down, restarting, full data sync, partial + data sync, etc. \item Choosing the run-time replica ordering/state of the chain, based on current member status and past operational history. All chain state transitions must be done safely and without data loss or corruption. 
-\item When a new node is added to the chain administratively or old node is
+\item As a new node is added to the chain administratively or old node is
 restarted, adding the node to the chain safely and perform any data
 synchronization/repair required to bring the node's data into full
 synchronization with the other nodes.
@@ -113,27 +111,39 @@ management tasks include:
 \subsection{Ultimate goal: Preserve data integrity of Chain Replicated data}
 Preservation of data integrity is paramount to any chain state
-management technique for Machi. Loss or corruption of chain data must
-be avoided.
+management technique for Machi. Even when operating in an eventually
+consistent mode, Machi must not lose data without cause outside of all
+design, e.g., all participants crash permanently.
 \subsection{Goal: Contribute to Chain Replication metadata management research}
 We believe that this new self-management algorithm, humming consensus,
 contributes a novel approach to Chain Replication metadata management.
+The ``monitor
+and manage your neighbor'' technique proposed in Elastic Replication
+(Section \ref{ssec:elastic-replication}) appears to be the current
+state of the art in the distributed systems research community.
 Typical practice in the IT industry appears to favor using an external
-oracle, e.g., built on top of ZooKeeper as a trusted coordinator.
+oracle, e.g., using ZooKeeper as a trusted coordinator.
-See Section~\ref{sec:cr-management-review} for a brief review of
-techniques used today.
+See Section~\ref{sec:cr-management-review} for a brief review.
 \subsection{Goal: Support both eventually consistent \& strongly consistent modes of operation}
-Chain Replication was originally designed by van Renesse and Schneider
-\cite{chain-replication} for applications that require strong
-consistency, e.g. sequential consistency. However, Machi has use
-cases where more relaxed eventual consistency semantics are
-sufficient. We wish to use the same Chain Replication management
-technique for both strong and eventual consistency environments.
+Machi's first use cases are all for use as a file store in an eventually
+consistent environment.
+In eventually consistent mode, humming consensus
+allows a Machi cluster to fragment into
+arbitrary islands of network partition, all the way down to 100\% of
+members running in complete network isolation from each other.
+Furthermore, it provides enough agreement to allow
+formerly-partitioned members to coordinate the reintegration and
+reconciliation of their data when partitions are healed.
+
+Later, we wish the option of supporting strong consistency
+applications such as CORFU-style logging while reusing all (or most)
+of Machi's infrastructure. Such strongly consistent operation is the
+main focus of this document.
 \subsection{Anti-goal: Minimize churn}
@@ -194,18 +204,6 @@ would probably be preferable to add the feature to Riak Ensemble rather
 than to use ZooKeeper (and for Basho to document ZK, package ZK,
 provide commercial ZK support, etc.).
-\subsection{An external management oracle, implemented by
- active/standby application failover}
-
-This technique has been used in production of HibariDB. The customer
-very carefully deployed the oracle using the Erlang/OTP ``application
-controller'' on two machines to provide active/standby failover of the
-management oracle. The customer was willing to monitor this service
-very closely and was prepared to intervene manually during network
-partitions.
(This controller is very susceptible to ``split brain -syndrome''.) While this feature of Erlang/OTP is useful in other -environments, we believe is it not sufficient for Machi's needs. - \section{Assumptions} \label{sec:assumptions} @@ -214,8 +212,8 @@ Paxos, Raft, et al.), why bother with a slightly different set of assumptions and a slightly different protocol? The answer lies in one of our explicit goals: to have an option of -running in an ``eventually consistent'' manner. We wish to be -remain available, even if we are +running in an ``eventually consistent'' manner. We wish to be able to +make progress, i.e., remain available in the CAP sense, even if we are partitioned down to a single isolated node. VR, Paxos, and Raft alone are not sufficient to coordinate service availability at such small scale. The humming consensus algorithm can manage @@ -249,15 +247,13 @@ synchronized by NTP. The protocol and algorithm presented here do not specify or require any timestamps, physical or logical. Any mention of time inside of data -structures are for human and/or diagnostic purposes only. +structures are for human/historic/diagnostic purposes only. -Having said that, some notion of physical time is suggested -occasionally for -purposes of efficiency. For example, some ``sleep +Having said that, some notion of physical time is suggested for +purposes of efficiency. It's recommended that there be some ``sleep time'' between iterations of the algorithm: there is no need to ``busy -wait'' by executing the algorithm as many times per minute as -possible. -See also Section~\ref{ssub:when-to-calc}. +wait'' by executing the algorithm as quickly as possible. See also +Section~\ref{ssub:when-to-calc}. \subsection{Failure detector model} @@ -280,73 +276,55 @@ eventual consistency. Discussion of strongly consistent CP mode is always the default; exploration of AP mode features in this document will always be explictly noted. -%%\subsection{Use of the ``wedge state''} -%% -%%A participant in Chain Replication will enter ``wedge state'', as -%%described by the Machi high level design \cite{machi-design} and by CORFU, -%%when it receives information that -%%a newer projection (i.e., run-time chain state reconfiguration) is -%%available. The new projection may be created by a system -%%administrator or calculated by the self-management algorithm. -%%Notification may arrive via the projection store API or via the file -%%I/O API. -%% -%%When in wedge state, the server will refuse all file write I/O API -%%requests until the self-management algorithm has determined that -%%humming consensus has been decided (see next bullet item). The server -%%may also refuse file read I/O API requests, depending on its CP/AP -%%operation mode. -%% -%%\subsection{Use of ``humming consensus''} -%% -%%CS literature uses the word ``consensus'' in the context of the problem -%%description at \cite{wikipedia-consensus} -%%. -%%This traditional definition differs from what is described here as -%%``humming consensus''. -%% -%%``Humming consensus'' describes -%%consensus that is derived only from data that is visible/known at the current -%%time. -%%The algorithm will calculate -%%a rough consensus despite not having input from a quorum majority -%%of chain members. Humming consensus may proceed to make a -%%decision based on data from only a single participant, i.e., only the local -%%node. -%% -%%See Section~\ref{sec:humming-consensus} for detailed discussion. 
+\subsection{Use of the ``wedge state''} -%%\subsection{Concurrent chain managers execute humming consensus independently} -%% -%%Each Machi file server has its own concurrent chain manager -%%process embedded within it. Each chain manager process will -%%execute the humming consensus algorithm using only local state (e.g., -%%the $P_{current}$ projection currently used by the local server) and -%%values observed in everyone's projection stores -%%(Section~\ref{sec:projection-store}). -%% -%%The chain manager communicates with the local Machi -%%file server using the wedge and un-wedge request API. When humming -%%consensus has chosen a projection $P_{new}$ to replace $P_{current}$, -%%the value of $P_{new}$ is included in the un-wedge request. +A participant in Chain Replication will enter ``wedge state'', as +described by the Machi high level design \cite{machi-design} and by CORFU, +when it receives information that +a newer projection (i.e., run-time chain state reconfiguration) is +available. The new projection may be created by a system +administrator or calculated by the self-management algorithm. +Notification may arrive via the projection store API or via the file +I/O API. -\subsection{The reader is familiar with CORFU} +When in wedge state, the server will refuse all file write I/O API +requests until the self-management algorithm has determined that +humming consensus has been decided (see next bullet item). The server +may also refuse file read I/O API requests, depending on its CP/AP +operation mode. -Machi borrows heavily from the techniques and data structures used by -CORFU \cite[corfu1],\cite[corfu2]. We hope that the reader is -familiar with CORFU's features, including: +\subsection{Use of ``humming consensus''} -\begin{itemize} -\item write-once registers for log data storage, -\item the epoch, which defines a period of time when a cluster's configuration -is stable, -\item strictly increasing epoch numbers, which are identifiers -for particular epochs, -\item projections, which define the chain order and other details of - data replication within the cluster, and -\item the wedge state, used by servers to coordinate cluster changes - during epoch transitions. -\end{itemize} +CS literature uses the word ``consensus'' in the context of the problem +description at \cite{wikipedia-consensus} +. +This traditional definition differs from what is described here as +``humming consensus''. + +``Humming consensus'' describes +consensus that is derived only from data that is visible/known at the current +time. +The algorithm will calculate +a rough consensus despite not having input from all/majority +of chain members. Humming consensus may proceed to make a +decision based on data from only a single participant, i.e., only the local +node. + +See Section~\ref{sec:humming-consensus} for detailed discussion. + +\subsection{Concurrent chain managers execute humming consensus independently} + +Each Machi file server has its own concurrent chain manager +process embedded within it. Each chain manager process will +execute the humming consensus algorithm using only local state (e.g., +the $P_{current}$ projection currently used by the local server) and +values observed in everyone's projection stores +(Section~\ref{sec:projection-store}). + +The chain manager communicates with the local Machi +file server using the wedge and un-wedge request API. 
When humming +consensus has chosen a projection $P_{new}$ to replace $P_{current}$, +the value of $P_{new}$ is included in the un-wedge request. \section{The projection store} \label{sec:projection-store} @@ -365,15 +343,19 @@ this key. The store's value is either the special `unwritten' value\footnote{We use $\bot$ to denote the unwritten value.} or else a binary blob that is immutable thereafter; the projection data structure is -serialized and stored in this binary blob. See -\ref{sub:the-projection} for more detail. +serialized and stored in this binary blob. + +The projection store is vital for the correct implementation of humming +consensus (Section~\ref{sec:humming-consensus}). The write-once +register primitive allows us to reason about the store's behavior +using the same logical tools and techniques as the CORFU ordered log. \subsection{The publicly-writable half of the projection store} The publicly-writable projection store is used to share information during the first half of humming consensus algorithm. Projections in the public half of the store form a log of -suggestions\footnote{I hesitate to use the words ``propose'' or ``proposal'' +suggestions\footnote{I hesitate to use the word ``propose'' or ``proposal'' anywhere in this document \ldots until I've done a more formal analysis of the protocol. Those words have too many connotations in the context of consensus protocols such as Paxos and Raft.} @@ -387,9 +369,8 @@ Any chain member may read from the public half of the store. The privately-writable projection store is used to store the Chain Replication metadata state (as chosen by humming consensus) -that is in use now by the local Machi server. Earlier projections -remain in the private half to keep a historical -record of chain state transitions by the local server. +that is in use now by the local Machi server as well as previous +operation states. Only the local server may write values into the private half of store. Any chain member may read from the private half of the store. @@ -405,30 +386,35 @@ The private projection store serves multiple purposes, including: its sequence of $P_{current}$ projection changes. \end{itemize} +The private half of the projection store is not replicated. + \section{Projections: calculation, storage, and use} \label{sec:projections} Machi uses a ``projection'' to determine how its Chain Replication replicas -should operate; see \cite{machi-design} and \cite{corfu1}. +should operate; see \cite{machi-design} and +\cite{corfu1}. At runtime, a cluster must be able to respond both to +administrative changes (e.g., substituting a failed server with +replacement hardware) as well as local network conditions (e.g., is +there a network partition?). + +The projection defines the operational state of Chain Replication's +chain order as well the (re-)synchronization of data managed by by +newly-added/failed-and-now-recovering members of the chain. This +chain metadata, together with computational processes that manage the +chain, must be managed in a safe manner in order to avoid unintended +data loss of data managed by the chain. + The concept of a projection is borrowed from CORFU but has a longer history, e.g., the Hibari key-value store \cite{cr-theory-and-practice} and goes back in research for decades, e.g., Porcupine \cite{porcupine}. -The projection defines the operational state of Chain Replication's -chain order as well the (re-)synchronization of data managed by by -newly-added/failed-and-now-recovering members of the chain. 
-At runtime, a cluster must be able to respond both to -administrative changes (e.g., substituting a failed server with -replacement hardware) as well as local network conditions (e.g., is -there a network partition?). - \subsection{The projection data structure} \label{sub:the-projection} {\bf NOTE:} This section is a duplicate of the ``The Projection and -the Projection Epoch Number'' section of the ``Machi: an immutable -file store'' design doc \cite{machi-design}. +the Projection Epoch Number'' section of \cite{machi-design}. The projection data structure defines the current administration \& operational/runtime @@ -459,7 +445,6 @@ Figure~\ref{fig:projection}. To summarize the major components: active_upi :: [m_server()], repairing :: [m_server()], down_members :: [m_server()], - witness_servers :: [m_server()], dbg_annotations :: proplist() }). \end{verbatim} @@ -469,12 +454,13 @@ Figure~\ref{fig:projection}. To summarize the major components: \begin{itemize} \item {\tt epoch\_number} and {\tt epoch\_csum} The epoch number and - projection checksum together form the unique identifier for this projection. + projection checksum are unique identifiers for this projection. \item {\tt creation\_time} Wall-clock time, useful for humans and general debugging effort. \item {\tt author\_server} Name of the server that calculated the projection. \item {\tt all\_members} All servers in the chain, regardless of current - operation status. + operation status. If all operating conditions are perfect, the + chain should operate in the order specified here. \item {\tt active\_upi} All active chain members that we know are fully repaired/in-sync with each other and therefore the Update Propagation Invariant (Section~\ref{sub:upi}) is always true. @@ -482,10 +468,7 @@ Figure~\ref{fig:projection}. To summarize the major components: are in active data repair procedures. \item {\tt down\_members} All members that the {\tt author\_server} believes are currently down or partitioned. -\item {\tt witness\_servers} If witness servers (Section~\ref{zzz}) - are used in strong consistency mode, then they are listed here. The - set of {\tt witness\_servers} is a subset of {\tt all\_members}. -\item {\tt dbg\_annotations} A ``kitchen sink'' property list, for code to +\item {\tt dbg\_annotations} A ``kitchen sink'' proplist, for code to add any hints for why the projection change was made, delay/retry information, etc. \end{itemize} @@ -495,8 +478,7 @@ Figure~\ref{fig:projection}. To summarize the major components: According to the CORFU research papers, if a server node $S$ or client node $C$ believes that epoch $E$ is the latest epoch, then any information that $S$ or $C$ receives from any source that an epoch $E+\delta$ (where -$\delta > 0$) exists will push $S$ into the ``wedge'' state -and force $C$ into a mode +$\delta > 0$) exists will push $S$ into the ``wedge'' state and $C$ into a mode of searching for the projection definition for the newest epoch. In the humming consensus description in @@ -524,7 +506,7 @@ Humming consensus requires that any projection be identified by both the epoch number and the projection checksum, as described in Section~\ref{sub:the-projection}. -\section{Managing projection store replicas} +\section{Managing multiple projection store replicas} \label{sec:managing-multiple-projection-stores} An independent replica management technique very similar to the style @@ -533,63 +515,11 @@ replicas of Machi's projection data structures. 
The major difference is that humming consensus {\em does not necessarily require} successful return status from a minimum number of participants (e.g., -a majority quorum). - -\subsection{Writing to public projection stores} -\label{sub:proj-store-writing} - -Writing replicas of a projection $P_{new}$ to the cluster's public -projection stores is similar to writing in a Dynamo-like system. -The significant difference with Chain Replication is how we interpret -the return status of each write operation. - -In cases of {\tt error\_written} status, -the process may be aborted and read repair -triggered. The most common reason for {\tt error\_written} status -is that another actor in the system has concurrently -already calculated another -(perhaps different\footnote{The {\tt error\_written} may also -indicate that another server has performed read repair on the exact -projection $P_{new}$ that the local server is trying to write!}) -projection using the same projection epoch number. - -\subsection{Writing to private projection stores} - -Only the local server/owner may write to the private half of a -projection store. Private projection store values are never subject -to read repair. - -\subsection{Reading from public projection stores} -\label{sub:proj-store-reading} - -A read is simple: for an epoch $E$, send a public projection read API -operation to all participants. Usually, the ``get latest epoch'' -variety is used. - -The minimum number of non-error responses is only one.\footnote{The local -projection store should always be available, even if no other remote -replica projection stores are available.} If all available servers -return a single, unanimous value $V_u, V_u \ne \bot$, then $V_u$ is -the final result for epoch $E$. -Any non-unanimous values are considered unresolvable for the -epoch. This disagreement is resolved by newer -writes to the public projection stores during subsequent iterations of -humming consensus. - -Unavailable servers may not necessarily interfere with making a decision. -Humming consensus -only uses as many public projections as are available at the present -moment of time. Assume that some server $S$ is unavailable at time $t$ and -becomes available at some later $t+\delta$. -If at $t+\delta$ we -discover that $S$'s public projection store for key $E$ -contains some disagreeing value $V_{weird}$, then the disagreement -will be resolved in the exact same manner that would have been used as if we -had seen the disagreeing values at the earlier time $t$. +a quorum). \subsection{Read repair: repair only unwritten values} -The ``read repair'' concept is also shared with Riak Core and Dynamo +The idea of ``read repair'' is also shared with Riak Core and Dynamo systems. However, Machi has situations where read repair cannot truly ``fix'' a key because two different values have been written by two different replicas. @@ -600,24 +530,85 @@ values, all participants in humming consensus merely agree that there were multiple suggestions at that epoch which must be resolved by the creation and writing of newer projections with later epoch numbers.} Machi's projection store read repair can only repair values that are -unwritten, i.e., currently storing $\bot$. +unwritten, i.e., storing $\bot$. -The value used to repair unwritten $\bot$ values is the ``best'' projection that +The value used to repair $\bot$ values is the ``best'' projection that is currently available for the current epoch $E$. 
 If there is a single, unanimous value $V_{u}$ for the projection at epoch $E$, then $V_{u}$
-is used to repair all projections stores at $E$ that contain $\bot$
+is used to repair all projection stores at $E$ that contain $\bot$
 values. If the value of $K$ is not unanimous, then the ``highest
 ranked value'' $V_{best}$ is used for the repair; see
 Section~\ref{sub:ranking-projections} for a description of projection ranking.
-If a non-$\bot$ value exists, then by definition\footnote{Definition
- of a write-once register} this value is immutable. The only
-conflict resolution path is to write a new projection with a newer and
-larger epoch number. Once a public projection with epoch number $E$ is
-written, projections with epochs smaller than $E$ are ignored by
+\subsection{Writing to public projection stores}
+\label{sub:proj-store-writing}
+
+Writing replicas of a projection $P_{new}$ to the cluster's public
+projection stores is similar, in principle, to writing a Chain
+Replication-managed system or Dynamo-like system. But unlike Chain
+Replication, the order doesn't really matter.
+In fact, the two steps below may be performed in parallel.
+The significant difference with Chain Replication is how we interpret
+the return status of each write operation.
+
+\begin{enumerate}
+\item Write $P_{new}$ to the local server's public projection store
+ using $P_{new}$'s epoch number $E$ as the key.
+ As a side effect, a successful write will trigger
+ ``wedge'' status in the local server, which will then cascade to other
+ projection-related activity by the local chain manager.
+\item Write $P_{new}$ to key $E$ of each remote public projection store of
+ all participants in the chain.
+\end{enumerate}
+
+In cases of {\tt error\_written} status,
+the process may be aborted and read repair
+triggered. The most common reason for {\tt error\_written} status
+is that another actor in the system has
+already calculated another (perhaps different) projection using the
+same projection epoch number and that
+read repair is necessary. The {\tt error\_written} may also
+indicate that another server has performed read repair on the exact
+projection $P_{new}$ that the local server is trying to write!
+
+\subsection{Writing to private projection stores}
+
+Only the local server/owner may write to the private half of a
+projection store. Also, the private projection store is not replicated.
+
+\subsection{Reading from public projection stores}
+\label{sub:proj-store-reading}
+
+A read is simple: for an epoch $E$, send a public projection read API
+request to all participants. As when writing to the public projection
+stores, we can ignore any timeout/unavailable return
+status.\footnote{The success/failure status of projection reads and
+ writes is {\em not} ignored with respect to the chain manager's
+ internal liveness tracker. However, the liveness tracker's state is
+ typically only used when calculating new projections.} If we
+discover any unwritten values $\bot$, the read repair protocol is
+followed.
+
+The minimum number of non-error responses is only one.\footnote{The local
+projection store should always be available, even if no other remote
+replica projection stores are available.} If all available servers
+return a single, unanimous value $V_u, V_u \ne \bot$, then $V_u$ is
+the final result for epoch $E$.
+Any non-unanimous values are considered complete disagreement for the
+epoch. This disagreement is resolved by humming consensus by later
+writes to the public projection stores during subsequent iterations of humming consensus.
+We are not concerned with unavailable servers. Humming consensus
+only uses as many public projections as are available at the present
+moment of time. If some server $S$ is unavailable at time $t$ and
+becomes available at some later $t+\delta$, and if at $t+\delta$ we
+discover that $S$'s public projection store for key $E$
+contains some disagreeing value $V_{weird}$, then the disagreement
+will be resolved in the exact same manner that would be used as if we
+had found the disagreeing values at the earlier time $t$.
+
 \section{Phases of projection change, a prelude to Humming Consensus}
 \label{sec:phases-of-projection-change}
@@ -680,7 +671,7 @@ straightforward; see Section~\ref{sub:proj-store-writing} for the
 technique for writing projections to all participating servers'
 projection stores. Humming Consensus does not care
-if the writes succeed or not. The next phase, adopting a
+if the writes succeed or not: its final phase, adopting a
 new projection, will determine which write operations are usable.
 \subsection{Adoption a new projection}
@@ -694,8 +685,8 @@ to avoid direct parallels with protocols such as Raft and Paxos.)
 In general, a projection $P_{new}$ at epoch $E_{new}$ is adopted by a
 server only if the change in state from the local server's current
 projection to new
-projection, $P_{current} \rightarrow P_{new}$, will not cause data loss:
-the Update Propagation Invariant and all other safety checks
+projection, $P_{current} \rightarrow P_{new}$ will not cause data loss,
+e.g., the Update Propagation Invariant and all other safety checks
 required by chain repair in Section~\ref{sec:repair-entire-files}
 are correct. For example, any new epoch must be strictly larger than
 the current epoch, i.e., $E_{new} > E_{current}$.
@@ -705,12 +696,16 @@ available public projection stores. If the result is not a single
 unanmous projection, then we return to the step in
 Section~\ref{sub:projection-calculation}. If the result is a
 {\em unanimous} projection $P_{new}$ in epoch $E_{new}$, and if $P_{new}$
-does not violate chain safety checks, then the local node will:
+does not violate chain safety checks, then the local node may
+replace its local $P_{current}$ projection with $P_{new}$.
-\begin{itemize}
-\item write $P_{current}$ to the local private projection store, and
-\item set its local operating state $P_{current} \leftarrow P_{new}$.
-\end{itemize}
+Not all safe projection transitions are useful, however. For example,
+it's trivially safe to suggest projection $P_{zero}$, where the chain
+length is zero. In an eventual consistency environment, projection
+$P_{one}$ where the chain length is exactly one is also trivially
+safe.\footnote{Although, if the total number of participants is more
+ than one, eventual consistency would demand that $P_{self}$ cannot
+ be used forever.}
 \section{Humming Consensus}
 \label{sec:humming-consensus}
 Humming consensus describes consensus that is derived only from data
 that is visible/available at the current time. It's OK if a network
 partition is in effect and not all chain members are available; the
 algorithm will calculate a rough consensus despite not
-having input from all chain members.
+having input from all chain members. Humming consensus
+may proceed to make a decision based on data from only one
+participant, i.e., only the local node.
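Sketched in Erlang under assumed representations (the module and function names here are illustrative, not Machi's actual API), the read rule described above might look like the following, where each reply for epoch E is {ok, Projection}, the atom unwritten for the bottom value, or {error, unavailable}:

    -module(proj_read_sketch).
    -export([best_value/1, repair_targets/1]).

    %% Decide the value for epoch E from whatever replies are available.
    best_value(Replies) ->
        case lists:usort([P || {ok, P} <- Replies]) of
            []  -> unwritten;       %% every reachable store still holds bottom
            [V] -> {unanimous, V};  %% V is the final value for this epoch
            _   -> disagreement     %% left for later epochs to resolve
        end.

    %% Only stores still holding bottom may be repaired; they would be
    %% written with the unanimous (or otherwise best-ranked) projection.
    repair_targets(RepliesByServer) ->
        [Server || {Server, unwritten} <- RepliesByServer].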
\begin{itemize} -\item When operating in eventual consistency mode, humming +\item When operating in AP mode, i.e., in eventual consistency mode, humming consensus may reconfigure a chain of length $N$ into $N$ independent chains of length 1. When a network partition heals, the humming consensus is sufficient to manage the chain so that each @@ -731,12 +728,11 @@ replica's data can be repaired/merged/reconciled safely. Other features of the Machi system are designed to assist such repair safely. -\item When operating in strong consistency mode, any -chain shorter than the quorum majority of -all members is invalid and therefore cannot be used. Any server with -a too-short chain cannot not move itself out -of wedged state and is therefore unavailable for general file service. -In very general terms, this requirement for a quorum +\item When operating in CP mode, i.e., in strong consistency mode, humming +consensus would require additional restrictions. For example, any +chain that didn't have a minimum length of the quorum majority size of +all members would be invalid and therefore would not move itself out +of wedged state. In very general terms, this requirement for a quorum majority of surviving participants is also a requirement for Paxos, Raft, and ZAB. See Section~\ref{sec:split-brain-management} for a proposal to handle ``split brain'' scenarios while in CP mode. @@ -756,6 +752,8 @@ Section~\ref{sec:phases-of-projection-change}: network monitoring, calculating new projections, writing projections, then perhaps adopting the newest projection (which may or may not be the projection that we just wrote). +Beginning with Section~\ref{sub:flapping-state}, we provide +additional detail to the rough outline of humming consensus. \begin{figure*}[htp] \resizebox{\textwidth}{!}{ @@ -803,15 +801,15 @@ is used by the flowchart and throughout this section. \item[$\mathbf{P_{current}}$] The projection actively used by the local node right now. It is also the projection with largest - epoch number in the local node's {\em private} projection store. + epoch number in the local node's private projection store. \item[$\mathbf{P_{newprop}}$] A new projection suggestion, as calculated by the local server (Section~\ref{sub:humming-projection-calculation}). \item[$\mathbf{P_{latest}}$] The highest-ranked projection with the largest - single epoch number that has been read from all available {\em public} - projection stores. + single epoch number that has been read from all available public + projection stores, including the local node's public projection store. \item[Unanimous] The $P_{latest}$ projection is unanimous if all replicas in all accessible public projection stores are effectively @@ -830,7 +828,7 @@ is used by the flowchart and throughout this section. The flowchart has three columns, from left to right: \begin{description} -\item[Column A] Is there any reason to act? +\item[Column A] Is there any reason to change? \item[Column B] Do I act? \item[Column C] How do I act? \begin{description} @@ -865,12 +863,12 @@ In today's implementation, there is only a single criterion for determining the alive/perhaps-not-alive status of a remote server $S$: is $S$'s projection store available now? This question is answered by attemping to read the projection store on server $S$. -If successful, then we assume that $S$ and all of $S$'s network services -are available. 
-If $S$'s projection store is not available for any
-reason (including timeout), we inform the local ``fitness server''
-that we have had a problem querying $S$. The fitness service may then
-take additional monitoring/querying actions before informing us (in a
-later iteration) that $S$ should be considered down.
+If successful, then we assume that all of
+$S$ is available. If $S$'s projection store is not available for any
+reason (including timeout), we assume $S$ is entirely unavailable.
+This simple single
+criterion appears to be sufficient for humming consensus, according to
+simulations of arbitrary network partitions.
 %% {\bf NOTE:} The projection store API is accessed via TCP. The network
 %% partition simulator, mentioned above and described at
@@ -885,10 +883,64 @@ Column~A of Figure~\ref{fig:flowchart}.
 See also, Section~\ref{sub:projection-calculation}.
 Execution starts at ``Start'' state of Column~A of
-Figure~\ref{fig:flowchart}. Rule $A20$'s uses judgement from the
-local ``fitness server'' to select a definite
+Figure~\ref{fig:flowchart}. Rule $A20$ uses recent success \&
+failures in accessing other public projection stores to select a hard
 boolean up/down status for each participating server.
+\subsubsection{Calculating flapping state}
+
+Also at this stage, the chain manager calculates its local
+``flapping'' state. The name ``flapping'' is borrowed from IP network
+engineer jargon ``route flapping'':
+
+\begin{quotation}
+``Route flapping is caused by pathological conditions
+(hardware errors, software errors, configuration errors, intermittent
+errors in communications links, unreliable connections, etc.) within
+the network which cause certain reachability information to be
+repeatedly advertised and withdrawn.'' \cite{wikipedia-route-flapping}
+\end{quotation}
+
+\paragraph{Flapping due to constantly changing network partitions and/or server crashes and restarts}
+
+Currently, Machi does not attempt to dampen, smooth, or ignore recent
+history of constantly flapping peer servers. If necessary, a failure
+detector such as the $\phi$ accrual failure detector
+\cite{phi-accrual-failure-detector} can be used to help manage such
+situations.
+
+\paragraph{Flapping due to asymmetric network partitions}
+
+The simulator's behavior during stable periods where at least one node
+is the victim of an asymmetric network partition is \ldots weird,
+wonderful, and something I don't completely understand yet. This is
+another place where we need more eyes reviewing and trying to poke
+holes in the algorithm.
+
+In cases where any node is a victim of an asymmetric network
+partition, the algorithm oscillates in a very predictable way: each
+server $S$ makes the same $P_{new}$ projection at epoch $E$ that $S$ made
+during a previous recent epoch $E-\delta$ (where $\delta$ is small, usually
+much less than 10). However, at least one node makes a suggestion that
+makes rough consensus impossible. When any epoch $E$ is not
+acceptable (because some node disagrees about something, e.g.,
+which nodes are down),
+the result is more new rounds of suggestions that create a repeating
+loop that lasts as long as the asymmetric partition lasts.
+
+From the perspective of $S$'s chain manager, the pattern of this
+infinite loop is easy to detect: $S$ inspects the pattern of the last
+$L$ projections that it has suggested, e.g., the last 10.
+Tiny details such as the epoch number and creation timestamp will
+differ, but the major details such as UPI list and repairing list are
+the same.
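A minimal sketch of that detection step, assuming the manager keeps a newest-first history in which each suggested projection has already been reduced to its major details (its UPI and repairing lists); the module and function names are illustrative only:

    -module(flap_sketch).
    -export([flapping/2]).

    %% History :: [{UPI :: [atom()], Repairing :: [atom()]}], newest first.
    %% L is the inspection window, e.g. 10.
    flapping(History, L) when length(History) >= L ->
        length(lists:usort(lists:sublist(History, L))) =:= 1;
    flapping(_History, _L) ->
        false.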
+
+If the major details of the last $L$ projections authored and
+suggested by $S$ are the same, then $S$ unilaterally decides that it
+is ``flapping'' and enters flapping state. See
+Section~\ref{sub:flapping-state} for additional discussion of the
+flapping state.
+
 \subsubsection{When to calculate a new projection}
 \label{ssub:when-to-calc}
@@ -897,7 +949,7 @@ calculate a new projection. The timer interval is typically 0.5--2.0 seconds,
 if the cluster has been stable. A client may call an external API
 call to trigger a new projection, e.g., if that client knows that an
 environment change has happened and wishes to trigger a
-response prior to the next timer firing (e.g.~at state $C200$).
+response prior to the next timer firing.
 It's recommended that the timer interval be staggered according to
 the participant ranking rules in Section~\ref{sub:ranking-projections};
@@ -918,14 +970,15 @@ done by state $C110$ and that writing a public projection is done by
 states $C300$ and $C310$.
 Broadly speaking, there are a number of decisions made in all three
-columns of Figure~\ref{fig:flowchart} to decide if and when a
-projection should be written at all. Sometimes, the best action is
+columns of Figure~\ref{fig:flowchart} to decide if and when any type
+of projection should be written at all. Sometimes, the best action is
 to do nothing.
 \subsubsection{Column A: Is there any reason to change?}
 The main tasks of the flowchart states in Column~A is to calculate a
-new projection $P_{new}$. Then we try to figure out which
+new projection $P_{new}$ and perhaps also the inner projection
+$P_{new2}$ if we're in flapping mode. Then we try to figure out which
 projection has the greatest merit: our current projection
 $P_{current}$, the new projection $P_{new}$, or the latest epoch
 $P_{latest}$. If our local $P_{current}$ projection is best, then
@@ -958,7 +1011,7 @@ The main decisions that states in Column B need to make are:
 It's notable that if $P_{new}$ is truly the best projection
 available at the moment, it must always first be written to everyone's
-public projection stores and only afterward processed through another
+public projection stores and only then processed through another
 monitor \& calculate loop through the flowchart.
 \subsubsection{Column C: How do I act?}
@@ -1000,14 +1053,14 @@ therefore the suggested projections at epoch $E$ are not unanimous.
 \paragraph{\#2: The transition from current $\rightarrow$ new projection is safe}
-Given the current projection
+Given the projection
 that the server is currently using, $P_{current}$, the projection
 $P_{latest}$ is evaluated by numerous rules and invariants, relative
 to $P_{current}$. If such rule or invariant is violated/false, then
 the local server will discard $P_{latest}$.
-The transition from $P_{current} \rightarrow P_{latest}$ is protected
-by rules and invariants that include:
+The transition from $P_{current} \rightarrow P_{latest}$ is checked
+for safety and sanity. The conditions used for the check include:
 \begin{enumerate}
 \item The Erlang data types of all record members are correct.
@@ -1020,13 +1073,161 @@ by rules and invariants that include:
 The same re-reordering restriction applies to all servers
 in $P_{latest}$'s repairing list relative to $P_{current}$'s
 repairing list.
-\item Any server $S$ that is newly-added to $P_{latest}$'s UPI list must
+\item Any server $S$ that was added to $P_{latest}$'s UPI list must
 appear in the tail the UPI list.
Furthermore, $S$ must have been in $P_{current}$'s repairing list and had successfully completed file - repair prior to $S$'s promotion from the repairing list to the tail - of the UPI list. + repair prior to the transition. \end{enumerate} +\subsection{Additional discussion of flapping state} +\label{sub:flapping-state} +All $P_{new}$ projections +calculated while in flapping state have additional diagnostic +information added, including: + +\begin{itemize} +\item Flag: server $S$ is in flapping state. +\item Epoch number \& wall clock timestamp when $S$ entered flapping state. +\item The collection of all other known participants who are also + flapping (with respective starting epoch numbers). +\item A list of nodes that are suspected of being partitioned, called the + ``hosed list''. The hosed list is a union of all other hosed list + members that are ever witnessed, directly or indirectly, by a server + while in flapping state. +\end{itemize} + +\subsubsection{Flapping diagnostic data accumulates} + +While in flapping state, this diagnostic data is gathered from +all available participants and merged together in a CRDT-like manner. +Once added to the diagnostic data list, a datum remains until +$S$ drops out of flapping state. When flapping state stops, all +accumulated diagnostic data is discarded. + +This accumulation of diagnostic data in the projection data +structure acts in part as a substitute for a separate gossip protocol. +However, since all participants are already communicating with each +other via read \& writes to each others' projection stores, the diagnostic +data can propagate in a gossip-like manner via the projection stores. + +\subsubsection{Flapping example (part 1)} +\label{ssec:flapping-example} + +Any server listed in the ``hosed list'' is suspected of having some +kind of network communication problem with some other server. For +example, let's examine a scenario involving a Machi cluster of servers +$a$, $b$, $c$, $d$, and $e$. Assume there exists an asymmetric network +partition such that messages from $a \rightarrow b$ are dropped, but +messages from $b \rightarrow a$ are delivered.\footnote{If this + partition were happening at or below the level of a reliable + delivery network protocol like TCP, then communication in {\em both} + directions would be affected by an asymmetric partition. + However, in this model, we are + assuming that a ``message'' lost during a network partition is a + uni-directional projection API call or its response.} + +Once a participant $S$ enters flapping state, it starts gathering the +flapping starting epochs and hosed lists from all of the other +projection stores that are available. The sum of this info is added +to all projections calculated by $S$. +For example, projections authored by $a$ will say that $a$ believes +that $b$ is down. +Likewise, projections authored by $b$ will say that $b$ believes +that $a$ is down. + +\subsubsection{The inner projection (flapping example, part 2)} +\label{ssec:inner-projection} + +\ldots We continue the example started in the previous subsection\ldots + +Eventually, in a gossip-like manner, all other participants will +eventually find that their hosed list is equal to $[a,b]$. Any other +server, for example server $c$, will then calculate another +projection, $P_{new2}$, using the assumption that both $a$ and $b$ +are down in addition to all other known unavailable servers. 
+
+\begin{itemize}
+\item If operating in the default CP mode, both $a$ and $b$ are down
+ and therefore not eligible to participate in Chain Replication.
+ %% The chain may continue service if a $c$, $d$, $e$ and/or witness
+ %% servers can try to form a correct UPI list for the chain.
+ This may cause an availability problem for the chain: we may not
+ have a quorum of participants (real or witness-only) to form a
+ correct UPI chain.
+\item If operating in AP mode, $a$ and $b$ can still form two separate
+ chains of length one, using UPI lists of $[a]$ and $[b]$, respectively.
+\end{itemize}
+
+This re-calculation, $P_{new2}$, of the new projection is called an
+``inner projection''. The inner projection definition is nested
+inside of its parent projection, using the same flapping diagnostic
+data used for other flapping status tracking.
+
+When humming consensus has determined that a projection state change
+is necessary and is also safe (relative to both the outer and inner
+projections), then the outer projection\footnote{With the inner
+ projection $P_{new2}$ nested inside of it.} is written to
+the local private projection store.
+With respect to future iterations of
+humming consensus, the inner projection is ignored.
+However, with respect to Chain Replication, the server's subsequent
+behavior
+{\em will consider the inner projection only}. The inner projection
+is used to order the UPI and repairing parts of the chain and trigger
+wedge/un-wedge behavior. The inner projection is also
+advertised to Machi clients.
+
+The epoch of the inner projection, $E^{inner}$, is always less than or
+equal to the epoch of the outer projection, $E$. The $E^{inner}$
+epoch typically only changes when new servers are added to the hosed
+list.
+
+To attempt a rough analogy, the outer projection is the carrier wave
+that is used to transmit the inner projection and its accompanying
+gossip of diagnostic data.
+
+\subsubsection{Outer projection churn, inner projection stability}
+
+One of the intriguing features of humming consensus's reaction to an
+asymmetric partition is that flapping behavior continues for as long as
+any asymmetric partition exists.
+
+\subsubsection{Stability in symmetric partition cases}
+
+Although humming consensus hasn't been formally proven to handle all
+asymmetric and symmetric partition cases, the current implementation
+appears to converge rapidly to a single chain state in all symmetric
+partition cases. This is in contrast to asymmetric partition cases,
+where ``flapping'' will continue on every humming consensus iteration
+until all asymmetric partitions disappear. A formal proof is an area of
+future work.
+
+\subsubsection{Leaving flapping state and discarding inner projection}
+
+There are two events that can trigger leaving flapping state.
+
+\begin{itemize}
+
+\item A server $S$ in flapping state notices that its long history of
+ repeatedly suggesting the same projection will be broken:
+ $S$ calculates some differing projection instead.
+ This change in projection history happens whenever a perceived network
+ partition changes in any way.
+
+\item Server $S$ reads a public projection suggestion, $P_{noflap}$, that is
+ authored by another server $S'$, and that $P_{noflap}$ no longer
+ contains the flapping start epoch for $S'$ that is present in the
+ history that $S$ has maintained while $S$ has been in
+ flapping state.
+
+\end{itemize}
+
+When either trigger event happens, server $S$ will exit flapping state.
All +new projections authored by $S$ will have all flapping diagnostic data +removed. This includes stopping use of the inner projection: the UPI +list of the inner projection is copied to the outer projection's UPI +list, to avoid a drastic change in UPI membership. + \subsection{Ranking projections} \label{sub:ranking-projections} @@ -1588,7 +1789,7 @@ Manageability, availability and performance in Porcupine: a highly scalable, clu {\tt http://homes.cs.washington.edu/\%7Elevy/ porcupine.pdf} \bibitem{chain-replication} -van Renesse, Robbert and Schneider, Fred. +van Renesse, Robbert et al. Chain Replication for Supporting High Throughput and Availability. Proceedings of the 6th Conference on Symposium on Operating Systems Design \& Implementation (OSDI'04) - Volume 6, 2004. diff --git a/doc/src.high-level/high-level-machi.tex b/doc/src.high-level/high-level-machi.tex index 9c7c87b..a95642a 100644 --- a/doc/src.high-level/high-level-machi.tex +++ b/doc/src.high-level/high-level-machi.tex @@ -1489,7 +1489,7 @@ In Usenix ATC 2009. {\tt https://www.usenix.org/legacy/event/usenix09/ tech/full\_papers/terrace/terrace.pdf} \bibitem{chain-replication} -van Renesse, Robbert and Schneider, Fred. +van Renesse, Robbert et al. Chain Replication for Supporting High Throughput and Availability. Proceedings of the 6th Conference on Symposium on Operating Systems Design \& Implementation (OSDI'04) - Volume 6, 2004. diff --git a/include/machi_merkle_tree.hrl b/include/machi_merkle_tree.hrl new file mode 100644 index 0000000..e3d0feb --- /dev/null +++ b/include/machi_merkle_tree.hrl @@ -0,0 +1,20 @@ +%% machi merkle tree records + +-record(naive, { + chunk_size = 1048576 :: pos_integer(), %% default 1 MB + recalc = true :: boolean(), + root :: 'undefined' | binary(), + lvl1 = [] :: [ binary() ], + lvl2 = [] :: [ binary() ], + lvl3 = [] :: [ binary() ], + leaves = [] :: [ { Offset :: pos_integer(), + Size :: pos_integer(), + Csum :: binary()} ] + }). + +-record(mt, { + filename :: string(), + tree :: #naive{}, + backend = 'naive' :: 'naive' + }). + diff --git a/rebar.config b/rebar.config index d6debc0..2a77381 100644 --- a/rebar.config +++ b/rebar.config @@ -1,4 +1,4 @@ -{require_otp_vsn, "17|18"}. +{require_otp_vsn, "17"}. %%% {erl_opts, [warnings_as_errors, {parse_transform, lager_transform}, debug_info]}. {erl_opts, [{parse_transform, lager_transform}, debug_info]}. 
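The flapping-state discussion in the chain manager text above describes a CRDT-like, union-style merge of diagnostic data (hosed lists and flapping-start epochs) that propagates through the projection stores. A minimal, illustrative Erlang sketch of that merge follows; the module, record, and field names are hypothetical and are not taken from the Machi source tree.

    %% Illustrative sketch only -- hypothetical names, not part of Machi.
    -module(flap_merge_sketch).
    -export([merge/2]).

    -record(flap_info, {
              flapping = false     :: boolean(),
              start_epoch          :: 'undefined' | non_neg_integer(),
              all_flap_starts = [] :: [{atom(), non_neg_integer()}], % {Server, StartEpoch}
              hosed = []           :: [atom()]  % servers suspected of being partitioned
             }).

    %% Merge diagnostic data gossiped by another participant into our own.
    %% Both collections only grow, by set union, so the merge is idempotent
    %% and order-insensitive -- the CRDT-like property described above.
    merge(#flap_info{all_flap_starts=StartsA, hosed=HosedA} = Mine,
          #flap_info{all_flap_starts=StartsB, hosed=HosedB}) ->
        Mine#flap_info{all_flap_starts = lists:usort(StartsA ++ StartsB),
                       hosed           = lists:usort(HosedA ++ HosedB)}.

Because the merge is a pure union, it does not matter in which order, or how many times, a server witnesses another participant's diagnostic data; that property is what lets the projection stores double as a gossip channel.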
diff --git a/src/machi.proto b/src/machi.proto index c9251cb..e5d77d9 100644 --- a/src/machi.proto +++ b/src/machi.proto @@ -48,10 +48,9 @@ enum Mpb_GeneralStatusCode { PARTITION = 4; NOT_WRITTEN = 5; WRITTEN = 6; - TRIMMED = 7; // The whole file was trimmed - NO_SUCH_FILE = 8; - PARTIAL_READ = 9; - BAD_EPOCH = 10; + NO_SUCH_FILE = 7; + PARTIAL_READ = 8; + BAD_EPOCH = 9; BAD_JOSS = 255; // Only for testing by the Taipan } @@ -356,7 +355,6 @@ message Mpb_ProjectionV1 { // append_chunk() // write_chunk() // read_chunk() -// trim_chunk() // checksum_list() // list_files() // wedge_status() @@ -426,20 +424,6 @@ message Mpb_LL_ReadChunkResp { repeated Mpb_ChunkPos trimmed = 3; } -// Low level API: trim_chunk() - -message Mpb_LL_TrimChunkReq { - required Mpb_EpochID epoch_id = 1; - required string file = 2; - required uint64 offset = 3; - required uint32 size = 4; - optional uint32 trigger_gc = 5 [default=0]; -} - -message Mpb_LL_TrimChunkResp { - required Mpb_GeneralStatusCode status = 1; -} - // Low level API: checksum_list() message Mpb_LL_ChecksumListReq { @@ -604,12 +588,11 @@ message Mpb_LL_Request { optional Mpb_LL_AppendChunkReq append_chunk = 30; optional Mpb_LL_WriteChunkReq write_chunk = 31; optional Mpb_LL_ReadChunkReq read_chunk = 32; - optional Mpb_LL_TrimChunkReq trim_chunk = 33; - optional Mpb_LL_ChecksumListReq checksum_list = 34; - optional Mpb_LL_ListFilesReq list_files = 35; - optional Mpb_LL_WedgeStatusReq wedge_status = 36; - optional Mpb_LL_DeleteMigrationReq delete_migration = 37; - optional Mpb_LL_TruncHackReq trunc_hack = 38; + optional Mpb_LL_ChecksumListReq checksum_list = 33; + optional Mpb_LL_ListFilesReq list_files = 34; + optional Mpb_LL_WedgeStatusReq wedge_status = 35; + optional Mpb_LL_DeleteMigrationReq delete_migration = 36; + optional Mpb_LL_TruncHackReq trunc_hack = 37; } message Mpb_LL_Response { @@ -639,10 +622,9 @@ message Mpb_LL_Response { optional Mpb_LL_AppendChunkResp append_chunk = 30; optional Mpb_LL_WriteChunkResp write_chunk = 31; optional Mpb_LL_ReadChunkResp read_chunk = 32; - optional Mpb_LL_TrimChunkResp trim_chunk = 33; - optional Mpb_LL_ChecksumListResp checksum_list = 34; - optional Mpb_LL_ListFilesResp list_files = 35; - optional Mpb_LL_WedgeStatusResp wedge_status = 36; - optional Mpb_LL_DeleteMigrationResp delete_migration = 37; - optional Mpb_LL_TruncHackResp trunc_hack = 38; + optional Mpb_LL_ChecksumListResp checksum_list = 33; + optional Mpb_LL_ListFilesResp list_files = 34; + optional Mpb_LL_WedgeStatusResp wedge_status = 35; + optional Mpb_LL_DeleteMigrationResp delete_migration = 36; + optional Mpb_LL_TruncHackResp trunc_hack = 37; } diff --git a/src/machi_cr_client.erl b/src/machi_cr_client.erl index e03262b..7b88f45 100644 --- a/src/machi_cr_client.erl +++ b/src/machi_cr_client.erl @@ -114,7 +114,7 @@ -include_lib("eunit/include/eunit.hrl"). -endif. % TEST. --export([start_link/1, start_link/2]). +-export([start_link/1]). %% FLU1 API -export([ %% File API @@ -146,8 +146,7 @@ proxies_dict :: orddict:orddict(), epoch_id, proj, - bad_proj, - opts :: proplists:proplist() + bad_proj }). %% @doc Start a local, long-lived process that will be our steady @@ -155,10 +154,7 @@ %% remote Machi server. start_link(P_srvr_list) -> - gen_server:start_link(?MODULE, [P_srvr_list, []], []). - -start_link(P_srvr_list, Opts) -> - gen_server:start_link(?MODULE, [P_srvr_list, Opts], []). + gen_server:start_link(?MODULE, [P_srvr_list], []). %% @doc Append a chunk (binary- or iolist-style) of data to a file %% with `Prefix'. 
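With the Opts argument removed above, callers start the chain replication client from the server list alone. A small, illustrative usage sketch follows; the shape of the #p_srvr{} entries is not shown in this diff and is left out here.

    %% Illustrative sketch only: start the CR client, use it, then stop it.
    %% P_srvr_list comes from cluster configuration; its record fields are
    %% not shown in this diff.
    client_lifecycle_sketch(P_srvr_list) ->
        {ok, Clnt} = machi_cr_client:start_link(P_srvr_list),
        %% ... append_chunk / read_chunk / write_chunk calls go here ...
        ok = machi_cr_client:quit(Clnt),
        ok.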
@@ -258,15 +254,14 @@ quit(PidSpec) -> %%%%%%%%%%%%%%%%%%%%%%%%%%% -init([P_srvr_list, Opts]) -> +init([P_srvr_list]) -> MembersDict = orddict:from_list([{P#p_srvr.name, P} || P <- P_srvr_list]), ProxiesDict = ?FLU_PC:start_proxies(MembersDict), - {ok, #state{members_dict=MembersDict, proxies_dict=ProxiesDict, opts=Opts}}. + {ok, #state{members_dict=MembersDict, proxies_dict=ProxiesDict}}. handle_call({req, Req}, From, S) -> handle_call2(Req, From, update_proj(S)); -handle_call(quit, _From, #state{members_dict=MembersDict}=S) -> - ?FLU_PC:stop_proxies(MembersDict), +handle_call(quit, _From, S) -> {stop, normal, ok, S}; handle_call(_Request, _From, S) -> Reply = whaaaaaaaaaaaaaaaaaaaa, @@ -365,9 +360,6 @@ do_append_head3(Prefix, Chunk, ChunkExtra, Depth, STime, TO, %% written block is. But we lost a race. Repeat, with a new %% sequencer assignment. do_append_head(Prefix, Chunk, ChunkExtra, Depth, STime, TO, S); - {error, trimmed} = Err -> - %% TODO: behaviour - {reply, Err, S}; {error, not_written} -> exit({todo_should_never_happen,?MODULE,?LINE, Prefix,iolist_size(Chunk)}) @@ -414,7 +406,7 @@ do_append_midtail(_RestFLUs, Prefix, File, Offset, Chunk, ChunkExtra, Ws, Depth + 1, STime, TO, S2) end end - end. + end. do_append_midtail2([], _Prefix, File, Offset, Chunk, _ChunkExtra, _Ws, _Depth, _STime, _TO, S) -> @@ -442,9 +434,6 @@ do_append_midtail2([FLU|RestFLUs]=FLUs, Prefix, File, Offset, Chunk, Resume = {append, Offset, iolist_size(Chunk), File}, do_repair_chunk(FLUs, Resume, Chunk, [], File, Offset, iolist_size(Chunk), Depth, STime, S); - {error, trimmed} = Err -> - %% TODO: nothing can be done - {reply, Err, S}; {error, not_written} -> exit({todo_should_never_happen,?MODULE,?LINE,File,Offset}) end. @@ -508,8 +497,6 @@ do_write_head2(File, Offset, Chunk, Depth, STime, TO, do_write_head(File, Offset, Chunk, Depth, STime, TO, S); {error, written}=Err -> {reply, Err, S}; - {error, trimmed}=Err -> - {reply, Err, S}; {error, not_written} -> exit({todo_should_never_happen,?MODULE,?LINE, iolist_size(Chunk)}) @@ -541,26 +528,18 @@ do_read_chunk2(File, Offset, Size, Opts, Depth, STime, TO, ConsistencyMode = P#projection_v1.mode, case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), EpochID, File, Offset, Size, Opts, ?TIMEOUT) of - {ok, {Chunks, Trimmed}} when is_list(Chunks), is_list(Trimmed) -> - %% After partition heal, there could happen that heads may - %% have chunk trimmed but tails may have chunk written - - %% such repair couldn't be triggered in read time (because - %% there's data!). In this case, repair should happen by - %% partition heal event or some background - %% hashtree-n-repair service. TODO. FIXME. - {reply, {ok, {Chunks, Trimmed}}, S}; + {ok, {Chunks, []}} when is_list(Chunks) -> + {reply, {ok, {Chunks, []}}, S}; %% {ok, BadChunk} -> %% %% TODO cleaner handling of bad chunks %% exit({todo, bad_chunk_size, ?MODULE, ?LINE, File, Offset, Size, %% got, byte_size(BadChunk)}); - {error, bad_arg} = BadArg -> + {error, bad_arg} = BadArg -> {reply, BadArg, S}; {error, partial_read}=Err -> - %% TODO: maybe this case we might need another repair? {reply, Err, S}; {error, bad_checksum}=BadCS -> %% TODO: alternate strategy? - %% Maybe we need read repair here, too? 
{reply, BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> @@ -569,123 +548,12 @@ do_read_chunk2(File, Offset, Size, Opts, Depth, STime, TO, read_repair(ConsistencyMode, read, File, Offset, Size, Depth, STime, S); %% {reply, {error, not_written}, S}; {error, written} -> - exit({todo_should_never_happen,?MODULE,?LINE,File,Offset,Size}); - {error, trimmed}=Err -> - {reply, Err, S} - end. - -do_trim_chunk(File, Offset, Size, 0=Depth, STime, TO, S) -> - do_trim_chunk(File, Offset, Size, Depth+1, STime, TO, S); - -do_trim_chunk(File, Offset, Size, Depth, STime, TO, #state{proj=P}=S) -> - sleep_a_while(Depth), - DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, - if DiffMs > TO -> - {reply, {error, partition}, S}; - true -> - %% This is suboptimal for performance: there are some paths - %% through this point where our current projection is good - %% enough. But we're going to try to keep the code as simple - %% as we can for now. - S2 = update_proj(S#state{proj=undefined, bad_proj=P}), - case S2#state.proj of - P2 when P2 == undefined orelse - P2#projection_v1.upi == [] -> - do_trim_chunk(File, Offset, Size, Depth + 1, - STime, TO, S2); - _ -> - do_trim_chunk2(File, Offset, Size, Depth + 1, - STime, TO, S2) - end - end. - -do_trim_chunk2(File, Offset, Size, Depth, STime, TO, - #state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) -> - [HeadFLU|RestFLUs] = mutation_flus(P), - Proxy = orddict:fetch(HeadFLU, PD), - case ?FLU_PC:trim_chunk(Proxy, EpochID, File, Offset, Size, ?TIMEOUT) of - ok -> - do_trim_midtail(RestFLUs, undefined, File, Offset, Size, - [HeadFLU], 0, STime, TO, S); - {error, trimmed} -> - %% Maybe the trim had failed in the middle of the tail so re-run - %% trim accross the whole chain. - do_trim_midtail(RestFLUs, undefined, File, Offset, Size, - [HeadFLU], 0, STime, TO, S); - {error, bad_checksum}=BadCS -> - {reply, BadCS, S}; - {error, Retry} - when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_trim_chunk(File, Offset, Size, Depth, STime, TO, S) - end. - -do_trim_midtail(RestFLUs, Prefix, File, Offset, Size, - Ws, Depth, STime, TO, S) - when RestFLUs == [] orelse Depth == 0 -> - do_trim_midtail2(RestFLUs, Prefix, File, Offset, Size, - Ws, Depth + 1, STime, TO, S); -do_trim_midtail(_RestFLUs, Prefix, File, Offset, Size, - Ws, Depth, STime, TO, #state{proj=P}=S) -> - %% io:format(user, "midtail sleep2,", []), - sleep_a_while(Depth), - DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, - if DiffMs > TO -> - {reply, {error, partition}, S}; - true -> - S2 = update_proj(S#state{proj=undefined, bad_proj=P}), - case S2#state.proj of - undefined -> - {reply, {error, partition}, S}; - P2 -> - RestFLUs2 = mutation_flus(P2), - case RestFLUs2 -- Ws of - RestFLUs2 -> - %% None of the writes that we have done so far - %% are to FLUs that are in the RestFLUs2 list. - %% We are pessimistic here and assume that - %% those FLUs are permanently dead. Start - %% over with a new sequencer assignment, at - %% the 2nd have of the impl (we have already - %% slept & refreshed the projection). - - if Prefix == undefined -> % atom! not binary()!! - {error, partition}; - true -> - do_trim_chunk(Prefix, Offset, Size, - Depth, STime, TO, S2) - end; - RestFLUs3 -> - do_trim_midtail2(RestFLUs3, Prefix, File, Offset, Size, - Ws, Depth + 1, STime, TO, S2) - end - end - end. 
- -do_trim_midtail2([], _Prefix, _File, _Offset, _Size, - _Ws, _Depth, _STime, _TO, S) -> - %% io:format(user, "ok!\n", []), - {reply, ok, S}; -do_trim_midtail2([FLU|RestFLUs]=FLUs, Prefix, File, Offset, Size, - Ws, Depth, STime, TO, - #state{epoch_id=EpochID, proxies_dict=PD}=S) -> - Proxy = orddict:fetch(FLU, PD), - case ?FLU_PC:trim_chunk(Proxy, EpochID, File, Offset, Size, ?TIMEOUT) of - ok -> - %% io:format(user, "write ~w,", [FLU]), - do_trim_midtail2(RestFLUs, Prefix, File, Offset, Size, - [FLU|Ws], Depth, STime, TO, S); - {error, trimmed} -> - do_trim_midtail2(RestFLUs, Prefix, File, Offset, Size, - [FLU|Ws], Depth, STime, TO, S); - {error, bad_checksum}=BadCS -> - %% TODO: alternate strategy? - {reply, BadCS, S}; - {error, Retry} - when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_trim_midtail(FLUs, Prefix, File, Offset, Size, - Ws, Depth, STime, TO, S) + exit({todo_should_never_happen,?MODULE,?LINE,File,Offset,Size}) end. +do_trim_chunk(_File, _Offset, _Size, _Depth, _STime, _TO, S) -> + %% This is just a stub to reach CR client from high level client + {reply, {error, bad_joss}, S}. %% Read repair: depends on the consistency mode that we're in: %% @@ -729,7 +597,6 @@ read_repair2(cp_mode=ConsistencyMode, case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), EpochID, File, Offset, Size, [], ?TIMEOUT) of {ok, Chunks} when is_list(Chunks) -> - %% TODO: change to {Chunks, Trimmed} and have them repaired ToRepair = mutation_flus(P) -- [Tail], {Reply, S1} = do_repair_chunks(Chunks, ToRepair, ReturnMode, [Tail], File, Depth, STime, S, {ok, Chunks}), @@ -747,12 +614,7 @@ read_repair2(cp_mode=ConsistencyMode, {error, not_written} -> {reply, {error, not_written}, S}; {error, written} -> - exit({todo_should_never_happen,?MODULE,?LINE,File,Offset,Size}); - {error, trimmed} -> - %% TODO: Again, whole file was trimmed. Needs repair. How - %% do we repair trimmed file (which was already unlinked) - %% across the flu servers? - exit({todo_should_repair_unlinked_files, ?MODULE, ?LINE, File}) + exit({todo_should_never_happen,?MODULE,?LINE,File,Offset,Size}) end; read_repair2(ap_mode=ConsistencyMode, ReturnMode, File, Offset, Size, Depth, STime, @@ -760,7 +622,6 @@ read_repair2(ap_mode=ConsistencyMode, Eligible = mutation_flus(P), case try_to_find_chunk(Eligible, File, Offset, Size, S) of {ok, {Chunks, _Trimmed}, GotItFrom} when is_list(Chunks) -> - %% TODO: Repair trimmed chunks ToRepair = mutation_flus(P) -- [GotItFrom], {Reply0, S1} = do_repair_chunks(Chunks, ToRepair, ReturnMode, [GotItFrom], File, Depth, STime, S, {ok, Chunks}), @@ -777,11 +638,7 @@ read_repair2(ap_mode=ConsistencyMode, {error, not_written} -> {reply, {error, not_written}, S}; {error, written} -> - exit({todo_should_never_happen,?MODULE,?LINE,File,Offset,Size}); - {error, trimmed} -> - %% TODO: Again, whole file was trimmed. Needs repair. How - %% do we repair trimmed file across the flu servers? - exit({todo_should_repair_unlinked_files, ?MODULE, ?LINE, File}) + exit({todo_should_never_happen,?MODULE,?LINE,File,Offset,Size}) end. do_repair_chunks([], _, _, _, _, _, _, S, Reply) -> @@ -846,9 +703,6 @@ do_repair_chunk2([First|Rest]=ToRepair, ReturnMode, Chunk, Repaired, File, Offse %% that it is exactly our Chunk. do_repair_chunk2(Rest, ReturnMode, Chunk, Repaired, File, Offset, Size, Depth, STime, S); - {error, trimmed} = _Error -> - %% TODO - exit(todo_should_repair_trimmed); {error, not_written} -> exit({todo_should_never_happen,?MODULE,?LINE,File,Offset,Size}) end. 
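The two read_repair2 clauses below differ mainly in which replica seeds the repair. A compressed, illustrative restatement with simplified placeholder names (not actual Machi code):

    %% Illustrative sketch only. In cp_mode the tail is authoritative, so the
    %% chunk read from the tail is pushed to every other mutation FLU; in
    %% ap_mode whichever FLU returned the chunk seeds repair of the rest.
    repair_targets(cp_mode, MutationFlus, Tail, _GotItFrom) ->
        MutationFlus -- [Tail];
    repair_targets(ap_mode, MutationFlus, _Tail, GotItFrom) ->
        MutationFlus -- [GotItFrom].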
@@ -928,25 +782,10 @@ update_proj(#state{proj=undefined}=S) -> update_proj(S) -> S. -update_proj2(Count, #state{bad_proj=BadProj, proxies_dict=ProxiesDict, - opts=Opts}=S) -> +update_proj2(Count, #state{bad_proj=BadProj, proxies_dict=ProxiesDict}=S) -> Timeout = 2*1000, WTimeout = 2*Timeout, - SimName = proplists:get_value(simulator_self_name, Opts, cr_client), - ExcludedFLUs = - case proplists:get_value(use_partition_simulator, Opts, false) of - true -> - Members = proplists:get_value(simulator_members, Opts, []), - {Partitions, _Islands} = machi_partition_simulator:get(Members), - lists:filtermap(fun({A, B}) when A =:= SimName -> {true, B}; - ({A, B}) when B =:= SimName -> {true, A}; - (_) -> false - end, Partitions); - false -> [] - end, - Proxies = lists:foldl(fun(Name, Dict) -> - orddict:erase(Name, Dict) - end, ProxiesDict, ExcludedFLUs), + Proxies = orddict:to_list(ProxiesDict), Work = fun({_K, Proxy}) -> ?FLU_PC:read_latest_projection(Proxy, private, Timeout) end, @@ -959,7 +798,7 @@ update_proj2(Count, #state{bad_proj=BadProj, proxies_dict=ProxiesDict, %% b's projection. case choose_best_proj(Rs) of P when P == ?WORST_PROJ -> - io:format(user, "TODO: Using ?WORST_PROJ, chain is not available ~w\n", [self()]), + io:format(user, "TODO: Using ?WORST_PROJ, chain is not available\n", []), sleep_a_while(Count), update_proj2(Count + 1, S); P when P >= BadProj -> @@ -970,7 +809,7 @@ update_proj2(Count, #state{bad_proj=BadProj, proxies_dict=ProxiesDict, NewProxiesDict = ?FLU_PC:start_proxies(NewMembersDict), S#state{bad_proj=undefined, proj=P, epoch_id=EpochID, members_dict=NewMembersDict, proxies_dict=NewProxiesDict}; - _P -> + _ -> sleep_a_while(Count), update_proj2(Count + 1, S) end. @@ -1033,9 +872,7 @@ try_to_find_chunk(Eligible, File, Offset, Size, [{FoundFLU, {ok, ChunkAndTrimmed}}|_] -> {ok, ChunkAndTrimmed, FoundFLU}; [] -> - RetryErrs = [partition, bad_epoch, wedged, trimmed], - %% Adding 'trimmed' to return so as to trigger repair, - %% once all other retry errors fixed + RetryErrs = [partition, bad_epoch, wedged], case [Err || {error, Err} <- Rs, lists:member(Err, RetryErrs)] of [SomeErr|_] -> {error, SomeErr}; @@ -1085,7 +922,5 @@ chunk_wrapper_size(Chunk) -> timeout(infinity) -> timeout(15*60*1000); % close enough to infinity -timeout({_, _}=Timeout) -> - Timeout; timeout(Timeout0) -> {Timeout0, Timeout0 + 30*1000}. diff --git a/src/machi_csum_table.erl b/src/machi_csum_table.erl index 6e5e2b8..80f1765 100644 --- a/src/machi_csum_table.erl +++ b/src/machi_csum_table.erl @@ -125,6 +125,7 @@ write(#machi_csum_table{fd=Fd, table=T} = CsumT, true = ets:insert(T, {Offset, Size, CSum}), ok; Error -> + io:format(user, "boob *********************", []), Error end. @@ -170,20 +171,13 @@ trim(#machi_csum_table{fd=Fd, table=T}, Offset, Size) -> Error end. --spec all_trimmed(table(), non_neg_integer(), non_neg_integer()) -> boolean(). +-spec all_trimmed(table(), machi_dt:chunk_pos(), machi_dt:chunk_pos()) -> boolean(). all_trimmed(#machi_csum_table{table=T}, Left, Right) -> runthru(ets:tab2list(T), Left, Right). --spec all_trimmed(table(), non_neg_integer()) -> boolean(). +-spec all_trimmed(table(), machi_dt:chunk_pos()) -> boolean(). all_trimmed(#machi_csum_table{table=T}, Pos) -> - case ets:tab2list(T) of - [{0, ?MINIMUM_OFFSET, _}|L] -> - %% tl/1 to remove header space {0, 1024, <<0>>} - runthru(L, ?MINIMUM_OFFSET, Pos); - List -> - %% In case a header is removed; - runthru(List, 0, Pos) - end. + runthru(ets:tab2list(T), 0, Pos). 
-spec any_trimmed(table(), pos_integer(), diff --git a/src/machi_file_proxy.erl b/src/machi_file_proxy.erl index ff6748f..ed9a933 100644 --- a/src/machi_file_proxy.erl +++ b/src/machi_file_proxy.erl @@ -22,20 +22,20 @@ %% controlled files. In particular, it manages the "write-once register" %% conceit at the heart of Machi's design. %% -%% Read, write and append requests for a single file will be managed +%% Read, write and append requests for a single file will be managed %% through this proxy. Clients can also request syncs for specific %% types of filehandles. %% %% As operations are requested, the proxy keeps track of how many %% operations it has performed (and how many errors were generated.) -%% After a sufficient number of inactivity, the server terminates +%% After a sufficient number of inactivity, the server terminates %% itself. %% %% TODO: -%% 1. Some way to transition the proxy into a wedged state that +%% 1. Some way to transition the proxy into a wedged state that %% doesn't rely on message delivery. %% -%% 2. Check max file size on appends. Writes we take on faith we can +%% 2. Check max file size on appends. Writes we take on faith we can %% and should handle. %% %% 3. Async checksum reads on startup. @@ -47,7 +47,7 @@ %% public API -export([ - start_link/3, + start_link/2, stop/1, sync/1, sync/2, @@ -55,7 +55,6 @@ read/4, write/3, write/4, - trim/4, append/2, append/4 ]). @@ -75,11 +74,10 @@ -define(TIMEOUT, 10*1000). -define(TOO_MANY_ERRORS_RATIO, 50). --type op_stats() :: { Total :: non_neg_integer(), +-type op_stats() :: { Total :: non_neg_integer(), Errors :: non_neg_integer() }. -record(state, { - fluname :: atom(), data_dir :: string() | undefined, filename :: string() | undefined, data_path :: string() | undefined, @@ -95,18 +93,17 @@ ops = 0 :: non_neg_integer(), %% sum of all ops reads = {0, 0} :: op_stats(), writes = {0, 0} :: op_stats(), - appends = {0, 0} :: op_stats(), - trims = {0, 0} :: op_stats() + appends = {0, 0} :: op_stats() }). %% Public API -% @doc Start a new instance of the file proxy service. Takes the filename +% @doc Start a new instance of the file proxy service. Takes the filename % and data directory as arguments. This function is typically called by the % `machi_file_proxy_sup:start_proxy/2' function. --spec start_link(FluName :: atom(), Filename :: string(), DataDir :: string()) -> any(). -start_link(FluName, Filename, DataDir) -> - gen_server:start_link(?MODULE, {FluName, Filename, DataDir}, []). +-spec start_link(Filename :: string(), DataDir :: string()) -> any(). +start_link(Filename, DataDir) -> + gen_server:start_link(?MODULE, {Filename, DataDir}, []). % @doc Request to stop an instance of the file proxy service. -spec stop(Pid :: pid()) -> ok. @@ -123,7 +120,7 @@ sync(_Pid) -> % @doc Force a sync of a specific filehandle type. Valid types are `all', `csum' and `data'. -spec sync(Pid :: pid(), Type :: all|data|csum) -> ok|{error, term()}. -sync(Pid, Type) when is_pid(Pid) andalso +sync(Pid, Type) when is_pid(Pid) andalso ( Type =:= all orelse Type =:= csum orelse Type =:= data ) -> gen_server:call(Pid, {sync, Type}, ?TIMEOUT); sync(_Pid, Type) -> @@ -131,8 +128,9 @@ sync(_Pid, Type) -> {error, bad_arg}. % @doc Read file at offset for length. This returns a sequence of all -% written and trimmed (optional) bytes that overlaps with requested -% offset and length. Borders are not aligned. +% chunks that overlaps with requested offset and length. Note that +% borders are not aligned, not to mess up repair at cr_client with +% checksums. 
They should be cut at cr_client. -spec read(Pid :: pid(), Offset :: non_neg_integer(), Length :: non_neg_integer()) -> @@ -149,7 +147,7 @@ read(Pid, Offset, Length) -> {ok, [{Filename::string(), Offset :: non_neg_integer(), Data :: binary(), Checksum :: binary()}]} | {error, Reason :: term()}. -read(Pid, Offset, Length, Opts) when is_pid(Pid) andalso is_integer(Offset) andalso Offset >= 0 +read(Pid, Offset, Length, Opts) when is_pid(Pid) andalso is_integer(Offset) andalso Offset >= 0 andalso is_integer(Length) andalso Length > 0 andalso is_list(Opts) -> gen_server:call(Pid, {read, Offset, Length, Opts}, ?TIMEOUT); @@ -181,12 +179,6 @@ write(_Pid, Offset, ClientMeta, _Data) -> lager:warning("Bad arg to write: Offset ~p, ClientMeta: ~p", [Offset, ClientMeta]), {error, bad_arg}. -trim(Pid, Offset, Size, TriggerGC) when is_pid(Pid), - is_integer(Offset) andalso Offset >= 0, - is_integer(Size) andalso Size > 0, - is_boolean(TriggerGC) -> - gen_server:call(Pid, {trim ,Offset, Size, TriggerGC}, ?TIMEOUT). - % @doc Append data -spec append(Pid :: pid(), Data :: binary()) -> {ok, File :: string(), Offset :: non_neg_integer()} |{error, term()}. @@ -202,8 +194,8 @@ append(_Pid, _Data) -> -spec append(Pid :: pid(), ClientMeta :: proplists:proplist(), Extra :: non_neg_integer(), Data :: binary()) -> {ok, File :: string(), Offset :: non_neg_integer()} |{error, term()}. -append(Pid, ClientMeta, Extra, Data) when is_pid(Pid) andalso is_list(ClientMeta) - andalso is_integer(Extra) andalso Extra >= 0 +append(Pid, ClientMeta, Extra, Data) when is_pid(Pid) andalso is_list(ClientMeta) + andalso is_integer(Extra) andalso Extra >= 0 andalso is_binary(Data) -> gen_server:call(Pid, {append, ClientMeta, Extra, Data}, ?TIMEOUT); append(_Pid, ClientMeta, Extra, _Data) -> @@ -213,7 +205,7 @@ append(_Pid, ClientMeta, Extra, _Data) -> %% gen_server callbacks % @private -init({FluName, Filename, DataDir}) -> +init({Filename, DataDir}) -> CsumFile = machi_util:make_checksum_filename(DataDir, Filename), {_, DPath} = machi_util:make_data_filename(DataDir, Filename), ok = filelib:ensure_dir(CsumFile), @@ -222,11 +214,8 @@ init({FluName, Filename, DataDir}) -> UnwrittenBytes = machi_csum_table:calc_unwritten_bytes(CsumTable), {Eof, infinity} = lists:last(UnwrittenBytes), {ok, FHd} = file:open(DPath, [read, write, binary, raw]), - %% Reserve for EC and stuff, to prevent eof when read - ok = file:pwrite(FHd, 0, binary:copy(<<"so what?">>, ?MINIMUM_OFFSET div 8)), Tref = schedule_tick(), St = #state{ - fluname = FluName, filename = Filename, data_dir = DataDir, data_path = DPath, @@ -261,13 +250,13 @@ handle_call({sync, all}, _From, State = #state{filename = F, R1 = file:sync(FHd), Resp = case {R, R1} of {ok, ok} -> ok; - {ok, O1} -> - lager:error("Got ~p during a data file sync on file ~p", [O1, F]), + {ok, O1} -> + lager:error("Got ~p during a data file sync on file ~p", [O1, F]), O1; - {O2, ok} -> - lager:error("Got ~p during a csum file sync on file ~p", [O2, F]), + {O2, ok} -> + lager:error("Got ~p during a csum file sync on file ~p", [O2, F]), O2; - {O3, O4} -> + {O3, O4} -> lager:error("Got ~p ~p syncing all files for file ~p", [O3, O4, F]), {O3, O4} end, @@ -296,21 +285,16 @@ handle_call({read, Offset, Length, Opts}, _From, csum_table = CsumTable, reads = {T, Err} }) -> - %% TODO: use these options - NoChunk prevents reading from disks - %% NoChecksum doesn't check checksums NoChecksum = proplists:get_value(no_checksum, Opts, false), NoChunk = proplists:get_value(no_chunk, Opts, false), + NeedsMerge = 
proplists:get_value(needs_trimmed, Opts, false), {Resp, NewErr} = - case do_read(FH, F, CsumTable, Offset, Length, NoChunk, NoChecksum) of + case do_read(FH, F, CsumTable, Offset, Length, NoChecksum, NoChunk, NeedsMerge) of {ok, {[], []}} -> {{error, not_written}, Err + 1}; {ok, {Chunks0, Trimmed0}} -> Chunks = slice_both_side(Chunks0, Offset, Offset+Length), - Trimmed = case proplists:get_value(needs_trimmed, Opts, false) of - true -> Trimmed0; - false -> [] - end, - {{ok, {Chunks, Trimmed}}, Err}; + {{ok, {Chunks, Trimmed0}}, Err}; Error -> lager:error("Can't read ~p, ~p at File ~p", [Offset, Length, F]), {Error, Err + 1} @@ -354,48 +338,6 @@ handle_call({write, Offset, ClientMeta, Data}, _From, {reply, Resp, State#state{writes = {T+1, NewErr}, eof_position = NewEof}}; - -%%% TRIMS - -handle_call({trim, _Offset, _ClientMeta, _Data}, _From, - State = #state{wedged = true, - writes = {T, Err} - }) -> - {reply, {error, wedged}, State#state{writes = {T + 1, Err + 1}}}; - -handle_call({trim, Offset, Size, _TriggerGC}, _From, - State = #state{data_filehandle=FHd, - ops = Ops, - trims = {T, Err}, - csum_table = CsumTable}) -> - - case machi_csum_table:all_trimmed(CsumTable, Offset, Offset+Size) of - true -> - NewState = State#state{ops=Ops+1, trims={T, Err+1}}, - %% All bytes of that range was already trimmed returns ok - %% here, not {error, trimmed}, which means the whole file - %% was trimmed - maybe_gc(ok, NewState); - false -> - LUpdate = maybe_regenerate_checksum( - FHd, - machi_csum_table:find_leftneighbor(CsumTable, Offset)), - RUpdate = maybe_regenerate_checksum( - FHd, - machi_csum_table:find_rightneighbor(CsumTable, Offset+Size)), - - case machi_csum_table:trim(CsumTable, Offset, Size, LUpdate, RUpdate) of - ok -> - {NewEof, infinity} = lists:last(machi_csum_table:calc_unwritten_bytes(CsumTable)), - NewState = State#state{ops=Ops+1, - trims={T+1, Err}, - eof_position=NewEof}, - maybe_gc(ok, NewState); - Error -> - {reply, Error, State#state{ops=Ops+1, trims={T, Err+1}}} - end - end; - %% APPENDS handle_call({append, _ClientMeta, _Extra, _Data}, _From, @@ -534,20 +476,10 @@ terminate(Reason, #state{filename = F, lager:info(" Reads: ~p/~p", [RT, RE]), lager:info(" Writes: ~p/~p", [WT, WE]), lager:info("Appends: ~p/~p", [AT, AE]), - case FHd of - undefined -> - noop; %% file deleted - _ -> - ok = file:sync(FHd), - ok = file:close(FHd) - end, - case T of - undefined -> - noop; %% file deleted - _ -> - ok = machi_csum_table:sync(T), - ok = machi_csum_table:close(T) - end, + ok = file:sync(FHd), + ok = file:close(FHd), + ok = machi_csum_table:sync(T), + ok = machi_csum_table:close(T), ok. % @private @@ -577,26 +509,18 @@ check_or_make_tagged_csum(Tag, InCsum, Data) when Tag == ?CSUM_TAG_CLIENT_SHA; false -> {error, {bad_csum, Csum}} end; -check_or_make_tagged_csum(?CSUM_TAG_SERVER_REGEN_SHA, - InCsum, Data) -> - Csum = machi_util:checksum_chunk(Data), - case Csum =:= InCsum of - true -> - machi_util:make_tagged_csum(server_regen_sha, Csum); - false -> - {error, {bad_csum, Csum}} - end; check_or_make_tagged_csum(OtherTag, _ClientCsum, _Data) -> lager:warning("Unknown checksum tag ~p", [OtherTag]), {error, bad_checksum}. 
- + -spec do_read(FHd :: file:io_device(), Filename :: string(), CsumTable :: machi_csum_table:table(), Offset :: non_neg_integer(), Size :: non_neg_integer(), + NoChecksum :: boolean(), NoChunk :: boolean(), - NoChecksum :: boolean() + NeedsTrimmed :: boolean() ) -> {ok, Chunks :: [{string(), Offset::non_neg_integer(), binary(), Csum :: binary()}]} | {error, bad_checksum} | {error, partial_read} | @@ -615,23 +539,23 @@ check_or_make_tagged_csum(OtherTag, _ClientCsum, _Data) -> % tuple is returned. % % -do_read(FHd, Filename, CsumTable, Offset, Size, _, _) -> +do_read(FHd, Filename, CsumTable, Offset, Size, _, _, _) -> + do_read(FHd, Filename, CsumTable, Offset, Size). + +do_read(FHd, Filename, CsumTable, Offset, Size) -> %% Note that find/3 only returns overlapping chunks, both borders %% are not aligned to original Offset and Size. ChunkCsums = machi_csum_table:find(CsumTable, Offset, Size), - read_all_ranges(FHd, Filename, ChunkCsums, [], []). + read_all_ranges(FHd, Filename, ChunkCsums, []). -read_all_ranges(_, _, [], ReadChunks, TrimmedChunks) -> +read_all_ranges(_, _, [], ReadChunks) -> %% TODO: currently returns empty list of trimmed chunks - {ok, {lists:reverse(ReadChunks), lists:reverse(TrimmedChunks)}}; + {ok, {lists:reverse(ReadChunks), []}}; -read_all_ranges(FHd, Filename, [{Offset, Size, trimmed}|T], ReadChunks, TrimmedChunks) -> - read_all_ranges(FHd, Filename, T, ReadChunks, [{Filename, Offset, Size}|TrimmedChunks]); - -read_all_ranges(FHd, Filename, [{Offset, Size, TaggedCsum}|T], ReadChunks, TrimmedChunks) -> +read_all_ranges(FHd, Filename, [{Offset, Size, TaggedCsum}|T], ReadChunks) -> case file:pread(FHd, Offset, Size) of eof -> - read_all_ranges(FHd, Filename, T, ReadChunks, TrimmedChunks); + read_all_ranges(FHd, Filename, T, ReadChunks); {ok, Bytes} when byte_size(Bytes) == Size -> {Tag, Ck} = machi_util:unmake_tagged_csum(TaggedCsum), case check_or_make_tagged_csum(Tag, Ck, Bytes) of @@ -641,21 +565,19 @@ read_all_ranges(FHd, Filename, [{Offset, Size, TaggedCsum}|T], ReadChunks, Trimm {error, bad_checksum}; TaggedCsum -> read_all_ranges(FHd, Filename, T, - [{Filename, Offset, Bytes, TaggedCsum}|ReadChunks], - TrimmedChunks); + [{Filename, Offset, Bytes, TaggedCsum}|ReadChunks]); OtherCsum when Tag =:= ?CSUM_TAG_NONE -> %% XXX FIXME: Should we return something other than %% {ok, ....} in this case? read_all_ranges(FHd, Filename, T, - [{Filename, Offset, Bytes, OtherCsum}|ReadChunks], - TrimmedChunks) + [{Filename, Offset, Bytes, OtherCsum}|ReadChunks]) end; {ok, Partial} -> - lager:error("In file ~p, offset ~p, wanted to read ~p bytes, but got ~p", + lager:error("In file ~p, offset ~p, wanted to read ~p bytes, but got ~p", [Filename, Offset, Size, byte_size(Partial)]), {error, partial_read}; Other -> - lager:error("While reading file ~p, offset ~p, length ~p, got ~p", + lager:error("While reading file ~p, offset ~p, length ~p, got ~p", [Filename, Offset, Size, Other]), {error, Other} end. 
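read_all_ranges above verifies each chunk against its stored tagged checksum before returning it. Below is a stripped-down, illustrative version of that per-chunk check, assuming (for the sketch only) that the stored value unpacks to a plain SHA-1 digest of the chunk bytes; the real code goes through machi_util:unmake_tagged_csum/1 and check_or_make_tagged_csum/3.

    %% Illustrative sketch only: recompute the digest and compare.
    verify_chunk_sketch(Bytes, ExpectedSha) ->
        case crypto:hash(sha, Bytes) of
            ExpectedSha -> ok;
            _Other      -> {error, bad_checksum}
        end.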
@@ -682,7 +604,6 @@ read_all_ranges(FHd, Filename, [{Offset, Size, TaggedCsum}|T], ReadChunks, Trimm % caller as `ok' handle_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Data) -> Size = iolist_size(Data), - case machi_csum_table:find(CsumTable, Offset, Size) of [] -> %% Nothing should be there try @@ -695,7 +616,7 @@ handle_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Data) -> {error, Reason} end; [{Offset, Size, TaggedCsum}] -> - case do_read(FHd, Filename, CsumTable, Offset, Size, false, false) of + case do_read(FHd, Filename, CsumTable, Offset, Size, false, false, false) of {error, _} = E -> lager:warning("This should never happen: got ~p while reading" " at offset ~p in file ~p that's supposedly written", @@ -772,11 +693,9 @@ do_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Size, Data) -> %% Dialyzer 'can never match': slice_both_side([], _, _) -> %% []; -slice_both_side([], _, _) -> - []; slice_both_side([{F, Offset, Chunk, _Csum}|L], LeftPos, RightPos) when Offset < LeftPos andalso LeftPos < RightPos -> - TrashLen = (LeftPos - Offset), + TrashLen = 8 * (LeftPos - Offset), <<_:TrashLen/binary, NewChunk/binary>> = Chunk, NewChecksum = machi_util:make_tagged_csum(?CSUM_TAG_SERVER_REGEN_SHA_ATOM, Chunk), NewH = {F, LeftPos, NewChunk, NewChecksum}, @@ -801,7 +720,7 @@ maybe_regenerate_checksum(_, {_, _, trimmed} = Change) -> maybe_regenerate_checksum(FHd, {Offset, Size, _Csum}) -> case file:pread(FHd, Offset, Size) of eof -> - error({eof, Offset, Size}); + error(eof); {ok, Bytes} when byte_size(Bytes) =:= Size -> TaggedCsum = machi_util:make_tagged_csum(server_regen_sha, @@ -810,43 +729,3 @@ maybe_regenerate_checksum(FHd, {Offset, Size, _Csum}) -> Error -> throw(Error) end. - -%% GC: make sure unwritten bytes = [{Eof, infinity}] and Eof is > max -%% file size walk through the checksum table and make sure all chunks -%% trimmed Then unlink the file --spec maybe_gc(term(), #state{}) -> - {reply, term(), #state{}} | {stop, normal, term(), #state{}}. -maybe_gc(Reply, S = #state{eof_position = Eof, - max_file_size = MaxFileSize}) when Eof < MaxFileSize -> - lager:debug("The file is still small; not trying GC (Eof, MaxFileSize) = (~p, ~p)~n", - [Eof, MaxFileSize]), - {reply, Reply, S}; -maybe_gc(Reply, S = #state{fluname=FluName, - data_filehandle = FHd, - data_dir = DataDir, - filename = Filename, - eof_position = Eof, - csum_table=CsumTable}) -> - case machi_csum_table:all_trimmed(CsumTable, Eof) of - true -> - lager:debug("GC? Let's do it: ~p.~n", [Filename]), - %% Before unlinking a file, it should inform - %% machi_flu_filename_mgr that this file is - %% deleted and mark it as "trimmed" to avoid - %% filename reuse and resurrection. Maybe garbage - %% will remain if a process crashed but it also - %% should be recovered at filename_mgr startup. - - %% Also, this should be informed *before* file proxy - %% deletes files. - ok = machi_flu_metadata_mgr:trim_file(FluName, {file, Filename}), - ok = file:close(FHd), - {_, DPath} = machi_util:make_data_filename(DataDir, Filename), - ok = file:delete(DPath), - machi_csum_table:delete(CsumTable), - {stop, normal, Reply, - S#state{data_filehandle=undefined, - csum_table=undefined}}; - false -> - {reply, Reply, S} - end. diff --git a/src/machi_file_proxy_sup.erl b/src/machi_file_proxy_sup.erl index a165a68..dbb0fa6 100644 --- a/src/machi_file_proxy_sup.erl +++ b/src/machi_file_proxy_sup.erl @@ -44,8 +44,7 @@ start_link(FluName) -> supervisor:start_link({local, make_proxy_name(FluName)}, ?MODULE, []). 
start_proxy(FluName, DataDir, Filename) -> - supervisor:start_child(make_proxy_name(FluName), - [FluName, Filename, DataDir]). + supervisor:start_child(make_proxy_name(FluName), [Filename, DataDir]). init([]) -> SupFlags = {simple_one_for_one, 1000, 10}, diff --git a/src/machi_flu1.erl b/src/machi_flu1.erl index 042eaed..cb269f2 100644 --- a/src/machi_flu1.erl +++ b/src/machi_flu1.erl @@ -75,7 +75,6 @@ epoch_id :: 'undefined' | machi_dt:epoch_id(), pb_mode = undefined :: 'undefined' | 'high' | 'low', high_clnt :: 'undefined' | pid(), - trim_table :: ets:tid(), props = [] :: list() % proplist }). @@ -149,7 +148,6 @@ main2(FluName, TcpPort, DataDir, Props) -> {true, undefined} end, Witness_p = proplists:get_value(witness_mode, Props, false), - S0 = #state{flu_name=FluName, proj_store=ProjectionPid, tcp_port=TcpPort, @@ -411,11 +409,8 @@ do_pb_ll_request3({low_write_chunk, _EpochID, File, Offset, Chunk, CSum_tag, #state{witness=false}=S) -> {do_server_write_chunk(File, Offset, Chunk, CSum_tag, CSum, S), S}; do_pb_ll_request3({low_read_chunk, _EpochID, File, Offset, Size, Opts}, - #state{witness=false} = S) -> - {do_server_read_chunk(File, Offset, Size, Opts, S), S}; -do_pb_ll_request3({low_trim_chunk, _EpochID, File, Offset, Size, TriggerGC}, #state{witness=false}=S) -> - {do_server_trim_chunk(File, Offset, Size, TriggerGC, S), S}; + {do_server_read_chunk(File, Offset, Size, Opts, S), S}; do_pb_ll_request3({low_checksum_list, _EpochID, File}, #state{witness=false}=S) -> {do_server_checksum_listing(File, S), S}; @@ -546,47 +541,21 @@ do_server_append_chunk2(_PKey, Prefix, Chunk, CSum_tag, Client_CSum, do_server_write_chunk(File, Offset, Chunk, CSum_tag, CSum, #state{flu_name=FluName}) -> case sanitize_file_string(File) of ok -> - case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, File}) of - {ok, Pid} -> - Meta = [{client_csum_tag, CSum_tag}, {client_csum, CSum}], - machi_file_proxy:write(Pid, Offset, Meta, Chunk); - {error, trimmed} = Error -> - Error - end; + {ok, Pid} = machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, File}), + Meta = [{client_csum_tag, CSum_tag}, {client_csum, CSum}], + machi_file_proxy:write(Pid, Offset, Meta, Chunk); _ -> {error, bad_arg} end. do_server_read_chunk(File, Offset, Size, Opts, #state{flu_name=FluName})-> + %% TODO: Look inside Opts someday. case sanitize_file_string(File) of ok -> - case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, File}) of - {ok, Pid} -> - case machi_file_proxy:read(Pid, Offset, Size, Opts) of - %% XXX FIXME - %% For now we are omiting the checksum data because it blows up - %% protobufs. - {ok, ChunksAndTrimmed} -> {ok, ChunksAndTrimmed}; - Other -> Other - end; - {error, trimmed} = Error -> - Error - end; - _ -> - {error, bad_arg} - end. - -do_server_trim_chunk(File, Offset, Size, TriggerGC, #state{flu_name=FluName}) -> - lager:debug("Hi there! 
I'm trimming this: ~s, (~p, ~p), ~p~n", - [File, Offset, Size, TriggerGC]), - case sanitize_file_string(File) of - ok -> - case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, File}) of - {ok, Pid} -> - machi_file_proxy:trim(Pid, Offset, Size, TriggerGC); - {error, trimmed} = Trimmed -> - %% Should be returned back to (maybe) trigger repair - Trimmed + {ok, Pid} = machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, File}), + case machi_file_proxy:read(Pid, Offset, Size, Opts) of + {ok, ChunksAndTrimmed} -> {ok, ChunksAndTrimmed}; + Other -> Other end; _ -> {error, bad_arg} @@ -693,14 +662,10 @@ handle_append(Prefix, Chunk, Csum, Extra, FluName, EpochId) -> Res = machi_flu_filename_mgr:find_or_make_filename_from_prefix(FluName, EpochId, {prefix, Prefix}), case Res of {file, F} -> - case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, F}) of - {ok, Pid} -> - {Tag, CS} = machi_util:unmake_tagged_csum(Csum), - Meta = [{client_csum_tag, Tag}, {client_csum, CS}], - machi_file_proxy:append(Pid, Meta, Extra, Chunk); - {error, trimmed} = E -> - E - end; + {ok, Pid} = machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, F}), + {Tag, CS} = machi_util:unmake_tagged_csum(Csum), + Meta = [{client_csum_tag, Tag}, {client_csum, CS}], + machi_file_proxy:append(Pid, Meta, Extra, Chunk); Error -> Error end. diff --git a/src/machi_flu1_client.erl b/src/machi_flu1_client.erl index 119e154..da90618 100644 --- a/src/machi_flu1_client.erl +++ b/src/machi_flu1_client.erl @@ -80,7 +80,6 @@ %% For "internal" replication only. -export([ write_chunk/5, write_chunk/6, - trim_chunk/5, delete_migration/3, delete_migration/4, trunc_hack/3, trunc_hack/4 ]). @@ -475,21 +474,6 @@ write_chunk(Host, TcpPort, EpochID, File, Offset, Chunk) disconnect(Sock) end. -%% @doc Restricted API: Write a chunk of already-sequenced data to -%% `File' at `Offset'. - --spec trim_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size()) -> - ok | {error, machi_dt:error_general()} | {error, term()}. -trim_chunk(Sock, EpochID, File0, Offset, Size) - when Offset >= ?MINIMUM_OFFSET -> - ReqID = <<"id">>, - File = machi_util:make_binary(File0), - true = (Offset >= ?MINIMUM_OFFSET), - Req = machi_pb_translate:to_pb_request( - ReqID, - {low_trim_chunk, EpochID, File, Offset, Size, 0}), - do_pb_request_common(Sock, ReqID, Req). - %% @doc Restricted API: Delete a file after it has been successfully %% migrated. diff --git a/src/machi_flu_filename_mgr.erl b/src/machi_flu_filename_mgr.erl index 7e8bb9d..ac505a9 100644 --- a/src/machi_flu_filename_mgr.erl +++ b/src/machi_flu_filename_mgr.erl @@ -17,8 +17,8 @@ %% under the License. %% %% ------------------------------------------------------------------- -%% -%% @doc This process is responsible for managing filenames assigned to +%% +%% @doc This process is responsible for managing filenames assigned to %% prefixes. It's started out of `machi_flu_psup'. %% %% Supported operations include finding the "current" filename assigned to @@ -32,7 +32,7 @@ %% First it looks up the sequence number from the prefix name. If %% no sequence file is found, it uses 0 as the sequence number and searches %% for a matching file with the prefix and 0 as the sequence number. -%% If no file is found, the it generates a new filename by incorporating +%% If no file is found, the it generates a new filename by incorporating %% the given prefix, a randomly generated (v4) UUID and 0 as the %% sequence number. 
%% @@ -79,7 +79,7 @@ child_spec(FluName, DataDir) -> Name = make_filename_mgr_name(FluName), - {Name, + {Name, {?MODULE, start_link, [FluName, DataDir]}, permanent, 5000, worker, [?MODULE]}. @@ -87,8 +87,8 @@ start_link(FluName, DataDir) when is_atom(FluName) andalso is_list(DataDir) -> N = make_filename_mgr_name(FluName), gen_server:start_link({local, N}, ?MODULE, [FluName, DataDir], []). --spec find_or_make_filename_from_prefix( FluName :: atom(), - EpochId :: pv1_epoch_n(), +-spec find_or_make_filename_from_prefix( FluName :: atom(), + EpochId :: pv1_epoch_n(), Prefix :: {prefix, string()} ) -> {file, Filename :: string()} | {error, Reason :: term() } | timeout. % @doc Find the latest available or make a filename from a prefix. A prefix @@ -96,7 +96,7 @@ start_link(FluName, DataDir) when is_atom(FluName) andalso is_list(DataDir) -> % tuple in the form of `{file, F}' or an `{error, Reason}' find_or_make_filename_from_prefix(FluName, EpochId, {prefix, Prefix}) when is_atom(FluName) -> N = make_filename_mgr_name(FluName), - gen_server:call(N, {find_filename, EpochId, Prefix}, ?TIMEOUT); + gen_server:call(N, {find_filename, EpochId, Prefix}, ?TIMEOUT); find_or_make_filename_from_prefix(_FluName, _EpochId, Other) -> lager:error("~p is not a valid prefix.", [Other]), error(badarg). @@ -104,9 +104,9 @@ find_or_make_filename_from_prefix(_FluName, _EpochId, Other) -> -spec increment_prefix_sequence( FluName :: atom(), Prefix :: {prefix, string()} ) -> ok | {error, Reason :: term() } | timeout. % @doc Increment the sequence counter for a given prefix. Prefix should -% be in the form of `{prefix, P}'. +% be in the form of `{prefix, P}'. increment_prefix_sequence(FluName, {prefix, Prefix}) when is_atom(FluName) -> - gen_server:call(make_filename_mgr_name(FluName), {increment_sequence, Prefix}, ?TIMEOUT); + gen_server:call(make_filename_mgr_name(FluName), {increment_sequence, Prefix}, ?TIMEOUT); increment_prefix_sequence(_FluName, Other) -> lager:error("~p is not a valid prefix.", [Other]), error(badarg). @@ -117,7 +117,7 @@ increment_prefix_sequence(_FluName, Other) -> % all the data files associated with that prefix. Returns % a list. list_files_by_prefix(FluName, {prefix, Prefix}) when is_atom(FluName) -> - gen_server:call(make_filename_mgr_name(FluName), {list_files, Prefix}, ?TIMEOUT); + gen_server:call(make_filename_mgr_name(FluName), {list_files, Prefix}, ?TIMEOUT); list_files_by_prefix(_FluName, Other) -> lager:error("~p is not a valid prefix.", [Other]), error(badarg). @@ -125,10 +125,7 @@ list_files_by_prefix(_FluName, Other) -> %% gen_server API init([FluName, DataDir]) -> Tid = ets:new(make_filename_mgr_name(FluName), [named_table, {read_concurrency, true}]), - {ok, #state{fluname = FluName, - epoch = 0, - datadir = DataDir, - tid = Tid}}. + {ok, #state{ fluname = FluName, epoch = 0, datadir = DataDir, tid = Tid }}. handle_cast(Req, State) -> lager:warning("Got unknown cast ~p", [Req]), @@ -138,9 +135,9 @@ handle_cast(Req, State) -> %% the FLU has already validated that the caller's epoch id and the FLU's epoch id %% are the same. So we *assume* that remains the case here - that is to say, we %% are not wedged. -handle_call({find_filename, EpochId, Prefix}, _From, S = #state{ datadir = DataDir, - epoch = EpochId, - tid = Tid}) -> +handle_call({find_filename, EpochId, Prefix}, _From, S = #state{ datadir = DataDir, + epoch = EpochId, + tid = Tid }) -> %% Our state and the caller's epoch ids are the same. Business as usual. 
File = handle_find_file(Tid, Prefix, DataDir), {reply, {file, File}, S}; @@ -157,7 +154,7 @@ handle_call({increment_sequence, Prefix}, _From, S = #state{ datadir = DataDir } ok = machi_util:increment_max_filenum(DataDir, Prefix), {reply, ok, S}; handle_call({list_files, Prefix}, From, S = #state{ datadir = DataDir }) -> - spawn(fun() -> + spawn(fun() -> L = list_files(DataDir, Prefix), gen_server:reply(From, L) end), @@ -184,7 +181,7 @@ code_change(_OldVsn, State, _Extra) -> %% MIT License generate_uuid_v4_str() -> <> = crypto:strong_rand_bytes(16), - io_lib:format("~8.16.0b-~4.16.0b-4~3.16.0b-~4.16.0b-~12.16.0b", + io_lib:format("~8.16.0b-~4.16.0b-4~3.16.0b-~4.16.0b-~12.16.0b", [A, B, C band 16#0fff, D band 16#3fff bor 16#8000, E]). find_file(DataDir, Prefix, N) -> @@ -204,8 +201,8 @@ handle_find_file(Tid, Prefix, DataDir) -> [] -> {find_or_make_filename(Tid, DataDir, Prefix, N), false}; [H] -> {H, true}; - [Fn | _ ] = L -> - lager:debug( + [Fn | _ ] = L -> + lager:warning( "Searching for a matching file to prefix ~p and sequence number ~p gave multiples: ~p", [Prefix, N, L]), {Fn, true} @@ -248,3 +245,4 @@ increment_and_cache_filename(Tid, DataDir, Prefix) -> -ifdef(TEST). -endif. + diff --git a/src/machi_flu_metadata_mgr.erl b/src/machi_flu_metadata_mgr.erl index d4447ae..c851f84 100644 --- a/src/machi_flu_metadata_mgr.erl +++ b/src/machi_flu_metadata_mgr.erl @@ -39,13 +39,10 @@ -define(HASH(X), erlang:phash2(X)). %% hash algorithm to use -define(TIMEOUT, 10 * 1000). %% 10 second timeout --define(KNOWN_FILES_LIST_PREFIX, "known_files_"). - -record(state, {fluname :: atom(), datadir :: string(), tid :: ets:tid(), - cnt :: non_neg_integer(), - trimmed_files :: machi_plist:plist() + cnt :: non_neg_integer() }). %% This record goes in the ets table where filename is the key @@ -62,8 +59,7 @@ lookup_proxy_pid/2, start_proxy_pid/2, stop_proxy_pid/2, - build_metadata_mgr_name/2, - trim_file/2 + build_metadata_mgr_name/2 ]). %% gen_server callbacks @@ -101,24 +97,10 @@ start_proxy_pid(FluName, {file, Filename}) -> stop_proxy_pid(FluName, {file, Filename}) -> gen_server:call(get_manager_atom(FluName, Filename), {stop_proxy_pid, Filename}, ?TIMEOUT). -trim_file(FluName, {file, Filename}) -> - gen_server:call(get_manager_atom(FluName, Filename), {trim_file, Filename}, ?TIMEOUT). - %% gen_server callbacks init([FluName, Name, DataDir, Num]) -> - %% important: we'll need another persistent storage to - %% remember deleted (trimmed) file, to prevent resurrection after - %% flu restart and append. - FileListFileName = - filename:join([DataDir, ?KNOWN_FILES_LIST_PREFIX ++ atom_to_list(FluName)]), - {ok, PList} = machi_plist:open(FileListFileName, []), - %% TODO make sure all files non-existent, if any remaining files - %% here, just delete it. They're in the list *because* they're all - %% trimmed. - Tid = ets:new(Name, [{keypos, 2}, {read_concurrency, true}, {write_concurrency, true}]), - {ok, #state{fluname = FluName, datadir = DataDir, tid = Tid, cnt = Num, - trimmed_files=PList}}. + {ok, #state{ fluname = FluName, datadir = DataDir, tid = Tid, cnt = Num}}. 
handle_cast(Req, State) -> lager:warning("Got unknown cast ~p", [Req]), @@ -131,25 +113,17 @@ handle_call({proxy_pid, Filename}, _From, State = #state{ tid = Tid }) -> end, {reply, Reply, State}; -handle_call({start_proxy_pid, Filename}, _From, - State = #state{ fluname = N, tid = Tid, datadir = D, - trimmed_files=TrimmedFiles}) -> - case machi_plist:find(TrimmedFiles, Filename) of - false -> - NewR = case lookup_md(Tid, Filename) of - not_found -> - start_file_proxy(N, D, Filename); - #md{ proxy_pid = undefined } = R0 -> - start_file_proxy(N, D, R0); - #md{ proxy_pid = _Pid } = R1 -> - R1 - end, - update_ets(Tid, NewR), - {reply, {ok, NewR#md.proxy_pid}, State}; - true -> - {reply, {error, trimmed}, State} - end; - +handle_call({start_proxy_pid, Filename}, _From, State = #state{ fluname = N, tid = Tid, datadir = D }) -> + NewR = case lookup_md(Tid, Filename) of + not_found -> + start_file_proxy(N, D, Filename); + #md{ proxy_pid = undefined } = R0 -> + start_file_proxy(N, D, R0); + #md{ proxy_pid = _Pid } = R1 -> + R1 + end, + update_ets(Tid, NewR), + {reply, {ok, NewR#md.proxy_pid}, State}; handle_call({stop_proxy_pid, Filename}, _From, State = #state{ tid = Tid }) -> case lookup_md(Tid, Filename) of not_found -> @@ -163,15 +137,6 @@ handle_call({stop_proxy_pid, Filename}, _From, State = #state{ tid = Tid }) -> end, {reply, ok, State}; -handle_call({trim_file, Filename}, _, - S = #state{trimmed_files = TrimmedFiles }) -> - case machi_plist:add(TrimmedFiles, Filename) of - {ok, TrimmedFiles2} -> - {reply, ok, S#state{trimmed_files=TrimmedFiles2}}; - Error -> - {reply, Error, S} - end; - handle_call(Req, From, State) -> lager:warning("Got unknown call ~p from ~p", [Req, From]), {reply, hoge, State}. @@ -204,21 +169,18 @@ handle_info({'DOWN', Mref, process, Pid, wedged}, State = #state{ tid = Tid }) - lager:error("file proxy ~p shutdown because it's wedged", [Pid]), clear_ets(Tid, Mref), {noreply, State}; -handle_info({'DOWN', _Mref, process, Pid, trimmed}, State = #state{ tid = _Tid }) -> - lager:debug("file proxy ~p shutdown because the file was trimmed", [Pid]), - {noreply, State}; handle_info({'DOWN', Mref, process, Pid, Error}, State = #state{ tid = Tid }) -> lager:error("file proxy ~p shutdown because ~p", [Pid, Error]), clear_ets(Tid, Mref), {noreply, State}; + handle_info(Info, State) -> lager:warning("Got unknown info ~p", [Info]), {noreply, State}. -terminate(Reason, _State = #state{trimmed_files=TrimmedFiles}) -> +terminate(Reason, _State) -> lager:info("Shutting down because ~p", [Reason]), - machi_plist:close(TrimmedFiles), ok. code_change(_OldVsn, State, _Extra) -> diff --git a/src/machi_merkle_tree.erl b/src/machi_merkle_tree.erl new file mode 100644 index 0000000..effcb67 --- /dev/null +++ b/src/machi_merkle_tree.erl @@ -0,0 +1,156 @@ +%% ------------------------------------------------------------------- +%% +%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved. +%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. 
+%% +%% ------------------------------------------------------------------- + +%% @doc Creates a Merkle tree per file based on the checksum data for +%% a given data file. +%% +%% The `naive' implementation representation is: +%% +%% `<>' for unwritten bytes +%% `<>' for trimmed bytes +%% `<>' for written bytes +%% +%% The tree feeds these leaf nodes into hashes representing chunks of a minimum +%% size of at least 1024 KB (1 MB), but if the file size is larger, we will try +%% to get about 100 chunks for the first rollup "Level 1." We aim for around 10 +%% hashes at level 2, and then 2 hashes level 3 and finally the root. + +-module(machi_merkle_tree). + +-include("machi.hrl"). +-include("machi_merkle_tree.hrl"). + +-ifdef(TEST). +-compile(export_all). +-else. +-export([ + open/2, + open/3, + tree/1, + filename/1, + diff/2 +]). +-endif. + +-define(TRIMMED, <<1>>). +-define(UNWRITTEN, <<0>>). +-define(NAIVE_ENCODE(Offset, Size, Data), <>). + +-define(MINIMUM_CHUNK, 1048576). %% 1024 * 1024 +-define(LEVEL_SIZE, 10). +-define(H, sha). + +%% public API + +open(Filename, DataDir) -> + open(Filename, DataDir, naive). + +open(Filename, DataDir, Type) -> + Tree = load_filename(Filename, DataDir, Type), + {ok, #mt{ filename = Filename, tree = Tree, backend = Type}}. + +tree(#mt{ tree = T, backend = naive }) -> + case T#naive.recalc of + true -> build_tree(T); + false -> T + end. + +filename(#mt{ filename = F }) -> F. + +diff(#mt{backend = naive, tree = T1}, #mt{backend = naive, tree = T2}) -> + case T1#naive.root == T2#naive.root of + true -> same; + false -> naive_diff(T1, T2) + end; +diff(_, _) -> error(badarg). + +%% private + +% @private +load_filename(Filename, DataDir, naive) -> + {Last, M} = do_load(Filename, DataDir, fun insert_csum_naive/2, []), + ChunkSize = max(?MINIMUM_CHUNK, Last div 100), + T = #naive{ leaves = lists:reverse(M), chunk_size = ChunkSize, recalc = true }, + build_tree(T). + +do_load(Filename, DataDir, FoldFun, AccInit) -> + CsumFile = machi_util:make_checksum_filename(DataDir, Filename), + {ok, T} = machi_csum_table:open(CsumFile, []), + Acc = machi_csum_table:foldl_chunks(FoldFun, {0, AccInit}, T), + ok = machi_csum_table:close(T), + Acc. + +% @private +insert_csum_naive({Last, Size, _Csum}=In, {Last, MT}) -> + %% no gap + {Last+Size, update_acc(In, MT)}; +insert_csum_naive({Offset, Size, _Csum}=In, {Last, MT}) -> + Hole = Offset - Last, + MT0 = update_acc({Last, Hole, unwritten}, MT), + {Offset+Size, update_acc(In, MT0)}. + +% @private +update_acc({Offset, Size, unwritten}, MT) -> + [ {Offset, Size, ?NAIVE_ENCODE(Offset, Size, ?UNWRITTEN)} | MT ]; +update_acc({Offset, Size, trimmed}, MT) -> + [ {Offset, Size, ?NAIVE_ENCODE(Offset, Size, ?TRIMMED)} | MT ]; +update_acc({Offset, Size, <<_Tag:8, Csum/binary>>}, MT) -> + [ {Offset, Size, ?NAIVE_ENCODE(Offset, Size, Csum)} | MT ]. + +build_tree(MT = #naive{ leaves = L, chunk_size = ChunkSize }) -> + Lvl1s = build_level_1(ChunkSize, L, 1, [ crypto:hash_init(?H) ]), + Mod2 = length(Lvl1s) div ?LEVEL_SIZE, + Lvl2s = build_int_level(Mod2, Lvl1s, 1, [ crypto:hash_init(?H) ]), + Mod3 = length(Lvl2s) div 2, + Lvl3s = build_int_level(Mod3, Lvl2s, 1, [ crypto:hash_init(?H) ]), + Root = build_root(Lvl3s, crypto:hash_init(?H)), + MT#naive{ root = Root, lvl1 = Lvl1s, lvl2 = Lvl2s, lvl3 = Lvl3s, recalc = false }. + +build_root([], Ctx) -> + crypto:hash_final(Ctx); +build_root([H|T], Ctx) -> + build_root(T, crypto:hash_update(Ctx, H)). 
+ +build_int_level(_Mod, [], _Cnt, [ Ctx | Rest ]) -> + lists:reverse( [ crypto:hash_final(Ctx) | Rest ] ); +build_int_level(Mod, [H|T], Cnt, [ Ctx | Rest ]) when Cnt rem Mod == 0 -> + NewCtx = crypto:hash_init(?H), + build_int_level(Mod, T, Cnt + 1, [ crypto:hash_update(NewCtx, H), crypto:hash_final(Ctx) | Rest ]); +build_int_level(Mod, [H|T], Cnt, [ Ctx | Rest ]) -> + build_int_level(Mod, T, Cnt+1, [ crypto:hash_update(Ctx, H) | Rest ]). + +build_level_1(_Size, [], _Multiple, [ Ctx | Rest ]) -> + lists:reverse([ crypto:hash_final(Ctx) | Rest ]); +build_level_1(Size, [{Pos, Len, Hash}|T], Multiple, [ Ctx | Rest ]) + when ( Pos + Len ) > ( Size * Multiple ) -> + NewCtx = crypto:hash_init(?H), + build_level_1(Size, T, Multiple+1, + [ crypto:hash_update(NewCtx, Hash), crypto:hash_final(Ctx) | Rest ]); +build_level_1(Size, [{Pos, Len, Hash}|T], Multiple, [ Ctx | Rest ]) + when ( Pos + Len ) =< ( Size * Multiple ) -> + build_level_1(Size, T, Multiple, [ crypto:hash_update(Ctx, Hash) | Rest ]). + +naive_diff(#naive{lvl1 = L1}, #naive{lvl1=L2, chunk_size=CS2}) -> + Set1 = gb_sets:from_list(lists:zip(lists:seq(1, length(L1)), L1)), + Set2 = gb_sets:from_list(lists:zip(lists:seq(1, length(L2)), L2)), + + %% The byte ranges in list 2 that do not match in list 1 + %% Or should we do something else? + [ {(X-1)*CS2, CS2, SHA} || {X, SHA} <- gb_sets:to_list(gb_sets:subtract(Set1, Set2)) ]. diff --git a/src/machi_pb_high_client.erl b/src/machi_pb_high_client.erl index ef1e740..5509803 100644 --- a/src/machi_pb_high_client.erl +++ b/src/machi_pb_high_client.erl @@ -58,110 +58,61 @@ count=0 :: non_neg_integer() }). -%% @doc official error types that is specific in Machi --type machi_client_error_reason() :: bad_arg | wedged | bad_checksum | - partition | not_written | written | - trimmed | no_such_file | partial_read | - bad_epoch | inet:posix(). - -%% @doc Creates a client process --spec start_link(p_srvr_dict()) -> {ok, pid()} | {error, machi_client_error_reason()}. start_link(P_srvr_list) -> gen_server:start_link(?MODULE, [P_srvr_list], []). -%% @doc Stops a client process. --spec quit(pid()) -> ok. quit(PidSpec) -> gen_server:call(PidSpec, quit, infinity). connected_p(PidSpec) -> gen_server:call(PidSpec, connected_p, infinity). --spec echo(pid(), string()) -> {ok, string()} | {error, machi_client_error_reason()}. echo(PidSpec, String) -> echo(PidSpec, String, ?DEFAULT_TIMEOUT). --spec echo(pid(), string(), non_neg_integer()) -> {ok, string()} | {error, machi_client_error_reason()}. echo(PidSpec, String, Timeout) -> send_sync(PidSpec, {echo, String}, Timeout). %% TODO: auth() is not implemented. Auth requires SSL, and this client %% doesn't support SSL yet. This is just a placeholder and reminder. --spec auth(pid(), string(), string()) -> ok | {error, machi_client_error_reason()}. auth(PidSpec, User, Pass) -> auth(PidSpec, User, Pass, ?DEFAULT_TIMEOUT). --spec auth(pid(), string(), string(), non_neg_integer()) -> ok | {error, machi_client_error_reason()}. auth(PidSpec, User, Pass, Timeout) -> send_sync(PidSpec, {auth, User, Pass}, Timeout). --spec append_chunk(pid(), PlacementKey::binary(), Prefix::binary(), Chunk::binary(), - CSum::binary(), ChunkExtra::non_neg_integer()) -> - {ok, Filename::string(), Offset::machi_dt:file_offset()} | - {error, machi_client_error_reason()}. append_chunk(PidSpec, PlacementKey, Prefix, Chunk, CSum, ChunkExtra) -> append_chunk(PidSpec, PlacementKey, Prefix, Chunk, CSum, ChunkExtra, ?DEFAULT_TIMEOUT). 
--spec append_chunk(pid(), PlacementKey::binary(), Prefix::binary(), - Chunk::binary(), CSum::binary(), - ChunkExtra::non_neg_integer(), - Timeout::non_neg_integer()) -> - {ok, Filename::string(), Offset::machi_dt:file_offset()} | - {error, machi_client_error_reason()}. append_chunk(PidSpec, PlacementKey, Prefix, Chunk, CSum, ChunkExtra, Timeout) -> send_sync(PidSpec, {append_chunk, PlacementKey, Prefix, Chunk, CSum, ChunkExtra}, Timeout). --spec write_chunk(pid(), File::string(), machi_dt:file_offset(), - Chunk::binary(), CSum::binary()) -> - ok | {error, machi_client_error_reason()}. write_chunk(PidSpec, File, Offset, Chunk, CSum) -> write_chunk(PidSpec, File, Offset, Chunk, CSum, ?DEFAULT_TIMEOUT). --spec write_chunk(pid(), File::string(), machi_dt:file_offset(), - Chunk::binary(), CSum::binary(), Timeout::non_neg_integer()) -> - ok | {error, machi_client_error_reason()}. write_chunk(PidSpec, File, Offset, Chunk, CSum, Timeout) -> send_sync(PidSpec, {write_chunk, File, Offset, Chunk, CSum}, Timeout). -%% @doc Tries to read a chunk of a specified file. It returns `{ok, -%% {Chunks, TrimmedChunks}}' for live file while it returns `{error, -%% trimmed}' if all bytes of the file was trimmed. --spec read_chunk(pid(), File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), +-spec read_chunk(pid(), string(), pos_integer(), pos_integer(), [{flag_no_checksum | flag_no_chunk | needs_trimmed, boolean()}]) -> - {ok, {Chunks::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), binary()}], - Trimmed::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size()}]}} | - {error, machi_client_error_reason()}. + {ok, {list(), list()}} | {error, term()}. read_chunk(PidSpec, File, Offset, Size, Options) -> read_chunk(PidSpec, File, Offset, Size, Options, ?DEFAULT_TIMEOUT). --spec read_chunk(pid(), File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), - [{flag_no_checksum | flag_no_chunk | needs_trimmed, boolean()}], - Timeout::non_neg_integer()) -> - {ok, {Chunks::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), binary()}], - Trimmed::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size()}]}} | - {error, machi_client_error_reason()}. +-spec read_chunk(pid(), string(), pos_integer(), pos_integer(), + [{no_checksum | no_chunk | needs_trimmed, boolean()}], + pos_integer()) -> + {ok, {list(), list()}} | {error, term()}. read_chunk(PidSpec, File, Offset, Size, Options, Timeout) -> send_sync(PidSpec, {read_chunk, File, Offset, Size, Options}, Timeout). -%% @doc Trims arbitrary binary range of any file. If a specified range -%% has any byte trimmed, it fails and returns `{error, trimmed}`. -%% Otherwise it trims all bytes in that range. If there are -%% overlapping chunks with client-specified checksum, they will cut -%% off and checksum are re-calculated in server side. TODO: Add -%% option specifying whether to trigger GC. --spec trim_chunk(pid(), string(), non_neg_integer(), machi_dt:chunk_size()) -> - ok | {error, machi_client_error_reason()}. trim_chunk(PidSpec, File, Offset, Size) -> trim_chunk(PidSpec, File, Offset, Size, ?DEFAULT_TIMEOUT). trim_chunk(PidSpec, File, Offset, Size, Timeout) -> send_sync(PidSpec, {trim_chunk, File, Offset, Size}, Timeout). -%% @doc Returns a binary that has checksums and chunks encoded inside -%% (This is because encoding-decoding them are inefficient). TODO: -%% return a structured list of them. --spec checksum_list(pid(), string()) -> {ok, binary()} | {error, machi_client_error_reason()}. 
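Although the detailed -spec lines are being dropped here, the calling conventions survive in machi_pb_high_client_test further down in this patch. A rough append-then-read sketch, not part of the patch; Clnt is assumed to be a pid from machi_pb_high_client:start_link/1, the placement key is whatever the caller already uses, and the return shapes follow the test's usage rather than the deleted specs.

    %% Hedged sketch of a round trip through the high-level PB client.
    append_then_read(Clnt, PlacementKey, Prefix, Chunk) ->
        CSum = {client_sha, machi_util:checksum_chunk(Chunk)},
        {ok, {Offset, Size, File}} =
            machi_pb_high_client:append_chunk(Clnt, PlacementKey, Prefix,
                                              Chunk, CSum, 0),
        {ok, {Chunks, _Trimmed}} =
            machi_pb_high_client:read_chunk(Clnt, File, Offset, Size, []),
        %% Each element of Chunks is a {File, Offset, Bytes, Checksum}-shaped tuple.
        Chunks.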
checksum_list(PidSpec, File) -> checksum_list(PidSpec, File, ?DEFAULT_TIMEOUT). @@ -464,8 +415,6 @@ convert_general_status_code('NOT_WRITTEN') -> {error, not_written}; convert_general_status_code('WRITTEN') -> {error, written}; -convert_general_status_code('TRIMMED') -> - {error, trimmed}; convert_general_status_code('NO_SUCH_FILE') -> {error, no_such_file}; convert_general_status_code('PARTIAL_READ') -> diff --git a/src/machi_pb_translate.erl b/src/machi_pb_translate.erl index 0b49908..cc26766 100644 --- a/src/machi_pb_translate.erl +++ b/src/machi_pb_translate.erl @@ -88,17 +88,6 @@ from_pb_request(#mpb_ll_request{ offset=Offset, chunk_size=Size} = ChunkPos, {ReqID, {low_read_chunk, EpochID, File, Offset, Size, Opts}}; -from_pb_request(#mpb_ll_request{ - req_id=ReqID, - trim_chunk=#mpb_ll_trimchunkreq{ - epoch_id=PB_EpochID, - file=File, - offset=Offset, - size=Size, - trigger_gc=PB_TriggerGC}}) -> - EpochID = conv_to_epoch_id(PB_EpochID), - TriggerGC = conv_to_boolean(PB_TriggerGC), - {ReqID, {low_trim_chunk, EpochID, File, Offset, Size, TriggerGC}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, checksum_list=#mpb_ll_checksumlistreq{ @@ -273,10 +262,6 @@ from_pb_response(#mpb_ll_response{ _ -> {ReqID, machi_pb_high_client:convert_general_status_code(Status)} end; -from_pb_response(#mpb_ll_response{ - req_id=ReqID, - trim_chunk=#mpb_ll_trimchunkresp{status=Status}}) -> - {ReqID, machi_pb_high_client:convert_general_status_code(Status)}; from_pb_response(#mpb_ll_response{ req_id=ReqID, checksum_list=#mpb_ll_checksumlistresp{ @@ -413,10 +398,11 @@ to_pb_request(ReqID, {low_write_chunk, EpochID, File, Offset, Chunk, CSum_tag, C chunk=Chunk, csum=PB_CSum}}}; to_pb_request(ReqID, {low_read_chunk, EpochID, File, Offset, Size, Opts}) -> + %% TODO: stop ignoring Opts ^_^ PB_EpochID = conv_from_epoch_id(EpochID), FNChecksum = proplists:get_value(no_checksum, Opts, false), FNChunk = proplists:get_value(no_chunk, Opts, false), - NeedsTrimmed = proplists:get_value(needs_trimmed, Opts, false), + NeedsTrimmed = proplists:get_value(needs_merge, Opts, false), #mpb_ll_request{ req_id=ReqID, do_not_alter=2, read_chunk=#mpb_ll_readchunkreq{ @@ -428,15 +414,6 @@ to_pb_request(ReqID, {low_read_chunk, EpochID, File, Offset, Size, Opts}) -> flag_no_checksum=machi_util:bool2int(FNChecksum), flag_no_chunk=machi_util:bool2int(FNChunk), flag_needs_trimmed=machi_util:bool2int(NeedsTrimmed)}}; -to_pb_request(ReqID, {low_trim_chunk, EpochID, File, Offset, Size, TriggerGC}) -> - PB_EpochID = conv_from_epoch_id(EpochID), - #mpb_ll_request{req_id=ReqID, do_not_alter=2, - trim_chunk=#mpb_ll_trimchunkreq{ - epoch_id=PB_EpochID, - file=File, - offset=Offset, - size=Size, - trigger_gc=TriggerGC}}; to_pb_request(ReqID, {low_checksum_list, EpochID, File}) -> PB_EpochID = conv_from_epoch_id(EpochID), #mpb_ll_request{req_id=ReqID, do_not_alter=2, @@ -547,18 +524,6 @@ to_pb_response(ReqID, {low_read_chunk, _EID, _Fl, _Off, _Sz, _Opts}, Resp)-> _Else -> make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) end; -to_pb_response(ReqID, {low_trim_chunk, _, _, _, _, _}, Resp) -> - case Resp of - ok -> - #mpb_ll_response{req_id=ReqID, - trim_chunk=#mpb_ll_trimchunkresp{status='OK'}}; - {error, _}=Error -> - Status = conv_from_status(Error), - #mpb_ll_response{req_id=ReqID, - read_chunk=#mpb_ll_trimchunkresp{status=Status}}; - _Else -> - make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) - end; to_pb_response(ReqID, {low_checksum_list, _EpochID, _File}, Resp) -> case Resp of {ok, Chunk} -> @@ -944,8 +909,6 @@ 
conv_from_status({error, not_written}) -> 'NOT_WRITTEN'; conv_from_status({error, written}) -> 'WRITTEN'; -conv_from_status({error, trimmed}) -> - 'TRIMMED'; conv_from_status({error, no_such_file}) -> 'NO_SUCH_FILE'; conv_from_status({error, partial_read}) -> diff --git a/src/machi_plist.erl b/src/machi_plist.erl deleted file mode 100644 index 7750b0a..0000000 --- a/src/machi_plist.erl +++ /dev/null @@ -1,69 +0,0 @@ --module(machi_plist). - -%%% @doc persistent list of binaries - --export([open/2, close/1, find/2, add/2]). - --ifdef(TEST). --export([all/1]). --endif. - --record(machi_plist, - {filename :: file:filename_all(), - fd :: file:io_device(), - list = [] :: list(string)}). - --type plist() :: #machi_plist{}. --export_type([plist/0]). - --spec open(file:filename_all(), proplists:proplist()) -> - {ok, plist()} | {error, file:posix()}. -open(Filename, _Opt) -> - %% TODO: This decode could fail if the file didn't finish writing - %% whole contents, which should be fixed by some persistent - %% solution. - List = case file:read_file(Filename) of - {ok, <<>>} -> []; - {ok, Bin} -> binary_to_term(Bin); - {error, enoent} -> [] - end, - case file:open(Filename, [read, write, raw, binary, sync]) of - {ok, Fd} -> - {ok, #machi_plist{filename=Filename, - fd=Fd, - list=List}}; - Error -> - Error - end. - --spec close(plist()) -> ok. -close(#machi_plist{fd=Fd}) -> - _ = file:close(Fd). - --spec find(plist(), string()) -> boolean(). -find(#machi_plist{list=List}, Name) -> - lists:member(Name, List). - --spec add(plist(), string()) -> {ok, plist()} | {error, file:posix()}. -add(Plist = #machi_plist{list=List0, fd=Fd}, Name) -> - case find(Plist, Name) of - true -> - {ok, Plist}; - false -> - List = lists:append(List0, [Name]), - %% TODO: partial write could break the file with other - %% persistent info (even lose data of trimmed states); - %% needs a solution. - case file:pwrite(Fd, 0, term_to_binary(List)) of - ok -> - {ok, Plist#machi_plist{list=List}}; - Error -> - Error - end - end. - --ifdef(TEST). --spec all(plist()) -> [file:filename()]. -all(#machi_plist{list=List}) -> - List. --endif. diff --git a/src/machi_proxy_flu1_client.erl b/src/machi_proxy_flu1_client.erl index 2cbaabd..93f3b95 100644 --- a/src/machi_proxy_flu1_client.erl +++ b/src/machi_proxy_flu1_client.erl @@ -79,7 +79,6 @@ %% Internal API write_chunk/5, write_chunk/6, - trim_chunk/5, trim_chunk/6, %% Helpers stop_proxies/1, start_proxies/1 @@ -311,18 +310,6 @@ write_chunk(PidSpec, EpochID, File, Offset, Chunk, Timeout) -> Else end. - -trim_chunk(PidSpec, EpochID, File, Offset, Size) -> - trim_chunk(PidSpec, EpochID, File, Offset, Size, infinity). - -%% @doc Write a chunk (binary- or iolist-style) of data to a file -%% with `Prefix' at `Offset'. - -trim_chunk(PidSpec, EpochID, File, Offset, Chunk, Timeout) -> - gen_server:call(PidSpec, - {req, {trim_chunk, EpochID, File, Offset, Chunk}}, - Timeout). 
- %%%%%%%%%%%%%%%%%%%%%%%%%%% init([I]) -> @@ -396,9 +383,6 @@ make_req_fun({read_chunk, EpochID, File, Offset, Size, Opts}, make_req_fun({write_chunk, EpochID, File, Offset, Chunk}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> fun() -> Mod:write_chunk(Sock, EpochID, File, Offset, Chunk) end; -make_req_fun({trim_chunk, EpochID, File, Offset, Size}, - #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> - fun() -> Mod:trim_chunk(Sock, EpochID, File, Offset, Size) end; make_req_fun({checksum_list, EpochID, File}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> fun() -> Mod:checksum_list(Sock, EpochID, File) end; diff --git a/test/machi_ap_repair_eqc.erl b/test/machi_ap_repair_eqc.erl deleted file mode 100644 index e9d576e..0000000 --- a/test/machi_ap_repair_eqc.erl +++ /dev/null @@ -1,605 +0,0 @@ -%% ------------------------------------------------------------------- -%% -%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved. -%% -%% This file is provided to you under the Apache License, -%% Version 2.0 (the "License"); you may not use this file -%% except in compliance with the License. You may obtain -%% a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, -%% software distributed under the License is distributed on an -%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%% KIND, either express or implied. See the License for the -%% specific language governing permissions and limitations -%% under the License. -%% -%% ------------------------------------------------------------------- - -%% EQC single-threaded and concurrent test for file operations and repair -%% under simulated network partition. - -%% The main purpose is to confirm no dataloss, i.e. every chunk that -%% has been successfully written (ACK received) by append/write -%% opration will be read after partition heals. -%% -%% All updating -- append, write and trim -- operations are executed -%% through CR client, not directly by flu1 client, in order to be -%% end-to-end test (in single chain point of veiw.) There may be churn -%% for projections by simulated network partition. -%% -%% Test steps -%% 1. Setup single chain. -%% 2. Execute updating operations and simulated partition (by eqc_statem). -%% Every updating results are recorded in ETS tables. -%% 3. When {error, timeout|partition} happens, trigger management tick for -%% every chain manager process. -%% 4. After commands are executed, remove patition and wait for the chain -%% without down nodes nor repairing nodes. -%% 5. Asserting written results so that each record be read from the -%% chain and data be the same with written one. - -%% Improvements to-do's -%% - Use higher concurrency, e.g. 10+ -%% - Random length for binary to write -%% - Operations other than append, write, trim -%% - Use checksum instead of binary to save memory -%% - More variety for partitioning pattern: non-constant failure -%% - Stop and restart -%% - Suspend and resume of some erlang processes - --module(machi_ap_repair_eqc). - --ifdef(TEST). --ifdef(EQC). --compile(export_all). --include("machi.hrl"). --include("machi_projection.hrl"). --include("machi_verbose.hrl"). --include_lib("eqc/include/eqc.hrl"). --include_lib("eqc/include/eqc_statem.hrl"). --include_lib("eunit/include/eunit.hrl"). - --record(target, {verbose=false, - flu_names, - mgr_names}). - --record(state, {num, - verbose=false, - flu_names, - mgr_names, - cr_count}). 
- -%% ETS table names --define(WRITTEN_TAB, written). % Successfully written data --define(ACCPT_TAB, accpt). % Errors with no harm, e.g. timeout --define(FAILED_TAB, failed). % Uncategorized errors, when happenes - % it should be re-categorized to accept or critical --define(CRITICAL_TAB, critical). % Critical errors, e.g. double write to the same key - --define(QC_OUT(P), - eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). - -%% EUNIT TEST DEFINITION -prop_repair_test_() -> - {PropTO, EUnitTO} = eqc_timeout(60), - Verbose = eqc_verbose(), - {spawn, - [{timeout, EUnitTO, - ?_assertEqual( - true, - eqc:quickcheck(eqc:testing_time( - PropTO, ?QC_OUT(noshrink(prop_repair(Verbose))))))}]}. - -prop_repair_par_test_() -> - {PropTO, EUnitTO} = eqc_timeout(60), - Verbose = eqc_verbose(), - {spawn, - [{timeout, EUnitTO, - ?_assertEqual( - true, - eqc:quickcheck(eqc:testing_time( - PropTO, ?QC_OUT(noshrink(prop_repair_par(Verbose))))))}]}. - -%% Model - -weight(_S, change_partition) -> 20; -weight(_S, _) -> 100. - -%% Append - -append_args(#state{cr_count=CRCount}=S) -> - [choose(1, CRCount), chunk(), S]. - -append(CRIndex, Bin, #state{verbose=V}=S) -> - CRList = cr_list(), - {_SimSelfName, C} = lists:nth(CRIndex, CRList), - Prefix = <<"pre">>, - Len = byte_size(Bin), - Res = (catch machi_cr_client:append_chunk(C, Prefix, Bin, {sec(1), sec(1)})), - case Res of - {ok, {_Off, Len, _FileName}=Key} -> - case ets:insert_new(?WRITTEN_TAB, {Key, Bin}) of - true -> - [?V("", []) || V], - ok; - false -> - %% The Key is alread written, WHY!!!???? - case ets:lookup(?WRITTEN_TAB, Key) of - [{Key, Bin}] -> - %% TODO: The identical binary is alread inserted in - %% written table. Is this acceptable??? Hmm, maybe NO... - [?V("", [Key]) || V], - true = ets:insert_new(?ACCPT_TAB, - {make_ref(), double_write_same, Key}), - {acceptable_error, doublewrite_the_same}; - [{Key, OtherBin}] -> - [?V("", [Key, {OtherBin, Bin}]) || V], - true = ets:insert_new(?CRITICAL_TAB, - {make_ref(), double_write_diff, Key}), - R = {critical_error, - {doublewrite_diff, Key, {OtherBin, Bin}}}, - %% TODO: when double write happens, it seems that - %% repair process got stack with endless loop. To - %% avoit it, return error here. - %% If this error/1 will be removed, one can possibly - %% know double write frequency/rate. - error(R) - end - end; - {error, partition} -> - [?V("", []) || V], - true = ets:insert_new(?ACCPT_TAB, {make_ref(), timeout}), - _ = tick(S), - {acceptable_error, partition}; - {'EXIT', {timeout, _}} -> - [?V("", [_SimSelfName, C]) || V], - true = ets:insert_new(?ACCPT_TAB, {make_ref(), timeout}), - _ = tick(S), - {acceptable_error, timeout}; - {ok, {_Off, UnexpectedLen, _FileName}=Key} -> - [?V("", []) || V], - true = ets:insert_new(?CRITICAL_TAB, {make_ref(), unexpected_len, Key}), - {critical_error, {unexpected_len, Key, Len, UnexpectedLen}}; - {error, _Reason} = Error -> - [?V("", []) || V], - true = ets:insert_new(?FAILED_TAB, {make_ref(), Error}), - {other_error, Error}; - Other -> - [?V("", []) || V], - true = ets:insert_new(?FAILED_TAB, {make_ref(), Other}), - {other_error, Other} - end. - -%% Change partition - -change_partition_args(#state{flu_names=FLUNames}=S) -> - %% [partition(FLUNames), S]. - [partition_sym(FLUNames), S]. 
- -change_partition(Partition, - #state{verbose=Verbose, flu_names=FLUNames}=S) -> - [case Partition of - [] -> ?V("## Turn OFF partition: ~w~n", [Partition]); - _ -> ?V("## Turn ON partition: ~w~n", [Partition]) - end || Verbose], - machi_partition_simulator:always_these_partitions(Partition), - _ = machi_partition_simulator:get(FLUNames), - %% Don't wait for stable chain, tick will be executed on demand - %% in append oprations - _ = tick(S), - ok. - -%% Generators - -num() -> - choose(2, 5). - -cr_count(Num) -> - Num * 3. - -%% Returns a list like -%% `[{#p_srvr{name=a, port=7501, ..}, "./eqc/data.eqc.a/"}, ...]' -all_list_extra(Num) -> - {PortBase, DirBase} = get_port_dir_base(), - [begin - FLUNameStr = [$a + I - 1], - FLUName = list_to_atom(FLUNameStr), - MgrName = machi_flu_psup:make_mgr_supname(FLUName), - {#p_srvr{name=FLUName, address="localhost", port=PortBase+I, - props=[{chmgr, MgrName}]}, - DirBase ++ "/data.eqc." ++ FLUNameStr} - end || I <- lists:seq(1, Num)]. - -%% Generator for possibly assymmetric partition information -partition(FLUNames) -> - frequency([{10, return([])}, - {20, non_empty(sublist(flu_ordered_pairs(FLUNames)))}]). - -%% Generator for symmetric partition information -partition_sym(FLUNames) -> - ?LET(Pairs, non_empty(sublist(flu_pairs(FLUNames))), - lists:flatmap(fun({One, Another}) -> [{One, Another}, {Another, One}] end, - Pairs)). - -flu_ordered_pairs(FLUNames) -> - [{From, To} || From <- FLUNames, To <- FLUNames, From =/= To]. - -flu_pairs(FLUNames) -> - [{One, Another} || One <- FLUNames, Another <- FLUNames, One > Another]. - -chunk() -> - non_empty(binary(10)). - -%% Properties - -prop_repair(Verbose) -> - error_logger:tty(false), - application:load(sasl), - application:set_env(sasl, sasl_error_logger, false), - - Seed = {1445,935441,287549}, - ?FORALL(Num, num(), - ?FORALL(Cmds, commands(?MODULE, initial_state(Num, Verbose)), - begin - Target = setup_target(Num, Seed, Verbose), - {H, S1, Res0} = run_commands(?MODULE, Cmds), - %% ?V("S1=~w~n", [S1]), - ?V("==== Start post operations, stabilize and confirm results~n", []), - _ = stabilize(commands_len(Cmds), Target), - {Dataloss, Critical} = confirm_result(Target), - _ = cleanup(Target), - pretty_commands( - ?MODULE, Cmds, {H, S1, Res0}, - aggregate(with_title(cmds), command_names(Cmds), - collect(with_title(length5), (length(Cmds) div 5) * 5, - {Dataloss, Critical} =:= {0, 0}))) - end)). - -prop_repair_par(Verbose) -> - error_logger:tty(false), - application:load(sasl), - application:set_env(sasl, sasl_error_logger, false), - - Seed = {1445,935441,287549}, - ?FORALL(Num, num(), - ?FORALL(Cmds, - %% Now try-and-err'ing, how to control command length and concurrency? - ?SUCHTHAT(Cmds0, ?SIZED(Size, resize(Size, - parallel_commands(?MODULE, initial_state(Num, Verbose)))), - commands_len(Cmds0) > 20 - andalso - concurrency(Cmds0) > 2), - begin - CmdsLen= commands_len(Cmds), - Target = setup_target(Num, Seed, Verbose), - {Seq, Par, Res0} = run_parallel_commands(?MODULE, Cmds), - %% ?V("Seq=~w~n", [Seq]), - %% ?V("Par=~w~n", [Par]), - ?V("==== Start post operations, stabilize and confirm results~n", []), - {FinalRes, {Dataloss, Critical}} = - case Res0 of - ok -> - Res1 = stabilize(CmdsLen, Target), - {Res1, confirm_result(Target)}; - _ -> - ?V("Res0=~w~n", [Res0]), - {Res0, {undefined, undefined}} - end, - _ = cleanup(Target), - %% Process is leaking? This log line can be removed after fix. 
- [?V("process_count=~w~n", [erlang:system_info(process_count)]) || Verbose], - pretty_commands( - ?MODULE, Cmds, {Seq, Par, Res0}, - aggregate(with_title(cmds), command_names(Cmds), - collect(with_title(length5), (CmdsLen div 5) * 5, - collect(with_title(conc), concurrency(Cmds), - {FinalRes, {Dataloss, Critical}} =:= {ok, {0, 0}}))) - ) - end)). - -%% Initilization / setup - -%% Fake initialization function for debugging in shell like: -%% > eqc_gen:sample(eqc_statem:commands(machi_ap_repair_eqc)). -%% but not so helpful. -initial_state() -> - #state{cr_count=3}. - -initial_state(Num, Verbose) -> - AllListE = all_list_extra(Num), - FLUNames = [P#p_srvr.name || {P, _Dir} <- AllListE], - MgrNames = [{Name, machi_flu_psup:make_mgr_supname(Name)} || Name <- FLUNames], - #state{num=Num, verbose=Verbose, - flu_names=FLUNames, mgr_names=MgrNames, - cr_count=cr_count(Num)}. - -setup_target(Num, Seed, Verbose) -> - %% ?V("setup_target(Num=~w, Seed=~w~nn", [Num, Seed]), - AllListE = all_list_extra(Num), - FLUNames = [P#p_srvr.name || {P, _Dir} <- AllListE], - MgrNames = [{Name, machi_flu_psup:make_mgr_supname(Name)} || Name <- FLUNames], - Dict = orddict:from_list([{P#p_srvr.name, P} || {P, _Dir} <- AllListE]), - - setup_chain(Seed, AllListE, FLUNames, MgrNames, Dict), - _ = setup_cpool(AllListE, FLUNames, Dict), - - Target = #target{flu_names=FLUNames, mgr_names=MgrNames, - verbose=Verbose}, - %% Don't wait for complete chain. Even partialy completed, the chain - %% should work fine. Right? - wait_until_stable(chain_state_all_ok(FLUNames), FLUNames, MgrNames, - 20, Verbose), - Target. - -setup_chain(Seed, AllListE, FLUNames, MgrNames, Dict) -> - ok = shutdown_hard(), - [begin - machi_flu1_test:clean_up_data_dir(Dir), - filelib:ensure_dir(Dir ++ "/not-used") - end || {_P, Dir} <- AllListE], - [catch ets:delete(T) || T <- tabs()], - - [ets:new(T, [set, public, named_table, - {write_concurrency, true}, {read_concurrency, true}]) || - T <- tabs()], - {ok, _} = application:ensure_all_started(machi), - - SimSpec = {part_sim, - {machi_partition_simulator, start_link, [{0,0,0}, 0, 100]}, - permanent, 500, worker, []}, - {ok, _PSimPid} = supervisor:start_child(machi_sup, SimSpec), - ok = machi_partition_simulator:set_seed(Seed), - _Partitions = machi_partition_simulator:get(FLUNames), - - %% Start FLUs and setup the chain - FLUOpts = [{use_partition_simulator, true}, - %% {private_write_verbose, true}, - {active_mode, false}, - {simulate_repair, false}], - [{ok, _} = machi_flu_psup:start_flu_package(Name, Port, Dir, FLUOpts) || - {#p_srvr{name=Name, port=Port}, Dir} <- AllListE], - [machi_chain_manager1:set_chain_members(MgrName, Dict) || {_, MgrName} <- MgrNames], - ok. - -setup_cpool(AllListE, FLUNames, Dict) -> - Num = length(AllListE), - FCList = [begin - {ok, PCPid} = machi_proxy_flu1_client:start_link(P), - {Name, PCPid} - end || {_, #p_srvr{name=Name}=P} <- Dict], - %% CR clients are pooled, each has "name" which is interpreted "From" - %% side of simulated partition. - SimSelfNames = lists:append(lists:duplicate(cr_count(Num), FLUNames)), - CRList = [begin - {ok, C} = machi_cr_client:start_link( - [P || {_, P} <- Dict], - [{use_partition_simulator, true}, - {simulator_self_name, SimSelfName}, - {simulator_members, FLUNames}]), - {SimSelfName, C} - end || SimSelfName <- SimSelfNames], - catch ets:delete(cpool), - ets:new(cpool, [set, protected, named_table, {read_concurrency, true}]), - ets:insert(cpool, {fc_list, FCList}), - ets:insert(cpool, {cr_list, CRList}), - {CRList, FCList}. 
- -fc_list() -> - [{fc_list, FCList}] = ets:lookup(cpool, fc_list), - FCList. - -cr_list() -> - [{cr_list, CRList}] = ets:lookup(cpool, cr_list), - CRList. - -%% Post run_commands - -stabilize(0, _T) -> - ok; -stabilize(_CmdsLen, #target{flu_names=FLUNames, mgr_names=MgrNames, - verbose=Verbose}) -> - machi_partition_simulator:no_partitions(), - wait_until_stable(chain_state_all_ok(FLUNames), FLUNames, MgrNames, - 100, Verbose), - ok. - -chain_state_all_ok(FLUNames) -> - [{FLUName, {FLUNames, [], []}} || FLUName <- FLUNames]. - -confirm_result(_T) -> - [{_, C} | _] = cr_list(), - [{written, _Written}, {accpt, Accpt}, - {failed, Failed}, {critical, Critical}] = tab_counts(), - {OK, Dataloss} = confirm_written(C), - ?V(" Written=~w, DATALOSS=~w, Acceptable=~w~n", [OK, Dataloss, Accpt]), - ?V(" Failed=~w, Critical=~w~n~n", [Failed, Critical]), - DirBase = get_dir_base(), - Suffix = dump_file_suffix(), - case Failed of - 0 -> ok; - _ -> - DumpFailed = filename:join(DirBase, "dump-failed-" ++ Suffix), - ?V("Dump failed ETS tab to: ~w~n", [DumpFailed]), - ets:tab2file(?FAILED_TAB, DumpFailed) - end, - case Critical of - 0 -> ok; - _ -> - DumpCritical = filename:join(DirBase, "dump-critical-" ++ Suffix), - ?V("Dump critical ETS tab to: ~w~n", [DumpCritical]), - ets:tab2file(?CRITICAL_TAB, DumpCritical) - end, - {Dataloss, Critical}. - -confirm_written(C) -> - ets:foldl( - fun({Key, Bin}, {OK, NG}) -> - case assert_chunk(C, Key, Bin) of - ok -> {OK+1, NG}; - {error, _} -> {OK, NG+1} - end - end, {0, 0}, ?WRITTEN_TAB). - -assert_chunk(C, {Off, Len, FileName}=Key, Bin) -> - %% TODO: This probably a bug, read_chunk respnds with filename of `string()' type - FileNameStr = binary_to_list(FileName), - %% TODO : Use CSum instead of binary (after disuccsion about CSum is calmed down?) - case (catch machi_cr_client:read_chunk(C, FileName, Off, Len, [], sec(3))) of - {ok, {[{FileNameStr, Off, Bin, _}], []}} -> - ok; - {ok, Got} -> - ?V("read_chunk got different binary for Key=~p~n", [Key]), - ?V(" Expected: ~p~n", [{[{FileNameStr, Off, Bin, <<"CSum-NYI">>}], []}]), - ?V(" Got: ~p~n", [Got]), - {error, different_binary}; - {error, Reason} -> - ?V("read_chunk error for Key=~p: ~p~n", [Key, Reason]), - {error, Reason}; - Other -> - ?V("read_chunk other error for Key=~p: ~p~n", [Key, Other]), - {error, Other} - end. - -cleanup(_Target) -> - [begin unlink(FC), catch exit(FC, kill) end || {_, FC} <- fc_list()], - [begin unlink(CR), catch exit(CR, kill) end || {_, CR} <- cr_list()], - _ = shutdown_hard(). - -%% Internal misc utilities - -eqc_verbose() -> - os:getenv("EQC_VERBOSE") =:= "true". - -eqc_timeout(Default) -> - PropTimeout = case os:getenv("EQC_TIMEOUT") of - false -> Default; - V -> list_to_integer(V) - end, - {PropTimeout, PropTimeout * 300}. - -get_port_dir_base() -> - I = case os:getenv("EQC_BASE_PORT") of - false -> 0; - V -> list_to_integer(V) - end, - D = get_dir_base(), - {7400 + (I * 100), D ++ "/" ++ integer_to_list(I)}. - -get_dir_base() -> - case os:getenv("EQC_BASE_DIR") of - false -> "./eqc"; - DD -> DD - end. - -shutdown_hard() -> - _STOP = application:stop(machi), - timer:sleep(100). - -tick(#state{flu_names=FLUNames, mgr_names=MgrNames, - verbose=Verbose}) -> - tick(FLUNames, MgrNames, Verbose). - -tick(FLUNames, MgrNames, Verbose) -> - tick(FLUNames, MgrNames, 2, 100, Verbose). 
- -tick(FLUNames, MgrNames, Iter, SleepMax, Verbose) -> - TickFun = tick_fun(FLUNames, MgrNames, self()), - TickFun(Iter, 0, SleepMax), - FCList = fc_list(), - [?V("## Chain state after tick()=~w~n", [chain_state(FCList)]) || Verbose]. - -tick_fun(FLUNames, MgrNames, Parent) -> - fun(Iters, SleepMin, SleepMax) -> - %% ?V("^", []), - Trigger = - fun(FLUName, MgrName) -> - random:seed(now()), - [begin - erlang:yield(), - SleepMaxRand = random:uniform(SleepMax + 1), - %% io:format(user, "{t}", []), - Elapsed = machi_chain_manager1:sleep_ranked_order( - SleepMin, SleepMaxRand, - FLUName, FLUNames), - MgrName ! tick_check_environment, - %% Be more unfair by not sleeping here. - timer:sleep(max(SleepMax - Elapsed, 1)), - ok - end || _ <- lists:seq(1, Iters)], - Parent ! {done, self()} - end, - Pids = [{spawn(fun() -> Trigger(FLUName, MgrName) end), FLUName} || - {FLUName, MgrName} <- MgrNames ], - [receive - {done, ThePid} -> - ok - after 120*1000 -> - exit({icky_timeout, M_name}) - end || {ThePid, M_name} <- Pids] - end. - -wait_until_stable(ExpectedChainState, FLUNames, MgrNames, Verbose) -> - wait_until_stable(ExpectedChainState, FLUNames, MgrNames, 20, Verbose). - -wait_until_stable(ExpectedChainState, FLUNames, MgrNames, Retries, Verbose) -> - TickFun = tick_fun(FLUNames, MgrNames, self()), - FCList = fc_list(), - wait_until_stable1(ExpectedChainState, TickFun, FCList, Retries, Verbose). - -wait_until_stable1(_ExpectedChainState, _TickFun, FCList, 0, _Verbose) -> - ?V(" [ERROR] wait_until_stable failed.... : ~p~n", [chain_state(FCList)]), - false; -wait_until_stable1(ExpectedChainState, TickFun, FCList, Reties, Verbose) -> - [TickFun(3, 0, 100) || _ <- lists:seq(1, 3)], - Normalized = normalize_chain_state(chain_state(FCList)), - case Normalized of - ExpectedChainState -> - [?V(" Got stable chain: ~w~n", [chain_state(FCList)]) || Verbose], - true; - _ -> - [?V(" NOT YET stable chain: ~w~n", [chain_state(FCList)]) || Verbose], - wait_until_stable1(ExpectedChainState, TickFun, FCList, Reties-1, Verbose) - end. - -normalize_chain_state(ChainState) -> - lists:usort([{FLUName, - {lists:usort(UPI), lists:usort(Repairing), lists:usort(Down)}} || - {FLUName, {_EpochNo, UPI, Repairing, Down}} <- ChainState]). - -chain_state(FCList) -> - lists:usort( - [case (catch machi_proxy_flu1_client:read_latest_projection(C, private, sec(5))) of - {ok, #projection_v1{epoch_number=EpochNo, upi=UPI, - repairing=Repairing, down=Down}} -> - {FLUName, {EpochNo, UPI, Repairing, Down}}; - Other -> - {FLUName, Other} - end || {FLUName, C} <- FCList]). - -tabs() -> [?WRITTEN_TAB, ?ACCPT_TAB, ?FAILED_TAB, ?CRITICAL_TAB]. - -tab_counts() -> - [{T, ets:info(T, size)} || T <- tabs()]. - -sec(Sec) -> - timer:seconds(Sec). - -commands_len({SeqCmds, ParCmdsList} = _Cmds) -> - lists:sum([length(SeqCmds) | [length(P) || P <- ParCmdsList]]); -commands_len(Cmds) -> - length(Cmds). - -concurrency({_SeqCmds, ParCmdsList} = _Cmds) -> length(ParCmdsList); -concurrency(_) -> 1. - -dump_file_suffix() -> - {{Year, Month, Day}, {Hour, Min, Sec}} = calendar:local_time(), - lists:flatten( - io_lib:format("~4.10.0B-~2.10.0B-~2.10.0BT~2.10.0B:~2.10.0B:~2.10.0B.000Z", - [Year, Month, Day, Hour, Min, Sec])). - --endif. % EQC --endif. % TEST diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index 9303701..782e7be 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -45,18 +45,14 @@ -include_lib("eunit/include/eunit.hrl"). 
-help() -> - io:format("~s\n", [short_doc()]). - short_doc() -> " A visualization of the convergence behavior of the chain self-management algorithm for Machi. - - 1. Set up some server and chain manager pairs. + 1. Set up 4 FLUs and chain manager pairs. 2. Create a number of different network partition scenarios, where - (simulated) partitions may be symmetric or asymmetric. Then stop changing - the partitions and keep the simulated network stable (and perhaps broken). + (simulated) partitions may be symmetric or asymmetric. Then halt changing + the partitions and keep the simulated network stable and broken. 3. Run a number of iterations of the algorithm in parallel by poking each of the manager processes on a random'ish basis. 4. Afterward, fetch the chain transition changes made by each FLU and @@ -65,65 +61,73 @@ algorithm for Machi. During the iteration periods, the following is a cheatsheet for the output. See the internal source for interpreting the rest of the output. - 'SET partitions = ' + 'Let loose the dogs of war!' Network instability + 'SET partitions = ' Network stability (but broken) + 'x uses:' The FLU x has made an internal state transition. The rest of + the line is a dump of internal state. + '{t}' This is a tick event which triggers one of the manager processes + to evaluate its environment and perhaps make a state transition. - A pair-wise list of actors which cannot send messages. The - list is uni-directional. If there are three servers (a,b,c), - and if the partitions list is '[{a,b},{b,c}]' then all - messages from a->b and b->c will be dropped, but any other - sender->recipient messages will be delivered successfully. +A long chain of '{t}{t}{t}{t}' means that the chain state has settled +to a stable configuration, which is the goal of the algorithm. +Press control-c to interrupt....". - 'x uses:' +long_doc() -> + " +'Let loose the dogs of war!' - The FLU x has made an internal state transition and is using - this epoch's projection as operating chain configuration. The - rest of the line is a summary of the projection. + The simulated network is very unstable for a few seconds. - 'CONFIRM epoch {N}' +'x uses' - This message confirms that all of the servers listed in the - UPI and repairing lists of the projection at epoch {N} have - agreed to use this projection because they all have written - this projection to their respective private projection stores. - The chain is now usable by/available to all clients. + After a single iteration, server x has determined that the chain + should be defined by the upi, repair, and down list in this record. + If all participants reach the same conclusion at the same epoch + number (and checksum, see next item below), then the chain is + stable, fully configured, and can provide full service. - 'Sweet, private projections are stable' +'epoch,E' - This report announces that this iteration of the test cycle - has passed successfully. The report that follows briefly - summarizes the latest private projection used by each - participating server. For example, when in strong consistency - mode with 'a' as a witness and 'b' and 'c' as real servers: + The epoch number for this decision is E. The checksum of the full + record is not shown. For purposes of the protocol, a server will + 'wedge' itself and refuse service (until a new config is chosen) + whenever: a). it sees a bigger epoch number mentioned somewhere, or + b). it sees the same epoch number but a different checksum. 
In case + of b), there was a network partition that has healed, and both sides + had chosen to operate with an identical epoch number but different + chain configs. - %% Legend: - %% server name, epoch ID, UPI list, repairing list, down list, ... - %% ... witness list, 'false' (a constant value) +'upi', 'repair', and 'down' - [{a,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}, - {b,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}] + Members in the chain that are fully in sync and thus preserving the + Update Propagation Invariant, up but under repair (simulated), and + down, respectively. - Both servers 'a' and 'b' agree on epoch 1116 with epoch ID - {1116,<<23,143,246,55>>} where UPI=[a,b], repairing=[], - down=[c], and witnesses=[a]. +'ps,[some list]' - Server 'c' is not shown because 'c' has wedged itself OOS (out - of service) by configuring a chain length of zero. + The list of asymmetric network partitions. {a,b} means that a + cannot send to b, but b can send to a. - If no servers are listed in the report (i.e. only '[]' is - displayed), then all servers have wedged themselves OOS, and - the chain is unavailable. + This partition list is recorded for debugging purposes but is *not* + used by the algorithm. The algorithm only 'feels' its effects via + simulated timeout whenever there's a partition in one of the + messaging directions. - 'DoIt,' +'nodes_up,[list]' - This marks a group of tick events which trigger the manager - processes to evaluate their environment and perhaps make a - state transition. + The best guess right now of which ndoes are up, relative to the + author node, specified by '{author,X}' -A long chain of 'DoIt,DoIt,DoIt,' means that the chain state has -(probably) settled to a stable configuration, which is the goal of the -algorithm. +'SET partitions = [some list]' -Press control-c to interrupt the test....". + All subsequent iterations should have a stable list of partitions, + i.e. the 'ps' list described should be stable. + +'{FLAP: x flaps n}!' + + Server x has detected that it's flapping/oscillating after iteration + n of a naive/1st draft detection algorithm. +". %% ' silly Emacs syntax highlighting.... @@ -291,7 +295,7 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) -> private_projections_are_stable(Namez, DoIt) end, false, lists:seq(0, MaxIters)), io:format(user, "\n~s Sweet, private projections are stable\n", [machi_util:pretty_time()]), - io:format(user, "\t~P\n", [get(stable), 24]), + io:format(user, "\t~P\n", [get(stable), 14]), io:format(user, "Rolling sanity check ... ", []), PrivProjs = [{Name, begin {ok, Ps8} = ?FLU_PC:get_all_projections( @@ -715,7 +719,7 @@ private_projections_are_stable(Namez, PollFunc) -> true end, - %% io:format(user, "\nPriv1 ~p\nPriv2 ~p\n1==2 ~w ap_disjoint ~w u_all_peers ~w cp_mode_agree ~w\n", [lists:sort(Private1), lists:sort(Private2), Private1 == Private2, AP_mode_disjoint_test_p, Unanimous_with_all_peers_p, CP_mode_agree_test_p]), + io:format(user, "\nPriv1 ~p\nPriv2 ~p\n1==2 ~w ap_disjoint ~w u_all_peers ~w cp_mode_agree ~w\n", [lists:sort(Private1), lists:sort(Private2), Private1 == Private2, AP_mode_disjoint_test_p, Unanimous_with_all_peers_p, CP_mode_agree_test_p]), Private1 == Private2 andalso AP_mode_disjoint_test_p andalso ( diff --git a/test/machi_csum_table_test.erl b/test/machi_csum_table_test.erl index c168d45..f34d955 100644 --- a/test/machi_csum_table_test.erl +++ b/test/machi_csum_table_test.erl @@ -54,6 +54,7 @@ smoke2_test() -> ok = machi_csum_table:close(MC), ok = machi_csum_table:delete(MC). 
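These smoke tests are also the clearest record of the machi_csum_table calling conventions relied on elsewhere in this patch (open/2, write/4, find_leftneighbor/2, find_rightneighbor/2, close/1, delete/1). A small sketch, not part of the patch; the file name is made up and Csum stands for any tagged checksum binary, e.g. one produced by machi_util:make_tagged_csum/2.

    %% Sketch only: exercise a checksum table the way the smoke tests do.
    csum_table_sketch(Csum) ->
        F = "./temp-checksum-example",
        _ = file:delete(F),
        {ok, T} = machi_csum_table:open(F, []),
        _ = machi_csum_table:write(T, 0, 1024, Csum),
        %% Neighbor lookups answer "which entry sits just left/right of this
        %% byte position?", or undefined when nothing is there.
        _Left  = machi_csum_table:find_leftneighbor(T, 1024),
        _Right = machi_csum_table:find_rightneighbor(T, 0),
        ok = machi_csum_table:close(T),
        ok = machi_csum_table:delete(T).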
+ smoke3_test() -> Filename = "./temp-checksum-dumb-file-4", _ = file:delete(Filename), @@ -75,14 +76,13 @@ smoke3_test() -> {?LINE, trim, {0, 1024, <<>>}, undefined, undefined} ], [ begin - %% ?debugVal({_Line, Chunk}), + %% ?debugVal({Line, Chunk}), {Offset, Size, Csum} = Chunk, ?assertEqual(LeftN0, machi_csum_table:find_leftneighbor(MC, Offset)), ?assertEqual(RightN0, machi_csum_table:find_rightneighbor(MC, Offset+Size)), LeftN = case LeftN0 of - {OffsL, SizeL, trimmed} -> {OffsL, SizeL, trimmed}; {OffsL, SizeL, _} -> {OffsL, SizeL, <<"boom">>}; OtherL -> OtherL end, @@ -106,4 +106,5 @@ smoke3_test() -> ok = machi_csum_table:close(MC), ok = machi_csum_table:delete(MC). + %% TODO: add quickcheck test here diff --git a/test/machi_file_proxy_eqc.erl b/test/machi_file_proxy_eqc.erl index 00d470f..bf57043 100644 --- a/test/machi_file_proxy_eqc.erl +++ b/test/machi_file_proxy_eqc.erl @@ -31,14 +31,13 @@ -define(QC_OUT(P), eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)). --define(TESTDIR, "./eqc"). %% EUNIT TEST DEFINITION eqc_test_() -> {timeout, 60, {spawn, [ - {timeout, 30, ?_assertEqual(true, eqc:quickcheck(eqc:testing_time(15, ?QC_OUT(prop_ok()))))} + {timeout, 30, ?_assertEqual(true, eqc:quickcheck(eqc:testing_time(15, ?QC_OUT(prop_ok()))))} ] }}. @@ -89,117 +88,55 @@ data_with_csum(Limit) -> intervals([]) -> []; intervals([N]) -> - [{N, choose(1,1)}]; + [{N, choose(1,150)}]; intervals([A,B|T]) -> - [{A, oneof([choose(1, B-A), B-A])}|intervals([B|T])]. + [{A, choose(1, B-A)}|intervals([B|T])]. interval_list() -> - ?LET(L, - oneof([list(choose(1025, 1033)), list(choose(1024, 4096))]), - intervals(lists:usort(L))). + ?LET(L, list(choose(1024, 4096)), intervals(lists:usort(L))). shuffle_interval() -> ?LET(L, interval_list(), shuffle(L)). get_written_interval(L) -> ?LET({O, Ln}, elements(L), {O+1, Ln-1}). - + %% INITIALIZATION --record(state, {pid, prev_extra = 0, - filename = undefined, - planned_writes=[], - planned_trims=[], - written=[], - trimmed=[]}). +-record(state, {pid, prev_extra = 0, planned_writes=[], written=[]}). -initial_state() -> - {_, _, MS} = os:timestamp(), - Filename = test_server:temp_name("eqc_data") ++ "." ++ integer_to_list(MS), - #state{filename=Filename, written=[{0,1024}]}. - -initial_state(I, T) -> - S=initial_state(), - S#state{written=[{0,1024}], - planned_writes=I, - planned_trims=T}. +initial_state() -> #state{written=[{0,1024}]}. +initial_state(I) -> #state{written=[{0,1024}], planned_writes=I}. weight(_S, rewrite) -> 1; weight(_S, _) -> 2. %% HELPERS -get_overlaps(_Offset, _Len, [], Acc) -> lists:reverse(Acc); -get_overlaps(Offset, Len, [{Pos, Sz} = Ck|T], Acc0) -%% Overlap judgement differnt from the one in machi_csum_table -%% [a=Offset, b), [x=Pos, y) ... - when - %% a =< x && x < b && b =< y - (Offset =< Pos andalso Pos < Offset + Len andalso Offset + Len =< Pos + Sz) orelse - %% a =< x && y < b - (Offset =< Pos andalso Pos + Sz < Offset + Len) orelse - %% x < a && a < y && y =< b - (Pos < Offset andalso Offset < Pos + Sz andalso Pos + Sz =< Offset + Len) orelse - %% x < a && b < y - (Pos < Offset + Len andalso Offset + Len < Pos + Sz) -> - get_overlaps(Offset, Len, T, [Ck|Acc0]); -get_overlaps(Offset, Len, [_Ck|T], Acc0) -> - get_overlaps(Offset, Len, T, Acc0). 
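The reworked interval_list()/intervals/1 generators above encode the invariant the simplified model depends on: each planned write's length is bounded by the distance to the next starting offset (150 bytes for the last one), so the generated writes never overlap one another. A hand-worked illustration, assuming it were added to this module next to intervals/1:

    %% Not part of the suite; shows the shape intervals/1 returns for one
    %% concrete sorted list of starting offsets.
    intervals_example() ->
        [{1024, _LenGen1},   % length generator is choose(1, 976)  (2000 - 1024)
         {2000, _LenGen2},   % length generator is choose(1, 2096) (4096 - 2000)
         {4096, _LenGen3}]   % last interval is capped at 150 bytes
            = intervals([1024, 2000, 4096]),
        ok.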
- -%% Inefficient but simple easy code to verify by eyes - returns all -%% bytes that fits in (Offset, Len) -chop(Offset, Len, List) -> - ChopLeft = fun({Pos, Sz}) when Pos < Offset andalso Offset =< Pos + Sz -> - {Offset, Sz + Pos - Offset}; - ({Pos, Sz}) when Offset =< Pos andalso Pos + Sz < Offset + Len -> - {Pos, Sz}; - ({Pos, _Sz}) when Offset =< Pos -> - {Pos, Offset + Len - Pos} - end, - ChopRight = fun({Pos, Sz}) when Offset + Len < Pos + Sz -> - {Pos, Offset + Len - Pos}; - ({Pos, Sz}) -> - {Pos, Sz} - end, - Filter0 = fun({_, 0}) -> false; - (Other) -> {true, Other} end, - lists:filtermap(fun(E) -> Filter0(ChopRight(ChopLeft(E))) end, - List). - -%% Returns all bytes that are at left side of the Offset -chopped_left(_Offset, []) -> undefined; -chopped_left(Offset, [{Pos,_Sz}|_]) when Pos < Offset -> - {Pos, Offset - Pos}; -chopped_left(_, _) -> - undefined. - -chopped_right(_Offset, []) -> undefined; -chopped_right(Offset, List) -> - {Pos, Sz} = lists:last(List), - if Offset < Pos + Sz -> - {Offset, Pos + Sz - Offset}; - true -> - undefined - end. - -cleanup_chunk(Offset, Length, ChunkList) -> - Overlaps = get_overlaps(Offset, Length, ChunkList, []), - NewCL0 = lists:foldl(fun lists:delete/2, - ChunkList, Overlaps), - NewCL1 = case chopped_left(Offset, Overlaps) of - undefined -> NewCL0; - LeftRemain -> [LeftRemain|NewCL0] - end, - NewCL2 = case chopped_right(Offset+Length, Overlaps) of - undefined -> NewCL1; - RightRemain -> [RightRemain|NewCL1] - end, - lists:sort(NewCL2). +%% check if an operation is permitted based on whether a write has +%% occurred +check_writes(_Op, [], _Off, _L) -> + false; +check_writes(_Op, [{Pos, Sz}|_T], Off, L) when Pos == Off + andalso Sz == L -> + mostly_true; +check_writes(read, [{Pos, Sz}|_T], Off, L) when Off >= Pos + andalso Off < (Pos + Sz) + andalso Sz >= ( L - ( Off - Pos ) ) -> + true; +check_writes(write, [{Pos, Sz}|_T], Off, L) when ( Off + L ) > Pos + andalso Off < (Pos + Sz) -> + true; +check_writes(Op, [_H|T], Off, L) -> + check_writes(Op, T, Off, L). is_error({error, _}) -> true; is_error({error, _, _}) -> true; is_error(Other) -> {expected_ERROR, Other}. +probably_error(ok) -> true; +probably_error(V) -> is_error(V). + is_ok({ok, _, _}) -> true; is_ok(ok) -> true; is_ok(Other) -> {expected_OK, Other}. @@ -207,10 +144,11 @@ is_ok(Other) -> {expected_OK, Other}. get_offset({ok, _Filename, Offset}) -> Offset; get_offset(_) -> error(badarg). -last_byte([]) -> 0; -last_byte(L0) -> - L1 = lists:map(fun({Pos, Sz}) -> Pos + Sz end, L0), - lists:last(lists:sort(L1)). +offset_valid(Offset, Extra, L) -> + {Pos, Sz} = lists:last(L), + Offset == Pos + Sz + Extra. + +-define(TESTDIR, "./eqc"). cleanup() -> [begin @@ -224,17 +162,19 @@ cleanup() -> %% start start_pre(S) -> - S#state.pid =:= undefined. + S#state.pid == undefined. start_command(S) -> {call, ?MODULE, start, [S]}. -start(#state{filename=File}) -> - {ok, Pid} = machi_file_proxy:start_link(some_flu, File, ?TESTDIR), +start(_S) -> + {_, _, MS} = os:timestamp(), + File = test_server:temp_name("eqc_data") ++ "." ++ integer_to_list(MS), + {ok, Pid} = machi_file_proxy:start_link(File, ?TESTDIR), unlink(Pid), Pid. -start_next(S, Pid, _) -> +start_next(S, Pid, _Args) -> S#state{pid = Pid}. %% read @@ -243,34 +183,31 @@ read_pre(S) -> S#state.pid /= undefined. read_args(S) -> - [S#state.pid, oneof([offset(), big_offset()]), len()]. + [S#state.pid, offset(), len()]. + +read_ok(S, Off, L) -> + case S#state.written of + [{0, 1024}] -> false; + W -> check_writes(read, W, Off, L) + end. 
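check_writes/4 above is the whole read/write model: it scans the list of known-written ranges and answers false (range untouched), mostly_true (exact match of an earlier write), or true (overlap). A hand-worked set of cases, not from a real EQC run, assuming this were dropped into the module next to check_writes/4; the Written value is made up.

    %% Each result below is hand-checked against the check_writes/4 clauses.
    check_writes_examples() ->
        Written = [{0, 1024}, {2000, 100}],
        true        = check_writes(read,  Written,   10, 100), % inside {0,1024}: expect {ok, ...}
        false       = check_writes(read,  Written, 2200,  50), % overlaps nothing: expect {error, not_written}
        false       = check_writes(write, Written, 3000, 100), % fresh range: expect ok
        mostly_true = check_writes(write, Written, 2000, 100), % exact re-write: ok only for identical bytes
        true        = check_writes(write, Written, 2050, 100), % partial overlap: expect {error, written}
        ok.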
read_post(S, [_Pid, Off, L], Res) -> - Written = get_overlaps(Off, L, S#state.written, []), - Chopped = chop(Off, L, Written), - Trimmed = get_overlaps(Off, L, S#state.trimmed, []), - Eof = lists:max([Pos+Sz||{Pos,Sz}<-S#state.written]), - case Res of - {ok, {Written0, Trimmed0}} -> - Written1 = lists:map(fun({_, Pos, Chunk, _}) -> - {Pos, iolist_size(Chunk)} - end, Written0), - Trimmed1 = lists:map(fun({_, Pos, Sz}) -> {Pos, Sz} end, Trimmed0), - Chopped =:= Written1 - andalso Trimmed =:= Trimmed1; - %% TODO: such response are ugly, rethink the SPEC - {error, not_written} when Eof < Off + L -> - true; - {error, not_written} when Chopped =:= [] andalso Trimmed =:= [] -> - true; - _Other -> - is_error(Res) + case read_ok(S, Off, L) of + true -> is_ok(Res); + mostly_true -> is_ok(Res); + false -> is_error(Res) end. read_next(S, _Res, _Args) -> S. read(Pid, Offset, Length) -> - machi_file_proxy:read(Pid, Offset, Length, [{needs_trimmed, true}]). + case machi_file_proxy:read(Pid, Offset, Length) of + {ok, {Chunks, _}} -> + [{_, Offset, Data, Csum}] = Chunks, + {ok, Data, Csum}; + E -> + E + end. %% write @@ -279,7 +216,6 @@ write_pre(S) -> %% do not allow writes with empty data write_pre(_S, [_Pid, _Extra, {<<>>, _Tag, _Csum}]) -> - ?assert(false), false; write_pre(_S, _Args) -> true. @@ -288,29 +224,39 @@ write_args(S) -> {Off, Len} = hd(S#state.planned_writes), [S#state.pid, Off, data_with_csum(Len)]. -write_post(S, [_Pid, Off, {Bin, _Tag, _Csum}] = _Args, Res) -> +write_ok(_S, [_Pid, Off, _Data]) when Off < 1024 -> false; +write_ok(S, [_Pid, Off, {Bin, _Tag, _Csum}]) -> Size = iolist_size(Bin), - case {get_overlaps(Off, Size, S#state.written, []), - get_overlaps(Off, Size, S#state.trimmed, [])} of - {[], []} -> - %% No overlap neither with written ranges nor trimmed - %% ranges; OK to write things. - eq(Res, ok); - {_, _} -> - %% overlap found in either or both at written or at - %% trimmed ranges; can't write. - is_error(Res) + %% Check writes checks if a byte range is *written* + %% So writes are ok IFF they are NOT written, so + %% we want not check_writes/3 to be true. + check_writes(write, S#state.written, Off, Size). + +write_post(S, Args, Res) -> + case write_ok(S, Args) of + %% false means this range has NOT been written before, so + %% it should succeed + false -> eq(Res, ok); + %% mostly true means we've written this range before BUT + %% as a special case if we get a call to write the EXACT + %% same data that's already on the disk, we return "ok" + %% instead of {error, written}. + mostly_true -> probably_error(Res); + %% If we get true, then we've already written this section + %% or a portion of this range to disk and should return an + %% error. + true -> is_error(Res) end. write_next(S, Res, [_Pid, Offset, {Bin, _Tag, _Csum}]) -> S0 = case is_ok(Res) of - true -> + true -> S#state{written = lists:sort(S#state.written ++ [{Offset, iolist_size(Bin)}]) }; - _ -> + _ -> S end, S0#state{prev_extra = 0, planned_writes=tl(S0#state.planned_writes)}. 
- + write(Pid, Offset, {Bin, Tag, Csum}) -> Meta = [{client_csum_tag, Tag}, @@ -338,43 +284,27 @@ append(Pid, Extra, {Bin, Tag, Csum}) -> append_next(S, Res, [_Pid, Extra, {Bin, _Tag, _Csum}]) -> case is_ok(Res) of - true -> + true -> Offset = get_offset(Res), - S#state{prev_extra = Extra, - written = lists:sort(S#state.written ++ [{Offset, iolist_size(Bin)}])}; - _Other -> + true = offset_valid(Offset, S#state.prev_extra, S#state.written), + S#state{prev_extra = Extra, written = lists:sort(S#state.written ++ [{Offset, iolist_size(Bin)}])}; + _ -> S end. -%% appends should always succeed unless the disk is full +%% appends should always succeed unless the disk is full %% or there's a hardware failure. -append_post(S, _Args, Res) -> - case is_ok(Res) of - true -> - Offset = get_offset(Res), - case erlang:max(last_byte(S#state.written), - last_byte(S#state.trimmed)) + S#state.prev_extra of - Offset -> - true; - UnexpectedByte -> - {wrong_offset_after_append, - {Offset, UnexpectedByte}, - {S#state.written, S#state.prev_extra}} - end; - Error -> - Error - end. +append_post(_S, _Args, Res) -> + true == is_ok(Res). %% rewrite rewrite_pre(S) -> - S#state.pid /= undefined andalso - (S#state.written ++ S#state.trimmed) /= [] . + S#state.pid /= undefined andalso S#state.written /= []. rewrite_args(S) -> - ?LET({Off, Len}, - get_written_interval(S#state.written ++ S#state.trimmed), - [S#state.pid, Off, data_with_csum(Len)]). + ?LET({Off, Len}, get_written_interval(S#state.written), + [S#state.pid, Off, data_with_csum(Len)]). rewrite(Pid, Offset, {Bin, Tag, Csum}) -> Meta = [{client_csum_tag, Tag}, @@ -387,88 +317,18 @@ rewrite_post(_S, _Args, Res) -> rewrite_next(S, _Res, _Args) -> S#state{prev_extra = 0}. -%% trim - -trim_pre(S) -> - S#state.pid /= undefined andalso S#state.planned_trims /= []. - -trim_args(S) -> - {Offset, Length} = hd(S#state.planned_trims), - [S#state.pid, Offset, Length]. - -trim(Pid, Offset, Length) -> - machi_file_proxy:trim(Pid, Offset, Length, false). - -trim_post(_S, [_Pid, _Offset, _Length], ok) -> - true; -trim_post(_S, [_Pid, _Offset, _Length], _Res) -> - false. - -trim_next(S, Res, [_Pid, Offset, Length]) -> - S1 = case is_ok(Res) of - true -> - NewWritten = cleanup_chunk(Offset, Length, S#state.written), - Trimmed1 = cleanup_chunk(Offset, Length, S#state.trimmed), - NewTrimmed = lists:sort([{Offset, Length}|Trimmed1]), - S#state{trimmed=NewTrimmed, - written=NewWritten}; - _Other -> - S - end, - S1#state{prev_extra=0, - planned_trims=tl(S#state.planned_trims)}. - -stop_pre(S) -> - S#state.pid /= undefined. - -stop_args(S) -> - [S#state.pid]. - -stop(Pid) -> - catch machi_file_proxy:stop(Pid). - -stop_post(_, _, _) -> true. - -stop_next(S, _, _) -> - S#state{pid=undefined, prev_extra=0}. - %% Property prop_ok() -> - cleanup(), - ?FORALL({I, T}, - {shuffle_interval(), shuffle_interval()}, - ?FORALL(Cmds, parallel_commands(?MODULE, initial_state(I, T)), - begin - {H, S, Res} = run_parallel_commands(?MODULE, Cmds), - cleanup(), - pretty_commands(?MODULE, Cmds, {H, S, Res}, - aggregate(command_names(Cmds), Res == ok)) - end)). 
- -%% Test for tester functions -chopper_test_() -> - [?_assertEqual([{0, 1024}], - get_overlaps(1, 1, [{0, 1024}], [])), - ?_assertEqual([], - get_overlaps(10, 5, [{9, 1}, {15, 1}], [])), - ?_assertEqual([{9,2},{14,1}], - get_overlaps(10, 5, [{9, 2}, {14, 1}], [])), - ?_assertEqual([], chop(0, 0, [{0,2}])), - ?_assertEqual([{0, 1}], chop(0, 1, [{0,2}])), - ?_assertEqual([], chop(1, 0, [{0,2}])), - ?_assertEqual([{1, 1}], chop(1, 1, [{0,2}])), - ?_assertEqual([{1, 1}], chop(1, 2, [{0,2}])), - ?_assertEqual([], chop(2, 1, [{0,2}])), - ?_assertEqual([], chop(2, 2, [{0,2}])), - ?_assertEqual([{1, 1}], chop(1, 3, [{0,2}])), - ?_assertError(_, chop(3, 1, [{0,2}])), - ?_assertEqual([], chop(2, 3, [{0,2}])), - ?_assertEqual({0, 1}, chopped_left(1, [{0, 1024}])), - ?_assertEqual([{0, 1}, {2, 1022}], cleanup_chunk(1, 1, [{0, 1024}])), - ?_assertEqual([{2, 1022}], cleanup_chunk(0, 2, [{0, 1}, {2, 1022}])), - ?_assert(true) - ]. + cleanup(), + ?FORALL(I, shuffle_interval(), + ?FORALL(Cmds, parallel_commands(?MODULE, initial_state(I)), + begin + {H, S, Res} = run_parallel_commands(?MODULE, Cmds), + pretty_commands(?MODULE, Cmds, {H, S, Res}, + aggregate(command_names(Cmds), Res == ok)) + end) + ). -endif. % EQC -endif. % TEST diff --git a/test/machi_file_proxy_test.erl b/test/machi_file_proxy_test.erl index 8c4b60b..8269483 100644 --- a/test/machi_file_proxy_test.erl +++ b/test/machi_file_proxy_test.erl @@ -78,14 +78,14 @@ random_binary(Start, End) -> machi_file_proxy_test_() -> clean_up_data_dir(?TESTDIR), - {ok, Pid} = machi_file_proxy:start_link(fluname, "test", ?TESTDIR), + {ok, Pid} = machi_file_proxy:start_link("test", ?TESTDIR), [ ?_assertEqual({error, bad_arg}, machi_file_proxy:read(Pid, -1, -1)), ?_assertEqual({error, bad_arg}, machi_file_proxy:write(Pid, -1, <<"yo">>)), ?_assertEqual({error, bad_arg}, machi_file_proxy:append(Pid, [], -1, <<"krep">>)), - ?_assertMatch({ok, {_, []}}, machi_file_proxy:read(Pid, 1, 1)), + ?_assertEqual({error, not_written}, machi_file_proxy:read(Pid, 1, 1)), ?_assertEqual({error, not_written}, machi_file_proxy:read(Pid, 1024, 1)), - ?_assertMatch({ok, {_, []}}, machi_file_proxy:read(Pid, 1, 1024)), + ?_assertEqual({error, not_written}, machi_file_proxy:read(Pid, 1, 1024)), ?_assertEqual({error, not_written}, machi_file_proxy:read(Pid, 1024, ?HYOOGE)), ?_assertEqual({error, not_written}, machi_file_proxy:read(Pid, ?HYOOGE, 1)), ?_assertEqual({error, written}, machi_file_proxy:write(Pid, 1, random_binary(0, ?HYOOGE))), @@ -100,12 +100,8 @@ machi_file_proxy_test_() -> multiple_chunks_read_test_() -> clean_up_data_dir(?TESTDIR), - {ok, Pid} = machi_file_proxy:start_link(fluname, "test", ?TESTDIR), + {ok, Pid} = machi_file_proxy:start_link("test", ?TESTDIR), [ - ?_assertEqual(ok, machi_file_proxy:trim(Pid, 0, 1, false)), - ?_assertMatch({ok, {[], [{"test", 0, 1}]}}, - machi_file_proxy:read(Pid, 0, 1, - [{needs_trimmed, true}])), ?_assertMatch({ok, "test", _}, machi_file_proxy:append(Pid, random_binary(0, 1024))), ?_assertEqual(ok, machi_file_proxy:write(Pid, 10000, <<"fail">>)), ?_assertEqual(ok, machi_file_proxy:write(Pid, 20000, <<"fail">>)), @@ -118,9 +114,6 @@ multiple_chunks_read_test_() -> {"test", 30000, <<"fail">>, _}, {"test", 530000, <<"fail">>, _}], []}}, machi_file_proxy:read(Pid, 1024, 530000)), - ?_assertMatch({ok, {[{"test", 1, _, _}], [{"test", 0, 1}]}}, - machi_file_proxy:read(Pid, 0, 1024, - [{needs_trimmed, true}])), ?_assertException(exit, {normal, _}, machi_file_proxy:stop(Pid)) ]. 
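For readers trying to see the post-change machi_file_proxy surface through the +/- noise: start_link/2 now takes only the file name and data directory, successful reads come back as {ok, {Chunks, Trimmed}}, and the trim-related assertions are gone. A condensed sketch, not part of the patch, assuming Pid came from machi_file_proxy:start_link(Filename, DataDir) on a fresh file.

    %% Sketch of the calls the eunit cases above make, gathered in one place.
    file_proxy_sketch(Pid) ->
        {ok, _File, Offset} = machi_file_proxy:append(Pid, <<"hello">>),
        ok = machi_file_proxy:write(Pid, 10000, <<"world">>),  % unwritten offset on a fresh file
        {ok, {Chunks, _Trimmed}} = machi_file_proxy:read(Pid, Offset, 5),
        Chunks.

The related EQC property can still be run by hand with the same wrappers the eunit fixture uses, e.g. eqc:quickcheck(eqc:testing_time(15, machi_file_proxy_eqc:prop_ok())) from a shell with the test modules loaded.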
diff --git a/test/machi_merkle_tree_test.erl b/test/machi_merkle_tree_test.erl new file mode 100644 index 0000000..922f0e2 --- /dev/null +++ b/test/machi_merkle_tree_test.erl @@ -0,0 +1,198 @@ +%% ------------------------------------------------------------------- +%% +%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved. +%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. +%% +%% ------------------------------------------------------------------- + +-module(machi_merkle_tree_test). +-compile([export_all]). + +-include("machi_merkle_tree.hrl"). + +-include_lib("eunit/include/eunit.hrl"). +-include_lib("kernel/include/file.hrl"). + +-define(GAP_CHANCE, 0.10). + +%% unit tests +basic_test() -> + random:seed(os:timestamp()), + Fsz = choose_size() * 1024, + Filesize = max(Fsz, 10*1024*1024), + ChunkSize = max(1048576, Filesize div 100), + N = make_leaf_nodes(Filesize), + D0 = #naive{ leaves = N, chunk_size = ChunkSize, recalc = true }, + T1 = machi_merkle_tree:build_tree(D0), + + D1 = #naive{ leaves = tl(N), chunk_size = ChunkSize, recalc = true }, + T2 = machi_merkle_tree:build_tree(D1), + + ?assertNotEqual(T1#naive.root, T2#naive.root), + ?assertEqual(1, length(machi_merkle_tree:naive_diff(T1, T2))). + + +make_leaf_nodes(Filesize) -> + lists:reverse( + lists:foldl(fun(T, Acc) -> machi_merkle_tree:update_acc(T, Acc) end, + [], + generate_offsets(Filesize, 1024, [])) + ). + +choose_int(Factor) -> + random:uniform(1024*Factor). + +small_int() -> + choose_int(10). + +medium_int() -> + choose_int(1024). + +large_int() -> + choose_int(4096). + +generate_offsets(Filesize, Current, Acc) when Current < Filesize -> + Length0 = choose_size(), + + Length = case Length0 + Current > Filesize of + false -> Length0; + true -> Filesize - Current + end, + Data = term_to_binary(os:timestamp()), + Checksum = machi_util:make_tagged_csum(client_sha, machi_util:checksum_chunk(Data)), + Gap = maybe_gap(random:uniform()), + generate_offsets(Filesize, Current + Length + Gap, [ {Current, Length, Checksum} | Acc ]); +generate_offsets(_Filesize, _Current, Acc) -> + lists:reverse(Acc). + + +random_from_list(L) -> + N = random:uniform(length(L)), + lists:nth(N, L). + +choose_size() -> + F = random_from_list([fun small_int/0, fun medium_int/0, fun large_int/0]), + F(). + +maybe_gap(Chance) when Chance < ?GAP_CHANCE -> + choose_size(); +maybe_gap(_) -> 0. + +%% Define or remove these ifdefs if benchmarking is desired. +-ifdef(BENCH). +generate_offsets(FH, Filesize, Current, Acc) when Current < Filesize -> + Length0 = choose_size(), + + Length = case Length0 + Current > Filesize of + false -> Length0; + true -> Filesize - Current + end, + {ok, Data} = file:pread(FH, Current, Length), + Checksum = machi_util:make_tagged_csum(client_sha, machi_util:checksum_chunk(Data)), + Gap = maybe_gap(random:uniform()), + generate_offsets(FH, Filesize, Current + Length + Gap, [ {Current, Length, Checksum} | Acc ]); +generate_offsets(_FH, _Filesize, _Current, Acc) -> + lists:reverse(Acc). 
+ +make_offsets_from_file(Filename) -> + {ok, Info} = file:read_file_info(Filename), + Filesize = Info#file_info.size, + {ok, FH} = file:open(Filename, [read, raw, binary]), + Offsets = generate_offsets(FH, Filesize, 1024, []), + file:close(FH), + Offsets. + +choose_filename() -> + random_from_list([ + "def^c5ea7511-d649-47d6-a8c3-2b619379c237^1", + "jkl^b077eff7-b2be-4773-a73f-fea4acb8a732^1", + "stu^553fa47a-157c-4fac-b10f-2252c7d8c37a^1", + "vwx^ae015d68-7689-4c9f-9677-926c6664f513^1", + "yza^4c784dc2-19bf-4ac6-91f6-58bbe5aa88e0^1" + ]). + + +make_csum_file(DataDir, Filename, Offsets) -> + Path = machi_util:make_checksum_filename(DataDir, Filename), + filelib:ensure_dir(Path), + {ok, MC} = machi_csum_table:open(Path, []), + lists:foreach(fun({Offset, Size, Checksum}) -> + machi_csum_table:write(MC, Offset, Size, Checksum) end, + Offsets), + machi_csum_table:close(MC). + + +test() -> + test(100). + +test(N) -> + {ok, F} = file:open("results.txt", [raw, write]), + lists:foreach(fun(X) -> format_and_store(F, run_test(X)) end, lists:seq(1, N)). + +format_and_store(F, {OffsetNum, {MTime, MSize}, {NTime, NSize}}) -> + S = io_lib:format("~w\t~w\t~w\t~w\t~w\n", [OffsetNum, MTime, MSize, NTime, NSize]), + ok = file:write(F, S). + +run_test(C) -> + random:seed(os:timestamp()), + OffsetFn = "test/" ++ choose_filename(), + O = make_offsets_from_file(OffsetFn), + Fn = "csum_" ++ integer_to_list(C), + make_csum_file(".", Fn, O), + + Osize = length(O), + + {MTime, {ok, M}} = timer:tc(fun() -> machi_merkle_tree:open(Fn, ".", merklet) end), + {NTime, {ok, N}} = timer:tc(fun() -> machi_merkle_tree:open(Fn, ".", naive) end), + + ?assertEqual(Fn, machi_merkle_tree:filename(M)), + ?assertEqual(Fn, machi_merkle_tree:filename(N)), + + MTree = machi_merkle_tree:tree(M), + MSize = byte_size(term_to_binary(MTree)), + + NTree = machi_merkle_tree:tree(N), + NSize = byte_size(term_to_binary(NTree)), + + ?assertEqual(same, machi_merkle_tree:diff(N, N)), + ?assertEqual(same, machi_merkle_tree:diff(M, M)), + {Osize, {MTime, MSize}, {NTime, NSize}}. + +torture_test(C) -> + Results = [ run_torture_test() || _ <- lists:seq(1, C) ], + {ok, F} = file:open("torture_results.txt", [raw, write]), + lists:foreach(fun({MSize, MTime, NSize, NTime}) -> + file:write(F, io_lib:format("~p\t~p\t~p\t~p\n", + [MSize, MTime, NSize, NTime])) + end, Results), + ok = file:close(F). + +run_torture_test() -> + {NTime, N} = timer:tc(fun() -> naive_torture() end), + + MSize = byte_size(term_to_binary(M)), + NSize = byte_size(term_to_binary(N)), + + {MSize, MTime, NSize, NTime}. + +naive_torture() -> + N = lists:foldl(fun(T, Acc) -> machi_merkle_tree:update_acc(T, Acc) end, [], torture_generator()), + T = #naive{ leaves = lists:reverse(N), chunk_size = 10010, recalc = true }, + machi_merkle_tree:build_tree(T). + +torture_generator() -> + [ {O, 1, crypto:hash(sha, term_to_binary(now()))} || O <- lists:seq(1024, 1000000) ]. +-endif. 
% BENCH diff --git a/test/machi_pb_high_client_test.erl b/test/machi_pb_high_client_test.erl index 361eb55..25c79fd 100644 --- a/test/machi_pb_high_client_test.erl +++ b/test/machi_pb_high_client_test.erl @@ -38,7 +38,6 @@ smoke_test2() -> Ps = [#p_srvr{name=a, address="localhost", port=Port, props="./data.a"} ], D = orddict:from_list([{P#p_srvr.name, P} || P <- Ps]), - ok = application:set_env(machi, max_file_size, 1024*1024), [os:cmd("rm -rf " ++ P#p_srvr.props) || P <- Ps], {ok, SupPid} = machi_flu_sup:start_link(), @@ -91,54 +90,23 @@ smoke_test2() -> {ok, [{File1Size,File1}]} = ?C:list_files(Clnt), true = is_integer(File1Size), - File1Bin = binary_to_list(File1), [begin - #p_srvr{name=Name, port=Port, props=Dir} = P, - ?assertEqual({ok, [File1Bin]}, - file:list_dir(filename:join([Dir, "data"]))), - FileListFileName = filename:join([Dir, "known_files_" ++ atom_to_list(Name)]), - {ok, Plist} = machi_plist:open(FileListFileName, []), - ?assertEqual([], machi_plist:all(Plist)) - end || P <- Ps], + %% ok = ?C:trim_chunk(Clnt, Fl, Off, Sz) + %% This gets an error as trim API is still a stub + ?assertMatch({bummer, + {throw, + {error, bad_joss_taipan_fixme}, + _Boring_stack_trace}}, + ?C:trim_chunk(Clnt, Fl, Off, Sz)) + end || {Ch, Fl, Off, Sz} <- Reads], - [begin - ok = ?C:trim_chunk(Clnt, Fl, Off, Sz) - end || {_Ch, Fl, Off, Sz} <- Reads], - [begin - {ok, {[], Trimmed}} = - ?C:read_chunk(Clnt, Fl, Off, Sz, [{needs_trimmed, true}]), - Filename = binary_to_list(Fl), - ?assertEqual([{Filename, Off, Sz}], Trimmed) - end || {_Ch, Fl, Off, Sz} <- Reads], - - LargeBytes = binary:copy(<<"x">>, 1024*1024), - LBCsum = {client_sha, machi_util:checksum_chunk(LargeBytes)}, - {ok, {Offx, Sizex, Filex}} = - ?C:append_chunk(Clnt, PK, Prefix, LargeBytes, LBCsum, 0), - ok = ?C:trim_chunk(Clnt, Filex, Offx, Sizex), - - %% Make sure everything was trimmed - File = binary_to_list(Filex), - [begin - #p_srvr{name=Name, port=_Port, props=Dir} = P, - ?assertEqual({ok, []}, - file:list_dir(filename:join([Dir, "data"]))), - FileListFileName = filename:join([Dir, "known_files_" ++ atom_to_list(Name)]), - {ok, Plist} = machi_plist:open(FileListFileName, []), - ?assertEqual([File], machi_plist:all(Plist)) - end || P <- Ps], - - [begin - {error, trimmed} = - ?C:read_chunk(Clnt, Fl, Off, Sz, []) - end || {_Ch, Fl, Off, Sz} <- Reads], ok after (catch ?C:quit(Clnt)) end after exit(SupPid, normal), - [os:cmd("rm -rf " ++ P#p_srvr.props) || P <- Ps], + [os:cmd("rm -rf " ++ P#p_srvr.props) || P <- Ps], machi_util:wait_for_death(SupPid, 100), ok end. diff --git a/test/machi_plist_test.erl b/test/machi_plist_test.erl deleted file mode 100644 index a796c1b..0000000 --- a/test/machi_plist_test.erl +++ /dev/null @@ -1,17 +0,0 @@ --module(machi_plist_test). - --include_lib("eunit/include/eunit.hrl"). - -open_close_test() -> - FileName = "bark-bark-one", - file:delete(FileName), - {ok, PList0} = machi_plist:open(FileName, []), - {ok, PList1} = machi_plist:add(PList0, "boomar"), - ?assertEqual(["boomar"], machi_plist:all(PList1)), - ok = machi_plist:close(PList1), - - {ok, PList2} = machi_plist:open(FileName, []), - ?assertEqual(["boomar"], machi_plist:all(PList2)), - ok = machi_plist:close(PList2), - file:delete(FileName), - ok. 
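
In the revised smoke test above, trim_chunk is expected to fail because the trim API is still a stub at this commit; the plist bookkeeping checks and the trim/read-back round trip are dropped, along with test/machi_plist_test.erl. A hedged sketch of what a caller sees today, assuming ?C expands to machi_pb_high_client as in the test module and that Clnt, File, Off and Sz come from an earlier successful append and read:

    %% Current stub behaviour, mirroring the assertion in the smoke test.
    case machi_pb_high_client:trim_chunk(Clnt, File, Off, Sz) of
        {bummer, {throw, {error, bad_joss_taipan_fixme}, _StackTrace}} ->
            not_yet_implemented;
        ok ->
            %% Expected once a real trim implementation lands.
            ok
    end.
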
diff --git a/tools.mk b/tools.mk index 5a8afd0..1c40f8e 100644 --- a/tools.mk +++ b/tools.mk @@ -27,7 +27,6 @@ REBAR ?= ./rebar REVISION ?= $(shell git rev-parse --short HEAD) PROJECT ?= $(shell basename `find src -name "*.app.src"` .app.src) -EUNIT_OPTS ?= .PHONY: compile-no-deps test docs xref dialyzer-run dialyzer-quick dialyzer \ cleanplt upload-docs @@ -36,7 +35,7 @@ compile-no-deps: ${REBAR} compile skip_deps=true test: compile - ${REBAR} ${EUNIT_OPTS} eunit skip_deps=true + ${REBAR} eunit skip_deps=true upload-docs: docs @if [ -z "${BUCKET}" -o -z "${PROJECT}" -o -z "${REVISION}" ]; then \ @@ -64,18 +63,10 @@ ERL_LIB_DIR = $(shell erl -eval '{io:format("~s\n", [code:lib_dir()]), erlang:ha native-ebin: mkdir -p $(NATIVE_EBIN) rm -f $(NATIVE_EBIN)/*.erl $(NATIVE_EBIN)/*.hrl $(NATIVE_EBIN)/*.beam - @for mod in lists dict digraph digraph_utils ets gb_sets gb_trees ordsets sets sofs; do \ - cp $(ERL_LIB_DIR)/stdlib-*/src/"$$mod".erl $(NATIVE_EBIN); \ - done - @for mod in cerl cerl_trees core_parse; do \ - cp $(ERL_LIB_DIR)/compiler-*/src/"$$mod".?rl $(NATIVE_EBIN); \ - done - @for mod in dialyzer_analysis_callgraph dialyzer dialyzer_behaviours dialyzer_codeserver dialyzer_contracts dialyzer_coordinator dialyzer_dataflow dialyzer_dep dialyzer_plt dialyzer_succ_typings dialyzer_typesig dialyzer_worker; do \ - cp $(ERL_LIB_DIR)/dialyzer-*/src/"$$mod".?rl $(NATIVE_EBIN); \ - done - @for mod in erl_types erl_bif_types; do \ - cp $(ERL_LIB_DIR)/hipe-*/*/"$$mod".?rl $(NATIVE_EBIN); \ - done + cp $(ERL_LIB_DIR)/stdlib-*/src/{lists,dict,digraph,digraph_utils,ets,gb_sets,gb_trees,ordsets,sets,sofs}.erl $(NATIVE_EBIN) + cp $(ERL_LIB_DIR)/compiler-*/src/{cerl,cerl_trees,core_parse}.?rl $(NATIVE_EBIN) + cp $(ERL_LIB_DIR)/dialyzer-*/src/{dialyzer_analysis_callgraph,dialyzer,dialyzer_behaviours,dialyzer_codeserver,dialyzer_contracts,dialyzer_coordinator,dialyzer_dataflow,dialyzer_dep,dialyzer_plt,dialyzer_succ_typings,dialyzer_typesig,dialyzer_worker}.?rl $(NATIVE_EBIN) + cp $(ERL_LIB_DIR)/hipe-*/*/{erl_types,erl_bif_types}.?rl $(NATIVE_EBIN) erlc -o $(NATIVE_EBIN) -smp +native -DVSN='"$(DIALYZER_VERSION)"' $(NATIVE_EBIN)/*erl ${PLT}: compile