Compare commits
42 commits
slf/doc-cl
...
master
Author | SHA1 | Date | |
---|---|---|---|
|
e87bd59a97 | ||
|
1e0bb4c404 | ||
|
549963545f | ||
|
2aa8917875 | ||
|
27cbf1e38c | ||
|
d2fa79e037 | ||
|
57ba204210 | ||
|
24f8cb21a2 | ||
|
e63db8dedc | ||
|
a739b5265c | ||
|
767f5d9e60 | ||
|
8c21539fcb | ||
|
0f24b69378 | ||
|
ec9d391047 | ||
|
fa71a918b8 | ||
|
6cddfcf988 | ||
|
6b000f6e7c | ||
|
96c46ec5aa | ||
|
cd166361aa | ||
|
4e5c16f5e2 | ||
|
16153a5d31 | ||
|
84f522f865 | ||
|
fc46cd1b25 | ||
|
184a54ebbd | ||
|
4cb166368a | ||
|
f433e84fab | ||
|
a3fbe2c8bb | ||
|
bdf47da10c | ||
|
6c03f5c1a6 | ||
|
11921d82bf | ||
|
a27425147d | ||
|
34f8632f19 | ||
|
c02a0bed70 | ||
|
1d8bc19891 | ||
|
53ce6d89dd | ||
|
2e46d199c8 | ||
|
0f543b4c4d | ||
|
d5c3da78fb | ||
|
affad6b1d3 | ||
|
ed56a2c6cf | ||
|
c2e9a83372 | ||
|
ecfad4726b |
31 changed files with 1130 additions and 388 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -2,7 +2,9 @@ prototype/chain-manager/patch.*
|
|||
.eqc-info
|
||||
.eunit
|
||||
deps
|
||||
dev
|
||||
erl_crash.dump
|
||||
eqc
|
||||
.concrete/DEV_MODE
|
||||
.rebar
|
||||
edoc
|
||||
|
@ -20,6 +22,7 @@ include/machi_pb.hrl
|
|||
|
||||
# Release packaging
|
||||
rel/machi
|
||||
rel/vars/dev*vars.config
|
||||
|
||||
# Misc Scott cruft
|
||||
*.patch
|
||||
|
|
13
FAQ.md
13
FAQ.md
|
@ -46,13 +46,13 @@
|
|||
<a name="n1.1">
|
||||
### 1.1. What is Machi?
|
||||
|
||||
Very briefly, Machi is a very simple append-only file store.
|
||||
Very briefly, Machi is a very simple append-only blob/file store.
|
||||
|
||||
Machi is
|
||||
"dumber" than many other file stores (i.e., lacking many features
|
||||
found in other file stores) such as HadoopFS or a simple NFS or CIFS file
|
||||
server.
|
||||
However, Machi is a distributed file store, which makes it different
|
||||
However, Machi is a distributed blob/file store, which makes it different
|
||||
(and, in some ways, more complicated) than a simple NFS or CIFS file
|
||||
server.
|
||||
|
||||
|
@ -142,7 +142,8 @@ consistency mode during and after network partitions are:
|
|||
due to Machi's restrictions on file naming and file offset
|
||||
assignment. Both file names and file offsets are always chosen
|
||||
by Machi servers according to rules which guarantee safe
|
||||
mergeability.
|
||||
mergeability. Server-assigned names are a characteristic of a
|
||||
"blob store".
|
||||
|
||||
<a name="n1.5">
|
||||
### 1.5. What is Machi like when operating in "strongly consistent" mode?
|
||||
|
@ -172,10 +173,10 @@ for more details.
|
|||
### 1.6. What does Machi's API look like?
|
||||
|
||||
The Machi API only contains a handful of API operations. The function
|
||||
arguments shown below use Erlang-style type annotations.
|
||||
arguments shown below (in simplifed form) use Erlang-style type annotations.
|
||||
|
||||
append_chunk(Prefix:binary(), Chunk:binary()).
|
||||
append_chunk_extra(Prefix:binary(), Chunk:binary(), ExtraSpace:non_neg_integer()).
|
||||
append_chunk(Prefix:binary(), Chunk:binary(), CheckSum:binary()).
|
||||
append_chunk_extra(Prefix:binary(), Chunk:binary(), CheckSum:binary(), ExtraSpace:non_neg_integer()).
|
||||
read_chunk(File:binary(), Offset:non_neg_integer(), Size:non_neg_integer()).
|
||||
|
||||
checksum_list(File:binary()).
|
||||
|
|
33
Makefile
33
Makefile
|
@ -10,7 +10,7 @@ endif
|
|||
OVERLAY_VARS ?=
|
||||
EUNIT_OPTS = -v
|
||||
|
||||
.PHONY: rel deps package pkgclean edoc
|
||||
.PHONY: rel stagedevrel deps package pkgclean edoc
|
||||
|
||||
all: deps compile
|
||||
|
||||
|
@ -57,6 +57,37 @@ relclean:
|
|||
stage : rel
|
||||
$(foreach dep,$(wildcard deps/*), rm -rf rel/$(REPO)/lib/$(shell basename $(dep))* && ln -sf $(abspath $(dep)) rel/$(REPO)/lib;)
|
||||
|
||||
##
|
||||
## Developer targets
|
||||
##
|
||||
## devN - Make a dev build for node N
|
||||
## stagedevN - Make a stage dev build for node N (symlink libraries)
|
||||
## devrel - Make a dev build for 1..$DEVNODES
|
||||
## stagedevrel Make a stagedev build for 1..$DEVNODES
|
||||
##
|
||||
## Example, make a 68 node devrel cluster
|
||||
## make stagedevrel DEVNODES=68
|
||||
|
||||
.PHONY : stagedevrel devrel
|
||||
DEVNODES ?= 3
|
||||
|
||||
# 'seq' is not available on all *BSD, so using an alternate in awk
|
||||
SEQ = $(shell awk 'BEGIN { for (i = 1; i < '$(DEVNODES)'; i++) printf("%i ", i); print i ;exit(0);}')
|
||||
|
||||
$(eval stagedevrel : $(foreach n,$(SEQ),stagedev$(n)))
|
||||
$(eval devrel : $(foreach n,$(SEQ),dev$(n)))
|
||||
|
||||
dev% : all
|
||||
mkdir -p dev
|
||||
rel/gen_dev $@ rel/vars/dev_vars.config.src rel/vars/$@_vars.config
|
||||
(cd rel && ../rebar generate target_dir=../dev/$@ overlay_vars=vars/$@_vars.config)
|
||||
|
||||
stagedev% : dev%
|
||||
$(foreach dep,$(wildcard deps/*), rm -rf dev/$^/lib/$(shell basename $(dep))* && ln -sf $(abspath $(dep)) dev/$^/lib;)
|
||||
|
||||
devclean: clean
|
||||
rm -rf dev
|
||||
|
||||
DIALYZER_APPS = kernel stdlib sasl erts ssl compiler eunit crypto public_key syntax_tools
|
||||
PLT = $(HOME)/.machi_dialyzer_plt
|
||||
|
||||
|
|
124
README.md
124
README.md
|
@ -1,19 +1,19 @@
|
|||
# Machi: a robust & reliable, distributed, highly available, large file store
|
||||
# Machi: a distributed, decentralized blob/large file store
|
||||
|
||||
[Travis-CI](http://travis-ci.org/basho/machi) :: ![Travis-CI](https://secure.travis-ci.org/basho/machi.png)
|
||||
|
||||
Outline
|
||||
|
||||
1. [Why another file store?](#sec1)
|
||||
1. [Why another blob/file store?](#sec1)
|
||||
2. [Where to learn more about Machi](#sec2)
|
||||
3. [Development status summary](#sec3)
|
||||
4. [Contributing to Machi's development](#sec4)
|
||||
|
||||
<a name="sec1">
|
||||
## 1. Why another file store?
|
||||
## 1. Why another blob/file store?
|
||||
|
||||
Our goal is a robust & reliable, distributed, highly available, large
|
||||
file store. Such stores already exist, both in the open source world
|
||||
file and blob store. Such stores already exist, both in the open source world
|
||||
and in the commercial world. Why reinvent the wheel? We believe
|
||||
there are three reasons, ordered by decreasing rarity.
|
||||
|
||||
|
@ -25,9 +25,8 @@ there are three reasons, ordered by decreasing rarity.
|
|||
3. We want to manage file replicas in a way that's provably correct
|
||||
and also easy to test.
|
||||
|
||||
Of all the file stores in the open source & commercial worlds, only
|
||||
criteria #3 is a viable option. Or so we hope. Or we just don't
|
||||
care, and if data gets lost or corrupted, then ... so be it.
|
||||
Criteria #3 is difficult to find in the open source world but perhaps
|
||||
not impossible.
|
||||
|
||||
If we have app use cases where availability is more important than
|
||||
consistency, then systems that meet criteria #2 are also rare.
|
||||
|
@ -39,12 +38,13 @@ file data and attempts best-effort file reads?
|
|||
|
||||
If we really do care about data loss and/or data corruption, then we
|
||||
really want both #3 and #1. Unfortunately, systems that meet
|
||||
criteria #1 are _very rare_.
|
||||
criteria #1 are _very rare_. (Nonexistant?)
|
||||
Why? This is 2015. We have decades of research that shows
|
||||
that computer hardware can (and
|
||||
indeed does) corrupt data at nearly every level of the modern
|
||||
client/server application stack. Systems with end-to-end data
|
||||
corruption detection should be ubiquitous today. Alas, they are not.
|
||||
|
||||
Machi is an effort to change the deplorable state of the world, one
|
||||
Erlang function at a time.
|
||||
|
||||
|
@ -64,49 +64,68 @@ Humming Consensus" is available online now.
|
|||
* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf)
|
||||
* [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q)
|
||||
|
||||
See later in this document for how to run the Humming Consensus demos,
|
||||
including the network partition simulator.
|
||||
|
||||
<a name="sec3">
|
||||
## 3. Development status summary
|
||||
|
||||
Mid-December 2015: work is underway.
|
||||
Mid-March 2016: The Machi development team has been downsized in
|
||||
recent months, and the pace of development has slowed. Here is a
|
||||
summary of the status of Machi's major components.
|
||||
|
||||
* In progress:
|
||||
* Code refactoring: metadata management using
|
||||
[ELevelDB](https://github.com/basho/eleveldb)
|
||||
* File repair using file-centric, Merkle-style hash tree.
|
||||
* Server-side socket handling is now performed by
|
||||
[ranch](https://github.com/ninenines/ranch)
|
||||
* QuickCheck tests for file repair correctness
|
||||
* 2015-12-15: The EUnit test `machi_ap_repair_eqc` is
|
||||
currently failing occasionally because it (correctly) detects
|
||||
double-write errors. Double-write errors will be eliminated
|
||||
when the ELevelDB integration work is complete.
|
||||
* The `make stage` and `make release` commands can be used to
|
||||
create a primitive "package". Use `./rel/machi/bin/machi console`
|
||||
to start the Machi app in interactive mode. Substitute the word
|
||||
`start` instead of console to start Machi in background/daemon
|
||||
mode. The `./rel/machi/bin/machi` command without any arguments
|
||||
will give a short usage summary.
|
||||
* Chain Replication management using the Humming Consensus
|
||||
algorithm to manage chain state is stable.
|
||||
* ... with the caveat that it runs very well in a very harsh
|
||||
and unforgiving network partition simulator but has not run
|
||||
much yet in the real world.
|
||||
* All Machi client/server protocols are based on
|
||||
[Protocol Buffers](https://developers.google.com/protocol-buffers/docs/overview).
|
||||
* The current specification for Machi's protocols can be found at
|
||||
[https://github.com/basho/machi/blob/master/src/machi.proto](https://github.com/basho/machi/blob/master/src/machi.proto).
|
||||
* The Machi PB protocol is not yet stable. Expect change!
|
||||
* The Erlang language client implementation of the high-level
|
||||
protocol flavor is brittle (e.g., little error handling yet).
|
||||
* Humming Consensus and the chain manager
|
||||
* No new safety bugs have been found by model-checking tests.
|
||||
* A new document,
|
||||
[Hands-on experiments with Machi and Humming Consensus](doc/humming-consensus-demo.md)
|
||||
is now available. It is a tutorial for setting up a 3 virtual
|
||||
machine Machi cluster and how to demonstrate the chain manager's
|
||||
reactions to server stops & starts, crashes & restarts, and pauses
|
||||
(simulated by `SIGSTOP` and `SIGCONT`).
|
||||
* The chain manager can still make suboptimal-but-safe choices for
|
||||
chain transitions when a server hangs/pauses temporarily.
|
||||
* Recent chain manager changes have made the instability window
|
||||
much shorter when the slow/paused server resumes execution.
|
||||
* Scott believes that a modest change to the chain manager's
|
||||
calculation of a new projection can reduce flapping in this (and
|
||||
many other cases) less likely. Currently, the new local
|
||||
projection is calculated using only local state (i.e., the chain
|
||||
manager's internal state + the fitness server's state).
|
||||
However, if the "latest" projection read from the public
|
||||
projection stores were also input to the new projection
|
||||
calculation function, then many obviously bad projections can be
|
||||
avoided without needing rounds of Humming Consensus to
|
||||
demonstrate that a bad projection is bad.
|
||||
|
||||
If you would like to run the network partition simulator
|
||||
mentioned in the Ricon 2015 presentation about Humming Consensus,
|
||||
please see the
|
||||
[partition simulator convergence test doc.](./doc/machi_chain_manager1_converge_demo.md)
|
||||
* FLU/data server process
|
||||
* All known correctness bugs have been fixed.
|
||||
* Performance has not yet been measured. Performance measurement
|
||||
and enhancements are scheduled to start in the middle of March 2016.
|
||||
(This will include a much-needed update to the `basho_bench` driver.)
|
||||
|
||||
If you'd like to work on a protocol such as Thrift, UBF,
|
||||
msgpack over UDP, or some other protocol, let us know by
|
||||
[opening an issue to discuss it](./issues/new).
|
||||
* Access protocols and client libraries
|
||||
* The protocol used by both external clients and internally (instead
|
||||
of using Erlang's native message passing mechanisms) is based on
|
||||
Protocol Buffers.
|
||||
* (Machi PB protocol specification: ./src/machi.proto)[./src/machi.proto]
|
||||
* At the moment, the PB specification contains two protocols.
|
||||
Sometime in the near future, the spec will be split to separate
|
||||
the external client API (the "high" protocol) from the internal
|
||||
communication API (the "low" protocol).
|
||||
|
||||
* Recent conference talks about Machi
|
||||
* Erlang Factory San Francisco 2016
|
||||
[the slides and video recording](http://www.erlang-factory.com/sfbay2016/scott-lystig-fritchie)
|
||||
will be available a few weeks after the conference ends on March
|
||||
11, 2016.
|
||||
* Ricon 2015
|
||||
* [The slides](http://ricon.io/archive/2015/slides/Scott_Fritchie_Ricon_2015.pdf)
|
||||
* and the [video recording](https://www.youtube.com/watch?v=yR5kHL1bu1Q&index=13&list=PL9Jh2HsAWHxIc7Tt2M6xez_TOP21GBH6M)
|
||||
are now available.
|
||||
* If you would like to run the Humming Consensus code (with or without
|
||||
the network partition simulator) as described in the RICON 2015
|
||||
presentation, please see the
|
||||
[Humming Consensus demo doc](./doc/humming_consensus_demo.md).
|
||||
|
||||
<a name="sec4">
|
||||
## 4. Contributing to Machi's development
|
||||
|
@ -134,13 +153,22 @@ X. The only known limitations for using R16 are minor type
|
|||
specification difference between R16 and 17, but we strongly suggest
|
||||
continuing development using version 17.
|
||||
|
||||
We also assume that you have the standard UNIX/Linux developers
|
||||
tool chain for C and C++ applications. Specifically, we assume `make`
|
||||
is available. The utility used to compile the Machi source code,
|
||||
We also assume that you have the standard UNIX/Linux developer
|
||||
tool chain for C and C++ applications. Also, we assume
|
||||
that Git and GNU Make are available.
|
||||
The utility used to compile the Machi source code,
|
||||
`rebar`, is pre-compiled and included in the repo.
|
||||
For more details, please see the
|
||||
[Machi development environment prerequisites doc](./doc/dev-prerequisites.md).
|
||||
|
||||
Machi has a dependency on the
|
||||
[ELevelDB](https://github.com/basho/eleveldb) library. ELevelDB only
|
||||
supports UNIX/Linux OSes and 64-bit versions of Erlang/OTP only; we
|
||||
apologize to Windows-based and 32-bit-based Erlang developers for this
|
||||
restriction.
|
||||
|
||||
### 4.3 New protocols and features
|
||||
|
||||
If you'd like to work on a protocol such as Thrift, UBF,
|
||||
msgpack over UDP, or some other protocol, let us know by
|
||||
[opening an issue to discuss it](./issues/new).
|
||||
|
|
30
doc/dev-clone-compile.md
Normal file
30
doc/dev-clone-compile.md
Normal file
|
@ -0,0 +1,30 @@
|
|||
# Clone and compile Machi
|
||||
|
||||
Clone the Machi source repo and compile the source and test code. Run
|
||||
the following commands at your login shell:
|
||||
|
||||
cd /tmp
|
||||
git clone https://github.com/basho/machi.git
|
||||
cd machi
|
||||
git checkout master
|
||||
make # or 'gmake' if GNU make uses an alternate name
|
||||
|
||||
Then run the unit test suite. This may take up to two minutes or so
|
||||
to finish.
|
||||
|
||||
make test
|
||||
|
||||
At the end, the test suite should report that all tests passed. The
|
||||
actual number of tests shown in the "All `X` tests passed" line may be
|
||||
different than the example below.
|
||||
|
||||
[... many lines omitted ...]
|
||||
module 'event_logger'
|
||||
module 'chain_mgr_legacy'
|
||||
=======================================================
|
||||
All 90 tests passed.
|
||||
|
||||
If you had a test failure, a likely cause may be a limit on the number
|
||||
of file descriptors available to your user process. (Recent releases
|
||||
of OS X have a limit of 1024 file descriptors, which may be too slow.)
|
||||
The output of the `limit -n` will tell you your file descriptor limit.
|
38
doc/dev-prerequisites.md
Normal file
38
doc/dev-prerequisites.md
Normal file
|
@ -0,0 +1,38 @@
|
|||
## Machi developer environment prerequisites
|
||||
|
||||
1. Machi requires an 64-bit variant of UNIX: OS X, FreeBSD, Linux, or
|
||||
Solaris machine is a standard developer environment for C and C++
|
||||
applications (64-bit versions).
|
||||
2. You'll need the `git` source management utility.
|
||||
3. You'll need the 64-bit Erlang/OTP 17 runtime environment. Please
|
||||
don't use earlier or later versions until we have a chance to fix
|
||||
the compilation warnings that versions R16B and 18 will trigger.
|
||||
Also, please verify that you are not using a 32-bit Erlang/OTP
|
||||
runtime package.
|
||||
|
||||
For `git` and the Erlang runtime, please use your OS-specific
|
||||
package manager to install these. If your package manager doesn't
|
||||
have 64-bit Erlang/OTP version 17 available, then we recommend using the
|
||||
[precompiled packages available at Erlang Solutions](https://www.erlang-solutions.com/resources/download.html).
|
||||
|
||||
Also, please verify that you have enough file descriptors available to
|
||||
your user processes. The output of `ulimit -n` should report at least
|
||||
4,000 file descriptors available. If your limit is lower (a frequent
|
||||
problem for OS X users), please increase it to at least 4,000.
|
||||
|
||||
# Using Vagrant to set up a developer environment for Machi
|
||||
|
||||
The Machi source directory contains a `Vagrantfile` for creating an
|
||||
Ubuntu Linux-based virtual machine for compiling and running Machi.
|
||||
This file is in the
|
||||
[$SRC_TOP/priv/humming-consensus-demo.vagrant](../priv/humming-consensus-demo.vagrant)
|
||||
directory.
|
||||
|
||||
If used as-is, the virtual machine specification is modest.
|
||||
|
||||
* 1 virtual CPU
|
||||
* 512MB virtual memory
|
||||
* 768MB swap space
|
||||
* 79GB sparse virtual disk image. After installing prerequisites and
|
||||
compiling Machi, the root file system uses approximately 2.7 GBytes.
|
||||
|
372
doc/humming-consensus-demo.md
Normal file
372
doc/humming-consensus-demo.md
Normal file
|
@ -0,0 +1,372 @@
|
|||
|
||||
# Table of contents
|
||||
|
||||
* [Hands-on experiments with Machi and Humming Consensus](#hands-on)
|
||||
* [Using the network partition simulator and convergence demo test code](#partition-simulator)
|
||||
|
||||
<a name="hands-on">
|
||||
# Hands-on experiments with Machi and Humming Consensus
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Please refer to the
|
||||
[Machi development environment prerequisites doc](./dev-prerequisites.md)
|
||||
for Machi developer environment prerequisites.
|
||||
|
||||
If you do not have an Erlang/OTP runtime system available, but you do
|
||||
have [the Vagrant virtual machine](https://www.vagrantup.com/) manager
|
||||
available, then please refer to the instructions in the prerequisites
|
||||
doc for using Vagrant.
|
||||
|
||||
<a name="clone-compile">
|
||||
## Clone and compile the code
|
||||
|
||||
Please see the
|
||||
[Machi 'clone and compile' doc](./dev-clone-compile.md)
|
||||
for the short list of steps required to fetch the Machi source code
|
||||
from GitHub and to compile & test Machi.
|
||||
|
||||
## Running three Machi instances on a single machine
|
||||
|
||||
All of the commands that should be run at your login shell (e.g. Bash,
|
||||
c-shell) can be cut-and-pasted from this document directly to your
|
||||
login shell prompt.
|
||||
|
||||
Run the following command:
|
||||
|
||||
make stagedevrel
|
||||
|
||||
This will create a directory structure like this:
|
||||
|
||||
|-dev1-|... stand-alone Machi app + subdirectories
|
||||
|-dev-|-dev2-|... stand-alone Machi app + directories
|
||||
|-dev3-|... stand-alone Machi app + directories
|
||||
|
||||
Each of the `dev/dev1`, `dev/dev2`, and `dev/dev3` are stand-alone
|
||||
application instances of Machi and can be run independently of each
|
||||
other on the same machine. This demo will use all three.
|
||||
|
||||
The lifecycle management utilities for Machi are a bit immature,
|
||||
currently. They assume that each Machi server runs on a host with a
|
||||
unique hostname -- there is no flexibility built-in yet to easily run
|
||||
multiple Machi instances on the same machine. To continue with the
|
||||
demo, we need to use `sudo` or `su` to obtain superuser privileges to
|
||||
edit the `/etc/hosts` file.
|
||||
|
||||
Please add the following line to `/etc/hosts`, using this command:
|
||||
|
||||
sudo sh -c 'echo "127.0.0.1 machi1 machi2 machi3" >> /etc/hosts'
|
||||
|
||||
Next, we will use a shell script to finish setting up our cluster. It
|
||||
will do the following for us:
|
||||
|
||||
* Verify that the new line that was added to `/etc/hosts` is correct.
|
||||
* Modify the `etc/app.config` files to configure the Humming Consensus
|
||||
chain manager's actions logged to the `log/console.log` file.
|
||||
* Start the three application instances.
|
||||
* Verify that the three instances are running correctly.
|
||||
* Configure a single chain, with one FLU server per application
|
||||
instance.
|
||||
|
||||
Please run this script using this command:
|
||||
|
||||
./priv/humming-consensus-demo.setup.sh
|
||||
|
||||
If the output looks like this (and exits with status zero), then the
|
||||
script was successful.
|
||||
|
||||
Step: Verify that the required entries in /etc/hosts are present
|
||||
Step: add a verbose logging option to app.config
|
||||
Step: start three three Machi application instances
|
||||
pong
|
||||
pong
|
||||
pong
|
||||
Step: configure one chain to start a Humming Consensus group with three members
|
||||
Result: ok
|
||||
Result: ok
|
||||
Result: ok
|
||||
|
||||
We have now created a single replica chain, called `c1`, that has
|
||||
three file servers participating in the chain. Thanks to the
|
||||
hostnames that we added to `/etc/hosts`, all are using the localhost
|
||||
network interface.
|
||||
|
||||
| App instance | Pseudo | FLU name | TCP port |
|
||||
| directory | Hostname | | number |
|
||||
|--------------+----------+----------+----------|
|
||||
| dev1 | machi1 | flu1 | 20401 |
|
||||
| dev2 | machi2 | flu2 | 20402 |
|
||||
| dev3 | machi3 | flu3 | 20403 |
|
||||
|
||||
The log files for each application instance can be found in the
|
||||
`./dev/devN/log/console.log` file, where the `N` is the instance
|
||||
number: 1, 2, or 3.
|
||||
|
||||
## Understanding the chain manager's log file output
|
||||
|
||||
After running the `./priv/humming-consensus-demo.setup.sh` script,
|
||||
let's look at the last few lines of the `./dev/dev1/log/console.log`
|
||||
log file for Erlang VM process #1.
|
||||
|
||||
2016-03-09 10:16:35.676 [info] <0.105.0>@machi_lifecycle_mgr:process_pending_flu:422 Started FLU f1 with supervisor pid <0.128.0>
|
||||
2016-03-09 10:16:35.676 [info] <0.105.0>@machi_lifecycle_mgr:move_to_flu_config:540 Creating FLU config file f1
|
||||
2016-03-09 10:16:35.790 [info] <0.105.0>@machi_lifecycle_mgr:bootstrap_chain2:312 Configured chain c1 via FLU f1 to mode=ap_mode all=[f1,f2,f3] witnesses=[]
|
||||
2016-03-09 10:16:35.790 [info] <0.105.0>@machi_lifecycle_mgr:move_to_chain_config:546 Creating chain config file c1
|
||||
2016-03-09 10:16:44.139 [info] <0.132.0> CONFIRM epoch 1141 <<155,42,7,221>> upi [] rep [] auth f1 by f1
|
||||
2016-03-09 10:16:44.271 [info] <0.132.0> CONFIRM epoch 1148 <<57,213,154,16>> upi [f1] rep [] auth f1 by f1
|
||||
2016-03-09 10:16:44.864 [info] <0.132.0> CONFIRM epoch 1151 <<239,29,39,70>> upi [f1] rep [f3] auth f1 by f1
|
||||
2016-03-09 10:16:45.235 [info] <0.132.0> CONFIRM epoch 1152 <<173,17,66,225>> upi [f2] rep [f1,f3] auth f2 by f1
|
||||
2016-03-09 10:16:47.343 [info] <0.132.0> CONFIRM epoch 1154 <<154,231,224,149>> upi [f2,f1,f3] rep [] auth f2 by f1
|
||||
|
||||
Let's pick apart some of these lines. We have started all three
|
||||
servers at about the same time. We see some race conditions happen,
|
||||
and some jostling and readjustment happens pretty quickly in the first
|
||||
few seconds.
|
||||
|
||||
* `Started FLU f1 with supervisor pid <0.128.0>`
|
||||
* This VM, #1,
|
||||
started a FLU (Machi data server) with the name `f1`. In the Erlang
|
||||
process supervisor hierarchy, the process ID of the top supervisor
|
||||
is `<0.128.0>`.
|
||||
* `Configured chain c1 via FLU f1 to mode=ap_mode all=[f1,f2,f3] witnesses=[]`
|
||||
* A bootstrap configuration for a chain named `c1` has been created.
|
||||
* The FLUs/data servers that are eligible for participation in the
|
||||
chain have names `f1`, `f2`, and `f3`.
|
||||
* The chain will operate in eventual consistency mode (`ap_mode`)
|
||||
* The witness server list is empty. Witness servers are never used
|
||||
in eventual consistency mode.
|
||||
* `CONFIRM epoch 1141 <<155,42,7,221>> upi [] rep [] auth f1 by f1`
|
||||
* All participants in epoch 1141 are unanimous in adopting epoch
|
||||
1141's projection. All active membership lists are empty, so
|
||||
there is no functional chain replication yet, at least as far as
|
||||
server `f1` knows
|
||||
* The epoch's abbreviated checksum is `<<155,42,7,221>>`.
|
||||
* The UPI list, i.e. the replicas whose data is 100% in sync is
|
||||
`[]`, the empty list. (UPI = Update Propagation Invariant)
|
||||
* The list of servers that are under data repair (`rep`) is also
|
||||
empty, `[]`.
|
||||
* This projection was authored by server `f1`.
|
||||
* The log message was generated by server `f1`.
|
||||
* `CONFIRM epoch 1148 <<57,213,154,16>> upi [f1] rep [] auth f1 by f1`
|
||||
* Now the server `f1` has created a chain of length 1, `[f1]`.
|
||||
* Chain repair/file re-sync is not required when the UPI server list
|
||||
changes from length 0 -> 1.
|
||||
* `CONFIRM epoch 1151 <<239,29,39,70>> upi [f1] rep [f3] auth f1 by f1`
|
||||
* Server `f1` has noticed that server `f3` is alive. Apparently it
|
||||
has not yet noticed that server `f2` is also running.
|
||||
* Server `f3` is in the repair list.
|
||||
* `CONFIRM epoch 1152 <<173,17,66,225>> upi [f2] rep [f1,f3] auth f2 by f1`
|
||||
* Server `f2` is apparently now aware that all three servers are running.
|
||||
* The previous configuration used by `f2` was `upi [f2]`, i.e., `f2`
|
||||
was running in a chain of one. `f2` noticed that `f1` and `f3`
|
||||
were now available and has started adding them to the chain.
|
||||
* All new servers are always added to the tail of the chain in the
|
||||
repair list.
|
||||
* In eventual consistency mode, a UPI change like this is OK.
|
||||
* When performing a read, a client must read from both tail of the
|
||||
UPI list and also from all repairing servers.
|
||||
* When performing a write, the client writes to both the UPI
|
||||
server list and also the repairing list, in that order.
|
||||
* I.e., the client concatenates both lists,
|
||||
`UPI ++ Repairing`, for its chain configuration for the write.
|
||||
* Server `f2` will trigger file repair/re-sync shortly.
|
||||
* The waiting time for starting repair has been configured to be
|
||||
extremely short, 1 second. The default waiting time is 10
|
||||
seconds, in case Humming Consensus remains unstable.
|
||||
* `CONFIRM epoch 1154 <<154,231,224,149>> upi [f2,f1,f3] rep [] auth f2 by f1`
|
||||
* File repair/re-sync has finished. All file data on all servers
|
||||
are now in sync.
|
||||
* The UPI/in-sync part of the chain is now `[f2,f1,f3]`, and there
|
||||
are no servers under repair.
|
||||
|
||||
## Let's create some failures
|
||||
|
||||
Here are some suggestions for creating failures.
|
||||
|
||||
* Use the `./dev/devN/bin/machi stop` and `./dev/devN/bin/machi start`
|
||||
commands to stop & start VM #`N`.
|
||||
* Stop a VM abnormally by using `kill`. The OS process name to look
|
||||
for is `beam.smp`.
|
||||
* Suspend and resume a VM, using the `SIGSTOP` and `SIGCONT` signals.
|
||||
* E.g. `kill -STOP 9823` and `kill -CONT 9823`
|
||||
|
||||
The network partition simulator is not (yet) available when running
|
||||
Machi in this mode. Please see the next section for instructions on
|
||||
how to use partition simulator.
|
||||
|
||||
|
||||
<a name="partition-simulator">
|
||||
# Using the network partition simulator and convergence demo test code
|
||||
|
||||
This is the demo code mentioned in the presentation that Scott Lystig
|
||||
Fritchie gave at the
|
||||
[RICON 2015 conference](http://ricon.io).
|
||||
* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf)
|
||||
* [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q)
|
||||
|
||||
## A complete example of all input and output
|
||||
|
||||
If you don't have an Erlang/OTP 17 runtime environment available,
|
||||
please see this file for full input and output of a strong consistency
|
||||
length=3 chain test:
|
||||
https://gist.github.com/slfritchie/8352efc88cc18e62c72c
|
||||
This file contains all commands input and all simulator output from a
|
||||
sample run of the simulator.
|
||||
|
||||
To help interpret the output of the test, please skip ahead to the
|
||||
"The test output is very verbose" section.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
If you don't have `git` and/or the Erlang 17 runtime system available
|
||||
on your OS X, FreeBSD, Linux, or Solaris machine, please take a look
|
||||
at the [Prerequisites section](#prerequisites) first. When you have
|
||||
installed the prerequisite software, please return back here.
|
||||
|
||||
## Clone and compile the code
|
||||
|
||||
Please briefly visit the [Clone and compile the code](#clone-compile)
|
||||
section. When finished, please return back here.
|
||||
|
||||
## Run an interactive Erlang CLI shell
|
||||
|
||||
Run the following command at your login shell:
|
||||
|
||||
erl -pz .eunit ebin deps/*/ebin
|
||||
|
||||
If you are using Erlang/OTP version 17, you should see some CLI output
|
||||
that looks like this:
|
||||
|
||||
Erlang/OTP 17 [erts-6.4] [source] [64-bit] [smp:8:8] [async-threads:10] [hipe] [kernel-poll:false] [dtrace]
|
||||
|
||||
Eshell V6.4 (abort with ^G)
|
||||
1>
|
||||
|
||||
## The test output is very verbose ... what are the important parts?
|
||||
|
||||
The output of the Erlang command
|
||||
`machi_chain_manager1_converge_demo:help()` will display the following
|
||||
guide to the output of the tests.
|
||||
|
||||
A visualization of the convergence behavior of the chain self-management
|
||||
algorithm for Machi.
|
||||
|
||||
1. Set up some server and chain manager pairs.
|
||||
2. Create a number of different network partition scenarios, where
|
||||
(simulated) partitions may be symmetric or asymmetric. Then stop changing
|
||||
the partitions and keep the simulated network stable (and perhaps broken).
|
||||
3. Run a number of iterations of the algorithm in parallel by poking each
|
||||
of the manager processes on a random'ish basis.
|
||||
4. Afterward, fetch the chain transition changes made by each FLU and
|
||||
verify that no transition was unsafe.
|
||||
|
||||
During the iteration periods, the following is a cheatsheet for the output.
|
||||
See the internal source for interpreting the rest of the output.
|
||||
|
||||
'SET partitions = '
|
||||
|
||||
A pair-wise list of actors which cannot send messages. The
|
||||
list is uni-directional. If there are three servers (a,b,c),
|
||||
and if the partitions list is '[{a,b},{b,c}]' then all
|
||||
messages from a->b and b->c will be dropped, but any other
|
||||
sender->recipient messages will be delivered successfully.
|
||||
|
||||
'x uses:'
|
||||
|
||||
The FLU x has made an internal state transition and is using
|
||||
this epoch's projection as operating chain configuration. The
|
||||
rest of the line is a summary of the projection.
|
||||
|
||||
'CONFIRM epoch {N}'
|
||||
|
||||
This message confirms that all of the servers listed in the
|
||||
UPI and repairing lists of the projection at epoch {N} have
|
||||
agreed to use this projection because they all have written
|
||||
this projection to their respective private projection stores.
|
||||
The chain is now usable by/available to all clients.
|
||||
|
||||
'Sweet, private projections are stable'
|
||||
|
||||
This report announces that this iteration of the test cycle
|
||||
has passed successfully. The report that follows briefly
|
||||
summarizes the latest private projection used by each
|
||||
participating server. For example, when in strong consistency
|
||||
mode with 'a' as a witness and 'b' and 'c' as real servers:
|
||||
|
||||
%% Legend:
|
||||
%% server name, epoch ID, UPI list, repairing list, down list, ...
|
||||
%% ... witness list, 'false' (a constant value)
|
||||
|
||||
[{a,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}},
|
||||
{b,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}]
|
||||
|
||||
Both servers 'a' and 'b' agree on epoch 1116 with epoch ID
|
||||
{1116,<<23,143,246,55>>} where UPI=[a,b], repairing=[],
|
||||
down=[c], and witnesses=[a].
|
||||
|
||||
Server 'c' is not shown because 'c' has wedged itself OOS (out
|
||||
of service) by configuring a chain length of zero.
|
||||
|
||||
If no servers are listed in the report (i.e. only '[]' is
|
||||
displayed), then all servers have wedged themselves OOS, and
|
||||
the chain is unavailable.
|
||||
|
||||
'DoIt,'
|
||||
|
||||
This marks a group of tick events which trigger the manager
|
||||
processes to evaluate their environment and perhaps make a
|
||||
state transition.
|
||||
|
||||
A long chain of 'DoIt,DoIt,DoIt,' means that the chain state has
|
||||
(probably) settled to a stable configuration, which is the goal of the
|
||||
algorithm.
|
||||
|
||||
Press control-c to interrupt the test....".
|
||||
|
||||
## Run a test in eventual consistency mode
|
||||
|
||||
Run the following command at the Erlang CLI prompt:
|
||||
|
||||
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}]).
|
||||
|
||||
The first argument, `3`, is the number of servers to participate in
|
||||
the chain. Please note:
|
||||
|
||||
* Chain lengths as short as 1 or 2 are valid, but the results are a
|
||||
bit boring.
|
||||
* Chain lengths as long as 7 or 9 can be used, but they may
|
||||
suffer from longer periods of churn/instability before all chain
|
||||
managers reach agreement via humming consensus. (It is future work
|
||||
to shorten the worst of the unstable churn latencies.)
|
||||
* In eventual consistency mode, chain lengths may be even numbers,
|
||||
e.g. 2, 4, or 6.
|
||||
* The simulator will choose partition events from the permutations of
|
||||
all 1, 2, and 3 node partition pairs. The total runtime will
|
||||
increase *dramatically* with chain length.
|
||||
* Chain length 2: about 3 partition cases
|
||||
* Chain length 3: about 35 partition cases
|
||||
* Chain length 4: about 230 partition cases
|
||||
* Chain length 5: about 1100 partition cases
|
||||
|
||||
## Run a test in strong consistency mode (with witnesses):
|
||||
|
||||
*NOTE:* Due to a bug in the test code, please do not try to run the
|
||||
convergence test in strong consistency mode and also without the
|
||||
correct minority number of witness servers! If in doubt, please run
|
||||
the commands shown below exactly.
|
||||
|
||||
Run the following command at the Erlang CLI prompt:
|
||||
|
||||
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]).
|
||||
|
||||
The first argument, `3`, is the number of servers to participate in
|
||||
the chain. Chain lengths as long as 7 or 9 can be used, but they may
|
||||
suffer from longer periods of churn/instability before all chain
|
||||
managers reach agreement via humming consensus.
|
||||
|
||||
Due to the bug mentioned above, please use the following
|
||||
commands when running with chain lengths of 5 or 7, respectively.
|
||||
|
||||
machi_chain_manager1_converge_demo:t(5, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b]}]).
|
||||
machi_chain_manager1_converge_demo:t(7, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b,c]}]).
|
||||
|
|
@ -1,185 +0,0 @@
|
|||
|
||||
# Using the network partition simulator and convergence demo test code
|
||||
|
||||
## A complete example of all input and output
|
||||
|
||||
If you don't have an Erlang/OTP 17 runtime environment available,
|
||||
please see this file for full input and output of a strong consistency
|
||||
length=3 chain test:
|
||||
https://gist.github.com/slfritchie/8352efc88cc18e62c72c
|
||||
This file contains all commands input and all simulator output from a
|
||||
sample run of the simulator.
|
||||
|
||||
To help interpret the output of the test, please skip ahead to the
|
||||
"The test output is very verbose" section.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. You'll need the `git` source management
|
||||
2. You'll need the Erlang/OTP 17 runtime environment. Please don't
|
||||
use earlier or later versions until we have a chance to fix the
|
||||
compilation warnings that versions R16B and 18 will trigger.
|
||||
|
||||
All of the commands that should be run at your login shell (e.g. Bash,
|
||||
c-shell) can be cut-and-pasted from this document directly to your
|
||||
login shell prompt.
|
||||
|
||||
## Clone and compile the code
|
||||
|
||||
Clone the Machi source repo and compile the source and test code. Run
|
||||
the following commands at your login shell:
|
||||
|
||||
cd /tmp
|
||||
git clone https://github.com/basho/machi.git
|
||||
cd machi
|
||||
git checkout master
|
||||
make
|
||||
|
||||
Then run the unit test suite. This may take up to two minutes or so
|
||||
to finish. Most of the tests will be silent; please be patient until
|
||||
the tests finish.
|
||||
|
||||
make test
|
||||
|
||||
## Run an interactive Erlang CLI shell
|
||||
|
||||
Run the following command at your login shell:
|
||||
|
||||
erl -pz .eunit ebin deps/*/ebin
|
||||
|
||||
If you are using Erlang/OTP version 17, you should see some CLI output
|
||||
that looks like this:
|
||||
|
||||
Erlang/OTP 17 [erts-6.4] [source] [64-bit] [smp:8:8] [async-threads:10] [hipe] [kernel-poll:false] [dtrace]
|
||||
|
||||
Eshell V6.4 (abort with ^G)
|
||||
1>
|
||||
|
||||
## The test output is very verbose ... what are the important parts?
|
||||
|
||||
The output of the Erlang command
|
||||
`machi_chain_manager1_converge_demo:help()` will display the following
|
||||
guide to the output of the tests.
|
||||
|
||||
A visualization of the convergence behavior of the chain self-management
|
||||
algorithm for Machi.
|
||||
|
||||
1. Set up some server and chain manager pairs.
|
||||
2. Create a number of different network partition scenarios, where
|
||||
(simulated) partitions may be symmetric or asymmetric. Then stop changing
|
||||
the partitions and keep the simulated network stable (and perhaps broken).
|
||||
3. Run a number of iterations of the algorithm in parallel by poking each
|
||||
of the manager processes on a random'ish basis.
|
||||
4. Afterward, fetch the chain transition changes made by each FLU and
|
||||
verify that no transition was unsafe.
|
||||
|
||||
During the iteration periods, the following is a cheatsheet for the output.
|
||||
See the internal source for interpreting the rest of the output.
|
||||
|
||||
'SET partitions = '
|
||||
|
||||
A pair-wise list of actors which cannot send messages. The
|
||||
list is uni-directional. If there are three servers (a,b,c),
|
||||
and if the partitions list is '[{a,b},{b,c}]' then all
|
||||
messages from a->b and b->c will be dropped, but any other
|
||||
sender->recipient messages will be delivered successfully.
|
||||
|
||||
'x uses:'
|
||||
|
||||
The FLU x has made an internal state transition and is using
|
||||
this epoch's projection as operating chain configuration. The
|
||||
rest of the line is a summary of the projection.
|
||||
|
||||
'CONFIRM epoch {N}'
|
||||
|
||||
This message confirms that all of the servers listed in the
|
||||
UPI and repairing lists of the projection at epoch {N} have
|
||||
agreed to use this projection because they all have written
|
||||
this projection to their respective private projection stores.
|
||||
The chain is now usable by/available to all clients.
|
||||
|
||||
'Sweet, private projections are stable'
|
||||
|
||||
This report announces that this iteration of the test cycle
|
||||
has passed successfully. The report that follows briefly
|
||||
summarizes the latest private projection used by each
|
||||
participating server. For example, when in strong consistency
|
||||
mode with 'a' as a witness and 'b' and 'c' as real servers:
|
||||
|
||||
%% Legend:
|
||||
%% server name, epoch ID, UPI list, repairing list, down list, ...
|
||||
%% ... witness list, 'false' (a constant value)
|
||||
|
||||
[{a,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}},
|
||||
{b,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}]
|
||||
|
||||
Both servers 'a' and 'b' agree on epoch 1116 with epoch ID
|
||||
{1116,<<23,143,246,55>>} where UPI=[a,b], repairing=[],
|
||||
down=[c], and witnesses=[a].
|
||||
|
||||
Server 'c' is not shown because 'c' has wedged itself OOS (out
|
||||
of service) by configuring a chain length of zero.
|
||||
|
||||
If no servers are listed in the report (i.e. only '[]' is
|
||||
displayed), then all servers have wedged themselves OOS, and
|
||||
the chain is unavailable.
|
||||
|
||||
'DoIt,'
|
||||
|
||||
This marks a group of tick events which trigger the manager
|
||||
processes to evaluate their environment and perhaps make a
|
||||
state transition.
|
||||
|
||||
A long chain of 'DoIt,DoIt,DoIt,' means that the chain state has
|
||||
(probably) settled to a stable configuration, which is the goal of the
|
||||
algorithm.
|
||||
|
||||
Press control-c to interrupt the test....".
|
||||
|
||||
## Run a test in eventual consistency mode
|
||||
|
||||
Run the following command at the Erlang CLI prompt:
|
||||
|
||||
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}]).
|
||||
|
||||
The first argument, `3`, is the number of servers to participate in
|
||||
the chain. Please note:
|
||||
|
||||
* Chain lengths as short as 1 or 2 are valid, but the results are a
|
||||
bit boring.
|
||||
* Chain lengths as long as 7 or 9 can be used, but they may
|
||||
suffer from longer periods of churn/instability before all chain
|
||||
managers reach agreement via humming consensus. (It is future work
|
||||
to shorten the worst of the unstable churn latencies.)
|
||||
* In eventual consistency mode, chain lengths may be even numbers,
|
||||
e.g. 2, 4, or 6.
|
||||
* The simulator will choose partition events from the permutations of
|
||||
all 1, 2, and 3 node partition pairs. The total runtime will
|
||||
increase *dramatically* with chain length.
|
||||
* Chain length 2: about 3 partition cases
|
||||
* Chain length 3: about 35 partition cases
|
||||
* Chain length 4: about 230 partition cases
|
||||
* Chain length 5: about 1100 partition cases
|
||||
|
||||
## Run a test in strong consistency mode (with witnesses):
|
||||
|
||||
*NOTE:* Due to a bug in the test code, please do not try to run the
|
||||
convergence test in strong consistency mode and also without the
|
||||
correct minority number of witness servers! If in doubt, please run
|
||||
the commands shown below exactly.
|
||||
|
||||
Run the following command at the Erlang CLI prompt:
|
||||
|
||||
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]).
|
||||
|
||||
The first argument, `3`, is the number of servers to participate in
|
||||
the chain. Chain lengths as long as 7 or 9 can be used, but they may
|
||||
suffer from longer periods of churn/instability before all chain
|
||||
managers reach agreement via humming consensus.
|
||||
|
||||
Due to the bug mentioned above, please use the following
|
||||
commands when running with chain lengths of 5 or 7, respectively.
|
||||
|
||||
machi_chain_manager1_converge_demo:t(5, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b]}]).
|
||||
machi_chain_manager1_converge_demo:t(7, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b,c]}]).
|
||||
|
56
priv/humming-consensus-demo.setup.sh
Executable file
56
priv/humming-consensus-demo.setup.sh
Executable file
|
@ -0,0 +1,56 @@
|
|||
#!/bin/sh
|
||||
|
||||
echo "Step: Verify that the required entries in /etc/hosts are present"
|
||||
for i in 1 2 3; do
|
||||
grep machi$i /etc/hosts | egrep -s '^127.0.0.1' > /dev/null 2>&1
|
||||
if [ $? -ne 0 ]; then
|
||||
echo ""
|
||||
echo "'grep -s machi$i' failed. Aborting, sorry."
|
||||
exit 1
|
||||
fi
|
||||
ping -c 1 machi$i > /dev/null 2>&1
|
||||
if [ $? -ne 0 ]; then
|
||||
echo ""
|
||||
echo "Ping attempt on host machi$i failed. Aborting."
|
||||
echo ""
|
||||
ping -c 1 machi$i
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
echo "Step: add a verbose logging option to app.config"
|
||||
for i in 1 2 3; do
|
||||
ed ./dev/dev$i/etc/app.config <<EOF > /dev/null 2>&1
|
||||
/verbose_confirm
|
||||
a
|
||||
{chain_manager_opts, [{private_write_verbose_confirm,true}]},
|
||||
{stability_time, 1},
|
||||
.
|
||||
w
|
||||
q
|
||||
EOF
|
||||
done
|
||||
|
||||
echo "Step: start three three Machi application instances"
|
||||
for i in 1 2 3; do
|
||||
./dev/dev$i/bin/machi start
|
||||
./dev/dev$i/bin/machi ping
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Sorry, a 'ping' check for instance dev$i failed. Aborting."
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
echo "Step: configure one chain to start a Humming Consensus group with three members"
|
||||
|
||||
# Note: $CWD of each Machi proc is two levels below the source code root dir.
|
||||
LIFECYCLE000=../../priv/quick-admin-examples/demo-000
|
||||
for i in 3 2 1; do
|
||||
./dev/dev$i/bin/machi-admin quick-admin-apply $LIFECYCLE000 machi$i
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Sorry, 'machi-admin quick-admin-apply failed' on machi$i. Aborting."
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
exit 0
|
93
priv/humming-consensus-demo.vagrant/Vagrantfile
vendored
Normal file
93
priv/humming-consensus-demo.vagrant/Vagrantfile
vendored
Normal file
|
@ -0,0 +1,93 @@
|
|||
# -*- mode: ruby -*-
|
||||
# vi: set ft=ruby :
|
||||
|
||||
# All Vagrant configuration is done below. The "2" in Vagrant.configure
|
||||
# configures the configuration version (we support older styles for
|
||||
# backwards compatibility). Please don't change it unless you know what
|
||||
# you're doing.
|
||||
Vagrant.configure(2) do |config|
|
||||
# The most common configuration options are documented and commented below.
|
||||
# For a complete reference, please see the online documentation at
|
||||
# https://docs.vagrantup.com.
|
||||
|
||||
# Every Vagrant development environment requires a box. You can search for
|
||||
# boxes at https://atlas.hashicorp.com/search.
|
||||
# If this Vagrant box has not been downloaded before (e.g. using "vagrant box add"),
|
||||
# then Vagrant will automatically download the VM image from HashiCorp.
|
||||
config.vm.box = "hashicorp/precise64"
|
||||
# If using a FreeBSD box, Bash may not be installed.
|
||||
# Use the config.ssh.shell setting to specify an alternate shell.
|
||||
# Note, however, that any code in the 'config.vm.provision' section
|
||||
# would then have to use this shell's syntax!
|
||||
# config.ssh.shell = "/bin/csh -l"
|
||||
|
||||
# Disable automatic box update checking. If you disable this, then
|
||||
# boxes will only be checked for updates when the user runs
|
||||
# `vagrant box outdated`. This is not recommended.
|
||||
# config.vm.box_check_update = false
|
||||
|
||||
# Create a forwarded port mapping which allows access to a specific port
|
||||
# within the machine from a port on the host machine. In the example below,
|
||||
# accessing "localhost:8080" will access port 80 on the guest machine.
|
||||
# config.vm.network "forwarded_port", guest: 80, host: 8080
|
||||
|
||||
# Create a private network, which allows host-only access to the machine
|
||||
# using a specific IP.
|
||||
# config.vm.network "private_network", ip: "192.168.33.10"
|
||||
|
||||
# Create a public network, which generally matched to bridged network.
|
||||
# Bridged networks make the machine appear as another physical device on
|
||||
# your network.
|
||||
# config.vm.network "public_network"
|
||||
|
||||
# Share an additional folder to the guest VM. The first argument is
|
||||
# the path on the host to the actual folder. The second argument is
|
||||
# the path on the guest to mount the folder. And the optional third
|
||||
# argument is a set of non-required options.
|
||||
# config.vm.synced_folder "../data", "/vagrant_data"
|
||||
|
||||
# Provider-specific configuration so you can fine-tune various
|
||||
# backing providers for Vagrant. These expose provider-specific options.
|
||||
# Example for VirtualBox:
|
||||
#
|
||||
config.vm.provider "virtualbox" do |vb|
|
||||
# Display the VirtualBox GUI when booting the machine
|
||||
# vb.gui = true
|
||||
|
||||
# Customize the amount of memory on the VM:
|
||||
vb.memory = "512"
|
||||
end
|
||||
#
|
||||
# View the documentation for the provider you are using for more
|
||||
# information on available options.
|
||||
|
||||
# Define a Vagrant Push strategy for pushing to Atlas. Other push strategies
|
||||
# such as FTP and Heroku are also available. See the documentation at
|
||||
# https://docs.vagrantup.com/v2/push/atlas.html for more information.
|
||||
# config.push.define "atlas" do |push|
|
||||
# push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME"
|
||||
# end
|
||||
|
||||
# Enable provisioning with a shell script. Additional provisioners such as
|
||||
# Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the
|
||||
# documentation for more information about their specific syntax and use.
|
||||
config.vm.provision "shell", inline: <<-SHELL
|
||||
# Install prerequsites
|
||||
# Support here for FreeBSD is experimental
|
||||
apt-get update ; sudo apt-get install -y git sudo rsync ; # Ubuntu Linux
|
||||
env ASSUME_ALWAYS_YES=yes pkg install -f git sudo rsync ; # FreeBSD 10
|
||||
|
||||
# Install dependent packages, using slf-configurator
|
||||
git clone https://github.com/slfritchie/slf-configurator.git
|
||||
chown -R vagrant ./slf-configurator
|
||||
(cd slf-configurator ; sudo sh -x ./ALL.sh)
|
||||
echo 'export PATH=${PATH}:/usr/local/erlang/17.5/bin' >> ~vagrant/.bashrc
|
||||
export PATH=${PATH}:/usr/local/erlang/17.5/bin
|
||||
## echo 'set path = ( $path /usr/local/erlang/17.5/bin )' >> ~vagrant/.cshrc
|
||||
## setenv PATH /usr/local/erlang/17.5/bin:$PATH
|
||||
|
||||
git clone https://github.com/basho/machi.git
|
||||
(cd machi ; git checkout master ; make && make test )
|
||||
chown -R vagrant ./machi
|
||||
SHELL
|
||||
end
|
7
priv/quick-admin-examples/demo-000
Normal file
7
priv/quick-admin-examples/demo-000
Normal file
|
@ -0,0 +1,7 @@
|
|||
{host, "machi1", []}.
|
||||
{host, "machi2", []}.
|
||||
{host, "machi3", []}.
|
||||
{flu,f1,"machi1",20401,[]}.
|
||||
{flu,f2,"machi2",20402,[]}.
|
||||
{flu,f3,"machi3",20403,[]}.
|
||||
{chain,c1,[f1,f2,f3],[]}.
|
|
@ -16,6 +16,10 @@
|
|||
%% Default = 10
|
||||
%% {metadata_manager_count, 2},
|
||||
|
||||
%% Default options for chain manager processes.
|
||||
%% {chain_manager_opts, [{private_write_verbose,true},
|
||||
%% {private_write_verbose_confirm,true}]},
|
||||
|
||||
%% Platform vars (mirror of reltool packaging)
|
||||
{platform_data_dir, "{{platform_data_dir}}"},
|
||||
{platform_etc_dir, "{{platform_etc_dir}}"},
|
||||
|
|
16
rel/gen_dev
Executable file
16
rel/gen_dev
Executable file
|
@ -0,0 +1,16 @@
|
|||
#! /bin/sh
|
||||
#
|
||||
# Example usage: gen_dev dev4 vars.src vars
|
||||
#
|
||||
# Generate an overlay config for devNNN from vars.src and write to vars
|
||||
#
|
||||
|
||||
NAME=$1
|
||||
TEMPLATE=$2
|
||||
VARFILE=$3
|
||||
|
||||
NODE="$NAME@127.0.0.1"
|
||||
|
||||
echo "Generating $NAME - node='$NODE'"
|
||||
sed -e "s/@NODE@/$NODE/" \
|
||||
< $TEMPLATE > $VARFILE
|
|
@ -106,6 +106,7 @@
|
|||
{copy, "../priv/quick-admin-examples/000", "priv/quick-admin-examples"},
|
||||
{copy, "../priv/quick-admin-examples/001", "priv/quick-admin-examples"},
|
||||
{copy, "../priv/quick-admin-examples/002", "priv/quick-admin-examples"},
|
||||
{copy, "../priv/quick-admin-examples/demo-000", "priv/quick-admin-examples/demo-000"},
|
||||
|
||||
{mkdir, "lib/basho-patches"}
|
||||
%% {copy, "../apps/machi/ebin/etop_txt.beam", "lib/basho-patches"}
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
%% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*-
|
||||
%% ex: ft=erlang ts=4 sw=4 et
|
||||
|
||||
%% NOTE: When modifying this file, also keep its near cousin
|
||||
%% config file rel/vars/dev_vars.config.src in sync!
|
||||
|
||||
%% Platform-specific installation paths
|
||||
{platform_bin_dir, "./bin"}.
|
||||
{platform_data_dir, "./data"}.
|
||||
|
|
48
rel/vars/dev_vars.config.src
Normal file
48
rel/vars/dev_vars.config.src
Normal file
|
@ -0,0 +1,48 @@
|
|||
%% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*-
|
||||
%% ex: ft=erlang ts=4 sw=4 et
|
||||
|
||||
%% NOTE: When modifying this file, also keep its near cousin
|
||||
%% config file rel/vars/dev_vars.config.src in sync!
|
||||
|
||||
%% Platform-specific installation paths
|
||||
{platform_bin_dir, "./bin"}.
|
||||
{platform_data_dir, "./data"}.
|
||||
{platform_etc_dir, "./etc"}.
|
||||
{platform_lib_dir, "./lib"}.
|
||||
{platform_log_dir, "./log"}.
|
||||
|
||||
%%
|
||||
%% etc/app.config
|
||||
%%
|
||||
{sasl_error_log, "{{platform_log_dir}}/sasl-error.log"}.
|
||||
{sasl_log_dir, "{{platform_log_dir}}/sasl"}.
|
||||
|
||||
%% lager
|
||||
{console_log_default, file}.
|
||||
|
||||
%%
|
||||
%% etc/vm.args
|
||||
%%
|
||||
{node, "@NODE@"}.
|
||||
{crash_dump, "{{platform_log_dir}}/erl_crash.dump"}.
|
||||
|
||||
%%
|
||||
%% bin/machi
|
||||
%%
|
||||
{runner_script_dir, "\`cd \\`dirname $0\\` 1>/dev/null && /bin/pwd\`"}.
|
||||
{runner_base_dir, "{{runner_script_dir}}/.."}.
|
||||
{runner_etc_dir, "$RUNNER_BASE_DIR/etc"}.
|
||||
{runner_log_dir, "$RUNNER_BASE_DIR/log"}.
|
||||
{runner_lib_dir, "$RUNNER_BASE_DIR/lib"}.
|
||||
{runner_patch_dir, "$RUNNER_BASE_DIR/lib/basho-patches"}.
|
||||
{pipe_dir, "/tmp/$RUNNER_BASE_DIR/"}.
|
||||
{runner_user, ""}.
|
||||
{runner_wait_process, "machi_flu_sup"}.
|
||||
{runner_ulimit_warn, 65536}.
|
||||
|
||||
%%
|
||||
%% cuttlefish
|
||||
%%
|
||||
{cuttlefish, ""}. % blank = off
|
||||
{cuttlefish_conf, "machi.conf"}.
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
{application, machi, [
|
||||
{description, "A village of write-once files."},
|
||||
{vsn, "0.0.0"},
|
||||
{applications, [kernel, stdlib, crypto, cluster_info]},
|
||||
{vsn, "0.0.1"},
|
||||
{applications, [kernel, stdlib, crypto, cluster_info, ranch]},
|
||||
{mod,{machi_app,[]}},
|
||||
{registered, []},
|
||||
{env, [
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
%% -------------------------------------------------------------------
|
||||
%%
|
||||
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
|
||||
%% Copyright (c) 2007-2016 Basho Technologies, Inc. All Rights Reserved.
|
||||
%%
|
||||
%% This file is provided to you under the Apache License,
|
||||
%% Version 2.0 (the "License"); you may not use this file
|
||||
|
@ -43,23 +43,25 @@
|
|||
%% could add new entries to this ETS table.
|
||||
%%
|
||||
%% Now we can use various integer-centric key generators that are
|
||||
%% already bundled with basho_bench.
|
||||
%% already bundled with basho_bench. NOTE: this scheme does not allow
|
||||
%% mixing of 'append' and 'read' operations in the same config. Basho
|
||||
%% Bench does not support different key generators for different
|
||||
%% operations, unfortunately. The work-around is to run two different
|
||||
%% Basho Bench instances: on for 'append' ops with a key generator for
|
||||
%% the desired prefix(es), and the other for 'read' ops with an
|
||||
%% integer key generator.
|
||||
%%
|
||||
%% TODO: Add CRC checking, when feasible and when supported on the
|
||||
%% server side.
|
||||
%%
|
||||
%% TODO: As an alternate idea, if we know that the chunks written are
|
||||
%% always the same size, and if we don't care about CRC checking, then
|
||||
%% all we need to know are the file names & file sizes on the server:
|
||||
%% we can then pick any valid offset within that file. That would
|
||||
%% certainly be more scalable than the zillion-row-ETS-table, which is
|
||||
%% definitely RAM-hungry.
|
||||
%% TODO: The 'read' operator will always read chunks at exactly the
|
||||
%% byte offset & size as the original append/write ops. If reads are
|
||||
%% desired at any arbitrary offset & size, then a new strategy is
|
||||
%% required.
|
||||
|
||||
-module(machi_basho_bench_driver).
|
||||
|
||||
-export([new/1, run/4]).
|
||||
|
||||
-record(m, {
|
||||
id,
|
||||
conn,
|
||||
max_key
|
||||
}).
|
||||
|
@ -81,7 +83,7 @@ new(Id) ->
|
|||
{read_concurrency, true}]),
|
||||
ets:insert(ETS, {max_key, 0}),
|
||||
ets:insert(ETS, {total_bytes, 0}),
|
||||
MaxKeys = load_ets_table(Conn, ETS),
|
||||
MaxKeys = load_ets_table_maybe(Conn, ETS),
|
||||
?INFO("Key preload: finished, ~w keys loaded", [MaxKeys]),
|
||||
Bytes = ets:lookup_element(ETS, total_bytes, 2),
|
||||
?INFO("Key preload: finished, chunk list specifies ~s MBytes of chunks",
|
||||
|
@ -90,12 +92,14 @@ new(Id) ->
|
|||
true ->
|
||||
ok
|
||||
end,
|
||||
{ok, #m{conn=Conn}}.
|
||||
{ok, #m{id=Id, conn=Conn}}.
|
||||
|
||||
run(append, KeyGen, ValueGen, #m{conn=Conn}=S) ->
|
||||
Prefix = KeyGen(),
|
||||
Value = ValueGen(),
|
||||
case machi_cr_client:append_chunk(Conn, Prefix, Value, ?THE_TIMEOUT) of
|
||||
CSum = machi_util:make_client_csum(Value),
|
||||
AppendOpts = {append_opts,0,undefined,false}, % HACK FIXME
|
||||
case machi_cr_client:append_chunk(Conn, undefined, Prefix, Value, CSum, AppendOpts, ?THE_TIMEOUT) of
|
||||
{ok, Pos} ->
|
||||
EtsKey = ets:update_counter(?ETS_TAB, max_key, 1),
|
||||
true = ets:insert(?ETS_TAB, {EtsKey, Pos}),
|
||||
|
@ -112,9 +116,26 @@ run(read, KeyGen, _ValueGen, #m{conn=Conn, max_key=MaxKey}=S) ->
|
|||
Idx = KeyGen() rem MaxKey,
|
||||
%% {File, Offset, Size, _CSum} = ets:lookup_element(?ETS_TAB, Idx, 2),
|
||||
{File, Offset, Size} = ets:lookup_element(?ETS_TAB, Idx, 2),
|
||||
case machi_cr_client:read_chunk(Conn, File, Offset, Size, undefined, ?THE_TIMEOUT) of
|
||||
{ok, _Chunk} ->
|
||||
{ok, S};
|
||||
ReadOpts = {read_opts,false,false,false}, % HACK FIXME
|
||||
case machi_cr_client:read_chunk(Conn, undefined, File, Offset, Size, ReadOpts, ?THE_TIMEOUT) of
|
||||
{ok, {Chunks, _Trimmed}} ->
|
||||
%% io:format(user, "Chunks ~P\n", [Chunks, 15]),
|
||||
%% {ok, S};
|
||||
case lists:all(fun({File2, Offset2, Chunk, CSum}) ->
|
||||
{_Tag, CS} = machi_util:unmake_tagged_csum(CSum),
|
||||
CS2 = machi_util:checksum_chunk(Chunk),
|
||||
if CS == CS2 ->
|
||||
true;
|
||||
CS /= CS2 ->
|
||||
?ERROR("Client-side checksum error for file ~p offset ~p expected ~p got ~p\n", [File2, Offset2, CS, CS2]),
|
||||
false
|
||||
end
|
||||
end, Chunks) of
|
||||
true ->
|
||||
{ok, S};
|
||||
false ->
|
||||
{error, bad_checksum, S}
|
||||
end;
|
||||
{error, _}=Err ->
|
||||
?ERROR("read file ~p offset ~w size ~w: ~w\n",
|
||||
[File, Offset, Size, Err]),
|
||||
|
@ -132,21 +153,40 @@ find_server_info(_Id) ->
|
|||
Ps
|
||||
end.
|
||||
|
||||
load_ets_table_maybe(Conn, ETS) ->
|
||||
case basho_bench_config:get(operations, undefined) of
|
||||
undefined ->
|
||||
?ERROR("The 'operations' key is missing from the config file, aborting", []),
|
||||
exit(bad_config);
|
||||
Ops when is_list(Ops) ->
|
||||
case lists:keyfind(read, 1, Ops) of
|
||||
{read,_} ->
|
||||
load_ets_table(Conn, ETS);
|
||||
false ->
|
||||
?INFO("No 'read' op in the 'operations' list ~p, skipping ETS table load.", [Ops]),
|
||||
0
|
||||
end
|
||||
end.
|
||||
|
||||
load_ets_table(Conn, ETS) ->
|
||||
{ok, Fs} = machi_cr_client:list_files(Conn),
|
||||
[begin
|
||||
{ok, InfoBin} = machi_cr_client:checksum_list(Conn, File),
|
||||
{ok, InfoBin} = machi_cr_client:checksum_list(Conn, File, ?THE_TIMEOUT),
|
||||
PosList = machi_csum_table:split_checksum_list_blob_decode(InfoBin),
|
||||
?INFO("File ~s len PosList ~p\n", [File, length(PosList)]),
|
||||
StartKey = ets:update_counter(ETS, max_key, 0),
|
||||
%% _EndKey = lists:foldl(fun({Off,Sz,CSum}, K) ->
|
||||
%% V = {File, Off, Sz, CSum},
|
||||
{_, Bytes} = lists:foldl(fun({Off,Sz,_CSum}, {K, Bs}) ->
|
||||
V = {File, Off, Sz},
|
||||
ets:insert(ETS, {K, V}),
|
||||
{K + 1, Bs + Sz}
|
||||
end, {StartKey, 0}, PosList),
|
||||
ets:update_counter(ETS, max_key, length(PosList)),
|
||||
ets:update_counter(ETS, total_bytes, Bytes)
|
||||
{_, C, Bytes} = lists:foldl(fun({_Off,0,_CSum}, {_K, _C, _Bs}=Acc) ->
|
||||
Acc;
|
||||
({0,_Sz,_CSum}, {_K, _C, _Bs}=Acc) ->
|
||||
Acc;
|
||||
({Off,Sz,_CSum}, {K, C, Bs}) ->
|
||||
V = {File, Off, Sz},
|
||||
ets:insert(ETS, {K, V}),
|
||||
{K + 1, C + 1, Bs + Sz}
|
||||
end, {StartKey, 0, 0}, PosList),
|
||||
_ = ets:update_counter(ETS, max_key, C),
|
||||
_ = ets:update_counter(ETS, total_bytes, Bytes),
|
||||
ok
|
||||
end || {_Size, File} <- Fs],
|
||||
ets:update_counter(?ETS_TAB, max_key, 0).
|
||||
|
||||
|
|
|
@ -92,8 +92,11 @@
|
|||
-define(REPAIR_START_STABILITY_TIME, 10).
|
||||
-endif. % TEST
|
||||
|
||||
%% Magic constant for looping "too frequently" breaker. TODO revisit & revise.
|
||||
-define(TOO_FREQUENT_BREAKER, 10).
|
||||
%% Maximum length of the history of adopted projections (via C120).
|
||||
-define(MAX_HISTORY_LENGTH, 8).
|
||||
|
||||
%% Magic constant for looping "too frequently" breaker.
|
||||
-define(TOO_FREQUENT_BREAKER, (?MAX_HISTORY_LENGTH+5)).
|
||||
|
||||
-define(RETURN2(X), begin (catch put(why2, [?LINE|get(why2)])), X end).
|
||||
|
||||
|
@ -103,9 +106,6 @@
|
|||
%% Amount of epoch number skip-ahead for set_chain_members call
|
||||
-define(SET_CHAIN_MEMBERS_EPOCH_SKIP, 1111).
|
||||
|
||||
%% Maximum length of the history of adopted projections (via C120).
|
||||
-define(MAX_HISTORY_LENGTH, 30).
|
||||
|
||||
%% API
|
||||
-export([start_link/2, start_link/3, stop/1, ping/1,
|
||||
set_chain_members/2, set_chain_members/6, set_active/2,
|
||||
|
@ -234,11 +234,13 @@ test_read_latest_public_projection(Pid, ReadRepairP) ->
|
|||
%% manager's pid in MgrOpts and use direct gen_server calls to the
|
||||
%% local projection store.
|
||||
|
||||
init({MyName, InitMembersDict, MgrOpts}) ->
|
||||
init({MyName, InitMembersDict, MgrOpts0}) ->
|
||||
put(ttt, [?LINE]),
|
||||
_ = random:seed(now()),
|
||||
init_remember_down_list(),
|
||||
MgrOpts = MgrOpts0 ++ application:get_env(machi, chain_manager_opts, []),
|
||||
Opt = fun(Key, Default) -> proplists:get_value(Key, MgrOpts, Default) end,
|
||||
|
||||
InitWitness_list = Opt(witnesses, []),
|
||||
ZeroAll_list = [P#p_srvr.name || {_,P} <- orddict:to_list(InitMembersDict)],
|
||||
ZeroProj = make_none_projection(0, MyName, ZeroAll_list,
|
||||
|
@ -388,6 +390,7 @@ handle_cast(_Cast, S) ->
|
|||
handle_info(tick_check_environment, #ch_mgr{ignore_timer=true}=S) ->
|
||||
{noreply, S};
|
||||
handle_info(tick_check_environment, S) ->
|
||||
gobble_ticks(),
|
||||
{{_Delta, Props, _Epoch}, S1} = do_react_to_env(S),
|
||||
S2 = sanitize_repair_state(S1),
|
||||
S3 = perhaps_start_repair(S2),
|
||||
|
@ -460,7 +463,7 @@ get_my_proj_boot_info(MgrOpts, DefaultDict, DefaultProj, ProjType) ->
|
|||
{DefaultDict, DefaultProj};
|
||||
Store ->
|
||||
{ok, P} = machi_projection_store:read_latest_projection(Store,
|
||||
ProjType),
|
||||
ProjType, 7789),
|
||||
{P#projection_v1.members_dict, P}
|
||||
end.
|
||||
|
||||
|
@ -837,7 +840,10 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg,
|
|||
D_foo=[{repair_done, {repair_final_status, ok, (S#ch_mgr.proj)#projection_v1.epoch_number}}],
|
||||
{NewUPI_list ++ Repairing_list2, [], RunEnv2};
|
||||
true ->
|
||||
D_foo=[d_foo2],
|
||||
D_foo=[d_foo2, {sim_p,Simulator_p},
|
||||
{simr_p,SimRepair_p}, {same_epoch,SameEpoch_p},
|
||||
{rel_to,RelativeToServer},
|
||||
{repch,RepChk_LastInUPI}, {repair_fs,RepairFS}],
|
||||
{NewUPI_list, OldRepairing_list, RunEnv2}
|
||||
end;
|
||||
{_ABC, _XYZ} ->
|
||||
|
@ -1974,7 +1980,7 @@ react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) ->
|
|||
%% In contrast to the public projection store writes, Humming Consensus
|
||||
%% doesn't care about the status of writes to the public store: it's
|
||||
%% always relying only on successful reads of the public store.
|
||||
case {?FLU_PC:write_projection(MyStorePid, private, P_latest2,?TO*30),Goo} of
|
||||
case {?FLU_PC:write_projection(MyStorePid, private, P_latest2,?TO*30+66),Goo} of
|
||||
{ok, Goo} ->
|
||||
?REACT({c110, [{write, ok}]}),
|
||||
react_to_env_C111(P_latest, P_latest2, Extra1, MyStorePid, S);
|
||||
|
@ -2060,7 +2066,6 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H,
|
|||
?REACT(c120),
|
||||
H2 = add_and_trunc_history(P_latest, H, ?MAX_HISTORY_LENGTH),
|
||||
|
||||
%% diversion_c120_verbose_goop(P_latest, S),
|
||||
?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}),
|
||||
S2 = set_proj(S#ch_mgr{proj_history=H2,
|
||||
sane_transitions=Xtns + 1}, P_latest),
|
||||
|
@ -2068,20 +2073,21 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H,
|
|||
false ->
|
||||
S2;
|
||||
{{_ConfEpoch, _ConfCSum}, ConfTime} ->
|
||||
io:format(user, "\nCONFIRM debug C120 ~w was annotated ~w\n", [S#ch_mgr.name, P_latest#projection_v1.epoch_number]),
|
||||
P_latestEpoch = P_latest#projection_v1.epoch_number,
|
||||
io:format(user, "\nCONFIRM debug C120 ~w was annotated ~w\n", [S#ch_mgr.name, P_latestEpoch]),
|
||||
S2#ch_mgr{proj_unanimous=ConfTime}
|
||||
end,
|
||||
V = case file:read_file("/tmp/moomoo."++atom_to_list(S#ch_mgr.name)) of {ok,_} -> true; _ -> false end,
|
||||
if V -> io:format("C120: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end,
|
||||
{{now_using, FinalProps, P_latest#projection_v1.epoch_number}, S3}.
|
||||
|
||||
add_and_trunc_history(P_latest, H, MaxLength) ->
|
||||
add_and_trunc_history(#projection_v1{epoch_number=0}, H, _MaxLength) ->
|
||||
H;
|
||||
add_and_trunc_history(#projection_v1{} = P_latest, H, MaxLength) ->
|
||||
Latest_U_R = {P_latest#projection_v1.upi, P_latest#projection_v1.repairing},
|
||||
H2 = if P_latest#projection_v1.epoch_number > 0 ->
|
||||
queue:in(Latest_U_R, H);
|
||||
true ->
|
||||
H
|
||||
end,
|
||||
add_and_trunc_history(Latest_U_R, H, MaxLength);
|
||||
add_and_trunc_history(Item, H, MaxLength) ->
|
||||
H2 = queue:in(Item, H),
|
||||
case queue:len(H2) of
|
||||
X when X > MaxLength ->
|
||||
{_V, Hxx} = queue:out(H2),
|
||||
|
@ -2094,11 +2100,10 @@ react_to_env_C200(Retries, P_latest, S) ->
|
|||
?REACT(c200),
|
||||
try
|
||||
AuthorProxyPid = proxy_pid(P_latest#projection_v1.author_server, S),
|
||||
?FLU_PC:kick_projection_reaction(AuthorProxyPid, [])
|
||||
%% This is just advisory, we don't need a sync reply.
|
||||
?FLU_PC:kick_projection_reaction(AuthorProxyPid, [], 100)
|
||||
catch _Type:_Err ->
|
||||
%% ?V("TODO: tell_author_yo is broken: ~p ~p\n",
|
||||
%% [_Type, _Err]),
|
||||
ok
|
||||
ok
|
||||
end,
|
||||
react_to_env_C210(Retries, S).
|
||||
|
||||
|
@ -2485,19 +2490,23 @@ poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current} = S) ->
|
|||
ProjStore = get_projection_store_pid_or_regname(S),
|
||||
#projection_v1{epoch_number=_EpochRep,
|
||||
epoch_csum= <<_CSumRep:4/binary,_/binary>>,
|
||||
author_server=AuthRep,
|
||||
upi=_UPIRep,
|
||||
repairing=_RepairingRep} = NewProj,
|
||||
ok = machi_projection_store:write(ProjStore, private, NewProj),
|
||||
case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of
|
||||
case proplists:get_value(private_write_verbose_confirm, S#ch_mgr.opts) of
|
||||
true ->
|
||||
io:format(user, "\n~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), _EpochRep, _CSumRep, _UPIRep, _RepairingRep, MyName]);
|
||||
error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w rep ~w auth ~w by ~w\n", [_EpochRep, _CSumRep, _UPIRep, _RepairingRep, AuthRep, MyName]);
|
||||
_ ->
|
||||
ok
|
||||
end,
|
||||
%% Unwedge our FLU.
|
||||
{ok, NotifyPid} = machi_projection_store:get_wedge_notify_pid(ProjStore),
|
||||
_ = machi_flu1:update_wedge_state(NotifyPid, false, EpochID),
|
||||
S2#ch_mgr{proj_unanimous=Now};
|
||||
#ch_mgr{proj_history=H} = S2,
|
||||
H2 = add_and_trunc_history({confirm, Epoch}, H,
|
||||
?MAX_HISTORY_LENGTH),
|
||||
S2#ch_mgr{proj_unanimous=Now, proj_history=H2};
|
||||
_ ->
|
||||
S2
|
||||
end;
|
||||
|
@ -2537,6 +2546,14 @@ gobble_calls(StaticCall) ->
|
|||
ok
|
||||
end.
|
||||
|
||||
gobble_ticks() ->
|
||||
receive
|
||||
tick_check_environment ->
|
||||
gobble_ticks()
|
||||
after 0 ->
|
||||
ok
|
||||
end.
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
perhaps_start_repair(#ch_mgr{name=MyName,
|
||||
|
@ -2552,12 +2569,13 @@ perhaps_start_repair(#ch_mgr{name=MyName,
|
|||
%% RepairOpts = [{repair_mode, check}, verbose],
|
||||
RepairFun = fun() -> do_repair(S, RepairOpts, CMode) end,
|
||||
LastUPI = lists:last(UPI),
|
||||
StabilityTime = application:get_env(machi, stability_time, ?REPAIR_START_STABILITY_TIME),
|
||||
IgnoreStabilityTime_p = proplists:get_value(ignore_stability_time,
|
||||
S#ch_mgr.opts, false),
|
||||
case timer:now_diff(os:timestamp(), Start) div 1000000 of
|
||||
N when MyName == LastUPI andalso
|
||||
(IgnoreStabilityTime_p orelse
|
||||
N >= ?REPAIR_START_STABILITY_TIME) ->
|
||||
N >= StabilityTime) ->
|
||||
{WorkerPid, _Ref} = spawn_monitor(RepairFun),
|
||||
S#ch_mgr{repair_worker=WorkerPid,
|
||||
repair_start=os:timestamp(),
|
||||
|
@ -2966,34 +2984,33 @@ zerf_find_last_annotated(FLU, MajoritySize, S) ->
|
|||
[] % lists:flatten() will destroy
|
||||
end.
|
||||
|
||||
perhaps_verbose_c111(P_latest2, S) ->
|
||||
case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of
|
||||
true ->
|
||||
perhaps_verbose_c111(P_latest2, #ch_mgr{name=MyName, opts=Opts}=S) ->
|
||||
PrivWriteVerb = proplists:get_value(private_write_verbose, Opts, false),
|
||||
PrivWriteVerbCONFIRM = proplists:get_value(private_write_verbose_confirm, Opts, false),
|
||||
if PrivWriteVerb orelse PrivWriteVerbCONFIRM ->
|
||||
Dbg2X = lists:keydelete(react, 1,
|
||||
P_latest2#projection_v1.dbg2) ++
|
||||
[{is_annotated,is_annotated(P_latest2)}],
|
||||
P_latest2x = P_latest2#projection_v1{dbg2=Dbg2X}, % limit verbose len.
|
||||
Last2 = get(last_verbose),
|
||||
Summ2 = machi_projection:make_summary(P_latest2x),
|
||||
if P_latest2#projection_v1.upi == [],
|
||||
(S#ch_mgr.proj)#projection_v1.upi /= [] ->
|
||||
<<CSumRep:4/binary,_/binary>> =
|
||||
P_latest2#projection_v1.epoch_csum,
|
||||
io:format(user, "~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), (S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, S#ch_mgr.name]);
|
||||
if PrivWriteVerb, Summ2 /= Last2 ->
|
||||
put(last_verbose, Summ2),
|
||||
error_logger:info_msg("~p uses plain: ~w \n",
|
||||
[MyName, Summ2]);
|
||||
true ->
|
||||
ok
|
||||
end,
|
||||
case proplists:get_value(private_write_verbose,
|
||||
S#ch_mgr.opts) of
|
||||
%% case true of
|
||||
true when Summ2 /= Last2 ->
|
||||
put(last_verbose, Summ2),
|
||||
?V("\n~s ~p uses plain: ~w \n",
|
||||
[machi_util:pretty_time(), S#ch_mgr.name, Summ2]);
|
||||
_ ->
|
||||
if PrivWriteVerbCONFIRM,
|
||||
P_latest2#projection_v1.upi == [],
|
||||
(S#ch_mgr.proj)#projection_v1.upi /= [] ->
|
||||
<<CSumRep:4/binary,_/binary>> =
|
||||
P_latest2#projection_v1.epoch_csum,
|
||||
error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w rep ~w auth ~w by ~w\n", [(S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, P_latest2#projection_v1.author_server, S#ch_mgr.name]);
|
||||
true ->
|
||||
ok
|
||||
end;
|
||||
_ ->
|
||||
true ->
|
||||
ok
|
||||
end.
|
||||
|
||||
|
|
|
@ -78,8 +78,8 @@
|
|||
terminate/2, code_change/3]).
|
||||
|
||||
-define(FLU_PC, machi_proxy_flu1_client).
|
||||
-define(TIMEOUT, 2*1000).
|
||||
-define(DEFAULT_TIMEOUT, 10*1000).
|
||||
-define(TIMEOUT, 10*1000).
|
||||
-define(DEFAULT_TIMEOUT, ?TIMEOUT*5).
|
||||
-define(MAX_RUNTIME, 8*1000).
|
||||
-define(WORST_PROJ, #projection_v1{epoch_number=0,epoch_csum= <<>>,
|
||||
members_dict=[]}).
|
||||
|
@ -506,7 +506,7 @@ do_read_chunk2(NSInfo, File, Offset, Size, Opts, Depth, STime, TO,
|
|||
Tail = lists:last(UPI),
|
||||
ConsistencyMode = P#projection_v1.mode,
|
||||
case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), NSInfo, EpochID,
|
||||
File, Offset, Size, Opts, ?TIMEOUT) of
|
||||
File, Offset, Size, Opts, TO) of
|
||||
{ok, {Chunks, Trimmed}} when is_list(Chunks), is_list(Trimmed) ->
|
||||
%% After partition heal, there could happen that heads may
|
||||
%% have chunk trimmed but tails may have chunk written -
|
||||
|
@ -690,7 +690,7 @@ read_repair2(cp_mode=ConsistencyMode,
|
|||
%% TODO WTF was I thinking here??....
|
||||
Tail = lists:last(readonly_flus(P)),
|
||||
case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), NSInfo, EpochID,
|
||||
File, Offset, Size, undefined, ?TIMEOUT) of
|
||||
File, Offset, Size, undefined, ?DEFAULT_TIMEOUT) of
|
||||
{ok, Chunks} when is_list(Chunks) ->
|
||||
%% TODO: change to {Chunks, Trimmed} and have them repaired
|
||||
ToRepair = mutation_flus(P) -- [Tail],
|
||||
|
@ -840,7 +840,7 @@ do_checksum_list(File, Depth, STime, TO, #state{proj=P}=S) ->
|
|||
do_checksum_list2(File, Depth, STime, TO,
|
||||
#state{proj=P, proxies_dict=PD}=S) ->
|
||||
Proxy = orddict:fetch(lists:last(readonly_flus(P)), PD),
|
||||
case ?FLU_PC:checksum_list(Proxy, File, ?TIMEOUT) of
|
||||
case ?FLU_PC:checksum_list(Proxy, File, TO) of
|
||||
{ok, _}=OK ->
|
||||
{reply, OK, S};
|
||||
{error, Retry}
|
||||
|
@ -875,7 +875,7 @@ do_list_files(Depth, STime, TO, #state{proj=P}=S) ->
|
|||
do_list_files2(Depth, STime, TO,
|
||||
#state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) ->
|
||||
Proxy = orddict:fetch(lists:last(readonly_flus(P)), PD),
|
||||
case ?FLU_PC:list_files(Proxy, EpochID, ?TIMEOUT) of
|
||||
case ?FLU_PC:list_files(Proxy, EpochID, ?DEFAULT_TIMEOUT) of
|
||||
{ok, _}=OK ->
|
||||
{reply, OK, S};
|
||||
{error, Retry}
|
||||
|
|
|
@ -1,3 +1,23 @@
|
|||
%% -------------------------------------------------------------------
|
||||
%%
|
||||
%% Copyright (c) 2007-2016 Basho Technologies, Inc. All Rights Reserved.
|
||||
%%
|
||||
%% This file is provided to you under the Apache License,
|
||||
%% Version 2.0 (the "License"); you may not use this file
|
||||
%% except in compliance with the License. You may obtain
|
||||
%% a copy of the License at
|
||||
%%
|
||||
%% http://www.apache.org/licenses/LICENSE-2.0
|
||||
%%
|
||||
%% Unless required by applicable law or agreed to in writing,
|
||||
%% software distributed under the License is distributed on an
|
||||
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
%% KIND, either express or implied. See the License for the
|
||||
%% specific language governing permissions and limitations
|
||||
%% under the License.
|
||||
%%
|
||||
%% -------------------------------------------------------------------
|
||||
|
||||
-module(machi_csum_table).
|
||||
|
||||
-export([open/2,
|
||||
|
@ -65,10 +85,18 @@ find(#machi_csum_table{table=T}, Offset, Size) ->
|
|||
{ok, I} = eleveldb:iterator(T, [], keys_only),
|
||||
EndKey = sext:encode({Offset+Size, 0}),
|
||||
StartKey = sext:encode({Offset, Size}),
|
||||
|
||||
{ok, FirstKey} = case eleveldb:iterator_move(I, StartKey) of
|
||||
{error, invalid_iterator} ->
|
||||
eleveldb:iterator_move(I, first);
|
||||
try
|
||||
%% Assume that the invalid_iterator is because
|
||||
%% we tried to move to the end via StartKey.
|
||||
%% Instead, move there directly.
|
||||
{ok, _} = eleveldb:iterator_move(I, last),
|
||||
{ok, _} = eleveldb:iterator_move(I, prev)
|
||||
catch
|
||||
_:_ ->
|
||||
{ok, _} = eleveldb:iterator_move(I, first)
|
||||
end;
|
||||
{ok, _} = R0 ->
|
||||
case eleveldb:iterator_move(I, prev) of
|
||||
{error, invalid_iterator} ->
|
||||
|
@ -92,7 +120,6 @@ find(#machi_csum_table{table=T}, Offset, Size) ->
|
|||
end,
|
||||
lists:reverse(eleveldb_fold(T, FirstKey, EndKey, FoldFun, [])).
|
||||
|
||||
|
||||
%% @doc Updates all chunk info, by deleting existing entries if exists
|
||||
%% and putting new chunk info
|
||||
-spec write(table(),
|
||||
|
@ -126,6 +153,8 @@ write(#machi_csum_table{table=T} = CsumT, Offset, Size, CSum,
|
|||
DeleteOps = lists:map(fun({O, L, _}) ->
|
||||
{delete, sext:encode({O, L})}
|
||||
end, Chunks),
|
||||
%% io:format(user, "PutOps: ~P\n", [PutOps, 20]),
|
||||
%% io:format(user, "DelOps: ~P\n", [DeleteOps, 20]),
|
||||
eleveldb:write(T, DeleteOps ++ PutOps, [{sync, true}]).
|
||||
|
||||
-spec find_leftneighbor(table(), non_neg_integer()) ->
|
||||
|
|
|
@ -32,6 +32,12 @@
|
|||
-type chunk_summary() :: {file_offset(), chunk_size(), chunk_bin(), chunk_cstrm()}.
|
||||
-type chunk_pos() :: {file_offset(), chunk_size(), file_name_s()}.
|
||||
-type chunk_size() :: non_neg_integer().
|
||||
|
||||
%% Tags that stand for how that checksum was generated. See
|
||||
%% machi_util:make_tagged_csum/{1,2} for further documentation and
|
||||
%% implementation.
|
||||
-type csum_tag() :: none | client_sha | server_sha | server_regen_sha.
|
||||
|
||||
-type error_general() :: 'bad_arg' | 'wedged' | 'bad_checksum'.
|
||||
-type epoch_csum() :: binary().
|
||||
-type epoch_num() :: -1 | non_neg_integer().
|
||||
|
@ -53,11 +59,6 @@
|
|||
-type read_opts() :: #read_opts{}.
|
||||
-type read_opts_x() :: 'undefined' | 'noopt' | 'none' | #read_opts{}.
|
||||
|
||||
%% Tags that stand for how that checksum was generated. See
|
||||
%% machi_util:make_tagged_csum/{1,2} for further documentation and
|
||||
%% implementation.
|
||||
-type csum_tag() :: none | client_sha | server_sha | server_regen_sha.
|
||||
|
||||
-export_type([
|
||||
append_opts/0,
|
||||
chunk/0,
|
||||
|
@ -68,6 +69,7 @@
|
|||
chunk_summary/0,
|
||||
chunk_pos/0,
|
||||
chunk_size/0,
|
||||
csum_tag/0,
|
||||
error_general/0,
|
||||
epoch_csum/0,
|
||||
epoch_num/0,
|
||||
|
|
|
@ -71,7 +71,7 @@
|
|||
code_change/3
|
||||
]).
|
||||
|
||||
-define(TICK, 30*1000). %% XXX FIXME Should be something like 5 seconds
|
||||
-define(TICK, 5*1000).
|
||||
-define(TICK_THRESHOLD, 5). %% After this + 1 more quiescent ticks, shutdown
|
||||
-define(TIMEOUT, 10*1000).
|
||||
-define(TOO_MANY_ERRORS_RATIO, 50).
|
||||
|
@ -91,6 +91,7 @@
|
|||
csum_table :: machi_csum_table:table(),
|
||||
eof_position = 0 :: non_neg_integer(),
|
||||
max_file_size = ?DEFAULT_MAX_FILE_SIZE :: pos_integer(),
|
||||
rollover = false :: boolean(),
|
||||
tref :: reference(), %% timer ref
|
||||
ticks = 0 :: non_neg_integer(), %% ticks elapsed with no new operations
|
||||
ops = 0 :: non_neg_integer(), %% sum of all ops
|
||||
|
@ -239,7 +240,7 @@ init({FluName, Filename, DataDir}) ->
|
|||
data_filehandle = FHd,
|
||||
csum_table = CsumTable,
|
||||
tref = Tref,
|
||||
eof_position = Eof,
|
||||
eof_position = erlang:max(Eof, ?MINIMUM_OFFSET),
|
||||
max_file_size = machi_config:max_file_size()},
|
||||
lager:debug("Starting file proxy ~p for filename ~p, state = ~p, Eof = ~p",
|
||||
[self(), Filename, St, Eof]),
|
||||
|
@ -449,11 +450,23 @@ handle_cast(Cast, State) ->
|
|||
{noreply, State}.
|
||||
|
||||
% @private
|
||||
handle_info(tick, State = #state{eof_position = Eof,
|
||||
handle_info(tick, State = #state{fluname = FluName,
|
||||
filename = F,
|
||||
eof_position = Eof,
|
||||
max_file_size = MaxFileSize}) when Eof >= MaxFileSize ->
|
||||
lager:notice("Eof position ~p >= max file size ~p. Shutting down.",
|
||||
[Eof, MaxFileSize]),
|
||||
{stop, file_rollover, State};
|
||||
%% Older code halted here with {stop, file_rollover, State}.
|
||||
%% However, there may be other requests in our mailbox already
|
||||
%% and/or not yet delivered but in a race with the
|
||||
%% machi_flu_metadata_mgr. So we close our eleveldb instance (to
|
||||
%% avoid double-open attempt by a new file proxy proc), tell
|
||||
%% machi_flu_metadata_mgr that we request a rollover, then stop.
|
||||
%% terminate() will take care of forwarding messages that are
|
||||
%% caught in the race.
|
||||
lager:notice("Eof ~s position ~p >= max file size ~p. Shutting down.",
|
||||
[F, Eof, MaxFileSize]),
|
||||
State2 = close_files(State),
|
||||
machi_flu_metadata_mgr:stop_proxy_pid_rollover(FluName, {file, F}),
|
||||
{stop, normal, State2#state{rollover = true}};
|
||||
|
||||
%% XXX Is this a good idea? Need to think this through a bit.
|
||||
handle_info(tick, State = #state{wedged = true}) ->
|
||||
|
@ -467,7 +480,7 @@ handle_info(tick, State = #state{
|
|||
writes = {WT, WE},
|
||||
appends = {AT, AE}
|
||||
}) when Ops > 100 andalso
|
||||
trunc(((RE+WE+AE) / RT+WT+AT) * 100) > ?TOO_MANY_ERRORS_RATIO ->
|
||||
trunc(((RE+WE+AE) / (RT+WT+AT)) * 100) > ?TOO_MANY_ERRORS_RATIO ->
|
||||
Errors = RE + WE + AE,
|
||||
lager:notice("Got ~p errors. Shutting down.", [Errors]),
|
||||
{stop, too_many_errors, State};
|
||||
|
@ -526,30 +539,23 @@ handle_info(Req, State) ->
|
|||
{noreply, State}.
|
||||
|
||||
% @private
|
||||
terminate(Reason, #state{filename = F,
|
||||
data_filehandle = FHd,
|
||||
csum_table = T,
|
||||
reads = {RT, RE},
|
||||
writes = {WT, WE},
|
||||
appends = {AT, AE}
|
||||
}) ->
|
||||
terminate(Reason, State = #state{fluname = FluName,
|
||||
filename = F,
|
||||
rollover = Rollover_p,
|
||||
reads = {RT, RE},
|
||||
writes = {WT, WE},
|
||||
appends = {AT, AE}
|
||||
}) ->
|
||||
lager:info("Shutting down proxy for file ~p because ~p", [F, Reason]),
|
||||
lager:info(" Op Tot/Error", []),
|
||||
lager:info(" Reads: ~p/~p", [RT, RE]),
|
||||
lager:info(" Writes: ~p/~p", [WT, WE]),
|
||||
lager:info("Appends: ~p/~p", [AT, AE]),
|
||||
case FHd of
|
||||
undefined ->
|
||||
noop; %% file deleted
|
||||
_ ->
|
||||
ok = file:sync(FHd),
|
||||
ok = file:close(FHd)
|
||||
end,
|
||||
case T of
|
||||
undefined ->
|
||||
noop; %% file deleted
|
||||
_ ->
|
||||
ok = machi_csum_table:close(T)
|
||||
close_files(State),
|
||||
if Rollover_p ->
|
||||
forward_late_messages(FluName, F, 500);
|
||||
true ->
|
||||
ok
|
||||
end,
|
||||
ok.
|
||||
|
||||
|
@ -867,3 +873,36 @@ maybe_gc(Reply, S = #state{fluname=FluName,
|
|||
false ->
|
||||
{reply, Reply, S}
|
||||
end.
|
||||
|
||||
close_files(State = #state{data_filehandle = FHd,
|
||||
csum_table = T}) ->
|
||||
case FHd of
|
||||
undefined ->
|
||||
noop; %% file deleted
|
||||
_ ->
|
||||
ok = file:sync(FHd),
|
||||
ok = file:close(FHd)
|
||||
end,
|
||||
case T of
|
||||
undefined ->
|
||||
noop; %% file deleted
|
||||
_ ->
|
||||
ok = machi_csum_table:close(T)
|
||||
end,
|
||||
State#state{data_filehandle = undefined, csum_table = undefined}.
|
||||
|
||||
forward_late_messages(FluName, F, Timeout) ->
|
||||
receive
|
||||
M ->
|
||||
case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, F}) of
|
||||
{ok, Pid} ->
|
||||
Pid ! M;
|
||||
{error, trimmed} ->
|
||||
lager:error("TODO: FLU ~p file ~p reports trimmed status "
|
||||
"when forwarding ~P\n",
|
||||
[FluName, F, M, 20])
|
||||
end,
|
||||
forward_late_messages(FluName, F, Timeout)
|
||||
after Timeout ->
|
||||
ok
|
||||
end.
|
||||
|
|
|
@ -108,6 +108,7 @@ handle_call({update_local_down_list, Down, MembersDict}, _From,
|
|||
#state{my_flu_name=MyFluName, pending_map=OldMap,
|
||||
local_down=OldDown, members_dict=OldMembersDict,
|
||||
admin_down=AdminDown}=S) ->
|
||||
verbose("FITNESS: ~w has down suspect ~w\n", [MyFluName, Down]),
|
||||
NewMap = store_in_map(OldMap, MyFluName, erlang:now(), Down,
|
||||
AdminDown, [props_yo]),
|
||||
S2 = if Down == OldDown, MembersDict == OldMembersDict ->
|
||||
|
@ -119,13 +120,17 @@ handle_call({update_local_down_list, Down, MembersDict}, _From,
|
|||
end,
|
||||
{reply, ok, S2#state{local_down=Down}};
|
||||
handle_call({add_admin_down, DownFLU, DownProps}, _From,
|
||||
#state{local_down=OldDown, admin_down=AdminDown}=S) ->
|
||||
#state{my_flu_name=MyFluName,
|
||||
local_down=OldDown, admin_down=AdminDown}=S) ->
|
||||
verbose("FITNESS: ~w add admin down ~w\n", [MyFluName, DownFLU]),
|
||||
NewAdminDown = [{DownFLU,DownProps}|lists:keydelete(DownFLU, 1, AdminDown)],
|
||||
S3 = finish_admin_down(erlang:now(), OldDown, NewAdminDown,
|
||||
[props_yo], S),
|
||||
{reply, ok, S3};
|
||||
handle_call({delete_admin_down, DownFLU}, _From,
|
||||
#state{local_down=OldDown, admin_down=AdminDown}=S) ->
|
||||
#state{my_flu_name=MyFluName,
|
||||
local_down=OldDown, admin_down=AdminDown}=S) ->
|
||||
verbose("FITNESS: ~w delete admin down ~w\n", [MyFluName, DownFLU]),
|
||||
NewAdminDown = lists:keydelete(DownFLU, 1, AdminDown),
|
||||
S3 = finish_admin_down(erlang:now(), OldDown, NewAdminDown,
|
||||
[props_yo], S),
|
||||
|
@ -143,7 +148,8 @@ handle_call(_Request, _From, S) ->
|
|||
handle_cast(_Msg, S) ->
|
||||
{noreply, S}.
|
||||
|
||||
handle_info({adjust_down_list, FLU}, #state{active_unfit=ActiveUnfit}=S) ->
|
||||
handle_info({adjust_down_list, FLU}, #state{my_flu_name=MyFluName,
|
||||
active_unfit=ActiveUnfit}=S) ->
|
||||
NewUnfit = make_unfit_list(S),
|
||||
Added_to_new = NewUnfit -- ActiveUnfit,
|
||||
Dropped_from_new = ActiveUnfit -- NewUnfit,
|
||||
|
@ -184,9 +190,11 @@ handle_info({adjust_down_list, FLU}, #state{active_unfit=ActiveUnfit}=S) ->
|
|||
{true, true} ->
|
||||
error({bad, ?MODULE, ?LINE, FLU, ActiveUnfit, NewUnfit});
|
||||
{true, false} ->
|
||||
{noreply, S#state{active_unfit=lists:usort(ActiveUnfit ++ [FLU])}};
|
||||
NewActive = wrap_active(MyFluName,lists:usort(ActiveUnfit++[FLU])),
|
||||
{noreply, S#state{active_unfit=NewActive}};
|
||||
{false, true} ->
|
||||
{noreply, S#state{active_unfit=ActiveUnfit -- [FLU]}};
|
||||
NewActive = wrap_active(MyFluName,ActiveUnfit--[FLU]),
|
||||
{noreply, S#state{active_unfit=NewActive}};
|
||||
{false, false} ->
|
||||
{noreply, S}
|
||||
end;
|
||||
|
@ -424,6 +432,18 @@ map_value(Map) ->
|
|||
map_merge(Map1, Map2) ->
|
||||
?MAP:merge(Map1, Map2).
|
||||
|
||||
wrap_active(MyFluName, L) ->
|
||||
verbose("FITNESS: ~w has new down list ~w\n", [MyFluName, L]),
|
||||
L.
|
||||
|
||||
verbose(Fmt, Args) ->
|
||||
case application:get_env(machi, fitness_verbose) of
|
||||
{ok, true} ->
|
||||
error_logger:info_msg(Fmt, Args);
|
||||
_ ->
|
||||
ok
|
||||
end.
|
||||
|
||||
-ifdef(TEST).
|
||||
|
||||
dt_understanding_test() ->
|
||||
|
|
|
@ -157,8 +157,9 @@ handle_call({find_filename, _FluName, EpochId, NSInfo, Prefix}, _From, S = #stat
|
|||
File = increment_and_cache_filename(Tid, DataDir, NSInfo, Prefix),
|
||||
{reply, {file, File}, S#state{epoch = EpochId}};
|
||||
|
||||
handle_call({increment_sequence, #ns_info{name=NS, locator=NSLocator}, Prefix}, _From, S = #state{ datadir = DataDir }) ->
|
||||
ok = machi_util:increment_max_filenum(DataDir, NS, NSLocator, Prefix),
|
||||
handle_call({increment_sequence, #ns_info{name=NS, locator=NSLocator}, Prefix}, _From, S = #state{ datadir = DataDir, tid=Tid }) ->
|
||||
NSInfo = #ns_info{name=NS, locator=NSLocator},
|
||||
_File = increment_and_cache_filename(Tid, DataDir, NSInfo, Prefix),
|
||||
{reply, ok, S};
|
||||
handle_call({list_files, Prefix}, From, S = #state{ datadir = DataDir }) ->
|
||||
spawn(fun() ->
|
||||
|
|
|
@ -63,6 +63,7 @@
|
|||
lookup_proxy_pid/2,
|
||||
start_proxy_pid/2,
|
||||
stop_proxy_pid/2,
|
||||
stop_proxy_pid_rollover/2,
|
||||
build_metadata_mgr_name/2,
|
||||
trim_file/2
|
||||
]).
|
||||
|
@ -100,7 +101,10 @@ start_proxy_pid(FluName, {file, Filename}) ->
|
|||
gen_server:call(get_manager_atom(FluName, Filename), {start_proxy_pid, Filename}, ?TIMEOUT).
|
||||
|
||||
stop_proxy_pid(FluName, {file, Filename}) ->
|
||||
gen_server:call(get_manager_atom(FluName, Filename), {stop_proxy_pid, Filename}, ?TIMEOUT).
|
||||
gen_server:call(get_manager_atom(FluName, Filename), {stop_proxy_pid, false, Filename}, ?TIMEOUT).
|
||||
|
||||
stop_proxy_pid_rollover(FluName, {file, Filename}) ->
|
||||
gen_server:call(get_manager_atom(FluName, Filename), {stop_proxy_pid, true, Filename}, ?TIMEOUT).
|
||||
|
||||
trim_file(FluName, {file, Filename}) ->
|
||||
gen_server:call(get_manager_atom(FluName, Filename), {trim_file, Filename}, ?TIMEOUT).
|
||||
|
@ -151,7 +155,7 @@ handle_call({start_proxy_pid, Filename}, _From,
|
|||
{reply, {error, trimmed}, State}
|
||||
end;
|
||||
|
||||
handle_call({stop_proxy_pid, Filename}, _From, State = #state{ tid = Tid }) ->
|
||||
handle_call({stop_proxy_pid, Rollover_p, Filename}, _From, State = #state{ tid = Tid }) ->
|
||||
case lookup_md(Tid, Filename) of
|
||||
not_found ->
|
||||
ok;
|
||||
|
@ -159,8 +163,13 @@ handle_call({stop_proxy_pid, Filename}, _From, State = #state{ tid = Tid }) ->
|
|||
ok;
|
||||
#md{ proxy_pid = Pid, mref = M } = R ->
|
||||
demonitor(M, [flush]),
|
||||
machi_file_proxy:stop(Pid),
|
||||
update_ets(Tid, R#md{ proxy_pid = undefined, mref = undefined })
|
||||
if Rollover_p ->
|
||||
do_rollover(Filename, State);
|
||||
true ->
|
||||
machi_file_proxy:stop(Pid),
|
||||
update_ets(Tid, R#md{ proxy_pid = undefined,
|
||||
mref = undefined })
|
||||
end
|
||||
end,
|
||||
{reply, ok, State};
|
||||
|
||||
|
@ -182,27 +191,6 @@ handle_info({'DOWN', Mref, process, Pid, normal}, State = #state{ tid = Tid }) -
|
|||
clear_ets(Tid, Mref),
|
||||
{noreply, State};
|
||||
|
||||
handle_info({'DOWN', Mref, process, Pid, file_rollover}, State = #state{ fluname = FluName,
|
||||
tid = Tid }) ->
|
||||
lager:info("file proxy ~p shutdown because of file rollover", [Pid]),
|
||||
R = get_md_record_by_mref(Tid, Mref),
|
||||
{Prefix, NS, NSLocator, _, _} =
|
||||
machi_util:parse_filename(R#md.filename),
|
||||
|
||||
%% We only increment the counter here. The filename will be generated on the
|
||||
%% next append request to that prefix and since the filename will have a new
|
||||
%% sequence number it probably will be associated with a different metadata
|
||||
%% manager. That's why we don't want to generate a new file name immediately
|
||||
%% and use it to start a new file proxy.
|
||||
NSInfo = #ns_info{name=NS, locator=NSLocator},
|
||||
ok = machi_flu_filename_mgr:increment_prefix_sequence(FluName, NSInfo, {prefix, Prefix}),
|
||||
|
||||
%% purge our ets table of this entry completely since it is likely the
|
||||
%% new filename (whenever it comes) will be in a different manager than
|
||||
%% us.
|
||||
purge_ets(Tid, R),
|
||||
{noreply, State};
|
||||
|
||||
handle_info({'DOWN', Mref, process, Pid, wedged}, State = #state{ tid = Tid }) ->
|
||||
lager:error("file proxy ~p shutdown because it's wedged", [Pid]),
|
||||
clear_ets(Tid, Mref),
|
||||
|
@ -275,8 +263,35 @@ get_md_record_by_mref(Tid, Mref) ->
|
|||
[R] = ets:match_object(Tid, {md, '_', '_', Mref}),
|
||||
R.
|
||||
|
||||
get_md_record_by_filename(Tid, Filename) ->
|
||||
[R] = ets:lookup(Tid, Filename),
|
||||
R.
|
||||
|
||||
get_env(Setting, Default) ->
|
||||
case application:get_env(machi, Setting) of
|
||||
undefined -> Default;
|
||||
{ok, V} -> V
|
||||
end.
|
||||
|
||||
do_rollover(Filename, _State = #state{ fluname = FluName,
|
||||
tid = Tid }) ->
|
||||
R = get_md_record_by_filename(Tid, Filename),
|
||||
lager:info("file ~p proxy ~p shutdown because of file rollover",
|
||||
[Filename, R#md.proxy_pid]),
|
||||
{Prefix, NS, NSLocator, _, _} =
|
||||
machi_util:parse_filename(R#md.filename),
|
||||
|
||||
%% We only increment the counter here. The filename will be generated on the
|
||||
%% next append request to that prefix and since the filename will have a new
|
||||
%% sequence number it probably will be associated with a different metadata
|
||||
%% manager. That's why we don't want to generate a new file name immediately
|
||||
%% and use it to start a new file proxy.
|
||||
NSInfo = #ns_info{name=NS, locator=NSLocator},
|
||||
lager:warning("INCR: ~p ~p\n", [FluName, Prefix]),
|
||||
ok = machi_flu_filename_mgr:increment_prefix_sequence(FluName, NSInfo, {prefix, Prefix}),
|
||||
|
||||
%% purge our ets table of this entry completely since it is likely the
|
||||
%% new filename (whenever it comes) will be in a different manager than
|
||||
%% us.
|
||||
purge_ets(Tid, R),
|
||||
ok.
|
||||
|
|
|
@ -1,3 +1,23 @@
|
|||
%% -------------------------------------------------------------------
|
||||
%%
|
||||
%% Copyright (c) 2007-2016 Basho Technologies, Inc. All Rights Reserved.
|
||||
%%
|
||||
%% This file is provided to you under the Apache License,
|
||||
%% Version 2.0 (the "License"); you may not use this file
|
||||
%% except in compliance with the License. You may obtain
|
||||
%% a copy of the License at
|
||||
%%
|
||||
%% http://www.apache.org/licenses/LICENSE-2.0
|
||||
%%
|
||||
%% Unless required by applicable law or agreed to in writing,
|
||||
%% software distributed under the License is distributed on an
|
||||
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
%% KIND, either express or implied. See the License for the
|
||||
%% specific language governing permissions and limitations
|
||||
%% under the License.
|
||||
%%
|
||||
%% -------------------------------------------------------------------
|
||||
|
||||
-module(machi_plist).
|
||||
|
||||
%%% @doc persistent list of binaries
|
||||
|
|
|
@ -65,5 +65,11 @@ init([]) ->
|
|||
LifecycleMgr =
|
||||
{machi_lifecycle_mgr, {machi_lifecycle_mgr, start_link, []},
|
||||
Restart, Shutdown, worker, []},
|
||||
|
||||
{ok, {SupFlags, [ServerSup, RanchSup, LifecycleMgr]}}.
|
||||
RunningApps = [A || {A,_D,_V} <- application:which_applications()],
|
||||
Specs = case lists:member(ranch, RunningApps) of
|
||||
true ->
|
||||
[ServerSup, LifecycleMgr];
|
||||
false ->
|
||||
[ServerSup, RanchSup, LifecycleMgr]
|
||||
end,
|
||||
{ok, {SupFlags, Specs}}.
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
-export([
|
||||
checksum_chunk/1,
|
||||
make_tagged_csum/1, make_tagged_csum/2,
|
||||
make_client_csum/1,
|
||||
unmake_tagged_csum/1,
|
||||
hexstr_to_bin/1, bin_to_hexstr/1,
|
||||
hexstr_to_int/1, int_to_hexstr/2, int_to_hexbin/2,
|
||||
|
@ -327,6 +328,9 @@ make_tagged_csum(?CSUM_TAG_SERVER_SHA_ATOM, SHA) ->
|
|||
make_tagged_csum(?CSUM_TAG_SERVER_REGEN_SHA_ATOM, SHA) ->
|
||||
<<?CSUM_TAG_SERVER_REGEN_SHA:8, SHA/binary>>.
|
||||
|
||||
make_client_csum(BinOrList) ->
|
||||
make_tagged_csum(?CSUM_TAG_CLIENT_SHA_ATOM, checksum_chunk(BinOrList)).
|
||||
|
||||
unmake_tagged_csum(<<Tag:8, Rest/binary>>) ->
|
||||
{Tag, Rest}.
|
||||
|
||||
|
|
|
@ -134,6 +134,7 @@ Press control-c to interrupt the test....".
|
|||
%% convergence_demo_testfun(3).
|
||||
|
||||
-define(DEFAULT_MGR_OPTS, [{private_write_verbose, false},
|
||||
{private_write_verbose_confirm, true},
|
||||
{active_mode,false},
|
||||
{use_partition_simulator, true}]).
|
||||
|
||||
|
@ -150,7 +151,8 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) ->
|
|||
%% Faster test startup, commented: io:format(user, short_doc(), []),
|
||||
%% Faster test startup, commented: timer:sleep(3000),
|
||||
|
||||
application:start(sasl),
|
||||
Apps = [sasl, ranch],
|
||||
[application:start(App) || App <- Apps],
|
||||
|
||||
MgrOpts = MgrOpts0 ++ ?DEFAULT_MGR_OPTS,
|
||||
TcpPort = proplists:get_value(port_base, MgrOpts, 62877),
|
||||
|
@ -393,7 +395,8 @@ timer:sleep(1234),
|
|||
exit(SupPid, normal),
|
||||
ok = machi_partition_simulator:stop(),
|
||||
[ok = ?FLU_PC:quit(PPid) || {_, PPid} <- Namez],
|
||||
machi_util:wait_for_death(SupPid, 100)
|
||||
machi_util:wait_for_death(SupPid, 100),
|
||||
[application:start(App) || App <- lists:reverse(Apps)]
|
||||
end.
|
||||
|
||||
%% Many of the static partition lists below have been problematic at one
|
||||
|
|
|
@ -38,7 +38,7 @@ clean_up_data_dir(DataDir) ->
|
|||
-ifndef(PULSE).
|
||||
|
||||
-define(TESTDIR, "./t").
|
||||
-define(HYOOGE, 1 * 1024 * 1024 * 1024). % 1 long GB
|
||||
-define(HYOOGE, 75 * 1024 * 1024). % 75 MBytes
|
||||
|
||||
random_binary_single() ->
|
||||
%% OK, I guess it's not that random...
|
||||
|
|
Loading…
Reference in a new issue