Compare commits

...

120 commits

Author SHA1 Message Date
Scott Lystig Fritchie
e87bd59a97 Merge branch 'slf/perf-improvements1' into tmp/merge-delme 2016-03-29 18:40:14 +09:00
Scott Lystig Fritchie
1e0bb4c404 Fix file rollover problems 2016-03-29 18:39:52 +09:00
Scott Lystig Fritchie
549963545f Update b_b driver's top-of-module comments 2016-03-29 15:59:48 +09:00
Scott Lystig Fritchie
2aa8917875 Check checksums on 'read' ops 2016-03-29 15:33:18 +09:00
Scott Lystig Fritchie
27cbf1e38c b_b: only load ETS table if 'read' is in the 'operations' list 2016-03-29 14:58:32 +09:00
Scott Lystig Fritchie
d2fa79e037 Fix arithmetic error in src/machi_file_proxy.erl 2016-03-28 22:07:59 +09:00
Scott Lystig Fritchie
57ba204210 Client API timeout and other minor bugfixes 2016-03-28 21:35:17 +09:00
Scott Lystig Fritchie
24f8cb21a2 Fix eunit test failures related to min file position 2016-03-28 21:05:12 +09:00
Scott Lystig Fritchie
e63db8dedc Fix read_chunk op in b_b driver 2016-03-28 18:42:04 +09:00
Scott Lystig Fritchie
a739b5265c Fix append_chunk op in b_b driver 2016-03-28 17:51:53 +09:00
Scott Lystig Fritchie
767f5d9e60 Performance fix: don't always go to beginning of leveldb table! 2016-03-28 17:51:30 +09:00
Scott Lystig Fritchie
8c21539fcb Add missing copyright header comment 2016-03-28 17:48:06 +09:00
Scott Lystig Fritchie
0f24b69378 README and FAQ updates for mid-March 2016 2016-03-09 12:19:50 -08:00
Scott Lystig Fritchie
ec9d391047 README and FAQ updates for mid-March 2016 2016-03-09 12:19:04 -08:00
Scott Lystig Fritchie
fa71a918b8 README and FAQ updates for mid-March 2016 2016-03-09 12:14:51 -08:00
Scott Lystig Fritchie
6cddfcf988 Merge branch 'slf/hc-demo-env' 2016-03-09 11:16:35 -08:00
Scott Lystig Fritchie
6b000f6e7c Ignore +rel/vars/dev*vars.config 2016-03-09 11:14:43 -08:00
Scott Lystig Fritchie
96c46ec5aa Add explanation for the 'CONFIRM' log messages 2016-03-09 10:54:39 -08:00
Scott Lystig Fritchie
cd166361aa WIP 2016-03-09 10:48:00 -08:00
Scott Lystig Fritchie
4e5c16f5e2 WIP 2016-03-09 10:30:23 -08:00
Scott Lystig Fritchie
16153a5d31 Fix deps building problem, silly 2016-02-27 01:56:16 +09:00
Scott Lystig Fritchie
84f522f865 WIP: Vagrant 2016-02-27 00:05:29 +09:00
Scott Lystig Fritchie
fc46cd1b25 WIP: Vagrant 2016-02-26 17:32:51 +09:00
Scott Lystig Fritchie
184a54ebbd Change ?HYOGE blob size from 1GB -> 75MB to reduce RAM required for eunit tests 2016-02-26 15:46:17 +09:00
Scott Lystig Fritchie
4cb166368a priv/humming-consensus-demo.setup.sh debugged, all appears to work 2016-02-25 18:10:11 +09:00
Scott Lystig Fritchie
f433e84fab Add 'stability_time' env var for repair 2016-02-25 17:52:40 +09:00
Scott Lystig Fritchie
a3fbe2c8bb WIP: demo script writing, derp, need a shell script to simplify 2016-02-25 17:00:05 +09:00
Scott Lystig Fritchie
bdf47da10c oops fix doc links 2016-02-24 15:11:35 +09:00
Scott Lystig Fritchie
6c03f5c1a6 Split out docs dev-clone-compile.md and dev-prerequisites.md 2016-02-24 15:08:41 +09:00
Scott Lystig Fritchie
11921d82bf WIP: start of demo doc 2016-02-23 17:30:30 +09:00
Scott Lystig Fritchie
a27425147d Re-add a flapping check, but also take advantage of confirmed accepted epoch 2016-02-23 15:07:16 +09:00
Scott Lystig Fritchie
34f8632f19 Add ranch startup to machi_chain_manager1_converge_demo 2016-02-23 15:06:33 +09:00
Scott Lystig Fritchie
c02a0bed70 Change 'uses' verbose message to error_logger:info 2016-02-22 17:03:50 +09:00
Scott Lystig Fritchie
1d8bc19891 Fix repair-is-finished-but-message-not-consumed DoS during peer SIGSTOP 2016-02-22 16:48:02 +09:00
Scott Lystig Fritchie
53ce6d89dd Add verbose() option to machi_fitness 2016-02-19 18:02:56 +09:00
Scott Lystig Fritchie
2e46d199c8 Export csum_tag() type 2016-02-19 17:06:05 +09:00
Scott Lystig Fritchie
0f543b4c4d Add author_server to CONFIRM messages 2016-02-19 17:06:05 +09:00
Scott Lystig Fritchie
d5c3da78fb Change 'COMMIT epoch' logging & chain mgr options 2016-02-19 17:06:05 +09:00
Scott Lystig Fritchie
affad6b1d3 Specify short timeout to ?FLU_PC:kick_projection_reaction() call 2016-02-19 17:06:05 +09:00
Scott Lystig Fritchie
ed56a2c6cf Fix 'ranch' app dependency upon re-start w/FLUs configured
... and allow direct start by machi_sup for EUnit tests.
2016-02-19 17:05:34 +09:00
Scott Lystig Fritchie
c2e9a83372 Merge branch 'slf/doc-cluster-terminology' 2016-02-16 12:47:05 +09:00
Scott Lystig Fritchie
67dad7fb8a Fix dialyzer warnings 2016-02-15 17:51:08 +09:00
Scott Lystig Fritchie
9d4483ae68 Minor edits to doc/cluster/name-game-sketch.org 2016-02-15 17:23:55 +09:00
Scott Lystig Fritchie
12ebf4390d Undo testing restriction in test/machi_ap_repair_eqc.erl 2016-02-14 16:00:11 +09:00
Scott Lystig Fritchie
b246ebc376 Rearrange unfinished NS locator reminder spam in machi_flu1_net_server.erl 2016-02-14 15:59:50 +09:00
Scott Lystig Fritchie
943e23e050 Hooray, all eunit tests including EQC pass! 2016-02-10 19:35:52 +09:00
Scott Lystig Fritchie
ecfad4726b Fix machi_flu_filename_mgr to avoid double-write errors during network partitions 2016-02-10 18:17:15 +09:00
Scott Lystig Fritchie
7c39af5bb7 WIP: narrowing in on repair problems due to double-write errors 2 2016-02-10 16:57:50 +09:00
Scott Lystig Fritchie
3bd575899f WIP: narrowing in on repair problems due to double-write errors 2 2016-02-10 16:39:57 +09:00
Scott Lystig Fritchie
a7f42d636e WIP: narrowing in on repair problems due to double-write errors 2016-02-09 01:27:58 +09:00
Scott Lystig Fritchie
fbb0203f67 WIP: most eunit tests fixed, chain repair intermittently broken 2016-02-08 22:04:09 +09:00
Scott Lystig Fritchie
6e17988ac7 Comment & old TODO cleanup 2016-02-02 16:54:31 +09:00
Scott Lystig Fritchie
202ace33d3 Add doc/process-protocol-module-overview.jpg 2016-01-29 16:40:34 +09:00
Scott Lystig Fritchie
2fddf2ec2d Tweak make-faq.pl 2016-01-29 15:10:00 +09:00
Scott Lystig Fritchie
3b82dc2e38 'Thread through' FLU props to machi_flu1_net_server 2015-12-31 17:34:35 +09:00
Scott Lystig Fritchie
3b594504fe Client API module edoc added, see also http://www.snookles.com/scotttmp/IMG_7279-copy-copy.jpg 2015-12-31 17:34:20 +09:00
Scott Lystig Fritchie
a3fc1c3d68 Add namespace info to wedge_status API call; add namespace enforcement @ machi_flu1_net_server 2015-12-31 14:34:15 +09:00
Scott Lystig Fritchie
f09eef14eb Fix damn-syntactically-valid-not-found-by-dialyzer typo 2015-12-30 15:54:19 +09:00
Scott Lystig Fritchie
c65424569d Use 'bool' type in PB spec where feasible 2015-12-29 19:17:18 +09:00
Scott Lystig Fritchie
3c6f1be5d0 Change read_chunk options to use new #read_opts{} 2015-12-29 18:47:08 +09:00
Scott Lystig Fritchie
76ae4247cd Fix cut-and-paste-o 2015-12-29 18:02:56 +09:00
Scott Lystig Fritchie
e24acb7246 Clean up internal protocol<->tuple mappings for correct epoch checking 2015-12-29 17:26:09 +09:00
Scott Lystig Fritchie
5a65a164c3 Remove straggler CoC items in code 2015-12-29 16:01:52 +09:00
Scott Lystig Fritchie
0a8c4156c2 trim_chunk API refactoring; all tests pass; todo tasks remain 2015-12-29 14:13:33 +09:00
Scott Lystig Fritchie
3d730ea215 write_chunk API refactoring; all tests pass; todo tasks remain 2015-12-29 14:13:33 +09:00
Scott Lystig Fritchie
6089ee6851 read_chunk API refactoring; all tests pass; todo tasks remain 2015-12-29 14:13:33 +09:00
Scott Lystig Fritchie
2932a17ea6 append_chunk API refactoring; all tests pass; todo tasks remain 2015-12-29 14:13:29 +09:00
Scott Lystig Fritchie
03b118b52c Clustering API changes in various docs
* name-game-sketch.org
* flu-and-chain-lifecycle.org
* FAQ.md

I've left out changes to the two design docs for now; most of their
respective texts omit multiple chain scenarios entirely, so there
isn't a huge amount to change.
2015-12-29 14:09:00 +09:00
Scott Lystig Fritchie
546901ef49 Dialyzer warning cleanup 2015-12-18 17:48:33 +09:00
Scott Lystig Fritchie
de5d5e88dd Do not use 18.x for TravisCI testing 2015-12-18 17:40:16 +09:00
Scott Lystig Fritchie
70d42a3fb5 Merge pull request #55 from basho/ss/flu1-factorization2
Ss/flu1 factorization2
2015-12-18 17:19:17 +09:00
Scott Lystig Fritchie
7d262fd3ec Move update_wedge_state() & wedge_myself() to machi_flu1_append_server.erl 2015-12-18 16:56:01 +09:00
Shunichi Shinohara
b8297afc18 Reduce count of acceptor processes 100 -> 10 2015-12-18 16:30:35 +09:00
Scott Lystig Fritchie
c49ccafdc6 Merge slf/flu-config-rcd-style 2015-12-18 15:41:02 +09:00
Scott Lystig Fritchie
d602663060 Ignore RUNLOG* 2015-12-18 13:43:18 +09:00
Scott Lystig Fritchie
0922def0d6 s/verb/term/gi 2015-12-18 11:50:15 +09:00
Scott Lystig Fritchie
bb0e67f6e0 Add doc/flu-and-chain-lifecycle.org 2015-12-17 21:33:30 +09:00
Scott Lystig Fritchie
51a05ba770 Fix dialyzer complaints in machi_lifecycle_mgr.erl 2015-12-17 12:44:10 +09:00
Scott Lystig Fritchie
1d1bfadb96 Corrections from review 2015-12-17 12:31:32 +09:00
Scott Lystig Fritchie
f98b4da45b Add 'quick admin' config management: better file handling 2015-12-16 19:05:25 +09:00
Shunichi Shinohara
dffb73330d Dialyzer and eunit fix 2015-12-16 17:56:17 +09:00
Shunichi Shinohara
3a0086afb2 Change listen port to avoid eaddrinuse on Linux 2015-12-16 17:38:28 +09:00
Shunichi Shinohara
dcb2464cb8 Separate append server as gen_server callback module 2015-12-16 17:33:53 +09:00
Scott Lystig Fritchie
463d20a9fd Add 'quick admin' config management tool/hack 2015-12-16 16:41:11 +09:00
Scott Lystig Fritchie
6f077fbb62 New machi_lifecycle_mgr_test, AST spec -> running FLUs & chains works! 2015-12-11 19:07:00 +09:00
Scott Lystig Fritchie
e55115fdba All EUnit tests in machi_lifecycle_mgr_test pass! 2015-12-11 17:28:27 +09:00
Scott Lystig Fritchie
009bad230f WIP: change internal types for most strings -> atom to match chmgr internal use 2015-12-11 16:36:18 +09:00
Scott Lystig Fritchie
6b7d871ada WIP: diff in progress 2 2015-12-11 16:26:13 +09:00
Scott Lystig Fritchie
1db232db1b WIP: diff in progress 2015-12-11 15:33:31 +09:00
Scott Lystig Fritchie
3826af8ee2 WIP: dict -> gb_trees, 2 of 2 2015-12-11 13:17:33 +09:00
Scott Lystig Fritchie
df8eea8c10 WIP: dict -> gb_trees, 1 of 2 2015-12-11 12:54:54 +09:00
Scott Lystig Fritchie
61eae1300f WIP: finish basic 'run', add negative tests 2015-12-11 12:43:38 +09:00
Scott Lystig Fritchie
3ee3de1aaf WIP: end of day 2015-12-10 23:44:27 +09:00
Scott Lystig Fritchie
6a5c590ad1 WIP: AST change {chain,...} thingie 2015-12-10 23:05:08 +09:00
Scott Lystig Fritchie
c37f23d97a WIP: 'Run' AST thingie ha, take that, wheel! 2015-12-10 22:53:17 +09:00
Scott Lystig Fritchie
9cec53eea6 Yet another strawman AST 2015-12-10 19:18:25 +09:00
Scott Lystig Fritchie
9472bad37b Clean up test errors 2015-12-10 15:57:35 +09:00
Scott Lystig Fritchie
cb706f0d23 Add test/machi_lifecycle_mgr_test.erl 2015-12-10 15:20:56 +09:00
Scott Lystig Fritchie
61ef7739cd Modify chain mostly works, better 2015-12-10 00:12:34 +09:00
Scott Lystig Fritchie
b0a9e65ca2 WIP: trying to shut down entire chain, but buggy, derp 2015-12-09 23:00:27 +09:00
Scott Lystig Fritchie
95e2df304e WIP: minor cleanup 2015-12-09 22:25:43 +09:00
Scott Lystig Fritchie
7f25fcc8f8 Modify chain mostly works 2015-12-09 19:02:16 +09:00
Scott Lystig Fritchie
cd9bf9eeab Modify chain mostly works, 2 TODOs remain 2015-12-09 18:17:26 +09:00
Scott Lystig Fritchie
2871f8397c WIP: modify chain still a bit broken 2015-12-09 17:19:02 +09:00
Scott Lystig Fritchie
65eec61f82 Basic stuff to add new flus via 'pending' dir 2015-12-09 14:48:46 +09:00
Scott Lystig Fritchie
7301c8308e Clarify the initial docs, thanks @mrallen1! 2015-12-09 14:07:27 +09:00
Scott Lystig Fritchie
f23e500993 WIP comments 2015-12-09 11:32:05 +09:00
Scott Lystig Fritchie
69280bfb4f Fix typo/thinko: correct chain name @ bootstrap 2015-12-08 22:19:26 +09:00
Scott Lystig Fritchie
0fc7bc74b7 EDoc fixes 2015-12-08 22:05:11 +09:00
Scott Lystig Fritchie
8285899dba Bootstrap chain @ app init: done, with an example.
For example:

% make clean
% make stage

And then configure 3 FLUs:

    % echo '{p_srvr, a, machi_flu1_client, "localhost", 39000, []}.' > rel/machi/etc/flu-config/a
    % echo '{p_srvr, b, machi_flu1_client, "localhost", 39001, []}.' > rel/machi/etc/flu-config/b
    % echo '{p_srvr, c, machi_flu1_client, "localhost", 39002, []}.' > rel/machi/etc/flu-config/c

And then configure a chain to use 2 of those 3 FLUs:

    % echo '{chain_def_v1,c1,ap_mode,[{p_srvr,a,machi_flu1_client,"localhost",39000,[]},{p_srvr,b,machi_flu1_client,"localhost",39001,[]}],[],[]}.' > rel/machi/etc/chain-config/c1

... then start Machi e.g.

    % ./rel/machi/bin/machi console

... you should see the following console messages scroll by, including:

    =PROGRESS REPORT==== 8-Dec-2015::22:01:44 ===
              supervisor: {local,machi_flu_sup}
                 started: [{pid,<0.145.0>},
                           {name,a},
                           {mfargs,
                               {machi_flu_psup,start_link,
                                   [a,39000,"./data/flu/a",[]]}},
                           {restart_type,permanent},
                           {shutdown,5000},
                           {child_type,supervisor}]

    [... and also for the other two FLUs, including a bunch of progress
         reports for processes that started underneath that sub-supervisor.]

    22:01:44.446 [info] Running FLUs: [a,b,c]
    22:01:44.446 [info] Running FLUs at epoch 0: [a,b,c]
    22:01:44.532 [warning] The following FLUs are defined but are not also members of a defined chain: [c]
2015-12-08 21:57:29 +09:00
Scott Lystig Fritchie
37ac09a680 Rename src/machi_chain_bootstrap.erl -> src/machi_lifecycle_mgr.erl 2015-12-08 17:46:11 +09:00
Scott Lystig Fritchie
3391c89818 Clean up verbosity of nonunanimous_setup_and_fix_test2() 2015-12-08 16:29:56 +09:00
Scott Lystig Fritchie
27e8a31307 Fix fitness timing problem with short-circuit +trigger_early_adjustment/2 2015-12-08 15:27:47 +09:00
Scott Lystig Fritchie
ef10ebed22 WIP: now trying to diagnose fitness server bug? 2015-12-08 14:50:16 +09:00
Scott Lystig Fritchie
1bc9033076 Yay, all tests pass! 2015-12-07 22:15:23 +09:00
Scott Lystig Fritchie
38e63e8181 Add & remove, mostly working (2 eunit tests broken) 2015-12-07 21:52:27 +09:00
Scott Lystig Fritchie
5aeaf872d9 WIP: machi_chain_manager1:set_chain_members() API change, all tests pass, yay 2015-12-07 14:41:56 +09:00
Scott Lystig Fritchie
3c880dc437 WIP: find 1st overlapping FLU in any #chain_def_v1{} 2015-12-04 17:47:18 +09:00
Scott Lystig Fritchie
a7ffef6b8e Add src/machi_chain_bootstrap.erl 2015-12-04 17:18:15 +09:00
Scott Lystig Fritchie
cf0829b934 Add rc.d style config dir for FLU server startup 2015-12-04 16:37:05 +09:00
81 changed files with 5792 additions and 2500 deletions

.gitignore

@ -2,7 +2,9 @@ prototype/chain-manager/patch.*
.eqc-info
.eunit
deps
dev
erl_crash.dump
eqc
.concrete/DEV_MODE
.rebar
edoc
@ -20,10 +22,12 @@ include/machi_pb.hrl
# Release packaging
rel/machi
rel/vars/dev*vars.config
# Misc Scott cruft
*.patch
current_counterexample.eqc
foo*
RUNLOG*
typescript*
*.swp

.travis.yml

@ -4,4 +4,4 @@ notifications:
script: "priv/test-for-gh-pr.sh"
otp_release:
- 17.5
- 18.1
## No, Dialyzer is too different between 17 & 18: - 18.1

FAQ.md

@ -11,14 +11,14 @@
+ [1 Questions about Machi in general](#n1)
+ [1.1 What is Machi?](#n1.1)
+ [1.2 What is a Machi "cluster of clusters"?](#n1.2)
+ [1.2.1 This "cluster of clusters" idea needs a better name, don't you agree?](#n1.2.1)
+ [1.3 What is Machi like when operating in "eventually consistent" mode?](#n1.3)
+ [1.4 What is Machi like when operating in "strongly consistent" mode?](#n1.4)
+ [1.5 What does Machi's API look like?](#n1.5)
+ [1.6 What licensing terms are used by Machi?](#n1.6)
+ [1.7 Where can I find the Machi source code and documentation? Can I contribute?](#n1.7)
+ [1.8 What is Machi's expected release schedule, packaging, and operating system/OS distribution support?](#n1.8)
+ [1.2 What is a Machi chain?](#n1.2)
+ [1.3 What is a Machi cluster?](#n1.3)
+ [1.4 What is Machi like when operating in "eventually consistent" mode?](#n1.4)
+ [1.5 What is Machi like when operating in "strongly consistent" mode?](#n1.5)
+ [1.6 What does Machi's API look like?](#n1.6)
+ [1.7 What licensing terms are used by Machi?](#n1.7)
+ [1.8 Where can I find the Machi source code and documentation? Can I contribute?](#n1.8)
+ [1.9 What is Machi's expected release schedule, packaging, and operating system/OS distribution support?](#n1.9)
+ [2 Questions about Machi relative to {{something else}}](#n2)
+ [2.1 How is Machi better than Hadoop?](#n2.1)
+ [2.2 How does Machi differ from HadoopFS/HDFS?](#n2.2)
@ -28,13 +28,15 @@
+ [3 Machi's specifics](#n3)
+ [3.1 What technique is used to replicate Machi's files? Can other techniques be used?](#n3.1)
+ [3.2 Does Machi have a reliance on a coordination service such as ZooKeeper or etcd?](#n3.2)
+ [3.3 Is it true that there's an allegory written to describe humming consensus?](#n3.3)
+ [3.4 How is Machi tested?](#n3.4)
+ [3.5 Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks](#n3.5)
+ [3.6 Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device?](#n3.6)
+ [3.7 What language(s) is Machi written in?](#n3.7)
+ [3.8 Does Machi use the Erlang/OTP network distribution system (aka "disterl")?](#n3.8)
+ [3.9 Can I use HTTP to write/read stuff into/from Machi?](#n3.9)
+ [3.3 Are there any presentations available about Humming Consensus?](#n3.3)
+ [3.4 Is it true that there's an allegory written to describe Humming Consensus?](#n3.4)
+ [3.5 How is Machi tested?](#n3.5)
+ [3.6 Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks](#n3.6)
+ [3.7 Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device?](#n3.7)
+ [3.8 What language(s) is Machi written in?](#n3.8)
+ [3.9 Can Machi run on Windows? Can Machi run on 32-bit platforms?](#n3.9)
+ [3.10 Does Machi use the Erlang/OTP network distribution system (aka "disterl")?](#n3.10)
+ [3.11 Can I use HTTP to write/read stuff into/from Machi?](#n3.11)
<!-- ENDOUTLINE -->
@ -44,13 +46,13 @@
<a name="n1.1">
### 1.1. What is Machi?
Very briefly, Machi is a very simple append-only file store.
Very briefly, Machi is a very simple append-only blob/file store.
Machi is
"dumber" than many other file stores (i.e., lacking many features
found in other file stores) such as HadoopFS or simple NFS or CIFS file
found in other file stores) such as HadoopFS or a simple NFS or CIFS file
server.
However, Machi is a distributed file store, which makes it different
However, Machi is a distributed blob/file store, which makes it different
(and, in some ways, more complicated) than a simple NFS or CIFS file
server.
@ -82,45 +84,39 @@ For a much longer answer, please see the
[Machi high level design doc](https://github.com/basho/machi/tree/master/doc/high-level-machi.pdf).
<a name="n1.2">
### 1.2. What is a Machi "cluster of clusters"?
### 1.2. What is a Machi chain?
Machi's design is based on using small, well-understood and provable
(mathematically) techniques to maintain multiple file copies without
data loss or data corruption. At its lowest level, Machi contains no
support for distribution/partitioning/sharding of files across many
servers. A typical, fully-functional Machi cluster will likely be two
or three machines.
A Machi chain is a small number of machines that maintain a common set
of replicated files. A typical chain is of length 2 or 3. For
critical data that must be available despite several simultaneous
server failures, a chain length of 6 or 7 might be used.
However, Machi is designed to be an excellent building block for
building larger systems. A deployment of Machi "cluster of clusters"
will use the "random slicing" technique for partitioning files across
multiple Machi clusters that, as individuals, are unaware of the
larger cluster-of-clusters scheme.
<a name="n1.3">
### 1.3. What is a Machi cluster?
The cluster-of-clusters management service will be fully decentralized
A Machi cluster is a collection of Machi chains that
partitions/shards/distributes files (based on file name) across the
collection of chains. Machi uses the "random slicing" algorithm (a
variation of consistent hashing) to define the mapping of file name to
chain name.
The cluster management service will be fully decentralized
and run as a separate software service installed on each Machi
cluster. This manager will appear to the local Machi server as simply
another Machi file client. The cluster-of-clusters managers will take
another Machi file client. The cluster managers will take
care of file migration as the cluster grows and shrinks in capacity
and in response to day-to-day changes in workload.
Though the cluster-of-clusters manager has not yet been implemented,
Though the cluster manager has not yet been implemented,
its design is fully decentralized and capable of operating despite
multiple partial failure of its member clusters. We expect this
multiple partial failure of its member chains. We expect this
design to scale easily to at least one thousand servers.
Please see the
[Machi source repository's 'doc' directory for more details](https://github.com/basho/machi/tree/master/doc/).
<a name="n1.2.1">
#### 1.2.1. This "cluster of clusters" idea needs a better name, don't you agree?
Yes. Please help us: we are bad at naming things.
For proof that naming things is hard, see
[http://martinfowler.com/bliki/TwoHardThings.html](http://martinfowler.com/bliki/TwoHardThings.html)
<a name="n1.3">
### 1.3. What is Machi like when operating in "eventually consistent" mode?
<a name="n1.4">
### 1.4. What is Machi like when operating in "eventually consistent" mode?
Machi's operating mode dictates how a Machi cluster will react to
network partitions. A network partition may be caused by:
@ -143,13 +139,14 @@ consistency mode during and after network partitions are:
together from "all sides" of the partition(s).
* Unique files are copied in their entirety.
* Byte ranges within the same file are merged. This is possible
due to Machi's restrictions on file naming (files names are
alwoys assigned by Machi servers) and file offset assignments
(byte offsets are also always chosen by Machi servers according
to rules which guarantee safe mergeability.).
due to Machi's restrictions on file naming and file offset
assignment. Both file names and file offsets are always chosen
by Machi servers according to rules which guarantee safe
mergeability. Server-assigned names are a characteristic of a
"blob store".
<a name="n1.4">
### 1.4. What is Machi like when operating in "strongly consistent" mode?
<a name="n1.5">
### 1.5. What is Machi like when operating in "strongly consistent" mode?
The consistency semantics of file operations while in strongly
consistency mode during and after network partitions are:
@ -167,19 +164,19 @@ consistency mode during and after network partitions are:
Machi's design can provide the illusion of quorum minority write
availability if the cluster is configured to operate with "witness
servers". (This feaure is not implemented yet, as of June 2015.)
servers". (This feaure partially implemented, as of December 2015.)
See Section 11 of
[Machi chain manager high level design doc](https://github.com/basho/machi/tree/master/doc/high-level-chain-mgr.pdf)
for more details.
<a name="n1.5">
### 1.5. What does Machi's API look like?
<a name="n1.6">
### 1.6. What does Machi's API look like?
The Machi API only contains a handful of API operations. The function
arguments shown below use Erlang-style type annotations.
arguments shown below (in simplified form) use Erlang-style type annotations.
append_chunk(Prefix:binary(), Chunk:binary()).
append_chunk_extra(Prefix:binary(), Chunk:binary(), ExtraSpace:non_neg_integer()).
append_chunk(Prefix:binary(), Chunk:binary(), CheckSum:binary()).
append_chunk_extra(Prefix:binary(), Chunk:binary(), CheckSum:binary(), ExtraSpace:non_neg_integer()).
read_chunk(File:binary(), Offset:non_neg_integer(), Size:non_neg_integer()).
checksum_list(File:binary()).
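As a rough sketch of how a client session might use these operations (the `machi_client` module name, checksum style, and return value shapes below are illustrative assumptions, not the exact client API):

    %% Hypothetical session; module name, checksum style, and return
    %% shapes are illustrative assumptions, not the exact client API.
    Chunk = <<"Hello, Machi!">>,
    CSum  = crypto:hash(sha, Chunk),
    {ok, {File, Offset}} = machi_client:append_chunk(<<"pre">>, Chunk, CSum),
    {ok, Chunk} = machi_client:read_chunk(File, Offset, byte_size(Chunk)).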
@ -204,15 +201,15 @@ level" internal protocol are in a
[Protocol Buffers](https://developers.google.com/protocol-buffers/docs/overview)
definition at [./src/machi.proto](./src/machi.proto).
<a name="n1.6">
### 1.6. What licensing terms are used by Machi?
<a name="n1.7">
### 1.7. What licensing terms are used by Machi?
All Machi source code and documentation is licensed by
[Basho Technologies, Inc.](http://www.basho.com/)
under the [Apache Public License version 2](https://github.com/basho/machi/tree/master/LICENSE).
<a name="n1.7">
### 1.7. Where can I find the Machi source code and documentation? Can I contribute?
<a name="n1.8">
### 1.8. Where can I find the Machi source code and documentation? Can I contribute?
All Machi source code and documentation can be found at GitHub:
[https://github.com/basho/machi](https://github.com/basho/machi).
@ -226,8 +223,8 @@ ideas for improvement, please see our contributing & collaboration
guidelines at
[https://github.com/basho/machi/blob/master/CONTRIBUTING.md](https://github.com/basho/machi/blob/master/CONTRIBUTING.md).
<a name="n1.8">
### 1.8. What is Machi's expected release schedule, packaging, and operating system/OS distribution support?
<a name="n1.9">
### 1.9. What is Machi's expected release schedule, packaging, and operating system/OS distribution support?
Basho expects that Machi's first major product release will take place
during the 2nd quarter of 2016.
@ -305,15 +302,15 @@ file's writable phase).
<tr>
<td> Does not have any file distribution/partitioning/sharding across
Machi clusters: in a single Machi cluster, all files are replicated by
all servers in the cluster. The "cluster of clusters" concept is used
Machi chains: in a single Machi chain, all files are replicated by
all servers in the chain. The "random slicing" technique is used
to distribute/partition/shard files across multiple Machi clusters.
<td> File distribution/partitioning/sharding is performed
automatically by the HDFS "name node".
<tr>
<td> Machi requires no central "name node" for single cluster use.
Machi requires no central "name node" for "cluster of clusters" use
<td> Machi requires no central "name node" for single chain use or
for multi-chain cluster use.
<td> Requires a single "namenode" server to maintain file system contents
and file content mapping. (May be deployed with a "secondary
namenode" to reduce unavailability when the primary namenode fails.)
@ -479,8 +476,8 @@ difficult to adapt to Machi's design goals:
* Both protocols use quorum majority consensus, which requires a
minimum of *2F + 1* working servers to tolerate *F* failures. For
example, to tolerate 2 server failures, quorum majority protocols
require a minium of 5 servers. To tolerate the same number of
failures, Chain replication requires only 3 servers.
require a minimum of 5 servers. To tolerate the same number of
failures, Chain Replication requires a minimum of only 3 servers.
* Machi's use of "humming consensus" to manage internal server
metadata state would also (probably) require conversion to Paxos or
Raft. (Or "outsourced" to a service such as ZooKeeper.)
@ -497,7 +494,17 @@ Humming consensus is described in the
[Machi chain manager high level design doc](https://github.com/basho/machi/tree/master/doc/high-level-chain-mgr.pdf).
<a name="n3.3">
### 3.3. Is it true that there's an allegory written to describe humming consensus?
### 3.3. Are there any presentations available about Humming Consensus?
Scott recently (November 2015) gave a presentation at the
[RICON 2015 conference](http://ricon.io) about one of the techniques
used by Machi; "Managing Chain Replication Metadata with
Humming Consensus" is available online now.
* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf)
* [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q)
<a name="n3.4">
### 3.4. Is it true that there's an allegory written to describe Humming Consensus?
Yes. In homage to Leslie Lamport's original paper about the Paxos
protocol, "The Part-time Parliamant", there is an allegorical story
@ -508,8 +515,8 @@ The full story, full of wonder and mystery, is called
There is also a
[short followup blog posting](http://www.snookles.com/slf-blog/2015/03/20/on-humming-consensus-an-allegory-part-2/).
<a name="n3.4">
### 3.4. How is Machi tested?
<a name="n3.5">
### 3.5. How is Machi tested?
While not formally proven yet, Machi's implementation of Chain
Replication and of humming consensus have been extensively tested with
@ -538,16 +545,16 @@ All test code is available in the [./test](./test) subdirectory.
Modules that use QuickCheck will use a file suffix of `_eqc`, for
example, [./test/machi_ap_repair_eqc.erl](./test/machi_ap_repair_eqc.erl).
<a name="n3.5">
### 3.5. Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks
<a name="n3.6">
### 3.6. Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks
No, Machi's design assumes that each Machi server is fully
independent hardware and assumes only standard local disks (Winchester
and/or SSD style) with local-only interfaces (e.g. SATA, SCSI, PCI) in
each machine.
<a name="n3.6">
### 3.6. Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device?
<a name="n3.7">
### 3.7. Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device?
No. When used with servers with multiple disks, the intent is to
deploy multiple Machi servers per machine: one Machi server per disk.
@ -565,10 +572,10 @@ deploy multiple Machi servers per machine: one Machi server per disk.
placement relative to 12 servers is smaller than a placement problem
of managing 264 separate disks (if each of 12 servers has 22 disks).
<a name="n3.7">
### 3.7. What language(s) is Machi written in?
<a name="n3.8">
### 3.8. What language(s) is Machi written in?
So far, Machi is written in 100% Erlang. Machi uses at least one
So far, Machi is written in Erlang, mostly. Machi uses at least one
library, [ELevelDB](https://github.com/basho/eleveldb), that is
implemented both in C++ and in Erlang, using Erlang NIFs (Native
Interface Functions) to allow Erlang code to call C++ functions.
@ -580,8 +587,16 @@ in C, Java, or other "gotta go fast fast FAST!!" programming
language. We expect that the Chain Replication manager and other
critical "control plane" software will remain in Erlang.
<a name="n3.8">
### 3.8. Does Machi use the Erlang/OTP network distribution system (aka "disterl")?
<a name="n3.9">
### 3.9. Can Machi run on Windows? Can Machi run on 32-bit platforms?
The ELevelDB NIF does not compile or run correctly on Windows
platforms, nor does it compile correctly on 32-bit platforms.
Machi should support all 64-bit UNIX-like platforms that are supported
by Erlang/OTP and ELevelDB.
<a name="n3.10">
### 3.10. Does Machi use the Erlang/OTP network distribution system (aka "disterl")?
No, Machi doesn't use Erlang/OTP's built-in distributed message
passing system. The code would be *much* simpler if we did use
@ -596,8 +611,8 @@ All wire protocols used by Machi are defined & implemented using
[Protocol Buffers](https://developers.google.com/protocol-buffers/docs/overview).
The definition file can be found at [./src/machi.proto](./src/machi.proto).
<a name="n3.9">
### 3.9. Can I use HTTP to write/read stuff into/from Machi?
<a name="n3.11">
### 3.11. Can I use HTTP to write/read stuff into/from Machi?
Short answer: No, not yet.

Makefile

@ -10,7 +10,7 @@ endif
OVERLAY_VARS ?=
EUNIT_OPTS = -v
.PHONY: rel deps package pkgclean edoc
.PHONY: rel stagedevrel deps package pkgclean edoc
all: deps compile
@ -35,6 +35,9 @@ deps:
clean:
$(REBAR) -r clean
edoc: edoc-clean
$(REBAR) skip_deps=true doc
edoc-clean:
rm -f edoc/*.png edoc/*.html edoc/*.css edoc/edoc-info
@ -54,6 +57,37 @@ relclean:
stage : rel
$(foreach dep,$(wildcard deps/*), rm -rf rel/$(REPO)/lib/$(shell basename $(dep))* && ln -sf $(abspath $(dep)) rel/$(REPO)/lib;)
##
## Developer targets
##
## devN - Make a dev build for node N
## stagedevN - Make a stage dev build for node N (symlink libraries)
## devrel - Make a dev build for 1..$DEVNODES
## stagedevrel Make a stagedev build for 1..$DEVNODES
##
## Example, make a 68 node devrel cluster
## make stagedevrel DEVNODES=68
.PHONY : stagedevrel devrel
DEVNODES ?= 3
# 'seq' is not available on all *BSD, so using an alternate in awk
SEQ = $(shell awk 'BEGIN { for (i = 1; i < '$(DEVNODES)'; i++) printf("%i ", i); print i ;exit(0);}')
$(eval stagedevrel : $(foreach n,$(SEQ),stagedev$(n)))
$(eval devrel : $(foreach n,$(SEQ),dev$(n)))
dev% : all
mkdir -p dev
rel/gen_dev $@ rel/vars/dev_vars.config.src rel/vars/$@_vars.config
(cd rel && ../rebar generate target_dir=../dev/$@ overlay_vars=vars/$@_vars.config)
stagedev% : dev%
$(foreach dep,$(wildcard deps/*), rm -rf dev/$^/lib/$(shell basename $(dep))* && ln -sf $(abspath $(dep)) dev/$^/lib;)
devclean: clean
rm -rf dev
DIALYZER_APPS = kernel stdlib sasl erts ssl compiler eunit crypto public_key syntax_tools
PLT = $(HOME)/.machi_dialyzer_plt

README.md

@ -1,19 +1,19 @@
# Machi: a robust & reliable, distributed, highly available, large file store
# Machi: a distributed, decentralized blob/large file store
[Travis-CI](http://travis-ci.org/basho/machi) :: ![Travis-CI](https://secure.travis-ci.org/basho/machi.png)
Outline
1. [Why another file store?](#sec1)
1. [Why another blob/file store?](#sec1)
2. [Where to learn more about Machi](#sec2)
3. [Development status summary](#sec3)
4. [Contributing to Machi's development](#sec4)
<a name="sec1">
## 1. Why another file store?
## 1. Why another blob/file store?
Our goal is a robust & reliable, distributed, highly available, large
file store. Such stores already exist, both in the open source world
file and blob store. Such stores already exist, both in the open source world
and in the commercial world. Why reinvent the wheel? We believe
there are three reasons, ordered by decreasing rarity.
@ -25,9 +25,8 @@ there are three reasons, ordered by decreasing rarity.
3. We want to manage file replicas in a way that's provably correct
and also easy to test.
Of all the file stores in the open source & commercial worlds, only
criteria #3 is a viable option. Or so we hope. Or we just don't
care, and if data gets lost or corrupted, then ... so be it.
Criteria #3 is difficult to find in the open source world but perhaps
not impossible.
If we have app use cases where availability is more important than
consistency, then systems that meet criteria #2 are also rare.
@ -39,12 +38,13 @@ file data and attempts best-effort file reads?
If we really do care about data loss and/or data corruption, then we
really want both #3 and #1. Unfortunately, systems that meet
criteria #1 are _very rare_.
criteria #1 are _very rare_. (Nonexistent?)
Why? This is 2015. We have decades of research that shows
that computer hardware can (and
indeed does) corrupt data at nearly every level of the modern
client/server application stack. Systems with end-to-end data
corruption detection should be ubiquitous today. Alas, they are not.
Machi is an effort to change the deplorable state of the world, one
Erlang function at a time.
@ -64,49 +64,68 @@ Humming Consensus" is available online now.
* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf)
* [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q)
See later in this document for how to run the Humming Consensus demos,
including the network partition simulator.
<a name="sec3">
## 3. Development status summary
Mid-December 2015: work is underway.
Mid-March 2016: The Machi development team has been downsized in
recent months, and the pace of development has slowed. Here is a
summary of the status of Machi's major components.
* In progress:
* Code refactoring: metadata management using
[ELevelDB](https://github.com/basho/eleveldb)
* File repair using file-centric, Merkle-style hash tree.
* Server-side socket handling is now performed by
[ranch](https://github.com/ninenines/ranch)
* QuickCheck tests for file repair correctness
* 2015-12-15: The EUnit test `machi_ap_repair_eqc` is
currently failing occasionally because it (correctly) detects
double-write errors. Double-write errors will be eliminated
when the ELevelDB integration work is complete.
* The `make stage` and `make release` commands can be used to
create a primitive "package". Use `./rel/machi/bin/machi console`
to start the Machi app in interactive mode. Substitute `start` for
`console` to start Machi in background/daemon mode. The
`./rel/machi/bin/machi` command without any arguments
will give a short usage summary.
* Chain Replication management using the Humming Consensus
algorithm to manage chain state is stable.
* ... with the caveat that it runs very well in a very harsh
and unforgiving network partition simulator but has not run
much yet in the real world.
* All Machi client/server protocols are based on
[Protocol Buffers](https://developers.google.com/protocol-buffers/docs/overview).
* The current specification for Machi's protocols can be found at
[https://github.com/basho/machi/blob/master/src/machi.proto](https://github.com/basho/machi/blob/master/src/machi.proto).
* The Machi PB protocol is not yet stable. Expect change!
* The Erlang language client implementation of the high-level
protocol flavor is brittle (e.g., little error handling yet).
* Humming Consensus and the chain manager
* No new safety bugs have been found by model-checking tests.
* A new document,
[Hands-on experiments with Machi and Humming Consensus](doc/humming-consensus-demo.md)
is now available. It is a tutorial for setting up a 3 virtual
machine Machi cluster and how to demonstrate the chain manager's
reactions to server stops & starts, crashes & restarts, and pauses
(simulated by `SIGSTOP` and `SIGCONT`).
* The chain manager can still make suboptimal-but-safe choices for
chain transitions when a server hangs/pauses temporarily.
* Recent chain manager changes have made the instability window
much shorter when the slow/paused server resumes execution.
* Scott believes that a modest change to the chain manager's
calculation of a new projection can make flapping in this (and
many other) cases less likely. Currently, the new local
projection is calculated using only local state (i.e., the chain
manager's internal state + the fitness server's state).
However, if the "latest" projection read from the public
projection stores were also input to the new projection
calculation function, then many obviously bad projections can be
avoided without needing rounds of Humming Consensus to
demonstrate that a bad projection is bad.
If you would like to run the network partition simulator
mentioned in the Ricon 2015 presentation about Humming Consensus,
please see the
[partition simulator convergence test doc.](./doc/machi_chain_manager1_converge_demo.md)
* FLU/data server process
* All known correctness bugs have been fixed.
* Performance has not yet been measured. Performance measurement
and enhancements are scheduled to start in the middle of March 2016.
(This will include a much-needed update to the `basho_bench` driver.)
If you'd like to work on a protocol such as Thrift, UBF,
msgpack over UDP, or some other protocol, let us know by
[opening an issue to discuss it](./issues/new).
* Access protocols and client libraries
* The protocol used by both external clients and internally (instead
of using Erlang's native message passing mechanisms) is based on
Protocol Buffers.
* [Machi PB protocol specification](./src/machi.proto)
* At the moment, the PB specification contains two protocols.
Sometime in the near future, the spec will be split to separate
the external client API (the "high" protocol) from the internal
communication API (the "low" protocol).
* Recent conference talks about Machi
* Erlang Factory San Francisco 2016
[the slides and video recording](http://www.erlang-factory.com/sfbay2016/scott-lystig-fritchie)
will be available a few weeks after the conference ends on March
11, 2016.
* Ricon 2015
* [The slides](http://ricon.io/archive/2015/slides/Scott_Fritchie_Ricon_2015.pdf)
* and the [video recording](https://www.youtube.com/watch?v=yR5kHL1bu1Q&index=13&list=PL9Jh2HsAWHxIc7Tt2M6xez_TOP21GBH6M)
are now available.
* If you would like to run the Humming Consensus code (with or without
the network partition simulator) as described in the RICON 2015
presentation, please see the
[Humming Consensus demo doc](./doc/humming-consensus-demo.md).
<a name="sec4">
## 4. Contributing to Machi's development
@ -134,13 +153,22 @@ X. The only known limitations for using R16 are minor type
specification difference between R16 and 17, but we strongly suggest
continuing development using version 17.
We also assume that you have the standard UNIX/Linux developers
tool chain for C and C++ applications. Specifically, we assume `make`
is available. The utility used to compile the Machi source code,
We also assume that you have the standard UNIX/Linux developer
tool chain for C and C++ applications. Also, we assume
that Git and GNU Make are available.
The utility used to compile the Machi source code,
`rebar`, is pre-compiled and included in the repo.
For more details, please see the
[Machi development environment prerequisites doc](./doc/dev-prerequisites.md).
Machi has a dependency on the
[ELevelDB](https://github.com/basho/eleveldb) library. ELevelDB only
supports UNIX/Linux OSes and 64-bit versions of Erlang/OTP only; we
apologize to Windows-based and 32-bit-based Erlang developers for this
restriction.
### 4.3 New protocols and features
If you'd like to work on a protocol such as Thrift, UBF,
msgpack over UDP, or some other protocol, let us know by
[opening an issue to discuss it](./issues/new).

dialyzer.ignore-warnings

@ -11,6 +11,5 @@ Unknown types:
######## Specific messages #####################
##################################################
machi_chain_manager1.erl:2473: The created fun has no local return
machi_chain_manager1.erl:2102: The pattern <_P1, P2, Else = {'expected_author2', UPI1_tail, _}> can never match the type <#projection_v1{epoch_number::'undefined' | non_neg_integer(),epoch_csum::'undefined' | binary(),author_server::atom() | binary(),all_members::'undefined' | [atom() | binary()],witnesses::[atom() | binary()],creation_time::'undefined' | {non_neg_integer(),non_neg_integer(),non_neg_integer()},mode::'ap_mode' | 'cp_mode',upi::'undefined' | [atom() | binary()],repairing::'undefined' | [atom() | binary()],down::'undefined' | [atom() | binary()],dbg::'undefined' | [any()],dbg2::'undefined' | [any()],members_dict::'undefined' | [{_,_}]},#projection_v1{epoch_number::'undefined' | non_neg_integer(),epoch_csum::binary(),author_server::atom() | binary(),all_members::'undefined' | [atom() | binary()],witnesses::[atom() | binary()],creation_time::'undefined' | {non_neg_integer(),non_neg_integer(),non_neg_integer()},mode::'ap_mode' | 'cp_mode',upi::'undefined' | [atom() | binary()],repairing::'undefined' | [atom() | binary()],down::'undefined' | [atom() | binary()],dbg::'undefined' | [any()],dbg2::'undefined' | [any()],members_dict::'undefined' | [{_,_}]},'true'>
machi_chain_manager1.erl:2151: The pattern <_P1 = {'projection_v1', _, _, _, _, _, _, 'cp_mode', UPI1, Repairing1, _, _, _, _}, _P2 = {'projection_v1', _, _, _, _, _, _, 'cp_mode', UPI2, Repairing2, _, _, _, _}, Else = {'epoch_not_si', EpochX, 'not_gt', EpochY}> can never match the type <#projection_v1{epoch_number::'undefined' | non_neg_integer(),epoch_csum::'undefined' | binary(),author_server::atom() | binary(),all_members::'undefined' | [atom() | binary()],witnesses::[atom() | binary()],creation_time::'undefined' | {non_neg_integer(),non_neg_integer(),non_neg_integer()},mode::'ap_mode' | 'cp_mode',upi::'undefined' | [atom() | binary()],repairing::'undefined' | [atom() | binary()],down::'undefined' | [atom() | binary()],dbg::'undefined' | [any()],dbg2::'undefined' | [any()],members_dict::'undefined' | [{_,_}]},#projection_v1{epoch_number::'undefined' | non_neg_integer(),epoch_csum::binary(),author_server::atom() | binary(),all_members::'undefined' | [atom() | binary()],witnesses::[atom() | binary()],creation_time::'undefined' | {non_neg_integer(),non_neg_integer(),non_neg_integer()},mode::'ap_mode' | 'cp_mode',upi::'undefined' | [atom() | binary()],repairing::'undefined' | [atom() | binary()],down::'undefined' | [atom() | binary()],dbg::'undefined' | [any()],dbg2::'undefined' | [any()],members_dict::'undefined' | [{_,_}]},'true'>
machi_flu1.erl:246: The created fun has no local return
machi_chain_manager1.erl:2184: The pattern <_P1, P2, Else = {'expected_author2', UPI1_tail, _}> can never match the type <#projection_v1{epoch_number::'undefined' | non_neg_integer(),epoch_csum::'undefined' | binary(),author_server::atom(),chain_name::atom(),all_members::'undefined' | [atom()],witnesses::[atom()],creation_time::'undefined' | {non_neg_integer(),non_neg_integer(),non_neg_integer()},mode::'ap_mode' | 'cp_mode',upi::'undefined' | [atom()],repairing::'undefined' | [atom()],down::'undefined' | [atom()],dbg::'undefined' | [any()],dbg2::'undefined' | [any()],members_dict::'undefined' | [{_,_}]},#projection_v1{epoch_number::'undefined' | non_neg_integer(),epoch_csum::binary(),author_server::atom(),chain_name::atom(),all_members::'undefined' | [atom()],witnesses::[atom()],creation_time::'undefined' | {non_neg_integer(),non_neg_integer(),non_neg_integer()},mode::'ap_mode' | 'cp_mode',upi::'undefined' | [atom()],repairing::'undefined' | [atom()],down::'undefined' | [atom()],dbg::'undefined' | [any()],dbg2::'undefined' | [any()],members_dict::'undefined' | [{_,_}]},'true'>
machi_chain_manager1.erl:2233: The pattern <_P1 = {'projection_v1', _, _, _, _, _, _, _, 'cp_mode', UPI1, Repairing1, _, _, _, _}, _P2 = {'projection_v1', _, _, _, _, _, _, _, 'cp_mode', UPI2, Repairing2, _, _, _, _}, Else = {'epoch_not_si', EpochX, 'not_gt', EpochY}> can never match the type <#projection_v1{epoch_number::'undefined' | non_neg_integer(),epoch_csum::'undefined' | binary(),author_server::atom(),chain_name::atom(),all_members::'undefined' | [atom()],witnesses::[atom()],creation_time::'undefined' | {non_neg_integer(),non_neg_integer(),non_neg_integer()},mode::'ap_mode' | 'cp_mode',upi::'undefined' | [atom()],repairing::'undefined' | [atom()],down::'undefined' | [atom()],dbg::'undefined' | [any()],dbg2::'undefined' | [any()],members_dict::'undefined' | [{_,_}]},#projection_v1{epoch_number::'undefined' | non_neg_integer(),epoch_csum::binary(),author_server::atom(),chain_name::atom(),all_members::'undefined' | [atom()],witnesses::[atom()],creation_time::'undefined' | {non_neg_integer(),non_neg_integer(),non_neg_integer()},mode::'ap_mode' | 'cp_mode',upi::'undefined' | [atom()],repairing::'undefined' | [atom()],down::'undefined' | [atom()],dbg::'undefined' | [any()],dbg2::'undefined' | [any()],members_dict::'undefined' | [{_,_}]},'true'>

doc/README.md

@ -66,9 +66,9 @@ an introduction to the
self-management algorithm proposed for Machi. Most material has been
moved to the [high-level-chain-mgr.pdf](high-level-chain-mgr.pdf) document.
### cluster-of-clusters (directory)
### cluster (directory)
This directory contains the sketch of the "cluster of clusters" design
This directory contains the sketch of the cluster design
strawman for partitioning/distributing/sharding files across a large
number of independent Machi clusters.
number of independent Machi chains.

Binary file not shown. (Before: 7.7 KiB)

Binary file not shown. (Before: 7.7 KiB)

doc/cluster-of-clusters/name-game-sketch.org

@ -1,479 +0,0 @@
-*- mode: org; -*-
#+TITLE: Machi cluster-of-clusters "name game" sketch
#+AUTHOR: Scott
#+STARTUP: lognotedone hidestars indent showall inlineimages
#+SEQ_TODO: TODO WORKING WAITING DONE
#+COMMENT: M-x visual-line-mode
#+COMMENT: Also, disable auto-fill-mode
* 1. "Name Games" with random-slicing style consistent hashing
Our goal: to distribute lots of files very evenly across a cluster of
Machi clusters (hereafter called a "cluster of clusters" or "CoC").
* 2. Assumptions
** Basic familiarity with Machi high level design and Machi's "projection"
The [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] contains all of the basic
background assumed by the rest of this document.
** Analogy: "neighborhood : city :: Machi : cluster-of-clusters"
Analogy: The word "machi" in Japanese means small town or
neighborhood. Just as the Tokyo Metropolitan Area is built from many
machis and smaller cities, a big, partitioned file store can be built
out of many small Machi clusters.
** Familiarity with the Machi cluster-of-clusters/CoC concept
It's clear (I hope!) from
the [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] that Machi alone does not support
any kind of file partitioning/distribution/sharding across multiple
small Machi clusters. There must be another layer above a Machi cluster to
provide such partitioning services.
The name "cluster of clusters" originated within Basho to avoid
conflicting use of the word "cluster". A Machi cluster is usually
synonymous with a single Chain Replication chain and a single set of
machines (e.g. 2-5 machines). However, in the not-so-far future, we
expect much more complicated patterns of Chain Replication to be used
in real-world deployments.
"Cluster of clusters" is clunky and long, but we haven't found a good
substitute yet. If you have a good suggestion, please contact us!
~^_^~
Using the [[https://github.com/basho/machi/tree/master/prototype/demo-day-hack][cluster-of-clusters quick-and-dirty prototype]] as an
architecture sketch, let's now assume that we have ~n~ independent Machi
clusters. We assume that each of these clusters has roughly the same
chain length in the nominal case, e.g. chain length of 3.
We wish to provide partitioned/distributed file storage
across all ~n~ clusters. We call the entire collection of ~n~ Machi
clusters a "cluster of clusters", or abbreviated "CoC".
We may wish to have several types of Machi clusters, e.g. chain length
of 3 for normal data, longer for cannot-afford-data-loss files, and
shorter for don't-care-if-it-gets-lost files. Each of these types of
chains will have a name ~N~ in the CoC namespace. The role of the CoC
namespace will be demonstrated in Section 5 below.
** Continue CoC prototype's assumption: a Machi cluster is unaware of CoC
Let's continue with an assumption that an individual Machi cluster
inside of the cluster-of-clusters is completely unaware of the
cluster-of-clusters layer.
TODO: We may need to break this assumption sometime in the future?
** The reader is familiar with the random slicing technique
I'd done something very-very-nearly-identical for the Hibari database
6 years ago. But the Hibari technique was based on stuff I did at
Sendmail, Inc, so it felt like old news to me. {shrug}
The Hibari documentation has a brief photo illustration of how random
slicing works, see [[http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration][Hibari Sysadmin Guide, chain migration]]
For a comprehensive description, please see these two papers:
#+BEGIN_QUOTE
Reliable and Randomized Data Distribution Strategies for Large Scale Storage Systems
Alberto Miranda et al.
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.226.5609
(short version, HIPC'11)
Random Slicing: Efficient and Scalable Data Placement for Large-Scale
Storage Systems
Alberto Miranda et al.
DOI: http://dx.doi.org/10.1145/2632230 (long version, ACM Transactions
on Storage, Vol. 10, No. 3, Article 9, 2014)
#+END_QUOTE
** CoC locator: We borrow from random slicing but do not hash any strings!
We will use the general technique of random slicing, but we adapt the
technique to fit our use case.
In general, random slicing says:
1. Hash a string onto the unit interval [0.0, 1.0).
2. Calculate h(unit interval point, Map) -> bin, where ~Map~ partitions
   the unit interval into bins.
Our adaptation is in step 1: we do not hash any strings. Instead, we
store & use the unit interval point as-is, without using a hash
function in this step. This number is called the "CoC locator".
As described later in this doc, Machi file names are structured into
several components. One component of the file name contains the "CoC
locator"; we use the number as-is for step 2 above.
* 3. A simple illustration
We use a variation of the Random Slicing hash that we will call
~rs_hash_with_float()~. The Erlang-style function type is shown
below.
#+BEGIN_SRC erlang
%% type specs, Erlang-style
-spec rs_hash_with_float(float(), rs_hash:map()) -> rs_hash:cluster_id().
#+END_SRC
I'm borrowing an illustration from the HibariDB documentation here,
but it fits my purposes quite well. (I am the original creator of that
image, and also the use license is compatible.)
#+CAPTION: Illustration of 'Map', using four Machi clusters
[[./migration-4.png]]
Assume that we have a random slicing map called ~Map~. This particular
~Map~ maps the unit interval onto 4 Machi clusters:
| Hash range | Cluster ID |
|-------------+------------|
| 0.00 - 0.25 | Cluster1 |
| 0.25 - 0.33 | Cluster4 |
| 0.33 - 0.58 | Cluster2 |
| 0.58 - 0.66 | Cluster4 |
| 0.66 - 0.91 | Cluster3 |
| 0.91 - 1.00 | Cluster4 |
Assume that the system chooses a CoC locator of 0.05.
According to ~Map~, the value of
~rs_hash_with_float(0.05,Map) = Cluster1~.
Similarly, ~rs_hash_with_float(0.26,Map) = Cluster4~.
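A minimal sketch of ~rs_hash_with_float()~ over such a map, where the
map is assumed to be a list of ~{Start, End, ClusterID}~ ranges (this
representation, and the closed/open boundary handling, are simplifying
assumptions for illustration):

#+BEGIN_SRC erlang
%% Minimal sketch; the {Start, End, ClusterID} list representation is
%% an illustrative assumption, not Machi's actual data type.
-module(rs_sketch).
-export([rs_hash_with_float/2, demo/0]).

rs_hash_with_float(Point, [{Start, End, ClusterID} | _])
  when Point >= Start, Point < End ->
    ClusterID;
rs_hash_with_float(Point, [_ | Rest]) ->
    rs_hash_with_float(Point, Rest).

demo() ->
    Map = [{0.00, 0.25, cluster1}, {0.25, 0.33, cluster4},
           {0.33, 0.58, cluster2}, {0.58, 0.66, cluster4},
           {0.66, 0.91, cluster3}, {0.91, 1.00, cluster4}],
    cluster1 = rs_hash_with_float(0.05, Map),
    cluster4 = rs_hash_with_float(0.26, Map),
    ok.
#+END_SRC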
* 4. An additional assumption: clients will want some control over file location
We will continue to use the 4-cluster diagram from the previous
section.
** Our new assumption: client control over initial file location
The CoC management scheme may decide that files need to migrate to
other clusters. The reason could be for storage load or I/O load
balancing reasons. It could be because a cluster is being
decommissioned by its owners. There are many legitimate reasons why a
file that is initially created on cluster ID X has been moved to
cluster ID Y.
However, there are also legitimate reasons for why the client would want
control over the choice of Machi cluster when the data is first
written. The single biggest reason is load balancing. Assuming that
the client (or the CoC management layer acting on behalf of the CoC
client) knows the current utilization across the participating Machi
clusters, then it may be very helpful to send new append() requests to
under-utilized clusters.
* 5. Use of the CoC namespace: name separation plus chain type
Let us assume that the CoC framework provides several different types
of chains:
| Chain length | CoC namespace | Mode | Comment |
|--------------+---------------+------+----------------------------------|
| 3 | normal | AP | Normal storage redundancy & cost |
| 2 | reduced | AP | Reduced cost storage |
| 1 | risky | AP | Really, really cheap storage |
| 9 | paranoid | AP | Safety-critical storage |
| 3 | sequential | CP | Strong consistency |
|--------------+---------------+------+----------------------------------|
The client may want to choose the amount of redundancy that its
application requires: normal, reduced cost, or perhaps even a single
copy. The CoC namespace is used by the client to signal this
intention.
Further, the CoC administrators may wish to use the namespace to
provide separate storage for different applications. Jane's
application may use the namespace "jane-normal" and Bob's app uses
"bob-reduced". The CoC administrators may definite separate groups of
chains on separate servers to serve these two applications.
* 6. Floating point is not required ... it is merely convenient for explanation
NOTE: Use of floating point terms is not required. For example,
integer arithmetic could be used, if using a sufficiently large
interval to create an even & smooth distribution of hashes across the
expected maximum number of clusters.
For example, if the maximum CoC cluster size were 4,000 individual
Machi clusters, then a minimum of 12 bits of integer space is required
to assign one integer per Machi cluster. However, for load balancing
purposes, a finer grain of (for example) 100 integers per Machi
cluster would permit file migration to move increments of
approximately 1% of single Machi cluster's storage capacity. A
minimum of 12+7=19 bits of hash space would be necessary to accommodate
these constraints.
It is likely that Machi's final implementation will choose a 24 bit
integer to represent the CoC locator.
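A sketch of the same idea using integers, assuming a 24-bit locator
space and the same ~{Start, End, ClusterID}~ range representation as
the float sketch above (both are illustrative assumptions):

#+BEGIN_SRC erlang
%% Sketch: integer locators in a 24-bit space instead of floats.
%% Ranges are {Start, End, ClusterID} with End exclusive, scaled
%% from the float example in Section 3.
-define(SPACE, 16777216).  % 2^24 possible locator values

int_map() ->
    [{trunc(0.00 * ?SPACE), trunc(0.25 * ?SPACE), cluster1},
     {trunc(0.25 * ?SPACE), trunc(0.33 * ?SPACE), cluster4},
     {trunc(0.33 * ?SPACE), trunc(0.58 * ?SPACE), cluster2},
     {trunc(0.58 * ?SPACE), trunc(0.66 * ?SPACE), cluster4},
     {trunc(0.66 * ?SPACE), trunc(0.91 * ?SPACE), cluster3},
     {trunc(0.91 * ?SPACE), ?SPACE,               cluster4}].

rs_hash_with_int(L, [{S, E, C} | _]) when L >= S, L < E -> C;
rs_hash_with_int(L, [_ | Rest]) -> rs_hash_with_int(L, Rest).
#+END_SRC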
* 7. Proposal: Break the opacity of Machi file names
Machi assigns file names based on:
~ClientSuppliedPrefix ++ "^" ++ SomeOpaqueFileNameSuffix~
What if the CoC client could peek inside of the opaque file name
suffix in order to look at the CoC location information that we might
code in the filename suffix?
** The notation we use
- ~T~ = the target CoC member/Cluster ID chosen by the CoC client at the time of ~append()~
- ~p~ = file prefix, chosen by the CoC client.
- ~L~ = the CoC locator
- ~N~ = the CoC namespace
- ~u~ = the Machi file server unique opaque file name suffix, e.g. a GUID string
- ~F~ = a Machi file name, i.e., ~p^L^N^u~
** The details: CoC file write
1. CoC client chooses ~p~, ~T~, and ~N~ (i.e., the file prefix, target
cluster, and target cluster namespace)
2. CoC client knows the CoC ~Map~ for namespace ~N~.
3. CoC client chooses some CoC locator value ~L~ such that
~rs_hash_with_float(L,Map) = T~ (see below).
4. CoC client sends its request to cluster
~T~: ~append_chunk(p,L,N,...) -> {ok,p^L^N^u,ByteOffset}~
5. CoC stores/uses the file name ~F = p^L^N^u~.
** The details: CoC file read
1. CoC client knows the file name ~F~ and parses it to find
the values of ~L~ and ~N~ (recall, ~F = p^L^N^u~).
2. CoC client knows the CoC ~Map~ for type ~N~.
3. CoC client calculates ~rs_hash_with_float(L,Map) = T~
4. CoC client sends request to cluster ~T~: ~read_chunk(F,...) ->~ ... success!
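For illustration only, a sketch of composing and parsing ~F = p^L^N^u~,
assuming every component is a string that never contains the "^"
character:
#+BEGIN_SRC erlang
%% Compose a Machi file name from its four components.
make_file_name(P, L, N, U) ->
    string:join([P, L, N, U], "^").

%% Parse a Machi file name back into its components.
parse_file_name(F) ->
    [P, L, N, U] = string:tokens(F, "^"),
    {P, L, N, U}.
#+END_SRC
For example, ~parse_file_name("myprefix^0.25^reduced^XYZ")~ returns
~{"myprefix","0.25","reduced","XYZ"}~.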
** The details: calculating 'L' (the CoC locator) to match a desired target cluster
1. We know ~Map~, the current CoC mapping for a CoC namespace ~N~.
2. We look inside of ~Map~, and we find all of the unit interval ranges
that map to our desired target cluster ~T~. Let's call this list
~MapList = [Range1=(start,end],Range2=(start,end],...]~.
3. In our example, ~T=Cluster2~. The example ~Map~ contains a single
unit interval range for ~Cluster2~, ~[(0.33,0.58]]~.
4. Choose a uniformly random number ~r~ on the unit interval.
5. Calculate locator ~L~ by mapping ~r~ onto the concatenation
of the CoC hash space range intervals in ~MapList~. For example,
if ~r=0.5~, then ~L = 0.33 + 0.5*(0.58-0.33) = 0.455~, which is
exactly in the middle of the ~(0.33,0.58]~ interval.
** A bit more about the CoC locator's meaning and use
- If two files were written using exactly the same CoC locator and the
same CoC namespace, then the client is indicating that it wishes
that the two files be stored in the same chain.
- If two files have a different CoC locator, then the client has
absolutely no expectation of where the two files will be stored
relative to each other.
Given the items above, some consequences are:
- If the client doesn't care about CoC placement, then picking a
random number is fine. Always choosing a different locator ~L~ for
each append will scatter data across the CoC as widely as possible.
- If the client believes that some physical locality is good, then the
client should reuse the same locator ~L~ for a batch of appends to
the same prefix ~p~ and namespace ~N~. We have no recommendations
for the batch size, yet; perhaps 10-1,000 might be a good start for
experiments?
When the client chooses CoC namespace ~N~ and CoC locator ~L~ (using
the random number or target cluster technique), the client uses ~N~'s CoC
map to find the CoC target cluster, ~T~. The client has also chosen
the file prefix ~p~. The append op sent to cluster ~T~ would look
like:
~append_chunk(N="reduced",L=0.25,p="myprefix",<<900-data-bytes>>,<<checksum>>,...)~
A successful result would yield a chunk position:
~{offset=883293,size=900,file="myprefix^reduced^0.25^OpaqueSuffix"}~
** A bit more about the CoC namespace's meaning and use
- The CoC framework will provide means of creating and managing
chains of different types, e.g., chain length, consistency mode.
- The CoC framework will manage the mapping of CoC namespace names to
the chains in the system.
- The CoC framework will provide a query service to map a CoC
namespace name to a CoC map,
e.g. ~coc_latest_map("reduced") -> Map{generation=7,...}~.
For use by Riak CS, for example, we'd likely start with the following
namespaces ... working our way down the list as we add new features
and/or re-implement existing CS features.
- "standard" = Chain length = 3, eventually consistency mode
- "reduced" = Chain length = 2, eventually consistency mode.
- "stanchion7" = Chain length = 7, strong consistency mode. Perhaps
use this namespace for the metadata required to re-implement the
operations that are performed by today's Stanchion application.
* 8. File migration (a.k.a. rebalancing/repartitioning/resharding/redistribution)
** What is "migration"?
This section describes Machi's file migration. Other storage systems
call this process "rebalancing", "repartitioning", "resharding" or
"redistribution".
For Riak Core applications, it is called "handoff" and "ring resizing"
(depending on the context).
See also the [[http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsUserGuide.html#Balancer][Hadoop file balancer]] for another example of a data
migration process.
As discussed in section 4, the client can have good reason for wanting
to have some control of the initial location of the file within the
cluster. However, the cluster manager has an ongoing interest in
balancing resources throughout the lifetime of the file. Disks will
get full, hardware will change, read workload will fluctuate,
etc etc.
This document uses the word "migration" to describe moving data from
one Machi chain to another within a CoC system.
A simple variation of the Random Slicing hash algorithm can easily
accommodate Machi's need to migrate files without interfering with
availability. Machi's migration task is much simpler due to the
immutable nature of Machi file data.
** Change to Random Slicing
The map used by the Random Slicing hash algorithm needs a few simple
changes to make file migration straightforward.
- Add a "generation number", a strictly increasing number (similar to
a Machi cluster's "epoch number") that reflects the history of
changes made to the Random Slicing map
- Use a list of Random Slicing maps instead of a single map: one map
for each map generation out of which files may not yet have been
migrated.
As an example:
#+CAPTION: Illustration of 'Map', using four Machi clusters
[[./migration-3to4.png]]
And the new Random Slicing map for some CoC namespace ~N~ might look
like this:
| Generation number / Namespace | 7 / reduced |
|-------------------------------+-------------|
| SubMap | 1 |
|-------------------------------+-------------|
| Hash range | Cluster ID |
|-------------------------------+-------------|
| 0.00 - 0.33 | Cluster1 |
| 0.33 - 0.66 | Cluster2 |
| 0.66 - 1.00 | Cluster3 |
|-------------------------------+-------------|
| SubMap | 2 |
|-------------------------------+-------------|
| Hash range | Cluster ID |
|-------------------------------+-------------|
| 0.00 - 0.25 | Cluster1 |
| 0.25 - 0.33 | Cluster4 |
| 0.33 - 0.58 | Cluster2 |
| 0.58 - 0.66 | Cluster4 |
| 0.66 - 0.91 | Cluster3 |
| 0.91 - 1.00 | Cluster4 |
When a new Random Slicing map contains a single submap, then its use
is identical to the original Random Slicing algorithm. If the map
contains multiple submaps, then the access rules change a bit:
- Write operations always go to the newest/largest submap.
- Read operations attempt to read from all unique submaps.
- Skip searching submaps that refer to the same cluster ID.
- In this example, unit interval value 0.10 is mapped to Cluster1
by both submaps.
- Read from newest/largest submap to oldest/smallest submap.
- If not found in any submap, search a second time (to handle races
with file copying between submaps).
- If the requested data is found, optionally copy it directly to the
newest submap. (This is a variation of read repair (RR). RR here
accelerates the migration process and can reduce the number of
operations required to query servers in multiple submaps).
The cluster-of-clusters manager is responsible for:
- Managing the various generations of the CoC Random Slicing maps for
all namespaces.
- Distributing namespace maps to CoC clients.
- Managing the processes that are responsible for copying "cold" data,
i.e., file data that is not regularly accessed, to its new submap
location.
- When migration of a file to its new cluster is confirmed successful,
delete it from the old cluster.
In example map #7, the CoC manager will copy files with unit interval
assignments in ~(0.25,0.33]~, ~(0.58,0.66]~, and ~(0.91,1.00]~ from their
old locations in cluster IDs Cluster1/2/3 to their new cluster,
Cluster4. When the CoC manager is satisfied that all such files have
been copied to Cluster4, then the CoC manager can create and
distribute a new map, such as:
| Generation number / Namespace | 8 / reduced |
|-------------------------------+-------------|
| SubMap | 1 |
|-------------------------------+-------------|
| Hash range | Cluster ID |
|-------------------------------+-------------|
| 0.00 - 0.25 | Cluster1 |
| 0.25 - 0.33 | Cluster4 |
| 0.33 - 0.58 | Cluster2 |
| 0.58 - 0.66 | Cluster4 |
| 0.66 - 0.91 | Cluster3 |
| 0.91 - 1.00 | Cluster4 |
The HibariDB system performs data migrations in almost exactly this
manner. However, one important limitation of HibariDB is not being
able to perform more than one migration at a time. HibariDB's data is
mutable. Mutation causes many problems when migrating data across two
submaps; three or more submaps was too complex to implement quickly
and correctly.
Fortunately for Machi, its file data is immutable, and therefore Machi
can easily manage many migrations in parallel, i.e., its submap list may
be several maps long, each one for an in-progress file migration.
* 9. Other considerations for FLU/sequencer implementations
** Append to existing file when possible
In the earliest Machi FLU implementation, it was impossible to append
to the same file after ~30 seconds. For example:
- Client: ~append(prefix="foo",...) -> {ok,"foo^suffix1",Offset1}~
- Client: ~append(prefix="foo",...) -> {ok,"foo^suffix1",Offset2}~
- Client: ~append(prefix="foo",...) -> {ok,"foo^suffix1",Offset3}~
- Client: sleep 40 seconds
- Server: after 30 seconds idle time, stop Erlang server process for
the ~"foo^suffix1"~ file
- Client: ...wakes up...
- Client: ~append(prefix="foo",...) -> {ok,"foo^suffix2",Offset4}~
Our ideal append behavior is to always append to the same file. Why?
It would be nice if Machi didn't create zillions of tiny files if the
client appends to some prefix very infrequently. In general, it is
better to create fewer & bigger files by re-using a Machi file name
when possible.
The sequencer should always assign new offsets to the latest/newest
file for any prefix, as long as all prerequisites are true:
- The epoch has not changed. (In AP mode, epoch change -> mandatory file name suffix change.)
- The latest file for prefix ~p~ is smaller than maximum file size for a FLU's configuration.
* 10. Acknowledgments
The source for the "migration-4.png" and "migration-3to4.png" images
comes from the [[http://hibari.github.io/hibari-doc/images/migration-3to4.png][HibariDB documentation]].

@@ -88,16 +88,16 @@ Single
4 0 0 50 -1 2 14 0.0000 4 180 495 4425 3525 ~8%\001
4 0 0 50 -1 2 14 0.0000 4 240 1710 5025 3525 ~25% total keys\001
4 0 0 50 -1 2 14 0.0000 4 180 495 6825 3525 ~8%\001
4 0 0 50 -1 2 24 0.0000 4 270 1485 600 600 Cluster1\001
4 0 0 50 -1 2 24 0.0000 4 270 1485 3000 600 Cluster2\001
4 0 0 50 -1 2 24 0.0000 4 270 1485 5400 600 Cluster3\001
4 0 0 50 -1 2 24 0.0000 4 270 1485 300 2850 Cluster1\001
4 0 0 50 -1 2 24 0.0000 4 270 1485 2700 2850 Cluster2\001
4 0 0 50 -1 2 24 0.0000 4 270 1485 5175 2850 Cluster3\001
4 0 0 50 -1 2 24 0.0000 4 270 405 2100 2625 Cl\001
4 0 0 50 -1 2 24 0.0000 4 270 405 6900 2625 Cl\001
4 0 0 50 -1 2 24 0.0000 4 270 195 2175 3075 4\001
4 0 0 50 -1 2 24 0.0000 4 270 195 4575 3075 4\001
4 0 0 50 -1 2 24 0.0000 4 270 195 6975 3075 4\001
4 0 0 50 -1 2 24 0.0000 4 270 405 4500 2625 Cl\001
4 0 0 50 -1 2 18 0.0000 4 240 3990 1200 4875 CoC locator, on the unit interval\001
4 0 0 50 -1 2 24 0.0000 4 270 1245 600 600 Chain1\001
4 0 0 50 -1 2 24 0.0000 4 270 1245 3000 600 Chain2\001
4 0 0 50 -1 2 24 0.0000 4 270 1245 5400 600 Chain3\001
4 0 0 50 -1 2 24 0.0000 4 270 285 2100 2625 C\001
4 0 0 50 -1 2 24 0.0000 4 270 285 4500 2625 C\001
4 0 0 50 -1 2 24 0.0000 4 270 285 6900 2625 C\001
4 0 0 50 -1 2 24 0.0000 4 270 1245 525 2850 Chain1\001
4 0 0 50 -1 2 24 0.0000 4 270 1245 2925 2850 Chain2\001
4 0 0 50 -1 2 24 0.0000 4 270 1245 5325 2850 Chain3\001
4 0 0 50 -1 2 18 0.0000 4 240 4350 1350 4875 Cluster locator, on the unit interval\001

(new binary file, 7.6 KiB, not shown)

doc/cluster/migration-4.png (new binary file, 7.4 KiB, not shown)

@@ -0,0 +1,481 @@
-*- mode: org; -*-
#+TITLE: Machi cluster "name game" sketch
#+AUTHOR: Scott
#+STARTUP: lognotedone hidestars indent showall inlineimages
#+SEQ_TODO: TODO WORKING WAITING DONE
#+COMMENT: M-x visual-line-mode
#+COMMENT: Also, disable auto-fill-mode
* 1. "Name Games" with random-slicing style consistent hashing
Our goal: to distribute lots of files very evenly across a large
collection of individual, small Machi chains.
* 2. Assumptions
** Basic familiarity with Machi high level design and Machi's "projection"
The [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] contains all of the basic
background assumed by the rest of this document.
** Analogy: "neighborhood : city :: Machi chain : Machi cluster"
Analogy: The word "machi" in Japanese means small town or
neighborhood. Just as the Tokyo Metropolitan Area is built from many
machis and smaller cities, a big, partitioned file store can be built
out of many small Machi chains.
** Familiarity with the Machi chain concept
It's clear (I hope!) from
the [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] that Machi alone does not support
any kind of file partitioning/distribution/sharding across multiple
small Machi chains. There must be another layer above a Machi chain to
provide such partitioning services.
Using the [[https://github.com/basho/machi/tree/master/prototype/demo-day-hack][cluster quick-and-dirty prototype]] as an
architecture sketch, let's now assume that we have ~n~ independent Machi
chains. We assume that each of these chains has the same
chain length in the nominal case, e.g. chain length of 3.
We wish to provide partitioned/distributed file storage
across all ~n~ chains. We call the entire collection of ~n~ Machi
chains a "cluster".
We may wish to have several types of Machi clusters. For example:
+ Chain length of 1 for "don't care if it gets lost,
store stuff very very cheaply" data.
+ Chain length of 2 for normal data.
+ Equivalent to quorum replication's reliability with 3 copies.
+ Chain length of 7 for critical, unreplaceable data.
+ Equivalent to quorum replication's reliability with 13 copies.
Each of these types of chains will have a name ~N~ in the
namespace. The role of the cluster namespace will be demonstrated in
Section 3 below.
** Continue an early assumption: a Machi chain is unaware of clustering
Let's continue with an assumption that an individual Machi chain
inside of a cluster is completely unaware of the cluster layer.
** The reader is familiar with the random slicing technique
I'd done something very-very-nearly-like-this for the Hibari database
6 years ago. But the Hibari technique was based on stuff I did at
Sendmail, Inc, in 2000, so this technique feels like old news to me.
{shrug}
The following section provides an illustrated example.
Very quickly, the random slicing algorithm is:
1. Hash a string onto the unit interval [0.0, 1.0).
2. Calculate h(unit interval point, Map) -> bin, where ~Map~ divides
the unit interval into bins (or partitions or shards).
Machi's adaptation is in step 1: we do not hash any strings. Instead, we
simply choose a number on the unit interval. This number is called
the "cluster locator number".
As described later in this doc, Machi file names are structured into
several components. One component of the file name contains the cluster
locator number; we use the number as-is for step 2 above.
*** For more information about Random Slicing
For a comprehensive description of random slicing, please see the
first two papers. For a quicker summary, please see the third
reference.
#+BEGIN_QUOTE
Reliable and Randomized Data Distribution Strategies for Large Scale Storage Systems
Alberto Miranda et al.
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.226.5609
(short version, HIPC'11)
Random Slicing: Efficient and Scalable Data Placement for Large-Scale
Storage Systems
Alberto Miranda et al.
DOI: http://dx.doi.org/10.1145/2632230 (long version, ACM Transactions
on Storage, Vol. 10, No. 3, Article 9, 2014)
[[http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration][Hibari Sysadmin Guide, chain migration section]].
http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration
#+END_QUOTE
* 3. A simple illustration
We use a variation of the Random Slicing hash that we will call
~rs_hash_with_float()~. The Erlang-style function type is shown
below.
#+BEGIN_SRC erlang
%% type specs, Erlang-style
-spec rs_hash_with_float(float(), rs_hash:map()) -> rs_hash:chain_id().
#+END_SRC
I'm borrowing an illustration from the HibariDB documentation here,
but it fits my purposes quite well. (I am the original creator of that
image, and also the use license is compatible.)
#+CAPTION: Illustration of 'Map', using four Machi chains
[[./migration-4.png]]
Assume that we have a random slicing map called ~Map~. This particular
~Map~ maps the unit interval onto 4 Machi chains:
| Hash range | Chain ID |
|-------------+----------|
| 0.00 - 0.25 | Chain1 |
| 0.25 - 0.33 | Chain4 |
| 0.33 - 0.58 | Chain2 |
| 0.58 - 0.66 | Chain4 |
| 0.66 - 0.91 | Chain3 |
| 0.91 - 1.00 | Chain4 |
Assume that the system chooses a cluster locator of 0.05.
According to ~Map~, the value of
~rs_hash_with_float(0.05,Map) = Chain1~.
Similarly, ~rs_hash_with_float(0.26,Map) = Chain4~.
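For concreteness, a sketch of the example ~Map~ and both lookups,
written as Erlang terms under a hypothetical list-of-ranges encoding of
~rs_hash:map()~ (chain IDs written as lowercase atoms):
#+BEGIN_SRC erlang
Map = [{0.00, 0.25, chain1}, {0.25, 0.33, chain4},
       {0.33, 0.58, chain2}, {0.58, 0.66, chain4},
       {0.66, 0.91, chain3}, {0.91, 1.00, chain4}],
chain1 = rs_hash_with_float(0.05, Map),
chain4 = rs_hash_with_float(0.26, Map).
#+END_SRC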
This example should look very similar to Hibari's technique.
The Hibari documentation has a brief photo illustration of how random
slicing works, see [[http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration][Hibari Sysadmin Guide, chain migration]].
* 4. Use of the cluster namespace: name separation plus chain type
Let us assume that the cluster framework provides several different types
of chains:
| Chain length | Namespace | Consistency Mode | Comment |
|--------------+--------------+------------------+----------------------------------|
| 3 | ~normal~ | eventual | Normal storage redundancy & cost |
| 2 | ~reduced~ | eventual | Reduced cost storage |
| 1 | ~risky~ | eventual | Really, really cheap storage |
| 7 | ~paranoid~ | eventual | Safety-critical storage |
| 3 | ~sequential~ | strong | Strong consistency |
|--------------+--------------+------------------+----------------------------------|
The client may want to choose the amount of redundancy that its
application requires: normal, reduced cost, or perhaps even a single
copy. The cluster namespace is used by the client to signal this
intention.
Further, the cluster administrators may wish to use the namespace to
provide separate storage for different applications. Jane's
application may use the namespace "jane-normal" and Bob's app uses
"bob-reduced". Administrators may definine separate groups of
chains on separate servers to serve these two applications.
* 5. In its lifetime, a file may be moved to different chains
The cluster management scheme may decide that files need to migrate to
other chains -- i.e., a file that is initially created on chain ID ~X~
may later be moved to chain ID ~Y~:
+ For storage load or I/O load balancing reasons.
+ Because a chain is being decommissioned by the sysadmin.
* 6. Floating point is not required ... it is merely convenient for explanation
NOTE: Use of floating point terms is not required. For example,
integer arithmetic could be used, if using a sufficiently large
interval to create an even & smooth distribution of hashes across the
expected maximum number of chains.
For example, if the maximum cluster size were 4,000 individual
Machi chains, then a minimum of 12 bits of integer space is required
to assign one integer per Machi chain. However, for load balancing
purposes, a finer grain of (for example) 100 integers per Machi
chain would permit file migration to move increments of
approximately 1% of a single Machi chain's storage capacity. A
minimum of 12+7=19 bits of hash space would be necessary to accommodate
these constraints.
It is likely that Machi's final implementation will choose a 24 bit
integer (or perhaps 32 bits) to represent the cluster locator.
* 7. Proposal: Break the opacity of Machi file names, slightly.
Machi assigns file names based on:
~ClientSuppliedPrefix ++ "^" ++ SomeOpaqueFileNameSuffix~
What if some parts of the system could peek inside of the opaque file name
suffix in order to look at the cluster location information that we might
code in the filename suffix?
We break the system into parts that speak two levels of protocols,
"high" and "low".
+ The high level protocol is used outside of the Machi cluster
+ The low level protocol is used inside of the Machi cluster
Both protocols are based on a Protocol Buffers specification and
implementation. Other protocols, such as HTTP, will be added later.
#+BEGIN_SRC
                 +-----------------------+
                 | Machi external client |
                 | e.g. Riak CS          |
                 +-----------------------+
                             ^
                             | Machi "high" API
                             | ProtoBuffs protocol  Machi cluster boundary: outside
.........................................................................
                             |                      Machi cluster boundary: inside
                             v
    +--------------------------+    +------------------------+
    | Machi "high" API service |    | Machi HTTP API service |
    +--------------------------+    +------------------------+
                  ^                             |
                  |     +-----------------------+
                  v     v
          +------------------------+
          | Cluster bridge service |
          +------------------------+
                  ^
                  | Machi "low" API
                  | ProtoBuffs protocol
       +----------------------------------------+----+----+
       |                                        |    |    |
       v                                        v    v    v
 +-------------------------+          ... other chains...
 | Chain C1 (logical view) |
 |  +--------------+       |
 |  | FLU server 1 |       |
 |  |  +--------------+    |
 |  +--| FLU server 2 |    |
 |     +--------------+    |   In reality, API bridge talks directly
 +-------------------------+   to each FLU server in a chain.
#+END_SRC
** The notation we use
- ~N~ = the cluster namespace, chosen by the client.
- ~p~ = file prefix, chosen by the client.
- ~L~ = the cluster locator (a number, type is implementation-dependent)
- ~Map~ = a mapping of cluster locators to chains
- ~T~ = the target chain ID/name
- ~u~ = a unique opaque file name suffix, e.g. a GUID string
- ~F~ = a Machi file name, i.e., a concatenation of ~p^L^N^u~
** The details: cluster file append
0. Cluster client chooses ~N~ and ~p~ (i.e., cluster namespace and
file prefix) and sends the append request to a Machi cluster member
via the Protocol Buffers "high" API.
1. Cluster bridge chooses ~T~ (i.e., target chain), based on criteria
such as disk utilization percentage.
2. Cluster bridge knows the cluster ~Map~ for namespace ~N~.
3. Cluster bridge chooses some cluster locator value ~L~ such that
~rs_hash_with_float(L,Map) = T~ (see algorithm below).
4. Cluster bridge sends its request to chain
~T~: ~append_chunk(p,L,N,...) -> {ok,p^L^N^u,ByteOffset}~
5. Cluster bridge forwards the reply tuple to the client.
6. Client stores/uses the file name ~F = p^L^N^u~.
** The details: Cluster file read
0. Cluster client sends the read request to a Machi cluster member via
the Protocol Buffers "high" API.
1. Cluster bridge parses the file name ~F~ to find
the values of ~L~ and ~N~ (recall, ~F = p^L^N^u~).
2. Cluster bridge knows the cluster ~Map~ for namespace ~N~.
3. Cluster bridge calculates ~rs_hash_with_float(L,Map) = T~
4. Cluster bridge sends request to chain ~T~:
~read_chunk(F,...) ->~ ... reply
5. Cluster bridge forwards the reply to the client.
** The details: calculating 'L' (the cluster locator number) to match a desired target chain
1. We know ~Map~, the current cluster mapping for a cluster namespace ~N~.
2. We look inside of ~Map~, and we find all of the unit interval ranges
that map to our desired target chain ~T~. Let's call this list
~MapList = [Range1=(start,end],Range2=(start,end],...]~.
3. In our example, ~T=Chain2~. The example ~Map~ contains a single
unit interval range for ~Chain2~, ~[(0.33,0.58]]~.
4. Choose a uniformly random number ~r~ on the unit interval.
5. Calculate the cluster locator ~L~ by mapping ~r~ onto the concatenation
of the cluster hash space range intervals in ~MapList~. For example,
if ~r=0.5~, then ~L = 0.33 + 0.5*(0.58-0.33) = 0.455~, which is
exactly in the middle of the ~(0.33,0.58]~ interval.
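A sketch of steps 4-5's interval arithmetic, assuming ~MapList~ is a
list of ~{Start, End}~ tuples (a hypothetical representation):
#+BEGIN_SRC erlang
%% Map a uniform random R in [0.0, 1.0) onto the concatenation of
%% the target chain's hash-range intervals.
locator_for_target(R, MapList) ->
    Total = lists:sum([End - Start || {Start, End} <- MapList]),
    walk(R * Total, MapList).

walk(Offset, [{Start, End} | Rest]) ->
    Len = End - Start,
    case Offset < Len of
        true  -> Start + Offset;
        false -> walk(Offset - Len, Rest)
    end.
#+END_SRC
With ~r=0.5~ and ~MapList=[{0.33,0.58}]~, this returns ~0.455~,
matching the example above.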
** A bit more about the cluster namespace's meaning and use
For use by Riak CS, for example, we'd likely start with the following
namespaces ... working our way down the list as we add new features
and/or re-implement existing CS features.
- "standard" = Chain length = 3, eventually consistency mode
- "reduced" = Chain length = 2, eventually consistency mode.
- "stanchion7" = Chain length = 7, strong consistency mode. Perhaps
use this namespace for the metadata required to re-implement the
operations that are performed by today's Stanchion application.
We want the cluster framework to:
- provide means of creating and managing
chains of different types, e.g., chain length, consistency mode.
- manage the mapping of cluster namespace
names to the chains in the system.
- provide query functions to map a cluster
namespace name to a cluster map,
e.g. ~get_cluster_latest_map("reduced") -> Map{generation=7,...}~.
* 8. File migration (a.k.a. rebalancing/repartitioning/resharding/redistribution)
** What is "migration"?
This section describes Machi's file migration. Other storage systems
call this process "rebalancing", "repartitioning", "resharding" or
"redistribution".
For Riak Core applications, it is called "handoff" and "ring resizing"
(depending on the context).
See also the [[http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsUserGuide.html#Balancer][Hadoop file balancer]] for another example of a data
migration process.
As discussed in section 5, the client can have good reason for wanting
to have some control of the initial location of the file within the
chain. However, the chain manager has an ongoing interest in
balancing resources throughout the lifetime of the file. Disks will
get full, hardware will change, read workload will fluctuate,
etc etc.
This document uses the word "migration" to describe moving data from
one Machi chain to another chain within a cluster system.
A simple variation of the Random Slicing hash algorithm can easily
accommodate Machi's need to migrate files without interfering with
availability. Machi's migration task is much simpler due to the
immutable nature of Machi file data.
** Change to Random Slicing
The map used by the Random Slicing hash algorithm needs a few simple
changes to make file migration straightforward.
- Add a "generation number", a strictly increasing number (similar to
a Machi chain's "epoch number") that reflects the history of
changes made to the Random Slicing map
- Use a list of Random Slicing maps instead of a single map: one map
for each map generation out of which files may not yet have been
migrated.
As an example:
#+CAPTION: Illustration of 'Map', using four Machi chains
[[./migration-3to4.png]]
And the new Random Slicing map for some cluster namespace ~N~ might look
like this:
| Generation number / Namespace | 7 / reduced |
|-------------------------------+-------------|
| SubMap | 1 |
|-------------------------------+-------------|
| Hash range | Chain ID |
|-------------------------------+-------------|
| 0.00 - 0.33 | Chain1 |
| 0.33 - 0.66 | Chain2 |
| 0.66 - 1.00 | Chain3 |
|-------------------------------+-------------|
| SubMap | 2 |
|-------------------------------+-------------|
| Hash range | Chain ID |
|-------------------------------+-------------|
| 0.00 - 0.25 | Chain1 |
| 0.25 - 0.33 | Chain4 |
| 0.33 - 0.58 | Chain2 |
| 0.58 - 0.66 | Chain4 |
| 0.66 - 0.91 | Chain3 |
| 0.91 - 1.00 | Chain4 |
When a new Random Slicing map contains a single submap, then its use
is identical to the original Random Slicing algorithm. If the map
contains multiple submaps, then the access rules change a bit:
- Write operations always go to the newest/largest submap.
- Read operations attempt to read from all unique submaps.
- Skip searching submaps that refer to the same chain ID.
- In this example, unit interval value 0.10 is mapped to Chain1
by both submaps.
- Read from newest/largest submap to oldest/smallest submap.
- If not found in any submap, search a second time (to handle races
with file copying between submaps).
- If the requested data is found, optionally copy it directly to the
newest submap. (This is a variation of read repair (RR). RR here
accelerates the migration process and can reduce the number of
operations required to query servers in multiple submaps).
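A sketch of that multi-submap read path, assuming submaps are ordered
newest first and a hypothetical ~read_chunk/2~ that returns
~{ok, Data}~ or ~not_found~:
#+BEGIN_SRC erlang
%% Try each unique chain, newest submap first.  A real implementation
%% would also make the second pass described above, to handle races
%% with in-flight file copying.
read_across_submaps(F, L, SubMaps) ->
    Chains = dedup([rs_hash_with_float(L, M) || M <- SubMaps]),
    try_chains(F, Chains).

try_chains(_F, []) ->
    not_found;
try_chains(F, [Chain | Rest]) ->
    case read_chunk(Chain, F) of
        {ok, Data} -> {ok, Data};
        not_found  -> try_chains(F, Rest)
    end.

%% Remove duplicate chain IDs while preserving newest-first order.
dedup([])      -> [];
dedup([H | T]) -> [H | dedup([X || X <- T, X =/= H])].
#+END_SRC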
The cluster manager is responsible for:
- Managing the various generations of the cluster Random Slicing maps for
all namespaces.
- Distributing namespace maps to cluster bridges.
- Managing the processes that are responsible for copying "cold" data,
i.e., file data that is not regularly accessed, to its new submap
location.
- When migration of a file to its new chain is confirmed successful,
delete it from the old chain.
In example map #7, the cluster manager will copy files with unit interval
assignments in ~(0.25,0.33]~, ~(0.58,0.66]~, and ~(0.91,1.00]~ from their
old locations in chain IDs Chain1/2/3 to their new chain,
Chain4. When the cluster manager is satisfied that all such files have
been copied to Chain4, then the cluster manager can create and
distribute a new map, such as:
| Generation number / Namespace | 8 / reduced |
|-------------------------------+-------------|
| SubMap | 1 |
|-------------------------------+-------------|
| Hash range | Chain ID |
|-------------------------------+-------------|
| 0.00 - 0.25 | Chain1 |
| 0.25 - 0.33 | Chain4 |
| 0.33 - 0.58 | Chain2 |
| 0.58 - 0.66 | Chain4 |
| 0.66 - 0.91 | Chain3 |
| 0.91 - 1.00 | Chain4 |
The HibariDB system performs data migrations in almost exactly this
manner. However, one important
limitation of HibariDB is not being able to
perform more than one migration at a time. HibariDB's data is
mutable. Mutation causes many problems when migrating data
across two submaps; three or more submaps was too complex to implement
quickly and correctly.
Fortunately for Machi, its file data is immutable, and therefore Machi
can easily manage many migrations in parallel, i.e., its submap list may
be several maps long, each one for an in-progress file migration.
* 9. Other considerations for FLU/sequencer implementations
** Append to existing file when possible
The sequencer should always assign new offsets to the latest/newest
file for any prefix, as long as all prerequisites are true:
- The epoch has not changed. (In AP mode, epoch change -> mandatory
file name suffix change.)
- The cluster locator number is stable.
- The latest file for prefix ~p~ is smaller than maximum file size for
a FLU's configuration.
The stability of the cluster locator number is an implementation detail that
must be managed by the cluster bridge.
Reuse of the same file is not possible if the bridge always chooses a
different cluster locator number ~L~ or if the client always uses a unique
file prefix ~p~. The latter is a sign of a misbehaved client; the
former is a sign of a poorly implemented bridge.
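As a sketch, with every helper function name below being hypothetical,
the sequencer's decision might look like:
#+BEGIN_SRC erlang
%% Reuse the latest file for prefix P only while all prerequisites
%% hold; otherwise roll over to a brand new file.  All helper
%% functions here are hypothetical.
assign_file(P, LatestFile, State) ->
    Reusable = same_epoch(State)
               andalso same_locator(LatestFile, State)
               andalso file_size(LatestFile) < max_file_size(State),
    case Reusable of
        true  -> LatestFile;
        false -> new_file_for_prefix(P, State)
    end.
#+END_SRC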
* 10. Acknowledgments
The original source for the "migration-4.png" and "migration-3to4.png" images
comes from the [[http://hibari.github.io/hibari-doc/images/migration-3to4.png][HibariDB documentation]].

doc/dev-clone-compile.md

@@ -0,0 +1,30 @@
# Clone and compile Machi
Clone the Machi source repo and compile the source and test code. Run
the following commands at your login shell:

    cd /tmp
    git clone https://github.com/basho/machi.git
    cd machi
    git checkout master
    make     # or 'gmake' if GNU make uses an alternate name

Then run the unit test suite. This may take up to two minutes or so
to finish.

    make test

At the end, the test suite should report that all tests passed. The
actual number of tests shown in the "All `X` tests passed" line may be
different than the example below.

    [... many lines omitted ...]
    module 'event_logger'
    module 'chain_mgr_legacy'
    =======================================================
    All 90 tests passed.

If you had a test failure, a likely cause may be a limit on the number
of file descriptors available to your user process. (Recent releases
of OS X have a limit of 1024 file descriptors, which may be too low.)
The output of the `limit -n` command will tell you your file
descriptor limit.

doc/dev-prerequisites.md

@@ -0,0 +1,38 @@
## Machi developer environment prerequisites
1. Machi requires a 64-bit variant of UNIX: an OS X, FreeBSD, Linux, or
Solaris machine that is a standard developer environment for C and C++
applications (64-bit versions).
2. You'll need the `git` source management utility.
3. You'll need the 64-bit Erlang/OTP 17 runtime environment. Please
don't use earlier or later versions until we have a chance to fix
the compilation warnings that versions R16B and 18 will trigger.
Also, please verify that you are not using a 32-bit Erlang/OTP
runtime package.
For `git` and the Erlang runtime, please use your OS-specific
package manager to install these. If your package manager doesn't
have 64-bit Erlang/OTP version 17 available, then we recommend using the
[precompiled packages available at Erlang Solutions](https://www.erlang-solutions.com/resources/download.html).
Also, please verify that you have enough file descriptors available to
your user processes. The output of `ulimit -n` should report at least
4,000 file descriptors available. If your limit is lower (a frequent
problem for OS X users), please increase it to at least 4,000.
# Using Vagrant to set up a developer environment for Machi
The Machi source directory contains a `Vagrantfile` for creating an
Ubuntu Linux-based virtual machine for compiling and running Machi.
This file is in the
[$SRC_TOP/priv/humming-consensus-demo.vagrant](../priv/humming-consensus-demo.vagrant)
directory.
If used as-is, the virtual machine specification is modest.
* 1 virtual CPU
* 512MB virtual memory
* 768MB swap space
* 79GB sparse virtual disk image. After installing prerequisites and
compiling Machi, the root file system uses approximately 2.7 GBytes.

@@ -0,0 +1,617 @@
FLU and Chain Life Cycle Management -*- mode: org; -*-
#+STARTUP: lognotedone hidestars indent showall inlineimages
#+COMMENT: To generate the outline section: egrep '^\*[*]* ' doc/flu-and-chain-lifecycle.org | egrep -v '^\* Outline' | sed -e 's/^\*\*\* / + /' -e 's/^\*\* / + /' -e 's/^\* /+ /'
* FLU and Chain Life Cycle Management
In an ideal world, we (the Machi development team) would have a full
vision of how Machi would be managed, down to the last detail of
beautiful CLI character and network protocol bit. Our vision isn't
complete yet, so we are working one small step at a time.
* Outline
+ FLU and Chain Life Cycle Management
+ Terminology review
+ Terminology: Machi run-time components/services/thingies
+ Terminology: Machi chain data structures
+ Terminology: Machi cluster data structures
+ Overview of administrative life cycles
+ Cluster administrative life cycle
+ Chain administrative life cycle
+ FLU server administrative life cycle
+ Quick admin: declarative management of Machi FLU and chain life cycles
+ Quick admin uses the "rc.d" config scheme for life cycle management
+ Quick admin's declarative "language": an Erlang-flavored AST
+ Term 'host': define a new host for FLU services
+ Term 'flu': define a new FLU
+ Term 'chain': define or reconfigure a chain
+ Executing quick admin AST files via the 'machi-admin' utility
+ Checking the syntax of an AST file
+ Executing an AST file
+ Using quick admin to manage multiple machines
+ The "rc.d" style configuration file scheme
+ Riak had a similar configuration file editing problem (and its solution)
+ Machi's "rc.d" file scheme.
+ FLU life cycle management using "rc.d" style files
+ The key configuration components of a FLU
+ Chain life cycle management using "rc.d" style files
+ The key configuration components of a chain
* Terminology review
** Terminology: Machi run-time components/services/thingies
+ FLU: a basic Machi server, responsible for managing a collection of
files.
+ Chain: a small collection of FLUs that maintain replicas of the same
collection of files. A chain is usually small, 1-3 servers, where
more than 3 would be used only in cases when availability of
certain data is critical despite failures of several machines.
+ The length of a chain is directly proportional to its
replication factor, e.g., a chain length=3 will maintain
(nominally) 3 replicas of each file.
+ To maintain file availability when ~F~ failures have occurred, a
chain must be at least ~F+1~ members long. (In comparison, the
quorum replication technique requires ~2F+1~ members in the
general case.)
+ Cluster: A collection of Machi chains that are used to store files
in a horizontally partitioned/sharded/distributed manner.
** Terminology: Machi chain data structures
+ Projection: used to define a single chain: the chain's consistency
mode (strong or eventual consistency), all members (from an
administrative point of view), all active members (from a runtime,
automatically-managed point of view), repairing/file-syncing members
(also runtime, auto-managed), and so on
+ Epoch: A version number of a projection. The epoch number is used
by both clients & servers to manage transitions from one projection
to another, e.g., when the chain is temporarily shortened by the
failure of a member FLU server.
** Terminology: Machi cluster data structures
+ Namespace: A collection of human-friendly names that are mapped to
groups of Machi chains that provide the same type of storage
service: consistency mode, replication policy, etc.
+ A single namespace name, e.g. ~normal-ec~, is paired with a single
cluster map (see below).
+ Example: ~normal-ec~ might be a collection of Machi chains in
eventually-consistent mode that are of length=3.
+ Example: ~risky-ec~ might be a collection of Machi chains in
eventually-consistent mode that are of length=1.
+ Example: ~mgmt-critical~ might be a collection of Machi chains in
strongly-consistent mode that are of length=7.
+ Cluster map: Encodes the rules which partition/shard/distribute
the files stored in a particular namespace across a group of chains
that collectively store the namespace's files.
+ Chain weight: A value assigned to each chain within a cluster map
structure that defines the relative storage capacity of a chain
within the namespace. For example, a chain weight=150 has 50% more
capacity than a chain weight=100; see the sketch after this list.
+ Cluster map epoch: The version number assigned to a cluster map.
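For example, here is a sketch (with hypothetical function and chain
names) of how chain weights might be normalized into the unit-interval
ranges of a cluster map:
#+BEGIN_SRC
%% Turn [{Chain, Weight}] into cumulative unit-interval ranges.
weights_to_ranges(Weighted) ->
    Total = lists:sum([W || {_Chain, W} <- Weighted]),
    ranges(Weighted, 0.0, Total).

ranges([], _Acc, _Total) ->
    [];
ranges([{Chain, W} | Rest], Acc, Total) ->
    End = Acc + W / Total,
    [{Chain, Acc, End} | ranges(Rest, End, Total)].

%% weights_to_ranges([{c1,100}, {c2,100}, {c3,150}]) gives c3 a
%% range that is 50% wider than c1's or c2's.
#+END_SRC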
* Overview of administrative life cycles
** Cluster administrative life cycle
+ Cluster is first created
+ Adds namespaces (e.g. consistency policy + chain length policy) to
the cluster
+ Chains are added to/removed from a namespace to increase/decrease the
namespace's storage capacity.
+ Adjust chain weights within a namespace, e.g., to shift files
within the namespace to chains with greater storage capacity
resources and/or runtime I/O resources.
A cluster "file migration" is the process of moving files from one
namespace member chain to another for purposes of shifting &
re-balancing storage capacity and/or runtime I/O capacity.
** Chain administrative life cycle
+ A chain is created with an initial FLU membership list.
+ Chain may be administratively modified zero or more times to
add/remove member FLU servers.
+ A chain may be decommissioned.
See also: http://basho.github.io/machi/edoc/machi_lifecycle_mgr.html
** FLU server administrative life cycle
+ A FLU is created after its runtime location is selected by the
administrator: which machine/virtual machine, IP address and TCP
port allocation, etc.
+ An unassigned FLU may be added to a chain by chain administrative
policy.
+ A FLU that is assigned to a chain may be removed from that chain by
chain administrative policy.
+ In the current implementation, the FLU's Erlang processes will be
halted. Then the FLU's data and metadata files will be moved to
another area of the disk for safekeeping. Later, a "garbage
collection" process can be used for reclaiming disk space used by
halted FLU servers.
See also: http://basho.github.io/machi/edoc/machi_lifecycle_mgr.html
* Quick admin: declarative management of Machi FLU and chain life cycles
The "quick admin" scheme is a temporary (?) tool for managing Machi
FLU server and chain life cycles in a declarative manner. The API is
described in this section.
** Quick admin uses the "rc.d" config scheme for life cycle management
As described at the top of
http://basho.github.io/machi/edoc/machi_lifecycle_mgr.html, the "rc.d"
config files do not manage "policy". "Policy" is doing the right
thing with a Machi cluster from a systems administrator's
point of view. The "rc.d" config files can only implement decisions
made according to policy.
The "quick admin" tool is a first attempt at automating policy
decisions in a safe way (we hope) that is also easy to implement (we
hope) with a variety of systems management tools, e.g. Chef, Puppet,
Ansible, Saltstack, or plain-old-human-at-a-keyboard.
** Quick admin's declarative "language": an Erlang-flavored AST
The "language" that an administrator uses to express desired policy
changes is not (yet) a true language. As a quick implementation hack,
the current language is an Erlang-flavored abstract syntax tree
(AST). The tree isn't very deep, either, frequently just one
element tall. (Not much of a tree, is it?)
There are three terms in the language currently:
+ ~host~, define a new host that can execute FLU servers
+ ~flu~, define a new FLU
+ ~chain~, define a new chain or re-configure an existing chain with
the same name
*** Term 'host': define a new host for FLU services
In this context, a host is a machine, virtual machine, or container
that can execute the Machi application and can therefore provide FLU
services, i.e. file service, Humming Consensus management.
Two formats may be used to define a new host:
#+BEGIN_SRC
{host, Name, Props}.
{host, Name, AdminI, ClientI, Props}.
#+END_SRC
The shorter tuple is shorthand notation for the latter. If the
shorthand form is used, then it will be converted automatically to the
long form as:
#+BEGIN_SRC
{host, Name, AdminI=Name, ClientI=Name, Props}.
#+END_SRC
Type information, description, and restrictions:
+ ~Name::string()~ The ~Name~ attribute must be unique. Note that it
is possible to define two different hosts, one using a DNS hostname
and one using an IP address. The user must avoid this
double-definition because it is not enforced by quick admin.
+ The ~Name~ field is used for cross-reference purposes with other
terms, e.g., ~flu~ and ~chain~.
+ There is no syntax yet for removing a host definition.
+ ~AdminI::string()~ A DNS hostname or IP address for cluster
administration purposes, e.g. SSH access.
+ This field is unused at the present time.
+ ~ClientI::string()~ A DNS hostname or IP address for Machi's client
protocol access, e.g., Protocol Buffers network API service.
+ This field is unused at the present time.
+ ~props::proplist()~ is an Erlang-style property list for specifying
additional configuration options, debugging information, sysadmin
comments, etc.
+ A full-featured admin tool should also include managing several
other aspects of configuration related to a "host". For example,
for any single IP address, quick admin assumes that there will be
exactly one Erlang VM that is running the Machi application. Of
course, it is possible to have dozens of Erlang VMs on the same
(let's assume for clarity) hardware machine and all running Machi
... but there are additional aspects of such a machine that quick
admin does not account for
+ multiple IP addresses per machine
+ multiple Machi package installation paths
+ multiple Machi config files (e.g. cuttlefish config, ~etc.conf~,
~vm.args~)
+ multiple data directories/file system mount points
+ It is also a management problem for quick admin when a single
Machi package on a machine takes advantage of bulk data
storage using multiple file system mount points.
+ multiple Erlang VM host names, required for distributed Erlang,
which is used for communication with ~machi~ and ~machi-admin~
command line utilities.
+ and others....
*** Term 'flu': define a new FLU
A new FLU is defined relative to a previously-defined ~host~ entity;
an exception will be thrown if the ~host~ cannot be cross-referenced.
#+BEGIN_SRC
{flu, Name, HostName, Port, Props}
#+END_SRC
Type information, description, and restrictions:
+ ~Name::atom()~ The name of the FLU, as a human-friendly name and
also for internal management use; please note the ~atom()~ type.
This name must be unique.
+ The ~Name~ field is used for cross-reference purposes with the
~chain~ term.
+ There is no syntax yet for removing a FLU definition.
+ ~Hostname::string()~ The cross-reference name of the ~host~ that
this FLU should run on.
+ ~Port::non_neg_integer()~ The TCP port used by this FLU server's
Protocol Buffers network API listener service
+ ~props::proplist()~ is an Erlang-style property list for specifying
additional configuration options, debugging information, sysadmin
comments, etc.
*** Term 'chain': define or reconfigure a chain
A chain is defined relative to zero or more previously-defined ~flu~
entities; an exception will be thrown if any ~flu~ cannot be
cross-referenced.
Two formats may be used to define/reconfigure a chain:
#+BEGIN_SRC
{chain, Name, FullList, Props}.
{chain, Name, CMode, FullList, Witnesses, Props}.
#+END_SRC
The shorter tuple is shorthand notation for the latter. If the
shorthand form is used, then it will be converted automatically to the
long form as:
#+BEGIN_SRC
{chain, Name, ap_mode, FullList, [], Props}.
#+END_SRC
Type information, description, and restrictions:
+ ~Name::atom()~ The name of the chain, as a human-friendly name and
also for internal management use; please note the ~atom()~ type.
This name must be unique.
+ There is no syntax yet for removing a chain definition.
+ ~CMode::'ap_mode'|'cp_mode'~ Defines the consistency mode of the
chain, either eventual consistency or strong consistency,
respectively.
+ A chain cannot change consistency mode, e.g., from
strong~->~eventual consistency.
+ ~FullList::list(atom())~ Specifies the list of full-service FLU
servers, i.e. servers that provide file data & metadata services as
well as Humming Consensus. Each atom in the list must
cross-reference with a previously defined ~flu~; an exception will
be thrown if any ~flu~ cannot be cross-referenced.
+ ~Witnesses::list(atom())~ Specifies the list of witness-only
servers, i.e. servers that only participate in Humming Consensus.
Each atom in the list must cross-reference with a previously defined
~flu~; an exception will be thrown if any ~flu~ cannot be
cross-referenced.
+ This list must be empty for eventual consistency chains.
+ ~props::proplist()~ is an Erlang-style property list for specifying
additional configuration options, debugging information, sysadmin
comments, etc.
+ If this term specifies a new ~chain~ name, then all of the member
FLU servers (full & witness types) will be bootstrapped to a
starting configuration.
+ If this term specifies a previously-defined ~chain~ name, then all
of the member FLU servers (full & witness types, respectively) will
be adjusted to add or remove members, as appropriate.
+ Any FLU servers added to either list must not be assigned to any
other chain, or they must be a member of this specific chain.
+ Any FLU servers removed from either list will be halted.
(See the "FLU server administrative life cycle" section above.)
** Executing quick admin AST files via the 'machi-admin' utility
Examples of quick admin AST files can be found in the
~priv/quick-admin/examples~ directory. Below is an example that will
define a new host ( ~"localhost"~ ), three new FLU servers ( ~f1~ & ~f2~
and ~f3~ ), and an eventually consistent chain ( ~c1~ ) that uses the new
FLU servers:
#+BEGIN_SRC
{host, "localhost", []}.
{flu,f1,"localhost",20401,[]}.
{flu,f2,"localhost",20402,[]}.
{flu,f3,"localhost",20403,[]}.
{chain,c1,[f1,f2,f3],[]}.
#+END_SRC
*** Checking the syntax of an AST file
Given an AST config file, ~/path/to/ast/file~, its basic syntax and
correctness can be checked without executing it.
#+BEGIN_SRC
./rel/machi/bin/machi-admin quick-admin-check /path/to/ast/file
#+END_SRC
+ The utility will exit with status zero and output ~ok~ if the syntax
and proposed configuration appears to be correct.
+ If there is an error, the utility will exit with status one, and an
error message will be printed.
*** Executing an AST file
Given an AST config file, ~/path/to/ast/file~, it can be executed
using the command:
#+BEGIN_SRC
./rel/machi/bin/machi-admin quick-admin-apply /path/to/ast/file RelativeHost
#+END_SRC
... where the last argument, ~RelativeHost~, should be the exact
spelling of one of the previously defined AST ~host~ entities,
*and also* is the same host that the ~machi-admin~ utility is being
executed on.
Restrictions and warnings:
+ This is alpha quality software.
+ There is no "undo".
+ Of course there is, but you need to resort to doing things like
using ~machi attach~ to attach to the server's CLI to then execute
magic Erlang incantations to stop FLUs, unconfigure chains, etc.
+ Oh, and delete some files with magic paths, also.
** Using quick admin to manage multiple machines
A quick sketch follows:
1. Create the AST file to specify all of the changes that you wish to
make to all hosts, FLUs, and/or chains, e.g., ~/tmp/ast.txt~.
2. Check the basic syntax with the ~quick-admin-check~ argument to
~machi-admin~.
3. If the syntax is good, then copy ~/tmp/ast.txt~ to all hosts in the
cluster, using the same path, ~/tmp/ast.txt~.
4. For each machine in the cluster, run:
#+BEGIN_SRC
./rel/machi/bin/machi-admin quick-admin-apply /tmp/ast.txt RelativeHost
#+END_SRC
... where RelativeHost is the AST ~host~ name of the machine that you
are executing the ~machi-admin~ command on. The command should be
successful, with exit status 0 and outputting the string ~ok~.
Finally, for each machine in the cluster, a listing of all files in
the directory ~rel/machi/etc/quick-admin-archive~ should show exactly
the same files, one for each time that ~quick-admin-apply~ has been
run successfully on that machine.
* The "rc.d" style configuration file scheme
This configuration scheme is inspired by BSD UNIX's ~init(8)~ process
manager's configuration style, called "rc.d" after the name of the
directory where these files are stored, ~/etc/rc.d~. The ~init~
process is responsible for (among other things) starting UNIX
processes at machine boot time and stopping them when the machine is
shut down.
The original scheme used by ~init~ to start processes at boot time was
a single Bourne shell script called ~/etc/rc~. When a new software
package was installed that required a daemon to be started at boot
time, text was added to the ~/etc/rc~ file. Uninstalling packages was
much trickier, because it meant removing lines from a file that
*is a computer program (run by the Bourne shell, a Turing-complete
programming language)*. Error-free editing of the ~/etc/rc~ script
was impossible in all cases.
Later, ~init~'s configuration was split into a few master Bourne shell
scripts and a subdirectory, ~/etc/rc.d~. The subdirectory contained
shell scripts that were responsible for boot time starting of a single
daemon or service, e.g. NFS or an HTTP server. When a new software
package was added, a new file was added to the ~rc.d~ subdirectory.
When a package was removed, the corresponding file in ~rc.d~ was
removed. With this simple scheme, addition & removal of boot time
scripts was vastly simplified.
** Riak had a similar configuration file editing problem (and its solution)
Another software product from Basho Technologies, Riak, had a similar
configuration file editing problem. One file in particular,
~app.config~, had a syntax that made it difficult both for human
systems administrators and also computer programs to edit the file in
a syntactically correct manner.
Later releases of Riak switched to an alternative configuration file
format, one inspired by the BSD UNIX ~sysctl(8)~ utility and
~sysctl.conf(5)~ file syntax. The ~sysctl.conf~ format is much easier
for computer programs to manage when adding items. Removing items is
not 100% simple, however: the correct lines must be identified and then
removed (e.g. with Perl or a text editor or a combination of ~grep -v~
and ~mv~), but removing any comment lines that "belong" to the removed
config item(s) is not easy for a 1-line shell script to do 100%
correctly.
Machi will use the ~sysctl.conf~ style configuration for some
application configuration variables. However, adding & removing FLUs
and chains will be managed using the "rc.d" style because of the
"rc.d" scheme's simplicity and tolerance of mistakes by administrators
(human or computer).
** Machi's "rc.d" file scheme.
Machi will use a single subdirectory that will contain configuration
files for some life cycle management task, e.g. a single FLU or a
single chain.
The contents of the file should be a single Erlang term, serialized in
ASCII form as Erlang source code statement, i.e. a single Erlang term
~T~ that is formatted by ~io:format("~w.",[T]).~. This file must be
parseable by the Erlang function ~file:consult()~.
Later versions of Machi may change the file format to be more familiar
to administrators who are unaccustomed to Erlang language syntax.
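For example, a sketch of writing and reading back such a file, where
~write_config/2~ and ~read_config/1~ are hypothetical helpers:
#+BEGIN_SRC
%% Serialize a single Erlang term T as source text ...
write_config(Path, T) ->
    ok = file:write_file(Path, io_lib:format("~w.~n", [T])).

%% ... and read it back with file:consult/1.
read_config(Path) ->
    {ok, [T]} = file:consult(Path),
    T.
#+END_SRC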
** FLU life cycle management using "rc.d" style files
*** The key configuration components of a FLU
1. The machine (or virtual machine) to run it on.
2. The Machi software package's artifacts to execute.
3. The disk device(s) used to store Machi file data & metadata, "rc.d"
style config files, etc.
4. The name, IP address and TCP port assigned to the FLU service.
5. Its chain assignment.
Notes:
+ Items 1-3 are currently outside of the scope of this life cycle
document. We assume that human administrators know how to do these
things.
+ Item 4's properties are explicitly managed by a FLU-defining "rc.d"
style config file.
+ Item 5 is managed by the chain life cycle management system.
Here is an example of a properly formatted FLU config file:
#+BEGIN_SRC
{p_srvr,f1,machi_flu1_client,"192.168.72.23",20401,[]}.
#+END_SRC
... which corresponds to the following Erlang record definition:
#+BEGIN_SRC
-record(p_srvr, {
name :: atom(),
proto_mod = 'machi_flu1_client' :: atom(), % Module name
address :: term(), % Protocol-specific
port :: term(), % Protocol-specific
props = [] :: list() % proplist for other related info
}).
#+END_SRC
+ ~name~ is ~f1~. This is the name of the FLU. This name should be
unique over the lifetime of the administrative domain and thus
managed by external policy. This name must be the same as the name
of the config file that defines the FLU.
+ ~proto_mod~ is used for internal management purposes and should be
considered a mandatory constant.
+ ~address~ is "192.168.72.23". The DNS hostname or IP address used
by other servers to communicate with this FLU. This must be a valid
IP address, previously assigned to this machine/VM using the
appropriate operating system-specific procedure.
+ ~port~ is TCP port 20401. The TCP port number that the FLU listens
to for incoming Protocol Buffers-serialized communication. This TCP
port must not be in use (now or in the future) by another Machi FLU
or any other process running on this machine/VM.
+ ~props~ is an Erlang-style property list for specifying additional
configuration options, debugging information, sysadmin comments,
etc.
** Chain life cycle management using "rc.d" style files
Unlike FLUs, chains have a self-management aspect that makes a chain
life cycle different from a single FLU server. Machi's chains are
self-managing, via Humming Consensus; see the
https://github.com/basho/machi/tree/master/doc/ directory for much
more detail about Humming Consensus. After FLUs have received their
initial chain configuration for Humming Consensus, the FLUs will
manage the chain (and each other) by themselves.
However, Humming Consensus does not handle three chain management
problems:
1. Specifying the very first chain configuration,
2. Altering the membership of the chain (i.e. adding/removing FLUs
from the chain),
3. Stopping the chain permanently.
A chain "rc.d" file will only be used to bootstrap a newly-defined FLU
server. It's like a piece of glue information to introduce the new
FLU to the Humming Consensus group that is managing the chain's
dynamic state (e.g. which members are up or down). In all other
respects, chain config files are ignored by life cycle management code.
However, to mimic the life cycle of the FLU server's "rc.d" config
files, a chain's "rc.d" file is not deleted until the chain has been
decommissioned (i.e. defined with length=0).
*** The key configuration components of a chain
1. The name of the chain.
2. Consistency mode: eventually consistent or strongly consistent.
3. The membership list of all FLU servers in the chain.
+ Remember, all servers in a single chain will manage full replicas
of the same collection of Machi files.
4. If the chain is defined to use strongly consistent mode, then a
list of "witness servers" may also be defined. See the
https://github.com/basho/machi/tree/master/doc/ documentation for
more information on witness servers.
+ The witness list must be empty for all chains in eventual
consistency mode.
Here is an example of a properly formatted chain config file:
#+BEGIN_SRC
{chain_def_v1,c1,ap_mode,
[{p_srvr,f1,machi_flu1_client,"localhost",20401,[]},
{p_srvr,f2,machi_flu1_client,"localhost",20402,[]},
{p_srvr,f3,machi_flu1_client,"localhost",20403,[]}],
[],[],[],
[f1,f2,f3],
[],[]}.
#+END_SRC
... which corresponds to the following Erlang record definition:
#+BEGIN_SRC
-record(chain_def_v1, {
name :: atom(), % chain name
mode :: 'ap_mode' | 'cp_mode',
full = [] :: [p_srvr()],
witnesses = [] :: [p_srvr()],
old_full = [] :: [atom()], % guard against some races
old_witnesses=[] :: [atom()], % guard against some races
local_run = [] :: [atom()], % must be tailored to each machine!
local_stop = [] :: [atom()], % must be tailored to each machine!
props = [] :: list() % proplist for other related info
}).
#+END_SRC
+ ~name~ is ~c1~, the name of the chain. This name should be unique
over the lifetime of the administrative domain and thus managed by
external policy. This name must be the same as the name of the
config file that defines the chain.
+ ~mode~ is ~ap_mode~, an internal code symbol for eventual
consistency mode.
+ ~full~ is a list of Erlang ~#p_srvr{}~ records for full-service
members of the chain, i.e., providing Machi file data & metadata
storage services.
+ ~witnesses~ is a list of Erlang ~#p_srvr{}~ records for witness-only
FLU servers, i.e., providing only Humming Consensus service.
+ The next four fields are used for internal management only.
+ ~props~ is an Erlang-style property list for specifying additional
configuration options, debugging information, sysadmin comments,
etc.
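As with the FLU sketch above, a chain config file can be consulted
and sanity-checked. The following is a hypothetical illustration,
not Machi's actual implementation, which also enforces the rule that
the witness list must be empty in eventual consistency mode:
#+BEGIN_SRC
%% Sketch only: load_chain_config/1 is a hypothetical helper, not Machi API.
load_chain_config(Path) ->
    {ok, [{chain_def_v1, Name, Mode, _Full, Witnesses,
           _OldFull, _OldWitnesses, _LocalRun, _LocalStop, _Props} = T]} =
        file:consult(Path),
    Name = list_to_atom(filename:basename(Path)),
    case Mode of
        ap_mode -> [] = Witnesses;  % eventual consistency: no witnesses
        cp_mode -> ok
    end,
    T.
#+END_SRC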

View file

@ -0,0 +1,372 @@
# Table of contents
* [Hands-on experiments with Machi and Humming Consensus](#hands-on)
* [Using the network partition simulator and convergence demo test code](#partition-simulator)
<a name="hands-on">
# Hands-on experiments with Machi and Humming Consensus
## Prerequisites
Please refer to the
[Machi development environment prerequisites doc](./dev-prerequisites.md)
for Machi developer environment prerequisites.
If you do not have an Erlang/OTP runtime system available, but you do
have [the Vagrant virtual machine](https://www.vagrantup.com/) manager
available, then please refer to the instructions in the prerequisites
doc for using Vagrant.
<a name="clone-compile">
## Clone and compile the code
Please see the
[Machi 'clone and compile' doc](./dev-clone-compile.md)
for the short list of steps required to fetch the Machi source code
from GitHub and to compile & test Machi.
## Running three Machi instances on a single machine
All of the commands that should be run at your login shell (e.g. Bash,
c-shell) can be cut-and-pasted from this document directly to your
login shell prompt.
Run the following command:
make stagedevrel
This will create a directory structure like this:
           |-dev1-|... stand-alone Machi app + subdirectories
    |-dev-|-dev2-|... stand-alone Machi app + directories
           |-dev3-|... stand-alone Machi app + directories
Each of the `dev/dev1`, `dev/dev2`, and `dev/dev3` are stand-alone
application instances of Machi and can be run independently of each
other on the same machine. This demo will use all three.
The lifecycle management utilities for Machi are a bit immature,
currently. They assume that each Machi server runs on a host with a
unique hostname -- there is no flexibility built-in yet to easily run
multiple Machi instances on the same machine. To continue with the
demo, we need to use `sudo` or `su` to obtain superuser privileges to
edit the `/etc/hosts` file.
Please add the following line to `/etc/hosts`, using this command:
sudo sh -c 'echo "127.0.0.1 machi1 machi2 machi3" >> /etc/hosts'
Next, we will use a shell script to finish setting up our cluster. It
will do the following for us:
* Verify that the new line that was added to `/etc/hosts` is correct.
* Modify each `etc/app.config` file so that the Humming Consensus
chain manager's actions are logged to the `log/console.log` file.
* Start the three application instances.
* Verify that the three instances are running correctly.
* Configure a single chain, with one FLU server per application
instance.
Please run this script using this command:
./priv/humming-consensus-demo.setup.sh
If the output looks like this (and exits with status zero), then the
script was successful.
Step: Verify that the required entries in /etc/hosts are present
Step: add a verbose logging option to app.config
Step: start three Machi application instances
pong
pong
pong
Step: configure one chain to start a Humming Consensus group with three members
Result: ok
Result: ok
Result: ok
We have now created a single replica chain, called `c1`, that has
three file servers participating in the chain. Thanks to the
hostnames that we added to `/etc/hosts`, all are using the localhost
network interface.
| App instance directory | Pseudo hostname | FLU name | TCP port number |
|------------------------+-----------------+----------+-----------------|
| dev1                   | machi1          | flu1     | 20401           |
| dev2                   | machi2          | flu2     | 20402           |
| dev3                   | machi3          | flu3     | 20403           |
The log files for each application instance can be found in the
`./dev/devN/log/console.log` file, where `N` is the instance
number: 1, 2, or 3.
## Understanding the chain manager's log file output
After running the `./priv/humming-consensus-demo.setup.sh` script,
let's look at the last few lines of the `./dev/dev1/log/console.log`
log file for Erlang VM process #1.
2016-03-09 10:16:35.676 [info] <0.105.0>@machi_lifecycle_mgr:process_pending_flu:422 Started FLU f1 with supervisor pid <0.128.0>
2016-03-09 10:16:35.676 [info] <0.105.0>@machi_lifecycle_mgr:move_to_flu_config:540 Creating FLU config file f1
2016-03-09 10:16:35.790 [info] <0.105.0>@machi_lifecycle_mgr:bootstrap_chain2:312 Configured chain c1 via FLU f1 to mode=ap_mode all=[f1,f2,f3] witnesses=[]
2016-03-09 10:16:35.790 [info] <0.105.0>@machi_lifecycle_mgr:move_to_chain_config:546 Creating chain config file c1
2016-03-09 10:16:44.139 [info] <0.132.0> CONFIRM epoch 1141 <<155,42,7,221>> upi [] rep [] auth f1 by f1
2016-03-09 10:16:44.271 [info] <0.132.0> CONFIRM epoch 1148 <<57,213,154,16>> upi [f1] rep [] auth f1 by f1
2016-03-09 10:16:44.864 [info] <0.132.0> CONFIRM epoch 1151 <<239,29,39,70>> upi [f1] rep [f3] auth f1 by f1
2016-03-09 10:16:45.235 [info] <0.132.0> CONFIRM epoch 1152 <<173,17,66,225>> upi [f2] rep [f1,f3] auth f2 by f1
2016-03-09 10:16:47.343 [info] <0.132.0> CONFIRM epoch 1154 <<154,231,224,149>> upi [f2,f1,f3] rep [] auth f2 by f1
Let's pick apart some of these lines. We have started all three
servers at about the same time. We see some race conditions happen,
and some jostling and readjustment happens pretty quickly in the first
few seconds.
* `Started FLU f1 with supervisor pid <0.128.0>`
* This VM, #1,
started a FLU (Machi data server) with the name `f1`. In the Erlang
process supervisor hierarchy, the process ID of the top supervisor
is `<0.128.0>`.
* `Configured chain c1 via FLU f1 to mode=ap_mode all=[f1,f2,f3] witnesses=[]`
* A bootstrap configuration for a chain named `c1` has been created.
* The FLUs/data servers that are eligible for participation in the
chain have names `f1`, `f2`, and `f3`.
* The chain will operate in eventual consistency mode (`ap_mode`).
* The witness server list is empty. Witness servers are never used
in eventual consistency mode.
* `CONFIRM epoch 1141 <<155,42,7,221>> upi [] rep [] auth f1 by f1`
* All participants in epoch 1141 are unanimous in adopting epoch
1141's projection. All active membership lists are empty, so
there is no functional chain replication yet, at least as far as
server `f1` knows.
* The epoch's abbreviated checksum is `<<155,42,7,221>>`.
* The UPI list, i.e. the replicas whose data is 100% in sync, is
`[]`, the empty list. (UPI = Update Propagation Invariant)
* The list of servers that are under data repair (`rep`) is also
empty, `[]`.
* This projection was authored by server `f1`.
* The log message was generated by server `f1`.
* `CONFIRM epoch 1148 <<57,213,154,16>> upi [f1] rep [] auth f1 by f1`
* Now the server `f1` has created a chain of length 1, `[f1]`.
* Chain repair/file re-sync is not required when the UPI server list
changes from length 0 -> 1.
* `CONFIRM epoch 1151 <<239,29,39,70>> upi [f1] rep [f3] auth f1 by f1`
* Server `f1` has noticed that server `f3` is alive. Apparently it
has not yet noticed that server `f2` is also running.
* Server `f3` is in the repair list.
* `CONFIRM epoch 1152 <<173,17,66,225>> upi [f2] rep [f1,f3] auth f2 by f1`
* Server `f2` is apparently now aware that all three servers are running.
* The previous configuration used by `f2` was `upi [f2]`, i.e., `f2`
was running in a chain of one. `f2` noticed that `f1` and `f3`
were now available and has started adding them to the chain.
* All new servers are always added to the tail of the chain in the
repair list.
* In eventual consistency mode, a UPI change like this is OK.
* When performing a read, a client must read from both tail of the
UPI list and also from all repairing servers.
* When performing a write, the client writes to both the UPI
server list and also the repairing list, in that order.
* I.e., the client concatenates both lists,
`UPI ++ Repairing`, for its chain configuration for the write;
see the sketch after this list.
* Server `f2` will trigger file repair/re-sync shortly.
* The waiting time for starting repair has been configured to be
extremely short, 1 second. The default waiting time is 10
seconds, in case Humming Consensus remains unstable.
* `CONFIRM epoch 1154 <<154,231,224,149>> upi [f2,f1,f3] rep [] auth f2 by f1`
* File repair/re-sync has finished. All file data on all servers
is now in sync.
* The UPI/in-sync part of the chain is now `[f2,f1,f3]`, and there
are no servers under repair.
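To make these read/write target rules concrete, here is a minimal
Erlang sketch; the helper names are hypothetical, illustrative only,
and not part of Machi's client API:
    %% Sketch only: hypothetical helpers, not Machi's actual client API.
    %% Writes go to the UPI servers and then the repairing servers, in order.
    write_targets(UPI, Repairing) ->
        UPI ++ Repairing.
    %% Reads consult the tail of the UPI list plus all repairing servers.
    %% Assumes a non-empty UPI list, i.e. the chain is usable.
    read_targets(UPI, Repairing) ->
        [lists:last(UPI) | Repairing].
For the epoch 1152 example above, `write_targets([f2], [f1,f3])`
yields `[f2,f1,f3]`.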
## Let's create some failures
Here are some suggestions for creating failures.
* Use the `./dev/devN/bin/machi stop` and `./dev/devN/bin/machi start`
commands to stop & start VM #`N`.
* Stop a VM abnormally by using `kill`. The OS process name to look
for is `beam.smp`.
* Suspend and resume a VM, using the `SIGSTOP` and `SIGCONT` signals.
* E.g. `kill -STOP 9823` and `kill -CONT 9823`
The network partition simulator is not (yet) available when running
Machi in this mode. Please see the next section for instructions on
how to use the partition simulator.
<a name="partition-simulator">
# Using the network partition simulator and convergence demo test code
This is the demo code mentioned in the presentation that Scott Lystig
Fritchie gave at the
[RICON 2015 conference](http://ricon.io).
* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf)
* [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q)
## A complete example of all input and output
If you don't have an Erlang/OTP 17 runtime environment available,
please see this file for full input and output of a strong consistency
length=3 chain test:
https://gist.github.com/slfritchie/8352efc88cc18e62c72c
This file contains all commands input and all simulator output from a
sample run of the simulator.
To help interpret the output of the test, please skip ahead to the
"The test output is very verbose" section.
## Prerequisites
If you don't have `git` and/or the Erlang 17 runtime system available
on your OS X, FreeBSD, Linux, or Solaris machine, please take a look
at the [Prerequisites section](#prerequisites) first. When you have
installed the prerequisite software, please return here.
## Clone and compile the code
Please briefly visit the [Clone and compile the code](#clone-compile)
section. When finished, please return here.
## Run an interactive Erlang CLI shell
Run the following command at your login shell:
erl -pz .eunit ebin deps/*/ebin
If you are using Erlang/OTP version 17, you should see some CLI output
that looks like this:
Erlang/OTP 17 [erts-6.4] [source] [64-bit] [smp:8:8] [async-threads:10] [hipe] [kernel-poll:false] [dtrace]
Eshell V6.4 (abort with ^G)
1>
## The test output is very verbose ... what are the important parts?
The output of the Erlang command
`machi_chain_manager1_converge_demo:help()` will display the following
guide to the output of the tests.
A visualization of the convergence behavior of the chain self-management
algorithm for Machi.
1. Set up some server and chain manager pairs.
2. Create a number of different network partition scenarios, where
(simulated) partitions may be symmetric or asymmetric. Then stop changing
the partitions and keep the simulated network stable (and perhaps broken).
3. Run a number of iterations of the algorithm in parallel by poking each
of the manager processes on a random'ish basis.
4. Afterward, fetch the chain transition changes made by each FLU and
verify that no transition was unsafe.
During the iteration periods, the following is a cheatsheet for the output.
See the internal source for interpreting the rest of the output.
'SET partitions = '
A pair-wise list of actors which cannot send messages. The
list is uni-directional. If there are three servers (a,b,c),
and if the partitions list is '[{a,b},{b,c}]' then all
messages from a->b and b->c will be dropped, but any other
sender->recipient messages will be delivered successfully.
'x uses:'
The FLU x has made an internal state transition and is using
this epoch's projection as operating chain configuration. The
rest of the line is a summary of the projection.
'CONFIRM epoch {N}'
This message confirms that all of the servers listed in the
UPI and repairing lists of the projection at epoch {N} have
agreed to use this projection because they all have written
this projection to their respective private projection stores.
The chain is now usable by/available to all clients.
'Sweet, private projections are stable'
This report announces that this iteration of the test cycle
has passed successfully. The report that follows briefly
summarizes the latest private projection used by each
participating server. For example, when in strong consistency
mode with 'a' as a witness and 'b' and 'c' as real servers:
%% Legend:
%% server name, epoch ID, UPI list, repairing list, down list, ...
%% ... witness list, 'false' (a constant value)
[{a,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}},
{b,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}]
Both servers 'a' and 'b' agree on epoch 1116 with epoch ID
{1116,<<23,143,246,55>>} where UPI=[a,b], repairing=[],
down=[c], and witnesses=[a].
Server 'c' is not shown because 'c' has wedged itself OOS (out
of service) by configuring a chain length of zero.
If no servers are listed in the report (i.e. only '[]' is
displayed), then all servers have wedged themselves OOS, and
the chain is unavailable.
'DoIt,'
This marks a group of tick events which trigger the manager
processes to evaluate their environment and perhaps make a
state transition.
A long chain of 'DoIt,DoIt,DoIt,' means that the chain state has
(probably) settled to a stable configuration, which is the goal of the
algorithm.
Press control-c to interrupt the test....".
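As a concrete illustration of the `SET partitions` semantics in the
cheatsheet above, here is a sketch; `dropped/3` is a hypothetical
name, not the simulator's actual API:
    %% Sketch only: models the simulator's uni-directional partition list.
    dropped(From, To, Partitions) ->
        lists:member({From, To}, Partitions).
With `Partitions = [{a,b},{b,c}]`, `dropped(a, b, Partitions)` returns
`true` but `dropped(b, a, Partitions)` returns `false`, i.e. the
partition is one-way.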
## Run a test in eventual consistency mode
Run the following command at the Erlang CLI prompt:
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}]).
The first argument, `3`, is the number of servers to participate in
the chain. Please note:
* Chain lengths as short as 1 or 2 are valid, but the results are a
bit boring.
* Chain lengths as long as 7 or 9 can be used, but they may
suffer from longer periods of churn/instability before all chain
managers reach agreement via humming consensus. (It is future work
to shorten the worst of the unstable churn latencies.)
* In eventual consistency mode, chain lengths may be even numbers,
e.g. 2, 4, or 6.
* The simulator will choose partition events from the permutations of
all 1, 2, and 3 node partition pairs. The total runtime will
increase *dramatically* with chain length.
* Chain length 2: about 3 partition cases
* Chain length 3: about 35 partition cases
* Chain length 4: about 230 partition cases
* Chain length 5: about 1100 partition cases
## Run a test in strong consistency mode (with witnesses):
*NOTE:* Due to a bug in the test code, please do not try to run the
convergence test in strong consistency mode without the correct
minority number of witness servers! If in doubt, please run
the commands shown below exactly.
Run the following command at the Erlang CLI prompt:
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]).
The first argument, `3`, is the number of servers to participate in
the chain. Chain lengths as long as 7 or 9 can be used, but they may
suffer from longer periods of churn/instability before all chain
managers reach agreement via humming consensus.
Due to the bug mentioned above, please use the following
commands when running with chain lengths of 5 or 7, respectively.
machi_chain_manager1_converge_demo:t(5, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b]}]).
machi_chain_manager1_converge_demo:t(7, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b,c]}]).

View file

@ -1,185 +0,0 @@
# Using the network partition simulator and convergence demo test code
## A complete example of all input and output
If you don't have an Erlang/OTP 17 runtime environment available,
please see this file for full input and output of a strong consistency
length=3 chain test:
https://gist.github.com/slfritchie/8352efc88cc18e62c72c
This file contains all commands input and all simulator output from a
sample run of the simulator.
To help interpret the output of the test, please skip ahead to the
"The test output is very verbose" section.
## Prerequisites
1. You'll need the `git` source management tool.
2. You'll need the Erlang/OTP 17 runtime environment. Please don't
use earlier or later versions until we have a chance to fix the
compilation warnings that versions R16B and 18 will trigger.
All of the commands that should be run at your login shell (e.g. Bash,
c-shell) can be cut-and-pasted from this document directly to your
login shell prompt.
## Clone and compile the code
Clone the Machi source repo and compile the source and test code. Run
the following commands at your login shell:
cd /tmp
git clone https://github.com/basho/machi.git
cd machi
git checkout master
make
Then run the unit test suite. This may take up to two minutes or so
to finish. Most of the tests will be silent; please be patient until
the tests finish.
make test
## Run an interactive Erlang CLI shell
Run the following command at your login shell:
erl -pz .eunit ebin deps/*/ebin
If you are using Erlang/OTP version 17, you should see some CLI output
that looks like this:
Erlang/OTP 17 [erts-6.4] [source] [64-bit] [smp:8:8] [async-threads:10] [hipe] [kernel-poll:false] [dtrace]
Eshell V6.4 (abort with ^G)
1>
## The test output is very verbose ... what are the important parts?
The output of the Erlang command
`machi_chain_manager1_converge_demo:help()` will display the following
guide to the output of the tests.
A visualization of the convergence behavior of the chain self-management
algorithm for Machi.
1. Set up some server and chain manager pairs.
2. Create a number of different network partition scenarios, where
(simulated) partitions may be symmetric or asymmetric. Then stop changing
the partitions and keep the simulated network stable (and perhaps broken).
3. Run a number of iterations of the algorithm in parallel by poking each
of the manager processes on a random'ish basis.
4. Afterward, fetch the chain transition changes made by each FLU and
verify that no transition was unsafe.
During the iteration periods, the following is a cheatsheet for the output.
See the internal source for interpreting the rest of the output.
'SET partitions = '
A pair-wise list of actors which cannot send messages. The
list is uni-directional. If there are three servers (a,b,c),
and if the partitions list is '[{a,b},{b,c}]' then all
messages from a->b and b->c will be dropped, but any other
sender->recipient messages will be delivered successfully.
'x uses:'
The FLU x has made an internal state transition and is using
this epoch's projection as operating chain configuration. The
rest of the line is a summary of the projection.
'CONFIRM epoch {N}'
This message confirms that all of the servers listed in the
UPI and repairing lists of the projection at epoch {N} have
agreed to use this projection because they all have written
this projection to their respective private projection stores.
The chain is now usable by/available to all clients.
'Sweet, private projections are stable'
This report announces that this iteration of the test cycle
has passed successfully. The report that follows briefly
summarizes the latest private projection used by each
participating server. For example, when in strong consistency
mode with 'a' as a witness and 'b' and 'c' as real servers:
%% Legend:
%% server name, epoch ID, UPI list, repairing list, down list, ...
%% ... witness list, 'false' (a constant value)
[{a,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}},
{b,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}]
Both servers 'a' and 'b' agree on epoch 1116 with epoch ID
{1116,<<23,143,246,55>>} where UPI=[a,b], repairing=[],
down=[c], and witnesses=[a].
Server 'c' is not shown because 'c' has wedged itself OOS (out
of service) by configuring a chain length of zero.
If no servers are listed in the report (i.e. only '[]' is
displayed), then all servers have wedged themselves OOS, and
the chain is unavailable.
'DoIt,'
This marks a group of tick events which trigger the manager
processes to evaluate their environment and perhaps make a
state transition.
A long chain of 'DoIt,DoIt,DoIt,' means that the chain state has
(probably) settled to a stable configuration, which is the goal of the
algorithm.
Press control-c to interrupt the test....".
## Run a test in eventual consistency mode
Run the following command at the Erlang CLI prompt:
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}]).
The first argument, `3`, is the number of servers to participate in
the chain. Please note:
* Chain lengths as short as 1 or 2 are valid, but the results are a
bit boring.
* Chain lengths as long as 7 or 9 can be used, but they may
suffer from longer periods of churn/instability before all chain
managers reach agreement via humming consensus. (It is future work
to shorten the worst of the unstable churn latencies.)
* In eventual consistency mode, chain lengths may be even numbers,
e.g. 2, 4, or 6.
* The simulator will choose partition events from the permutations of
all 1, 2, and 3 node partition pairs. The total runtime will
increase *dramatically* with chain length.
* Chain length 2: about 3 partition cases
* Chain length 3: about 35 partition cases
* Chain length 4: about 230 partition cases
* Chain length 5: about 1100 partition cases
## Run a test in strong consistency mode (with witnesses):
*NOTE:* Due to a bug in the test code, please do not try to run the
convergence test in strong consistency mode and also without the
correct minority number of witness servers! If in doubt, please run
the commands shown below exactly.
Run the following command at the Erlang CLI prompt:
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]).
The first argument, `3`, is the number of servers to participate in
the chain. Chain lengths as long as 7 or 9 can be used, but they may
suffer from longer periods of churn/instability before all chain
managers reach agreement via humming consensus.
Due to the bug mentioned above, please use the following
commands when running with chain lengths of 5 or 7, respectively.
machi_chain_manager1_converge_demo:t(5, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b]}]).
machi_chain_manager1_converge_demo:t(7, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b,c]}]).

Binary image file (115 KiB) not shown.

View file

@ -43,3 +43,21 @@
-define(DEFAULT_COC_NAMESPACE, "").
-define(DEFAULT_COC_LOCATOR, 0).
-record(ns_info, {
version = 0 :: machi_dt:namespace_version(),
name = <<>> :: machi_dt:namespace(),
locator = 0 :: machi_dt:locator()
}).
-record(append_opts, {
chunk_extra = 0 :: machi_dt:chunk_size(),
preferred_file_name :: 'undefined' | machi_dt:file_name_s(),
flag_fail_preferred = false :: boolean()
}).
-record(read_opts, {
no_checksum = false :: boolean(),
no_chunk = false :: boolean(),
needs_trimmed = false :: boolean()
}).

View file

@ -1,6 +1,6 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2014 Basho Technologies, Inc. All Rights Reserved.
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
@ -22,10 +22,11 @@
-define(MACHI_PROJECTION_HRL, true).
-type pv1_consistency_mode() :: 'ap_mode' | 'cp_mode'.
-type pv1_chain_name():: atom().
-type pv1_csum() :: binary().
-type pv1_epoch() :: {pv1_epoch_n(), pv1_csum()}.
-type pv1_epoch_n() :: non_neg_integer().
-type pv1_server() :: atom() | binary().
-type pv1_server() :: atom().
-type pv1_timestamp() :: {non_neg_integer(), non_neg_integer(), non_neg_integer()}.
-record(p_srvr, {
@ -55,6 +56,7 @@
epoch_number :: pv1_epoch_n() | ?SPAM_PROJ_EPOCH,
epoch_csum :: pv1_csum(),
author_server :: pv1_server(),
chain_name = ch_not_def_yet :: pv1_chain_name(),
all_members :: [pv1_server()],
witnesses = [] :: [pv1_server()],
creation_time :: pv1_timestamp(),
@ -75,4 +77,16 @@
%% create a consistent projection ranking score.
-define(MAX_CHAIN_LENGTH, 64).
-record(chain_def_v1, {
name :: atom(), % chain name
mode :: pv1_consistency_mode(),
full = [] :: [p_srvr()],
witnesses = [] :: [p_srvr()],
old_full = [] :: [pv1_server()], % guard against some races
old_witnesses=[] :: [pv1_server()], % guard against some races
local_run = [] :: [pv1_server()], % must be tailored to each machine!
local_stop = [] :: [pv1_server()], % must be tailored to each machine!
props = [] :: list() % proplist for other related info
}).
-endif. % !MACHI_PROJECTION_HRL

View file

@ -0,0 +1,56 @@
#!/bin/sh
echo "Step: Verify that the required entries in /etc/hosts are present"
for i in 1 2 3; do
grep machi$i /etc/hosts | egrep -s '^127.0.0.1' > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo ""
echo "'grep -s machi$i' failed. Aborting, sorry."
exit 1
fi
ping -c 1 machi$i > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo ""
echo "Ping attempt on host machi$i failed. Aborting."
echo ""
ping -c 1 machi$i
exit 1
fi
done
echo "Step: add a verbose logging option to app.config"
for i in 1 2 3; do
ed ./dev/dev$i/etc/app.config <<EOF > /dev/null 2>&1
/verbose_confirm
a
{chain_manager_opts, [{private_write_verbose_confirm,true}]},
{stability_time, 1},
.
w
q
EOF
done
echo "Step: start three three Machi application instances"
for i in 1 2 3; do
./dev/dev$i/bin/machi start
./dev/dev$i/bin/machi ping
if [ $? -ne 0 ]; then
echo "Sorry, a 'ping' check for instance dev$i failed. Aborting."
exit 1
fi
done
echo "Step: configure one chain to start a Humming Consensus group with three members"
# Note: $CWD of each Machi proc is two levels below the source code root dir.
LIFECYCLE000=../../priv/quick-admin-examples/demo-000
for i in 3 2 1; do
./dev/dev$i/bin/machi-admin quick-admin-apply $LIFECYCLE000 machi$i
if [ $? -ne 0 ]; then
echo "Sorry, 'machi-admin quick-admin-apply failed' on machi$i. Aborting."
exit 1
fi
done
exit 0

View file

@ -0,0 +1,93 @@
# -*- mode: ruby -*-
# vi: set ft=ruby :
# All Vagrant configuration is done below. The "2" in Vagrant.configure
# configures the configuration version (we support older styles for
# backwards compatibility). Please don't change it unless you know what
# you're doing.
Vagrant.configure(2) do |config|
# The most common configuration options are documented and commented below.
# For a complete reference, please see the online documentation at
# https://docs.vagrantup.com.
# Every Vagrant development environment requires a box. You can search for
# boxes at https://atlas.hashicorp.com/search.
# If this Vagrant box has not been downloaded before (e.g. using "vagrant box add"),
# then Vagrant will automatically download the VM image from HashiCorp.
config.vm.box = "hashicorp/precise64"
# If using a FreeBSD box, Bash may not be installed.
# Use the config.ssh.shell setting to specify an alternate shell.
# Note, however, that any code in the 'config.vm.provision' section
# would then have to use this shell's syntax!
# config.ssh.shell = "/bin/csh -l"
# Disable automatic box update checking. If you disable this, then
# boxes will only be checked for updates when the user runs
# `vagrant box outdated`. This is not recommended.
# config.vm.box_check_update = false
# Create a forwarded port mapping which allows access to a specific port
# within the machine from a port on the host machine. In the example below,
# accessing "localhost:8080" will access port 80 on the guest machine.
# config.vm.network "forwarded_port", guest: 80, host: 8080
# Create a private network, which allows host-only access to the machine
# using a specific IP.
# config.vm.network "private_network", ip: "192.168.33.10"
# Create a public network, which generally matched to bridged network.
# Bridged networks make the machine appear as another physical device on
# your network.
# config.vm.network "public_network"
# Share an additional folder to the guest VM. The first argument is
# the path on the host to the actual folder. The second argument is
# the path on the guest to mount the folder. And the optional third
# argument is a set of non-required options.
# config.vm.synced_folder "../data", "/vagrant_data"
# Provider-specific configuration so you can fine-tune various
# backing providers for Vagrant. These expose provider-specific options.
# Example for VirtualBox:
#
config.vm.provider "virtualbox" do |vb|
# Display the VirtualBox GUI when booting the machine
# vb.gui = true
# Customize the amount of memory on the VM:
vb.memory = "512"
end
#
# View the documentation for the provider you are using for more
# information on available options.
# Define a Vagrant Push strategy for pushing to Atlas. Other push strategies
# such as FTP and Heroku are also available. See the documentation at
# https://docs.vagrantup.com/v2/push/atlas.html for more information.
# config.push.define "atlas" do |push|
# push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME"
# end
# Enable provisioning with a shell script. Additional provisioners such as
# Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the
# documentation for more information about their specific syntax and use.
config.vm.provision "shell", inline: <<-SHELL
# Install prerequisites
# Support here for FreeBSD is experimental
apt-get update ; sudo apt-get install -y git sudo rsync ; # Ubuntu Linux
env ASSUME_ALWAYS_YES=yes pkg install -f git sudo rsync ; # FreeBSD 10
# Install dependent packages, using slf-configurator
git clone https://github.com/slfritchie/slf-configurator.git
chown -R vagrant ./slf-configurator
(cd slf-configurator ; sudo sh -x ./ALL.sh)
echo 'export PATH=${PATH}:/usr/local/erlang/17.5/bin' >> ~vagrant/.bashrc
export PATH=${PATH}:/usr/local/erlang/17.5/bin
## echo 'set path = ( $path /usr/local/erlang/17.5/bin )' >> ~vagrant/.cshrc
## setenv PATH /usr/local/erlang/17.5/bin:$PATH
git clone https://github.com/basho/machi.git
(cd machi ; git checkout master ; make && make test )
chown -R vagrant ./machi
SHELL
end

View file

@ -36,7 +36,7 @@ while (<I>) {
$indent = " " x ($count * 4);
s/^#*\s*[0-9. ]*//;
$anchor = "n$label";
printf T1 "%s+ [%s %s](#%s)\n", $indent, $label, $_, $anchor;
printf T1 "%s+ [%s. %s](#%s)\n", $indent, $label, $_, $anchor;
printf T2 "<a name=\"%s\">\n", $anchor;
$line =~ s/(#+)\s*[0-9. ]*/$1 $label. /;
print T2 $line;

View file

@ -0,0 +1 @@
{host, "localhost", []}.

View file

@ -0,0 +1,4 @@
{flu,f1,"localhost",20401,[]}.
{flu,f2,"localhost",20402,[]}.
{flu,f3,"localhost",20403,[]}.
{chain,c1,[f1,f2,f3],[]}.

View file

@ -0,0 +1,4 @@
{flu,f4,"localhost",20404,[]}.
{flu,f5,"localhost",20405,[]}.
{flu,f6,"localhost",20406,[]}.
{chain,c2,[f4,f5,f6],[]}.

View file

@ -0,0 +1,7 @@
{host, "machi1", []}.
{host, "machi2", []}.
{host, "machi3", []}.
{flu,f1,"machi1",20401,[]}.
{flu,f2,"machi2",20402,[]}.
{flu,f3,"machi3",20403,[]}.
{chain,c1,[f1,f2,f3],[]}.

View file

@ -1,25 +1,35 @@
[
{machi, [
%% Data directory for all FLUs.
{flu_data_dir, "{{platform_data_dir}}"},
{flu_data_dir, "{{platform_data_dir}}/flu"},
%% FLU config directory
{flu_config_dir, "{{platform_etc_dir}}/flu-config"},
%% Chain config directory
{chain_config_dir, "{{platform_etc_dir}}/chain-config"},
%% FLUs to start at app start.
{initial_flus, [
%% Remember, this is a list, so separate all tuples
%% with a comma.
%%
%% {Name::atom(), Port::pos_integer(), proplist()}
%%
%% For example: {my_name_is_a, 12500, []}
]},
%% This task has moved to machi_flu_sup and machi_lifecycle_mgr.
%% Number of metadata manager processes to run per FLU.
%% Default = 10
%% {metadata_manager_count, 2},
%% Default options for chain manager processes.
%% {chain_manager_opts, [{private_write_verbose,true},
%% {private_write_verbose_confirm,true}]},
%% Platform vars (mirror of reltool packaging)
{platform_data_dir, "{{platform_data_dir}}"},
{platform_etc_dir, "{{platform_etc_dir}}"},
%% Do not delete, do not put Machi config items after this line.
{final_comma_stopper, do_not_delete}
]
},
{lager, [
{error_logger_hwm, 5000} % lager's default of 50/sec is too low
]
}
].

View file

@ -22,23 +22,41 @@ cd $RUNNER_BASE_DIR
SCRIPT=`basename $0`
usage() {
echo "Usage: $SCRIPT { test | "
echo "Usage: $SCRIPT { quick-admin-check | quick-admin-apply | "
echo " top }"
}
case "$1" in
test)
quick-admin-check)
# Make sure the local node IS running
node_up_check
shift
# Parse out the node name to pass to the client
NODE_NAME=${NAME_ARG#* }
NODE_NAME=${NAME_ARG#* } # target machi server node name
IN_FILE="$1"
$ERTS_PATH/erl -noshell $NAME_PARAM machi_test$NAME_HOST $COOKIE_ARG \
-pa $RUNNER_LIB_DIR/basho-patches \
-eval "case catch(machi:client_test(\"$NODE_NAME\")) of \
$ERTS_PATH/erl -noshell -noinput $NAME_PARAM machi_test$NAME_HOST $COOKIE_ARG \
-remsh $NODE_NAME \
-eval "Me = self(), spawn('"$NODE_NAME"', fun() -> X = (catch(machi_lifecycle_mgr:quick_admin_sanity_check(\"$IN_FILE\"))), Me ! {res, X} end), XX = receive {res, Res} -> Res after 10*1000 -> timeout end, io:format(user, \"Result: ~p\n\", [XX]), case XX of \
ok -> init:stop(); \
_ -> init:stop(1) \
end."
;;
quick-admin-apply)
# Make sure the local node IS running
node_up_check
shift
NODE_NAME=${NAME_ARG#* } # target machi server node name
IN_FILE="$1"
RELATIVE_HOST="$2"
$ERTS_PATH/erl -noshell -noinput $NAME_PARAM machi_test$NAME_HOST $COOKIE_ARG \
-remsh $NODE_NAME \
-eval "Me = self(), spawn('"$NODE_NAME"', fun() -> X = (catch(machi_lifecycle_mgr:quick_admin_apply(\"$IN_FILE\", \"$RELATIVE_HOST\"))), Me ! {res, X} end), XX = receive {res, Res} -> Res after 10*1000 -> timeout end, io:format(user, \"Result: ~p\n\", [XX]), case XX of \
ok -> init:stop(); \
_ -> init:stop(1) \
end."

rel/gen_dev Executable file
View file

@ -0,0 +1,16 @@
#! /bin/sh
#
# Example usage: gen_dev dev4 vars.src vars
#
# Generate an overlay config for devNNN from vars.src and write to vars
#
NAME=$1
TEMPLATE=$2
VARFILE=$3
NODE="$NAME@127.0.0.1"
echo "Generating $NAME - node='$NODE'"
sed -e "s/@NODE@/$NODE/" \
< $TEMPLATE > $VARFILE

View file

@ -47,6 +47,7 @@
{overlay, [
{mkdir, "data"},
{mkdir, "data/^PRESERVE"},
{mkdir, "log"},
%% Copy base files for starting and interacting w/ node
@ -93,6 +94,20 @@
{template, "files/vm.args", "etc/vm.args"},
{template, "files/app.config", "etc/app.config"},
{mkdir, "etc/chain-config"},
{mkdir, "etc/flu-config"},
{mkdir, "etc/pending"},
{mkdir, "etc/rejected"},
%% Experiment: quick-admin
{mkdir, "etc/quick-admin-archive"},
{mkdir, "priv"},
{mkdir, "priv/quick-admin-examples"},
{copy, "../priv/quick-admin-examples/000", "priv/quick-admin-examples"},
{copy, "../priv/quick-admin-examples/001", "priv/quick-admin-examples"},
{copy, "../priv/quick-admin-examples/002", "priv/quick-admin-examples"},
{copy, "../priv/quick-admin-examples/demo-000", "priv/quick-admin-examples/demo-000"},
{mkdir, "lib/basho-patches"}
%% {copy, "../apps/machi/ebin/etop_txt.beam", "lib/basho-patches"}
]}.

View file

@ -1,6 +1,9 @@
%% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*-
%% ex: ft=erlang ts=4 sw=4 et
%% NOTE: When modifying this file, also keep its near cousin
%% config file rel/vars/dev_vars.config.src in sync!
%% Platform-specific installation paths
{platform_bin_dir, "./bin"}.
{platform_data_dir, "./data"}.

View file

@ -0,0 +1,48 @@
%% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*-
%% ex: ft=erlang ts=4 sw=4 et
%% NOTE: When modifying this file, also keep its near cousin
%% config file rel/vars/dev_vars.config.src in sync!
%% Platform-specific installation paths
{platform_bin_dir, "./bin"}.
{platform_data_dir, "./data"}.
{platform_etc_dir, "./etc"}.
{platform_lib_dir, "./lib"}.
{platform_log_dir, "./log"}.
%%
%% etc/app.config
%%
{sasl_error_log, "{{platform_log_dir}}/sasl-error.log"}.
{sasl_log_dir, "{{platform_log_dir}}/sasl"}.
%% lager
{console_log_default, file}.
%%
%% etc/vm.args
%%
{node, "@NODE@"}.
{crash_dump, "{{platform_log_dir}}/erl_crash.dump"}.
%%
%% bin/machi
%%
{runner_script_dir, "\`cd \\`dirname $0\\` 1>/dev/null && /bin/pwd\`"}.
{runner_base_dir, "{{runner_script_dir}}/.."}.
{runner_etc_dir, "$RUNNER_BASE_DIR/etc"}.
{runner_log_dir, "$RUNNER_BASE_DIR/log"}.
{runner_lib_dir, "$RUNNER_BASE_DIR/lib"}.
{runner_patch_dir, "$RUNNER_BASE_DIR/lib/basho-patches"}.
{pipe_dir, "/tmp/$RUNNER_BASE_DIR/"}.
{runner_user, ""}.
{runner_wait_process, "machi_flu_sup"}.
{runner_ulimit_warn, 65536}.
%%
%% cuttlefish
%%
{cuttlefish, ""}. % blank = off
{cuttlefish_conf, "machi.conf"}.

View file

@ -1,7 +1,7 @@
{application, machi, [
{description, "A village of write-once files."},
{vsn, "0.0.0"},
{applications, [kernel, stdlib, crypto, cluster_info]},
{vsn, "0.0.1"},
{applications, [kernel, stdlib, crypto, cluster_info, ranch]},
{mod,{machi_app,[]}},
{registered, []},
{env, [

View file

@ -170,12 +170,18 @@ message Mpb_AuthResp {
// High level API: append_chunk() request & response
message Mpb_AppendChunkReq {
required string coc_namespace = 1;
required uint32 coc_locator = 2;
required string prefix = 3;
required bytes chunk = 4;
required Mpb_ChunkCSum csum = 5;
optional uint32 chunk_extra = 6;
// General namespace arguments
/* In single chain/non-clustered environment, use namespace="" */
required string namespace = 1;
required string prefix = 10;
required bytes chunk = 11;
required Mpb_ChunkCSum csum = 12;
optional uint32 chunk_extra = 20;
optional string preferred_file_name = 21;
/* Fail the operation if our preferred file name is not available */
optional bool flag_fail_preferred = 22 [default=false];
}
message Mpb_AppendChunkResp {
@ -187,7 +193,7 @@ message Mpb_AppendChunkResp {
// High level API: write_chunk() request & response
message Mpb_WriteChunkReq {
required Mpb_Chunk chunk = 1;
required Mpb_Chunk chunk = 10;
}
message Mpb_WriteChunkResp {
@ -197,19 +203,22 @@ message Mpb_WriteChunkResp {
// High level API: read_chunk() request & response
message Mpb_ReadChunkReq {
required Mpb_ChunkPos chunk_pos = 1;
// No namespace arguments are required because NS is embedded
// inside of the file name.
required Mpb_ChunkPos chunk_pos = 10;
// Use flag_no_checksum=non-zero to skip returning the chunk's checksum.
// TODO: not implemented yet.
optional uint32 flag_no_checksum = 2 [default=0];
optional bool flag_no_checksum = 20 [default=false];
// Use flag_no_chunk=non-zero to skip returning the chunk (which
// only makes sense if flag_no_checksum is not set).
// TODO: not implemented yet.
optional uint32 flag_no_chunk = 3 [default=0];
optional bool flag_no_chunk = 21 [default=false];
// TODO: not implemented yet.
optional uint32 flag_needs_trimmed = 4 [default=0];
optional bool flag_needs_trimmed = 22 [default=false];
}
message Mpb_ReadChunkResp {
@ -245,6 +254,8 @@ message Mpb_ChecksumListResp {
// High level API: list_files() request & response
message Mpb_ListFilesReq {
// TODO: Add flag for file glob/regexp/other filter type
// TODO: What else could go wrong?
}
message Mpb_ListFilesResp {
@ -331,18 +342,17 @@ message Mpb_ProjectionV1 {
required uint32 epoch_number = 1;
required bytes epoch_csum = 2;
required string author_server = 3;
repeated string all_members = 4;
repeated string witnesses = 5;
required Mpb_Now creation_time = 6;
required Mpb_Mode mode = 7;
repeated string upi = 8;
repeated string repairing = 9;
repeated string down = 10;
optional bytes opaque_flap = 11;
optional bytes opaque_inner = 12;
required bytes opaque_dbg = 13;
required bytes opaque_dbg2 = 14;
repeated Mpb_MembersDictEntry members_dict = 15;
required string chain_name = 4;
repeated string all_members = 5;
repeated string witnesses = 6;
required Mpb_Now creation_time = 7;
required Mpb_Mode mode = 8;
repeated string upi = 9;
repeated string repairing = 10;
repeated string down = 11;
required bytes opaque_dbg = 12;
required bytes opaque_dbg2 = 13;
repeated Mpb_MembersDictEntry members_dict = 14;
}
//////////////////////////////////////////
@ -378,14 +388,20 @@ message Mpb_ProjectionV1 {
// Low level API: append_chunk()
message Mpb_LL_AppendChunkReq {
required Mpb_EpochID epoch_id = 1;
/* To avoid CoC use, use coc_namespace="" and coc_locator=0 */
required string coc_namespace = 2;
required uint32 coc_locator = 3;
required string prefix = 4;
required bytes chunk = 5;
required Mpb_ChunkCSum csum = 6;
optional uint32 chunk_extra = 7;
// General namespace arguments
required uint32 namespace_version = 1;
required string namespace = 2;
required uint32 locator = 3;
required Mpb_EpochID epoch_id = 10;
required string prefix = 11;
required bytes chunk = 12;
required Mpb_ChunkCSum csum = 13;
optional uint32 chunk_extra = 20;
optional string preferred_file_name = 21;
/* Fail the operation if our preferred file name is not available */
optional bool flag_fail_preferred = 22 [default=false];
}
message Mpb_LL_AppendChunkResp {
@ -397,8 +413,12 @@ message Mpb_LL_AppendChunkResp {
// Low level API: write_chunk()
message Mpb_LL_WriteChunkReq {
required Mpb_EpochID epoch_id = 1;
required Mpb_Chunk chunk = 2;
// General namespace arguments
required uint32 namespace_version = 1;
required string namespace = 2;
required Mpb_EpochID epoch_id = 10;
required Mpb_Chunk chunk = 11;
}
message Mpb_LL_WriteChunkResp {
@ -408,19 +428,23 @@ message Mpb_LL_WriteChunkResp {
// Low level API: read_chunk()
message Mpb_LL_ReadChunkReq {
required Mpb_EpochID epoch_id = 1;
required Mpb_ChunkPos chunk_pos = 2;
// General namespace arguments
required uint32 namespace_version = 1;
required string namespace = 2;
required Mpb_EpochID epoch_id = 10;
required Mpb_ChunkPos chunk_pos = 11;
// Use flag_no_checksum=non-zero to skip returning the chunk's checksum.
// TODO: not implemented yet.
optional uint32 flag_no_checksum = 3 [default=0];
optional bool flag_no_checksum = 20 [default=false];
// Use flag_no_chunk=non-zero to skip returning the chunk (which
// only makes sense if flag_checksum is not set).
// TODO: not implemented yet.
optional uint32 flag_no_chunk = 4 [default=0];
optional bool flag_no_chunk = 21 [default=false];
optional uint32 flag_needs_trimmed = 5 [default=0];
optional bool flag_needs_trimmed = 22 [default=false];
}
message Mpb_LL_ReadChunkResp {
@ -432,11 +456,16 @@ message Mpb_LL_ReadChunkResp {
// Low level API: trim_chunk()
message Mpb_LL_TrimChunkReq {
required Mpb_EpochID epoch_id = 1;
required string file = 2;
required uint64 offset = 3;
required uint32 size = 4;
optional uint32 trigger_gc = 5 [default=0];
// General namespace arguments
required uint32 namespace_version = 1;
required string namespace = 2;
required Mpb_EpochID epoch_id = 10;
required string file = 11;
required uint64 offset = 12;
required uint32 size = 13;
optional bool trigger_gc = 20 [default=false];
}
message Mpb_LL_TrimChunkResp {
@ -446,8 +475,7 @@ message Mpb_LL_TrimChunkResp {
// Low level API: checksum_list()
message Mpb_LL_ChecksumListReq {
required Mpb_EpochID epoch_id = 1;
required string file = 2;
required string file = 1;
}
message Mpb_LL_ChecksumListResp {
@ -478,7 +506,9 @@ message Mpb_LL_WedgeStatusReq {
message Mpb_LL_WedgeStatusResp {
required Mpb_GeneralStatusCode status = 1;
optional Mpb_EpochID epoch_id = 2;
optional uint32 wedged_flag = 3;
optional bool wedged_flag = 3;
optional uint32 namespace_version = 4;
optional string namespace = 5;
}
// Low level API: delete_migration()

View file

@ -90,15 +90,16 @@ verify_file_checksums_local2(Sock1, EpochID, Path0) ->
end.
verify_file_checksums_remote2(Sock1, EpochID, File) ->
NSInfo = undefined,
ReadChunk = fun(File_name, Offset, Size) ->
?FLU_C:read_chunk(Sock1, EpochID,
File_name, Offset, Size, [])
?FLU_C:read_chunk(Sock1, NSInfo, EpochID,
File_name, Offset, Size, undefined)
end,
verify_file_checksums_common(Sock1, EpochID, File, ReadChunk).
verify_file_checksums_common(Sock1, EpochID, File, ReadChunk) ->
verify_file_checksums_common(Sock1, _EpochID, File, ReadChunk) ->
try
case ?FLU_C:checksum_list(Sock1, EpochID, File) of
case ?FLU_C:checksum_list(Sock1, File) of
{ok, InfoBin} ->
Info = machi_csum_table:split_checksum_list_blob_decode(InfoBin),
Res = lists:foldl(verify_chunk_checksum(File, ReadChunk),

View file

@ -1,6 +1,6 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%% Copyright (c) 2007-2016 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
@ -43,23 +43,25 @@
%% could add new entries to this ETS table.
%%
%% Now we can use various integer-centric key generators that are
%% already bundled with basho_bench.
%% already bundled with basho_bench. NOTE: this scheme does not allow
%% mixing of 'append' and 'read' operations in the same config. Basho
%% Bench does not support different key generators for different
%% operations, unfortunately. The work-around is to run two different
%% Basho Bench instances: one for 'append' ops with a key generator for
%% the desired prefix(es), and the other for 'read' ops with an
%% integer key generator.
%%
%% TODO: Add CRC checking, when feasible and when supported on the
%% server side.
%%
%% TODO: As an alternate idea, if we know that the chunks written are
%% always the same size, and if we don't care about CRC checking, then
%% all we need to know are the file names &amp; file sizes on the server:
%% we can then pick any valid offset within that file. That would
%% certainly be more scalable than the zillion-row-ETS-table, which is
%% definitely RAM-hungry.
%% TODO: The 'read' operator will always read chunks at exactly the
%% byte offset & size as the original append/write ops. If reads are
%% desired at any arbitrary offset & size, then a new strategy is
%% required.
-module(machi_basho_bench_driver).
-export([new/1, run/4]).
-record(m, {
id,
conn,
max_key
}).
@ -81,7 +83,7 @@ new(Id) ->
{read_concurrency, true}]),
ets:insert(ETS, {max_key, 0}),
ets:insert(ETS, {total_bytes, 0}),
MaxKeys = load_ets_table(Conn, ETS),
MaxKeys = load_ets_table_maybe(Conn, ETS),
?INFO("Key preload: finished, ~w keys loaded", [MaxKeys]),
Bytes = ets:lookup_element(ETS, total_bytes, 2),
?INFO("Key preload: finished, chunk list specifies ~s MBytes of chunks",
@ -90,12 +92,14 @@ new(Id) ->
true ->
ok
end,
{ok, #m{conn=Conn}}.
{ok, #m{id=Id, conn=Conn}}.
run(append, KeyGen, ValueGen, #m{conn=Conn}=S) ->
Prefix = KeyGen(),
Value = ValueGen(),
case machi_cr_client:append_chunk(Conn, Prefix, Value, ?THE_TIMEOUT) of
CSum = machi_util:make_client_csum(Value),
AppendOpts = {append_opts,0,undefined,false}, % HACK FIXME
case machi_cr_client:append_chunk(Conn, undefined, Prefix, Value, CSum, AppendOpts, ?THE_TIMEOUT) of
{ok, Pos} ->
EtsKey = ets:update_counter(?ETS_TAB, max_key, 1),
true = ets:insert(?ETS_TAB, {EtsKey, Pos}),
@ -112,9 +116,26 @@ run(read, KeyGen, _ValueGen, #m{conn=Conn, max_key=MaxKey}=S) ->
Idx = KeyGen() rem MaxKey,
%% {File, Offset, Size, _CSum} = ets:lookup_element(?ETS_TAB, Idx, 2),
{File, Offset, Size} = ets:lookup_element(?ETS_TAB, Idx, 2),
case machi_cr_client:read_chunk(Conn, File, Offset, Size, [], ?THE_TIMEOUT) of
{ok, _Chunk} ->
{ok, S};
ReadOpts = {read_opts,false,false,false}, % HACK FIXME
case machi_cr_client:read_chunk(Conn, undefined, File, Offset, Size, ReadOpts, ?THE_TIMEOUT) of
{ok, {Chunks, _Trimmed}} ->
%% io:format(user, "Chunks ~P\n", [Chunks, 15]),
%% {ok, S};
case lists:all(fun({File2, Offset2, Chunk, CSum}) ->
{_Tag, CS} = machi_util:unmake_tagged_csum(CSum),
CS2 = machi_util:checksum_chunk(Chunk),
if CS == CS2 ->
true;
CS /= CS2 ->
?ERROR("Client-side checksum error for file ~p offset ~p expected ~p got ~p\n", [File2, Offset2, CS, CS2]),
false
end
end, Chunks) of
true ->
{ok, S};
false ->
{error, bad_checksum, S}
end;
{error, _}=Err ->
?ERROR("read file ~p offset ~w size ~w: ~w\n",
[File, Offset, Size, Err]),
@ -132,21 +153,40 @@ find_server_info(_Id) ->
Ps
end.
load_ets_table_maybe(Conn, ETS) ->
case basho_bench_config:get(operations, undefined) of
undefined ->
?ERROR("The 'operations' key is missing from the config file, aborting", []),
exit(bad_config);
Ops when is_list(Ops) ->
case lists:keyfind(read, 1, Ops) of
{read,_} ->
load_ets_table(Conn, ETS);
false ->
?INFO("No 'read' op in the 'operations' list ~p, skipping ETS table load.", [Ops]),
0
end
end.
load_ets_table(Conn, ETS) ->
{ok, Fs} = machi_cr_client:list_files(Conn),
[begin
{ok, InfoBin} = machi_cr_client:checksum_list(Conn, File),
{ok, InfoBin} = machi_cr_client:checksum_list(Conn, File, ?THE_TIMEOUT),
PosList = machi_csum_table:split_checksum_list_blob_decode(InfoBin),
?INFO("File ~s len PosList ~p\n", [File, length(PosList)]),
StartKey = ets:update_counter(ETS, max_key, 0),
%% _EndKey = lists:foldl(fun({Off,Sz,CSum}, K) ->
%% V = {File, Off, Sz, CSum},
{_, Bytes} = lists:foldl(fun({Off,Sz,_CSum}, {K, Bs}) ->
V = {File, Off, Sz},
ets:insert(ETS, {K, V}),
{K + 1, Bs + Sz}
end, {StartKey, 0}, PosList),
ets:update_counter(ETS, max_key, length(PosList)),
ets:update_counter(ETS, total_bytes, Bytes)
{_, C, Bytes} = lists:foldl(fun({_Off,0,_CSum}, {_K, _C, _Bs}=Acc) ->
Acc;
({0,_Sz,_CSum}, {_K, _C, _Bs}=Acc) ->
Acc;
({Off,Sz,_CSum}, {K, C, Bs}) ->
V = {File, Off, Sz},
ets:insert(ETS, {K, V}),
{K + 1, C + 1, Bs + Sz}
end, {StartKey, 0, 0}, PosList),
_ = ets:update_counter(ETS, max_key, C),
_ = ets:update_counter(ETS, total_bytes, Bytes),
ok
end || {_Size, File} <- Fs],
ets:update_counter(?ETS_TAB, max_key, 0).

View file

@ -92,8 +92,11 @@
-define(REPAIR_START_STABILITY_TIME, 10).
-endif. % TEST
%% Magic constant for looping "too frequently" breaker. TODO revisit & revise.
-define(TOO_FREQUENT_BREAKER, 10).
%% Maximum length of the history of adopted projections (via C120).
-define(MAX_HISTORY_LENGTH, 8).
%% Magic constant for looping "too frequently" breaker.
-define(TOO_FREQUENT_BREAKER, (?MAX_HISTORY_LENGTH+5)).
-define(RETURN2(X), begin (catch put(why2, [?LINE|get(why2)])), X end).
@ -103,12 +106,9 @@
%% Amount of epoch number skip-ahead for set_chain_members call
-define(SET_CHAIN_MEMBERS_EPOCH_SKIP, 1111).
%% Maximum length of the history of adopted projections (via C120).
-define(MAX_HISTORY_LENGTH, 30).
%% API
-export([start_link/2, start_link/3, stop/1, ping/1,
set_chain_members/2, set_chain_members/3, set_active/2,
set_chain_members/2, set_chain_members/6, set_active/2,
trigger_react_to_env/1]).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
terminate/2, format_status/2, code_change/3]).
@ -168,13 +168,22 @@ ping(Pid) ->
%% with lowest rank, i.e. name z* first, name a* last.
set_chain_members(Pid, MembersDict) ->
set_chain_members(Pid, MembersDict, []).
set_chain_members(Pid, ch0_name, 0, ap_mode, MembersDict, []).
set_chain_members(Pid, MembersDict, Witness_list) ->
case lists:all(fun(Witness) -> orddict:is_key(Witness, MembersDict) end,
Witness_list) of
set_chain_members(Pid, ChainName, OldEpoch, CMode, MembersDict, Witness_list)
when is_atom(ChainName) andalso
is_integer(OldEpoch) andalso OldEpoch >= 0 andalso
(CMode == ap_mode orelse CMode == cp_mode) andalso
is_list(MembersDict) andalso
is_list(Witness_list) ->
case lists:all(fun({X, #p_srvr{name=X}}) -> true;
(_) -> false
end, MembersDict)
andalso
lists:all(fun(Witness) -> orddict:is_key(Witness, MembersDict) end,
Witness_list) of
true ->
Cmd = {set_chain_members, MembersDict, Witness_list},
Cmd = {set_chain_members, ChainName, OldEpoch, CMode, MembersDict, Witness_list},
gen_server:call(Pid, Cmd, infinity);
false ->
{error, bad_arg}
@ -225,11 +234,13 @@ test_read_latest_public_projection(Pid, ReadRepairP) ->
%% manager's pid in MgrOpts and use direct gen_server calls to the
%% local projection store.
init({MyName, InitMembersDict, MgrOpts}) ->
init({MyName, InitMembersDict, MgrOpts0}) ->
put(ttt, [?LINE]),
_ = random:seed(now()),
init_remember_down_list(),
MgrOpts = MgrOpts0 ++ application:get_env(machi, chain_manager_opts, []),
Opt = fun(Key, Default) -> proplists:get_value(Key, MgrOpts, Default) end,
InitWitness_list = Opt(witnesses, []),
ZeroAll_list = [P#p_srvr.name || {_,P} <- orddict:to_list(InitMembersDict)],
ZeroProj = make_none_projection(0, MyName, ZeroAll_list,
@ -281,7 +292,7 @@ init({MyName, InitMembersDict, MgrOpts}) ->
last_down=[no_such_server_initial_value_only],
fitness_svr=machi_flu_psup:make_fitness_regname(MyName)
}, Proj),
{_, S2} = do_set_chain_members_dict(MembersDict, S),
S2 = do_set_chain_members_dict(MembersDict, S),
S3 = if ActiveP == false ->
S2;
ActiveP == true ->
@ -291,12 +302,17 @@ init({MyName, InitMembersDict, MgrOpts}) ->
handle_call({ping}, _From, S) ->
{reply, pong, S};
handle_call({set_chain_members, MembersDict, Witness_list}, _From,
handle_call({set_chain_members, SetChainName, SetOldEpoch, CMode,
MembersDict, Witness_list}, _From,
#ch_mgr{name=MyName,
proj=#projection_v1{all_members=OldAll_list,
epoch_number=OldEpoch,
chain_name=ChainName,
upi=OldUPI}=OldProj}=S) ->
{Reply, S2} = do_set_chain_members_dict(MembersDict, S),
true = (OldEpoch == 0) % in this case we want unconditional set of ch name
orelse
(SetOldEpoch == OldEpoch andalso SetChainName == ChainName),
S2 = do_set_chain_members_dict(MembersDict, S),
%% TODO: should there be any additional sanity checks? Right now,
%% if someone does something bad, then do_react_to_env() will
%% crash, which will crash us, and we'll restart in a sane & old
@ -310,10 +326,10 @@ handle_call({set_chain_members, MembersDict, Witness_list}, _From,
{NUPI, All_list -- NUPI}
end,
NewEpoch = OldEpoch + ?SET_CHAIN_MEMBERS_EPOCH_SKIP,
CMode = calc_consistency_mode(Witness_list),
ok = set_consistency_mode(machi_flu_psup:make_proj_supname(MyName), CMode),
NewProj = machi_projection:update_checksum(
OldProj#projection_v1{author_server=MyName,
chain_name=SetChainName,
creation_time=now(),
mode=CMode,
epoch_number=NewEpoch,
@ -325,7 +341,11 @@ handle_call({set_chain_members, MembersDict, Witness_list}, _From,
members_dict=MembersDict}),
S3 = set_proj(S2#ch_mgr{proj_history=queue:new(),
consistency_mode=CMode}, NewProj),
{_QQ, S4} = do_react_to_env(S3),
{Res, S4} = do_react_to_env(S3),
Reply = case Res of
{_,_,_} -> ok
% Dialyzer: all possible return values of do_react_to_env/1 match the 3-tuple pattern
end,
{reply, Reply, S4};
handle_call({set_active, Boolean}, _From, #ch_mgr{timer=TRef}=S) ->
case {Boolean, TRef} of
@ -357,8 +377,8 @@ handle_call({test_read_latest_public_projection, ReadRepairP}, _From, S) ->
{reply, Res, S2};
handle_call({trigger_react_to_env}=Call, _From, S) ->
gobble_calls(Call),
{TODOtodo, S2} = do_react_to_env(S),
{reply, TODOtodo, S2};
{Res, S2} = do_react_to_env(S),
{reply, Res, S2};
handle_call(_Call, _From, S) ->
io:format(user, "\nBad call to ~p: ~p\n", [S#ch_mgr.name, _Call]),
{reply, whaaaaaaaaaa, S}.
@ -370,6 +390,7 @@ handle_cast(_Cast, S) ->
handle_info(tick_check_environment, #ch_mgr{ignore_timer=true}=S) ->
{noreply, S};
handle_info(tick_check_environment, S) ->
gobble_ticks(),
{{_Delta, Props, _Epoch}, S1} = do_react_to_env(S),
S2 = sanitize_repair_state(S1),
S3 = perhaps_start_repair(S2),
@ -442,7 +463,7 @@ get_my_proj_boot_info(MgrOpts, DefaultDict, DefaultProj, ProjType) ->
{DefaultDict, DefaultProj};
Store ->
{ok, P} = machi_projection_store:read_latest_projection(Store,
ProjType),
ProjType, 7789),
{P#projection_v1.members_dict, P}
end.
@ -535,6 +556,7 @@ cl_write_public_proj2(FLUs, Partitions, Epoch, Proj, IgnoreWrittenErrorP, S) ->
end
end, {true, []}, FLUs),
%% io:format(user, "\nWrite public ~w by ~w: ~w\n", [Epoch, S#ch_mgr.name, Rs]),
%% io:format(user, "mgr ~w epoch ~w Rs ~p\n", [S#ch_mgr.name, Epoch, Rs]),
{{remote_write_results, Rs}, S}.
do_cl_read_latest_public_projection(ReadRepairP,
@ -556,12 +578,41 @@ do_cl_read_latest_public_projection(ReadRepairP,
read_latest_projection_call_only(ProjectionType, AllHosed,
#ch_mgr{proj=CurrentProj}=S) ->
#projection_v1{all_members=All_list} = CurrentProj,
All_queried_list = All_list -- AllHosed,
All_queried_list = lists:sort(All_list -- AllHosed),
read_latest_projection_call_only1(ProjectionType, AllHosed,
All_queried_list, S).
{Rs, S2} = read_latest_projection_call_only2(ProjectionType,
All_queried_list, S),
FLUsRs = lists:zip(All_queried_list, Rs),
{All_queried_list, FLUsRs, S2}.
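%% Note: read_latest_projection_call_only1/4 below iterates to a fixpoint.
%% If any projection we read back names a member that we did not query
%% (e.g. a server added to the chain by another manager), we start a
%% proxy for it and re-query, so the final FLUsRs covers every reachable
%% member rather than only our possibly-stale local view.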
read_latest_projection_call_only1(ProjectionType, AllHosed,
All_queried_list, S) ->
{Rs_tmp, S2} = read_latest_projection_call_only2(ProjectionType,
All_queried_list, S),
New_all_maybe =
lists:usort(
lists:flatten(
[A_l || #projection_v1{all_members=A_l} <- Rs_tmp])) -- AllHosed,
case New_all_maybe -- All_queried_list of
[] ->
FLUsRs = lists:zip(All_queried_list, Rs_tmp),
{All_queried_list, FLUsRs, S2};
[AnotherFLU|_] ->
%% Stop AnotherFLU proxy, in unexpected case where it's open
try
Proxy = proxy_pid(AnotherFLU, S2),
?FLU_PC:stop_proxies([Proxy])
catch _:_ -> ok
end,
MD = orddict:from_list(
lists:usort(
lists:flatten(
[orddict:to_list(D) || #projection_v1{members_dict=D} <- Rs_tmp]))),
Another_P_srvr = orddict:fetch(AnotherFLU, MD),
{ok, Proxy2} = ?FLU_PC:start_link(Another_P_srvr),
S3 = S2#ch_mgr{proxies_dict=orddict:store(AnotherFLU, Proxy2,
S2#ch_mgr.proxies_dict)},
read_latest_projection_call_only1(
ProjectionType, AllHosed,
lists:usort([AnotherFLU|All_queried_list]), S3)
end.
read_latest_projection_call_only2(ProjectionType, All_queried_list, S) ->
{_UpNodes, Partitions, S2} = calc_up_nodes(S),
@ -601,6 +652,8 @@ rank_and_sort_projections_with_extra(All_queried_list, FLUsRs, ProjectionType,
Witness_list = CurrentProj#projection_v1.witnesses,
NoneProj = make_none_projection(0, MyName, [], Witness_list,
orddict:new()),
ChainName = CurrentProj#projection_v1.chain_name,
NoneProj2 = NoneProj#projection_v1{chain_name=ChainName},
Extra2 = [{all_members_replied, true},
{all_queried_list, All_queried_list},
{flus_rs, FLUsRs},
@ -609,7 +662,7 @@ rank_and_sort_projections_with_extra(All_queried_list, FLUsRs, ProjectionType,
{bad_answer_flus, BadAnswerFLUs},
{bad_answers, BadAnswers},
{not_unanimous_answers, []}],
{not_unanimous, NoneProj, Extra2, S};
{not_unanimous, NoneProj2, Extra2, S};
ProjectionType == public, UnwrittenRs /= [] ->
{needs_repair, FLUsRs, [flarfus], S};
true ->
@ -723,13 +776,14 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg,
runenv=RunEnv1,
repair_final_status=RepairFS}=S) ->
#projection_v1{epoch_number=OldEpochNum,
chain_name=ChainName,
members_dict=MembersDict,
witnesses=OldWitness_list,
upi=OldUPI_list,
repairing=OldRepairing_list
} = LastProj,
LastUp = lists:usort(OldUPI_list ++ OldRepairing_list),
AllMembers = (S#ch_mgr.proj)#projection_v1.all_members,
AllMembers = CurrentProj#projection_v1.all_members,
{Up0, Partitions, RunEnv2} = calc_up_nodes(MyName,
AllMembers, RunEnv1),
Up = Up0 -- AllHosed,
@ -786,7 +840,10 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg,
D_foo=[{repair_done, {repair_final_status, ok, (S#ch_mgr.proj)#projection_v1.epoch_number}}],
{NewUPI_list ++ Repairing_list2, [], RunEnv2};
true ->
D_foo=[d_foo2],
D_foo=[d_foo2, {sim_p,Simulator_p},
{simr_p,SimRepair_p}, {same_epoch,SameEpoch_p},
{rel_to,RelativeToServer},
{repch,RepChk_LastInUPI}, {repair_fs,RepairFS}],
{NewUPI_list, OldRepairing_list, RunEnv2}
end;
{_ABC, _XYZ} ->
@ -821,10 +878,11 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg,
end,
?REACT({calc,?LINE,[{new_upi, NewUPI},{new_rep, NewRepairing}]}),
P = machi_projection:new(OldEpochNum + 1,
MyName, MembersDict, Down, NewUPI, NewRepairing,
D_foo ++
Dbg ++ [{ps, Partitions},{nodes_up, Up}]),
P0 = machi_projection:new(OldEpochNum + 1,
MyName, MembersDict, Down, NewUPI, NewRepairing,
D_foo ++
Dbg ++ [{ps, Partitions},{nodes_up, Up}]),
P1 = P0#projection_v1{chain_name=ChainName},
P2 = if CMode == cp_mode ->
UpWitnesses = [W || W <- Up, lists:member(W, OldWitness_list)],
Majority = full_majority_size(AllMembers),
@ -833,7 +891,7 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg,
SoFar = length(NewUPI ++ NewRepairing),
if SoFar >= Majority ->
?REACT({calc,?LINE,[]}),
P;
P1;
true ->
Need = Majority - SoFar,
UpWitnesses = [W || W <- Up,
@ -842,7 +900,7 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg,
Ws = lists:sublist(UpWitnesses, Need),
?REACT({calc,?LINE,[{ws, Ws}]}),
machi_projection:update_checksum(
P#projection_v1{upi=Ws++NewUPI});
P1#projection_v1{upi=Ws++NewUPI});
true ->
?REACT({calc,?LINE,[]}),
P_none0 = make_none_projection(
@ -855,6 +913,7 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg,
"Not enough witnesses are available now"
end,
P_none1 = P_none0#projection_v1{
chain_name=ChainName,
%% Stable creation time!
creation_time={1,2,3},
dbg=[{none_projection,true},
@ -875,7 +934,7 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg,
end;
CMode == ap_mode ->
?REACT({calc,?LINE,[]}),
P
P1
end,
P3 = machi_projection:update_checksum(
P2#projection_v1{mode=CMode, witnesses=OldWitness_list}),
@ -1027,31 +1086,33 @@ rank_projection(#projection_v1{author_server=_Author,
do_set_chain_members_dict(MembersDict, #ch_mgr{proxies_dict=OldProxiesDict}=S)->
_ = ?FLU_PC:stop_proxies(OldProxiesDict),
ProxiesDict = ?FLU_PC:start_proxies(MembersDict),
{ok, S#ch_mgr{members_dict=MembersDict,
proxies_dict=ProxiesDict}}.
S#ch_mgr{members_dict=MembersDict,
proxies_dict=ProxiesDict}.
do_react_to_env(#ch_mgr{name=MyName,
proj=#projection_v1{epoch_number=Epoch,
members_dict=[]=OldDict}=OldProj,
opts=Opts}=S) ->
put(ttt, [?LINE]),
%% Read from our local *public* projection store. If some other
%% chain member has written something there, and if we are a
%% member of that chain, then we'll adopt that projection and then
%% start actively humming in that chain.
{NewMembersDict, NewProj} =
{NewMD, NewProj} =
get_my_public_proj_boot_info(Opts, OldDict, OldProj),
case orddict:is_key(MyName, NewMembersDict) of
case orddict:is_key(MyName, NewMD) of
false ->
{{empty_members_dict, [], Epoch}, S};
{{empty_members_dict1, [], Epoch}, S};
true ->
{_, S2} = do_set_chain_members_dict(NewMembersDict, S),
CMode = calc_consistency_mode(NewProj#projection_v1.witnesses),
{{empty_members_dict, [], Epoch},
set_proj(S2#ch_mgr{members_dict=NewMembersDict,
consistency_mode=CMode}, NewProj)}
CMode = NewProj#projection_v1.mode,
S2 = do_set_chain_members_dict(NewMD, S),
{Reply, S3} = react_to_env_C110(NewProj,
S2#ch_mgr{members_dict=NewMD,
consistency_mode=CMode}),
{Reply, S3}
end;
do_react_to_env(S) ->
put(ttt, [?LINE]),
%% The not_sanes manager counting dictionary is not strictly
%% limited to flapping scenarios. (Though the mechanism first
%% started as a way to deal with rare flapping scenarios.)
@ -1150,7 +1211,7 @@ react_to_env_A10(S) ->
?REACT(a10),
react_to_env_A20(0, poll_private_proj_is_upi_unanimous(S)).
react_to_env_A20(Retries, #ch_mgr{name=MyName}=S) ->
react_to_env_A20(Retries, #ch_mgr{name=MyName, proj=P_current}=S) ->
?REACT(a20),
init_remember_down_list(),
{UnanimousTag, P_latest, ReadExtra, S2} =
@ -1178,17 +1239,34 @@ react_to_env_A20(Retries, #ch_mgr{name=MyName}=S) ->
false when P_latest#projection_v1.epoch_number /= LastComplaint,
P_latest#projection_v1.all_members /= [] ->
put(rogue_server_epoch, P_latest#projection_v1.epoch_number),
error_logger:info_msg("Chain manager ~p found latest public "
"projection ~p has author ~p has a "
"members list ~p that does not include me.\n",
error_logger:info_msg("Chain manager ~w found latest public "
"projection ~w with author ~w has a "
"members list ~w that does not include me. "
"We assume this is a result of administrator "
"action and will thus wedge ourselves until "
"we are re-added to the chain or shutdown.\n",
[S#ch_mgr.name,
P_latest#projection_v1.epoch_number,
P_latest#projection_v1.author_server,
P_latest#projection_v1.all_members]);
P_latest#projection_v1.all_members]),
EpochID = machi_projection:make_epoch_id(P_current),
ProjStore = get_projection_store_pid_or_regname(S),
{ok, NotifyPid} = machi_projection_store:get_wedge_notify_pid(ProjStore),
_QQ = machi_flu1:update_wedge_state(NotifyPid, true, EpochID),
#projection_v1{epoch_number=Epoch,
chain_name=ChainName,
all_members=All_list,
witnesses=Witness_list,
members_dict=MembersDict} = P_current,
P_none0 = make_none_projection(Epoch,
MyName, All_list, Witness_list, MembersDict),
P_none = P_none0#projection_v1{chain_name=ChainName},
{{now_using,[],Epoch}, set_proj(S2, P_none)};
_ ->
ok
end,
react_to_env_A21(Retries, UnanimousTag, P_latest, ReadExtra, S2)
end.
react_to_env_A21(Retries, UnanimousTag, P_latest, ReadExtra, S) ->
%% The UnanimousTag isn't quite sufficient for our needs. We need
%% to determine if *all* of the UPI+Repairing FLUs are members of
%% the unanimous server replies. All Repairing FLUs should be up
@ -1233,7 +1311,7 @@ react_to_env_A20(Retries, #ch_mgr{name=MyName}=S) ->
true ->
exit({badbad, UnanimousTag})
end,
react_to_env_A29(Retries, P_latest, LatestUnanimousP, ReadExtra, S2).
react_to_env_A29(Retries, P_latest, LatestUnanimousP, ReadExtra, S).
react_to_env_A29(Retries, P_latest, LatestUnanimousP, _ReadExtra,
#ch_mgr{consistency_mode=CMode,
@ -1267,7 +1345,6 @@ react_to_env_A29(Retries, P_latest, LatestUnanimousP, _ReadExtra,
?REACT({a29, ?LINE,
[{zerf_backstop, true},
{zerf_in, machi_projection:make_summary(Zerf)}]}),
%% io:format(user, "zerf_in: A29: ~p: ~w\n\t~p\n", [MyName, machi_projection:make_summary(Zerf), get(yyy_hack)]),
#projection_v1{dbg=ZerfDbg} = Zerf,
Backstop = if Zerf#projection_v1.upi == [] ->
[];
@ -1287,7 +1364,8 @@ react_to_env_A29(Retries, P_latest, LatestUnanimousP, _ReadExtra,
end.
react_to_env_A30(Retries, P_latest, LatestUnanimousP, P_current_calc,
#ch_mgr{name=MyName, consistency_mode=CMode} = S) ->
#ch_mgr{name=MyName, proj=P_current,
consistency_mode=CMode} = S) ->
V = case file:read_file("/tmp/moomoo."++atom_to_list(S#ch_mgr.name)) of {ok,_} -> true; _ -> false end,
if V -> io:format(user, "A30: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end,
?REACT(a30),
@ -1307,15 +1385,17 @@ react_to_env_A30(Retries, P_latest, LatestUnanimousP, P_current_calc,
P = #projection_v1{down=Down} =
make_none_projection(Epoch + 1, MyName, All_list,
Witness_list, MembersDict),
ChainName = P_current#projection_v1.chain_name,
P1 = P#projection_v1{chain_name=ChainName},
P_newprop = if CMode == ap_mode ->
%% Not really none proj: just myself, AP style
machi_projection:update_checksum(
P#projection_v1{upi=[MyName],
P1#projection_v1{upi=[MyName],
down=Down -- [MyName],
dbg=[{hosed_list,AllHosed}]});
CMode == cp_mode ->
machi_projection:update_checksum(
P#projection_v1{dbg=[{hosed_list,AllHosed}]})
P1#projection_v1{dbg=[{hosed_list,AllHosed}]})
end,
react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP,
P_current_calc, true, S);
@ -1388,13 +1468,22 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP,
%% we have a disagreement.
not ordsets:is_disjoint(P_latest_s, Down_s)
end,
AmExcludedFromLatestAll_p =
P_latest#projection_v1.epoch_number /= 0
andalso
(not lists:member(MyName, P_latest#projection_v1.all_members)),
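%% Epoch 0 is the bootstrap "none" projection, so absence from its
%% member list is expected and must not trigger the A50 bail-out below.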
?REACT({a40, ?LINE,
[{latest_author, P_latest#projection_v1.author_server},
{am_excluded_from_latest_all_p, AmExcludedFromLatestAll_p},
{author_is_down_p, LatestAuthorDownP},
{rank_latest, Rank_latest},
{rank_newprop, Rank_newprop}]}),
if
AmExcludedFromLatestAll_p ->
?REACT({a40, ?LINE, [{latest,machi_projection:make_summary(P_latest)}]}),
react_to_env_A50(P_latest, [], S);
AmHosedP ->
ExpectedUPI = if CMode == cp_mode -> [];
CMode == ap_mode -> [MyName]
@ -1560,12 +1649,10 @@ react_to_env_A40(Retries, P_newprop, P_latest, LatestUnanimousP,
end,
if GoTo50_p ->
?REACT({a40, ?LINE, []}),
%% io:format(user, "CONFIRM debug question line ~w\n", [?LINE]),
FinalProps = [{throttle_seconds, 0}],
react_to_env_A50(P_latest, FinalProps, S);
true ->
?REACT({a40, ?LINE, []}),
io:format(user, "CONFIRM debug question line ~w\n", [?LINE]),
react_to_env_C300(P_newprop, P_latest, S)
end
end.
@ -1575,7 +1662,6 @@ react_to_env_A50(P_latest, FinalProps, #ch_mgr{proj=P_current}=S) ->
?REACT({a50, ?LINE, [{current_epoch, P_current#projection_v1.epoch_number},
{latest_epoch, P_latest#projection_v1.epoch_number},
{final_props, FinalProps}]}),
%% if S#ch_mgr.name == c -> io:format(user, "A50: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end,
V = case file:read_file("/tmp/moomoo."++atom_to_list(S#ch_mgr.name)) of {ok,_} -> true; _ -> false end,
if V -> io:format(user, "A50: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end,
{{no_change, FinalProps, P_current#projection_v1.epoch_number}, S}.
@ -1829,7 +1915,7 @@ react_to_env_C100_inner(Author_latest, NotSanesDict0, _MyName,
S2 = S#ch_mgr{not_sanes=NotSanesDict, sane_transitions=0},
case orddict:fetch(Author_latest, NotSanesDict) of
N when N > ?TOO_FREQUENT_BREAKER ->
%% ?V("\n\nYOYO ~w breaking the cycle of:\n current: ~w\n new : ~w\n", [_MyName, machi_projection:make_summary(S#ch_mgr.proj), machi_projection:make_summary(P_latest)]),
?V("\n\nYOYO ~w breaking the cycle insane-freq=~w by-author=~w of:\n current: ~w\n new : ~w\n", [_MyName, N, Author_latest, machi_projection:make_summary(S#ch_mgr.proj), machi_projection:make_summary(P_latest)]),
?REACT({c100, ?LINE, [{not_sanes_author_count, N}]}),
react_to_env_C103(P_newprop, P_latest, P_current_calc, S2);
N ->
@ -1850,12 +1936,14 @@ react_to_env_C103(#projection_v1{epoch_number=_Epoch_newprop} = _P_newprop,
members_dict=MembersDict} = P_current,
P_none0 = make_none_projection(Epoch_latest,
MyName, All_list, Witness_list, MembersDict),
P_none1 = P_none0#projection_v1{dbg=[{none_projection,true}]},
ChainName = P_current#projection_v1.chain_name,
P_none1 = P_none0#projection_v1{chain_name=ChainName,
dbg=[{none_projection,true}]},
P_none = machi_projection:update_checksum(P_none1),
?REACT({c103, ?LINE,
[{current_epoch, P_current#projection_v1.epoch_number},
{none_projection_epoch, P_none#projection_v1.epoch_number}]}),
io:format(user, "SET add_admin_down(~w) at ~w =====================================\n", [MyName, time()]),
io:format(user, "SET add_admin_down(~w) at ~w current_epoch ~w none_proj_epoch ~w =====================================\n", [MyName, time(), P_current#projection_v1.epoch_number, P_none#projection_v1.epoch_number]),
machi_fitness:add_admin_down(S#ch_mgr.fitness_svr, MyName, []),
timer:sleep(5*1000),
io:format(user, "SET delete_admin_down(~w) at ~w =====================================\n", [MyName, time()]),
@ -1892,7 +1980,7 @@ react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) ->
%% In contrast to this private projection store write, Humming Consensus
%% doesn't care about the status of writes to the public store: it
%% relies only on successful reads of the public store.
case {?FLU_PC:write_projection(MyStorePid, private, P_latest2,?TO*30),Goo} of
case {?FLU_PC:write_projection(MyStorePid, private, P_latest2,?TO*30+66),Goo} of
{ok, Goo} ->
?REACT({c110, [{write, ok}]}),
react_to_env_C111(P_latest, P_latest2, Extra1, MyStorePid, S);
@ -1978,7 +2066,6 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H,
?REACT(c120),
H2 = add_and_trunc_history(P_latest, H, ?MAX_HISTORY_LENGTH),
%% diversion_c120_verbose_goop(P_latest, S),
?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}),
S2 = set_proj(S#ch_mgr{proj_history=H2,
sane_transitions=Xtns + 1}, P_latest),
@ -1986,20 +2073,21 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H,
false ->
S2;
{{_ConfEpoch, _ConfCSum}, ConfTime} ->
io:format(user, "\nCONFIRM debug C120 ~w was annotated ~w\n", [S#ch_mgr.name, P_latest#projection_v1.epoch_number]),
P_latestEpoch = P_latest#projection_v1.epoch_number,
io:format(user, "\nCONFIRM debug C120 ~w was annotated ~w\n", [S#ch_mgr.name, P_latestEpoch]),
S2#ch_mgr{proj_unanimous=ConfTime}
end,
V = case file:read_file("/tmp/moomoo."++atom_to_list(S#ch_mgr.name)) of {ok,_} -> true; _ -> false end,
if V -> io:format("C120: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end,
{{now_using, FinalProps, P_latest#projection_v1.epoch_number}, S3}.
add_and_trunc_history(P_latest, H, MaxLength) ->
add_and_trunc_history(#projection_v1{epoch_number=0}, H, _MaxLength) ->
H;
add_and_trunc_history(#projection_v1{} = P_latest, H, MaxLength) ->
Latest_U_R = {P_latest#projection_v1.upi, P_latest#projection_v1.repairing},
H2 = if P_latest#projection_v1.epoch_number > 0 ->
queue:in(Latest_U_R, H);
true ->
H
end,
add_and_trunc_history(Latest_U_R, H, MaxLength);
add_and_trunc_history(Item, H, MaxLength) ->
H2 = queue:in(Item, H),
case queue:len(H2) of
X when X > MaxLength ->
{_V, Hxx} = queue:out(H2),
@ -2012,11 +2100,10 @@ react_to_env_C200(Retries, P_latest, S) ->
?REACT(c200),
try
AuthorProxyPid = proxy_pid(P_latest#projection_v1.author_server, S),
?FLU_PC:kick_projection_reaction(AuthorProxyPid, [])
%% This is just advisory, we don't need a sync reply.
?FLU_PC:kick_projection_reaction(AuthorProxyPid, [], 100)
catch _Type:_Err ->
%% ?V("TODO: tell_author_yo is broken: ~p ~p\n",
%% [_Type, _Err]),
ok
ok
end,
react_to_env_C210(Retries, S).
@ -2206,6 +2293,7 @@ projection_transition_is_sane_except_si_epoch(
creation_time=CreationTime1,
mode=CMode1,
author_server=AuthorServer1,
chain_name=ChainName1,
all_members=All_list1,
witnesses=Witness_list1,
down=Down_list1,
@ -2217,6 +2305,7 @@ projection_transition_is_sane_except_si_epoch(
creation_time=CreationTime2,
mode=CMode2,
author_server=AuthorServer2,
chain_name=ChainName2,
all_members=All_list2,
witnesses=Witness_list2,
down=Down_list2,
@ -2237,7 +2326,8 @@ projection_transition_is_sane_except_si_epoch(
true = is_binary(CSum1) andalso is_binary(CSum2),
{_,_,_} = CreationTime1,
{_,_,_} = CreationTime2,
true = is_atom(AuthorServer1) andalso is_atom(AuthorServer2), % todo type may change?
true = is_atom(AuthorServer1) andalso is_atom(AuthorServer2),
true = is_atom(ChainName1) andalso is_atom(ChainName2),
true = is_list(All_list1) andalso is_list(All_list2),
true = is_list(Witness_list1) andalso is_list(Witness_list2),
true = is_list(Down_list1) andalso is_list(Down_list2),
@ -2249,6 +2339,9 @@ projection_transition_is_sane_except_si_epoch(
%% projection_transition_is_sane_with_si_epoch().
true = Epoch2 >= Epoch1,
%% Don't change chain names in the middle of the stream.
true = (ChainName1 == ChainName2),
%% No duplicates
true = lists:sort(Witness_list2) == lists:usort(Witness_list2),
true = lists:sort(Down_list2) == lists:usort(Down_list2),
@ -2256,7 +2349,7 @@ projection_transition_is_sane_except_si_epoch(
true = lists:sort(Repairing_list2) == lists:usort(Repairing_list2),
%% Disjoint-ness
All_list1 = All_list2, % todo will probably change
%% %% %% %% %% %% %% %% All_list1 = All_list2, % todo will probably change
%% true = lists:sort(All_list2) == lists:sort(Down_list2 ++ UPI_list2 ++
%% Repairing_list2),
[] = [X || X <- Witness_list2, not lists:member(X, All_list2)],
@ -2361,8 +2454,7 @@ poll_private_proj_is_upi_unanimous_sleep(Count, #ch_mgr{runenv=RunEnv}=S) ->
S2
end.
poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current,
opts=MgrOpts} = S) ->
poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current} = S) ->
UPI = P_current#projection_v1.upi,
EpochID = machi_projection:make_epoch_id(P_current),
{Rs, S2} = read_latest_projection_call_only2(private, UPI, S),
@ -2395,33 +2487,30 @@ poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current,
Annotation = make_annotation(EpochID, Now),
NewDbg2 = [Annotation|P_currentFull#projection_v1.dbg2],
NewProj = P_currentFull#projection_v1{dbg2=NewDbg2},
ProjStore = case get_projection_store_regname(MgrOpts) of
undefined ->
machi_flu_psup:make_proj_supname(MyName);
PStr ->
PStr
end,
ProjStore = get_projection_store_pid_or_regname(S),
#projection_v1{epoch_number=_EpochRep,
epoch_csum= <<_CSumRep:4/binary,_/binary>>,
author_server=AuthRep,
upi=_UPIRep,
repairing=_RepairingRep} = NewProj,
ok = machi_projection_store:write(ProjStore, private, NewProj),
case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of
case proplists:get_value(private_write_verbose_confirm, S#ch_mgr.opts) of
true ->
io:format(user, "\n~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), _EpochRep, _CSumRep, _UPIRep, _RepairingRep, MyName]);
error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w rep ~w auth ~w by ~w\n", [_EpochRep, _CSumRep, _UPIRep, _RepairingRep, AuthRep, MyName]);
_ ->
ok
end,
%% Unwedge our FLU.
{ok, NotifyPid} = machi_projection_store:get_wedge_notify_pid(ProjStore),
_ = machi_flu1:update_wedge_state(NotifyPid, false, EpochID),
S2#ch_mgr{proj_unanimous=Now};
#ch_mgr{proj_history=H} = S2,
H2 = add_and_trunc_history({confirm, Epoch}, H,
?MAX_HISTORY_LENGTH),
S2#ch_mgr{proj_unanimous=Now, proj_history=H2};
_ ->
S2
end;
_Else ->
%% io:format(user, "poll by ~w: want ~W got ~W\n",
%% [MyName, EpochID, 6, _Else, 8]),
S2
end.
@ -2457,6 +2546,14 @@ gobble_calls(StaticCall) ->
ok
end.
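%% Drain any tick_check_environment messages already queued in our
%% mailbox, so that a burst of timer ticks collapses into a single
%% do_react_to_env/1 pass instead of one pass per tick.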
gobble_ticks() ->
receive
tick_check_environment ->
gobble_ticks()
after 0 ->
ok
end.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
perhaps_start_repair(#ch_mgr{name=MyName,
@ -2472,12 +2569,13 @@ perhaps_start_repair(#ch_mgr{name=MyName,
%% RepairOpts = [{repair_mode, check}, verbose],
RepairFun = fun() -> do_repair(S, RepairOpts, CMode) end,
LastUPI = lists:last(UPI),
StabilityTime = application:get_env(machi, stability_time, ?REPAIR_START_STABILITY_TIME),
IgnoreStabilityTime_p = proplists:get_value(ignore_stability_time,
S#ch_mgr.opts, false),
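%% Illustrative tuning (value hypothetical):
%% application:set_env(machi, stability_time, 30) makes repair wait
%% until the chain has been stable for 30 seconds, while an
%% {ignore_stability_time, true} manager option skips the wait entirely.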
case timer:now_diff(os:timestamp(), Start) div 1000000 of
N when MyName == LastUPI andalso
(IgnoreStabilityTime_p orelse
N >= ?REPAIR_START_STABILITY_TIME) ->
N >= StabilityTime) ->
{WorkerPid, _Ref} = spawn_monitor(RepairFun),
S#ch_mgr{repair_worker=WorkerPid,
repair_start=os:timestamp(),
@ -2518,8 +2616,8 @@ do_repair(#ch_mgr{name=MyName,
T1 = os:timestamp(),
RepairId = proplists:get_value(repair_id, Opts, id1),
error_logger:info_msg(
"Repair start: tail ~p of ~p -> ~p, ~p ID ~w\n",
[MyName, UPI0, Repairing, RepairMode, RepairId]),
"Repair ~w start: tail ~p of ~p -> ~p, ~p\n",
[RepairId, MyName, UPI0, Repairing, RepairMode]),
UPI = UPI0 -- Witness_list,
Res = machi_chain_repair:repair(RepairMode, MyName, Repairing, UPI,
@ -2532,10 +2630,9 @@ do_repair(#ch_mgr{name=MyName,
end,
Stats = [{K, ets:lookup_element(ETS, K, 2)} || K <- ETS_T_Keys],
error_logger:info_msg(
"Repair ~s: tail ~p of ~p finished ~p repair ID ~w: "
"~p\nStats ~p\n",
[Summary, MyName, UPI0, RepairMode, RepairId,
Res, Stats]),
"Repair ~w ~s: tail ~p of ~p finished ~p: "
"~p Stats: ~p\n",
[RepairId, Summary, MyName, UPI0, RepairMode, Res, Stats]),
ets:delete(ETS),
exit({repair_final_status, Res});
_ ->
@ -2772,6 +2869,7 @@ full_majority_size(L) when is_list(L) ->
full_majority_size(length(L)).
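%% E.g. a 4-server chain needs 3 servers for a full majority, assuming
%% the usual (N div 2) + 1 definition in the integer clause; CP mode
%% uses this to decide whether UPI ++ Repairing can safely form a chain.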
make_zerf(#projection_v1{epoch_number=OldEpochNum,
chain_name=ChainName,
all_members=AllMembers,
members_dict=MembersDict,
witnesses=OldWitness_list
@ -2794,7 +2892,8 @@ make_zerf(#projection_v1{epoch_number=OldEpochNum,
MyName, AllMembers, OldWitness_list,
MembersDict),
machi_projection:update_checksum(
P#projection_v1{mode=cp_mode,
P#projection_v1{chain_name=ChainName,
mode=cp_mode,
dbg2=[zerf_none,{up,Up},{maj,MajoritySize}]});
true ->
make_zerf2(OldEpochNum, Up, MajoritySize, MyName,
@ -2809,7 +2908,6 @@ make_zerf2(OldEpochNum, Up, MajoritySize, MyName, AllMembers, OldWitness_list,
Proj2 = Proj#projection_v1{dbg2=[{make_zerf,Epoch},
{yyy_hack, get(yyy_hack)},
{up,Up},{maj,MajoritySize}]},
%% io:format(user, "ZERF ~w\n",[machi_projection:make_summary(Proj2)]),
Proj2
catch
throw:{zerf,no_common} ->
@ -2886,41 +2984,36 @@ zerf_find_last_annotated(FLU, MajoritySize, S) ->
[] % lists:flatten() will destroy
end.
perhaps_verbose_c111(P_latest2, S) ->
case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of
true ->
perhaps_verbose_c111(P_latest2, #ch_mgr{name=MyName, opts=Opts}=S) ->
PrivWriteVerb = proplists:get_value(private_write_verbose, Opts, false),
PrivWriteVerbCONFIRM = proplists:get_value(private_write_verbose_confirm, Opts, false),
if PrivWriteVerb orelse PrivWriteVerbCONFIRM ->
Dbg2X = lists:keydelete(react, 1,
P_latest2#projection_v1.dbg2) ++
[{is_annotated,is_annotated(P_latest2)}],
P_latest2x = P_latest2#projection_v1{dbg2=Dbg2X}, % limit verbose len.
Last2 = get(last_verbose),
Summ2 = machi_projection:make_summary(P_latest2x),
if P_latest2#projection_v1.upi == [],
(S#ch_mgr.proj)#projection_v1.upi /= [] ->
<<CSumRep:4/binary,_/binary>> =
P_latest2#projection_v1.epoch_csum,
io:format(user, "\n~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), (S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, S#ch_mgr.name]);
if PrivWriteVerb, Summ2 /= Last2 ->
put(last_verbose, Summ2),
error_logger:info_msg("~p uses plain: ~w \n",
[MyName, Summ2]);
true ->
ok
end,
case proplists:get_value(private_write_verbose,
S#ch_mgr.opts) of
true when Summ2 /= Last2 ->
put(last_verbose, Summ2),
?V("\n~s ~p uses plain: ~w \n",
[machi_util:pretty_time(), S#ch_mgr.name, Summ2]);
_ ->
if PrivWriteVerbCONFIRM,
P_latest2#projection_v1.upi == [],
(S#ch_mgr.proj)#projection_v1.upi /= [] ->
<<CSumRep:4/binary,_/binary>> =
P_latest2#projection_v1.epoch_csum,
error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w rep ~w auth ~w by ~w\n", [(S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, P_latest2#projection_v1.author_server, S#ch_mgr.name]);
true ->
ok
end;
_ ->
true ->
ok
end.
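%% Consistency mode is derived purely from witness membership: any
%% witness server present forces cp_mode, a witness-free chain runs
%% ap_mode.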
calc_consistency_mode(_Witness_list = []) ->
ap_mode;
calc_consistency_mode(_Witness_list) ->
cp_mode.
set_proj(S, Proj) ->
S#ch_mgr{proj=Proj, proj_unanimous=false}.
@ -2953,3 +3046,10 @@ get_unfit_list(FitnessServer) ->
[]
end.
get_projection_store_pid_or_regname(#ch_mgr{name=MyName, opts=MgrOpts}) ->
case get_projection_store_regname(MgrOpts) of
undefined ->
machi_flu_psup:make_proj_supname(MyName);
PStr ->
PStr
end.


@ -103,7 +103,8 @@ repair(ap_mode=ConsistencyMode, Src, Repairing, UPI, MembersDict, ETS, Opts) ->
Add = fun(Name, Pid) -> put(proxies_dict, orddict:store(Name, Pid, get(proxies_dict))) end,
OurFLUs = lists:usort([Src] ++ Repairing ++ UPI), % AP assumption!
RepairMode = proplists:get_value(repair_mode, Opts, repair),
Verb = proplists:get_value(verbose, Opts, true),
Verb = proplists:get_value(verbose, Opts, false),
RepairId = proplists:get_value(repair_id, Opts, id1),
Res = try
_ = [begin
{ok, Proxy} = machi_proxy_flu1_client:start_link(P),
@ -116,31 +117,39 @@ repair(ap_mode=ConsistencyMode, Src, Repairing, UPI, MembersDict, ETS, Opts) ->
get_file_lists(Proxy, FLU, Dict)
end, D, ProxiesDict),
MissingFileSummary = make_missing_file_summary(D2, OurFLUs),
?VERB("MissingFileSummary ~p\n", [MissingFileSummary]),
%% ?VERB("~w MissingFileSummary ~p\n",[RepairId,MissingFileSummary]),
lager:info("Repair ~w MissingFileSummary ~p\n",
[RepairId, MissingFileSummary]),
[ets:insert(ETS, {{directive_bytes, FLU}, 0}) || FLU <- OurFLUs],
%% Repair files from perspective of Src, i.e. tail(UPI).
SrcProxy = orddict:fetch(Src, ProxiesDict),
{ok, EpochID} = machi_proxy_flu1_client:get_epoch_id(
SrcProxy, ?SHORT_TIMEOUT),
?VERB("Make repair directives: "),
%% ?VERB("Make repair directives: "),
Ds =
[{File, make_repair_directives(
ConsistencyMode, RepairMode, File, Size, EpochID,
Verb,
Src, OurFLUs, ProxiesDict, ETS)} ||
{File, {Size, _MissingList}} <- MissingFileSummary],
?VERB(" done\n"),
%% ?VERB(" done\n"),
lager:info("Repair ~w repair directives finished\n", [RepairId]),
[begin
[{_, Bytes}] = ets:lookup(ETS, {directive_bytes, FLU}),
?VERB("Out-of-sync data for FLU ~p: ~s MBytes\n",
[FLU, mbytes(Bytes)])
%% ?VERB("Out-of-sync data for FLU ~p: ~s MBytes\n",
%% [FLU, mbytes(Bytes)]),
lager:info("Repair ~w "
"Out-of-sync data for FLU ~p: ~s MBytes\n",
[RepairId, FLU, mbytes(Bytes)]),
ok
end || FLU <- OurFLUs],
?VERB("Execute repair directives: "),
%% ?VERB("Execute repair directives: "),
ok = execute_repair_directives(ConsistencyMode, Ds, Src, EpochID,
Verb, OurFLUs, ProxiesDict, ETS),
?VERB(" done\n"),
%% ?VERB(" done\n"),
lager:info("Repair ~w repair directives finished\n", [RepairId]),
ok
catch
What:Why ->
@ -198,7 +207,7 @@ make_repair_compare_fun(SrcFLU) ->
T_a =< T_b
end.
make_repair_directives(ConsistencyMode, RepairMode, File, Size, EpochID,
make_repair_directives(ConsistencyMode, RepairMode, File, Size, _EpochID,
Verb, Src, FLUs0, ProxiesDict, ETS) ->
true = (Size < ?MAX_OFFSET),
FLUs = lists:usort(FLUs0),
@ -207,7 +216,7 @@ make_repair_directives(ConsistencyMode, RepairMode, File, Size, EpochID,
Proxy = orddict:fetch(FLU, ProxiesDict),
OffSzCs =
case machi_proxy_flu1_client:checksum_list(
Proxy, EpochID, File, ?LONG_TIMEOUT) of
Proxy, File, ?LONG_TIMEOUT) of
{ok, InfoBin} ->
machi_csum_table:split_checksum_list_blob_decode(InfoBin);
{error, no_such_file} ->
@ -227,7 +236,6 @@ make_repair_directives(ConsistencyMode, RepairMode, File, Size, EpochID,
make_repair_directives2(C2, ConsistencyMode, RepairMode,
File, Verb, Src, FLUs, ProxiesDict, ETS) ->
?VERB("."),
make_repair_directives3(C2, ConsistencyMode, RepairMode,
File, Verb, Src, FLUs, ProxiesDict, ETS, []).
@ -257,7 +265,18 @@ make_repair_directives3([{Offset, Size, CSum, _FLU}=A|Rest0],
%% byte range from all FLUs
%% 3b. Log big warning about data loss.
%% 4. Log any other checksum discrepancies as they are found.
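%% Until the full sanity check exists, gather each FLU's view of the
%% conflicting byte range so the exit report below shows what every
%% replica returned for this offset.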
exit({todo_repair_sanity_check, ?LINE, File, Offset, As})
QQ = [begin
Pxy = orddict:fetch(FLU, ProxiesDict),
{ok, EpochID} = machi_proxy_flu1_client:get_epoch_id(
Pxy, ?SHORT_TIMEOUT),
NSInfo = undefined,
XX = machi_proxy_flu1_client:read_chunk(
Pxy, NSInfo, EpochID, File, Offset, Size, undefined,
?SHORT_TIMEOUT),
{FLU, XX}
end || {__Offset, __Size, __CSum, FLU} <- As],
exit({todo_repair_sanity_check, ?LINE, File, Offset, {as,As}, {qq,QQ}})
end,
%% List construction guarantees us that there's at least one ?MAX_OFFSET
%% item remains. Sort order + our "taking" of all exact Offset+Size
@ -310,23 +329,25 @@ execute_repair_directives(ap_mode=_ConsistencyMode, Ds, _Src, EpochID, Verb,
{ProxiesDict, EpochID, Verb, ETS}, Ds),
ok.
execute_repair_directive({File, Cmds}, {ProxiesDict, EpochID, Verb, ETS}=Acc) ->
execute_repair_directive({File, Cmds}, {ProxiesDict, EpochID, _Verb, ETS}=Acc) ->
EtsKeys = [{in_files, t_in_files}, {in_chunks, t_in_chunks},
{in_bytes, t_in_bytes}, {out_files, t_out_files},
{out_chunks, t_out_chunks}, {out_bytes, t_out_bytes}],
[ets:insert(ETS, {L_K, 0}) || {L_K, _T_K} <- EtsKeys],
F = fun({copy, {Offset, Size, TaggedCSum, MySrc}, MyDsts}, Acc2) ->
SrcP = orddict:fetch(MySrc, ProxiesDict),
case ets:lookup_element(ETS, in_chunks, 2) rem 100 of
0 -> ?VERB(".", []);
_ -> ok
end,
%% case ets:lookup_element(ETS, in_chunks, 2) rem 100 of
%% 0 -> ?VERB(".2", []);
%% _ -> ok
%% end,
_T1 = os:timestamp(),
%% TODO: support case multiple written or trimmed chunks returned
{ok, {[{_, Offset, Chunk, _}], _}} =
NSInfo = undefined,
{ok, {[{_, Offset, Chunk, _ReadCSum}|OtherChunks], []=_TrimmedList}} =
machi_proxy_flu1_client:read_chunk(
SrcP, EpochID, File, Offset, Size, [],
SrcP, NSInfo, EpochID, File, Offset, Size, undefined,
?SHORT_TIMEOUT),
[] = OtherChunks,
_T2 = os:timestamp(),
<<_Tag:1/binary, CSum/binary>> = TaggedCSum,
case machi_util:checksum_chunk(Chunk) of
@ -335,7 +356,7 @@ execute_repair_directive({File, Cmds}, {ProxiesDict, EpochID, Verb, ETS}=Acc) ->
DstP = orddict:fetch(DstFLU, ProxiesDict),
_T3 = os:timestamp(),
ok = machi_proxy_flu1_client:write_chunk(
DstP, EpochID, File, Offset, Chunk,
DstP, NSInfo, EpochID, File, Offset, Chunk, TaggedCSum,
?SHORT_TIMEOUT),
_T4 = os:timestamp()
end || DstFLU <- MyDsts],


@ -85,8 +85,8 @@ fitness(FluName) ->
-spec flu1(atom()) -> [{atom(), term()}].
flu1(FluName) ->
State = machi_flu1:current_state(FluName),
machi_flu1:format_state(State).
State = machi_flu1_append_server:current_state(FluName),
machi_flu1_append_server:format_state(State).
%% Internal functions


@ -21,8 +21,9 @@
%% @doc Erlang API for the Machi client-implemented Chain Replication
%% (CORFU-style) protocol.
%%
%% See also the docs for {@link machi_flu1_client} for additional
%% details on data types and operation descriptions.
%% Please see the "Client API implementation notes" section of
%% {@link machi_flu1_client} for how this module relates to the rest
%% of the client API implementation.
%%
%% The API here is much simpler than the {@link machi_flu1_client} or
%% {@link machi_proxy_flu1_client} APIs. This module's API is a
@ -43,64 +44,6 @@
%%
%% Doc TODO: Once this API stabilizes, add all relevant data type details
%% to the EDoc here.
%%
%%
%% === Missing API features ===
%%
%% So far, there is one missing client API feature that ought to be
%% added to Machi in the near future: more flexible checksum
%% management.
%%
%% Add a `source' annotation to all checksums to indicate where the
%% checksum was calculated. For example,
%%
%% <ul>
%%
%% <li> Calculated by client that performed the original chunk append,
%% </li>
%%
%% <li> Calculated by the 1st Machi server to receive an
%% un-checksummed append request
%% </li>
%%
%% <li> Re-calculated by Machi to manage fewer checksums of blocks of
%% data larger than the original client-specified chunks.
%% </li>
%% </ul>
%%
%% Client-side checksums would be the "strongest" type of
%% checksum, meaning that any data corruption (of the original
%% data and/or of the checksum itself) can be detected after the
%% client-side calculation. There are too many horror stories on
%% The Net about IP PDUs that are corrupted but unnoticed due to
%% weak TCP checksums, buggy hardware, buggy OS drivers, etc.
%% Checksum versioning is also desirable if/when the current checksum
%% implementation changes from SHA-1 to something else.
%%
%%
%% === Implementation notes ===
%%
%% The major operation processing is implemented in a state machine-like
%% manner. Before attempting an operation `X', there's an initial
%% operation `pre-X' that takes care of updating the epoch id,
%% restarting client protocol proxies, and if there's any server
%% instability (e.g. some server is wedged), then insert some sleep
%% time. When the chain appears to have stabilized, then we try the `X'
%% operation again.
%%
%% Function name for the `pre-X' stuff is usually `X()', and the
%% function name for the `X' stuff is usually `X2()'. (I.e., the `X'
%% stuff follows after `pre-X' and therefore has a `2' suffix on the
%% function name.)
%%
%% In the case of read repair, there are two stages: find the value to
%% perform the repair, then perform the repair writes. In the case of
%% the repair writes, the `pre-X' function is named `read_repair3()',
%% and the `X' function is named `read_repair4()'.
%%
%% TODO: It would be nifty to lift the very-nearly-but-not-quite-boilerplate
%% of the `pre-X' functions into a single common function ... but I'm not
%% sure yet on how to do it without making the code uglier.
-module(machi_cr_client).
@ -118,13 +61,11 @@
%% FLU1 API
-export([
%% File API
append_chunk/3, append_chunk/4,
append_chunk/5, append_chunk/6,
append_chunk_extra/4, append_chunk_extra/5,
append_chunk_extra/6, append_chunk_extra/7,
write_chunk/4, write_chunk/5,
read_chunk/5, read_chunk/6,
trim_chunk/4, trim_chunk/5,
append_chunk/5,
append_chunk/6, append_chunk/7,
write_chunk/6, write_chunk/7,
read_chunk/6, read_chunk/7,
trim_chunk/5, trim_chunk/6,
checksum_list/2, checksum_list/3,
list_files/1, list_files/2,
@ -137,8 +78,8 @@
terminate/2, code_change/3]).
-define(FLU_PC, machi_proxy_flu1_client).
-define(TIMEOUT, 2*1000).
-define(DEFAULT_TIMEOUT, 10*1000).
-define(TIMEOUT, 10*1000).
-define(DEFAULT_TIMEOUT, ?TIMEOUT*5).
-define(MAX_RUNTIME, 8*1000).
-define(WORST_PROJ, #projection_v1{epoch_number=0,epoch_csum= <<>>,
members_dict=[]}).
@ -165,101 +106,61 @@ start_link(P_srvr_list, Opts) ->
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
append_chunk(PidSpec, Prefix, Chunk) ->
append_chunk_extra(PidSpec, ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR,
Prefix, Chunk, 0, ?DEFAULT_TIMEOUT).
append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum) ->
append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum, #append_opts{}, ?DEFAULT_TIMEOUT).
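%% A hedged usage sketch (bindings hypothetical): NSInfo may be passed
%% as 'undefined' and is normalized by machi_util:ns_info_default/1 in
%% append_chunk/7 below; the checksum argument is whatever client-side
%% checksum form the FLU API expects, shown here as a plain SHA-1 hash.
example_append(Pid, Chunk) ->
    CSum = crypto:hash(sha, Chunk),
    {ok, {_Offset, _Size, _File}} =
        append_chunk(Pid, undefined, <<"pre">>, Chunk, CSum).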
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
append_chunk(PidSpec, Prefix, Chunk, Timeout) ->
append_chunk_extra(PidSpec, ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR,
Prefix, Chunk, 0, Timeout).
append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum, #append_opts{}=Opts) ->
append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum, Opts, ?DEFAULT_TIMEOUT).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
append_chunk(PidSpec, CoC_Namespace, CoC_Locator, Prefix, Chunk) ->
append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator,
Prefix, Chunk, 0, ?DEFAULT_TIMEOUT).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
append_chunk(PidSpec, CoC_Namespace, CoC_Locator, Prefix, Chunk, Timeout) ->
append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator,
Prefix, Chunk, 0, Timeout).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra)
when is_integer(ChunkExtra), ChunkExtra >= 0 ->
append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra, ?DEFAULT_TIMEOUT).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra, Timeout0) ->
append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum, #append_opts{}=Opts, Timeout0) ->
NSInfo2 = machi_util:ns_info_default(NSInfo),
{TO, Timeout} = timeout(Timeout0),
gen_server:call(PidSpec, {req, {append_chunk_extra,
?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR,
Prefix,
Chunk, ChunkExtra, TO}},
Timeout).
append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator, Prefix, Chunk, ChunkExtra)
when is_integer(ChunkExtra), ChunkExtra >= 0 ->
append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator,
Prefix, Chunk, ChunkExtra, ?DEFAULT_TIMEOUT).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator,
Prefix, Chunk, ChunkExtra, Timeout0) ->
{TO, Timeout} = timeout(Timeout0),
gen_server:call(PidSpec, {req, {append_chunk_extra,
CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, TO}},
gen_server:call(PidSpec, {req, {append_chunk,
NSInfo2, Prefix, Chunk, CSum, Opts, TO}},
Timeout).
%% @doc Write a chunk of data (that has already been
%% allocated/sequenced by an earlier append_chunk_extra() call) to
%% allocated/sequenced by an earlier append_chunk() call) to
%% `File' at `Offset'.
write_chunk(PidSpec, File, Offset, Chunk) ->
write_chunk(PidSpec, File, Offset, Chunk, ?DEFAULT_TIMEOUT).
write_chunk(PidSpec, NSInfo, File, Offset, Chunk, CSum) ->
write_chunk(PidSpec, NSInfo, File, Offset, Chunk, CSum, ?DEFAULT_TIMEOUT).
%% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
write_chunk(PidSpec, File, Offset, Chunk, Timeout0) ->
write_chunk(PidSpec, NSInfo, File, Offset, Chunk, CSum, Timeout0) ->
{TO, Timeout} = timeout(Timeout0),
gen_server:call(PidSpec, {req, {write_chunk, File, Offset, Chunk, TO}},
gen_server:call(PidSpec, {req, {write_chunk, NSInfo, File, Offset, Chunk, CSum, TO}},
Timeout).
%% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
read_chunk(PidSpec, File, Offset, Size, Opts) ->
read_chunk(PidSpec, File, Offset, Size, Opts, ?DEFAULT_TIMEOUT).
read_chunk(PidSpec, NSInfo, File, Offset, Size, Opts) ->
read_chunk(PidSpec, NSInfo, File, Offset, Size, Opts, ?DEFAULT_TIMEOUT).
%% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
read_chunk(PidSpec, File, Offset, Size, Opts, Timeout0) ->
read_chunk(PidSpec, NSInfo, File, Offset, Size, Opts, Timeout0) ->
{TO, Timeout} = timeout(Timeout0),
gen_server:call(PidSpec, {req, {read_chunk, File, Offset, Size, Opts, TO}},
gen_server:call(PidSpec, {req, {read_chunk, NSInfo, File, Offset, Size, Opts, TO}},
Timeout).
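%% A hedged read-back sketch: Opts is passed through to the FLU client
%% ('undefined' is used elsewhere in this changeset for default read
%% options), and the reply pairs the found chunks with any trimmed ranges.
example_read(Pid, File, Offset, Size) ->
    {ok, {_Chunks, _Trimmed}} =
        read_chunk(Pid, undefined, File, Offset, Size, undefined).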
%% @doc Trim a chunk of data of size `Size' from `File' at `Offset'.
trim_chunk(PidSpec, File, Offset, Size) ->
trim_chunk(PidSpec, File, Offset, Size, ?DEFAULT_TIMEOUT).
trim_chunk(PidSpec, NSInfo, File, Offset, Size) ->
trim_chunk(PidSpec, NSInfo, File, Offset, Size, ?DEFAULT_TIMEOUT).
%% @doc Trim a chunk of data of size `Size' from `File' at `Offset'.
trim_chunk(PidSpec, File, Offset, Size, Timeout0) ->
trim_chunk(PidSpec, NSInfo, File, Offset, Size, Timeout0) ->
{TO, Timeout} = timeout(Timeout0),
gen_server:call(PidSpec, {req, {trim_chunk, File, Offset, Size, TO}},
gen_server:call(PidSpec, {req, {trim_chunk, NSInfo, File, Offset, Size, TO}},
Timeout).
%% @doc Fetch the list of chunk checksums for `File'.
@ -324,28 +225,27 @@ code_change(_OldVsn, S, _Extra) ->
%%%%%%%%%%%%%%%%%%%%%%%%%%%
handle_call2({append_chunk_extra, CoC_Namespace, CoC_Locator,
Prefix, Chunk, ChunkExtra, TO}, _From, S) ->
do_append_head(CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, 0, os:timestamp(), TO, S);
handle_call2({write_chunk, File, Offset, Chunk, TO}, _From, S) ->
do_write_head(File, Offset, Chunk, 0, os:timestamp(), TO, S);
handle_call2({read_chunk, File, Offset, Size, Opts, TO}, _From, S) ->
do_read_chunk(File, Offset, Size, Opts, 0, os:timestamp(), TO, S);
handle_call2({trim_chunk, File, Offset, Size, TO}, _From, S) ->
do_trim_chunk(File, Offset, Size, 0, os:timestamp(), TO, S);
handle_call2({append_chunk, NSInfo,
Prefix, Chunk, CSum, Opts, TO}, _From, S) ->
do_append_head(NSInfo, Prefix,
Chunk, CSum, Opts, 0, os:timestamp(), TO, S);
handle_call2({write_chunk, NSInfo, File, Offset, Chunk, CSum, TO}, _From, S) ->
do_write_head(NSInfo, File, Offset, Chunk, CSum, 0, os:timestamp(), TO, S);
handle_call2({read_chunk, NSInfo, File, Offset, Size, Opts, TO}, _From, S) ->
do_read_chunk(NSInfo, File, Offset, Size, Opts, 0, os:timestamp(), TO, S);
handle_call2({trim_chunk, NSInfo, File, Offset, Size, TO}, _From, S) ->
do_trim_chunk(NSInfo, File, Offset, Size, 0, os:timestamp(), TO, S);
handle_call2({checksum_list, File, TO}, _From, S) ->
do_checksum_list(File, 0, os:timestamp(), TO, S);
handle_call2({list_files, TO}, _From, S) ->
do_list_files(0, os:timestamp(), TO, S).
do_append_head(CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, 0=Depth, STime, TO, S) ->
do_append_head2(CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, Depth + 1, STime, TO, S);
do_append_head(CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, Depth, STime, TO, #state{proj=P}=S) ->
%% io:format(user, "head sleep1,", []),
do_append_head(NSInfo, Prefix,
Chunk, CSum, Opts, 0=Depth, STime, TO, S) ->
do_append_head2(NSInfo, Prefix,
Chunk, CSum, Opts, Depth + 1, STime, TO, S);
do_append_head(NSInfo, Prefix,
Chunk, CSum, Opts, Depth, STime, TO, #state{proj=P}=S) ->
sleep_a_while(Depth),
DiffMs = timer:now_diff(os:timestamp(), STime) div 1000,
if DiffMs > TO ->
@ -359,62 +259,61 @@ do_append_head(CoC_Namespace, CoC_Locator, Prefix,
case S2#state.proj of
P2 when P2 == undefined orelse
P2#projection_v1.upi == [] ->
do_append_head(CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, Depth + 1,
do_append_head(NSInfo, Prefix,
Chunk, CSum, Opts, Depth + 1,
STime, TO, S2);
_ ->
do_append_head2(CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, Depth + 1,
do_append_head2(NSInfo, Prefix,
Chunk, CSum, Opts, Depth + 1,
STime, TO, S2)
end
end.
do_append_head2(CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, Depth, STime, TO,
do_append_head2(NSInfo, Prefix,
Chunk, CSum, Opts, Depth, STime, TO,
#state{proj=P}=S) ->
[HeadFLU|_RestFLUs] = mutation_flus(P),
case is_witness_flu(HeadFLU, P) of
true ->
case witnesses_use_our_epoch(S) of
true ->
do_append_head3(CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, Depth,
do_append_head3(NSInfo, Prefix,
Chunk, CSum, Opts, Depth,
STime, TO, S);
false ->
%% Bummer, go back to the beginning and retry.
do_append_head(CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, Depth,
do_append_head(NSInfo, Prefix,
Chunk, CSum, Opts, Depth,
STime, TO, S)
end;
false ->
do_append_head3(CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, Depth, STime, TO, S)
do_append_head3(NSInfo, Prefix,
Chunk, CSum, Opts, Depth, STime, TO, S)
end.
do_append_head3(CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, Depth, STime, TO,
do_append_head3(NSInfo, Prefix,
Chunk, CSum, Opts, Depth, STime, TO,
#state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) ->
[HeadFLU|RestFLUs] = non_witness_flus(mutation_flus(P), P),
Proxy = orddict:fetch(HeadFLU, PD),
case ?FLU_PC:append_chunk_extra(Proxy, EpochID,
CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, ?TIMEOUT) of
case ?FLU_PC:append_chunk(Proxy, NSInfo, EpochID,
Prefix, Chunk, CSum, Opts, ?TIMEOUT) of
{ok, {Offset, _Size, File}=_X} ->
do_append_midtail(RestFLUs, CoC_Namespace, CoC_Locator, Prefix,
File, Offset, Chunk, ChunkExtra,
[HeadFLU], 0, STime, TO, S);
do_wr_app_midtail(RestFLUs, NSInfo, Prefix,
File, Offset, Chunk, CSum, Opts,
[HeadFLU], 0, STime, TO, append, S);
{error, bad_checksum}=BadCS ->
{reply, BadCS, S};
{error, Retry}
when Retry == partition; Retry == bad_epoch; Retry == wedged ->
do_append_head(CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, Depth, STime, TO, S);
do_append_head(NSInfo, Prefix,
Chunk, CSum, Opts, Depth, STime, TO, S);
{error, written} ->
%% Implicit sequencing + this error = we don't know where this
%% written block is. But we lost a race. Repeat, with a new
%% sequencer assignment.
do_append_head(CoC_Namespace, CoC_Locator, Prefix,
Chunk, ChunkExtra, Depth, STime, TO, S);
do_append_head(NSInfo, Prefix,
Chunk, CSum, Opts, Depth, STime, TO, S);
{error, trimmed} = Err ->
%% TODO: behaviour
{reply, Err, S};
@ -423,17 +322,16 @@ do_append_head3(CoC_Namespace, CoC_Locator, Prefix,
Prefix,iolist_size(Chunk)})
end.
do_append_midtail(RestFLUs, CoC_Namespace, CoC_Locator, Prefix,
File, Offset, Chunk, ChunkExtra,
Ws, Depth, STime, TO, S)
do_wr_app_midtail(RestFLUs, NSInfo, Prefix,
File, Offset, Chunk, CSum, Opts,
Ws, Depth, STime, TO, MyOp, S)
when RestFLUs == [] orelse Depth == 0 ->
do_append_midtail2(RestFLUs, CoC_Namespace, CoC_Locator, Prefix,
File, Offset, Chunk, ChunkExtra,
Ws, Depth + 1, STime, TO, S);
do_append_midtail(_RestFLUs, CoC_Namespace, CoC_Locator, Prefix, File,
Offset, Chunk, ChunkExtra,
Ws, Depth, STime, TO, #state{proj=P}=S) ->
%% io:format(user, "midtail sleep2,", []),
do_wr_app_midtail2(RestFLUs, NSInfo, Prefix,
File, Offset, Chunk, CSum, Opts,
Ws, Depth + 1, STime, TO, MyOp, S);
do_wr_app_midtail(_RestFLUs, NSInfo, Prefix, File,
Offset, Chunk, CSum, Opts,
Ws, Depth, STime, TO, MyOp, #state{proj=P}=S) ->
sleep_a_while(Depth),
DiffMs = timer:now_diff(os:timestamp(), STime) div 1000,
if DiffMs > TO ->
@ -447,60 +345,66 @@ do_append_midtail(_RestFLUs, CoC_Namespace, CoC_Locator, Prefix, File,
RestFLUs2 = mutation_flus(P2),
case RestFLUs2 -- Ws of
RestFLUs2 ->
%% None of the writes that we have done so far
%% are to FLUs that are in the RestFLUs2 list.
%% We are pessimistic here and assume that
%% those FLUs are permanently dead. Start
%% over with a new sequencer assignment, at
%% the 2nd half of the impl (we have already
%% slept & refreshed the projection).
if Prefix == undefined -> % atom! not binary()!!
{error, partition};
true ->
do_append_head2(CoC_Namespace, CoC_Locator,
Prefix, Chunk, ChunkExtra,
Depth, STime, TO, S2)
MyOp == append ->
%% None of the writes that we have done so
%% far are to FLUs that are in the
%% RestFLUs2 list. We are pessimistic
%% here and assume that those FLUs are
%% permanently dead. Start over with a
%% new sequencer assignment, at the 2nd
%% half of the impl (we have already slept
%% & refreshed the projection).
do_append_head2(NSInfo,
Prefix, Chunk, CSum, Opts,
Depth, STime, TO, S2);
MyOp == write ->
do_wr_app_midtail2(RestFLUs2,
NSInfo,
Prefix, File, Offset,
Chunk, CSum, Opts,
Ws, Depth + 1, STime, TO,
MyOp, S2)
end;
RestFLUs3 ->
do_append_midtail2(RestFLUs3,
CoC_Namespace, CoC_Locator,
do_wr_app_midtail2(RestFLUs3,
NSInfo,
Prefix, File, Offset,
Chunk, ChunkExtra,
Ws, Depth + 1, STime, TO, S2)
Chunk, CSum, Opts,
Ws, Depth + 1, STime, TO,
MyOp, S2)
end
end
end.
do_append_midtail2([], _CoC_Namespace, _CoC_Locator,
do_wr_app_midtail2([], _NSInfo,
_Prefix, File, Offset, Chunk,
_ChunkExtra, _Ws, _Depth, _STime, _TO, S) ->
%% io:format(user, "ok!\n", []),
_CSum, _Opts, _Ws, _Depth, _STime, _TO, _MyOp, S) ->
{reply, {ok, {Offset, chunk_wrapper_size(Chunk), File}}, S};
do_append_midtail2([FLU|RestFLUs]=FLUs, CoC_Namespace, CoC_Locator,
do_wr_app_midtail2([FLU|RestFLUs]=FLUs, NSInfo,
Prefix, File, Offset, Chunk,
ChunkExtra, Ws, Depth, STime, TO,
CSum, Opts, Ws, Depth, STime, TO, MyOp,
#state{epoch_id=EpochID, proxies_dict=PD}=S) ->
Proxy = orddict:fetch(FLU, PD),
case ?FLU_PC:write_chunk(Proxy, EpochID, File, Offset, Chunk, ?TIMEOUT) of
case ?FLU_PC:write_chunk(Proxy, NSInfo, EpochID, File, Offset, Chunk, CSum, ?TIMEOUT) of
ok ->
%% io:format(user, "write ~w,", [FLU]),
do_append_midtail2(RestFLUs, CoC_Namespace, CoC_Locator, Prefix,
do_wr_app_midtail2(RestFLUs, NSInfo, Prefix,
File, Offset, Chunk,
ChunkExtra, [FLU|Ws], Depth, STime, TO, S);
CSum, Opts, [FLU|Ws], Depth, STime, TO, MyOp, S);
{error, bad_checksum}=BadCS ->
%% TODO: alternate strategy?
{reply, BadCS, S};
{error, Retry}
when Retry == partition; Retry == bad_epoch; Retry == wedged ->
do_append_midtail(FLUs, CoC_Namespace, CoC_Locator, Prefix,
do_wr_app_midtail(FLUs, NSInfo, Prefix,
File, Offset, Chunk,
ChunkExtra, Ws, Depth, STime, TO, S);
CSum, Opts, Ws, Depth, STime, TO, MyOp, S);
{error, written} ->
%% We know what the chunk ought to be, so jump to the
%% middle of read-repair.
Resume = {append, Offset, iolist_size(Chunk), File},
do_repair_chunk(FLUs, Resume, Chunk, [], File, Offset,
do_repair_chunk(FLUs, Resume, Chunk, CSum, [], NSInfo, File, Offset,
iolist_size(Chunk), Depth, STime, S);
{error, trimmed} = Err ->
%% TODO: nothing can be done
@ -520,16 +424,15 @@ witnesses_use_our_epoch([FLU|RestFLUs],
Proxy = orddict:fetch(FLU, PD),
%% Check both that the EpochID is the same *and* not wedged!
case ?FLU_PC:wedge_status(Proxy, ?TIMEOUT) of
{ok, {false, EID}} when EID == EpochID ->
{ok, {false, EID,_,_}} when EID == EpochID ->
witnesses_use_our_epoch(RestFLUs, S);
_Else ->
false
end.
do_write_head(File, Offset, Chunk, 0=Depth, STime, TO, S) ->
do_write_head2(File, Offset, Chunk, Depth + 1, STime, TO, S);
do_write_head(File, Offset, Chunk, Depth, STime, TO, #state{proj=P}=S) ->
%% io:format(user, "head sleep1,", []),
do_write_head(NSInfo, File, Offset, Chunk, CSum, 0=Depth, STime, TO, S) ->
do_write_head2(NSInfo, File, Offset, Chunk, CSum, Depth + 1, STime, TO, S);
do_write_head(NSInfo, File, Offset, Chunk, CSum, Depth, STime, TO, #state{proj=P}=S) ->
sleep_a_while(Depth),
DiffMs = timer:now_diff(os:timestamp(), STime) div 1000,
if DiffMs > TO ->
@ -543,30 +446,32 @@ do_write_head(File, Offset, Chunk, Depth, STime, TO, #state{proj=P}=S) ->
case S2#state.proj of
P2 when P2 == undefined orelse
P2#projection_v1.upi == [] ->
do_write_head(File, Offset, Chunk, Depth + 1,
do_write_head(NSInfo, File, Offset, Chunk, CSum, Depth + 1,
STime, TO, S2);
_ ->
do_write_head2(File, Offset, Chunk, Depth + 1,
do_write_head2(NSInfo, File, Offset, Chunk, CSum, Depth + 1,
STime, TO, S2)
end
end.
do_write_head2(File, Offset, Chunk, Depth, STime, TO,
do_write_head2(NSInfo, File, Offset, Chunk, CSum, Depth, STime, TO,
#state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) ->
[HeadFLU|RestFLUs] = mutation_flus(P),
Proxy = orddict:fetch(HeadFLU, PD),
case ?FLU_PC:write_chunk(Proxy, EpochID, File, Offset, Chunk, ?TIMEOUT) of
case ?FLU_PC:write_chunk(Proxy, NSInfo, EpochID, File, Offset, Chunk, CSum, ?TIMEOUT) of
ok ->
%% From this point onward, we use the same code & logic path as
%% append does.
do_append_midtail(RestFLUs, undefined, undefined, undefined,
Prefix=unused_write_path,
Opts=unused_write_path,
do_wr_app_midtail(RestFLUs, NSInfo, Prefix,
File, Offset, Chunk,
undefined, [HeadFLU], 0, STime, TO, S);
CSum, Opts, [HeadFLU], 0, STime, TO, write, S);
{error, bad_checksum}=BadCS ->
{reply, BadCS, S};
{error, Retry}
when Retry == partition; Retry == bad_epoch; Retry == wedged ->
do_write_head(File, Offset, Chunk, Depth, STime, TO, S);
do_write_head(NSInfo, File, Offset, Chunk, CSum, Depth, STime, TO, S);
{error, written}=Err ->
{reply, Err, S};
{error, trimmed}=Err ->
@ -576,10 +481,10 @@ do_write_head2(File, Offset, Chunk, Depth, STime, TO,
iolist_size(Chunk)})
end.
do_read_chunk(File, Offset, Size, Opts, 0=Depth, STime, TO,
do_read_chunk(NSInfo, File, Offset, Size, Opts, 0=Depth, STime, TO,
#state{proj=#projection_v1{upi=[_|_]}}=S) -> % UPI is non-empty
do_read_chunk2(File, Offset, Size, Opts, Depth + 1, STime, TO, S);
do_read_chunk(File, Offset, Size, Opts, Depth, STime, TO, #state{proj=P}=S) ->
do_read_chunk2(NSInfo, File, Offset, Size, Opts, Depth + 1, STime, TO, S);
do_read_chunk(NSInfo, File, Offset, Size, Opts, Depth, STime, TO, #state{proj=P}=S) ->
sleep_a_while(Depth),
DiffMs = timer:now_diff(os:timestamp(), STime) div 1000,
if DiffMs > TO ->
@ -589,19 +494,19 @@ do_read_chunk(File, Offset, Size, Opts, Depth, STime, TO, #state{proj=P}=S) ->
case S2#state.proj of
P2 when P2 == undefined orelse
P2#projection_v1.upi == [] ->
do_read_chunk(File, Offset, Size, Opts, Depth + 1, STime, TO, S2);
do_read_chunk(NSInfo, File, Offset, Size, Opts, Depth + 1, STime, TO, S2);
_ ->
do_read_chunk2(File, Offset, Size, Opts, Depth + 1, STime, TO, S2)
do_read_chunk2(NSInfo, File, Offset, Size, Opts, Depth + 1, STime, TO, S2)
end
end.
do_read_chunk2(File, Offset, Size, Opts, Depth, STime, TO,
do_read_chunk2(NSInfo, File, Offset, Size, Opts, Depth, STime, TO,
#state{proj=P, epoch_id=EpochID, proxies_dict=PD}=S) ->
UPI = readonly_flus(P),
Tail = lists:last(UPI),
ConsistencyMode = P#projection_v1.mode,
case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), EpochID,
File, Offset, Size, Opts, ?TIMEOUT) of
case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), NSInfo, EpochID,
File, Offset, Size, Opts, TO) of
{ok, {Chunks, Trimmed}} when is_list(Chunks), is_list(Trimmed) ->
%% After partition heal, there could happen that heads may
%% have chunk trimmed but tails may have chunk written -
@ -625,9 +530,9 @@ do_read_chunk2(File, Offset, Size, Opts, Depth, STime, TO,
{reply, BadCS, S};
{error, Retry}
when Retry == partition; Retry == bad_epoch; Retry == wedged ->
do_read_chunk(File, Offset, Size, Opts, Depth, STime, TO, S);
do_read_chunk(NSInfo, File, Offset, Size, Opts, Depth, STime, TO, S);
{error, not_written} ->
read_repair(ConsistencyMode, read, File, Offset, Size, Depth, STime, S);
read_repair(ConsistencyMode, read, NSInfo, File, Offset, Size, Depth, STime, S);
%% {reply, {error, not_written}, S};
{error, written} ->
exit({todo_should_never_happen,?MODULE,?LINE,File,Offset,Size});
@ -635,10 +540,10 @@ do_read_chunk2(File, Offset, Size, Opts, Depth, STime, TO,
{reply, Err, S}
end.
do_trim_chunk(File, Offset, Size, 0=Depth, STime, TO, S) ->
do_trim_chunk(File, Offset, Size, Depth+1, STime, TO, S);
do_trim_chunk(NSInfo, File, Offset, Size, 0=Depth, STime, TO, S) ->
do_trim_chunk(NSInfo, File, Offset, Size, Depth+1, STime, TO, S);
do_trim_chunk(File, Offset, Size, Depth, STime, TO, #state{proj=P}=S) ->
do_trim_chunk(NSInfo, File, Offset, Size, Depth, STime, TO, #state{proj=P}=S) ->
sleep_a_while(Depth),
DiffMs = timer:now_diff(os:timestamp(), STime) div 1000,
if DiffMs > TO ->
@ -652,42 +557,41 @@ do_trim_chunk(File, Offset, Size, Depth, STime, TO, #state{proj=P}=S) ->
case S2#state.proj of
P2 when P2 == undefined orelse
P2#projection_v1.upi == [] ->
do_trim_chunk(File, Offset, Size, Depth + 1,
do_trim_chunk(NSInfo, File, Offset, Size, Depth + 1,
STime, TO, S2);
_ ->
do_trim_chunk2(File, Offset, Size, Depth + 1,
do_trim_chunk2(NSInfo, File, Offset, Size, Depth + 1,
STime, TO, S2)
end
end.
do_trim_chunk2(File, Offset, Size, Depth, STime, TO,
do_trim_chunk2(NSInfo, File, Offset, Size, Depth, STime, TO,
#state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) ->
[HeadFLU|RestFLUs] = mutation_flus(P),
Proxy = orddict:fetch(HeadFLU, PD),
case ?FLU_PC:trim_chunk(Proxy, EpochID, File, Offset, Size, ?TIMEOUT) of
case ?FLU_PC:trim_chunk(Proxy, NSInfo, EpochID, File, Offset, Size, ?TIMEOUT) of
ok ->
do_trim_midtail(RestFLUs, undefined, File, Offset, Size,
do_trim_midtail(RestFLUs, undefined, NSInfo, File, Offset, Size,
[HeadFLU], 0, STime, TO, S);
{error, trimmed} ->
%% Maybe the trim failed in the middle of the tail, so re-run the
%% trim across the whole chain.
do_trim_midtail(RestFLUs, undefined, File, Offset, Size,
do_trim_midtail(RestFLUs, undefined, NSInfo, File, Offset, Size,
[HeadFLU], 0, STime, TO, S);
{error, bad_checksum}=BadCS ->
{reply, BadCS, S};
{error, Retry}
when Retry == partition; Retry == bad_epoch; Retry == wedged ->
do_trim_chunk(File, Offset, Size, Depth, STime, TO, S)
do_trim_chunk(NSInfo, File, Offset, Size, Depth, STime, TO, S)
end.
do_trim_midtail(RestFLUs, Prefix, File, Offset, Size,
do_trim_midtail(RestFLUs, Prefix, NSInfo, File, Offset, Size,
Ws, Depth, STime, TO, S)
when RestFLUs == [] orelse Depth == 0 ->
do_trim_midtail2(RestFLUs, Prefix, File, Offset, Size,
do_trim_midtail2(RestFLUs, Prefix, NSInfo, File, Offset, Size,
Ws, Depth + 1, STime, TO, S);
do_trim_midtail(_RestFLUs, Prefix, File, Offset, Size,
do_trim_midtail(_RestFLUs, Prefix, NSInfo, File, Offset, Size,
Ws, Depth, STime, TO, #state{proj=P}=S) ->
%% io:format(user, "midtail sleep2,", []),
sleep_a_while(Depth),
DiffMs = timer:now_diff(os:timestamp(), STime) div 1000,
if DiffMs > TO ->
@ -712,38 +616,36 @@ do_trim_midtail(_RestFLUs, Prefix, File, Offset, Size,
if Prefix == undefined -> % atom! not binary()!!
{error, partition};
true ->
do_trim_chunk(Prefix, Offset, Size,
do_trim_chunk(NSInfo, Prefix, Offset, Size,
Depth, STime, TO, S2)
end;
RestFLUs3 ->
do_trim_midtail2(RestFLUs3, Prefix, File, Offset, Size,
do_trim_midtail2(RestFLUs3, Prefix, NSInfo, File, Offset, Size,
Ws, Depth + 1, STime, TO, S2)
end
end
end.
do_trim_midtail2([], _Prefix, _File, _Offset, _Size,
do_trim_midtail2([], _Prefix, _NSInfo, _File, _Offset, _Size,
_Ws, _Depth, _STime, _TO, S) ->
%% io:format(user, "ok!\n", []),
{reply, ok, S};
do_trim_midtail2([FLU|RestFLUs]=FLUs, Prefix, File, Offset, Size,
do_trim_midtail2([FLU|RestFLUs]=FLUs, Prefix, NSInfo, File, Offset, Size,
Ws, Depth, STime, TO,
#state{epoch_id=EpochID, proxies_dict=PD}=S) ->
Proxy = orddict:fetch(FLU, PD),
case ?FLU_PC:trim_chunk(Proxy, EpochID, File, Offset, Size, ?TIMEOUT) of
case ?FLU_PC:trim_chunk(Proxy, NSInfo, EpochID, File, Offset, Size, ?TIMEOUT) of
ok ->
%% io:format(user, "write ~w,", [FLU]),
do_trim_midtail2(RestFLUs, Prefix, File, Offset, Size,
do_trim_midtail2(RestFLUs, Prefix, NSInfo, File, Offset, Size,
[FLU|Ws], Depth, STime, TO, S);
{error, trimmed} ->
do_trim_midtail2(RestFLUs, Prefix, File, Offset, Size,
do_trim_midtail2(RestFLUs, Prefix, NSInfo, File, Offset, Size,
[FLU|Ws], Depth, STime, TO, S);
{error, bad_checksum}=BadCS ->
%% TODO: alternate strategy?
{reply, BadCS, S};
{error, Retry}
when Retry == partition; Retry == bad_epoch; Retry == wedged ->
do_trim_midtail(FLUs, Prefix, File, Offset, Size,
do_trim_midtail(FLUs, Prefix, NSInfo, File, Offset, Size,
Ws, Depth, STime, TO, S)
end.
@ -759,11 +661,11 @@ do_trim_midtail2([FLU|RestFLUs]=FLUs, Prefix, File, Offset, Size,
%% Never matches because Depth is always incremented beyond 0 prior to
%% getting here.
%%
%% read_repair(ConsistencyMode, ReturnMode, File, Offset, Size, 0=Depth,
%% read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, 0=Depth,
%% STime, #state{proj=#projection_v1{upi=[_|_]}}=S) -> % UPI is non-empty
%% read_repair2(ConsistencyMode, ReturnMode, File, Offset, Size, Depth + 1,
%% read_repair2(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, Depth + 1,
%% STime, S);
read_repair(ConsistencyMode, ReturnMode, File, Offset, Size, Depth,
read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, Depth,
STime, #state{proj=P}=S) ->
sleep_a_while(Depth),
DiffMs = timer:now_diff(os:timestamp(), STime) div 1000,
@ -774,26 +676,26 @@ read_repair(ConsistencyMode, ReturnMode, File, Offset, Size, Depth,
case S2#state.proj of
P2 when P2 == undefined orelse
P2#projection_v1.upi == [] ->
read_repair(ConsistencyMode, ReturnMode, File, Offset,
read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset,
Size, Depth + 1, STime, S2);
_ ->
read_repair2(ConsistencyMode, ReturnMode, File, Offset,
read_repair2(ConsistencyMode, ReturnMode, NSInfo, File, Offset,
Size, Depth + 1, STime, S2)
end
end.
read_repair2(cp_mode=ConsistencyMode,
ReturnMode, File, Offset, Size, Depth, STime,
ReturnMode, NSInfo, File, Offset, Size, Depth, STime,
#state{proj=P, epoch_id=EpochID, proxies_dict=PD}=S) ->
%% TODO WTF was I thinking here??....
Tail = lists:last(readonly_flus(P)),
case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), EpochID,
File, Offset, Size, [], ?TIMEOUT) of
case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), NSInfo, EpochID,
File, Offset, Size, undefined, ?DEFAULT_TIMEOUT) of
{ok, Chunks} when is_list(Chunks) ->
%% TODO: change to {Chunks, Trimmed} and have them repaired
ToRepair = mutation_flus(P) -- [Tail],
{Reply, S1} = do_repair_chunks(Chunks, ToRepair, ReturnMode,
[Tail], File, Depth, STime, S, {ok, Chunks}),
[Tail], NSInfo, File, Depth, STime, S, {ok, Chunks}),
{reply, Reply, S1};
%% {ok, BadChunk} ->
%% exit({todo, bad_chunk_size, ?MODULE, ?LINE, File, Offset,
@ -803,7 +705,7 @@ read_repair2(cp_mode=ConsistencyMode,
{reply, BadCS, S};
{error, Retry}
when Retry == partition; Retry == bad_epoch; Retry == wedged ->
read_repair(ConsistencyMode, ReturnMode, File, Offset,
read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset,
Size, Depth, STime, S);
{error, not_written} ->
{reply, {error, not_written}, S};
@ -816,24 +718,23 @@ read_repair2(cp_mode=ConsistencyMode,
exit({todo_should_repair_unlinked_files, ?MODULE, ?LINE, File})
end;
read_repair2(ap_mode=ConsistencyMode,
ReturnMode, File, Offset, Size, Depth, STime,
ReturnMode, NSInfo, File, Offset, Size, Depth, STime,
#state{proj=P}=S) ->
Eligible = mutation_flus(P),
case try_to_find_chunk(Eligible, File, Offset, Size, S) of
case try_to_find_chunk(Eligible, NSInfo, File, Offset, Size, S) of
{ok, {Chunks, _Trimmed}, GotItFrom} when is_list(Chunks) ->
%% TODO: Repair trimmed chunks
ToRepair = mutation_flus(P) -- [GotItFrom],
{Reply0, S1} = do_repair_chunks(Chunks, ToRepair, ReturnMode, [GotItFrom],
File, Depth, STime, S, {ok, Chunks}),
{ok, Chunks} = Reply0,
Reply = {ok, {Chunks, _Trimmed}},
Reply = {ok, {Chunks, []}},
{Reply, S1} = do_repair_chunks(Chunks, ToRepair, ReturnMode, [GotItFrom],
NSInfo, File, Depth, STime, S, Reply),
{reply, Reply, S1};
{error, bad_checksum}=BadCS ->
%% TODO: alternate strategy?
{reply, BadCS, S};
{error, Retry}
when Retry == partition; Retry == bad_epoch; Retry == wedged ->
read_repair(ConsistencyMode, ReturnMode, File,
read_repair(ConsistencyMode, ReturnMode, NSInfo, File,
Offset, Size, Depth, STime, S);
{error, not_written} ->
{reply, {error, not_written}, S};
@ -845,22 +746,22 @@ read_repair2(ap_mode=ConsistencyMode,
exit({todo_should_repair_unlinked_files, ?MODULE, ?LINE, File})
end.
do_repair_chunks([], _, _, _, _, _, _, S, Reply) ->
do_repair_chunks([], _, _, _, _, _, _, _, S, Reply) ->
{Reply, S};
do_repair_chunks([{_, Offset, Chunk, _Csum}|T],
ToRepair, ReturnMode, [GotItFrom], File, Depth, STime, S, Reply) ->
do_repair_chunks([{_, Offset, Chunk, CSum}|T],
ToRepair, ReturnMode, [GotItFrom], NSInfo, File, Depth, STime, S, Reply) ->
true = not is_atom(CSum),
Size = iolist_size(Chunk),
case do_repair_chunk(ToRepair, ReturnMode, Chunk, [GotItFrom], File, Offset,
case do_repair_chunk(ToRepair, ReturnMode, Chunk, CSum, [GotItFrom], NSInfo, File, Offset,
Size, Depth, STime, S) of
{ok, Chunk, S1} ->
do_repair_chunks(T, ToRepair, ReturnMode, [GotItFrom], File, Depth, STime, S1, Reply);
{reply, {ok, _}, S1} ->
do_repair_chunks(T, ToRepair, ReturnMode, [GotItFrom], NSInfo, File, Depth, STime, S1, Reply);
Error ->
Error
end.
do_repair_chunk(ToRepair, ReturnMode, Chunk, Repaired, File, Offset,
do_repair_chunk(ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, Offset,
Size, Depth, STime, #state{proj=P}=S) ->
%% io:format(user, "read_repair3 sleep1,", []),
sleep_a_while(Depth),
DiffMs = timer:now_diff(os:timestamp(), STime) div 1000,
if DiffMs > ?MAX_RUNTIME ->
@ -870,42 +771,42 @@ do_repair_chunk(ToRepair, ReturnMode, Chunk, Repaired, File, Offset,
case S2#state.proj of
P2 when P2 == undefined orelse
P2#projection_v1.upi == [] ->
do_repair_chunk(ToRepair, ReturnMode, Chunk, Repaired, File,
do_repair_chunk(ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File,
Offset, Size, Depth + 1, STime, S2);
P2 ->
ToRepair2 = mutation_flus(P2) -- Repaired,
do_repair_chunk2(ToRepair2, ReturnMode, Chunk, Repaired, File,
do_repair_chunk2(ToRepair2, ReturnMode, Chunk, CSum, Repaired, NSInfo, File,
Offset, Size, Depth + 1, STime, S2)
end
end.
do_repair_chunk2([], ReturnMode, Chunk, _Repaired, File, Offset,
do_repair_chunk2([], ReturnMode, Chunk, CSum, _Repaired, _NSInfo, File, Offset,
_IgnoreSize, _Depth, _STime, S) ->
%% TODO: add stats for # of repairs, length(_Repaired)-1, etc etc?
case ReturnMode of
read ->
{ok, Chunk, S};
{reply, {ok, {[{File, Offset, Chunk, CSum}], []}}, S};
{append, Offset, Size, File} ->
{ok, {Offset, Size, File}, S}
{reply, {ok, {[{Offset, Size, File}], []}}, S}
end;
do_repair_chunk2([First|Rest]=ToRepair, ReturnMode, Chunk, Repaired, File, Offset,
do_repair_chunk2([First|Rest]=ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, Offset,
Size, Depth, STime, #state{epoch_id=EpochID, proxies_dict=PD}=S) ->
Proxy = orddict:fetch(First, PD),
case ?FLU_PC:write_chunk(Proxy, EpochID, File, Offset, Chunk, ?TIMEOUT) of
case ?FLU_PC:write_chunk(Proxy, NSInfo, EpochID, File, Offset, Chunk, CSum, ?TIMEOUT) of
ok ->
do_repair_chunk2(Rest, ReturnMode, Chunk, [First|Repaired], File,
do_repair_chunk2(Rest, ReturnMode, Chunk, CSum, [First|Repaired], NSInfo, File,
Offset, Size, Depth, STime, S);
{error, bad_checksum}=BadCS ->
%% TODO: alternate strategy?
{BadCS, S};
{error, Retry}
when Retry == partition; Retry == bad_epoch; Retry == wedged ->
do_repair_chunk(ToRepair, ReturnMode, Chunk, Repaired, File,
do_repair_chunk(ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File,
Offset, Size, Depth, STime, S);
{error, written} ->
%% TODO: To be very paranoid, read the chunk here to verify
%% that it is exactly our Chunk.
do_repair_chunk2(Rest, ReturnMode, Chunk, Repaired, File,
do_repair_chunk2(Rest, ReturnMode, Chunk, CSum, Repaired, NSInfo, File,
Offset, Size, Depth, STime, S);
{error, trimmed} = _Error ->
%% TODO
@ -937,9 +838,9 @@ do_checksum_list(File, Depth, STime, TO, #state{proj=P}=S) ->
end.
do_checksum_list2(File, Depth, STime, TO,
#state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) ->
#state{proj=P, proxies_dict=PD}=S) ->
Proxy = orddict:fetch(lists:last(readonly_flus(P)), PD),
case ?FLU_PC:checksum_list(Proxy, EpochID, File, ?TIMEOUT) of
case ?FLU_PC:checksum_list(Proxy, File, TO) of
{ok, _}=OK ->
{reply, OK, S};
{error, Retry}
@ -974,7 +875,7 @@ do_list_files(Depth, STime, TO, #state{proj=P}=S) ->
do_list_files2(Depth, STime, TO,
#state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) ->
Proxy = orddict:fetch(lists:last(readonly_flus(P)), PD),
case ?FLU_PC:list_files(Proxy, EpochID, ?TIMEOUT) of
case ?FLU_PC:list_files(Proxy, EpochID, ?DEFAULT_TIMEOUT) of
{ok, _}=OK ->
{reply, OK, S};
{error, Retry}
@ -1025,11 +926,13 @@ update_proj2(Count, #state{bad_proj=BadProj, proxies_dict=ProxiesDict,
update_proj2(Count + 1, S);
P when P >= BadProj ->
#projection_v1{epoch_number=Epoch, epoch_csum=CSum,
members_dict=NewMembersDict} = P,
members_dict=NewMembersDict, dbg2=Dbg2} = P,
EpochID = {Epoch, CSum},
?FLU_PC:stop_proxies(ProxiesDict),
NewProxiesDict = ?FLU_PC:start_proxies(NewMembersDict),
S#state{bad_proj=undefined, proj=P, epoch_id=EpochID,
%% Make crash reports shorter by getting rid of 'react' history.
P2 = P#projection_v1{dbg2=lists:keydelete(react, 1, Dbg2)},
S#state{bad_proj=undefined, proj=P2, epoch_id=EpochID,
members_dict=NewMembersDict, proxies_dict=NewProxiesDict};
_P ->
sleep_a_while(Count),
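
A note on the lists:keydelete/3 call in update_proj2 above: it keeps crash reports short by dropping only the {react, ...} history tuple from the projection's dbg2 proplist. A quick illustration with hypothetical dbg2 contents:

    %% Only the tuple whose first element is 'react' is removed.
    [{epoch, 5}, {author, a}] =
        lists:keydelete(react, 1, [{epoch, 5}, {react, [long_history]}, {author, a}]).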
@ -1074,14 +977,14 @@ choose_best_proj(Rs) ->
BestProj
end, ?WORST_PROJ, Rs).
try_to_find_chunk(Eligible, File, Offset, Size,
try_to_find_chunk(Eligible, NSInfo, File, Offset, Size,
#state{epoch_id=EpochID, proxies_dict=PD}) ->
Timeout = 2*1000,
Work = fun(FLU) ->
Proxy = orddict:fetch(FLU, PD),
case ?FLU_PC:read_chunk(Proxy, EpochID,
case ?FLU_PC:read_chunk(Proxy, NSInfo, EpochID,
%% TODO Trimmed is required here
File, Offset, Size, []) of
File, Offset, Size, undefined) of
{ok, {_Chunks, _} = ChunksAndTrimmed} ->
{FLU, {ok, ChunksAndTrimmed}};
Else ->


@ -1,3 +1,23 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2016 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(machi_csum_table).
-export([open/2,
@ -65,10 +85,18 @@ find(#machi_csum_table{table=T}, Offset, Size) ->
{ok, I} = eleveldb:iterator(T, [], keys_only),
EndKey = sext:encode({Offset+Size, 0}),
StartKey = sext:encode({Offset, Size}),
{ok, FirstKey} = case eleveldb:iterator_move(I, StartKey) of
{error, invalid_iterator} ->
eleveldb:iterator_move(I, first);
try
%% Assume the invalid_iterator error means we tried to seek
%% past the end via StartKey. Instead, move to the last key
%% directly.
{ok, _} = eleveldb:iterator_move(I, last),
{ok, _} = eleveldb:iterator_move(I, prev)
catch
_:_ ->
{ok, _} = eleveldb:iterator_move(I, first)
end;
{ok, _} = R0 ->
case eleveldb:iterator_move(I, prev) of
{error, invalid_iterator} ->
@ -92,7 +120,6 @@ find(#machi_csum_table{table=T}, Offset, Size) ->
end,
lists:reverse(eleveldb_fold(T, FirstKey, EndKey, FoldFun, [])).
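
The range scan in find/3 above relies on a property of the sext library: sext:encode/1 produces binaries whose byte order matches Erlang term order, so the leveldb key order for sext-encoded {Offset, Size} tuples is the natural {Offset, Size} order. A minimal shell check of that property:

    1> sext:encode({10, 5}) < sext:encode({10, 6}).
    true
    2> sext:encode({10, 6}) < sext:encode({11, 0}).
    true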
%% @doc Update all chunk info by deleting any existing entries
%% and writing the new chunk info.
-spec write(table(),
@ -126,6 +153,8 @@ write(#machi_csum_table{table=T} = CsumT, Offset, Size, CSum,
DeleteOps = lists:map(fun({O, L, _}) ->
{delete, sext:encode({O, L})}
end, Chunks),
%% io:format(user, "PutOps: ~P\n", [PutOps, 20]),
%% io:format(user, "DelOps: ~P\n", [DeleteOps, 20]),
eleveldb:write(T, DeleteOps ++ PutOps, [{sync, true}]).
-spec find_leftneighbor(table(), non_neg_integer()) ->
@ -256,7 +285,7 @@ build_unwritten_bytes_list([{CO, CS, _Ck}|Rest], _LastOffset, Acc) ->
build_unwritten_bytes_list(Rest, CO + CS, Acc).
%% @doc If you want to find an overlap among two areas [x, y] and [a,
%% b] where x < y and a < b; if (a-y)*(b-x) < 0 then there's an
%% b] where x &lt; y and a &lt; b; if (a-y)*(b-x) &lt; 0 then there's an
%% overlap; otherwise (> 0) there is no overlap. The border condition
%% (= 0) does not count as an overlap in this offset-size case.
%% inclusion_match_spec(Offset, Size) ->
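
A worked example of the sign test described above, using small hypothetical ranges:

    Overlaps = fun({X, Y}, {A, B}) -> (A - Y) * (B - X) < 0 end,
    true  = Overlaps({0, 9}, {5, 14}),   %% (5-9)*(14-0) = -56 -> overlap
    false = Overlaps({0, 4}, {10, 14}),  %% (10-4)*(14-0) =  84 -> disjoint
    false = Overlaps({0, 5}, {5, 10}).   %% (5-5)*(10-0) =   0 -> border, no overlap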


@ -20,18 +20,24 @@
-module(machi_dt).
-include("machi.hrl").
-include("machi_projection.hrl").
-type chunk() :: chunk_bin() | {chunk_csum(), chunk_bin()}.
-type chunk_bin() :: binary() | iolist(). % client can use either
-type chunk_csum() :: binary(). % 1 byte tag, N-1 bytes checksum
-type chunk_summary() :: {file_offset(), chunk_size(), binary()}.
-type chunk_s() :: 'trimmed' | binary().
-type append_opts() :: #append_opts{}.
-type chunk() :: chunk_bin() | iolist(). % client can choose either rep.
-type chunk_bin() :: binary(). % server returns binary() only.
-type chunk_csum() :: <<>> | chunk_csum_bin() | {csum_tag(), binary()}.
-type chunk_csum_bin() :: binary(). % 1 byte tag, N-1 bytes checksum
-type chunk_cstrm() :: 'trimmed' | chunk_csum().
-type chunk_summary() :: {file_offset(), chunk_size(), chunk_bin(), chunk_cstrm()}.
-type chunk_pos() :: {file_offset(), chunk_size(), file_name_s()}.
-type chunk_size() :: non_neg_integer().
-type coc_namespace() :: string().
-type coc_nl() :: {coc, coc_namespace(), coc_locator()}.
-type coc_locator() :: non_neg_integer().
%% Tags that stand for how that checksum was generated. See
%% machi_util:make_tagged_csum/{1,2} for further documentation and
%% implementation.
-type csum_tag() :: none | client_sha | server_sha | server_regen_sha.
-type error_general() :: 'bad_arg' | 'wedged' | 'bad_checksum'.
-type epoch_csum() :: binary().
-type epoch_num() :: -1 | non_neg_integer().
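
A sketch of the 1-byte-tag framing that chunk_csum_bin() describes above. The tag byte value here is illustrative only; the real constants live in machi's header files:

    Chunk = <<"hello">>,
    CSumBin = crypto:hash(sha, Chunk),          %% 20-byte SHA-1
    TagByte = 1,                                %% hypothetical client_sha tag value
    TaggedCSum = <<TagByte:8, CSumBin/binary>>,
    21 = byte_size(TaggedCSum).                 %% 1 byte tag + 20 bytes checksum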
@ -44,26 +50,26 @@
-type file_prefix() :: binary() | list().
-type inet_host() :: inet:ip_address() | inet:hostname().
-type inet_port() :: inet:port_number().
-type locator() :: number().
-type namespace() :: binary().
-type namespace_version() :: non_neg_integer().
-type ns_info() :: #ns_info{}.
-type projection() :: #projection_v1{}.
-type projection_type() :: 'public' | 'private'.
%% @doc Tags that stand for how that checksum was generated. See
%% machi_util:make_tagged_csum/{1,2} for further documentation and
%% implementation.
-type csum_tag() :: none | client_sha | server_sha | server_regen_sha.
-type read_opts() :: #read_opts{}.
-type read_opts_x() :: 'undefined' | 'noopt' | 'none' | #read_opts{}.
-export_type([
append_opts/0,
chunk/0,
chunk_bin/0,
chunk_csum/0,
csum_tag/0,
chunk_csum_bin/0,
chunk_cstrm/0,
chunk_summary/0,
chunk_s/0,
chunk_pos/0,
chunk_size/0,
coc_namespace/0,
coc_nl/0,
coc_locator/0,
csum_tag/0,
error_general/0,
epoch_csum/0,
epoch_num/0,
@ -76,7 +82,13 @@
file_prefix/0,
inet_host/0,
inet_port/0,
locator/0,
namespace/0,
namespace_version/0,
ns_info/0,
projection/0,
projection_type/0
projection_type/0,
read_opts/0,
read_opts_x/0
]).


@ -71,7 +71,7 @@
code_change/3
]).
-define(TICK, 30*1000). %% XXX FIXME Should be something like 5 seconds
-define(TICK, 5*1000).
-define(TICK_THRESHOLD, 5). %% After this + 1 more quiescent ticks, shutdown
-define(TIMEOUT, 10*1000).
-define(TOO_MANY_ERRORS_RATIO, 50).
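
With the new values above, a quiescent file proxy shuts itself down after roughly (?TICK_THRESHOLD + 1) * ?TICK = 6 * 5,000 ms = 30 seconds of inactivity (assuming the tick timer fires once per ?TICK interval); the old 30-second tick implied about 3 minutes.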
@ -91,6 +91,7 @@
csum_table :: machi_csum_table:table(),
eof_position = 0 :: non_neg_integer(),
max_file_size = ?DEFAULT_MAX_FILE_SIZE :: pos_integer(),
rollover = false :: boolean(),
tref :: reference(), %% timer ref
ticks = 0 :: non_neg_integer(), %% ticks elapsed with no new operations
ops = 0 :: non_neg_integer(), %% sum of all ops
@ -141,18 +142,18 @@ sync(_Pid, Type) ->
Data :: binary(), Checksum :: binary()}]} |
{error, Reason :: term()}.
read(Pid, Offset, Length) ->
read(Pid, Offset, Length, []).
read(Pid, Offset, Length, #read_opts{}).
-spec read(Pid :: pid(),
Offset :: non_neg_integer(),
Length :: non_neg_integer(),
[{no_checksum|no_chunk|needs_trimmed, boolean()}]) ->
machi_dt:read_opts_x()) ->
{ok, [{Filename::string(), Offset :: non_neg_integer(),
Data :: binary(), Checksum :: binary()}]} |
{error, Reason :: term()}.
read(Pid, Offset, Length, Opts) when is_pid(Pid) andalso is_integer(Offset) andalso Offset >= 0
andalso is_integer(Length) andalso Length > 0
andalso is_list(Opts) ->
read(Pid, Offset, Length, #read_opts{}=Opts)
when is_pid(Pid) andalso is_integer(Offset) andalso Offset >= 0
andalso is_integer(Length) andalso Length > 0 ->
gen_server:call(Pid, {read, Offset, Length, Opts}, ?TIMEOUT);
read(_Pid, Offset, Length, Opts) ->
lager:warning("Bad args to read: Offset ~p, Length ~p, Options ~p", [Offset, Length, Opts]),
@ -239,7 +240,7 @@ init({FluName, Filename, DataDir}) ->
data_filehandle = FHd,
csum_table = CsumTable,
tref = Tref,
eof_position = Eof,
eof_position = erlang:max(Eof, ?MINIMUM_OFFSET),
max_file_size = machi_config:max_file_size()},
lager:debug("Starting file proxy ~p for filename ~p, state = ~p, Eof = ~p",
[self(), Filename, St, Eof]),
@ -298,15 +299,15 @@ handle_call({read, Offset, Length, Opts}, _From,
}) ->
%% TODO: use these options - NoChunk skips reading chunk data from
%% disk; NoChecksum skips checksum verification.
NoChecksum = proplists:get_value(no_checksum, Opts, false),
NoChunk = proplists:get_value(no_chunk, Opts, false),
#read_opts{no_checksum=NoChecksum, no_chunk=NoChunk,
needs_trimmed=NeedsTrimmed} = Opts,
{Resp, NewErr} =
case do_read(FH, F, CsumTable, Offset, Length, NoChunk, NoChecksum) of
{ok, {[], []}} ->
{{error, not_written}, Err + 1};
{ok, {Chunks0, Trimmed0}} ->
Chunks = slice_both_side(Chunks0, Offset, Offset+Length),
Trimmed = case proplists:get_value(needs_trimmed, Opts, false) of
Trimmed = case NeedsTrimmed of
true -> Trimmed0;
false -> []
end,
@ -449,11 +450,23 @@ handle_cast(Cast, State) ->
{noreply, State}.
% @private
handle_info(tick, State = #state{eof_position = Eof,
handle_info(tick, State = #state{fluname = FluName,
filename = F,
eof_position = Eof,
max_file_size = MaxFileSize}) when Eof >= MaxFileSize ->
lager:notice("Eof position ~p >= max file size ~p. Shutting down.",
[Eof, MaxFileSize]),
{stop, file_rollover, State};
%% Older code halted here with {stop, file_rollover, State}.
%% However, there may be requests already in our mailbox, or requests
%% not yet delivered that are racing with the machi_flu_metadata_mgr.
%% So we close our eleveldb instance (to avoid a double-open attempt
%% by a new file proxy proc), tell
%% machi_flu_metadata_mgr that we request a rollover, then stop.
%% terminate() will take care of forwarding messages that are
%% caught in the race.
lager:notice("Eof ~s position ~p >= max file size ~p. Shutting down.",
[F, Eof, MaxFileSize]),
State2 = close_files(State),
machi_flu_metadata_mgr:stop_proxy_pid_rollover(FluName, {file, F}),
{stop, normal, State2#state{rollover = true}};
%% XXX Is this a good idea? Need to think this through a bit.
handle_info(tick, State = #state{wedged = true}) ->
@ -467,7 +480,7 @@ handle_info(tick, State = #state{
writes = {WT, WE},
appends = {AT, AE}
}) when Ops > 100 andalso
trunc(((RE+WE+AE) / RT+WT+AT) * 100) > ?TOO_MANY_ERRORS_RATIO ->
trunc(((RE+WE+AE) / (RT+WT+AT)) * 100) > ?TOO_MANY_ERRORS_RATIO ->
Errors = RE + WE + AE,
lager:notice("Got ~p errors. Shutting down.", [Errors]),
{stop, too_many_errors, State};
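
The added parentheses in the guard above matter. With hypothetical totals RT=100, WT=60, AT=40 and errors RE=4, WE=3, AE=3 (10 errors in 200 ops):

    %% Fixed guard: 10 errors / 200 ops -> 5% error rate, under the 50% ratio.
    5     = trunc(((4+3+3) / (100+60+40)) * 100),
    %% Old expression parsed as ((RE+WE+AE) / RT) + WT + AT, so it
    %% spuriously exceeded ?TOO_MANY_ERRORS_RATIO and forced a shutdown.
    10010 = trunc(((4+3+3) / 100+60+40) * 100).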
@ -526,30 +539,23 @@ handle_info(Req, State) ->
{noreply, State}.
% @private
terminate(Reason, #state{filename = F,
data_filehandle = FHd,
csum_table = T,
reads = {RT, RE},
writes = {WT, WE},
appends = {AT, AE}
}) ->
terminate(Reason, State = #state{fluname = FluName,
filename = F,
rollover = Rollover_p,
reads = {RT, RE},
writes = {WT, WE},
appends = {AT, AE}
}) ->
lager:info("Shutting down proxy for file ~p because ~p", [F, Reason]),
lager:info(" Op Tot/Error", []),
lager:info(" Reads: ~p/~p", [RT, RE]),
lager:info(" Writes: ~p/~p", [WT, WE]),
lager:info("Appends: ~p/~p", [AT, AE]),
case FHd of
undefined ->
noop; %% file deleted
_ ->
ok = file:sync(FHd),
ok = file:close(FHd)
end,
case T of
undefined ->
noop; %% file deleted
_ ->
ok = machi_csum_table:close(T)
close_files(State),
if Rollover_p ->
forward_late_messages(FluName, F, 500);
true ->
ok
end,
ok.
@ -867,3 +873,36 @@ maybe_gc(Reply, S = #state{fluname=FluName,
false ->
{reply, Reply, S}
end.
close_files(State = #state{data_filehandle = FHd,
csum_table = T}) ->
case FHd of
undefined ->
noop; %% file deleted
_ ->
ok = file:sync(FHd),
ok = file:close(FHd)
end,
case T of
undefined ->
noop; %% file deleted
_ ->
ok = machi_csum_table:close(T)
end,
State#state{data_filehandle = undefined, csum_table = undefined}.
forward_late_messages(FluName, F, Timeout) ->
receive
M ->
case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, F}) of
{ok, Pid} ->
Pid ! M;
{error, trimmed} ->
lager:error("TODO: FLU ~p file ~p reports trimmed status "
"when forwarding ~P\n",
[FluName, F, M, 20])
end,
forward_late_messages(FluName, F, Timeout)
after Timeout ->
ok
end.


@ -39,7 +39,8 @@
get_unfit_list/1, update_local_down_list/3,
add_admin_down/3, delete_admin_down/2,
send_fitness_update_spam/3,
send_spam_to_everyone/1]).
send_spam_to_everyone/1,
trigger_early_adjustment/2]).
%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
@ -81,6 +82,13 @@ send_fitness_update_spam(Pid, FromName, Dict) ->
send_spam_to_everyone(Pid) ->
gen_server:call(Pid, {send_spam_to_everyone}, infinity).
%% @doc For testing purposes, we don't want a test to wait for
%% wall-clock time to elapse before the fitness server makes a
%% down->up status decision.
trigger_early_adjustment(Pid, FLU) ->
Pid ! {adjust_down_list, FLU}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
init([{MyFluName}|Args]) ->
@ -100,6 +108,7 @@ handle_call({update_local_down_list, Down, MembersDict}, _From,
#state{my_flu_name=MyFluName, pending_map=OldMap,
local_down=OldDown, members_dict=OldMembersDict,
admin_down=AdminDown}=S) ->
verbose("FITNESS: ~w has down suspect ~w\n", [MyFluName, Down]),
NewMap = store_in_map(OldMap, MyFluName, erlang:now(), Down,
AdminDown, [props_yo]),
S2 = if Down == OldDown, MembersDict == OldMembersDict ->
@ -111,13 +120,17 @@ handle_call({update_local_down_list, Down, MembersDict}, _From,
end,
{reply, ok, S2#state{local_down=Down}};
handle_call({add_admin_down, DownFLU, DownProps}, _From,
#state{local_down=OldDown, admin_down=AdminDown}=S) ->
#state{my_flu_name=MyFluName,
local_down=OldDown, admin_down=AdminDown}=S) ->
verbose("FITNESS: ~w add admin down ~w\n", [MyFluName, DownFLU]),
NewAdminDown = [{DownFLU,DownProps}|lists:keydelete(DownFLU, 1, AdminDown)],
S3 = finish_admin_down(erlang:now(), OldDown, NewAdminDown,
[props_yo], S),
{reply, ok, S3};
handle_call({delete_admin_down, DownFLU}, _From,
#state{local_down=OldDown, admin_down=AdminDown}=S) ->
#state{my_flu_name=MyFluName,
local_down=OldDown, admin_down=AdminDown}=S) ->
verbose("FITNESS: ~w delete admin down ~w\n", [MyFluName, DownFLU]),
NewAdminDown = lists:keydelete(DownFLU, 1, AdminDown),
S3 = finish_admin_down(erlang:now(), OldDown, NewAdminDown,
[props_yo], S),
@ -135,7 +148,8 @@ handle_call(_Request, _From, S) ->
handle_cast(_Msg, S) ->
{noreply, S}.
handle_info({adjust_down_list, FLU}, #state{active_unfit=ActiveUnfit}=S) ->
handle_info({adjust_down_list, FLU}, #state{my_flu_name=MyFluName,
active_unfit=ActiveUnfit}=S) ->
NewUnfit = make_unfit_list(S),
Added_to_new = NewUnfit -- ActiveUnfit,
Dropped_from_new = ActiveUnfit -- NewUnfit,
@ -176,9 +190,11 @@ handle_info({adjust_down_list, FLU}, #state{active_unfit=ActiveUnfit}=S) ->
{true, true} ->
error({bad, ?MODULE, ?LINE, FLU, ActiveUnfit, NewUnfit});
{true, false} ->
{noreply, S#state{active_unfit=lists:usort(ActiveUnfit ++ [FLU])}};
NewActive = wrap_active(MyFluName,lists:usort(ActiveUnfit++[FLU])),
{noreply, S#state{active_unfit=NewActive}};
{false, true} ->
{noreply, S#state{active_unfit=ActiveUnfit -- [FLU]}};
NewActive = wrap_active(MyFluName,ActiveUnfit--[FLU]),
{noreply, S#state{active_unfit=NewActive}};
{false, false} ->
{noreply, S}
end;
@ -416,6 +432,18 @@ map_value(Map) ->
map_merge(Map1, Map2) ->
?MAP:merge(Map1, Map2).
wrap_active(MyFluName, L) ->
verbose("FITNESS: ~w has new down list ~w\n", [MyFluName, L]),
L.
verbose(Fmt, Args) ->
case application:get_env(machi, fitness_verbose) of
{ok, true} ->
error_logger:info_msg(Fmt, Args);
_ ->
ok
end.
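
These messages are off by default; they appear only when the fitness_verbose application environment flag is set, e.g.:

    %% At runtime:
    application:set_env(machi, fitness_verbose, true).
    %% Or equivalently in sys.config: {machi, [{fitness_verbose, true}]}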
-ifdef(TEST).
dt_understanding_test() ->


@ -21,7 +21,9 @@
%% @doc The Machi FLU file server + file location sequencer.
%%
%% This module implements only the Machi FLU file server and its
%% implicit sequencer.
%% implicit sequencer together with listener, append server,
%% file management and file proxy processes.
%% Please see the EDoc "Overview" for details about the FLU as a
%% primitive file server process vs. the larger Machi design of a FLU
%% as a sequencer + file server + chain manager group of processes.
@ -54,27 +56,16 @@
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-export([timing_demo_test_COMMENTED_/0, sort_2lines/2]). % Just to suppress warning
-endif. % TEST
-export([start_link/1, stop/1,
update_wedge_state/3, wedge_myself/2]).
-export([make_projection_server_regname/1]).
-export([make_projection_server_regname/1,
ets_table_name/1]).
%% TODO: remove or replace in OTP way after gen_*'ified
-export([main2/4, run_append_server/2,
current_state/1, format_state/1]).
-export([main2/4]).
-record(state, {
flu_name :: atom(),
proj_store :: pid(),
witness = false :: boolean(),
append_pid :: pid(),
wedged = true :: boolean(),
etstab :: ets:tid(),
epoch_id :: 'undefined' | machi_dt:epoch_id(),
props = [] :: list() % proplist
}).
-define(SERVER_CMD_READ_TIMEOUT, 600*1000).
-define(INIT_TIMEOUT, 60*1000).
start_link([{FluName, TcpPort, DataDir}|Rest])
@ -96,33 +87,14 @@ stop(Pid) when is_pid(Pid) ->
error
end.
update_wedge_state(PidSpec, Boolean, EpochId)
when (Boolean == true orelse Boolean == false), is_tuple(EpochId) ->
PidSpec ! {wedge_state_change, Boolean, EpochId}.
update_wedge_state(PidSpec, Boolean, EpochId) ->
machi_flu1_append_server:int_update_wedge_state(PidSpec, Boolean, EpochId).
wedge_myself(PidSpec, EpochId)
when is_tuple(EpochId) ->
PidSpec ! {wedge_myself, EpochId}.
current_state(PidSpec) ->
PidSpec ! {current_state, self()},
%% TODO: Not so rubust f(^^;)
receive
Res -> Res
after
60*1000 -> {error, timeout}
end.
format_state(State) ->
Fields = record_info(fields, state),
[_Name | Values] = tuple_to_list(State),
lists:zip(Fields, Values).
wedge_myself(PidSpec, EpochId) ->
machi_flu1_append_server:int_wedge_myself(PidSpec, EpochId).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
ets_table_name(FluName) when is_atom(FluName) ->
list_to_atom(atom_to_list(FluName) ++ "_epoch").
main2(FluName, TcpPort, DataDir, Props) ->
{SendAppendPidToProj_p, ProjectionPid} =
case proplists:get_value(projection_store_registered_name, Props) of
@ -149,23 +121,16 @@ main2(FluName, TcpPort, DataDir, Props) ->
{true, undefined}
end,
Witness_p = proplists:get_value(witness_mode, Props, false),
S0 = #state{flu_name=FluName,
proj_store=ProjectionPid,
wedged=Wedged_p,
witness=Witness_p,
etstab=ets_table_name(FluName),
epoch_id=EpochId,
props=Props},
{ok, AppendPid} = start_append_server(S0, self()),
{ok, AppendPid} = start_append_server(FluName, Witness_p, Wedged_p, EpochId),
if SendAppendPidToProj_p ->
machi_projection_store:set_wedge_notify_pid(ProjectionPid,
AppendPid);
machi_projection_store:set_wedge_notify_pid(ProjectionPid, AppendPid);
true ->
ok
end,
S1 = S0#state{append_pid=AppendPid},
{ok, ListenerPid} = start_listen_server(TcpPort, DataDir, S1),
{ok, ListenerPid} = start_listen_server(FluName, TcpPort, Witness_p, DataDir,
ets_table_name(FluName), ProjectionPid,
Props),
%% io:format(user, "Listener started: ~w~n", [{FluName, ListenerPid}]),
Config_e = machi_util:make_config_filename(DataDir, "unused"),
@ -176,135 +141,24 @@ main2(FluName, TcpPort, DataDir, Props) ->
ok = filelib:ensure_dir(Projection_e),
put(flu_flu_name, FluName),
put(flu_append_pid, S1#state.append_pid),
put(flu_append_pid, AppendPid),
put(flu_projection_pid, ProjectionPid),
put(flu_listen_pid, ListenerPid),
proc_lib:init_ack({ok, self()}),
receive killme -> ok end,
(catch exit(S1#state.append_pid, kill)),
(catch exit(AppendPid, kill)),
(catch exit(ProjectionPid, kill)),
(catch exit(ListenerPid, kill)),
ok.
start_append_server(S, AckPid) ->
proc_lib:start_link(?MODULE, run_append_server, [AckPid, S], ?INIT_TIMEOUT).
start_append_server(FluName, Witness_p, Wedged_p, EpochId) ->
machi_flu1_subsup:start_append_server(FluName, Witness_p, Wedged_p, EpochId).
start_listen_server(TcpPort, DataDir,
#state{flu_name=FluName, witness=Witness, etstab=EtsTab,
proj_store=ProjStore}=_S) ->
machi_listener_sup:start_listener(FluName, TcpPort, Witness, DataDir,
EtsTab, ProjStore).
run_append_server(FluPid, #state{flu_name=Name,
wedged=Wedged_p,epoch_id=EpochId}=S) ->
%% Reminder: Name is the "main" name of the FLU, i.e., no suffix
register(Name, self()),
TID = ets:new(ets_table_name(Name),
[set, protected, named_table, {read_concurrency, true}]),
ets:insert(TID, {epoch, {Wedged_p, EpochId}}),
proc_lib:init_ack({ok, self()}),
append_server_loop(FluPid, S#state{etstab=TID}).
append_server_loop(FluPid, #state{wedged=Wedged_p,
witness=Witness_p,
epoch_id=OldEpochId, flu_name=FluName}=S) ->
receive
{seq_append, From, _N, _L, _Prefix, _Chunk, _CSum, _Extra, _EpochID}
when Witness_p ->
%% The FLU's machi_flu1_net_server process ought to filter all
%% witness states, but we'll keep this clause for extra
%% paranoia.
From ! witness,
append_server_loop(FluPid, S);
{seq_append, From, _N, _L, _Prefix, _Chunk, _CSum, _Extra, _EpochID}
when Wedged_p ->
From ! wedged,
append_server_loop(FluPid, S);
{seq_append, From, CoC_Namespace, CoC_Locator,
Prefix, Chunk, CSum, Extra, EpochID} ->
%% Old is the one from our state, plain old 'EpochID' comes
%% from the client.
_ = case OldEpochId == EpochID of
true ->
spawn(fun() ->
append_server_dispatch(From, CoC_Namespace, CoC_Locator,
Prefix, Chunk, CSum, Extra,
FluName, EpochID)
end);
false ->
From ! {error, bad_epoch}
end,
append_server_loop(FluPid, S);
{wedge_myself, WedgeEpochId} ->
if not Wedged_p andalso WedgeEpochId == OldEpochId ->
true = ets:insert(S#state.etstab,
{epoch, {true, OldEpochId}}),
%% Tell my chain manager that it might want to react to
%% this new world.
Chmgr = machi_chain_manager1:make_chmgr_regname(FluName),
spawn(fun() ->
catch machi_chain_manager1:trigger_react_to_env(Chmgr)
end),
append_server_loop(FluPid, S#state{wedged=true});
true ->
append_server_loop(FluPid, S)
end;
{wedge_state_change, Boolean, {NewEpoch, _}=NewEpochId} ->
OldEpoch = case OldEpochId of {OldE, _} -> OldE;
undefined -> -1
end,
if NewEpoch >= OldEpoch ->
true = ets:insert(S#state.etstab,
{epoch, {Boolean, NewEpochId}}),
append_server_loop(FluPid, S#state{wedged=Boolean,
epoch_id=NewEpochId});
true ->
append_server_loop(FluPid, S)
end;
{wedge_status, FromPid} ->
#state{wedged=Wedged_p, epoch_id=EpochId} = S,
FromPid ! {wedge_status_reply, Wedged_p, EpochId},
append_server_loop(FluPid, S);
{current_state, FromPid} ->
FromPid ! S;
Else ->
io:format(user, "append_server_loop: WHA? ~p\n", [Else]),
append_server_loop(FluPid, S)
end.
append_server_dispatch(From, CoC_Namespace, CoC_Locator,
Prefix, Chunk, CSum, Extra, FluName, EpochId) ->
Result = case handle_append(CoC_Namespace, CoC_Locator,
Prefix, Chunk, CSum, Extra, FluName, EpochId) of
{ok, File, Offset} ->
{assignment, Offset, File};
Other ->
Other
end,
From ! Result,
exit(normal).
handle_append(_N, _L, _Prefix, <<>>, _Csum, _Extra, _FluName, _EpochId) ->
{error, bad_arg};
handle_append(CoC_Namespace, CoC_Locator,
Prefix, Chunk, Csum, Extra, FluName, EpochId) ->
CoC = {coc, CoC_Namespace, CoC_Locator},
Res = machi_flu_filename_mgr:find_or_make_filename_from_prefix(
FluName, EpochId, {prefix, Prefix}, CoC),
case Res of
{file, F} ->
case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, F}) of
{ok, Pid} ->
{Tag, CS} = machi_util:unmake_tagged_csum(Csum),
Meta = [{client_csum_tag, Tag}, {client_csum, CS}],
machi_file_proxy:append(Pid, Meta, Extra, Chunk);
{error, trimmed} = E ->
E
end;
Error ->
Error
end.
start_listen_server(FluName, TcpPort, Witness_p, DataDir, EtsTab, ProjectionPid,
Props) ->
machi_flu1_subsup:start_listener(FluName, TcpPort, Witness_p, DataDir,
EtsTab, ProjectionPid, Props).
%% This is the name of the projection store that is spawned by the
%% *flu*, for use primarily in testing scenarios. In normal use, we
@ -316,6 +170,8 @@ handle_append(CoC_Namespace, CoC_Locator,
make_projection_server_regname(BaseName) ->
list_to_atom(atom_to_list(BaseName) ++ "_pstore").
ets_table_name(FluName) when is_atom(FluName) ->
list_to_atom(atom_to_list(FluName) ++ "_epoch").
-ifdef(TEST).
@ -357,7 +213,7 @@ timing_demo_test2() ->
lists:foldl(fun(X, _) ->
B = machi_checksums:encode_csum_file_entry_hex(X, 100, CSum),
%% file:write(ZZZ, [B, 10]),
machi_checksums:decode_csum_file_entry_hex(list_to_binary(B))
decode_csum_file_entry_hex(list_to_binary(B))
end, x, Xs)
end),
io:format(user, "~.3f sec\n", [HexUSec / 1000000]),


@ -0,0 +1,193 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc Machi FLU1 append serialization server process
-module(machi_flu1_append_server).
-behavior(gen_server).
-include("machi.hrl").
-include("machi_projection.hrl").
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-endif. % TEST
-export([start_link/4]).
-export([init/1]).
-export([handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3]).
-export([int_update_wedge_state/3, int_wedge_myself/2]).
-export([current_state/1, format_state/1]).
-record(state, {
flu_name :: atom(),
witness = false :: boolean(),
wedged = true :: boolean(),
etstab :: ets:tid(),
epoch_id :: 'undefined' | machi_dt:epoch_id()
}).
-define(INIT_TIMEOUT, 60*1000).
-define(CALL_TIMEOUT, 60*1000).
-spec start_link(pv1_server(), boolean(), boolean(),
undefined | machi_dt:epoch_id()) -> {ok, pid()}.
start_link(Fluname, Witness_p, Wedged_p, EpochId) ->
%% Reminder: Name is the "main" name of the FLU, i.e., no suffix
gen_server:start_link({local, Fluname},
?MODULE, [Fluname, Witness_p, Wedged_p, EpochId],
[{timeout, ?INIT_TIMEOUT}]).
-spec current_state(atom() | pid()) -> term().
current_state(PidSpec) ->
gen_server:call(PidSpec, current_state, ?CALL_TIMEOUT).
format_state(State) ->
Fields = record_info(fields, state),
[_Name | Values] = tuple_to_list(State),
lists:zip(Fields, Values).
int_update_wedge_state(PidSpec, Boolean, EpochId)
when is_boolean(Boolean), is_tuple(EpochId) ->
gen_server:cast(PidSpec, {wedge_state_change, Boolean, EpochId}).
int_wedge_myself(PidSpec, EpochId)
when is_tuple(EpochId) ->
gen_server:cast(PidSpec, {wedge_myself, EpochId}).
init([Fluname, Witness_p, Wedged_p, EpochId]) ->
TID = ets:new(machi_flu1:ets_table_name(Fluname),
[set, protected, named_table, {read_concurrency, true}]),
ets:insert(TID, {epoch, {Wedged_p, EpochId}}),
{ok, #state{flu_name=Fluname, witness=Witness_p, wedged=Wedged_p,
etstab=TID, epoch_id=EpochId}}.
handle_call({seq_append, _From2, _NSInfo, _EpochID, _Prefix, _Chunk, _TCSum, _Opts},
_From, #state{witness=true}=S) ->
%% The FLU's machi_flu1_net_server process ought to filter all
%% witness states, but we'll keep this clause for extra
%% paranoia.
{reply, witness, S};
handle_call({seq_append, _From2, _NSInfo, _EpochID, _Prefix, _Chunk, _TCSum, _Opts},
_From, #state{wedged=true}=S) ->
{reply, wedged, S};
handle_call({seq_append, _From2, NSInfo, EpochID,
Prefix, Chunk, TCSum, Opts},
From, #state{flu_name=FluName, epoch_id=OldEpochId}=S) ->
%% Old is the one from our state, plain old 'EpochID' comes
%% from the client.
_ = case OldEpochId of
EpochID ->
spawn(fun() ->
append_server_dispatch(From, NSInfo,
Prefix, Chunk, TCSum, Opts,
FluName, EpochID)
end),
{noreply, S};
_ ->
{reply, {error, bad_epoch}, S}
end;
%% TODO: Who sends this message?
handle_call(wedge_status, _From,
#state{wedged=Wedged_p, epoch_id=EpochId} = S) ->
{reply, {wedge_status_reply, Wedged_p, EpochId}, S};
handle_call(current_state, _From, S) ->
{reply, S, S};
handle_call(Else, From, S) ->
io:format(user, "~s:handle_call: WHA? from=~w ~w\n", [?MODULE, From, Else]),
{noreply, S}.
handle_cast({wedge_myself, WedgeEpochId},
#state{flu_name=FluName, wedged=Wedged_p, epoch_id=OldEpochId}=S) ->
if not Wedged_p andalso WedgeEpochId == OldEpochId ->
true = ets:insert(S#state.etstab,
{epoch, {true, OldEpochId}}),
%% Tell my chain manager that it might want to react to
%% this new world.
Chmgr = machi_chain_manager1:make_chmgr_regname(FluName),
spawn(fun() ->
catch machi_chain_manager1:trigger_react_to_env(Chmgr)
end),
{noreply, S#state{wedged=true}};
true ->
{noreply, S}
end;
handle_cast({wedge_state_change, Boolean, {NewEpoch, _}=NewEpochId},
#state{epoch_id=OldEpochId}=S) ->
OldEpoch = case OldEpochId of {OldE, _} -> OldE;
undefined -> -1
end,
if NewEpoch >= OldEpoch ->
true = ets:insert(S#state.etstab,
{epoch, {Boolean, NewEpochId}}),
{noreply, S#state{wedged=Boolean, epoch_id=NewEpochId}};
true ->
{noreply, S}
end;
handle_cast(Else, S) ->
io:format(user, "~s:handle_cast: WHA? ~p\n", [?MODULE, Else]),
{noreply, S}.
handle_info(Else, S) ->
io:format(user, "~s:handle_info: WHA? ~p\n", [?MODULE, Else]),
{noreply, S}.
terminate(normal, _S) ->
ok;
terminate(Reason, _S) ->
lager:warning("~s:terminate: ~w", [?MODULE, Reason]),
ok.
code_change(_OldVsn, S, _Extra) ->
{ok, S}.
append_server_dispatch(From, NSInfo,
Prefix, Chunk, TCSum, Opts, FluName, EpochId) ->
Result = case handle_append(NSInfo,
Prefix, Chunk, TCSum, Opts, FluName, EpochId) of
{ok, File, Offset} ->
{assignment, Offset, File};
Other ->
Other
end,
_ = gen_server:reply(From, Result),
ok.
handle_append(NSInfo,
Prefix, Chunk, TCSum, Opts, FluName, EpochId) ->
Res = machi_flu_filename_mgr:find_or_make_filename_from_prefix(
FluName, EpochId, {prefix, Prefix}, NSInfo),
case Res of
{file, F} ->
case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, F}) of
{ok, Pid} ->
{Tag, CS} = machi_util:unmake_tagged_csum(TCSum),
Meta = [{client_csum_tag, Tag}, {client_csum, CS}],
Extra = Opts#append_opts.chunk_extra,
machi_file_proxy:append(Pid, Meta, Extra, Chunk);
{error, trimmed} = E ->
E
end;
Error ->
Error
end.
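
A sketch of how a caller would drive this server; the tuple shape follows the seq_append handle_call clauses above (the second element is an unused placeholder here, and the timeout is illustrative). start_link/4 registers the server under the FLU's main name, so the name itself can be the call target:

    gen_server:call(FluName,
                    {seq_append, self(), NSInfo, EpochID,
                     <<"prefix">>, Chunk, TCSum, #append_opts{}},
                    10*1000).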


@ -38,6 +38,71 @@
%% TODO This EDoc was written first, and the EDoc and also `-type' and
%% `-spec' definitions for {@link machi_proxy_flu1_client} and {@link
%% machi_cr_client} must be improved.
%%
%% == Client API implementation notes ==
%%
%% At the moment, there are several modules that implement various
%% subsets of the Machi API. The table below attempts to show how and
%% why they differ.
%%
%% ```
%% |--------------------------+-------+-----+------+------+-------+----------------|
%% | | PB | | # | | Conn | Epoch & NS |
%% | Module name | Level | CR? | FLUS | Impl | Life? | version aware? |
%% |--------------------------+-------+-----+------+------+-------+----------------|
%% | machi_pb_high_api_client | high | yes | many | proc | long | no |
%% | machi_cr_client | low | yes | many | proc | long | no |
%% | machi_proxy_flu1_client | low | no | 1 | proc | long | yes |
%% | machi_flu1_client | low | no | 1 | lib | short | yes |
%% |--------------------------+-------+-----+------+------+-------+----------------|
%% '''
%%
%% In terms of use and API layering, the table rows are in highest`->'lowest
%% order: each level calls the layer immediately below it.
%%
%% <dl>
%% <dt> <b> PB Level</b> </dt>
%% <dd> The Protocol Buffers API is divided logically into two levels,
%% "low" and "high". The low-level protocol is used for intra-chain
%% communication. The high-level protocol is used for clients outside
%% of a Machi chain or Machi cluster of chains.
%% </dd>
%% <dt> <b> CR?</b> </dt>
%% <dd> Does this API support (directly or indirectly) Chain
%% Replication? If `no', then the API has no awareness of multiple
%% replicas of any file or file chunk; unaware clients can only
%% perform operations at a single Machi FLU's file service or
%% projection store service.
%% </dd>
%% <dt> <b> # FLUs</b> </dt>
%% <dd> How many FLUs does this API layer communicate with
%% simultaneously? Note that there is a one-to-one correspondence
%% between this value and the "CR?" column's value.
%% </dd>
%% <dt> <b> Impl</b> </dt>
%% <dd> Implementation: library-only or an Erlang process,
%% e.g., `gen_server'.
%% </dd>
%% <dt> <b> Conn Life?</b> </dt>
%% <dd> Expected TCP session connection life: short or long. At the
%% lowest level, the {@link machi_flu1_client} API implementation takes
%% no effort to reconnect to a remote FLU when its single TCP session
%% is broken. For long-lived connection life APIs, the server side will
%% automatically attempt to reconnect to remote FLUs when a TCP session
%% is broken.
%% </dd>
%% <dt> <b> Epoch &amp; NS version aware?</b> </dt>
%% <dd> Are clients of this API responsible for knowing a chain's EpochID
%% and namespace version numbers? If `no', then the server side of the
%% API will automatically attempt to discover/re-discover the EpochID and
%% namespace version numbers whenever they change.
%% </dd>
%% </dl>
%%
%% The only protocol that we expect to be used by entities outside of
%% a single Machi chain or a multi-chain cluster is the "high"
%% Protocol Buffers API. The {@link machi_pb_high_api_client} module
%% is an Erlang reference implementation of this PB API.
-module(machi_flu1_client).
@ -50,16 +115,15 @@
-include_lib("pulse_otp/include/pulse_otp.hrl").
-endif.
-define(HARD_TIMEOUT, 2500).
-define(SHORT_TIMEOUT, 2500).
-define(LONG_TIMEOUT, (60*1000)).
-export([
%% File API
append_chunk/4, append_chunk/5,
append_chunk/6, append_chunk/7,
append_chunk_extra/5, append_chunk_extra/6,
append_chunk_extra/7, append_chunk_extra/8,
read_chunk/6, read_chunk/7,
checksum_list/3, checksum_list/4,
append_chunk/8, append_chunk/9,
read_chunk/7, read_chunk/8,
checksum_list/2, checksum_list/3,
list_files/2, list_files/3,
wedge_status/1, wedge_status/2,
@ -81,190 +145,113 @@
]).
%% For "internal" replication only.
-export([
write_chunk/5, write_chunk/6,
trim_chunk/5,
write_chunk/7, write_chunk/8,
trim_chunk/6,
delete_migration/3, delete_migration/4,
trunc_hack/3, trunc_hack/4
]).
-type port_wrap() :: {w,atom(),term()}.
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
-spec append_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_prefix(), machi_dt:chunk()) ->
-spec append_chunk(port_wrap(),
'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(),
machi_dt:file_prefix(), machi_dt:chunk(),
machi_dt:chunk_csum()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk(Sock, EpochID, Prefix, Chunk) ->
append_chunk2(Sock, EpochID,
?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR,
Prefix, Chunk, 0).
append_chunk(Sock, NSInfo, EpochID, Prefix, Chunk, CSum) ->
append_chunk(Sock, NSInfo, EpochID, Prefix, Chunk, CSum,
#append_opts{}, ?LONG_TIMEOUT).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
%% with `Prefix' and also request an additional `Extra' bytes.
%%
%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then
%% the file offsets that follow `Chunk''s position for the following 4K will
%% be reserved by the file sequencer for later write(s) by the
%% `write_chunk()' API.
-spec append_chunk(machi_dt:inet_host(), machi_dt:inet_port(),
machi_dt:epoch_id(), machi_dt:file_prefix(), machi_dt:chunk()) ->
'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(),
machi_dt:file_prefix(), machi_dt:chunk(),
machi_dt:chunk_csum()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk(Host, TcpPort, EpochID, Prefix, Chunk) ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
try
append_chunk2(Sock, EpochID,
?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR,
Prefix, Chunk, 0)
after
disconnect(Sock)
end.
append_chunk(Host, TcpPort, NSInfo, EpochID, Prefix, Chunk, CSum) ->
append_chunk(Host, TcpPort, NSInfo, EpochID, Prefix, Chunk, CSum,
#append_opts{}, ?LONG_TIMEOUT).
-spec append_chunk(port_wrap(),
'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(),
machi_dt:file_prefix(), machi_dt:chunk(),
machi_dt:chunk_csum(), machi_dt:append_opts(), timeout()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk(Sock, NSInfo0, EpochID, Prefix, Chunk, CSum, Opts, Timeout) ->
NSInfo = machi_util:ns_info_default(NSInfo0),
append_chunk2(Sock, NSInfo, EpochID, Prefix, Chunk, CSum, Opts, Timeout).
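
A sketch of the consolidated arity-8 call; passing 'undefined' for NSInfo picks up the defaults via machi_util:ns_info_default/1, and the success value follows machi_dt:chunk_pos(). The prefix, checksum, and timeout values here are illustrative:

    {ok, {Offset, Size, File}} =
        machi_flu1_client:append_chunk(Sock, undefined, EpochID,
                                       <<"pre">>, Chunk, CSum,
                                       #append_opts{}, 15*1000).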
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
-spec append_chunk(port_wrap(), machi_dt:epoch_id(),
machi_dt:coc_namespace(), machi_dt:coc_locator(),
machi_dt:file_prefix(), machi_dt:chunk()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk(Sock, EpochID, CoC_Namespace, CoC_Locator, Prefix, Chunk) ->
append_chunk2(Sock, EpochID,
CoC_Namespace, CoC_Locator,
Prefix, Chunk, 0).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
%% with `Prefix' and also request an additional `Extra' bytes.
%%
%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then
%% the file offsets that follow `Chunk''s position for the following 4K will
%% be reserved by the file sequencer for later write(s) by the
%% `write_chunk()' API.
-spec append_chunk(machi_dt:inet_host(), machi_dt:inet_port(),
machi_dt:epoch_id(),
machi_dt:coc_namespace(), machi_dt:coc_locator(),
machi_dt:file_prefix(), machi_dt:chunk()) ->
'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(),
machi_dt:file_prefix(), machi_dt:chunk(),
machi_dt:chunk_csum(), machi_dt:append_opts(), timeout()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk(Host, TcpPort, EpochID, CoC_Namespace, CoC_Locator, Prefix, Chunk) ->
append_chunk(Host, TcpPort, NSInfo0, EpochID,
Prefix, Chunk, CSum, Opts, Timeout) ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
try
append_chunk2(Sock, EpochID,
CoC_Namespace, CoC_Locator,
Prefix, Chunk, 0)
after
disconnect(Sock)
end.
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix' and also request an additional `Extra' bytes.
%%
%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then
%% the file offsets that follow `Chunk''s position for the following 4K will
%% be reserved by the file sequencer for later write(s) by the
%% `write_chunk()' API.
-spec append_chunk_extra(port_wrap(), machi_dt:epoch_id(),
machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk_extra(Sock, EpochID, Prefix, Chunk, ChunkExtra)
when is_integer(ChunkExtra), ChunkExtra >= 0 ->
append_chunk2(Sock, EpochID,
?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR,
Prefix, Chunk, ChunkExtra).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix' and also request an additional `Extra' bytes.
%%
%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then
%% the file offsets that follow `Chunk''s position for the following 4K will
%% be reserved by the file sequencer for later write(s) by the
%% `write_chunk()' API.
-spec append_chunk_extra(machi_dt:inet_host(), machi_dt:inet_port(),
machi_dt:epoch_id(), machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk_extra(Host, TcpPort, EpochID, Prefix, Chunk, ChunkExtra)
when is_integer(ChunkExtra), ChunkExtra >= 0 ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
try
append_chunk2(Sock, EpochID,
?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR,
Prefix, Chunk, ChunkExtra)
after
disconnect(Sock)
end.
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix' and also request an additional `Extra' bytes.
%%
%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then
%% the file offsets that follow `Chunk''s position for the following 4K will
%% be reserved by the file sequencer for later write(s) by the
%% `write_chunk()' API.
-spec append_chunk_extra(port_wrap(), machi_dt:epoch_id(),
machi_dt:coc_namespace(), machi_dt:coc_locator(),
machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk_extra(Sock, EpochID, CoC_Namespace, CoC_Locator,
Prefix, Chunk, ChunkExtra)
when is_integer(ChunkExtra), ChunkExtra >= 0 ->
append_chunk2(Sock, EpochID,
CoC_Namespace, CoC_Locator,
Prefix, Chunk, ChunkExtra).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix' and also request an additional `Extra' bytes.
%%
%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then
%% the file offsets that follow `Chunk''s position for the following 4K will
%% be reserved by the file sequencer for later write(s) by the
%% `write_chunk()' API.
-spec append_chunk_extra(machi_dt:inet_host(), machi_dt:inet_port(),
machi_dt:epoch_id(),
machi_dt:coc_namespace(), machi_dt:coc_locator(),
machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk_extra(Host, TcpPort, EpochID,
CoC_Namespace, CoC_Locator,
Prefix, Chunk, ChunkExtra)
when is_integer(ChunkExtra), ChunkExtra >= 0 ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
try
append_chunk2(Sock, EpochID,
CoC_Namespace, CoC_Locator,
Prefix, Chunk, ChunkExtra)
NSInfo = machi_util:ns_info_default(NSInfo0),
append_chunk2(Sock, NSInfo, EpochID,
Prefix, Chunk, CSum, Opts, Timeout)
after
disconnect(Sock)
end.
%% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
-spec read_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size(),
proplists:proplist()) ->
{ok, machi_dt:chunk_s()} |
-spec read_chunk(port_wrap(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size(),
machi_dt:read_opts_x()) ->
{ok, {[machi_dt:chunk_summary()], [machi_dt:chunk_pos()]}} |
{error, machi_dt:error_general() | 'not_written' | 'partial_read'} |
{error, term()}.
read_chunk(Sock, EpochID, File, Offset, Size, Opts)
read_chunk(Sock, NSInfo0, EpochID, File, Offset, Size, Opts0)
when Offset >= ?MINIMUM_OFFSET, Size >= 0 ->
read_chunk2(Sock, EpochID, File, Offset, Size, Opts).
NSInfo = machi_util:ns_info_default(NSInfo0),
Opts = machi_util:read_opts_default(Opts0),
read_chunk2(Sock, NSInfo, EpochID, File, Offset, Size, Opts).
%% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
-spec read_chunk(machi_dt:inet_host(), machi_dt:inet_port(), machi_dt:epoch_id(),
-spec read_chunk(machi_dt:inet_host(), machi_dt:inet_port(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(),
machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size(),
proplists:proplist()) ->
{ok, machi_dt:chunk_s()} |
machi_dt:read_opts_x()) ->
{ok, [machi_dt:chunk_summary()]} |
{error, machi_dt:error_general() | 'not_written' | 'partial_read'} |
{error, term()}.
read_chunk(Host, TcpPort, EpochID, File, Offset, Size, Opts)
read_chunk(Host, TcpPort, NSInfo0, EpochID, File, Offset, Size, Opts0)
when Offset >= ?MINIMUM_OFFSET, Size >= 0 ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
NSInfo = machi_util:ns_info_default(NSInfo0),
Opts = machi_util:read_opts_default(Opts0),
try
read_chunk2(Sock, EpochID, File, Offset, Size, Opts)
read_chunk2(Sock, NSInfo, EpochID, File, Offset, Size, Opts)
after
disconnect(Sock)
end.
%% @doc Fetch the list of chunk checksums for `File'.
-spec checksum_list(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name()) ->
-spec checksum_list(port_wrap(), machi_dt:file_name()) ->
{ok, binary()} |
{error, machi_dt:error_general() | 'no_such_file' | 'partial_read'} |
{error, term()}.
checksum_list(Sock, EpochID, File) ->
checksum_list2(Sock, EpochID, File).
checksum_list(Sock, File) ->
checksum_list2(Sock, File).
%% @doc Fetch the list of chunk checksums for `File'.
%%
@@ -288,13 +275,13 @@ checksum_list(Sock, EpochID, File) ->
%% Details of the encoding used inside the `binary()' blob can be found
%% in the EDoc comments for {@link machi_flu1:decode_csum_file_entry/1}.
-spec checksum_list(machi_dt:inet_host(), machi_dt:inet_port(), machi_dt:epoch_id(), machi_dt:file_name()) ->
-spec checksum_list(machi_dt:inet_host(), machi_dt:inet_port(), machi_dt:file_name()) ->
{ok, binary()} |
{error, machi_dt:error_general() | 'no_such_file'} | {error, term()}.
checksum_list(Host, TcpPort, EpochID, File) when is_integer(TcpPort) ->
checksum_list(Host, TcpPort, File) when is_integer(TcpPort) ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
try
checksum_list2(Sock, EpochID, File)
checksum_list2(Sock, File)
after
disconnect(Sock)
end.
@@ -321,7 +308,7 @@ list_files(Host, TcpPort, EpochID) when is_integer(TcpPort) ->
%% @doc Fetch the wedge status from the remote FLU.
-spec wedge_status(port_wrap()) ->
{ok, {boolean(), machi_dt:epoch_id()}} | {error, term()}.
{ok, {boolean(), machi_dt:epoch_id(), machi_dt:namespace_version(),machi_dt:namespace()}} | {error, term()}.
wedge_status(Sock) ->
wedge_status2(Sock).
@@ -329,7 +316,7 @@ wedge_status(Sock) ->
%% @doc Fetch the wedge status from the remote FLU.
-spec wedge_status(machi_dt:inet_host(), machi_dt:inet_port()) ->
{ok, {boolean(), machi_dt:epoch_id()}} | {error, term()}.
{ok, {boolean(), machi_dt:epoch_id(), machi_dt:namespace_version(),machi_dt:namespace()}} | {error, term()}.
wedge_status(Host, TcpPort) when is_integer(TcpPort) ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
try
@@ -540,23 +527,25 @@ disconnect(_) ->
%% @doc Restricted API: Write a chunk of already-sequenced data to
%% `File' at `Offset'.
-spec write_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk()) ->
-spec write_chunk(port_wrap(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk(), machi_dt:chunk_csum()) ->
ok | {error, machi_dt:error_general()} | {error, term()}.
write_chunk(Sock, EpochID, File, Offset, Chunk)
write_chunk(Sock, NSInfo0, EpochID, File, Offset, Chunk, CSum)
when Offset >= ?MINIMUM_OFFSET ->
write_chunk2(Sock, EpochID, File, Offset, Chunk).
NSInfo = machi_util:ns_info_default(NSInfo0),
write_chunk2(Sock, NSInfo, EpochID, File, Offset, Chunk, CSum).
%% @doc Restricted API: Write a chunk of already-sequenced data to
%% `File' at `Offset'.
-spec write_chunk(machi_dt:inet_host(), machi_dt:inet_port(),
machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk()) ->
'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk(), machi_dt:chunk_csum()) ->
ok | {error, machi_dt:error_general()} | {error, term()}.
write_chunk(Host, TcpPort, EpochID, File, Offset, Chunk)
write_chunk(Host, TcpPort, NSInfo0, EpochID, File, Offset, Chunk, CSum)
when Offset >= ?MINIMUM_OFFSET ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
try
write_chunk2(Sock, EpochID, File, Offset, Chunk)
NSInfo = machi_util:ns_info_default(NSInfo0),
write_chunk2(Sock, NSInfo, EpochID, File, Offset, Chunk, CSum)
after
disconnect(Sock)
end.
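A restricted-API write sketch (values hypothetical); per the CSum0
normalization in write_chunk2() below, passing `<<>>' means "no client
checksum":

    ok = machi_flu1_client:write_chunk(Host, TcpPort, undefined, EpochID,
                                       <<"some_file">>, 1024,
                                       <<"raw bytes">>, <<>>).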
@@ -564,16 +553,18 @@ write_chunk(Host, TcpPort, EpochID, File, Offset, Chunk)
%% @doc Restricted API: Write a chunk of already-sequenced data to
%% `File' at `Offset'.
-spec trim_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size()) ->
-spec trim_chunk(port_wrap(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size()) ->
ok | {error, machi_dt:error_general()} | {error, term()}.
trim_chunk(Sock, EpochID, File0, Offset, Size)
trim_chunk(Sock, NSInfo0, EpochID, File0, Offset, Size)
when Offset >= ?MINIMUM_OFFSET ->
ReqID = <<"id">>,
NSInfo = machi_util:ns_info_default(NSInfo0),
#ns_info{version=NSVersion, name=NS} = NSInfo,
File = machi_util:make_binary(File0),
true = (Offset >= ?MINIMUM_OFFSET),
Req = machi_pb_translate:to_pb_request(
ReqID,
{low_trim_chunk, EpochID, File, Offset, Size, 0}),
{low_trim_chunk, NSVersion, NS, EpochID, File, Offset, Size, 0}),
do_pb_request_common(Sock, ReqID, Req).
%% @doc Restricted API: Delete a file after it has been successfully
@@ -620,83 +611,88 @@ trunc_hack(Host, TcpPort, EpochID, File) when is_integer(TcpPort) ->
%%%%%%%%%%%%%%%%%%%%%%%%%%%
read_chunk2(Sock, EpochID, File0, Offset, Size, Opts) ->
read_chunk2(Sock, NSInfo, EpochID, File0, Offset, Size, Opts) ->
ReqID = <<"id">>,
#ns_info{version=NSVersion, name=NS} = NSInfo,
File = machi_util:make_binary(File0),
Req = machi_pb_translate:to_pb_request(
ReqID,
{low_read_chunk, EpochID, File, Offset, Size, Opts}),
{low_read_chunk, NSVersion, NS, EpochID, File, Offset, Size, Opts}),
do_pb_request_common(Sock, ReqID, Req).
append_chunk2(Sock, EpochID, CoC_Namespace, CoC_Locator,
Prefix0, Chunk0, ChunkExtra) ->
append_chunk2(Sock, NSInfo, EpochID,
Prefix0, Chunk, CSum0, Opts, Timeout) ->
ReqID = <<"id">>,
{Chunk, CSum_tag, CSum} =
case Chunk0 of
X when is_binary(X) ->
{Chunk0, ?CSUM_TAG_NONE, <<>>};
{ChunkCSum, Chk} ->
{Tag, CS} = machi_util:unmake_tagged_csum(ChunkCSum),
{Chk, Tag, CS}
end,
Prefix = machi_util:make_binary(Prefix0),
{CSum_tag, CSum} = case CSum0 of
<<>> ->
{?CSUM_TAG_NONE, <<>>};
{_Tag, _CS} ->
CSum0;
B when is_binary(B) ->
machi_util:unmake_tagged_csum(CSum0)
end,
#ns_info{version=NSVersion, name=NS, locator=NSLocator} = NSInfo,
%% NOTE: The tuple position of NSLocator is a bit odd, because EpochID
%% _must_ be in the 4th position (as NSV & NS must be in 2nd & 3rd).
Req = machi_pb_translate:to_pb_request(
ReqID,
{low_append_chunk, EpochID, CoC_Namespace, CoC_Locator,
Prefix, Chunk, CSum_tag, CSum, ChunkExtra}),
do_pb_request_common(Sock, ReqID, Req).
{low_append_chunk, NSVersion, NS, EpochID, NSLocator,
Prefix, Chunk, CSum_tag, CSum, Opts}),
do_pb_request_common(Sock, ReqID, Req, true, Timeout).
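The CSum0 case above accepts three input shapes. A standalone sketch of the
normalization (illustrative only; the real code defers to
machi_util:unmake_tagged_csum/1, whose tag-byte layout matches the
<<Tag:8, CSum/binary>> construction used elsewhere in this diff):

    normalize_csum(<<>>) ->
        {none, <<>>};                   % no client-supplied checksum
    normalize_csum({_Tag, _CS}=Pair) ->
        Pair;                           % already split into {Tag, CSum}
    normalize_csum(<<Tag:8, CS/binary>>) ->
        {Tag, CS}.                      % tagged binary: split it ourselves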
write_chunk2(Sock, EpochID, File0, Offset, Chunk0) ->
write_chunk2(Sock, NSInfo, EpochID, File0, Offset, Chunk, CSum0) ->
ReqID = <<"id">>,
#ns_info{version=NSVersion, name=NS} = NSInfo,
File = machi_util:make_binary(File0),
true = (Offset >= ?MINIMUM_OFFSET),
{Chunk, CSum_tag, CSum} =
case Chunk0 of
X when is_binary(X) ->
{Chunk0, ?CSUM_TAG_NONE, <<>>};
{ChunkCSum, Chk} ->
{Tag, CS} = machi_util:unmake_tagged_csum(ChunkCSum),
{Chk, Tag, CS}
end,
{CSum_tag, CSum} = case CSum0 of
<<>> ->
{?CSUM_TAG_NONE, <<>>};
{_Tag, _CS} ->
CSum0;
B when is_binary(B) ->
machi_util:unmake_tagged_csum(CSum0)
end,
Req = machi_pb_translate:to_pb_request(
ReqID,
{low_write_chunk, EpochID, File, Offset, Chunk, CSum_tag, CSum}),
{low_write_chunk, NSVersion, NS, EpochID, File, Offset, Chunk, CSum_tag, CSum}),
do_pb_request_common(Sock, ReqID, Req).
list2(Sock, EpochID) ->
ReqID = <<"id">>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_list_files, EpochID}),
ReqID, {low_skip_wedge, {low_list_files, EpochID}}),
do_pb_request_common(Sock, ReqID, Req).
wedge_status2(Sock) ->
ReqID = <<"id">>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_wedge_status, undefined}),
ReqID, {low_skip_wedge, {low_wedge_status}}),
do_pb_request_common(Sock, ReqID, Req).
echo2(Sock, Message) ->
ReqID = <<"id">>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_echo, undefined, Message}),
ReqID, {low_skip_wedge, {low_echo, Message}}),
do_pb_request_common(Sock, ReqID, Req).
checksum_list2(Sock, EpochID, File) ->
checksum_list2(Sock, File) ->
ReqID = <<"id">>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_checksum_list, EpochID, File}),
ReqID, {low_skip_wedge, {low_checksum_list, File}}),
do_pb_request_common(Sock, ReqID, Req).
delete_migration2(Sock, EpochID, File) ->
ReqID = <<"id">>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_delete_migration, EpochID, File}),
ReqID, {low_skip_wedge, {low_delete_migration, EpochID, File}}),
do_pb_request_common(Sock, ReqID, Req).
trunc_hack2(Sock, EpochID, File) ->
ReqID = <<"id-trunc">>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_trunc_hack, EpochID, File}),
ReqID, {low_skip_wedge, {low_trunc_hack, EpochID, File}}),
do_pb_request_common(Sock, ReqID, Req).
get_latest_epochid2(Sock, ProjType) ->
@@ -739,18 +735,18 @@ kick_projection_reaction2(Sock, _Options) ->
ReqID = <<42>>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_proj, {kick_projection_reaction}}),
do_pb_request_common(Sock, ReqID, Req, false).
do_pb_request_common(Sock, ReqID, Req, false, ?LONG_TIMEOUT).
do_pb_request_common(Sock, ReqID, Req) ->
do_pb_request_common(Sock, ReqID, Req, true).
do_pb_request_common(Sock, ReqID, Req, true, ?LONG_TIMEOUT).
do_pb_request_common(Sock, ReqID, Req, GetReply_p) ->
do_pb_request_common(Sock, ReqID, Req, GetReply_p, Timeout) ->
erase(bad_sock),
try
ReqBin = list_to_binary(machi_pb:encode_mpb_ll_request(Req)),
ok = w_send(Sock, ReqBin),
if GetReply_p ->
case w_recv(Sock, 0) of
case w_recv(Sock, 0, Timeout) of
{ok, RespBin} ->
Resp = machi_pb:decode_mpb_ll_response(RespBin),
{ReqID2, Reply} = machi_pb_translate:from_pb_response(Resp),
@@ -796,7 +792,7 @@ w_connect(#p_srvr{proto_mod=?MODULE, address=Host, port=Port, props=Props}=_P)->
case proplists:get_value(session_proto, Props, tcp) of
tcp ->
put(xxx, goofus),
Sock = machi_util:connect(Host, Port, ?HARD_TIMEOUT),
Sock = machi_util:connect(Host, Port, ?SHORT_TIMEOUT),
put(xxx, Sock),
ok = inet:setopts(Sock, ?PB_PACKET_OPTS),
{w,tcp,Sock};
@@ -820,8 +816,8 @@ w_close({w,tcp,Sock}) ->
catch gen_tcp:close(Sock),
ok.
w_recv({w,tcp,Sock}, Amt) ->
gen_tcp:recv(Sock, Amt, ?HARD_TIMEOUT).
w_recv({w,tcp,Sock}, Amt, Timeout) ->
gen_tcp:recv(Sock, Amt, Timeout).
w_send({w,tcp,Sock}, IoData) ->
gen_tcp:send(Sock, IoData).


@@ -66,19 +66,25 @@
flu_name :: pv1_server(),
%% Used in server_wedge_status to lookup the table
epoch_tab :: ets:tab(),
%% Clustering: cluster map version number
namespace_version = 0 :: machi_dt:namespace_version(),
%% Clustering: my (and my chain's) assignment to a specific namespace
namespace = <<>> :: machi_dt:namespace(),
%% High mode only
high_clnt :: pid(),
%% anything you want
props = [] :: list() % proplist
props = [] :: proplists:proplist()
}).
-type socket() :: any().
-type state() :: #state{}.
-spec start_link(ranch:ref(), socket(), module(), [term()]) -> {ok, pid()}.
start_link(Ref, Socket, Transport, [FluName, Witness, DataDir, EpochTab, ProjStore]) ->
start_link(Ref, Socket, Transport, [FluName, Witness, DataDir, EpochTab, ProjStore, Props]) ->
NS = proplists:get_value(namespace, Props, <<>>),
true = is_binary(NS),
proc_lib:start_link(?MODULE, init, [#state{ref=Ref,
socket=Socket,
transport=Transport,
@@ -86,7 +92,9 @@ start_link(Ref, Socket, Transport, [FluName, Witness, DataDir, EpochTab, ProjSto
witness=Witness,
data_dir=DataDir,
epoch_tab=EpochTab,
proj_store=ProjStore}]).
proj_store=ProjStore,
namespace=NS,
props=Props}]).
-spec init(state()) -> no_return().
init(#state{ref=Ref, socket=Socket, transport=Transport}=State) ->
@@ -209,44 +217,51 @@ do_pb_ll_request(#mpb_ll_request{req_id=ReqID}, #state{pb_mode=high}=S) ->
{machi_pb_translate:to_pb_response(ReqID, unused, Result), S};
do_pb_ll_request(PB_request, S) ->
Req = machi_pb_translate:from_pb_request(PB_request),
%% io:format(user, "[~w] do_pb_ll_request Req: ~w~n", [S#state.flu_name, Req]),
{ReqID, Cmd, Result, S2} =
case Req of
{RqID, {LowCmd, _}=Cmd0}
when LowCmd =:= low_proj;
LowCmd =:= low_wedge_status;
LowCmd =:= low_list_files ->
{RqID, {low_skip_wedge, LowSubCmd}=Cmd0} ->
%% Skip wedge check for these unprivileged commands
{Rs, NewS} = do_pb_ll_request3(LowSubCmd, S),
{RqID, Cmd0, Rs, NewS};
{RqID, {low_proj, _LowSubCmd}=Cmd0} ->
{Rs, NewS} = do_pb_ll_request3(Cmd0, S),
{RqID, Cmd0, Rs, NewS};
{RqID, Cmd0} ->
EpochID = element(2, Cmd0), % by common convention
{Rs, NewS} = do_pb_ll_request2(EpochID, Cmd0, S),
%% All remaining must have NSVersion, NS, & EpochID at next pos
NSVersion = element(2, Cmd0),
NS = element(3, Cmd0),
EpochID = element(4, Cmd0),
{Rs, NewS} = do_pb_ll_request2(NSVersion, NS, EpochID, Cmd0, S),
{RqID, Cmd0, Rs, NewS}
end,
{machi_pb_translate:to_pb_response(ReqID, Cmd, Result), S2}.
do_pb_ll_request2(EpochID, CMD, S) ->
%% do_pb_ll_request2(): Verification of epoch details & namespace details.
do_pb_ll_request2(NSVersion, NS, EpochID, CMD, S) ->
{Wedged_p, CurrentEpochID} = lookup_epoch(S),
%% io:format(user, "{Wedged_p, CurrentEpochID}: ~w~n", [{Wedged_p, CurrentEpochID}]),
if Wedged_p == true ->
if not is_tuple(EpochID) orelse tuple_size(EpochID) /= 2 ->
exit({bad_epoch_id, EpochID, for, CMD});
Wedged_p == true ->
{{error, wedged}, S#state{epoch_id=CurrentEpochID}};
is_tuple(EpochID)
andalso
EpochID /= CurrentEpochID ->
{Epoch, _} = EpochID,
{CurrentEpoch, _} = CurrentEpochID,
if Epoch < CurrentEpoch ->
ok;
{{error, bad_epoch}, S};
true ->
%% We're at same epoch # but different checksum, or
%% we're at a newer/bigger epoch #.
_ = machi_flu1:wedge_myself(S#state.flu_name, CurrentEpochID),
ok
end,
{{error, bad_epoch}, S#state{epoch_id=CurrentEpochID}};
{{error, wedged}, S#state{epoch_id=CurrentEpochID}}
end;
true ->
do_pb_ll_request3(CMD, S#state{epoch_id=CurrentEpochID})
#state{namespace_version=MyNSVersion, namespace=MyNS} = S,
if NSVersion /= MyNSVersion ->
{{error, bad_epoch}, S};
NS /= MyNS ->
{{error, bad_arg}, S};
true ->
do_pb_ll_request3(CMD, S)
end
end.
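Condensing the interleaved old and new clauses above, the post-change check
order in do_pb_ll_request2/5 is (a sketch; state updates elided):

    %% 1. EpochID not a 2-tuple     -> exit({bad_epoch_id, EpochID, for, CMD})
    %% 2. server wedged             -> {error, wedged}
    %% 3. EpochID /= CurrentEpochID -> {error, bad_epoch}
    %% 4. NSVersion /= MyNSVersion  -> {error, bad_epoch}
    %% 5. NS /= MyNS                -> {error, bad_arg}
    %% 6. otherwise                 -> do_pb_ll_request3(CMD, S)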
lookup_epoch(#state{epoch_tab=T}) ->
@@ -254,34 +269,35 @@ lookup_epoch(#state{epoch_tab=T}) ->
ets:lookup_element(T, epoch, 2).
%% Witness status does not matter below.
do_pb_ll_request3({low_echo, _BogusEpochID, Msg}, S) ->
do_pb_ll_request3({low_echo, Msg}, S) ->
{Msg, S};
do_pb_ll_request3({low_auth, _BogusEpochID, _User, _Pass}, S) ->
do_pb_ll_request3({low_auth, _User, _Pass}, S) ->
{-6, S};
do_pb_ll_request3({low_wedge_status, _EpochID}, S) ->
do_pb_ll_request3({low_wedge_status}, S) ->
{do_server_wedge_status(S), S};
do_pb_ll_request3({low_proj, PCMD}, S) ->
{do_server_proj_request(PCMD, S), S};
%% Witness status *matters* below
do_pb_ll_request3({low_append_chunk, _EpochID, CoC_Namespace, CoC_Locator,
do_pb_ll_request3({low_append_chunk, NSVersion, NS, EpochID, NSLocator,
Prefix, Chunk, CSum_tag,
CSum, ChunkExtra},
CSum, Opts},
#state{witness=false}=S) ->
{do_server_append_chunk(CoC_Namespace, CoC_Locator,
NSInfo = #ns_info{version=NSVersion, name=NS, locator=NSLocator},
{do_server_append_chunk(NSInfo, EpochID,
Prefix, Chunk, CSum_tag, CSum,
ChunkExtra, S), S};
do_pb_ll_request3({low_write_chunk, _EpochID, File, Offset, Chunk, CSum_tag,
Opts, S), S};
do_pb_ll_request3({low_write_chunk, _NSVersion, _NS, _EpochID, File, Offset, Chunk, CSum_tag,
CSum},
#state{witness=false}=S) ->
{do_server_write_chunk(File, Offset, Chunk, CSum_tag, CSum, S), S};
do_pb_ll_request3({low_read_chunk, _EpochID, File, Offset, Size, Opts},
do_pb_ll_request3({low_read_chunk, _NSVersion, _NS, _EpochID, File, Offset, Size, Opts},
#state{witness=false} = S) ->
{do_server_read_chunk(File, Offset, Size, Opts, S), S};
do_pb_ll_request3({low_trim_chunk, _EpochID, File, Offset, Size, TriggerGC},
do_pb_ll_request3({low_trim_chunk, _NSVersion, _NS, _EpochID, File, Offset, Size, TriggerGC},
#state{witness=false}=S) ->
{do_server_trim_chunk(File, Offset, Size, TriggerGC, S), S};
do_pb_ll_request3({low_checksum_list, _EpochID, File},
do_pb_ll_request3({low_checksum_list, File},
#state{witness=false}=S) ->
{do_server_checksum_listing(File, S), S};
do_pb_ll_request3({low_list_files, _EpochID},
@@ -334,37 +350,36 @@ do_server_proj_request({kick_projection_reaction},
end),
async_no_response.
do_server_append_chunk(CoC_Namespace, CoC_Locator,
do_server_append_chunk(NSInfo, EpochID,
Prefix, Chunk, CSum_tag, CSum,
ChunkExtra, S) ->
Opts, S) ->
case sanitize_prefix(Prefix) of
ok ->
do_server_append_chunk2(CoC_Namespace, CoC_Locator,
do_server_append_chunk2(NSInfo, EpochID,
Prefix, Chunk, CSum_tag, CSum,
ChunkExtra, S);
Opts, S);
_ ->
{error, bad_arg}
end.
do_server_append_chunk2(CoC_Namespace, CoC_Locator,
do_server_append_chunk2(NSInfo, EpochID,
Prefix, Chunk, CSum_tag, Client_CSum,
ChunkExtra, #state{flu_name=FluName,
epoch_id=EpochID}=_S) ->
Opts, #state{flu_name=FluName,
epoch_id=EpochID}=_S) ->
%% TODO: Do anything with PKey?
try
TaggedCSum = check_or_make_tagged_checksum(CSum_tag, Client_CSum,Chunk),
R = {seq_append, self(), CoC_Namespace, CoC_Locator,
Prefix, Chunk, TaggedCSum, ChunkExtra, EpochID},
FluName ! R,
receive
R = {seq_append, self(), NSInfo, EpochID,
Prefix, Chunk, TaggedCSum, Opts},
case gen_server:call(FluName, R, 10*1000) of
{assignment, Offset, File} ->
Size = iolist_size(Chunk),
{ok, {Offset, Size, File}};
witness ->
{error, bad_arg};
wedged ->
{error, wedged}
after 10*1000 ->
{error, wedged};
{error, timeout} ->
{error, partition}
end
catch
@@ -458,14 +473,14 @@ do_server_list_files(#state{data_dir=DataDir}=_S) ->
{Size, File}
end || File <- Files]}.
do_server_wedge_status(S) ->
do_server_wedge_status(#state{namespace_version=NSVersion, namespace=NS}=S) ->
{Wedged_p, CurrentEpochID0} = lookup_epoch(S),
CurrentEpochID = if CurrentEpochID0 == undefined ->
?DUMMY_PV1_EPOCH;
true ->
CurrentEpochID0
end,
{Wedged_p, CurrentEpochID}.
{Wedged_p, CurrentEpochID, NSVersion, NS}.
do_server_delete_migration(File, #state{data_dir=DataDir}=_S) ->
case sanitize_file_string(File) of
@@ -564,26 +579,30 @@ do_pb_hl_request2({high_echo, Msg}, S) ->
{Msg, S};
do_pb_hl_request2({high_auth, _User, _Pass}, S) ->
{-77, S};
do_pb_hl_request2({high_append_chunk, CoC_Namespace, CoC_Locator,
Prefix, ChunkBin, TaggedCSum,
ChunkExtra}, #state{high_clnt=Clnt}=S) ->
Chunk = {TaggedCSum, ChunkBin},
Res = machi_cr_client:append_chunk_extra(Clnt, CoC_Namespace, CoC_Locator,
Prefix, Chunk,
ChunkExtra),
{Res, S};
do_pb_hl_request2({high_write_chunk, File, Offset, ChunkBin, TaggedCSum},
do_pb_hl_request2({high_append_chunk=Op, NS, Prefix, Chunk, TaggedCSum, Opts},
#state{high_clnt=Clnt}=S) ->
Chunk = {TaggedCSum, ChunkBin},
Res = machi_cr_client:write_chunk(Clnt, File, Offset, Chunk),
NSInfo = #ns_info{name=NS}, % TODO populate other fields
todo_perhaps_remind_ns_locator_not_chosen(Op),
Res = machi_cr_client:append_chunk(Clnt, NSInfo,
Prefix, Chunk, TaggedCSum, Opts),
{Res, S};
do_pb_hl_request2({high_read_chunk, File, Offset, Size, Opts},
do_pb_hl_request2({high_write_chunk=Op, File, Offset, Chunk, CSum},
#state{high_clnt=Clnt}=S) ->
Res = machi_cr_client:read_chunk(Clnt, File, Offset, Size, Opts),
NSInfo = undefined,
todo_perhaps_remind_ns_locator_not_chosen(Op),
Res = machi_cr_client:write_chunk(Clnt, NSInfo, File, Offset, Chunk, CSum),
{Res, S};
do_pb_hl_request2({high_trim_chunk, File, Offset, Size},
do_pb_hl_request2({high_read_chunk=Op, File, Offset, Size, Opts},
#state{high_clnt=Clnt}=S) ->
Res = machi_cr_client:trim_chunk(Clnt, File, Offset, Size),
NSInfo = undefined,
todo_perhaps_remind_ns_locator_not_chosen(Op),
Res = machi_cr_client:read_chunk(Clnt, NSInfo, File, Offset, Size, Opts),
{Res, S};
do_pb_hl_request2({high_trim_chunk=Op, File, Offset, Size},
#state{high_clnt=Clnt}=S) ->
NSInfo = undefined,
todo_perhaps_remind_ns_locator_not_chosen(Op),
Res = machi_cr_client:trim_chunk(Clnt, NSInfo, File, Offset, Size),
{Res, S};
do_pb_hl_request2({high_checksum_list, File}, #state{high_clnt=Clnt}=S) ->
Res = machi_cr_client:checksum_list(Clnt, File),
@@ -601,3 +620,15 @@ make_high_clnt(#state{high_clnt=undefined}=S) ->
S#state{high_clnt=Clnt};
make_high_clnt(S) ->
S.
todo_perhaps_remind_ns_locator_not_chosen(Op) ->
Key = {?MODULE, Op},
case get(Key) of
undefined ->
io:format(user, "TODO op ~w is using default locator value\n",
[Op]),
put(Key, true);
_ ->
ok
end.

src/machi_flu1_subsup.erl (new file, 118 lines)

@@ -0,0 +1,118 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc A supervisor to hold the dynamic processes inside a single
%% FLU service: the ranch listener and the append server.
%% TODO: This supervisor may be unnecessary. It was first introduced as a
%% workaround to start the listener dynamically during the flu1
%% initialization phase. Because `machi_flu_psup' is blocked during flu1
%% initialization, adding a child to that supervisor leads to deadlock.
%% If initialization can be done with static arguments only, then this
%% supervisor should be removed and its children added as direct children
%% of `machi_flu_psup'.
-module(machi_flu1_subsup).
-behaviour(supervisor).
%% public API
-export([start_link/1,
start_append_server/4,
stop_append_server/1,
start_listener/7,
stop_listener/1,
subsup_name/1,
listener_name/1]).
%% supervisor callback
-export([init/1]).
-include("machi_projection.hrl").
-define(SHUTDOWN, 5000).
-define(BACKLOG, 8192).
-spec start_link(pv1_server()) -> {ok, pid()}.
start_link(FluName) ->
supervisor:start_link({local, subsup_name(FluName)}, ?MODULE, []).
-spec start_append_server(pv1_server(), boolean(), boolean(),
undefined | machi_dt:epoch_id()) ->
{ok, pid()}.
start_append_server(FluName, Witness_p, Wedged_p, EpochId) ->
supervisor:start_child(subsup_name(FluName),
append_server_spec(FluName, Witness_p, Wedged_p, EpochId)).
-spec stop_append_server(pv1_server()) -> ok.
stop_append_server(FluName) ->
SubSup = listener_name(FluName),
ok = supervisor:terminate_child(SubSup, FluName),
ok = supervisor:delete_child(SubSup, FluName).
-spec start_listener(pv1_server(), inet:port_number(), boolean(),
string(), ets:tab(), atom() | pid(),
proplists:proplist()) -> {ok, pid()}.
start_listener(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore,
Props) ->
supervisor:start_child(subsup_name(FluName),
listener_spec(FluName, TcpPort, Witness, DataDir,
EpochTab, ProjStore, Props)).
-spec stop_listener(pv1_server()) -> ok.
stop_listener(FluName) ->
SupName = subsup_name(FluName),
ListenerName = listener_name(FluName),
ok = supervisor:terminate_child(SupName, ListenerName),
ok = supervisor:delete_child(SupName, ListenerName).
-spec subsup_name(pv1_server()) -> atom().
subsup_name(FluName) when is_atom(FluName) ->
list_to_atom(atom_to_list(FluName) ++ "_flu1_subsup").
-spec listener_name(pv1_server()) -> atom().
listener_name(FluName) ->
list_to_atom(atom_to_list(FluName) ++ "_listener").
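A worked example of the name derivation, assuming a FLU named f1:

    true = (machi_flu1_subsup:subsup_name(f1)   =:= f1_flu1_subsup),
    true = (machi_flu1_subsup:listener_name(f1) =:= f1_listener).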
%% Supervisor callback
init([]) ->
SupFlags = {one_for_all, 1000, 10},
{ok, {SupFlags, []}}.
%% private
-spec listener_spec(pv1_server(), inet:port_number(), boolean(),
string(), ets:tab(), atom() | pid(),
proplists:proplist()) -> supervisor:child_spec().
listener_spec(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore, Props) ->
ListenerName = listener_name(FluName),
NbAcceptors = 10,
TcpOpts = [{port, TcpPort}, {backlog, ?BACKLOG}],
NetServerOpts = [FluName, Witness, DataDir, EpochTab, ProjStore, Props],
ranch:child_spec(ListenerName, NbAcceptors,
ranch_tcp, TcpOpts,
machi_flu1_net_server, NetServerOpts).
-spec append_server_spec(pv1_server(), boolean(), boolean(),
undefined | machi_dt:epoch_id()) -> supervisor:child_spec().
append_server_spec(FluName, Witness_p, Wedged_p, EpochId) ->
{FluName, {machi_flu1_append_server, start_link,
[FluName, Witness_p, Wedged_p, EpochId]},
permanent, ?SHUTDOWN, worker, [machi_flu1_append_server]}.
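A dynamic-start sketch against the public API above (all argument values are
hypothetical; EpochTab and ProjStore would come from the enclosing
machi_flu_psup package):

    {ok, _AppendPid} =
        machi_flu1_subsup:start_append_server(f1, false, false, undefined),
    {ok, _ListenerPid} =
        machi_flu1_subsup:start_listener(f1, 32900, false, "./data/f1",
                                         EpochTab, ProjStore, []),
    ok = machi_flu1_subsup:stop_listener(f1).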


@@ -67,12 +67,13 @@
]).
-define(TIMEOUT, 10 * 1000).
-include("machi_projection.hrl"). %% included for pv1_epoch_n type
-include("machi.hrl"). %% included for #ns_info record
-include("machi_projection.hrl"). %% included for pv1_epoch type
-record(state, {fluname :: atom(),
tid :: ets:tid(),
datadir :: string(),
epoch :: pv1_epoch_n()
epoch :: pv1_epoch()
}).
%% public API
@@ -87,31 +88,31 @@ start_link(FluName, DataDir) when is_atom(FluName) andalso is_list(DataDir) ->
N = make_filename_mgr_name(FluName),
gen_server:start_link({local, N}, ?MODULE, [FluName, DataDir], []).
-spec find_or_make_filename_from_prefix( FluName :: atom(),
EpochId :: pv1_epoch_n(),
-spec find_or_make_filename_from_prefix( FluName :: atom(),
EpochId :: pv1_epoch(),
Prefix :: {prefix, string()},
machi_dt:coc_nl()) ->
machi_dt:ns_info()) ->
{file, Filename :: string()} | {error, Reason :: term() } | timeout.
% @doc Find the latest available filename for a prefix, or make a new one. A prefix
% should be in the form of a tagged tuple `{prefix, P}'. Returns a tagged
% tuple in the form of `{file, F}' or an `{error, Reason}'
find_or_make_filename_from_prefix(FluName, EpochId,
{prefix, Prefix},
{coc, _CoC_Ns, _CoC_Loc}=CoC_NL)
#ns_info{}=NSInfo)
when is_atom(FluName) ->
N = make_filename_mgr_name(FluName),
gen_server:call(N, {find_filename, EpochId, CoC_NL, Prefix}, ?TIMEOUT);
gen_server:call(N, {find_filename, FluName, EpochId, NSInfo, Prefix}, ?TIMEOUT);
find_or_make_filename_from_prefix(_FluName, _EpochId, Other, Other2) ->
lager:error("~p is not a valid prefix/CoC ~p", [Other, Other2]),
lager:error("~p is not a valid prefix/locator ~p", [Other, Other2]),
error(badarg).
-spec increment_prefix_sequence( FluName :: atom(), CoC_NL :: machi_dt:coc_nl(), Prefix :: {prefix, string()} ) ->
-spec increment_prefix_sequence( FluName :: atom(), NSInfo :: machi_dt:ns_info(), Prefix :: {prefix, string()} ) ->
ok | {error, Reason :: term() } | timeout.
% @doc Increment the sequence counter for a given prefix. Prefix should
% be in the form of `{prefix, P}'.
increment_prefix_sequence(FluName, {coc,_CoC_Namespace,_CoC_Locator}=CoC_NL, {prefix, Prefix}) when is_atom(FluName) ->
gen_server:call(make_filename_mgr_name(FluName), {increment_sequence, CoC_NL, Prefix}, ?TIMEOUT);
increment_prefix_sequence(_FluName, _CoC_NL, Other) ->
increment_prefix_sequence(FluName, #ns_info{}=NSInfo, {prefix, Prefix}) when is_atom(FluName) ->
gen_server:call(make_filename_mgr_name(FluName), {increment_sequence, NSInfo, Prefix}, ?TIMEOUT);
increment_prefix_sequence(_FluName, _NSInfo, Other) ->
lager:error("~p is not a valid prefix.", [Other]),
error(badarg).
@@ -130,7 +131,7 @@ list_files_by_prefix(_FluName, Other) ->
init([FluName, DataDir]) ->
Tid = ets:new(make_filename_mgr_name(FluName), [named_table, {read_concurrency, true}]),
{ok, #state{fluname = FluName,
epoch = 0,
epoch = ?DUMMY_PV1_EPOCH,
datadir = DataDir,
tid = Tid}}.
@@ -142,23 +143,23 @@ handle_cast(Req, State) ->
%% the FLU has already validated that the caller's epoch id and the FLU's epoch id
%% are the same. So we *assume* that remains the case here - that is to say, we
%% are not wedged.
handle_call({find_filename, EpochId, CoC_NL, Prefix}, _From, S = #state{ datadir = DataDir,
epoch = EpochId,
tid = Tid }) ->
handle_call({find_filename, FluName, EpochId, NSInfo, Prefix}, _From,
S = #state{ datadir = DataDir, epoch = EpochId, tid = Tid }) ->
%% Our state and the caller's epoch ids are the same. Business as usual.
File = handle_find_file(Tid, CoC_NL, Prefix, DataDir),
File = handle_find_file(FluName, Tid, NSInfo, Prefix, DataDir),
{reply, {file, File}, S};
handle_call({find_filename, EpochId, CoC_NL, Prefix}, _From, S = #state{ datadir = DataDir, tid = Tid }) ->
handle_call({find_filename, _FluName, EpochId, NSInfo, Prefix}, _From, S = #state{ datadir = DataDir, tid = Tid }) ->
%% If the epoch id in our state and the caller's epoch id were the same, it would've
%% matched the above clause. Since we're here, we know that they are different.
%% If epoch ids between our state and the caller's are different, we must increment the
%% sequence number, generate a filename and then cache it.
File = increment_and_cache_filename(Tid, DataDir, CoC_NL, Prefix),
File = increment_and_cache_filename(Tid, DataDir, NSInfo, Prefix),
{reply, {file, File}, S#state{epoch = EpochId}};
handle_call({increment_sequence, {coc,CoC_Namespace,CoC_Locator}=_CoC_NL, Prefix}, _From, S = #state{ datadir = DataDir }) ->
ok = machi_util:increment_max_filenum(DataDir, CoC_Namespace,CoC_Locator, Prefix),
handle_call({increment_sequence, #ns_info{name=NS, locator=NSLocator}, Prefix}, _From, S = #state{ datadir = DataDir, tid=Tid }) ->
NSInfo = #ns_info{name=NS, locator=NSLocator},
_File = increment_and_cache_filename(Tid, DataDir, NSInfo, Prefix),
{reply, ok, S};
handle_call({list_files, Prefix}, From, S = #state{ datadir = DataDir }) ->
spawn(fun() ->
@@ -191,12 +192,6 @@ generate_uuid_v4_str() ->
io_lib:format("~8.16.0b-~4.16.0b-4~3.16.0b-~4.16.0b-~12.16.0b",
[A, B, C band 16#0fff, D band 16#3fff bor 16#8000, E]).
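%% Note: the literal "4" in the format string pins the UUID version nibble,
%% and (D band 16#3fff bor 16#8000) pins the variant bits to 10xx, so the
%% output is shaped like "9d3adf1e-82fa-4a1d-b50e-0123456789ab"
%% (illustrative value).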
find_file(DataDir, {coc,CoC_Namespace,CoC_Locator}=_CoC_NL, Prefix, N) ->
{_Filename, Path} = machi_util:make_data_filename(DataDir,
CoC_Namespace,CoC_Locator,
Prefix, "*", N),
filelib:wildcard(Path).
list_files(DataDir, Prefix) ->
{F_bin, Path} = machi_util:make_data_filename(DataDir, "*^" ++ Prefix ++ "^*"),
filelib:wildcard(binary_to_list(F_bin), filename:dirname(Path)).
@@ -204,50 +199,31 @@ list_files(DataDir, Prefix) ->
make_filename_mgr_name(FluName) when is_atom(FluName) ->
list_to_atom(atom_to_list(FluName) ++ "_filename_mgr").
handle_find_file(Tid, {coc,CoC_Namespace,CoC_Locator}=CoC_NL, Prefix, DataDir) ->
N = machi_util:read_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix),
{File, Cleanup} = case find_file(DataDir, CoC_NL, Prefix, N) of
[] ->
{find_or_make_filename(Tid, DataDir, CoC_Namespace, CoC_Locator, Prefix, N), false};
[H] -> {H, true};
[Fn | _ ] = L ->
lager:debug(
"Searching for a matching file to prefix ~p and sequence number ~p gave multiples: ~p",
[Prefix, N, L]),
{Fn, true}
end,
maybe_cleanup(Tid, {CoC_Namespace, CoC_Locator, Prefix, N}, Cleanup),
filename:basename(File).
find_or_make_filename(Tid, DataDir, CoC_Namespace, CoC_Locator, Prefix, N) ->
case ets:lookup(Tid, {CoC_Namespace, CoC_Locator, Prefix, N}) of
handle_find_file(_FluName, Tid, #ns_info{name=NS, locator=NSLocator}, Prefix, DataDir) ->
case ets:lookup(Tid, {NS, NSLocator, Prefix}) of
[] ->
F = generate_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix, N),
true = ets:insert_new(Tid, {{CoC_Namespace, CoC_Locator, Prefix, N}, F}),
N = machi_util:read_max_filenum(DataDir, NS, NSLocator, Prefix),
F = generate_filename(DataDir, NS, NSLocator, Prefix, N),
true = ets:insert(Tid, {{NS, NSLocator, Prefix}, F}),
F;
[{_Key, File}] ->
File
end.
generate_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix, N) ->
{F, _} = machi_util:make_data_filename(
generate_filename(DataDir, NS, NSLocator, Prefix, N) ->
{F, _Q} = machi_util:make_data_filename(
DataDir,
CoC_Namespace, CoC_Locator, Prefix,
NS, NSLocator, Prefix,
generate_uuid_v4_str(),
N),
binary_to_list(F).
maybe_cleanup(_Tid, _Key, false) ->
ok;
maybe_cleanup(Tid, Key, true) ->
true = ets:delete(Tid, Key).
increment_and_cache_filename(Tid, DataDir, {coc,CoC_Namespace,CoC_Locator}, Prefix) ->
ok = machi_util:increment_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix),
N = machi_util:read_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix),
F = generate_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix, N),
true = ets:insert_new(Tid, {{CoC_Namespace, CoC_Locator, Prefix, N}, F}),
filename:basename(F).
increment_and_cache_filename(Tid, DataDir, #ns_info{name=NS,locator=NSLocator}, Prefix) ->
ok = machi_util:increment_max_filenum(DataDir, NS, NSLocator, Prefix),
N = machi_util:read_max_filenum(DataDir, NS, NSLocator, Prefix),
F = generate_filename(DataDir, NS, NSLocator, Prefix, N),
true = ets:insert(Tid, {{NS, NSLocator, Prefix}, F}),
F.


@@ -34,6 +34,7 @@
-module(machi_flu_metadata_mgr).
-behaviour(gen_server).
-include("machi.hrl").
-define(MAX_MGRS, 10). %% number of managers to start by default.
-define(HASH(X), erlang:phash2(X)). %% hash algorithm to use
@@ -62,6 +63,7 @@
lookup_proxy_pid/2,
start_proxy_pid/2,
stop_proxy_pid/2,
stop_proxy_pid_rollover/2,
build_metadata_mgr_name/2,
trim_file/2
]).
@@ -99,7 +101,10 @@ start_proxy_pid(FluName, {file, Filename}) ->
gen_server:call(get_manager_atom(FluName, Filename), {start_proxy_pid, Filename}, ?TIMEOUT).
stop_proxy_pid(FluName, {file, Filename}) ->
gen_server:call(get_manager_atom(FluName, Filename), {stop_proxy_pid, Filename}, ?TIMEOUT).
gen_server:call(get_manager_atom(FluName, Filename), {stop_proxy_pid, false, Filename}, ?TIMEOUT).
stop_proxy_pid_rollover(FluName, {file, Filename}) ->
gen_server:call(get_manager_atom(FluName, Filename), {stop_proxy_pid, true, Filename}, ?TIMEOUT).
trim_file(FluName, {file, Filename}) ->
gen_server:call(get_manager_atom(FluName, Filename), {trim_file, Filename}, ?TIMEOUT).
@@ -150,7 +155,7 @@ handle_call({start_proxy_pid, Filename}, _From,
{reply, {error, trimmed}, State}
end;
handle_call({stop_proxy_pid, Filename}, _From, State = #state{ tid = Tid }) ->
handle_call({stop_proxy_pid, Rollover_p, Filename}, _From, State = #state{ tid = Tid }) ->
case lookup_md(Tid, Filename) of
not_found ->
ok;
@@ -158,8 +163,13 @@ handle_call({stop_proxy_pid, Filename}, _From, State = #state{ tid = Tid }) ->
ok;
#md{ proxy_pid = Pid, mref = M } = R ->
demonitor(M, [flush]),
machi_file_proxy:stop(Pid),
update_ets(Tid, R#md{ proxy_pid = undefined, mref = undefined })
if Rollover_p ->
do_rollover(Filename, State);
true ->
machi_file_proxy:stop(Pid),
update_ets(Tid, R#md{ proxy_pid = undefined,
mref = undefined })
end
end,
{reply, ok, State};
@@ -181,28 +191,6 @@ handle_info({'DOWN', Mref, process, Pid, normal}, State = #state{ tid = Tid }) -
clear_ets(Tid, Mref),
{noreply, State};
handle_info({'DOWN', Mref, process, Pid, file_rollover}, State = #state{ fluname = FluName,
tid = Tid }) ->
lager:info("file proxy ~p shutdown because of file rollover", [Pid]),
R = get_md_record_by_mref(Tid, Mref),
{Prefix, CoC_Namespace, CoC_Locator, _, _} =
machi_util:parse_filename(R#md.filename),
%% CoC_Namespace = list_to_binary(CoC_Namespace_str),
%% CoC_Locator = list_to_integer(CoC_Locator_str),
%% We only increment the counter here. The filename will be generated on the
%% next append request to that prefix and since the filename will have a new
%% sequence number it probably will be associated with a different metadata
%% manager. That's why we don't want to generate a new file name immediately
%% and use it to start a new file proxy.
ok = machi_flu_filename_mgr:increment_prefix_sequence(FluName, {coc, CoC_Namespace, CoC_Locator}, {prefix, Prefix}),
%% purge our ets table of this entry completely since it is likely the
%% new filename (whenever it comes) will be in a different manager than
%% us.
purge_ets(Tid, R),
{noreply, State};
handle_info({'DOWN', Mref, process, Pid, wedged}, State = #state{ tid = Tid }) ->
lager:error("file proxy ~p shutdown because it's wedged", [Pid]),
clear_ets(Tid, Mref),
@@ -275,8 +263,35 @@ get_md_record_by_mref(Tid, Mref) ->
[R] = ets:match_object(Tid, {md, '_', '_', Mref}),
R.
get_md_record_by_filename(Tid, Filename) ->
[R] = ets:lookup(Tid, Filename),
R.
get_env(Setting, Default) ->
case application:get_env(machi, Setting) of
undefined -> Default;
{ok, V} -> V
end.
do_rollover(Filename, _State = #state{ fluname = FluName,
tid = Tid }) ->
R = get_md_record_by_filename(Tid, Filename),
lager:info("file ~p proxy ~p shutdown because of file rollover",
[Filename, R#md.proxy_pid]),
{Prefix, NS, NSLocator, _, _} =
machi_util:parse_filename(R#md.filename),
%% We only increment the counter here. The filename will be generated on the
%% next append request to that prefix and since the filename will have a new
%% sequence number it probably will be associated with a different metadata
%% manager. That's why we don't want to generate a new file name immediately
%% and use it to start a new file proxy.
NSInfo = #ns_info{name=NS, locator=NSLocator},
lager:warning("INCR: ~p ~p\n", [FluName, Prefix]),
ok = machi_flu_filename_mgr:increment_prefix_sequence(FluName, NSInfo, {prefix, Prefix}),
%% purge our ets table of this entry completely since it is likely the
%% new filename (whenever it comes) will be in a different manager than
%% us.
purge_ets(Tid, R),
ok.


@@ -83,6 +83,8 @@
%% Supervisor callbacks
-export([init/1]).
make_package_spec(#p_srvr{name=FluName, port=TcpPort, props=Props}) when is_list(Props) ->
make_package_spec({FluName, TcpPort, Props});
make_package_spec({FluName, TcpPort, Props}) when is_list(Props) ->
FluDataDir = get_env(flu_data_dir, undefined_is_invalid),
MyDataDir = filename:join(FluDataDir, atom_to_list(FluName)),
@@ -94,7 +96,7 @@ make_package_spec(FluName, TcpPort, DataDir, Props) ->
permanent, ?SHUTDOWN, supervisor, []}.
start_flu_package(#p_srvr{name=FluName, port=TcpPort, props=Props}) ->
DataDir = get_data_dir(Props),
DataDir = get_data_dir(FluName, Props),
start_flu_package(FluName, TcpPort, DataDir, Props).
start_flu_package(FluName, TcpPort, DataDir, Props) ->
@@ -143,9 +145,9 @@ init([FluName, TcpPort, DataDir, Props0]) ->
FProxySupSpec = machi_file_proxy_sup:child_spec(FluName),
ListenerSupSpec = {machi_listener_sup:make_listener_sup_name(FluName),
{machi_listener_sup, start_link, [FluName]},
permanent, ?SHUTDOWN, supervisor, []},
Flu1SubSupSpec = {machi_flu1_subsup:subsup_name(FluName),
{machi_flu1_subsup, start_link, [FluName]},
permanent, ?SHUTDOWN, supervisor, []},
FluSpec = {FluName,
{machi_flu1, start_link,
@@ -155,7 +157,7 @@ init([FluName, TcpPort, DataDir, Props0]) ->
{ok, {SupFlags, [
ProjSpec, FitnessSpec, MgrSpec,
FProxySupSpec, FNameMgrSpec, MetaSupSpec,
ListenerSupSpec, FluSpec]}}.
Flu1SubSupSpec, FluSpec]}}.
make_flu_regname(FluName) when is_atom(FluName) ->
FluName.
@@ -178,8 +180,11 @@ get_env(Setting, Default) ->
{ok, V} -> V
end.
get_data_dir(Props) ->
get_data_dir(FluName, Props) ->
case proplists:get_value(data_dir, Props) of
Path when is_list(Path) ->
Path
Path;
undefined ->
{ok, Dir} = application:get_env(machi, flu_data_dir),
Dir ++ "/" ++ atom_to_list(FluName)
end.
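%% e.g. (hypothetical values): with no data_dir property and
%% {flu_data_dir, "/var/lib/machi"} in the application environment,
%% get_data_dir(f1, []) returns "/var/lib/machi/f1".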


@@ -21,6 +21,9 @@
%% @doc Supervisor for Machi FLU servers and their related support
%% servers.
%%
%% Responsibility for managing FLU and chain lifecycle after the initial
%% application startup is delegated to {@link machi_lifecycle_mgr}.
%%
%% See {@link machi_flu_psup} for an illustration of the entire Machi
%% application process structure.
@@ -29,8 +32,11 @@
-behaviour(supervisor).
-include("machi.hrl").
-include("machi_projection.hrl").
-include("machi_verbose.hrl").
-ifdef(TEST).
-compile(export_all).
-ifdef(PULSE).
-compile({parse_transform, pulse_instrument}).
-include_lib("pulse_otp/include/pulse_otp.hrl").
@@ -38,9 +44,12 @@
-else.
-define(SHUTDOWN, 5000).
-endif.
-endif. %TEST
%% API
-export([start_link/0]).
-export([start_link/0,
get_initial_flus/0, load_rc_d_files_from_dir/1,
sanitize_p_srvr_records/1]).
%% Supervisor callbacks
-export([init/1]).
@@ -69,5 +78,66 @@ get_initial_flus() ->
[].
-else. % PULSE
get_initial_flus() ->
application:get_env(machi, initial_flus, []).
DoesNotExist = "/tmp/does/not/exist",
ConfigDir = case application:get_env(machi, flu_config_dir, DoesNotExist) of
DoesNotExist ->
DoesNotExist;
Dir ->
Dir
end,
Ps = [P || {_File, P} <- load_rc_d_files_from_dir(ConfigDir)],
sanitize_p_srvr_records(Ps).
-endif. % PULSE
load_rc_d_files_from_dir(Dir) ->
Files = filelib:wildcard(Dir ++ "/*"),
[case file:consult(File) of
{ok, [X]} ->
{File, X};
_ ->
lager:warning("Error parsing file '~s', ignoring",
[File]),
{File, []}
end || File <- Files].
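As a sketch, each file in the config dir is expected to hold one term that
file:consult/1 can read, i.e. a raw #p_srvr{} tuple (the path and all values
below are hypothetical, and the field order name/proto_mod/address/port/props
is assumed from sanitize_p_srvr_rec/2 below):

    %% $FLU_CONFIG_DIR/f1.conf
    {p_srvr, f1, machi_flu1_client, "127.0.0.1", 32900, []}.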
sanitize_p_srvr_records(Ps) ->
{Sane, _} = lists:foldl(fun sanitize_p_srvr_rec/2, {[], dict:new()}, Ps),
Sane.
sanitize_p_srvr_rec(Whole, {Acc, D}) ->
try
#p_srvr{name=Name,
proto_mod=PMod,
address=Address,
port=Port,
props=Props} = Whole,
true = is_atom(Name),
NameK = {name, Name},
error = dict:find(NameK, D),
true = is_atom(PMod),
case code:is_loaded(PMod) of
{file, _} ->
ok;
_ ->
{module, _} = code:load_file(PMod),
ok
end,
if is_list(Address) -> ok;
is_tuple(Address) -> ok % Erlang-style IPv4 or IPv6
end,
true = is_integer(Port) andalso Port >= 1024 andalso Port =< 65534,
PortK = {port, Port},
error = dict:find(PortK, D),
true = is_list(Props),
%% All is sane enough.
D2 = dict:store(NameK, Name,
dict:store(PortK, Port, D)),
{[Whole|Acc], D2}
catch _:_ ->
_ = lager:log(error, self(),
"~s: Bad (or duplicate name/port) p_srvr record, "
"skipping: ~P\n",
[?MODULE, Whole, 15]),
{Acc, D}
end.

src/machi_lifecycle_mgr.erl (new file, 1015 lines)

File diff suppressed because it is too large.


@@ -1,89 +0,0 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc A supervisor to hold the ranch listener for a single FLU.
%% It holds at most one child worker.
%% TODO: This supervisor is maybe useless. First introduced for
%% workaround to start listener dynamically in flu1 initialization
%% time. Because psup is being blocked in flu1 initialization time,
%% adding a child to psup leads to deadlock. If initialization can be
%% done only by static arguments, then this supervisor should be
%% removed and added as a direct child of `machi_flu_psup'.
-module(machi_listener_sup).
-behaviour(supervisor).
%% public API
-export([start_link/1,
start_listener/6,
stop_listener/1,
make_listener_sup_name/1,
make_listener_name/1]).
%% supervisor callback
-export([init/1]).
-include("machi_projection.hrl").
-define(BACKLOG, 8192).
-spec start_link(pv1_server()) -> {ok, pid()}.
start_link(FluName) ->
supervisor:start_link({local, make_listener_sup_name(FluName)}, ?MODULE, []).
-spec start_listener(pv1_server(), inet:port_number(), boolean(),
string(), ets:tab(), atom() | pid()) -> {ok, pid()}.
start_listener(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore) ->
supervisor:start_child(make_listener_sup_name(FluName),
child_spec(FluName, TcpPort, Witness, DataDir,
EpochTab, ProjStore)).
-spec stop_listener(pv1_server()) -> ok.
stop_listener(FluName) ->
SupName = make_listener_sup_name(FluName),
ListenerName = make_listener_name(FluName),
ok = supervisor:terminate_child(SupName, ListenerName),
ok = supervisor:delete_child(SupName, ListenerName).
-spec make_listener_name(pv1_server()) -> atom().
make_listener_sup_name(FluName) when is_atom(FluName) ->
list_to_atom(atom_to_list(FluName) ++ "_listener_sup").
-spec make_listener_sup_name(pv1_server()) -> atom().
make_listener_name(FluName) ->
list_to_atom(atom_to_list(FluName) ++ "_listener").
%% Supervisor callback
init([]) ->
SupFlags = {one_for_one, 1000, 10},
{ok, {SupFlags, []}}.
-spec child_spec(pv1_server(), inet:port_number(), boolean(),
string(), ets:tab(), atom() | pid()) -> supervisor:child_spec().
child_spec(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore) ->
ListenerName = make_listener_name(FluName),
NbAcceptors = 100,
TcpOpts = [{port, TcpPort}, {backlog, ?BACKLOG}],
NetServerOpts = [FluName, Witness, DataDir, EpochTab, ProjStore],
ranch:child_spec(ListenerName, NbAcceptors,
ranch_tcp, TcpOpts,
machi_flu1_net_server, NetServerOpts).


@@ -25,6 +25,10 @@
%% to a single socket connection, and there is no code to deal with
%% multiple connections/load balancing/error handling to several/all
%% Machi cluster servers.
%%
%% Please see the {@link machi_flu1_client} "Client API implementation notes"
%% section for how this module relates to the rest of the client API
%% implementation.
-module(machi_pb_high_client).
@@ -38,7 +42,7 @@
connected_p/1,
echo/2, echo/3,
auth/3, auth/4,
append_chunk/7, append_chunk/8,
append_chunk/6, append_chunk/7,
write_chunk/5, write_chunk/6,
read_chunk/5, read_chunk/6,
trim_chunk/4, trim_chunk/5,
@@ -58,7 +62,7 @@
count=0 :: non_neg_integer()
}).
%% @doc official error types that are specific to Machi
%% Official error types that are specific to Machi
-type machi_client_error_reason() :: bad_arg | wedged | bad_checksum |
partition | not_written | written |
trimmed | no_such_file | partial_read |
@@ -96,30 +100,33 @@ auth(PidSpec, User, Pass) ->
auth(PidSpec, User, Pass, Timeout) ->
send_sync(PidSpec, {auth, User, Pass}, Timeout).
-spec append_chunk(pid(), CoC_namespace::binary(), CoC_locator::integer(), Prefix::binary(), Chunk::binary(),
CSum::binary(), ChunkExtra::non_neg_integer()) ->
-spec append_chunk(pid(),
NS::machi_dt:namespace(), Prefix::machi_dt:file_prefix(),
Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum(),
Opts::machi_dt:append_opts()) ->
{ok, Filename::string(), Offset::machi_dt:file_offset()} |
{error, machi_client_error_reason()}.
append_chunk(PidSpec, CoC_namespace, CoC_locator, Prefix, Chunk, CSum, ChunkExtra) ->
append_chunk(PidSpec, CoC_namespace, CoC_locator, Prefix, Chunk, CSum, ChunkExtra, ?DEFAULT_TIMEOUT).
append_chunk(PidSpec, NS, Prefix, Chunk, CSum, Opts) ->
append_chunk(PidSpec, NS, Prefix, Chunk, CSum, Opts, ?DEFAULT_TIMEOUT).
-spec append_chunk(pid(), CoC_namespace::binary(), CoC_locator::integer(), Prefix::binary(),
Chunk::binary(), CSum::binary(),
ChunkExtra::non_neg_integer(),
-spec append_chunk(pid(),
NS::machi_dt:namespace(), Prefix::machi_dt:file_prefix(),
Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum(),
Opts::machi_dt:append_opts(),
Timeout::non_neg_integer()) ->
{ok, Filename::string(), Offset::machi_dt:file_offset()} |
{error, machi_client_error_reason()}.
append_chunk(PidSpec, CoC_namespace, CoC_locator, Prefix, Chunk, CSum, ChunkExtra, Timeout) ->
send_sync(PidSpec, {append_chunk, CoC_namespace, CoC_locator, Prefix, Chunk, CSum, ChunkExtra}, Timeout).
append_chunk(PidSpec, NS, Prefix, Chunk, CSum, Opts, Timeout) ->
send_sync(PidSpec, {append_chunk, NS, Prefix, Chunk, CSum, Opts}, Timeout).
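A usage sketch of the new arity (values hypothetical, and assuming
machi_dt:append_opts() is the #append_opts{} record consumed by
machi_pb_translate:conv_from_append_opts/1):

    {ok, _Filename, _Offset} =
        machi_pb_high_client:append_chunk(Clnt, <<"my_ns">>, <<"my_prefix">>,
                                          <<"hello">>, none, #append_opts{}).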
-spec write_chunk(pid(), File::string(), machi_dt:file_offset(),
Chunk::binary(), CSum::binary()) ->
Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum()) ->
ok | {error, machi_client_error_reason()}.
write_chunk(PidSpec, File, Offset, Chunk, CSum) ->
write_chunk(PidSpec, File, Offset, Chunk, CSum, ?DEFAULT_TIMEOUT).
-spec write_chunk(pid(), File::string(), machi_dt:file_offset(),
Chunk::binary(), CSum::binary(), Timeout::non_neg_integer()) ->
Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum(), Timeout::non_neg_integer()) ->
ok | {error, machi_client_error_reason()}.
write_chunk(PidSpec, File, Offset, Chunk, CSum, Timeout) ->
send_sync(PidSpec, {write_chunk, File, Offset, Chunk, CSum}, Timeout).
@@ -128,24 +135,25 @@ write_chunk(PidSpec, File, Offset, Chunk, CSum, Timeout) ->
%% {Chunks, TrimmedChunks}}' for a live file, while it returns `{error,
%% trimmed}' if all bytes of the file were trimmed.
-spec read_chunk(pid(), File::string(), machi_dt:file_offset(), machi_dt:chunk_size(),
[{flag_no_checksum | flag_no_chunk | needs_trimmed, boolean()}]) ->
machi_dt:read_opts_x()) ->
{ok, {Chunks::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), binary()}],
Trimmed::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size()}]}} |
{error, machi_client_error_reason()}.
read_chunk(PidSpec, File, Offset, Size, Options) ->
read_chunk(PidSpec, File, Offset, Size, Options, ?DEFAULT_TIMEOUT).
read_chunk(PidSpec, File, Offset, Size, Opts) ->
read_chunk(PidSpec, File, Offset, Size, Opts, ?DEFAULT_TIMEOUT).
-spec read_chunk(pid(), File::string(), machi_dt:file_offset(), machi_dt:chunk_size(),
[{flag_no_checksum | flag_no_chunk | needs_trimmed, boolean()}],
machi_dt:read_opts_x(),
Timeout::non_neg_integer()) ->
{ok, {Chunks::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), binary()}],
Trimmed::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size()}]}} |
{error, machi_client_error_reason()}.
read_chunk(PidSpec, File, Offset, Size, Options, Timeout) ->
send_sync(PidSpec, {read_chunk, File, Offset, Size, Options}, Timeout).
read_chunk(PidSpec, File, Offset, Size, Opts0, Timeout) ->
Opts = machi_util:read_opts_default(Opts0),
send_sync(PidSpec, {read_chunk, File, Offset, Size, Opts}, Timeout).
%% @doc Trims an arbitrary binary range of any file. If a specified range
%% has any byte trimmed, it fails and returns `{error, trimmed}`.
%% has any byte trimmed, it fails and returns `{error, trimmed}'.
%% Otherwise it trims all bytes in that range. If there are
%% overlapping chunks with a client-specified checksum, they will be cut
%% off and checksums are re-calculated on the server side. TODO: Add
@@ -281,18 +289,19 @@ do_send_sync2({auth, User, Pass}, #state{sock=Sock}=S) ->
Res = {bummer, {X, Y, erlang:get_stacktrace()}},
{Res, S}
end;
do_send_sync2({append_chunk, CoC_Namespace, CoC_Locator,
Prefix, Chunk, CSum, ChunkExtra},
do_send_sync2({append_chunk, NS, Prefix, Chunk, CSum, Opts},
#state{sock=Sock, sock_id=Index, count=Count}=S) ->
try
ReqID = <<Index:64/big, Count:64/big>>,
CSumT = convert_csum_req(CSum, Chunk),
Req = #mpb_appendchunkreq{coc_namespace=CoC_Namespace,
coc_locator=CoC_Locator,
{ChunkExtra, Pref, FailPref} = machi_pb_translate:conv_from_append_opts(Opts),
Req = #mpb_appendchunkreq{namespace=NS,
prefix=Prefix,
chunk=Chunk,
csum=CSumT,
chunk_extra=ChunkExtra},
chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref},
R1a = #mpb_request{req_id=ReqID, do_not_alter=1,
append_chunk=Req},
Bin1a = machi_pb:encode_mpb_request(R1a),
@@ -337,13 +346,13 @@ do_send_sync2({write_chunk, File, Offset, Chunk, CSum},
Res = {bummer, {X, Y, erlang:get_stacktrace()}},
{Res, S#state{count=Count+1}}
end;
do_send_sync2({read_chunk, File, Offset, Size, Options},
do_send_sync2({read_chunk, File, Offset, Size, Opts},
#state{sock=Sock, sock_id=Index, count=Count}=S) ->
try
ReqID = <<Index:64/big, Count:64/big>>,
FlagNoChecksum = proplists:get_value(no_checksum, Options, false),
FlagNoChunk = proplists:get_value(no_chunk, Options, false),
NeedsTrimmed = proplists:get_value(needs_trimmed, Options, false),
#read_opts{no_checksum=FlagNoChecksum,
no_chunk=FlagNoChunk,
needs_trimmed=NeedsTrimmed} = Opts,
Req = #mpb_readchunkreq{chunk_pos=#mpb_chunkpos{file_name=File,
offset=Offset,
chunk_size=Size},
@@ -436,9 +445,15 @@ do_send_sync2({list_files},
{Res, S#state{count=Count+1}}
end.
%% We only convert the checksum types that make sense here:
%% none or client_sha. None of the other types should be sent
%% to us via the PB high protocol.
convert_csum_req(none, Chunk) ->
#mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA',
csum=machi_util:checksum_chunk(Chunk)};
convert_csum_req(<<>>, Chunk) ->
convert_csum_req(none, Chunk);
convert_csum_req({client_sha, CSumBin}, _Chunk) ->
#mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA',
csum=CSumBin}.
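A sketch of the accepted inputs (checksum bytes hypothetical):

    %% none / <<>>: the client computes the SHA itself before sending.
    #mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA', csum=_Sha} =
        convert_csum_req(none, <<"hello">>),
    %% {client_sha, Bin}: passed through as-is.
    #mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA', csum= <<1,2,3>>} =
        convert_csum_req({client_sha, <<1,2,3>>}, <<"hello">>).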
@@ -486,12 +501,12 @@ convert_read_chunk_resp(#mpb_readchunkresp{status='OK', chunks=PB_Chunks, trimme
csum=#mpb_chunkcsum{type=T, csum=Ck}}) ->
%% TODO: cleanup export
Csum = <<(machi_pb_translate:conv_to_csum_tag(T)):8, Ck/binary>>,
{File, Offset, Chunk, Csum}
{list_to_binary(File), Offset, Chunk, Csum}
end, PB_Chunks),
Trimmed = lists:map(fun(#mpb_chunkpos{file_name=File,
offset=Offset,
chunk_size=Size}) ->
{File, Offset, Size}
{list_to_binary(File), Offset, Size}
end, PB_Trimmed),
{ok, {Chunks, Trimmed}};
convert_read_chunk_resp(#mpb_readchunkresp{status=Status}) ->


@@ -34,7 +34,9 @@
-export([from_pb_request/1,
from_pb_response/1,
to_pb_request/2,
to_pb_response/3
to_pb_response/3,
conv_from_append_opts/1,
conv_to_append_opts/1
]).
%% TODO: fixme cleanup
@@ -43,95 +45,104 @@
from_pb_request(#mpb_ll_request{
req_id=ReqID,
echo=#mpb_echoreq{message=Msg}}) ->
{ReqID, {low_echo, undefined, Msg}};
{ReqID, {low_skip_wedge, {low_echo, Msg}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
auth=#mpb_authreq{user=User, password=Pass}}) ->
{ReqID, {low_auth, undefined, User, Pass}};
{ReqID, {low_skip_wedge, {low_auth, User, Pass}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
append_chunk=#mpb_ll_appendchunkreq{
append_chunk=IR=#mpb_ll_appendchunkreq{
namespace_version=NSVersion,
namespace=NS_str,
locator=NSLocator,
epoch_id=PB_EpochID,
coc_namespace=CoC_Namespace,
coc_locator=CoC_Locator,
prefix=Prefix,
chunk=Chunk,
csum=#mpb_chunkcsum{type=CSum_type, csum=CSum},
chunk_extra=ChunkExtra}}) ->
csum=#mpb_chunkcsum{type=CSum_type, csum=CSum}}}) ->
NS = list_to_binary(NS_str),
EpochID = conv_to_epoch_id(PB_EpochID),
CSum_tag = conv_to_csum_tag(CSum_type),
{ReqID, {low_append_chunk, EpochID, CoC_Namespace, CoC_Locator,
Prefix, Chunk, CSum_tag, CSum,
ChunkExtra}};
Opts = conv_to_append_opts(IR),
%% NOTE: The tuple position of NSLocator is a bit odd, because EpochID
%% _must_ be in the 4th position (as NSV & NS must be in 2nd & 3rd).
{ReqID, {low_append_chunk, NSVersion, NS, EpochID, NSLocator,
Prefix, Chunk, CSum_tag, CSum, Opts}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
write_chunk=#mpb_ll_writechunkreq{
namespace_version=NSVersion,
namespace=NS_str,
epoch_id=PB_EpochID,
chunk=#mpb_chunk{file_name=File,
offset=Offset,
chunk=Chunk,
csum=#mpb_chunkcsum{type=CSum_type, csum=CSum}}}}) ->
NS = list_to_binary(NS_str),
EpochID = conv_to_epoch_id(PB_EpochID),
CSum_tag = conv_to_csum_tag(CSum_type),
{ReqID, {low_write_chunk, EpochID, File, Offset, Chunk, CSum_tag, CSum}};
{ReqID, {low_write_chunk, NSVersion, NS, EpochID, File, Offset, Chunk, CSum_tag, CSum}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
read_chunk=#mpb_ll_readchunkreq{
namespace_version=NSVersion,
namespace=NS_str,
epoch_id=PB_EpochID,
chunk_pos=ChunkPos,
flag_no_checksum=PB_GetNoChecksum,
flag_no_chunk=PB_GetNoChunk,
flag_needs_trimmed=PB_NeedsTrimmed}}) ->
NS = list_to_binary(NS_str),
EpochID = conv_to_epoch_id(PB_EpochID),
Opts = [{no_checksum, conv_to_boolean(PB_GetNoChecksum)},
{no_chunk, conv_to_boolean(PB_GetNoChunk)},
{needs_trimmed, conv_to_boolean(PB_NeedsTrimmed)}],
Opts = #read_opts{no_checksum=PB_GetNoChecksum,
no_chunk=PB_GetNoChunk,
needs_trimmed=PB_NeedsTrimmed},
#mpb_chunkpos{file_name=File,
offset=Offset,
chunk_size=Size} = ChunkPos,
{ReqID, {low_read_chunk, EpochID, File, Offset, Size, Opts}};
{ReqID, {low_read_chunk, NSVersion, NS, EpochID, File, Offset, Size, Opts}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
trim_chunk=#mpb_ll_trimchunkreq{
namespace_version=NSVersion,
namespace=NS_str,
epoch_id=PB_EpochID,
file=File,
offset=Offset,
size=Size,
trigger_gc=PB_TriggerGC}}) ->
trigger_gc=TriggerGC}}) ->
NS = list_to_binary(NS_str),
EpochID = conv_to_epoch_id(PB_EpochID),
TriggerGC = conv_to_boolean(PB_TriggerGC),
{ReqID, {low_trim_chunk, EpochID, File, Offset, Size, TriggerGC}};
{ReqID, {low_trim_chunk, NSVersion, NS, EpochID, File, Offset, Size, TriggerGC}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
checksum_list=#mpb_ll_checksumlistreq{
epoch_id=PB_EpochID,
file=File}}) ->
EpochID = conv_to_epoch_id(PB_EpochID),
{ReqID, {low_checksum_list, EpochID, File}};
{ReqID, {low_skip_wedge, {low_checksum_list, File}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
list_files=#mpb_ll_listfilesreq{
epoch_id=PB_EpochID}}) ->
EpochID = conv_to_epoch_id(PB_EpochID),
{ReqID, {low_list_files, EpochID}};
{ReqID, {low_skip_wedge, {low_list_files, EpochID}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
wedge_status=#mpb_ll_wedgestatusreq{}}) ->
{ReqID, {low_wedge_status, undefined}};
{ReqID, {low_skip_wedge, {low_wedge_status}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
delete_migration=#mpb_ll_deletemigrationreq{
epoch_id=PB_EpochID,
file=File}}) ->
EpochID = conv_to_epoch_id(PB_EpochID),
{ReqID, {low_delete_migration, EpochID, File}};
{ReqID, {low_skip_wedge, {low_delete_migration, EpochID, File}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
trunc_hack=#mpb_ll_trunchackreq{
epoch_id=PB_EpochID,
file=File}}) ->
EpochID = conv_to_epoch_id(PB_EpochID),
{ReqID, {low_trunc_hack, EpochID, File}};
{ReqID, {low_skip_wedge, {low_trunc_hack, EpochID, File}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
proj_gl=#mpb_ll_getlatestepochidreq{type=ProjType}}) ->
@@ -172,23 +183,22 @@ from_pb_request(#mpb_request{req_id=ReqID,
{ReqID, {high_auth, User, Pass}};
from_pb_request(#mpb_request{req_id=ReqID,
append_chunk=IR=#mpb_appendchunkreq{}}) ->
#mpb_appendchunkreq{coc_namespace=CoC_namespace,
coc_locator=CoC_locator,
#mpb_appendchunkreq{namespace=NS_str,
prefix=Prefix,
chunk=Chunk,
csum=CSum,
chunk_extra=ChunkExtra} = IR,
csum=CSum} = IR,
NS = list_to_binary(NS_str),
TaggedCSum = make_tagged_csum(CSum, Chunk),
Opts = conv_to_append_opts(IR),
{ReqID, {high_append_chunk, NS, Prefix, Chunk, TaggedCSum, Opts}};
from_pb_request(#mpb_request{req_id=ReqID,
write_chunk=IR=#mpb_writechunkreq{}}) ->
#mpb_writechunkreq{chunk=#mpb_chunk{file_name=File,
offset=Offset,
chunk=Chunk,
csum=CSumRec}} = IR,
CSum = make_tagged_csum(CSumRec, Chunk),
{ReqID, {high_write_chunk, File, Offset, Chunk, CSum}};
from_pb_request(#mpb_request{req_id=ReqID,
read_chunk=IR=#mpb_readchunkreq{}}) ->
#mpb_readchunkreq{chunk_pos=#mpb_chunkpos{file_name=File,
@ -197,11 +207,10 @@ from_pb_request(#mpb_request{req_id=ReqID,
flag_no_checksum=FlagNoChecksum,
flag_no_chunk=FlagNoChunk,
flag_needs_trimmed=NeedsTrimmed} = IR,
Opts = #read_opts{no_checksum=FlagNoChecksum,
no_chunk=FlagNoChunk,
needs_trimmed=NeedsTrimmed},
{ReqID, {high_read_chunk, File, Offset, Size, Opts}};
from_pb_request(#mpb_request{req_id=ReqID,
trim_chunk=IR=#mpb_trimchunkreq{}}) ->
#mpb_trimchunkreq{chunk_pos=#mpb_chunkpos{file_name=File,
@ -265,12 +274,12 @@ from_pb_response(#mpb_ll_response{
chunk=Bytes,
csum=#mpb_chunkcsum{type=T,csum=Ck}}) ->
Csum = <<(conv_to_csum_tag(T)):8, Ck/binary>>,
{list_to_binary(File), Offset, Bytes, Csum}
end, PB_Chunks),
Trimmed = lists:map(fun(#mpb_chunkpos{file_name=File,
offset=Offset,
chunk_size=Size}) ->
{list_to_binary(File), Offset, Size}
end, PB_Trimmed),
{ReqID, {ok, {Chunks, Trimmed}}};
_ ->
@ -306,12 +315,16 @@ from_pb_response(#mpb_ll_response{
from_pb_response(#mpb_ll_response{
req_id=ReqID,
wedge_status=#mpb_ll_wedgestatusresp{
status=Status,
epoch_id=PB_EpochID, wedged_flag=Wedged_p,
namespace_version=NSVersion, namespace=NS_str}}) ->
GeneralStatus = case machi_pb_high_client:convert_general_status_code(Status) of
ok -> ok;
_Else -> {yukky, _Else}
end,
EpochID = conv_to_epoch_id(PB_EpochID),
NS = list_to_binary(NS_str),
{ReqID, {GeneralStatus, {Wedged_p, EpochID, NSVersion, NS}}};
from_pb_response(#mpb_ll_response{
req_id=ReqID,
delete_migration=#mpb_ll_deletemigrationresp{
@ -377,90 +390,100 @@ from_pb_response(#mpb_ll_response{
'OK' ->
{ReqID, {ok, Epochs}};
_ ->
{ReqID, machi_pb_high_client:convert_general_status_code(Status)}
end.
%% No response for proj_kp/kick_projection_reaction
%% TODO: move the #mbp_* record making code from
%% machi_pb_high_client:do_send_sync() clauses into to_pb_request().
to_pb_request(ReqID, {low_skip_wedge, {low_echo, Msg}}) ->
#mpb_ll_request{
req_id=ReqID, do_not_alter=2,
echo=#mpb_echoreq{message=Msg}};
to_pb_request(ReqID, {low_skip_wedge, {low_auth, User, Pass}}) ->
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
auth=#mpb_authreq{user=User, password=Pass}};
%% NOTE: The tuple position of NSLocator is a bit odd, because EpochID
%% _must_ be in the 4th position (as NSV & NS must be in 2nd & 3rd).
to_pb_request(ReqID, {low_append_chunk, NSVersion, NS, EpochID, NSLocator,
Prefix, Chunk, CSum_tag, CSum, Opts}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
CSum_type = conv_from_csum_tag(CSum_tag),
PB_CSum = #mpb_chunkcsum{type=CSum_type, csum=CSum},
{ChunkExtra, Pref, FailPref} = conv_from_append_opts(Opts),
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
append_chunk=#mpb_ll_appendchunkreq{
namespace_version=NSVersion,
namespace=NS,
locator=NSLocator,
epoch_id=PB_EpochID,
prefix=Prefix,
chunk=Chunk,
csum=PB_CSum,
chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref}};
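%% A sketch of why the ordering above matters: with NSVersion, NS, and
%% EpochID pinned to the same positions in every wedge-checked low_*
%% request tuple, a server can pull those fields out uniformly without
%% knowing which operation it holds, e.g.
%%     NSVersion = element(2, LowReq),
%%     NS        = element(3, LowReq),
%%     EpochID   = element(4, LowReq).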
to_pb_request(ReqID, {low_write_chunk, NSVersion, NS, EpochID, File, Offset, Chunk, CSum_tag, CSum}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
CSum_type = conv_from_csum_tag(CSum_tag),
PB_CSum = #mpb_chunkcsum{type=CSum_type, csum=CSum},
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
write_chunk=#mpb_ll_writechunkreq{
namespace_version=NSVersion,
namespace=NS,
epoch_id=PB_EpochID,
chunk=#mpb_chunk{file_name=File,
offset=Offset,
chunk=Chunk,
csum=PB_CSum}}};
to_pb_request(ReqID, {low_read_chunk, NSVersion, NS, EpochID, File, Offset, Size, Opts}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
#read_opts{no_checksum=FNChecksum,
no_chunk=FNChunk,
needs_trimmed=NeedsTrimmed} = Opts,
#mpb_ll_request{
req_id=ReqID, do_not_alter=2,
read_chunk=#mpb_ll_readchunkreq{
namespace_version=NSVersion,
namespace=NS,
epoch_id=PB_EpochID,
chunk_pos=#mpb_chunkpos{
file_name=File,
offset=Offset,
chunk_size=Size},
flag_no_checksum=FNChecksum,
flag_no_chunk=FNChunk,
flag_needs_trimmed=NeedsTrimmed}};
to_pb_request(ReqID, {low_trim_chunk, NSVersion, NS, EpochID, File, Offset, Size, TriggerGC}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
trim_chunk=#mpb_ll_trimchunkreq{
namespace_version=NSVersion,
namespace=NS,
epoch_id=PB_EpochID,
file=File,
offset=Offset,
size=Size,
trigger_gc=TriggerGC}};
to_pb_request(ReqID, {low_skip_wedge, {low_checksum_list, File}}) ->
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
checksum_list=#mpb_ll_checksumlistreq{
file=File}};
to_pb_request(ReqID, {low_skip_wedge, {low_list_files, EpochID}}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
list_files=#mpb_ll_listfilesreq{epoch_id=PB_EpochID}};
to_pb_request(ReqID, {low_skip_wedge, {low_wedge_status}}) ->
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
wedge_status=#mpb_ll_wedgestatusreq{}};
to_pb_request(ReqID, {low_skip_wedge, {low_delete_migration, EpochID, File}}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
delete_migration=#mpb_ll_deletemigrationreq{
epoch_id=PB_EpochID,
file=File}};
to_pb_request(ReqID, {low_skip_wedge, {low_trunc_hack, EpochID, File}}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
trunc_hack=#mpb_ll_trunchackreq{
@ -496,15 +519,15 @@ to_pb_response(_ReqID, _, async_no_response=X) ->
X;
to_pb_response(ReqID, _, {low_error, ErrCode, ErrMsg}) ->
make_ll_error_resp(ReqID, ErrCode, ErrMsg);
to_pb_response(ReqID, {low_skip_wedge, {low_echo, _Msg}}, Resp) ->
#mpb_ll_response{
req_id=ReqID,
echo=#mpb_echoresp{message=Resp}};
to_pb_response(ReqID, {low_skip_wedge, {low_auth, _, _}}, __TODO_Resp) ->
#mpb_ll_response{req_id=ReqID,
generic=#mpb_errorresp{code=1,
msg="AUTH not implemented"}};
to_pb_response(ReqID, {low_append_chunk, _NSV, _NS, _EID, _NSL, _Pfx, _Ch, _CST, _CS, _O}, Resp)->
case Resp of
{ok, {Offset, Size, File}} ->
Where = #mpb_chunkpos{offset=Offset,
@ -520,11 +543,11 @@ to_pb_response(ReqID, {low_append_chunk, _EID, _N, _L, _Pfx, _Ch, _CST, _CS, _CE
_Else ->
make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {low_write_chunk, _NSV, _NS, _EID, _Fl, _Off, _Ch, _CST, _CS},Resp)->
Status = conv_from_status(Resp),
#mpb_ll_response{req_id=ReqID,
write_chunk=#mpb_ll_writechunkresp{status=Status}};
to_pb_response(ReqID, {low_read_chunk, _NSV, _NS, _EID, _Fl, _Off, _Sz, _Opts}, Resp)->
case Resp of
{ok, {Chunks, Trimmed}} ->
PB_Chunks = lists:map(fun({File, Offset, Bytes, Csum}) ->
@ -551,7 +574,7 @@ to_pb_response(ReqID, {low_read_chunk, _EID, _Fl, _Off, _Sz, _Opts}, Resp)->
_Else ->
make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {low_trim_chunk, _, _, _, _, _, _, _}, Resp) ->
case Resp of
ok ->
#mpb_ll_response{req_id=ReqID,
@ -559,11 +582,11 @@ to_pb_response(ReqID, {low_trim_chunk, _, _, _, _, _}, Resp) ->
{error, _}=Error ->
Status = conv_from_status(Error),
#mpb_ll_response{req_id=ReqID,
trim_chunk=#mpb_ll_trimchunkresp{status=Status}};
_Else ->
make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {low_skip_wedge, {low_checksum_list, _File}}, Resp) ->
case Resp of
{ok, Chunk} ->
#mpb_ll_response{req_id=ReqID,
@ -576,7 +599,7 @@ to_pb_response(ReqID, {low_checksum_list, _EpochID, _File}, Resp) ->
_Else ->
make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {low_skip_wedge, {low_list_files, _EpochID}}, Resp) ->
case Resp of
{ok, FileInfo} ->
PB_Files = [#mpb_fileinfo{file_size=Size, file_name=Name} ||
@ -591,26 +614,28 @@ to_pb_response(ReqID, {low_list_files, _EpochID}, Resp) ->
_Else ->
make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {low_skip_wedge, {low_wedge_status}}, Resp) ->
case Resp of
{error, _}=Error ->
Status = conv_from_status(Error),
#mpb_ll_response{req_id=ReqID,
wedge_status=#mpb_ll_wedgestatusresp{status=Status}};
{Wedged_p, EpochID, NSVersion, NS} ->
PB_EpochID = conv_from_epoch_id(EpochID),
#mpb_ll_response{req_id=ReqID,
wedge_status=#mpb_ll_wedgestatusresp{
status='OK',
epoch_id=PB_EpochID,
wedged_flag=Wedged_p,
namespace_version=NSVersion,
namespace=NS
}}
end;
to_pb_response(ReqID, {low_skip_wedge, {low_delete_migration, _EID, _Fl}}, Resp)->
Status = conv_from_status(Resp),
#mpb_ll_response{req_id=ReqID,
delete_migration=#mpb_ll_deletemigrationresp{status=Status}};
to_pb_response(ReqID, {low_skip_wedge, {low_trunc_hack, _EID, _Fl}}, Resp)->
Status = conv_from_status(Resp),
#mpb_ll_response{req_id=ReqID,
trunc_hack=#mpb_ll_trunchackresp{status=Status}};
@ -691,7 +716,7 @@ to_pb_response(ReqID, {high_auth, _User, _Pass}, _Resp) ->
#mpb_response{req_id=ReqID,
generic=#mpb_errorresp{code=1,
msg="AUTH not implemented"}};
to_pb_response(ReqID, {high_append_chunk, _NS, _Prefix, _Chunk, _TSum, _O}, Resp)->
case Resp of
{ok, {Offset, Size, File}} ->
Where = #mpb_chunkpos{offset=Offset,
@ -707,7 +732,7 @@ to_pb_response(ReqID, {high_append_chunk, _CoC_n, _CoC_l, _Prefix, _Chunk, _TSum
_Else ->
make_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {high_write_chunk, _File, _Offset, _Chunk, _CSum}, Resp) ->
case Resp of
{ok, {_,_,_}} ->
%% machi_cr_client returns ok 2-tuple, convert to simple ok.
@ -797,12 +822,12 @@ make_tagged_csum(#mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA', csum=CSum}, _CB) ->
make_ll_error_resp(ReqID, Code, Msg) ->
#mpb_ll_response{req_id=ReqID,
generic=#mpb_errorresp{code=Code,
msg=Msg}}.
make_error_resp(ReqID, Code, Msg) ->
#mpb_response{req_id=ReqID,
generic=#mpb_errorresp{code=Code,
msg=Msg}}.
conv_from_epoch_id({Epoch, EpochCSum}) ->
#mpb_epochid{epoch_number=Epoch,
@ -815,6 +840,7 @@ conv_to_epoch_id(#mpb_epochid{epoch_number=Epoch,
conv_to_projection_v1(#mpb_projectionv1{epoch_number=Epoch,
epoch_csum=CSum,
author_server=Author,
chain_name=ChainName,
all_members=AllMembers,
witnesses=Witnesses,
creation_time=CTime,
@ -828,6 +854,7 @@ conv_to_projection_v1(#mpb_projectionv1{epoch_number=Epoch,
#projection_v1{epoch_number=Epoch,
epoch_csum=CSum,
author_server=to_atom(Author),
chain_name=to_atom(ChainName),
all_members=[to_atom(X) || X <- AllMembers],
witnesses=[to_atom(X) || X <- Witnesses],
creation_time=conv_to_now(CTime),
@ -957,24 +984,34 @@ conv_from_status({error, partial_read}) ->
conv_from_status({error, bad_epoch}) ->
'BAD_EPOCH';
conv_from_status(_OOPS) ->
io:format(user, "HEY, ~s:~w got ~w\n", [?MODULE, ?LINE, _OOPS]),
io:format(user, "HEY, ~s:~w got ~p\n", [?MODULE, ?LINE, _OOPS]),
'BAD_JOSS'.
conv_from_append_opts(#append_opts{chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref}) ->
{ChunkExtra, Pref, FailPref}.
conv_to_append_opts(#mpb_appendchunkreq{
chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref}) ->
#append_opts{chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref};
conv_to_append_opts(#mpb_ll_appendchunkreq{
chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref}) ->
#append_opts{chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref}.
conv_from_projection_v1(#projection_v1{epoch_number=Epoch,
epoch_csum=CSum,
author_server=Author,
chain_name=ChainName,
all_members=AllMembers,
witnesses=Witnesses,
creation_time=CTime,
@ -988,6 +1025,7 @@ conv_from_projection_v1(#projection_v1{epoch_number=Epoch,
#mpb_projectionv1{epoch_number=Epoch,
epoch_csum=CSum,
author_server=to_list(Author),
chain_name=to_list(ChainName),
all_members=[to_list(X) || X <- AllMembers],
witnesses=[to_list(X) || X <- Witnesses],
creation_time=conv_from_now(CTime),

View file

@ -1,3 +1,23 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2016 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(machi_plist).
%%% @doc persistent list of binaries

View file

@ -174,6 +174,7 @@ make_summary(#projection_v1{epoch_number=EpochNum,
repairing=Repairing_list,
dbg=Dbg, dbg2=Dbg2}) ->
[{epoch,EpochNum}, {csum,_CSum4},
{all, _All_list},
{author,Author}, {mode,CMode},{witnesses, Witness_list},
{upi,UPI_list},{repair,Repairing_list},{down,Down_list}] ++
[{d,Dbg}, {d2,Dbg2}].

View file

@ -321,7 +321,7 @@ do_proj_write3(ProjType, #projection_v1{epoch_number=Epoch,
end.
do_proj_write4(ProjType, Proj, Path, Epoch, #state{consistency_mode=CMode}=S) ->
{{ok, FH}, Epoch, Path} = {file:open(Path, [write, raw, binary]), Epoch, Path},
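%% The 3-tuple match above, rather than a plain {ok, FH} = file:open(...),
%% means a failed open raises a badmatch whose error report also carries
%% Epoch and Path, which makes write failures much easier to diagnose.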
ok = file:write(FH, term_to_binary(Proj)),
ok = file:sync(FH),
ok = file:close(FH),
@ -387,7 +387,6 @@ wait_for_liveness(PidSpec, StartTime, WaitTime) ->
undefined ->
case timer:now_diff(os:timestamp(), StartTime) div 1000 of
X when X < WaitTime ->
io:format(user, "\nYOO ~p ~p\n", [PidSpec, lists:sort(registered())]),
timer:sleep(1),
wait_for_liveness(PidSpec, StartTime, WaitTime)
end;

View file

@ -22,6 +22,10 @@
%% proxy-process style API for hiding messy details such as TCP
%% connection/disconnection with the remote Machi server.
%%
%% Please see {@link machi_flu1_client} the "Client API implemntation notes"
%% section for how this module relates to the rest of the client API
%% implementation.
%%
%% Machi is intentionally avoiding using distributed Erlang for
%% Machi's communication. This design decision makes Erlang-side code
%% more difficult &amp; complex, but it's the price to pay for some
@ -57,12 +61,9 @@
%% FLU1 API
-export([
%% File API
append_chunk/6, append_chunk/8,
read_chunk/7, read_chunk/8,
checksum_list/2, checksum_list/3,
list_files/2, list_files/3,
wedge_status/1, wedge_status/2,
@ -80,8 +81,8 @@
quit/1,
%% Internal API
write_chunk/7, write_chunk/8,
trim_chunk/6, trim_chunk/7,
%% Helpers
stop_proxies/1, start_proxies/1
@ -106,80 +107,39 @@ start_link(#p_srvr{}=I) ->
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
append_chunk(PidSpec, NSInfo, EpochID, Prefix, Chunk, CSum) ->
append_chunk(PidSpec, NSInfo, EpochID, Prefix, Chunk, CSum,
#append_opts{}, infinity).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
append_chunk(PidSpec, NSInfo, EpochID, Prefix, Chunk, CSum, Opts,
Timeout) ->
gen_server:call(PidSpec, {req, {append_chunk, NSInfo, EpochID,
Prefix, Chunk, CSum, Opts, Timeout}},
Timeout).
%% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, Opts) ->
read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, Opts, infinity).
%% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, Opts, Timeout) ->
gen_server:call(PidSpec, {req, {read_chunk, NSInfo, EpochID, File, Offset, Size, Opts}},
Timeout).
%% @doc Fetch the list of chunk checksums for `File'.
checksum_list(PidSpec, File) ->
checksum_list(PidSpec, File, infinity).
%% @doc Fetch the list of chunk checksums for `File'.
checksum_list(PidSpec, File, Timeout) ->
gen_server:call(PidSpec, {req, {checksum_list, File}},
Timeout).
%% @doc Fetch the list of all files on the remote FLU.
@ -320,18 +280,18 @@ quit(PidSpec) ->
%% @doc Write a chunk (binary- or iolist-style) of data to a file
%% with `Prefix' at `Offset'.
write_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, CSum) ->
write_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, CSum, infinity).
%% @doc Write a chunk (binary- or iolist-style) of data to a file
%% with `Prefix' at `Offset'.
write_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, CSum, Timeout) ->
case gen_server:call(PidSpec, {req, {write_chunk, NSInfo, EpochID, File, Offset, Chunk, CSum}},
Timeout) of
{error, written}=Err ->
Size = byte_size(Chunk),
case read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, undefined, Timeout) of
{ok, {[{File, Offset, Chunk2, _}], []}} when Chunk2 == Chunk ->
%% See equivalent comment inside write_projection().
ok;
@ -343,15 +303,15 @@ write_chunk(PidSpec, EpochID, File, Offset, Chunk, Timeout) ->
end.
trim_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size) ->
trim_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, infinity).
%% @doc Trim a chunk of data from `File' at `Offset'.
trim_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, Timeout) ->
gen_server:call(PidSpec,
{req, {trim_chunk, NSInfo, EpochID, File, Offset, Chunk}},
Timeout).
%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -415,24 +375,24 @@ do_req_retry(_Req, 2, Err, S) ->
do_req_retry(Req, Depth, _Err, S) ->
do_req(Req, Depth + 1, try_connect(disconnect(S))).
make_req_fun({append_chunk, NSInfo, EpochID,
Prefix, Chunk, CSum, Opts, Timeout},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:append_chunk(Sock, NSInfo, EpochID,
Prefix, Chunk, CSum, Opts, Timeout)
end;
make_req_fun({read_chunk, NSInfo, EpochID, File, Offset, Size, Opts},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:read_chunk(Sock, NSInfo, EpochID, File, Offset, Size, Opts) end;
make_req_fun({write_chunk, NSInfo, EpochID, File, Offset, Chunk, CSum},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:write_chunk(Sock, NSInfo, EpochID, File, Offset, Chunk, CSum) end;
make_req_fun({trim_chunk, NSInfo, EpochID, File, Offset, Size},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:trim_chunk(Sock, NSInfo, EpochID, File, Offset, Size) end;
make_req_fun({checksum_list, File},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:checksum_list(Sock, File) end;
make_req_fun({list_files, EpochID},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:list_files(Sock, EpochID) end;

View file

@ -57,8 +57,19 @@ init([]) ->
Shutdown = ?SHUTDOWN,
Type = supervisor,
ServerSup =
{machi_flu_sup, {machi_flu_sup, start_link, []},
Restart, Shutdown, Type, []},
RanchSup = {ranch_sup, {ranch_sup, start_link, []},
Restart, Shutdown, supervisor, [ranch_sup]},
LifecycleMgr =
{machi_lifecycle_mgr, {machi_lifecycle_mgr, start_link, []},
Restart, Shutdown, worker, []},
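%% If the ranch application is already running, its own ranch_sup is
%% already alive and registered, so adding a second ranch_sup here would
%% fail; only supervise ranch_sup ourselves when ranch isn't running.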
RunningApps = [A || {A,_D,_V} <- application:which_applications()],
Specs = case lists:member(ranch, RunningApps) of
true ->
[ServerSup, LifecycleMgr];
false ->
[ServerSup, RanchSup, LifecycleMgr]
end,
{ok, {SupFlags, Specs}}.

View file

@ -25,6 +25,7 @@
-export([
checksum_chunk/1,
make_tagged_csum/1, make_tagged_csum/2,
make_client_csum/1,
unmake_tagged_csum/1,
hexstr_to_bin/1, bin_to_hexstr/1,
hexstr_to_int/1, int_to_hexstr/2, int_to_hexbin/2,
@ -49,7 +50,9 @@
%% Other
wait_for_death/2, wait_for_life/2,
bool2int/1,
int2bool/1,
read_opts_default/1,
ns_info_default/1
]).
-include("machi.hrl").
@ -68,12 +71,12 @@ make_regname(Prefix) when is_list(Prefix) ->
%% @doc Calculate a config file path, by common convention.
-spec make_config_filename(string(), machi_dt:namespace(), machi_dt:locator(), string()) ->
string().
make_config_filename(DataDir, NS, NSLocator, Prefix) ->
NSLocator_str = int_to_hexstr(NSLocator, 32),
lists:flatten(io_lib:format("~s/config/~s^~s^~s",
[DataDir, Prefix, NS, NSLocator_str])).
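%% The resulting path has the form
%%     DataDir/config/Prefix^NS^LocatorHex
%% where LocatorHex is NSLocator rendered by int_to_hexstr(NSLocator, 32).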
%% @doc Calculate a config file path, by common convention.
@ -102,19 +105,19 @@ make_checksum_filename(DataDir, FileName) ->
%% @doc Calculate a file data file path, by common convention.
-spec make_data_filename(string(), machi_dt:namespace(), machi_dt:locator(), string(), atom()|string()|binary(), integer()|string()) ->
{binary(), string()}.
make_data_filename(DataDir, NS, NSLocator, Prefix, SequencerName, FileNum)
when is_integer(FileNum) ->
NSLocator_str = int_to_hexstr(NSLocator, 32),
File = erlang:iolist_to_binary(io_lib:format("~s^~s^~s^~s^~w",
[Prefix, NS, NSLocator_str, SequencerName, FileNum])),
make_data_filename2(DataDir, File);
make_data_filename(DataDir, NS, NSLocator, Prefix, SequencerName, String)
when is_list(String) ->
NSLocator_str = int_to_hexstr(NSLocator, 32),
File = erlang:iolist_to_binary(io_lib:format("~s^~s^~s^~s^~s",
[Prefix, NS, NSLocator_str, SequencerName, String])),
make_data_filename2(DataDir, File).
make_data_filename2(DataDir, File) ->
@ -154,37 +157,36 @@ is_valid_filename(Filename) ->
%% The components will be:
%% <ul>
%% <li>Prefix</li>
%% <li>Cluster namespace</li>
%% <li>Cluster locator</li>
%% <li>UUID</li>
%% <li>Sequence number</li>
%% </ul>
%%
%% Invalid filenames will return an empty list.
-spec parse_filename( Filename :: string() ) -> {} | {string(), machi_dt:namespace(), machi_dt:locator(), string(), string() }.
parse_filename(Filename) ->
case string:tokens(Filename, "^") of
[Prefix, NS, NSLocator, UUID, SeqNo] ->
{Prefix, NS, list_to_integer(NSLocator), UUID, SeqNo};
[Prefix, NSLocator, UUID, SeqNo] ->
%% string:tokens() doesn't consider "foo^^bar" as 3 tokens {sigh}
case re:replace(Filename, "[^^]+", "x", [global,{return,binary}]) of
<<"x^^x^x^x">> ->
{Prefix, <<"">>, list_to_integer(CoC_Loc), UUID, SeqNo};
{Prefix, <<"">>, list_to_integer(NSLocator), UUID, SeqNo};
_ ->
{}
end;
_ -> {}
end.
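%% Example (illustrative file name only):
%%     parse_filename("foo^ns^0^uuid1^1") -> {"foo", "ns", 0, "uuid1", "1"}
%% while the empty-namespace form "foo^^0^uuid1^1" yields <<"">> for the
%% namespace via the second clause above.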
%% @doc Read the file size of a config file, which is used as the
%% basis for a minimum sequence number.
-spec read_max_filenum(string(), machi_dt:namespace(), machi_dt:locator(), string()) ->
non_neg_integer().
read_max_filenum(DataDir, NS, NSLocator, Prefix) ->
case file:read_file_info(make_config_filename(DataDir, NS, NSLocator, Prefix)) of
{error, enoent} ->
0;
{ok, FI} ->
@ -194,11 +196,11 @@ read_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix) ->
%% @doc Increase the file size of a config file, which is used as the
%% basis for a minimum sequence number.
-spec increment_max_filenum(string(), machi_dt:namespace(), machi_dt:locator(), string()) ->
ok | {error, term()}.
increment_max_filenum(DataDir, NS, NSLocator, Prefix) ->
try
{ok, FH} = file:open(make_config_filename(DataDir, NS, NSLocator, Prefix), [append]),
ok = file:write(FH, "x"),
ok = file:sync(FH),
ok = file:close(FH)
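%% Design note: the minimum sequence number is simply the config file's
%% byte size, so reading it costs one file:read_file_info/1 call and
%% incrementing it is a durable one-byte append; e.g. after three
%% increments the file contains "xxx" and the minimum file number is 3.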
@ -287,12 +289,25 @@ int_to_hexbin(I, I_size) ->
checksum_chunk(Chunk) when is_binary(Chunk); is_list(Chunk) ->
crypto:hash(sha, Chunk).
convert_csum_tag(A) when is_atom(A)->
A;
convert_csum_tag(?CSUM_TAG_NONE) ->
?CSUM_TAG_NONE_ATOM;
convert_csum_tag(?CSUM_TAG_CLIENT_SHA) ->
?CSUM_TAG_CLIENT_SHA_ATOM;
convert_csum_tag(?CSUM_TAG_SERVER_SHA) ->
?CSUM_TAG_SERVER_SHA_ATOM;
convert_csum_tag(?CSUM_TAG_SERVER_REGEN_SHA) ->
?CSUM_TAG_SERVER_REGEN_SHA_ATOM.
%% @doc Create a tagged checksum
make_tagged_csum(none) ->
<<?CSUM_TAG_NONE:8>>;
make_tagged_csum(<<>>) ->
<<?CSUM_TAG_NONE:8>>;
make_tagged_csum({Tag, CSum}) ->
make_tagged_csum(convert_csum_tag(Tag), CSum).
%% @doc Make a tagged csum. The meanings are:
%% none / ?CSUM_TAG_NONE
@ -313,6 +328,9 @@ make_tagged_csum(?CSUM_TAG_SERVER_SHA_ATOM, SHA) ->
make_tagged_csum(?CSUM_TAG_SERVER_REGEN_SHA_ATOM, SHA) ->
<<?CSUM_TAG_SERVER_REGEN_SHA:8, SHA/binary>>.
make_client_csum(BinOrList) ->
make_tagged_csum(?CSUM_TAG_CLIENT_SHA_ATOM, checksum_chunk(BinOrList)).
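%% For example, make_client_csum(<<"data">>) returns a binary whose
%% first byte is ?CSUM_TAG_CLIENT_SHA, followed by the 20-byte SHA-1 of
%% <<"data">>; unmake_tagged_csum/1 below splits exactly that layout.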
unmake_tagged_csum(<<Tag:8, Rest/binary>>) ->
{Tag, Rest}.
@ -360,7 +378,7 @@ wait_for_death(Pid, Iters) when is_pid(Pid) ->
false ->
ok;
true ->
timer:sleep(10),
wait_for_death(Pid, Iters-1)
end.
@ -431,3 +449,17 @@ bool2int(true) -> 1;
bool2int(false) -> 0.
int2bool(0) -> false;
int2bool(I) when is_integer(I) -> true.
read_opts_default(#read_opts{}=ReadOpts) ->
ReadOpts;
read_opts_default(A) when A == 'undefined'; A == 'noopt'; A == 'none' ->
#read_opts{};
read_opts_default(A) when is_atom(A) ->
#read_opts{}.
ns_info_default(#ns_info{}=NSInfo) ->
NSInfo;
ns_info_default(A) when is_atom(A) ->
#ns_info{}.
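%% The two helpers above let callers pass a bare atom such as 'undefined'
%% where an options record is expected; e.g. read_opts_default(undefined)
%% returns #read_opts{} with every field at its default value.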

View file

@ -22,6 +22,8 @@
-module(machi_yessir_client).
-ifdef(TODO_refactoring_deferred).
-include("machi.hrl").
-include("machi_projection.hrl").
@ -30,7 +32,7 @@
append_chunk/4, append_chunk/5,
append_chunk_extra/5, append_chunk_extra/6,
read_chunk/5, read_chunk/6,
checksum_list/2, checksum_list/3,
list_files/2, list_files/3,
wedge_status/1, wedge_status/2,
@ -173,7 +175,7 @@ read_chunk(_Host, _TcpPort, EpochID, File, Offset, Size)
%% @doc Fetch the list of chunk checksums for `File'.
checksum_list(#yessir{name=Name,chunk_size=ChunkSize}, File) ->
case get({Name,offset,File}) of
undefined ->
{error, no_such_file};
@ -187,10 +189,10 @@ checksum_list(#yessir{name=Name,chunk_size=ChunkSize}, _EpochID, File) ->
%% @doc Fetch the list of chunk checksums for `File'.
checksum_list(_Host, _TcpPort, File) ->
Sock = connect(#p_srvr{proto_mod=?MODULE}),
try
checksum_list(Sock, File)
after
disconnect(Sock)
end.
@ -509,3 +511,5 @@ disconnect(#yessir{name=Name}) ->
%% =INFO REPORT==== 17-May-2015::18:57:52 ===
%% Repair success: tail a of [a] finished ap_mode repair ID {a,{1431,856671,140404}}: ok
%% Stats [{t_in_files,0},{t_in_chunks,10413},{t_in_bytes,682426368},{t_out_files,0},{t_out_chunks,10413},{t_out_bytes,682426368},{t_bad_chunks,0},{t_elapsed_seconds,1.591}]
-endif. % TODO_refactoring_deferred

View file

@ -44,6 +44,8 @@ verify_file_checksums_test2() ->
TcpPort = 32958,
DataDir = "./data",
W_props = [{initial_wedged, false}],
NSInfo = undefined,
NoCSum = <<>>,
try
machi_test_util:start_flu_package(verify1_flu, TcpPort, DataDir,
W_props),
@ -51,8 +53,8 @@ verify_file_checksums_test2() ->
try
Prefix = <<"verify_prefix">>,
NumChunks = 10,
[{ok, _} = ?FLU_C:append_chunk(Sock1, NSInfo, ?DUMMY_PV1_EPOCH,
Prefix, <<X:(X*8)/big>>, NoCSum) ||
X <- lists:seq(1, NumChunks)],
{ok, [{_FileSize,File}]} = ?FLU_C:list_files(Sock1, ?DUMMY_PV1_EPOCH),
?assertEqual({ok, []},

View file

@ -118,7 +118,10 @@ append(CRIndex, Bin, #state{verbose=V}=S) ->
{_SimSelfName, C} = lists:nth(CRIndex, CRList),
Prefix = <<"pre">>,
Len = byte_size(Bin),
NSInfo = #ns_info{},
NoCSum = <<>>,
Opts1 = #append_opts{},
Res = (catch machi_cr_client:append_chunk(C, NSInfo, Prefix, Bin, NoCSum, Opts1, sec(1))),
case Res of
{ok, {_Off, Len, _FileName}=Key} ->
case ets:insert_new(?WRITTEN_TAB, {Key, Bin}) of
@ -190,6 +193,7 @@ change_partition(Partition,
%% Don't wait for stable chain, tick will be executed on demand
%% in append operations
_ = tick(S),
ok.
%% Generators
@ -407,8 +411,8 @@ stabilize(0, _T) ->
stabilize(_CmdsLen, #target{flu_names=FLUNames, mgr_names=MgrNames,
verbose=Verbose}) ->
machi_partition_simulator:no_partitions(),
true = wait_until_stable(chain_state_all_ok(FLUNames), FLUNames, MgrNames,
100, Verbose),
ok.
chain_state_all_ok(FLUNames) ->
@ -427,7 +431,7 @@ confirm_result(_T) ->
0 -> ok;
_ ->
DumpFailed = filename:join(DirBase, "dump-failed-" ++ Suffix),
?V("Dump failed ETS tab to: ~w~n", [DumpFailed]),
?V("Dump failed ETS tab to: ~s~n", [DumpFailed]),
ets:tab2file(?FAILED_TAB, DumpFailed)
end,
case Critical of
@ -450,14 +454,14 @@ confirm_written(C) ->
assert_chunk(C, {Off, Len, FileName}=Key, Bin) ->
%% TODO: This is probably a bug; read_chunk responds with a filename of `string()' type
%% TODO: Use CSum instead of binary (after discussion about CSum has calmed down?)
NSInfo = undefined,
case (catch machi_cr_client:read_chunk(C, NSInfo, FileName, Off, Len, undefined, sec(3))) of
{ok, {[{FileName, Off, Bin, _}], []}} ->
ok;
{ok, Got} ->
?V("read_chunk got different binary for Key=~p~n", [Key]),
?V(" Expected: ~p~n", [{[{FileNameStr, Off, Bin, <<"CSum-NYI">>}], []}]),
?V(" Expected: ~p~n", [{[{FileName, Off, Bin, <<"CSum-NYI">>}], []}]),
?V(" Got: ~p~n", [Got]),
{error, different_binary};
{error, Reason} ->
@ -479,7 +483,7 @@ eqc_verbose() ->
os:getenv("EQC_VERBOSE") =:= "true".
eqc_timeout(Default) ->
PropTimeout = case os:getenv("EQC_TIMEOUT") of
PropTimeout = case os:getenv("EQC_TIME") of
false -> Default;
V -> list_to_integer(V)
end,
@ -554,8 +558,10 @@ wait_until_stable(ExpectedChainState, FLUNames, MgrNames, Retries, Verbose) ->
FCList = fc_list(),
wait_until_stable1(ExpectedChainState, TickFun, FCList, Retries, Verbose).
wait_until_stable1(ExpectedChainState, _TickFun, FCList, 0, _Verbose) ->
?V(" [ERROR] _ExpectedChainState ~p\n", [ExpectedChainState]),
?V(" [ERROR] wait_until_stable failed.... : ~p~n", [chain_state(FCList)]),
?V(" [ERROR] norm.... : ~p~n", [normalize_chain_state(chain_state(FCList))]),
false;
wait_until_stable1(ExpectedChainState, TickFun, FCList, Reties, Verbose) ->
[TickFun(3, 0, 100) || _ <- lists:seq(1, 3)],

View file

@ -134,6 +134,7 @@ Press control-c to interrupt the test....".
%% convergence_demo_testfun(3).
-define(DEFAULT_MGR_OPTS, [{private_write_verbose, false},
{private_write_verbose_confirm, true},
{active_mode,false},
{use_partition_simulator, true}]).
@ -150,7 +151,8 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) ->
%% Faster test startup, commented: io:format(user, short_doc(), []),
%% Faster test startup, commented: timer:sleep(3000),
Apps = [sasl, ranch],
[application:start(App) || App <- Apps],
MgrOpts = MgrOpts0 ++ ?DEFAULT_MGR_OPTS,
TcpPort = proplists:get_value(port_base, MgrOpts, 62877),
@ -187,15 +189,18 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) ->
end || #p_srvr{name=Name}=P <- Ps],
MembersDict = machi_projection:make_members_dict(Ps),
Witnesses = proplists:get_value(witnesses, MgrOpts, []),
CMode = case {Witnesses, proplists:get_value(consistency_mode, MgrOpts,
ap_mode)} of
{[_|_], _} -> cp_mode;
{_, cp_mode} -> cp_mode;
{_, ap_mode} -> ap_mode
end,
MgrNamez = [begin
MgrName = machi_flu_psup:make_mgr_supname(Name),
ok = ?MGR:set_chain_members(MgrName, ch_demo, 0, CMode,
MembersDict,Witnesses),
{Name, MgrName}
end || #p_srvr{name=Name} <- Ps],
try
[{_, Ma}|_] = MgrNamez,
@ -303,9 +308,9 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) ->
[{FLU, true} = {FLU, ?MGR:projection_transitions_are_sane_retrospective(Psx, FLU)} ||
{FLU, Psx} <- PrivProjs]
catch
_Err:_What when CMode == cp_mode ->
io:format(user, "none proj skip detected, TODO? ", []);
_Err:_What when CMode == ap_mode ->
io:format(user, "PrivProjs ~p\n", [PrivProjs]),
exit({line, ?LINE, _Err, _What})
end,
@ -371,9 +376,9 @@ timer:sleep(1234),
{FLU, Psx} <- PrivProjs],
io:format(user, "\nAll sanity checks pass, hooray!\n", [])
catch
_Err:_What when CMode == cp_mode ->
io:format(user, "none proj skip detected, TODO? ", []);
_Err:_What when CMode == ap_mode ->
io:format(user, "Report ~p\n", [Report]),
io:format(user, "PrivProjs ~p\n", [PrivProjs]),
exit({line, ?LINE, _Err, _What})
@ -390,7 +395,8 @@ timer:sleep(1234),
exit(SupPid, normal),
ok = machi_partition_simulator:stop(),
[ok = ?FLU_PC:quit(PPid) || {_, PPid} <- Namez],
machi_util:wait_for_death(SupPid, 100),
[application:start(App) || App <- lists:reverse(Apps)]
end.
%% Many of the static partition lists below have been problematic at one

View file

@ -273,6 +273,17 @@ make_prop_ets() ->
-endif. % EQC
make_advance_fun(FitList, FLUList, MgrList, Num) ->
fun() ->
[begin
[catch machi_fitness:trigger_early_adjustment(Fit, Tgt) ||
Fit <- FitList,
Tgt <- FLUList ],
[catch ?MGR:trigger_react_to_env(Mgr) || Mgr <- MgrList],
ok
end || _ <- lists:seq(1, Num)]
end.
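%% make_advance_fun/4 (above) returns a fun that drives Num deterministic
%% rounds of fitness adjustment plus chain manager reactions, standing in
%% for wall-clock ticks in these tests.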
smoke0_test() ->
TcpPort = 6623,
{[Pa], [M0], _Dirs} = machi_test_util:start_flu_packages(
@ -319,20 +330,24 @@ smoke1_test2() ->
machi_test_util:stop_flu_packages()
end.
nonunanimous_setup_and_fix_test_() ->
os:cmd("rm -f /tmp/moomoo.*"),
{timeout, 1*60, fun() -> nonunanimous_setup_and_fix_test2() end}.
nonunanimous_setup_and_fix_test2() ->
TcpPort = 62877,
MgrOpts = [{active_mode,false}],
{Ps, [Ma,Mb,Mc], Dirs} = machi_test_util:start_flu_packages(
3, TcpPort, "./data.", MgrOpts),
MembersDict = machi_projection:make_members_dict(Ps),
ChainName = my_little_chain,
[machi_chain_manager1:set_chain_members(M, ChainName, 0, ap_mode,
MembersDict, []) || M <- [Ma, Mb]],
[Proxy_a, Proxy_b, Proxy_c] = Proxies =
[element(2, ?FLU_PC:start_link(P)) || P <- Ps],
try
{ok, P1} = ?MGR:test_calc_projection(Ma, false),
P1a = machi_projection:update_checksum(
@ -368,11 +383,110 @@ nonunanimous_setup_and_fix_test() ->
{ok, P2pb} = ?FLU_PC:read_latest_projection(Proxy_b, private),
P2 = P2pb#projection_v1{dbg2=[]},
%% Pspam = machi_projection:update_checksum(
%% P1b#projection_v1{epoch_number=?SPAM_PROJ_EPOCH,
%% dbg=[hello_spam]}),
%% ok = ?FLU_PC:write_projection(Proxy_b, public, Pspam),
Mgrs = [a_chmgr, b_chmgr, c_chmgr],
Advance = make_advance_fun([a_fitness,b_fitness,c_fitness],
[a,b,c],
Mgrs,
3),
Advance(),
{_, _, TheEpoch_3} = ?MGR:trigger_react_to_env(Ma),
{_, _, TheEpoch_3} = ?MGR:trigger_react_to_env(Mb),
{_, _, TheEpoch_3} = ?MGR:trigger_react_to_env(Mc),
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Remove 'a' from the chain.\n", []),
MembersDict4 = machi_projection:make_members_dict(tl(Ps)),
ok = machi_chain_manager1:set_chain_members(
Mb, ChainName, TheEpoch_3, ap_mode, MembersDict4, []),
Advance(),
{ok, {true, _,_,_}} = ?FLU_PC:wedge_status(Proxy_a),
{_, _, TheEpoch_4} = ?MGR:trigger_react_to_env(Mb),
{_, _, TheEpoch_4} = ?MGR:trigger_react_to_env(Mc),
[{ok, #projection_v1{upi=[b,c], repairing=[]}} =
?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- tl(Proxies)],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Add a to the chain again (a is running).\n", []),
MembersDict5 = machi_projection:make_members_dict(Ps),
ok = machi_chain_manager1:set_chain_members(
Mb, ChainName, TheEpoch_4, ap_mode, MembersDict5, []),
Advance(),
{_, _, TheEpoch_5} = ?MGR:trigger_react_to_env(Ma),
{_, _, TheEpoch_5} = ?MGR:trigger_react_to_env(Mb),
{_, _, TheEpoch_5} = ?MGR:trigger_react_to_env(Mc),
[{ok, #projection_v1{upi=[b,c], repairing=[a]}} =
?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- Proxies],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Stop a while a chain member, advance b&c.\n", []),
ok = machi_flu_psup:stop_flu_package(a),
Advance(),
{_, _, TheEpoch_6} = ?MGR:trigger_react_to_env(Mb),
{_, _, TheEpoch_6} = ?MGR:trigger_react_to_env(Mc),
[{ok, #projection_v1{upi=[b,c], repairing=[]}} =
?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- tl(Proxies)],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Remove 'a' from the chain.\n", []),
MembersDict7 = machi_projection:make_members_dict(tl(Ps)),
ok = machi_chain_manager1:set_chain_members(
Mb, ChainName, TheEpoch_6, ap_mode, MembersDict7, []),
Advance(),
{_, _, TheEpoch_7} = ?MGR:trigger_react_to_env(Mb),
{_, _, TheEpoch_7} = ?MGR:trigger_react_to_env(Mc),
[{ok, #projection_v1{upi=[b,c], repairing=[]}} =
?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- tl(Proxies)],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Start a, advance.\n", []),
Opts = [{active_mode, false}, {initial_wedged, true}],
#p_srvr{name=NameA} = hd(Ps),
{ok,_}=machi_flu_psup:start_flu_package(NameA, TcpPort+1, hd(Dirs), Opts),
Advance(),
{ok, {true, _,_,_}} = ?FLU_PC:wedge_status(Proxy_a),
{ok, {false, EpochID_8,_,_}} = ?FLU_PC:wedge_status(Proxy_b),
{ok, {false, EpochID_8,_,_}} = ?FLU_PC:wedge_status(Proxy_c),
[{ok, #projection_v1{upi=[b,c], repairing=[]}} =
?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- tl(Proxies)],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Stop a, delete a's data, leave it stopped\n", []),
ok = machi_flu_psup:stop_flu_package(a),
Advance(),
machi_flu1_test:clean_up_data_dir(hd(Dirs)),
{ok, {false, _,_,_}} = ?FLU_PC:wedge_status(Proxy_b),
{ok, {false, _,_,_}} = ?FLU_PC:wedge_status(Proxy_c),
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Add a to the chain again (a is stopped).\n", []),
MembersDict9 = machi_projection:make_members_dict(Ps),
{_, _, TheEpoch_9} = ?MGR:trigger_react_to_env(Mb),
ok = machi_chain_manager1:set_chain_members(
Mb, ChainName, TheEpoch_9, ap_mode, MembersDict9, []),
Advance(),
{_, _, TheEpoch_9b} = ?MGR:trigger_react_to_env(Mb),
true = (TheEpoch_9b > TheEpoch_9),
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Start a, and it joins like it ought to\n", []),
{ok,_}=machi_flu_psup:start_flu_package(NameA, TcpPort+1, hd(Dirs), Opts),
Advance(),
{ok, {false, {TheEpoch10,_},_,_}} = ?FLU_PC:wedge_status(Proxy_a),
{ok, {false, {TheEpoch10,_},_,_}} = ?FLU_PC:wedge_status(Proxy_b),
{ok, {false, {TheEpoch10,_},_,_}} = ?FLU_PC:wedge_status(Proxy_c),
[{ok, #projection_v1{upi=[b,c], repairing=[a]}} =
?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- Proxies],
ok
after
[ok = ?FLU_PC:quit(X) || X <- Proxies],

View file

@ -38,6 +38,7 @@ smoke_test_() ->
fun() -> machi_cinfo:private_projection(a) end,
fun() -> machi_cinfo:fitness(a) end,
fun() -> machi_cinfo:chain_manager(a) end,
fun() -> machi_cinfo:flu1(a) end,
fun() -> machi_cinfo:dump() end
]}.

View file

@ -58,9 +58,15 @@ setup_smoke_test(Host, PortBase, Os, Witness_list) ->
%% 4. Wait until all others are using epoch id from #3.
%%
%% Damn, this is a pain to make 100% deterministic, bleh.
CMode = if Witness_list == [] -> ap_mode;
Witness_list /= [] -> cp_mode
end,
ok = machi_chain_manager1:set_chain_members(a_chmgr, ch0, 0, CMode,
D, Witness_list),
ok = machi_chain_manager1:set_chain_members(b_chmgr, ch0, 0, CMode,
D, Witness_list),
ok = machi_chain_manager1:set_chain_members(c_chmgr, ch0, 0, CMode,
D, Witness_list),
run_ticks([a_chmgr,b_chmgr,c_chmgr]),
%% Everyone is settled on the same damn epoch id.
{ok, EpochID} = machi_flu1_client:get_latest_epochid(Host, PortBase+0,
@ -101,6 +107,8 @@ smoke_test2() ->
try
Prefix = <<"pre">>,
Chunk1 = <<"yochunk">>,
NSInfo = undefined,
NoCSum = <<>>,
Host = "localhost",
PortBase = 64454,
Os = [{ignore_stability_time, true}, {active_mode, false}],
@ -108,91 +116,92 @@ smoke_test2() ->
%% Whew ... ok, now start some damn tests.
{ok, C1} = machi_cr_client:start_link([P || {_,P}<-orddict:to_list(D)]),
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum),
{ok, {Off1,Size1,File1}} =
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum),
BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, "foo")},
{error, bad_checksum} =
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, BadCSum),
{ok, {[{_, Off1, Chunk1, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, Size1, undefined),
{ok, PPP} = machi_flu1_client:read_latest_projection(Host, PortBase+0,
private),
%% Verify that the client's CR wrote to all of them.
[{ok, {[{_, Off1, Chunk1, _}], []}} =
machi_flu1_client:read_chunk(
Host, PortBase+X, NSInfo, EpochID, File1, Off1, Size1, undefined) ||
X <- [0,1,2] ],
%% Test read repair: Manually write to head, then verify that
%% read-repair fixes all.
FooOff1 = Off1 + (1024*1024),
[{error, not_written} = machi_flu1_client:read_chunk(
Host, PortBase+X, NSInfo, EpochID,
File1, FooOff1, Size1, undefined) || X <- [0,1,2] ],
ok = machi_flu1_client:write_chunk(Host, PortBase+0, NSInfo, EpochID,
File1, FooOff1, Chunk1, NoCSum),
{ok, {[{File1, FooOff1, Chunk1, _}=_YY], []}} =
machi_flu1_client:read_chunk(Host, PortBase+0, NSInfo, EpochID,
File1, FooOff1, Size1, undefined),
{ok, {[{File1, FooOff1, Chunk1, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File1, FooOff1, Size1, undefined),
[?assertMatch({X,{ok, {[{_, FooOff1, Chunk1, _}], []}}},
{X,machi_flu1_client:read_chunk(
Host, PortBase+X, NSInfo, EpochID,
File1, FooOff1, Size1, undefined)})
|| X <- [0,1,2] ],
%% Test read repair: Manually write to middle, then same checking.
FooOff2 = Off1 + (2*1024*1024),
Chunk2 = <<"Middle repair chunk">>,
Size2 = size(Chunk2),
ok = machi_flu1_client:write_chunk(Host, PortBase+1, NSInfo, EpochID,
File1, FooOff2, Chunk2, NoCSum),
{ok, {[{File1, FooOff2, Chunk2, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File1, FooOff2, Size2, undefined),
[{X,{ok, {[{File1, FooOff2, Chunk2, _}], []}}} =
{X,machi_flu1_client:read_chunk(
Host, PortBase+X, NSInfo, EpochID,
File1, FooOff2, Size2, undefined)} || X <- [0,1,2] ],
%% Misc API smoke & minor regression checks
{error, bad_arg} = machi_cr_client:read_chunk(C1, <<"no">>,
999999999, 1, []),
{ok, {[{_,Off1,Chunk1,_}, {_,FooOff1,Chunk1,_}, {_,FooOff2,Chunk2,_}],
{error, bad_arg} = machi_cr_client:read_chunk(C1, NSInfo, <<"no">>,
999999999, 1, undefined),
{ok, {[{File1,Off1,Chunk1,_}, {File1,FooOff1,Chunk1,_}, {File1,FooOff2,Chunk2,_}],
[]}} =
machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, 88888888, undefined),
%% Checksum list return value is a primitive binary().
{ok, KludgeBin} = machi_cr_client:checksum_list(C1, File1),
true = is_binary(KludgeBin),
{error, bad_arg} = machi_cr_client:checksum_list(C1, <<"!!!!">>),
io:format(user, "\nFiles = ~p\n", [machi_cr_client:list_files(C1)]),
io:format(user, "\nFiles = ~p\n", [machi_cr_client:list_files(C1)]),
%% Exactly one file right now, e.g.,
%% {ok,[{2098202,<<"pre^b144ef13-db4d-4c9f-96e7-caff02dc754f^1">>}]}
{ok, [_]} = machi_cr_client:list_files(C1),
%% Go back and test append_chunk() + extra and write_chunk()
Chunk10 = <<"It's a different chunk!">>,
Size10 = byte_size(Chunk10),
Extra10 = 5,
Opts1 = #append_opts{chunk_extra=Extra10*Size10},
{ok, {Off10,Size10,File10}} =
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk10,
NoCSum, Opts1),
{ok, {[{_, Off10, Chunk10, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File10, Off10, Size10, undefined),
[begin
Offx = Off10 + (Seq * Size10),
%% TODO: uncomment when written/not_written enforcement is available.
%% {error,not_written} = machi_cr_client:read_chunk(C1, NSInfo, File10,
%% Offx, Size10),
{ok, {Offx,Size10,File10}} =
machi_cr_client:write_chunk(C1, NSInfo, File10, Offx, Chunk10, NoCSum),
{ok, {[{_, Offx, Chunk10, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File10, Offx, Size10, undefined)
end || Seq <- lists:seq(1, Extra10)],
{ok, {Off11,Size11,File11}} =
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk10, NoCSum),
%% %% Double-check that our reserved extra bytes were really honored!
%% true = (Off11 > (Off10 + (Extra10 * Size10))),
io:format(user, "\nFiles = ~p\n", [machi_cr_client:list_files(C1)]),
@ -218,6 +227,8 @@ witness_smoke_test2() ->
try
Prefix = <<"pre">>,
Chunk1 = <<"yochunk">>,
NSInfo = undefined,
NoCSum = <<>>,
Host = "localhost",
PortBase = 64444,
Os = [{ignore_stability_time, true}, {active_mode, false},
@ -227,14 +238,15 @@ witness_smoke_test2() ->
%% Whew ... ok, now start some damn tests.
{ok, C1} = machi_cr_client:start_link([P || {_,P}<-orddict:to_list(D)]),
{ok, _} = machi_cr_client:append_chunk(C1, NSInfo, Prefix,
Chunk1, NoCSum),
{ok, {Off1,Size1,File1}} =
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum),
BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, "foo")},
{error, bad_checksum} =
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, BadCSum),
{ok, {[{_, Off1, Chunk1, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, Size1, undefined),
%% Stop 'b' and let the chain reset.
ok = machi_flu_psup:stop_flu_package(b),
@ -247,24 +259,25 @@ witness_smoke_test2() ->
%% Let's wedge OurWitness and see what happens: timeout/partition.
#p_srvr{name=WitName, address=WitA, port=WitP} =
orddict:fetch(OurWitness, D),
{ok, {false, EpochID2,_,_}} = machi_flu1_client:wedge_status(WitA, WitP),
machi_flu1:wedge_myself(WitName, EpochID2),
case machi_flu1_client:wedge_status(WitA, WitP) of
{ok, {true, EpochID2,_,_}} ->
ok;
{ok, {false, EpochID2,_,_}} ->
%% This is racy. Work around it by sleeping a while.
timer:sleep(6*1000),
{ok, {true, EpochID2,_,_}} =
machi_flu1_client:wedge_status(WitA, WitP)
end,
%% Chunk1 is still readable: not affected by wedged witness head.
{ok, {[{_, Off1, Chunk1, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, Size1, undefined),
%% But because the head is wedged, an append will fail.
{error, partition} =
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum,
#append_opts{}, 1*1000),
%% The witness's wedge status should cause timeout/partition
%% for write_chunk also.
@ -273,7 +286,7 @@ witness_smoke_test2() ->
File10 = File1,
Offx = Off1 + (1 * Size10),
{error, partition} =
machi_cr_client:write_chunk(C1, File10, Offx, Chunk10, 1*1000),
machi_cr_client:write_chunk(C1, NSInfo, File10, Offx, Chunk10, NoCSum, 1*1000),
ok
after


@ -35,10 +35,14 @@
%% EUNIT TEST DEFINITION
eqc_test_() ->
{timeout, 60,
PropTimeout = case os:getenv("EQC_TIME") of
false -> 30;
V -> list_to_integer(V)
end,
{timeout, PropTimeout*2 + 30,
{spawn,
[
?_assertEqual(true, eqc:quickcheck(eqc:testing_time(30, ?QC_OUT(prop_ok()))))
?_assertEqual(true, eqc:quickcheck(eqc:testing_time(PropTimeout, ?QC_OUT(prop_ok()))))
]
}}.
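The hard-coded 30-second property runtime is replaced by an EQC_TIME environment variable here. A hedged sketch of the intended use; the rebar invocation is an assumption, not taken from this diff:

    %% $ EQC_TIME=120 rebar eunit
    %% runs the property for 120 seconds, with the surrounding EUnit
    %% timeout scaled to 2*120 + 30 = 270 seconds of slack.
    PropTimeout = case os:getenv("EQC_TIME") of
                      false -> 30;                % default when unset
                      V -> list_to_integer(V)     % seconds
                  end.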


@ -38,7 +38,7 @@ clean_up_data_dir(DataDir) ->
-ifndef(PULSE).
-define(TESTDIR, "./t").
-define(HYOOGE, 1 * 1024 * 1024 * 1024). % 1 long GB
-define(HYOOGE, 75 * 1024 * 1024). % 75 MBytes
random_binary_single() ->
%% OK, I guess it's not that random...
@ -119,7 +119,7 @@ multiple_chunks_read_test_() ->
?_assertEqual(ok, machi_file_proxy:trim(Pid, 0, 1, false)),
?_assertMatch({ok, {[], [{"test", 0, 1}]}},
machi_file_proxy:read(Pid, 0, 1,
[{needs_trimmed, true}])),
#read_opts{needs_trimmed=true})),
?_assertMatch({ok, "test", _}, machi_file_proxy:append(Pid, random_binary(0, 1024))),
?_assertEqual(ok, machi_file_proxy:write(Pid, 10000, <<"fail">>)),
?_assertEqual(ok, machi_file_proxy:write(Pid, 20000, <<"fail">>)),
@ -134,7 +134,7 @@ multiple_chunks_read_test_() ->
machi_file_proxy:read(Pid, 1024, 530000)),
?_assertMatch({ok, {[{"test", 1, _, _}], [{"test", 0, 1}]}},
machi_file_proxy:read(Pid, 0, 1024,
[{needs_trimmed, true}]))
#read_opts{needs_trimmed=true}))
]
end}.
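Read options move from a proplist to a #read_opts{} record in this changeset. A before/after sketch with illustrative variable names; needs_trimmed is the only field shown in this diff, so no other fields are assumed:

    %% before: machi_file_proxy:read(Pid, Off, Len, [{needs_trimmed, true}])
    %% after:
    {ok, {_Chunks, _Trimmed}} =
        machi_file_proxy:read(Pid, Off, Len, #read_opts{needs_trimmed=true}).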


@ -30,12 +30,69 @@
-define(FLU, machi_flu1).
-define(FLU_C, machi_flu1_client).
get_env_vars(App, Ks) ->
Raw = [application:get_env(App, K) || K <- Ks],
Old = lists:zip(Ks, Raw),
{App, Old}.
clean_up_env_vars({App, Old}) ->
[case Res of
undefined ->
application:unset_env(App, K);
{ok, V} ->
application:set_env(App, K, V)
end || {K, Res} <- Old].
filter_env_var({ok, V}) -> V;
filter_env_var(Else) -> Else.
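A hedged usage sketch for the save/restore helpers above, so a test can mutate the application environment and still leave it exactly as found (the key and value here are illustrative):

    Saved = get_env_vars(machi, [flu_data_dir]),
    try
        application:set_env(machi, flu_data_dir, "./data.sketch")
        %% ... exercise code that reads the env here ...
    after
        clean_up_env_vars(Saved)    % set_env/3 or unset_env/2 per key
    end.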
clean_up_data_dir(DataDir) ->
[begin
Fs = filelib:wildcard(DataDir ++ Glob),
[file:delete(F) || F <- Fs],
[file:del_dir(F) || F <- Fs]
end || Glob <- ["*/*/*/*", "*/*/*", "*/*", "*"] ],
_ = file:del_dir(DataDir),
ok.
start_flu_package(RegName, TcpPort, DataDir) ->
start_flu_package(RegName, TcpPort, DataDir, []).
start_flu_package(RegName, TcpPort, DataDir, Props) ->
case proplists:get_value(save_data_dir, Props) of
true ->
ok;
_ ->
clean_up_data_dir(DataDir)
end,
maybe_start_sup(),
machi_flu_psup:start_flu_package(RegName, TcpPort, DataDir, Props).
stop_flu_package(FluName) ->
machi_flu_psup:stop_flu_package(FluName),
Pid = whereis(machi_sup),
exit(Pid, normal),
machi_util:wait_for_death(Pid, 100).
maybe_start_sup() ->
case whereis(machi_sup) of
undefined ->
machi_sup:start_link(),
%% evil but we have to let stuff start up
timer:sleep(10),
maybe_start_sup();
Pid -> Pid
end.
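Together these helpers give every test the same start/stop harness. A minimal sketch of the convention (FLU name, port, and data directory are illustrative):

    _ = start_flu_package(sketch_flu, 12960, "./data.sketch"),
    try
        ok   %% ... test body ...
    after
        stop_flu_package(sketch_flu)
    end.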
-ifndef(PULSE).
flu_smoke_test() ->
Host = "localhost",
TcpPort = 12957,
DataDir = "./data",
NSInfo = undefined,
NoCSum = <<>>,
Prefix = <<"prefix!">>,
BadPrefix = BadFile = "no/good",
W_props = [{initial_wedged, false}],
@ -43,32 +100,31 @@ flu_smoke_test() ->
try
Msg = "Hello, world!",
Msg = ?FLU_C:echo(Host, TcpPort, Msg),
{error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort,
?DUMMY_PV1_EPOCH,
"does-not-exist"),
{error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort,
?DUMMY_PV1_EPOCH, BadFile),
{error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort,"does-not-exist"),
{error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, BadFile),
{ok, []} = ?FLU_C:list_files(Host, TcpPort, ?DUMMY_PV1_EPOCH),
{ok, {false, _}} = ?FLU_C:wedge_status(Host, TcpPort),
{ok, {false, _,_,_}} = ?FLU_C:wedge_status(Host, TcpPort),
Chunk1 = <<"yo!">>,
{ok, {Off1,Len1,File1}} = ?FLU_C:append_chunk(Host, TcpPort,
{ok, {Off1,Len1,File1}} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo,
?DUMMY_PV1_EPOCH,
Prefix, Chunk1),
{ok, {[{_, Off1, Chunk1, _}], _}} = ?FLU_C:read_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH,
File1, Off1, Len1, []),
{ok, KludgeBin} = ?FLU_C:checksum_list(Host, TcpPort,
?DUMMY_PV1_EPOCH, File1),
Prefix, Chunk1, NoCSum),
{ok, {[{_, Off1, Chunk1, _}], _}} = ?FLU_C:read_chunk(Host, TcpPort,
NSInfo, ?DUMMY_PV1_EPOCH,
File1, Off1, Len1,
noopt),
{ok, KludgeBin} = ?FLU_C:checksum_list(Host, TcpPort, File1),
true = is_binary(KludgeBin),
{error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort,
{error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo,
?DUMMY_PV1_EPOCH,
BadPrefix, Chunk1),
BadPrefix, Chunk1, NoCSum),
{ok, [{_,File1}]} = ?FLU_C:list_files(Host, TcpPort, ?DUMMY_PV1_EPOCH),
Len1 = size(Chunk1),
{error, not_written} = ?FLU_C:read_chunk(Host, TcpPort,
?DUMMY_PV1_EPOCH,
File1, Off1*983829323, Len1, []),
NSInfo, ?DUMMY_PV1_EPOCH,
File1, Off1*983829323, Len1,
noopt),
%% XXX FIXME
%%
%% This is failing because the read extends past the end of the file.
@ -77,19 +133,22 @@ flu_smoke_test() ->
%% of the read will cause it to fail.
%%
%% {error, partial_read} = ?FLU_C:read_chunk(Host, TcpPort,
%% ?DUMMY_PV1_EPOCH,
%% NSInfo, ?DUMMY_PV1_EPOCH,
%% File1, Off1, Len1*9999),
{ok, {Off1b,Len1b,File1b}} = ?FLU_C:append_chunk(Host, TcpPort,
{ok, {Off1b,Len1b,File1b}} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo,
?DUMMY_PV1_EPOCH,
Prefix, Chunk1),
Prefix, Chunk1, NoCSum),
Extra = 42,
{ok, {Off1c,Len1c,File1c}} = ?FLU_C:append_chunk_extra(Host, TcpPort,
Opts1 = #append_opts{chunk_extra=Extra},
{ok, {Off1c,Len1c,File1c}} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo,
?DUMMY_PV1_EPOCH,
Prefix, Chunk1, Extra),
Prefix, Chunk1, NoCSum,
Opts1, infinity),
{ok, {Off1d,Len1d,File1d}} = ?FLU_C:append_chunk(Host, TcpPort,
NSInfo,
?DUMMY_PV1_EPOCH,
Prefix, Chunk1),
Prefix, Chunk1, NoCSum),
if File1b == File1c, File1c == File1d ->
true = (Off1c == Off1b + Len1b),
true = (Off1d == Off1c + Len1c + Extra);
@ -97,27 +156,44 @@ flu_smoke_test() ->
exit(not_mandatory_but_test_expected_same_file_fixme)
end,
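The separate append_chunk_extra() entry point is folded into append_chunk() in this changeset, with the byte reservation carried in an #append_opts{} record, as in the Opts1 call above. A sketch of the new shape, reusing the bindings in scope:

    Opts = #append_opts{chunk_extra=Extra},
    {ok, {_Off, _Len, _File}} =
        ?FLU_C:append_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH,
                            Prefix, Chunk1, NoCSum, Opts, infinity).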
Chunk1_cs = {<<?CSUM_TAG_NONE:8, 0:(8*20)>>, Chunk1},
{ok, {Off1e,Len1e,File1e}} = ?FLU_C:append_chunk(Host, TcpPort,
?DUMMY_PV1_EPOCH,
Prefix, Chunk1_cs),
Chunk2 = <<"yo yo">>,
Len2 = byte_size(Chunk2),
Off2 = ?MINIMUM_OFFSET + 77,
File2 = "smoke-whole-file^^0^1^1",
ok = ?FLU_C:write_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH,
File2, Off2, Chunk2),
{error, bad_arg} = ?FLU_C:write_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH,
BadFile, Off2, Chunk2),
ok = ?FLU_C:write_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH,
File2, Off2, Chunk2, NoCSum),
{error, bad_arg} = ?FLU_C:write_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH,
BadFile, Off2, Chunk2, NoCSum),
{ok, {[{_, Off2, Chunk2, _}], _}} =
?FLU_C:read_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH, File2, Off2, Len2, []),
?FLU_C:read_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH, File2, Off2, Len2, noopt),
{error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort,
?DUMMY_PV1_EPOCH,
"no!!", Off2, Len2, []),
NSInfo, ?DUMMY_PV1_EPOCH,
"no!!", Off2, Len2, noopt),
{error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort,
?DUMMY_PV1_EPOCH,
BadFile, Off2, Len2, []),
NSInfo, ?DUMMY_PV1_EPOCH,
BadFile, Off2, Len2, noopt),
%% Make a connected socket.
Sock1 = ?FLU_C:connect(#p_srvr{address=Host, port=TcpPort}),
%% Let's test some cluster version enforcement.
Good_EpochNum = 0,
Good_NSVersion = 0,
Good_NS = <<>>,
{ok, {false, {Good_EpochNum,_}, Good_NSVersion, Good_NS}} =
?FLU_C:wedge_status(Sock1),
NS_good = #ns_info{version=Good_NSVersion, name=Good_NS},
{ok, {[{_, Off2, Chunk2, _}], _}} =
?FLU_C:read_chunk(Sock1, NS_good, ?DUMMY_PV1_EPOCH,
File2, Off2, Len2, noopt),
NS_bad_version = #ns_info{version=1, name=Good_NS},
NS_bad_name = #ns_info{version=Good_NSVersion, name= <<"foons">>},
{error, bad_epoch} =
?FLU_C:read_chunk(Sock1, NS_bad_version, ?DUMMY_PV1_EPOCH,
File2, Off2, Len2, noopt),
{error, bad_arg} =
?FLU_C:read_chunk(Sock1, NS_bad_name, ?DUMMY_PV1_EPOCH,
File2, Off2, Len2, noopt),
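The #ns_info{} checks above encode the enforcement split: a stale namespace version fails like a stale epoch, while a wrong namespace name is a plain bad argument. Summarized as a sketch:

    %% version mismatch -> {error, bad_epoch}
    %% name mismatch    -> {error, bad_arg}
    %% exact match      -> the normal {ok, {Chunks, Trimmed}} read result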
%% We know that File1 still exists. Pretend that we've done a
%% migration and exercise the delete_migration() API.
@ -134,8 +210,7 @@ flu_smoke_test() ->
{error, bad_arg} = ?FLU_C:trunc_hack(Host, TcpPort,
?DUMMY_PV1_EPOCH, BadFile),
ok = ?FLU_C:quit(?FLU_C:connect(#p_srvr{address=Host,
port=TcpPort}))
ok = ?FLU_C:quit(Sock1)
after
machi_test_util:stop_flu_package()
end.
@ -148,7 +223,7 @@ flu_projection_smoke_test() ->
try
[ok = flu_projection_common(Host, TcpPort, T) ||
T <- [public, private] ]
%% , {ok, {false, EpochID1}} = ?FLU_C:wedge_status(Host, TcpPort),
%% , {ok, {false, EpochID1,_,_}} = ?FLU_C:wedge_status(Host, TcpPort),
%% io:format(user, "EpochID1 ~p\n", [EpochID1])
after
machi_test_util:stop_flu_package()
@ -183,13 +258,15 @@ bad_checksum_test() ->
DataDir = "./data.bct",
Opts = [{initial_wedged, false}],
{_,_,_} = machi_test_util:start_flu_package(projection_test_flu, TcpPort, DataDir, Opts),
NSInfo = undefined,
try
Prefix = <<"some prefix">>,
Chunk1 = <<"yo yo yo">>,
Chunk1_badcs = {<<?CSUM_TAG_CLIENT_SHA:8, 0:(8*20)>>, Chunk1},
{error, bad_checksum} = ?FLU_C:append_chunk(Host, TcpPort,
BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, ".................")},
{error, bad_checksum} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo,
?DUMMY_PV1_EPOCH,
Prefix, Chunk1_badcs),
Prefix,
Chunk1, BadCSum),
ok
after
machi_test_util:stop_flu_package()
@ -201,6 +278,8 @@ witness_test() ->
DataDir = "./data.witness",
Opts = [{initial_wedged, false}, {witness_mode, true}],
{_,_,_} = machi_test_util:start_flu_package(projection_test_flu, TcpPort, DataDir, Opts),
NSInfo = undefined,
NoCSum = <<>>,
try
Prefix = <<"some prefix">>,
Chunk1 = <<"yo yo yo">>,
@ -213,15 +292,14 @@ witness_test() ->
{ok, EpochID1} = ?FLU_C:get_latest_epochid(Host, TcpPort, private),
%% Witness-protected ops all fail
{error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort, EpochID1,
Prefix, Chunk1),
{error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo, EpochID1,
Prefix, Chunk1, NoCSum),
File = <<"foofile">>,
{error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort, EpochID1,
File, 9999, 9999, []),
{error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, EpochID1,
File),
{error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort, NSInfo, EpochID1,
File, 9999, 9999, noopt),
{error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, File),
{error, bad_arg} = ?FLU_C:list_files(Host, TcpPort, EpochID1),
{ok, {false, EpochID1}} = ?FLU_C:wedge_status(Host, TcpPort),
{ok, {false, EpochID1,_,_}} = ?FLU_C:wedge_status(Host, TcpPort),
{ok, _} = ?FLU_C:get_latest_epochid(Host, TcpPort, public),
{ok, _} = ?FLU_C:read_latest_projection(Host, TcpPort, public),
{error, not_written} = ?FLU_C:read_projection(Host, TcpPort,


@ -84,20 +84,23 @@ partial_stop_restart2() ->
WedgeStatus = fun({_,#p_srvr{address=Addr, port=TcpPort}}) ->
machi_flu1_client:wedge_status(Addr, TcpPort)
end,
NSInfo = undefined,
Append = fun({_,#p_srvr{address=Addr, port=TcpPort}}, EpochID) ->
NoCSum = <<>>,
machi_flu1_client:append_chunk(Addr, TcpPort,
EpochID,
<<"prefix">>, <<"data">>)
NSInfo, EpochID,
<<"prefix">>,
<<"data">>, NoCSum)
end,
try
[Start(P) || P <- Ps],
[{ok, {true, _}} = WedgeStatus(P) || P <- Ps], % all are wedged
[{ok, {true, _,_,_}} = WedgeStatus(P) || P <- Ps], % all are wedged
[{error,wedged} = Append(P, ?DUMMY_PV1_EPOCH) || P <- Ps], % all are wedged
[machi_chain_manager1:set_chain_members(ChMgr, Dict) ||
ChMgr <- ChMgrs ],
{ok, {false, EpochID1}} = WedgeStatus(hd(Ps)),
[{ok, {false, EpochID1}} = WedgeStatus(P) || P <- Ps], % *not* wedged
{ok, {false, EpochID1,_,_}} = WedgeStatus(hd(Ps)),
[{ok, {false, EpochID1,_,_}} = WedgeStatus(P) || P <- Ps], % *not* wedged
[{ok,_} = Append(P, EpochID1) || P <- Ps], % *not* wedged
{ok, {_,_,File1}} = Append(hd(Ps), EpochID1),
@ -123,9 +126,9 @@ partial_stop_restart2() ->
Epoch_m = Proj_m#projection_v1.epoch_number,
%% Confirm that all FLUs are *not* wedged, with correct proj & epoch
Proj_mCSum = Proj_m#projection_v1.epoch_csum,
[{ok, {false, {Epoch_m, Proj_mCSum}}} = WedgeStatus(P) || % *not* wedged
[{ok, {false, {Epoch_m, Proj_mCSum},_,_}} = WedgeStatus(P) || % *not* wedged
P <- Ps],
{ok, {false, EpochID1}} = WedgeStatus(hd(Ps)),
{ok, {false, EpochID1,_,_}} = WedgeStatus(hd(Ps)),
[{ok,_} = Append(P, EpochID1) || P <- Ps], % *not* wedged
%% Stop all but 'a'.
@ -145,10 +148,10 @@ partial_stop_restart2() ->
{error, wedged} = Append(hd(Ps), EpochID1),
{_, #p_srvr{address=Addr_a, port=TcpPort_a}} = hd(Ps),
{error, wedged} = machi_flu1_client:read_chunk(
Addr_a, TcpPort_a, ?DUMMY_PV1_EPOCH,
<<>>, 99999999, 1, []),
{error, wedged} = machi_flu1_client:checksum_list(
Addr_a, TcpPort_a, ?DUMMY_PV1_EPOCH, <<>>),
Addr_a, TcpPort_a, NSInfo, ?DUMMY_PV1_EPOCH,
<<>>, 99999999, 1, undefined),
{error, bad_arg} = machi_flu1_client:checksum_list(
Addr_a, TcpPort_a, <<>>),
%% list_files() is permitted despite wedged status
{ok, _} = machi_flu1_client:list_files(
Addr_a, TcpPort_a, ?DUMMY_PV1_EPOCH),
@ -157,7 +160,7 @@ partial_stop_restart2() ->
{now_using,_,Epoch_n} = machi_chain_manager1:trigger_react_to_env(
hd(ChMgrs)),
true = (Epoch_n > Epoch_m),
{ok, {false, EpochID3}} = WedgeStatus(hd(Ps)),
{ok, {false, EpochID3,_,_}} = WedgeStatus(hd(Ps)),
%% The file we're assigned should be different with the epoch change.
{ok, {_,_,File3}} = Append(hd(Ps), EpochID3),
true = (File1 /= File3),
@ -173,6 +176,19 @@ partial_stop_restart2() ->
ok
end.
p_srvr_rec_test() ->
P = #p_srvr{name=a, address="localhost", port=1024, props=[yo]},
[P] = machi_flu_sup:sanitize_p_srvr_records([P]),
[P] = machi_flu_sup:sanitize_p_srvr_records([P,P]),
[] = machi_flu_sup:sanitize_p_srvr_records([nope]),
[] = machi_flu_sup:sanitize_p_srvr_records([#p_srvr{proto_mod=does_not_exist}]),
[] = machi_flu_sup:sanitize_p_srvr_records([#p_srvr{proto_mod="lists"}]),
[] = machi_flu_sup:sanitize_p_srvr_records([#p_srvr{address=7}]),
[] = machi_flu_sup:sanitize_p_srvr_records([#p_srvr{port=5}]),
[] = machi_flu_sup:sanitize_p_srvr_records([#p_srvr{port=foo}]),
[] = machi_flu_sup:sanitize_p_srvr_records([#p_srvr{props=foo}]),
ok.
-endif. % !PULSE
-endif. % TEST


@ -0,0 +1,307 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2014 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(machi_lifecycle_mgr_test).
-compile(export_all).
-ifdef(TEST).
-ifndef(PULSE).
-include_lib("eunit/include/eunit.hrl").
-include("machi.hrl").
-include("machi_projection.hrl").
-define(MGR, machi_chain_manager1).
setup() ->
catch application:stop(machi),
{ok, SupPid} = machi_sup:start_link(),
error_logger:tty(false),
Dir = "./" ++ atom_to_list(?MODULE) ++ ".datadir",
machi_flu1_test:clean_up_data_dir(Dir ++ "/*/*"),
machi_flu1_test:clean_up_data_dir(Dir),
Envs = [{flu_data_dir, Dir ++ "/data/flu"},
{flu_config_dir, Dir ++ "/etc/flu-config"},
{chain_config_dir, Dir ++ "/etc/chain-config"},
{platform_data_dir, Dir ++ "/data"},
{platform_etc_dir, Dir ++ "/etc"},
{not_used_pending, Dir ++ "/etc/pending"}
],
EnvKeys = [K || {K,_V} <- Envs],
undefined = application:get_env(machi, yo),
Cleanup = machi_flu1_test:get_env_vars(machi, EnvKeys ++ [yo]),
[begin
filelib:ensure_dir(V ++ "/unused"),
application:set_env(machi, K, V)
end || {K, V} <- Envs],
{SupPid, Dir, Cleanup}.
cleanup({SupPid, Dir, Cleanup}) ->
exit(SupPid, normal),
machi_util:wait_for_death(SupPid, 100),
error_logger:tty(true),
catch application:stop(machi),
machi_flu1_test:clean_up_data_dir(Dir ++ "/*/*"),
machi_flu1_test:clean_up_data_dir(Dir),
machi_flu1_test:clean_up_env_vars(Cleanup),
undefined = application:get_env(machi, yo),
ok.
smoke_test_() ->
{timeout, 60, fun() -> smoke_test2() end}.
smoke_test2() ->
YoCleanup = setup(),
try
Prefix = <<"pre">>,
Chunk1 = <<"yochunk">>,
Host = "localhost",
PortBase = 60120,
Pa = #p_srvr{name=a,address="localhost",port=PortBase+0},
Pb = #p_srvr{name=b,address="localhost",port=PortBase+1},
Pc = #p_srvr{name=c,address="localhost",port=PortBase+2},
%% Pstore_a = machi_flu1:make_projection_server_regname(a),
%% Pstore_b = machi_flu1:make_projection_server_regname(b),
%% Pstore_c = machi_flu1:make_projection_server_regname(c),
Pstores = [Pstore_a, Pstore_b, Pstore_c] =
[machi_flu1:make_projection_server_regname(a),
machi_flu1:make_projection_server_regname(b),
machi_flu1:make_projection_server_regname(c)],
ChMgrs = [ChMgr_a, ChMgr_b, ChMgr_c] =
[machi_chain_manager1:make_chmgr_regname(a),
machi_chain_manager1:make_chmgr_regname(b),
machi_chain_manager1:make_chmgr_regname(c)],
Fits = [Fit_a, Fit_b, Fit_c] =
[machi_flu_psup:make_fitness_regname(a),
machi_flu_psup:make_fitness_regname(b),
machi_flu_psup:make_fitness_regname(c)],
Advance = machi_chain_manager1_test:make_advance_fun(
Fits, [a,b,c], ChMgrs, 3),
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("\nSTEP: Start 3 FLUs, no chain.\n", []),
[machi_lifecycle_mgr:make_pending_config(P) || P <- [Pa,Pb,Pc] ],
{[_,_,_],[]} = machi_lifecycle_mgr:process_pending(),
[{ok, #projection_v1{epoch_number=0}} =
machi_projection_store:read_latest_projection(PSTORE, private)
|| PSTORE <- Pstores],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("\nSTEP: Start chain = [a,b,c]\n", []),
C1 = #chain_def_v1{name=cx, mode=ap_mode, full=[Pa,Pb,Pc],
local_run=[a,b,c]},
machi_lifecycle_mgr:make_pending_config(C1),
{[],[_]} = machi_lifecycle_mgr:process_pending(),
Advance(),
[{ok, #projection_v1{all_members=[a,b,c]}} =
machi_projection_store:read_latest_projection(PSTORE, private)
|| PSTORE <- Pstores],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("\nSTEP: Reset chain = [b,c]\n", []),
C2 = #chain_def_v1{name=cx, mode=ap_mode, full=[Pb,Pc],
old_full=[a,b,c], old_witnesses=[],
local_stop=[a], local_run=[b,c]},
machi_lifecycle_mgr:make_pending_config(C2),
{[],[_]} = machi_lifecycle_mgr:process_pending(),
Advance(),
%% a should be down
{'EXIT', _} = (catch machi_projection_store:read_latest_projection(
hd(Pstores), private)),
[{ok, #projection_v1{all_members=[b,c]}} =
machi_projection_store:read_latest_projection(PSTORE, private)
|| PSTORE <- tl(Pstores)],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("\nSTEP: Reset chain = []\n", []),
C3 = #chain_def_v1{name=cx, mode=ap_mode, full=[],
old_full=[b,c], old_witnesses=[],
local_stop=[b,c], local_run=[]},
machi_lifecycle_mgr:make_pending_config(C3),
{[],[_]} = machi_lifecycle_mgr:process_pending(),
Advance(),
%% a,b,c should be down
[{'EXIT', _} = (catch machi_projection_store:read_latest_projection(
PSTORE, private))
|| PSTORE <- Pstores],
ok
after
cleanup(YoCleanup)
end.
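Each STEP above drives the same two-call lifecycle loop. A minimal sketch of that flow, using the API exactly as exercised in this test (FluOrChainDef is an illustrative binding; Advance is the test-local convergence fun):

    _ = machi_lifecycle_mgr:make_pending_config(FluOrChainDef),
    %% Returns {FLUsStarted, ChainsConfigured}; each STEP asserts on
    %% the lengths of these two lists.
    {_FLUs, _Chains} = machi_lifecycle_mgr:process_pending(),
    Advance().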
ast_tuple_syntax_test() ->
T = fun(L) -> machi_lifecycle_mgr:check_ast_tuple_syntax(L) end,
Canon1 = [ {host, "localhost", []},
{host, "localhost", [{client_interface, "1.2.3.4"},
{admin_interface, "5.6.7.8"}]},
{flu, 'fx', "foohost", 4000, []},
switch_old_and_new,
{chain, 'cy', ['fx', 'fy'], [{foo,"yay"},{bar,baz}]} ],
{_Good,[]=_Bad} = T(Canon1),
Canon1_norm = machi_lifecycle_mgr:normalize_ast_tuple_syntax(Canon1),
true = (length(Canon1) == length(Canon1_norm)),
{Canon1_norm_b, []} = T(Canon1_norm),
true = (length(Canon1_norm) == length(Canon1_norm_b)),
{[],[_,_,_,_]} =
T([ {host, 'localhost', []},
{host, 'localhost', yo},
{host, "localhost", [{client_interface, 77.88293829832}]},
{host, "localhost", [{client_interface, "1.2.3.4"},
{bummer, "5.6.7.8"}]} ]),
{[],[_,_,_,_,_,_]} =
T([ {flu, 'fx', 'foohost', 4000, []},
{flu, 'fx', <<"foohost">>, 4000, []},
{flu, 'fx', "foohost", -4000, []},
{flu, 'fx', "foohost", 40009999, []},
{flu, 'fx', "foohost", 4000, gack},
{flu, 'fx', "foohost", 4000, [22]} ]),
{[],[_,_,_]} =
T([ {chain, 'cy', ["fx", "fy"], [foo,{bar,baz}]},
yoloyolo,
{chain, "cy", ["fx", 27], oops,arity,way,way,way,too,big,x}
]).
ast_run_test() ->
PortBase = 20300,
R1 = [
{host, "localhost", "localhost", "localhost", []},
{flu, 'f0', "localhost", PortBase+0, []},
{flu, 'f1', "localhost", PortBase+1, []},
{chain, 'ca', ['f0'], []},
{chain, 'cb', ['f1'], []},
switch_old_and_new,
{flu, 'f2', "localhost", PortBase+2, []},
{flu, 'f3', "localhost", PortBase+3, []},
{flu, 'f4', "localhost", PortBase+4, []},
{chain, 'ca', ['f0', 'f2'], []},
{chain, 'cc', ['f3', 'f4'], []}
],
{ok, Env1} = machi_lifecycle_mgr:run_ast(R1),
%% Uncomment to examine the Env trees.
%% Y1 = {lists:sort(gb_trees:to_list(element(1, Env1))),
%% lists:sort(gb_trees:to_list(element(2, Env1))),
%% element(3, Env1)},
%% io:format(user, "\nY1 ~p\n", [Y1]),
Negative_after_R1 =
[
{host, "localhost", "foo", "foo", []}, % dupe host
{flu, 'f1', "other", PortBase+9999999, []}, % bogus port # (syntax)
{flu, 'f1', "other", PortBase+888, []}, % dupe flu name
{flu, 'f7', "localhost", PortBase+1, []}, % dupe host+port
{chain, 'ca', ['f7'], []}, % unknown flu
{chain, 'cc', ['f0'], []}, % flu previously assigned
{chain, 'ca', cp_mode, ['f0', 'f1', 'f2'], [], []} % mode change
],
[begin
%% io:format(user, "dbg: Neg ~p\n", [Neg]),
{error, _} = machi_lifecycle_mgr:run_ast(R1 ++ [Neg])
end || Neg <- Negative_after_R1],
%% The 'run' phase doesn't blow smoke. What about 'diff'?
{X1a, X1b} = machi_lifecycle_mgr:diff_env(Env1, "localhost"),
%% There's only one host, "localhost", so 'all' should be exactly equal.
{X1a, X1b} = machi_lifecycle_mgr:diff_env(Env1, all),
%% io:format(user, "X1b: ~p\n", [X1b]),
%% Append to the R1 scenario: for chain cc: add f5, remove f4
%% Expect: see pattern matching below on X2b.
R2 = (R1 -- [switch_old_and_new]) ++
[switch_old_and_new,
{flu, 'f5', "localhost", PortBase+5, []},
{chain, 'cc', ['f3','f5'], []}],
{ok, Env2} = machi_lifecycle_mgr:run_ast(R2),
{_X2a, X2b} = machi_lifecycle_mgr:diff_env(Env2, "localhost"),
%% io:format(user, "X2b: ~p\n", [X2b]),
F5_port = PortBase+5,
[#p_srvr{name='f5',address="localhost",port=F5_port},
#chain_def_v1{name='cc',
full=[#p_srvr{name='f3'},#p_srvr{name='f5'}], witnesses=[],
old_full=[f3,f4], old_witnesses=[],
local_run=[f5], local_stop=[f4]}] = X2b,
ok.
ast_then_apply_test_() ->
{timeout, 60, fun() -> ast_then_apply_test2() end}.
ast_then_apply_test2() ->
YoCleanup = setup(),
try
PortBase = 20400,
NumChains = 4,
ChainLen = 3,
FLU_num = NumChains * ChainLen,
FLU_defs = [{flu, list_to_atom("f"++integer_to_list(X)),
"localhost", PortBase+X, []} || X <- lists:seq(1,FLU_num)],
FLU_names = [FLU || {flu,FLU,_,_,_} <- FLU_defs],
Ch_defs = [{chain, list_to_atom("c"++integer_to_list(X)),
lists:sublist(FLU_names, X, 3),
[]} || X <- lists:seq(1, FLU_num, 3)],
R1 = [switch_old_and_new,
{host, "localhost", "localhost", "localhost", []}]
++ FLU_defs ++ Ch_defs,
{ok, Env1} = machi_lifecycle_mgr:run_ast(R1),
{_X1a, X1b} = machi_lifecycle_mgr:diff_env(Env1, "localhost"),
%% io:format(user, "X1b ~p\n", [X1b]),
[machi_lifecycle_mgr:make_pending_config(X) || X <- X1b],
{PassFLUs, PassChains} = machi_lifecycle_mgr:process_pending(),
true = (length(PassFLUs) == length(FLU_defs)),
true = (length(PassChains) == length(Ch_defs)),
%% Kick the chain managers into doing something useful right now.
Pstores = [list_to_atom(atom_to_list(X) ++ "_pstore") || X <- FLU_names],
Fits = [list_to_atom(atom_to_list(X) ++ "_fitness") || X <- FLU_names],
ChMgrs = [list_to_atom(atom_to_list(X) ++ "_chmgr") || X <- FLU_names],
Advance = machi_chain_manager1_test:make_advance_fun(
Fits, FLU_names, ChMgrs, 3),
Advance(),
%% Sanity check: everyone is configured properly.
[begin
{ok, #projection_v1{epoch_number=Epoch, all_members=All,
chain_name=ChainName, upi=UPI}} =
machi_projection_store:read_latest_projection(PStore, private),
%% io:format(user, "~p: epoch ~p all ~p\n", [PStore, Epoch, All]),
true = Epoch > 0,
ChainLen = length(All),
true = (length(UPI) > 0),
{chain, _, Full, []} = lists:keyfind(ChainName, 2, Ch_defs),
true = lists:sort(Full) == lists:sort(All)
end || PStore <- Pstores],
ok
after
cleanup(YoCleanup)
end.
-endif. % !PULSE
-endif. % TEST


@ -24,6 +24,7 @@
-ifdef(TEST).
-ifndef(PULSE).
-include("machi.hrl").
-include("machi_pb.hrl").
-include("machi_projection.hrl").
-include_lib("eunit/include/eunit.hrl").
@ -55,17 +56,18 @@ smoke_test2() ->
%% a separate test module? Or separate test func?
{error, _} = ?C:auth(Clnt, "foo", "bar"),
CoC_n = "", % CoC_namespace (not implemented)
CoC_l = 0, % CoC_locator (not implemented)
Prefix = <<"prefix">>,
Chunk1 = <<"Hello, chunk!">>,
NS = "",
NoCSum = <<>>,
Opts1 = #append_opts{},
{ok, {Off1, Size1, File1}} =
?C:append_chunk(Clnt, CoC_n, CoC_l, Prefix, Chunk1, none, 0),
?C:append_chunk(Clnt, NS, Prefix, Chunk1, NoCSum, Opts1),
true = is_binary(File1),
Chunk2 = "It's another chunk",
CSum2 = {client_sha, machi_util:checksum_chunk(Chunk2)},
{ok, {Off2, Size2, File2}} =
?C:append_chunk(Clnt, CoC_n, CoC_l, Prefix, Chunk2, CSum2, 1024),
?C:append_chunk(Clnt, NS, Prefix, Chunk2, CSum2, Opts1),
Chunk3 = ["This is a ", <<"test,">>, 32, [["Hello, world!"]]],
File3 = File2,
Off3 = Off2 + iolist_size(Chunk2),
@ -76,9 +78,9 @@ smoke_test2() ->
{iolist_to_binary(Chunk2), File2, Off2, Size2},
{iolist_to_binary(Chunk3), File3, Off3, Size3}],
[begin
File = binary_to_list(Fl),
File = Fl,
?assertMatch({ok, {[{File, Off, Ch, _}], []}},
?C:read_chunk(Clnt, Fl, Off, Sz, []))
?C:read_chunk(Clnt, Fl, Off, Sz, undefined))
end || {Ch, Fl, Off, Sz} <- Reads],
{ok, KludgeBin} = ?C:checksum_list(Clnt, File1),
@ -102,16 +104,16 @@ smoke_test2() ->
end || {_Ch, Fl, Off, Sz} <- Reads],
[begin
{ok, {[], Trimmed}} =
?C:read_chunk(Clnt, Fl, Off, Sz, [{needs_trimmed, true}]),
Filename = binary_to_list(Fl),
?C:read_chunk(Clnt, Fl, Off, Sz, #read_opts{needs_trimmed=true}),
Filename = Fl,
?assertEqual([{Filename, Off, Sz}], Trimmed)
end || {_Ch, Fl, Off, Sz} <- Reads],
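Note the filename type change threaded through the reads above: results now carry filenames as binaries, so the old binary_to_list/1 round-trip is dropped. A sketch of the new match, with bindings as in the Reads tuples:

    true = is_binary(Fl),
    {ok, {[{Fl, Off, Ch, _CSum}], []}} =
        ?C:read_chunk(Clnt, Fl, Off, Sz, undefined).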
LargeBytes = binary:copy(<<"x">>, 1024*1024),
LBCsum = {client_sha, machi_util:checksum_chunk(LargeBytes)},
{ok, {Offx, Sizex, Filex}} =
?C:append_chunk(Clnt, CoC_n, CoC_l,
Prefix, LargeBytes, LBCsum, 0),
?C:append_chunk(Clnt, NS,
Prefix, LargeBytes, LBCsum, Opts1),
ok = ?C:trim_chunk(Clnt, Filex, Offx, Sizex),
%% Make sure everything was trimmed
@ -128,7 +130,7 @@ smoke_test2() ->
[begin
{error, trimmed} =
?C:read_chunk(Clnt, Fl, Off, Sz, [])
?C:read_chunk(Clnt, Fl, Off, Sz, undefined)
end || {_Ch, Fl, Off, Sz} <- Reads],
ok
after


@ -32,10 +32,12 @@
api_smoke_test() ->
RegName = api_smoke_flu,
TcpPort = 57124,
TcpPort = 17124,
DataDir = "./data.api_smoke_flu",
W_props = [{active_mode, false},{initial_wedged, false}],
Prefix = <<"prefix">>,
NSInfo = undefined,
NoCSum = <<>>,
try
{[I], _, _} = machi_test_util:start_flu_package(
@ -43,35 +45,42 @@ api_smoke_test() ->
{ok, Prox1} = ?MUT:start_link(I),
try
FakeEpoch = ?DUMMY_PV1_EPOCH,
[{ok, {_,_,_}} = ?MUT:append_chunk(Prox1,
FakeEpoch, Prefix, <<"data">>,
infinity) || _ <- lists:seq(1,5)],
[{ok, {_,_,_}} = ?MUT:append_chunk(
Prox1, NSInfo, FakeEpoch,
Prefix, <<"data">>, NoCSum) ||
_ <- lists:seq(1,5)],
%% Stop the FLU, what happens?
machi_test_util:stop_flu_package(),
[{error,partition} = ?MUT:append_chunk(Prox1,
[{error,partition} = ?MUT:append_chunk(Prox1, NSInfo,
FakeEpoch, Prefix, <<"data-stopped1">>,
infinity) || _ <- lists:seq(1,3)],
NoCSum) || _ <- lists:seq(1,3)],
%% Start the FLU again, we should be able to do stuff immediately
machi_test_util:start_flu_package(RegName, TcpPort, DataDir,
[no_cleanup|W_props]),
MyChunk = <<"my chunk data">>,
{ok, {MyOff,MySize,MyFile}} =
?MUT:append_chunk(Prox1, FakeEpoch, Prefix, MyChunk,
infinity),
{ok, {[{_, MyOff, MyChunk, _}], []}} =
?MUT:read_chunk(Prox1, FakeEpoch, MyFile, MyOff, MySize, []),
MyChunk2 = <<"my chunk data, yeah, again">>,
?MUT:append_chunk(Prox1, NSInfo, FakeEpoch, Prefix, MyChunk,
NoCSum),
{ok, {[{_, MyOff, MyChunk, _MyChunkCSUM}], []}} =
?MUT:read_chunk(Prox1, NSInfo, FakeEpoch, MyFile, MyOff, MySize, undefined),
MyChunk2_parts = [<<"my chunk ">>, "data", <<", yeah, again">>],
MyChunk2 = iolist_to_binary(MyChunk2_parts),
Opts1 = #append_opts{chunk_extra=4242},
{ok, {MyOff2,MySize2,MyFile2}} =
?MUT:append_chunk_extra(Prox1, FakeEpoch, Prefix,
MyChunk2, 4242, infinity),
{ok, {[{_, MyOff2, MyChunk2, _}], []}} =
?MUT:read_chunk(Prox1, FakeEpoch, MyFile2, MyOff2, MySize2, []),
MyChunk_badcs = {<<?CSUM_TAG_CLIENT_SHA:8, 0:(8*20)>>, MyChunk},
{error, bad_checksum} = ?MUT:append_chunk(Prox1, FakeEpoch,
Prefix, MyChunk_badcs),
{error, bad_checksum} = ?MUT:write_chunk(Prox1, FakeEpoch,
<<"foo-file^^0^1^1">>, 99832,
MyChunk_badcs),
?MUT:append_chunk(Prox1, NSInfo, FakeEpoch, Prefix,
MyChunk2_parts, NoCSum, Opts1, infinity),
[{ok, {[{_, MyOff2, MyChunk2, _}], []}} =
?MUT:read_chunk(Prox1, NSInfo, FakeEpoch, MyFile2, MyOff2, MySize2, DefaultOptions) ||
DefaultOptions <- [undefined, noopt, none, any_atom_at_all] ],
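The comprehension above pins down a forgiving default: any of several placeholder atoms in the options position reads back the same chunk, so a caller with no read options can pass a bare atom. A single-call sketch:

    {ok, {[{_, MyOff2, MyChunk2, _}], []}} =
        ?MUT:read_chunk(Prox1, NSInfo, FakeEpoch, MyFile2, MyOff2,
                        MySize2, undefined).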
BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, "...................")},
{error, bad_checksum} = ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch,
Prefix, MyChunk, BadCSum),
{error, bad_checksum} = ?MUT:write_chunk(Prox1, NSInfo, FakeEpoch,
MyFile2,
MyOff2 + size(MyChunk2),
MyChunk, BadCSum,
infinity),
%% Put kick_projection_reaction() in the middle of the test so
%% that any problems with its async nature will (hopefully)
@ -80,9 +89,9 @@ api_smoke_test() ->
%% Alright, now for the rest of the API, whee
BadFile = <<"no-such-file">>,
{error, bad_arg} = ?MUT:checksum_list(Prox1, FakeEpoch, BadFile),
{error, bad_arg} = ?MUT:checksum_list(Prox1, BadFile),
{ok, [_|_]} = ?MUT:list_files(Prox1, FakeEpoch),
{ok, {false, _}} = ?MUT:wedge_status(Prox1),
{ok, {false, _,_,_}} = ?MUT:wedge_status(Prox1),
{ok, {0, _SomeCSum}} = ?MUT:get_latest_epochid(Prox1, public),
{ok, #projection_v1{epoch_number=0}} =
?MUT:read_latest_projection(Prox1, public),
@ -108,9 +117,11 @@ flu_restart_test_() ->
flu_restart_test2() ->
RegName = a,
TcpPort = 57125,
TcpPort = 17125,
DataDir = "./data.api_smoke_flu2",
W_props = [{initial_wedged, false}, {active_mode, false}],
NSInfo = undefined,
NoCSum = <<>>,
try
{[I], _, _} = machi_test_util:start_flu_package(
@ -120,9 +131,8 @@ flu_restart_test2() ->
FakeEpoch = ?DUMMY_PV1_EPOCH,
Data = <<"data!">>,
Dataxx = <<"Fake!">>,
{ok, {Off1,Size1,File1}} = ?MUT:append_chunk(Prox1,
FakeEpoch, <<"prefix">>, Data,
infinity),
{ok, {Off1,Size1,File1}} = ?MUT:append_chunk(Prox1, NSInfo,
FakeEpoch, <<"prefix">>, Data, NoCSum),
P_a = #p_srvr{name=a, address="localhost", port=6622},
P1 = machi_projection:new(1, RegName, [P_a], [], [RegName], [], []),
P1xx = P1#projection_v1{dbg2=["dbg2 changes are ok"]},
@ -146,6 +156,7 @@ flu_restart_test2() ->
%% makes the code a bit convoluted. (No LFE or
%% Elixir macros here, alas, they'd be useful.)
AppendOpts1 = #append_opts{chunk_extra=42},
ExpectedOps =
[
fun(run) -> ?assertEqual({ok, EpochID}, ?MUT:get_epoch_id(Prox1)),
@ -227,35 +238,37 @@ flu_restart_test2() ->
(stop) -> ?MUT:get_all_projections(Prox1, private)
end,
fun(run) -> {ok, {_,_,_}} =
?MUT:append_chunk(Prox1, FakeEpoch,
<<"prefix">>, Data, infinity),
?MUT:append_chunk(Prox1, NSInfo, FakeEpoch,
<<"prefix">>, Data, NoCSum),
ok;
(line) -> io:format("line ~p, ", [?LINE]);
(stop) -> ?MUT:append_chunk(Prox1, FakeEpoch,
<<"prefix">>, Data, infinity)
(stop) -> ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch,
<<"prefix">>, Data, NoCSum)
end,
fun(run) -> {ok, {_,_,_}} =
?MUT:append_chunk_extra(Prox1, FakeEpoch,
<<"prefix">>, Data, 42, infinity),
?MUT:append_chunk(Prox1, NSInfo, FakeEpoch,
<<"prefix">>, Data, NoCSum,
AppendOpts1, infinity),
ok;
(line) -> io:format("line ~p, ", [?LINE]);
(stop) -> ?MUT:append_chunk_extra(Prox1, FakeEpoch,
<<"prefix">>, Data, 42, infinity)
(stop) -> ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch,
<<"prefix">>, Data, NoCSum,
AppendOpts1, infinity)
end,
fun(run) -> {ok, {[{_, Off1, Data, _}], []}} =
?MUT:read_chunk(Prox1, FakeEpoch,
File1, Off1, Size1, []),
?MUT:read_chunk(Prox1, NSInfo, FakeEpoch,
File1, Off1, Size1, undefined),
ok;
(line) -> io:format("line ~p, ", [?LINE]);
(stop) -> ?MUT:read_chunk(Prox1, FakeEpoch,
File1, Off1, Size1, [])
(stop) -> ?MUT:read_chunk(Prox1, NSInfo, FakeEpoch,
File1, Off1, Size1, undefined)
end,
fun(run) -> {ok, KludgeBin} =
?MUT:checksum_list(Prox1, FakeEpoch, File1),
?MUT:checksum_list(Prox1, File1),
true = is_binary(KludgeBin),
ok;
(line) -> io:format("line ~p, ", [?LINE]);
(stop) -> ?MUT:checksum_list(Prox1, FakeEpoch, File1)
(stop) -> ?MUT:checksum_list(Prox1, File1)
end,
fun(run) -> {ok, _} =
?MUT:list_files(Prox1, FakeEpoch),
@ -271,21 +284,21 @@ flu_restart_test2() ->
end,
fun(run) ->
ok =
?MUT:write_chunk(Prox1, FakeEpoch, File1, Off1,
Data, infinity),
?MUT:write_chunk(Prox1, NSInfo, FakeEpoch, File1, Off1,
Data, NoCSum, infinity),
ok;
(line) -> io:format("line ~p, ", [?LINE]);
(stop) -> ?MUT:write_chunk(Prox1, FakeEpoch, File1, Off1,
Data, infinity)
(stop) -> ?MUT:write_chunk(Prox1, NSInfo, FakeEpoch, File1, Off1,
Data, NoCSum, infinity)
end,
fun(run) ->
{error, written} =
?MUT:write_chunk(Prox1, FakeEpoch, File1, Off1,
Dataxx, infinity),
?MUT:write_chunk(Prox1, NSInfo, FakeEpoch, File1, Off1,
Dataxx, NoCSum, infinity),
ok;
(line) -> io:format("line ~p, ", [?LINE]);
(stop) -> ?MUT:write_chunk(Prox1, FakeEpoch, File1, Off1,
Dataxx, infinity)
(stop) -> ?MUT:write_chunk(Prox1, NSInfo, FakeEpoch, File1, Off1,
Dataxx, NoCSum, infinity)
end
],
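Each element of ExpectedOps is a one-arity fun driven three ways by the surrounding restart loop; a sketch of the protocol the clauses above follow:

    %% Fun(run)  -> assert the op succeeds while the FLU is up;
    %% Fun(line) -> print ?LINE for progress tracing;
    %% Fun(stop) -> reissue the op while the FLU is stopped (the return
    %%              value is deliberately left unchecked here).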


@ -83,7 +83,7 @@ stop_machi_sup() ->
undefined -> ok;
Pid ->
catch exit(whereis(machi_sup), normal),
machi_util:wait_for_death(Pid, 30)
machi_util:wait_for_death(Pid, 100)
end.
clean_up(FluInfo) ->