Compare commits

...

615 commits

Author SHA1 Message Date
Scott Lystig Fritchie
e87bd59a97 Merge branch 'slf/perf-improvements1' into tmp/merge-delme 2016-03-29 18:40:14 +09:00
Scott Lystig Fritchie
1e0bb4c404 Fix file rollover problems 2016-03-29 18:39:52 +09:00
Scott Lystig Fritchie
549963545f Update b_b driver's top-of-module comments 2016-03-29 15:59:48 +09:00
Scott Lystig Fritchie
2aa8917875 Check checksums on 'read' ops 2016-03-29 15:33:18 +09:00
Scott Lystig Fritchie
27cbf1e38c b_b: only load ETS table if 'read' is in the 'operations' list 2016-03-29 14:58:32 +09:00
Scott Lystig Fritchie
d2fa79e037 Fix arithmetic error in src/machi_file_proxy.erl 2016-03-28 22:07:59 +09:00
Scott Lystig Fritchie
57ba204210 Client API timeout and other minor bugfixes 2016-03-28 21:35:17 +09:00
Scott Lystig Fritchie
24f8cb21a2 Fix eunit test failures related to min file position 2016-03-28 21:05:12 +09:00
Scott Lystig Fritchie
e63db8dedc Fix read_chunk op in b_b driver 2016-03-28 18:42:04 +09:00
Scott Lystig Fritchie
a739b5265c Fix append_chunk op in b_b driver 2016-03-28 17:51:53 +09:00
Scott Lystig Fritchie
767f5d9e60 Performance fix: don't always go to beginning of leveldb table! 2016-03-28 17:51:30 +09:00
Scott Lystig Fritchie
8c21539fcb Add missing copyright header comment 2016-03-28 17:48:06 +09:00
Scott Lystig Fritchie
0f24b69378 README and FAQ updates for mid-March 2016 2016-03-09 12:19:50 -08:00
Scott Lystig Fritchie
ec9d391047 README and FAQ updates for mid-March 2016 2016-03-09 12:19:04 -08:00
Scott Lystig Fritchie
fa71a918b8 README and FAQ updates for mid-March 2016 2016-03-09 12:14:51 -08:00
Scott Lystig Fritchie
6cddfcf988 Merge branch 'slf/hc-demo-env' 2016-03-09 11:16:35 -08:00
Scott Lystig Fritchie
6b000f6e7c Ignore +rel/vars/dev*vars.config 2016-03-09 11:14:43 -08:00
Scott Lystig Fritchie
96c46ec5aa Add explanation for the 'CONFIRM' log messages 2016-03-09 10:54:39 -08:00
Scott Lystig Fritchie
cd166361aa WIP 2016-03-09 10:48:00 -08:00
Scott Lystig Fritchie
4e5c16f5e2 WIP 2016-03-09 10:30:23 -08:00
Scott Lystig Fritchie
16153a5d31 Fix deps building problem, silly 2016-02-27 01:56:16 +09:00
Scott Lystig Fritchie
84f522f865 WIP: Vagrant 2016-02-27 00:05:29 +09:00
Scott Lystig Fritchie
fc46cd1b25 WIP: Vagrant 2016-02-26 17:32:51 +09:00
Scott Lystig Fritchie
184a54ebbd Change ?HYOGE blob size from 1GB -> 75MB to reduce RAM required for eunit tests 2016-02-26 15:46:17 +09:00
Scott Lystig Fritchie
4cb166368a priv/humming-consensus-demo.setup.sh debugged, all appears to work 2016-02-25 18:10:11 +09:00
Scott Lystig Fritchie
f433e84fab Add 'stability_time' env var for repair 2016-02-25 17:52:40 +09:00
Scott Lystig Fritchie
a3fbe2c8bb WIP: demo script writing, derp, need a shell script to simplify 2016-02-25 17:00:05 +09:00
Scott Lystig Fritchie
bdf47da10c oops fix doc links 2016-02-24 15:11:35 +09:00
Scott Lystig Fritchie
6c03f5c1a6 Split out docs dev-clone-compile.md and dev-prerequisites.md 2016-02-24 15:08:41 +09:00
Scott Lystig Fritchie
11921d82bf WIP: start of demo doc 2016-02-23 17:30:30 +09:00
Scott Lystig Fritchie
a27425147d Re-add a flapping check, but also take advantage of confirmed accepted epoch 2016-02-23 15:07:16 +09:00
Scott Lystig Fritchie
34f8632f19 Add ranch startup to machi_chain_manager1_converge_demo 2016-02-23 15:06:33 +09:00
Scott Lystig Fritchie
c02a0bed70 Change 'uses' verbose message to error_logger:info 2016-02-22 17:03:50 +09:00
Scott Lystig Fritchie
1d8bc19891 Fix repair-is-finished-but-message-not-consumed DoS during peer SIGSTOP 2016-02-22 16:48:02 +09:00
Scott Lystig Fritchie
53ce6d89dd Add verbose() option to machi_fitness 2016-02-19 18:02:56 +09:00
Scott Lystig Fritchie
2e46d199c8 Export csum_tag() type 2016-02-19 17:06:05 +09:00
Scott Lystig Fritchie
0f543b4c4d Add author_server to CONFIRM messages 2016-02-19 17:06:05 +09:00
Scott Lystig Fritchie
d5c3da78fb Change 'COMMIT epoch' logging & chain mgr options 2016-02-19 17:06:05 +09:00
Scott Lystig Fritchie
affad6b1d3 Specify short timeout to ?FLU_PC:kick_projection_reaction() call 2016-02-19 17:06:05 +09:00
Scott Lystig Fritchie
ed56a2c6cf Fix 'ranch' app dependency upon re-start w/FLUs configured
... and allow direct start by machi_sup for EUnit tests.
2016-02-19 17:05:34 +09:00
Scott Lystig Fritchie
c2e9a83372 Merge branch 'slf/doc-cluster-terminology' 2016-02-16 12:47:05 +09:00
Scott Lystig Fritchie
67dad7fb8a Fix dialyzer warnings 2016-02-15 17:51:08 +09:00
Scott Lystig Fritchie
9d4483ae68 Minor edits to doc/cluster/name-game-sketch.org 2016-02-15 17:23:55 +09:00
Scott Lystig Fritchie
12ebf4390d Undo testing restriction in test/machi_ap_repair_eqc.erl 2016-02-14 16:00:11 +09:00
Scott Lystig Fritchie
b246ebc376 Rearrange unfinished NS locator reminder spam in machi_flu1_net_server.erl 2016-02-14 15:59:50 +09:00
Scott Lystig Fritchie
943e23e050 Hooray, all eunit tests including EQC pass! 2016-02-10 19:35:52 +09:00
Scott Lystig Fritchie
ecfad4726b Fix machi_flu_filename_mgr to avoid double-write errors during network partitions 2016-02-10 18:17:15 +09:00
Scott Lystig Fritchie
7c39af5bb7 WIP: narrowing in on repair problems due to double-write errors 2 2016-02-10 16:57:50 +09:00
Scott Lystig Fritchie
3bd575899f WIP: narrowing in on repair problems due to double-write errors 2 2016-02-10 16:39:57 +09:00
Scott Lystig Fritchie
a7f42d636e WIP: narrowing in on repair problems due to double-write errors 2016-02-09 01:27:58 +09:00
Scott Lystig Fritchie
fbb0203f67 WIP: most eunit tests fixed, chain repair intermittently broken 2016-02-08 22:04:09 +09:00
Scott Lystig Fritchie
6e17988ac7 Comment & old TODO cleanup 2016-02-02 16:54:31 +09:00
Scott Lystig Fritchie
202ace33d3 Add doc/process-protocol-module-overview.jpg 2016-01-29 16:40:34 +09:00
Scott Lystig Fritchie
2fddf2ec2d Tweak make-faq.pl 2016-01-29 15:10:00 +09:00
Scott Lystig Fritchie
3b82dc2e38 'Thread through' FLU props to machi_flu1_net_server 2015-12-31 17:34:35 +09:00
Scott Lystig Fritchie
3b594504fe Client API module edoc added, see also http://www.snookles.com/scotttmp/IMG_7279-copy-copy.jpg 2015-12-31 17:34:20 +09:00
Scott Lystig Fritchie
a3fc1c3d68 Add namespace info to wedge_status API call; add namespace enforcement @ machi_flu1_net_server 2015-12-31 14:34:15 +09:00
Scott Lystig Fritchie
f09eef14eb Fix damn-syntactically-valid-not-found-by-dialyzer typo 2015-12-30 15:54:19 +09:00
Scott Lystig Fritchie
c65424569d Use 'bool' type in PB spec where feasible 2015-12-29 19:17:18 +09:00
Scott Lystig Fritchie
3c6f1be5d0 Change read_chunk options to use new #read_opts{} 2015-12-29 18:47:08 +09:00
Scott Lystig Fritchie
76ae4247cd Fix cut-and-paste-o 2015-12-29 18:02:56 +09:00
Scott Lystig Fritchie
e24acb7246 Clean up internal protocol<->tuple mappings for correct epoch checking 2015-12-29 17:26:09 +09:00
Scott Lystig Fritchie
5a65a164c3 Remove straggler CoC items in code 2015-12-29 16:01:52 +09:00
Scott Lystig Fritchie
0a8c4156c2 trim_chunk API refactoring; all tests pass; todo tasks remain 2015-12-29 14:13:33 +09:00
Scott Lystig Fritchie
3d730ea215 write_chunk API refactoring; all tests pass; todo tasks remain 2015-12-29 14:13:33 +09:00
Scott Lystig Fritchie
6089ee6851 read_chunk API refactoring; all tests pass; todo tasks remain 2015-12-29 14:13:33 +09:00
Scott Lystig Fritchie
2932a17ea6 append_chunk API refactoring; all tests pass; todo tasks remain 2015-12-29 14:13:29 +09:00
Scott Lystig Fritchie
03b118b52c Clustering API changes in various docs
* name-game-sketch.org
* flu-and-chain-lifecycle.org
* FAQ.md

I've left out changes to the two design docs for now; most of their
respective texts omit multiple chain scenarios entirely, so there
isn't a huge amount to change.
2015-12-29 14:09:00 +09:00
Scott Lystig Fritchie
546901ef49 Dialyzer warning cleanup 2015-12-18 17:48:33 +09:00
Scott Lystig Fritchie
de5d5e88dd Do not use 18.x for TravisCI testing 2015-12-18 17:40:16 +09:00
Scott Lystig Fritchie
70d42a3fb5 Merge pull request #55 from basho/ss/flu1-factorization2
Ss/flu1 factorization2
2015-12-18 17:19:17 +09:00
Scott Lystig Fritchie
7d262fd3ec Move update_wedge_state() & wedge_myself() to machi_flu1_append_server.erl 2015-12-18 16:56:01 +09:00
Shunichi Shinohara
b8297afc18 Reduce count of accepror processes 100 -> 10 2015-12-18 16:30:35 +09:00
Scott Lystig Fritchie
c49ccafdc6 Merge slf/flu-config-rcd-style 2015-12-18 15:41:02 +09:00
Scott Lystig Fritchie
d602663060 Ignore RUNLOG* 2015-12-18 13:43:18 +09:00
Scott Lystig Fritchie
0922def0d6 s/verb/term/gi 2015-12-18 11:50:15 +09:00
Scott Lystig Fritchie
bb0e67f6e0 Add doc/flu-and-chain-lifecycle.org 2015-12-17 21:33:30 +09:00
Scott Lystig Fritchie
51a05ba770 Fix dialyzer complaints in machi_lifecycle_mgr.erl 2015-12-17 12:44:10 +09:00
Scott Lystig Fritchie
1d1bfadb96 Corrections from review 2015-12-17 12:31:32 +09:00
Scott Lystig Fritchie
f98b4da45b Add 'quick admin' config management: better file handling 2015-12-16 19:05:25 +09:00
Shunichi Shinohara
dffb73330d Dialyzer and eunit fix 2015-12-16 17:56:17 +09:00
Shunichi Shinohara
3a0086afb2 Change listen port to avoid eaddrinuse on Linux 2015-12-16 17:38:28 +09:00
Shunichi Shinohara
dcb2464cb8 Separate append server as gen_server callback module 2015-12-16 17:33:53 +09:00
Scott Lystig Fritchie
463d20a9fd Add 'quick admin' config management tool/hack 2015-12-16 16:41:11 +09:00
Scott Lystig Fritchie
b8b3e872e4 Merge pull request #54 from basho/slf/doc-201512-update
Doc update, including mid-December 2015 status
2015-12-16 11:15:09 +09:00
Scott Lystig Fritchie
9bd885ccb4 Part 3 of X 2015-12-15 16:20:57 +09:00
Scott Lystig Fritchie
ec56164bd1 Part 2 of X 2015-12-15 15:52:29 +09:00
Scott Lystig Fritchie
e5f7f3ba9a Remove doc/overview.edoc 2015-12-15 15:07:18 +09:00
Scott Lystig Fritchie
d196dbcee5 MD fixup 2015-12-14 19:21:20 +09:00
Scott Lystig Fritchie
e4a784d3dd Part 1 of X 2015-12-14 19:19:07 +09:00
Scott Lystig Fritchie
6f077fbb62 New machi_lifecycle_mgr_test, AST spec -> running FLUs & chains works! 2015-12-11 19:07:00 +09:00
Scott Lystig Fritchie
e55115fdba All EUnit tests in machi_lifecycle_mgr_test pass! 2015-12-11 17:28:27 +09:00
Scott Lystig Fritchie
009bad230f WIP: change internal types for most strings -> atom to match chmgr internal use 2015-12-11 16:36:18 +09:00
Scott Lystig Fritchie
6b7d871ada WIP: diff in progress 2 2015-12-11 16:26:13 +09:00
Scott Lystig Fritchie
1db232db1b WIP: diff in progress 2015-12-11 15:33:31 +09:00
Scott Lystig Fritchie
3826af8ee2 WIP: dict -> gb_trees, 2 of 2 2015-12-11 13:17:33 +09:00
Scott Lystig Fritchie
df8eea8c10 WIP: dict -> gb_trees, 1 of 2 2015-12-11 12:54:54 +09:00
Scott Lystig Fritchie
61eae1300f WIP: finish basic 'run', add negative tests 2015-12-11 12:43:38 +09:00
Shunichi Shinohara
436c308db2 Merge pull request #46 from basho/ss/flu1-factorization1-ranch
FLU1 Factorization 1/N: Introduce ranch and factor out socket handling process
2015-12-11 10:16:10 +09:00
Scott Lystig Fritchie
3ee3de1aaf WIP: end of day 2015-12-10 23:44:27 +09:00
Scott Lystig Fritchie
6a5c590ad1 WIP: AST change {chain,...} thingie 2015-12-10 23:05:08 +09:00
Scott Lystig Fritchie
c37f23d97a WIP: 'Run' AST thingie ha, take that, wheel! 2015-12-10 22:53:17 +09:00
Scott Lystig Fritchie
9cec53eea6 Yet another strawman AST 2015-12-10 19:18:25 +09:00
Shunichi Shinohara
eef00e4f8f Add TODO comment for possible race condition 2015-12-10 15:58:17 +09:00
Scott Lystig Fritchie
9472bad37b Clean up test errors 2015-12-10 15:57:35 +09:00
Scott Lystig Fritchie
cb706f0d23 Add test/machi_lifecycle_mgr_test.erl 2015-12-10 15:20:56 +09:00
Scott Lystig Fritchie
61ef7739cd Modify chain mostly works, better 2015-12-10 00:12:34 +09:00
Scott Lystig Fritchie
b0a9e65ca2 WIP: trying to shut down entire chain, but buggy, derp 2015-12-09 23:00:27 +09:00
Scott Lystig Fritchie
95e2df304e WIP: minor cleanup 2015-12-09 22:25:43 +09:00
Scott Lystig Fritchie
7f25fcc8f8 Modify chain mostly works 2015-12-09 19:02:16 +09:00
Scott Lystig Fritchie
cd9bf9eeab Modify chain mostly works, 2 TODOs remain 2015-12-09 18:17:26 +09:00
Shunichi Shinohara
2e2d282afc Use outside of ephemeral port range to listen on
When there is TCP_WAIT connection whose local part has
port to be listened, listen (bind) will fail by eaddrinuse
_on Linux_ (won't on Mac OS X).
This commit also adds some logs and pattern matches.

Reference
- Ephemeral port - Wikipedia, the free encyclopedia
  https://en.wikipedia.org/wiki/Ephemeral_port

  "Many Linux kernels use the port range 32768 to 61000.[note 2]
  FreeBSD has used the IANA port range since release 4.6. Previous
  versions, including the Berkeley Software Distribution (BSD), use
  ports 1024 to 5000 as ephemeral ports.[2]"

- Demostration of collision between already-closed ephemeral port
  and listen port on Linux (Mac OS X allows)
  https://gist.github.com/shino/36ae1e01608366d52236
2015-12-09 18:04:50 +09:00
Scott Lystig Fritchie
2871f8397c WIP: modify chain still a bit broken 2015-12-09 17:19:02 +09:00
Scott Lystig Fritchie
65eec61f82 Basic stuff to add new flus via 'pending' dir 2015-12-09 14:48:46 +09:00
Scott Lystig Fritchie
7301c8308e Clarify the initial docs, thanks @mrallen1! 2015-12-09 14:07:27 +09:00
Scott Lystig Fritchie
b243a9b863 Avoid TCP port # reuse in machi_flu_psup_test tests 2015-12-09 12:29:59 +09:00
Scott Lystig Fritchie
f23e500993 WIP comments 2015-12-09 11:32:05 +09:00
Shunichi Shinohara
ade4430d30 More cleaner clean up 2015-12-09 10:29:45 +09:00
Shunichi Shinohara
aa0a0413d1 Cosmetics of comments, specs, whitespaces and unit tests refactoring 2015-12-09 09:58:34 +09:00
Shunichi Shinohara
14765a7279 Change ranch callback module name 2015-12-09 09:58:34 +09:00
Shunichi Shinohara
a1f5a6ce62 Fix unit test cases around flu1 startup 2015-12-09 09:58:34 +09:00
Shunichi Shinohara
7614910f36 Initialize FLU package with ranch listener 2015-12-09 09:58:33 +09:00
Shunichi Shinohara
9579b1b8b2 Introduce ranch and add transport callback 2015-12-09 09:58:33 +09:00
Shunichi Shinohara
a8785e44b1 Set longer timeout for hyooge binary write test case 2015-12-09 09:46:11 +09:00
Shunichi Shinohara
83b4466a85 Merge pull request #50 from basho/slf/doc-name-game2
Review & merge slf/doc-name-game2
2015-12-09 09:45:11 +09:00
Scott Lystig Fritchie
69280bfb4f Fix typo/thinko: correct chain name @ bootstrap 2015-12-08 22:19:26 +09:00
Scott Lystig Fritchie
0fc7bc74b7 EDoc fixes 2015-12-08 22:05:11 +09:00
Scott Lystig Fritchie
8285899dba Bootstrap chain @ app init: done, with an example.
For example:

% make clean
% make stage

And then configure 3 FLUs:

    % echo '{p_srvr, a, machi_flu1_client, "localhost", 39000, []}.' > rel/machi/etc/flu-config/a
    % echo '{p_srvr, b, machi_flu1_client, "localhost", 39001, []}.' > rel/machi/etc/flu-config/b
    % echo '{p_srvr, c, machi_flu1_client, "localhost", 39002, []}.' > rel/machi/etc/flu-config/c

And then configure a chain to use 2 of those 3 FLUs:

    % echo '{chain_def_v1,c1,ap_mode,[{p_srvr,a,machi_flu1_client,"localhost",39000,[]},{p_srvr,b,machi_flu1_client,"localhost",39001,[]}],[],[]}.' > rel/machi/etc/chain-config/c1

... then start Machi e.g.

    % ./rel/machi/bin/machi console

... you should see the following console messages scroll by (including a :

    =PROGRESS REPORT==== 8-Dec-2015::22:01:44 ===
              supervisor: {local,machi_flu_sup}
                 started: [{pid,<0.145.0>},
                           {name,a},
                           {mfargs,
                               {machi_flu_psup,start_link,
                                   [a,39000,"./data/flu/a",[]]}},
                           {restart_type,permanent},
                           {shutdown,5000},
                           {child_type,supervisor}]

    [... and also for the other two FLUs, including a bunch of progress
         reports for processes that started underneath that sub-supervisor.]

    22:01:44.446 [info] Running FLUs: [a,b,c]
    22:01:44.446 [info] Running FLUs at epoch 0: [a,b,c]
    22:01:44.532 [warning] The following FLUs are defined but are not also members of a defined chain: [c]
2015-12-08 21:57:29 +09:00
Scott Lystig Fritchie
37ac09a680 Rename src/machi_chain_bootstrap.erl -> src/machi_lifecycle_mgr.erl 2015-12-08 17:46:11 +09:00
Scott Lystig Fritchie
3391c89818 Clean up verbosity of nonunanimous_setup_and_fix_test2() 2015-12-08 16:29:56 +09:00
Scott Lystig Fritchie
e27a59e20f Merge pull request #51 from basho/ku/eleveldb
LevelDB introduction
2015-12-08 16:04:52 +09:00
Scott Lystig Fritchie
27e8a31307 Fix fitness timing problem with short-circuit +trigger_early_adjustment/2 2015-12-08 15:27:47 +09:00
Scott Lystig Fritchie
ef10ebed22 WIP: now trying to diagnose fitness server bug? 2015-12-08 14:50:16 +09:00
Scott Lystig Fritchie
16acda3c7e Merge pull request #53 from basho/bugfix/52
Simple fix for #52: file size matters
2015-12-08 11:50:10 +09:00
Scott Lystig Fritchie
1bc9033076 Yay, all tests pass! 2015-12-07 22:15:23 +09:00
Scott Lystig Fritchie
38e63e8181 Add & remove, mostly working (2 eunit tests broken) 2015-12-07 21:52:27 +09:00
UENISHI Kota
293eb4810f Fix dialyzer error 2015-12-07 14:49:34 +09:00
Scott Lystig Fritchie
5aeaf872d9 WIP: machi_chain_manager1:set_chain_members() API change, all tests pass, yay 2015-12-07 14:41:56 +09:00
UENISHI Kota
89e80a8862 Fix GC not running 2015-12-07 12:07:46 +09:00
UENISHI Kota
07c2b97918 Change checksum_list API to return a t2b list 2015-12-07 10:55:45 +09:00
Scott Lystig Fritchie
1d3d121d83 Simple fix for #52: file size matters 2015-12-07 10:24:19 +09:00
Scott Lystig Fritchie
3c880dc437 WIP: find 1st overlapping FLU in any #chain_def_v1{} 2015-12-04 17:47:18 +09:00
UENISHI Kota
befa776685 Fix several new bugs 2015-12-04 17:38:36 +09:00
Scott Lystig Fritchie
a7ffef6b8e Add src/machi_chain_bootstrap.erl 2015-12-04 17:18:15 +09:00
UENISHI Kota
8528567954 Add eleveldb with sext to use it as metadata storage
First step is to use as checksum table. It will also used for file
names store and *ALL* other persistent metadata than files.
2015-12-04 16:38:57 +09:00
Scott Lystig Fritchie
cf0829b934 Add rc.d style config dir for FLU server startup 2015-12-04 16:37:05 +09:00
Scott Lystig Fritchie
35c48300a5 Fix Dialyzer complaints, derp! 2015-12-04 15:21:44 +09:00
UENISHI Kota
66de92490c Introduce eleveldb, along with cuttlefish to avoid dependency confustion 2015-12-03 16:48:23 +09:00
Scott Lystig Fritchie
e9b1134cd9 Merge pull request #48 from basho/mra/merkle-cleanup
Add merkle library
2015-12-02 16:25:50 +09:00
Scott Lystig Fritchie
37f33fae7b Fix bad_arg errors in low level eunit tests ... all pass now, yay! 2015-12-02 16:00:13 +09:00
Scott Lystig Fritchie
d44e9dd542 Fix plumbing for find_file() 2015-12-02 15:54:34 +09:00
Scott Lystig Fritchie
0d517d2377 fix machi_ap_repair_eqc:sublist() 2015-12-02 15:36:41 +09:00
Scott Lystig Fritchie
2f95305292 Add machi_ap_repair_eqc:sublist() 2015-12-02 15:35:39 +09:00
Scott Lystig Fritchie
29c2ede275 Add missing \ldots in chain repair figure 2015-12-02 15:35:39 +09:00
Scott Lystig Fritchie
916ac754d7 WIP: still broken, almost passes suites=machi_cr_client_test tests=smoke_test_ 2015-12-02 15:35:39 +09:00
Scott Lystig Fritchie
5477f3b6f8 WIP broken 2015-12-02 15:35:39 +09:00
Scott Lystig Fritchie
a1c834518d Attempt to address Mark's review comments 2015-12-02 15:35:39 +09:00
Scott Lystig Fritchie
c3002d9852 Push TODO closer to actual TODO work site, standardize var spelling 2015-12-02 15:35:39 +09:00
Scott Lystig Fritchie
10a27ce7dd All eunit tests now passing again 2015-12-02 15:35:38 +09:00
Scott Lystig Fritchie
014ba89e3a Add client-side plumbing for high proto append chunk CoC 2015-12-02 15:35:38 +09:00
Scott Lystig Fritchie
ac10f97220 Add machi_ap_repair_eqc:sublist() 2015-12-02 15:34:27 +09:00
Mark Allen
1f56850c2b Remove merklet from header file too. 2015-11-20 22:34:41 -06:00
Mark Allen
a5a0369905 Add basic unit test 2015-11-20 21:22:41 -06:00
Mark Allen
4ce7a87d56 Remove merklet 2015-11-20 16:29:17 -06:00
Scott Lystig Fritchie
d5c56980b3 Merge pull request #42 from basho/ss/native-ebin-ubuntu-friendly
Ubuntu /bin/sh is dash then something wrong happens sometimes
2015-11-19 09:57:34 +09:00
UENISHI Kota
fd649c00d2 Merge pull request #45 from basho/ku/file-reopen
Add stop and trim command to eqc_statem test on file_proxy
2015-11-19 09:45:41 +09:00
UENISHI Kota
9f6b53fc15 Make bigger offset and a bit code cleanup 2015-11-18 14:38:04 +09:00
UENISHI Kota
e11cdfe95c Add stop and trim command to eqc_statem test on file_proxy 2015-11-18 14:11:25 +09:00
Mark Allen
8d3f631d84 Oops. Accidentally left this out. 2015-11-17 22:07:28 -06:00
UENISHI Kota
84058f8c9c Merge pull request #44 from basho/ss/repair-eqc-pleak-fix
Fix process leak of repair eqc
2015-11-17 19:16:24 +09:00
Shunichi Shinohara
ad419ada50 Refactoring, cosmetics, comments 2015-11-17 12:58:54 +09:00
Shunichi Shinohara
049311614f Fix process leak of CR clients and FLU1 proxy clients 2015-11-17 12:58:50 +09:00
Scott Lystig Fritchie
3a35fe38c8 Merge branch 'slf/doc-cleanup2' ... in the middle of things 2015-11-06 07:22:38 -08:00
Scott Lystig Fritchie
73890171ba Format PDF version of high-level-chain-mgr doc 2015-11-06 07:21:44 -08:00
Shunichi Shinohara
919a408e17 Ubuntu /bin/sh is dash then something wrong happens sometimes
It seems dash does not understand {a,b,...} file pattern ...

% echo 'cp /home/shino/local/erlang/17.5.6_basho_hipe/lib/erlang/lib/stdlib-*/src/{lists,dict}.erl ./.ebin.native' | sh -x                                               + cp /home/shino/local/erlang/17.5.6_basho_hipe/lib/erlang/lib/stdlib-*/src/{lists,dict}.erl ./.ebin.native
cp: cannot stat ‘/home/shino/local/erlang/17.5.6_basho_hipe/lib/erlang/lib/stdlib-*/src/{lists,dict}.erl’: No such file or directory
2015-11-06 12:35:02 +09:00
UENISHI Kota
6786820401 Merge pull request #35 from basho/ku/making-file-proxy-spec
Add eqc trim tests to machi_file_proxy
2015-11-05 16:27:48 +09:00
UENISHI Kota
81fae32539 Remove unused test function 2015-11-05 16:19:46 +09:00
UENISHI Kota
2e6e6dd9e8 Merge pull request #40 from basho/ss/missing-test-cleanups
Add missing test cleanups
2015-11-05 16:16:40 +09:00
UENISHI Kota
ce41f9005e Fix machi_file_proxy_eqc:write_post to proper assertion 2015-11-05 14:48:35 +09:00
Shunichi Shinohara
922baaf433 Make rebar unit output verbose 2015-11-05 11:55:47 +09:00
Shunichi Shinohara
9e4dc83f2a Add missing cleanup tasks, suppress some not-so-useful logs 2015-11-05 11:47:47 +09:00
Shunichi Shinohara
1b0711f151 Stop flu1 client under CR client 2015-11-05 11:46:22 +09:00
Shunichi Shinohara
39a937db9b Merge pull request #38 from basho/ku/otp-18
Support OTP 18.1
2015-11-05 10:49:56 +09:00
UENISHI Kota
f56037240e Plan trim commands in eqc tests 2015-11-04 16:32:53 +09:00
UENISHI Kota
d0e6417f5d Remove unnecessary output 2015-11-04 16:15:33 +09:00
UENISHI Kota
3f6f9e2c6b Address one offset+length issue 2015-11-04 16:08:43 +09:00
UENISHI Kota
c1e5426034 Address PR comments 2015-11-04 16:08:09 +09:00
UENISHI Kota
3b087c0388 Add eqc trim tests to machi_file_proxy
* Add description on high client APIs
* Add notes to rethink high client specification
2015-11-04 16:02:29 +09:00
UENISHI Kota
62c8dacc65 Merge pull request #33 from basho/ss-repair-with-partition-simulator
Add test for append and repair with partition simulator
2015-11-04 14:44:04 +09:00
UENISHI Kota
b956f9e1f0 Fix dialyzer issue 2015-11-04 11:43:00 +09:00
Scott Lystig Fritchie
850a8786b6 Merge branch 'slf/doc-converge-demo'
Docs only.
2015-11-03 00:37:53 +09:00
Scott Lystig Fritchie
557525af05 Clarify chain length 2015-11-03 00:34:51 +09:00
Scott Lystig Fritchie
30000d6602 Add doc/machi_chain_manager1_converge_demo.md 2015-11-03 00:27:09 +09:00
Shunichi Shinohara
059f591d3f Exclude FLUs which can not send to cr client as well as receive from
in terms of partition simulator
2015-11-02 17:57:16 +09:00
UENISHI Kota
3122f2cf54 Support OTP 18.1 2015-11-02 17:43:38 +09:00
Mark Allen
3c5a9e6f53 Torture tests for merkle tree
1,000,000 entries - timings and size
2015-11-02 00:12:58 -06:00
Mark Allen
72a4fab49d Add a naive diff function 2015-10-29 22:18:20 -05:00
Shunichi Shinohara
b5005c3526 Add EQC test case for AP mode repair w/ part. sim. 2015-10-30 10:25:51 +09:00
Shunichi Shinohara
6fa2de28cd Add self pid to debug print 2015-10-30 09:44:54 +09:00
Shunichi Shinohara
93b168415d Change log level to debug for multiple files with certain seq num 2015-10-30 09:40:07 +09:00
Shunichi Shinohara
bf5768eb47 Make CR client partition-simulator-aware 2015-10-30 09:39:21 +09:00
Shunichi Shinohara
447c8c8d48 Add 2-tuple timeout setting to CR client 2015-10-30 09:39:21 +09:00
Scott Lystig Fritchie
b859e23a37 Some long-overdue minor editing, prior to working on issue #4 2015-10-29 21:21:05 +09:00
Scott Lystig Fritchie
44497d5af8 Merge pull request #32 from basho/ku/trim-and-gc
Trim command and GC prototype implementation
2015-10-29 15:07:10 +09:00
Scott Lystig Fritchie
611f33e81b Change trigger_gc default -> 0 2015-10-29 15:05:15 +09:00
UENISHI Kota
028135d927 Update some comments for concise and sound description 2015-10-29 12:07:34 +09:00
Mark Allen
7086899941 Reorg merkle tree code into a library
Was a service previously. Now contains both merklet
and the naive implementations.  Put construction
timing stuff into the test.

Tests are not truly meaningful yet.
2015-10-28 16:59:49 -05:00
UENISHI Kota
170b3cd797 Dialyzer fix 2015-10-28 12:48:50 +09:00
UENISHI Kota
f7358424e4 Trim command and GC prototype implementation
* maybe_gc/2 is triggered at machi_file_proxy, when chunk is deleted
  and the file is larger than `max_file_size`
* A file is deleted if all chunks except 1024 bytes header are trimmed
* If a file is going to be deleted, file_proxy notifies metadata_mgr
  to remember the filename persistently, whose filename is
  `known_files_<FluName>`
* Such trimmed filenames are stored in a machi_plist file per flu
* machi_file_proxy could not be started if the filename is in the
  manager's list. Consequently, any write, read and trim operations
  cannot happen against deleted file.
* After the file was trimmed, any read request to the file returns
  `{error, trimmed}`
* Disclaimer: no tests written yet and machi_plist does not support
  any recovery from partial writes.
* Add some thoughts as comments for repairing trims.

* State diagram of every byte is as follows:

```
state\action| write/append   | read_chunk       | trim_chunk
------------+----------------+------------------+---------------
 unwritten  |  -> written    | fail (+repair)   | -> trimmed
 written    | noop or repair | return content   | -> trimmed
 trimmed    |  fail          | fail             | noop
```
2015-10-28 12:34:03 +09:00
Mark Allen
5e571f6009 Switch to merklet
Still a WIP
2015-10-27 16:33:18 -05:00
Mark Allen
7f561f34e0 Ignore vim cruft too 2015-10-27 16:33:07 -05:00
Mark Allen
77096c5f82 Add merklet as a dependency 2015-10-27 11:57:38 -05:00
Mark Allen
b710517c64 Fixes after testing 2015-10-27 11:57:38 -05:00
Mark Allen
1b8401e7de Initial smoke test 2015-10-27 11:57:38 -05:00
Mark Allen
48fabdcd5f WIP
Almost certainly broken
2015-10-27 11:57:38 -05:00
Scott Lystig Fritchie
61f02dfc9f Merge pull request #30 from basho/ss-fix-opt-timeout-arg-mismatch
Fix missing "options" arg for a few calls
2015-10-27 14:36:52 +09:00
Scott Lystig Fritchie
5ec35773e3 Merge pull request #28 from basho/ku/csum-table-anybytes
Change machi_csum_table to support arbitrary bytes writes and trims
2015-10-27 14:21:09 +09:00
Scott Lystig Fritchie
d2b1c7512a Merge branch 'ku/config-system' into tmp 2015-10-27 14:14:40 +09:00
Scott Lystig Fritchie
cfaed63fa7 Experimental: add 'make dialyzer' to priv/test-for-gh-pr.sh 2015-10-27 14:12:37 +09:00
Scott Lystig Fritchie
b500d5f449 config hell 2015-10-27 14:07:45 +09:00
Scott Lystig Fritchie
dd17b1de0a Remove compiler warnings 2015-10-27 14:07:07 +09:00
UENISHI Kota
7377624579 Dialyzer cleanup 2015-10-27 13:44:12 +09:00
UENISHI Kota
6e3347f727 Support log replay 2015-10-27 13:43:45 +09:00
UENISHI Kota
8a61055f55 Support arbitrary bytes write by using find_(left|right)neighbor/2 2015-10-27 13:43:45 +09:00
UENISHI Kota
d59c1fae31 Fix dialyzer issue 2015-10-27 13:27:20 +09:00
Shunichi Shinohara
39ac71048b Fix missing "options" arg for a few calls 2015-10-27 13:03:56 +09:00
Scott Lystig Fritchie
9fb19aa8ee Merge pull request #27 from basho/ku/border-checksum
Regenerate checksum when chunks are to be sliced
2015-10-27 12:51:02 +09:00
Scott Lystig Fritchie
bbbd9748f3 Fix compiler & dialyzer warnings 2015-10-27 12:45:48 +09:00
UENISHI Kota
b2eb3e089c Cleanup MACROs and changed default value
* machi_file_proxy now uses application environment
  value `max_file_size` via machi_config
* changed name from MAX_FILE_SIZE to DEFAULT_MAX_FILE_SIZE
2015-10-27 11:17:59 +09:00
UENISHI Kota
5913531e32 Introduce machi_config.erl 2015-10-27 11:00:05 +09:00
UENISHI Kota
60364fe0ca Fix typo 2015-10-26 10:10:08 +09:00
UENISHI Kota
0e4ae818af Clarify checksum tags, define macros to avoid typos 2015-10-23 19:32:22 +09:00
UENISHI Kota
3d6d4d8be3 Do the slicing in flu server rather than in CR client 2015-10-23 18:49:49 +09:00
UENISHI Kota
c5661571e3 Regenerate checksum when chunks are to be sliced 2015-10-23 17:33:08 +09:00
UENISHI Kota
5fd225bcdb Merge pull request #25 from basho/ku/trim-pb-protocol-2
Update read_chunk() PB protocol to return trimmed chunks
2015-10-23 17:10:21 +09:00
UENISHI Kota
0f688d6279 Update read_chunk() PB protocol to return trimmed chunks 2015-10-22 23:11:43 +09:00
Scott Lystig Fritchie
41bd8fa64e Merge pull request #24 from basho/ku/tools-mk
Replace some make targets with tools.mk
2015-10-22 15:47:05 +09:00
Scott Lystig Fritchie
51c97da133 Remove now-unused filter-dialyzer-dep-warnings 2015-10-22 15:39:41 +09:00
Scott Lystig Fritchie
49b4b1c304 Silence remaining warnings 2015-10-22 15:37:09 +09:00
Scott Lystig Fritchie
058de6dc9c Reduce all dialyzer runtimes by approx 13 seconds (on my MacBook Pro) 2015-10-22 15:36:50 +09:00
Scott Lystig Fritchie
a0588cbaed Avoid warnings 2015-10-22 12:59:40 +09:00
Scott Lystig Fritchie
3751ca14b5 dialyzer.ignore-warnings .gitignore 2015-10-22 12:59:19 +09:00
Scott Lystig Fritchie
3bb5ffa50e Remove src/machi_sequencer.erl 2015-10-22 12:56:24 +09:00
UENISHI Kota
3d3d26b3af Replace some make targets with tools.mk 2015-10-22 10:53:09 +09:00
Scott Lystig Fritchie
224a293c03 Merge pull request #23 from basho/slf/dialyzer1
Dialyzer cleanup
2015-10-22 10:32:01 +09:00
Scott Lystig Fritchie
8a51230760 I don't understand riak_dt type problem, but machi_fitness.erl works in common case 2015-10-21 18:37:31 +09:00
Scott Lystig Fritchie
b8c5d21876 cp_mode repair warning fix 2015-10-21 18:37:31 +09:00
Scott Lystig Fritchie
5008cbd2d0 Add filter for known pattern in machi_chain_manger1.erl 2015-10-21 18:37:31 +09:00
Scott Lystig Fritchie
9c31139b62 Whitespace 2015-10-21 18:37:31 +09:00
Scott Lystig Fritchie
b3b24b1178 Type corrections & remove dead code 2015-10-21 18:37:31 +09:00
Scott Lystig Fritchie
5122ee00a9 Bugfixes 2015-10-21 18:37:30 +09:00
Scott Lystig Fritchie
0c4c42cc52 Bugfixes 2015-10-21 18:37:30 +09:00
Scott Lystig Fritchie
d2ac5b0583 Bugfix: arg type to machi_util:parse_filename() 2015-10-21 18:37:30 +09:00
Scott Lystig Fritchie
028ddc79ff Data type cleanups, other 2015-10-21 18:37:30 +09:00
Scott Lystig Fritchie
edce9d6463 Remove -Wunderspecs, the pickiness level is too high IMO for maturity level now 2015-10-21 18:37:30 +09:00
Scott Lystig Fritchie
bd91167093 Changes to filter & make target 2015-10-21 18:37:30 +09:00
Scott Lystig Fritchie
595f9a463e Unexported funcs 2015-10-21 18:37:30 +09:00
Scott Lystig Fritchie
177aca0a68 Merge pull request #22 from basho/ss-flu1-init-sync
Make flu1 initialization synchronous
2015-10-21 18:36:12 +09:00
Shunichi Shinohara
478107915b Make flu1 initialization synchronous 2015-10-21 16:16:03 +09:00
Scott Lystig Fritchie
30d7e592a3 Merge pull request #20 from basho/ku/read-all-chunks
Allow reading multiple chunks at once
2015-10-21 15:28:10 +09:00
UENISHI Kota
79e0ae9fe5 Merge pull request #21 from basho/slf/eunit-fixes1
Slf/eunit fixes1
2015-10-21 15:08:08 +09:00
Scott Lystig Fritchie
1c8e436a64 Fix race #3 2015-10-21 15:01:11 +09:00
Scott Lystig Fritchie
9d177c6b54 Fix race #2 2015-10-21 14:45:21 +09:00
Scott Lystig Fritchie
976a701e0c Fix timeout problem in test/machi_proxy_flu1_client_test.erl 2015-10-21 14:31:58 +09:00
Scott Lystig Fritchie
981b55c070 Fix race #1 2015-10-21 14:31:41 +09:00
UENISHI Kota
a43397a7b8 Update to review comments 2015-10-21 10:58:00 +09:00
Scott Lystig Fritchie
84f9ccc4f5 markdown fix 2015-10-21 10:44:52 +09:00
UENISHI Kota
ebb9bc3f5a Allow reading multiple chunks at once
* When repairing multiple chunks at once and any of its repair
  failed, the whole read request and repair work will fail
* Rename read_repair3 and read_repair4 to do_repair_chunks and
  do_repair chunk in machi_file_proxy
* This pull request changes return semantics of read_chunk(), that
  returns any chunk included in requested range
* First and last chunk may be cut to fit the requested range
* In machi_file_proxy, unwritten_bytes are removed and replaced by
  machi_csum_table
2015-10-20 17:59:09 +09:00
Scott Lystig Fritchie
a369f0666a Update README.me for mid-Oct 2015 2015-10-20 14:50:54 +09:00
Scott Lystig Fritchie
1193fb8510 Merge branch 'slf/doc-name-game' 2015-10-19 16:46:17 +09:00
Scott Lystig Fritchie
6f9814ffb4 Merge ss/deps-for-debugging (with rebar.config conflict fix) 2015-10-19 16:41:03 +09:00
Scott Lystig Fritchie
bf61b2900b Merge branch 'master' of github.com:basho/machi 2015-10-19 16:38:10 +09:00
Scott Lystig Fritchie
ecd7eb195a Merge pull request #18 from basho/ku/read-all-valid-chunks
Allow read_chunk() to return partial chunks
2015-10-19 16:25:43 +09:00
Scott Lystig Fritchie
980d646d64 Merge branch 'ku/read-all-valid-chunks' 2015-10-19 16:24:12 +09:00
UENISHI Kota
3e975f53b8 Allow read_chunk() to return partial chunks
This is simply a change of read_chunk() protocol, where a response of
read_chunk() becomes list of written bytes along with checksum. All
related code including repair is changed as such. This is to pass all
tests and not actually supporting partial chunks.
2015-10-19 15:37:17 +09:00
Shunichi Shinohara
208c02853f Add cluster_info to deps and small callback module
For debuging from shell, some functions in machi_cinfo are exported:

- public_projection/1
- private_projection/1
- fitness/1
- chain_manager/1
- flu1/1
2015-10-19 15:36:05 +09:00
Scott Lystig Fritchie
c407ee23f2 Add section '9. Other considerations for FLU/sequencer implementations' 2015-10-19 14:47:13 +09:00
Scott Lystig Fritchie
96c037e799 Notation change 2015-10-19 12:49:33 +09:00
UENISHI Kota
6961930b0f Update eper 2015-10-19 09:40:05 +09:00
Scott Lystig Fritchie
8ab9d3ee50 Name consistency: CoC 'locator', variable name = L 2015-10-17 16:47:00 +09:00
Scott Lystig Fritchie
1485fa416f Clarifications... 2015-10-17 14:28:16 +09:00
Scott Lystig Fritchie
39774bc70f Simplify (I hope!), add CoC namespace 2015-10-17 14:14:27 +09:00
Scott Lystig Fritchie
19d935051f WIP: 1st round of name-game-sketch.org revision done
Also, add diagram source for 'xfig' app, migration-3to4.fig
2015-10-16 22:06:05 +09:00
Scott Lystig Fritchie
f04e7f7132 Merge pull request #16 from basho/slf/travis-ci2
Add priv/test-for-gh-pr.sh, change TravisCI 'script' to use it
2015-10-16 18:33:08 +09:00
Scott Lystig Fritchie
ae7eba9dcf Excuse to re-push 2015-10-16 18:30:46 +09:00
Scott Lystig Fritchie
5f953bc5dd Grr, comma 2015-10-16 18:25:23 +09:00
Scott Lystig Fritchie
cf9d56e9bf Add priv/test-for-gh-pr.sh, change TravisCI 'script' to use it 2015-10-16 18:23:24 +09:00
UENISHI Kota
cb67764273 Merge pull request #12 from basho/slf/packaging1
Slf/packaging1
2015-10-16 17:56:27 +09:00
Scott Lystig Fritchie
aac302a289 Fix cut-and-paste-o in machi-admin script 2015-10-16 17:50:34 +09:00
Scott Lystig Fritchie
de364d3a99 Merge pull request #15 from basho/slf/eunit-verbose-cleanup
Reduce compiler warnings and verbose output that clutters eunit test output
2015-10-16 17:45:24 +09:00
Scott Lystig Fritchie
00ac0f4cd3 Reduce compiler warnings and verbose output that clutters eunit test output 2015-10-16 17:41:01 +09:00
Scott Lystig Fritchie
29eccd5bee Merge pull request #13 from basho/slf/travis-ci
Add TravisCI integration glue
2015-10-16 17:20:30 +09:00
Scott Lystig Fritchie
de68973e46 TravisCI badge thingie in README.md 2015-10-16 17:17:28 +09:00
Scott Lystig Fritchie
bc3c08cf37 Add .travis.yml 2015-10-16 17:11:14 +09:00
Scott Lystig Fritchie
a6a49ca0fb Merge pull request #11 from basho/ku/cut-out-checksum-file
Move checksum file related code to machi_csum_table
2015-10-16 17:04:58 +09:00
Scott Lystig Fritchie
c48513d131 Change runner_wait_process so 'start' works 2015-10-16 16:54:57 +09:00
Scott Lystig Fritchie
bc45b8e37c Deal with commas and commented things 2015-10-16 16:48:58 +09:00
Scott Lystig Fritchie
299016cafb FLU startup via app.config 2015-10-16 16:28:46 +09:00
Scott Lystig Fritchie
ab6c23a335 Oops, machi app actually *runs* now 2015-10-16 15:56:47 +09:00
UENISHI Kota
6f790527f5 Follow with missing tests and related fix 2015-10-16 10:10:05 +09:00
Scott Lystig Fritchie
58d2a43b37 Very first packaging draft, no cuttlefish
* "make rel" does something not horrible
* `rel/machi/bin/machi console` launches a console to an idle VM
2015-10-16 01:23:10 +09:00
UENISHI Kota
e45469b5ce Move checksum file related code to machi_csum_table 2015-10-15 11:28:40 +09:00
Mark Allen
baeffbab0b Merge pull request #6 from basho/mra/write-once-clean
Integrate write once invariant into current FLU implementation
2015-10-14 10:15:57 -05:00
Scott Lystig Fritchie
e344ee42ff Remove stale TODO comment about write-once enforcement 2015-10-14 16:56:51 +09:00
Scott Lystig Fritchie
d6a3180ecd Use pattern matching instead of length() BIF 2015-10-14 16:52:03 +09:00
Scott Lystig Fritchie
71cbf019f5 Merge pull request #9 from basho/kuenishi-patch-1
Fix gen_server style return value
2015-10-14 16:34:07 +09:00
Scott Lystig Fritchie
7439a2738d Work-around racy query of wedge_status in machi_cr_client_test 2015-10-14 16:28:01 +09:00
UENISHI Kota
07ceff095a Fix gen_server style return value 2015-10-14 16:22:11 +09:00
Scott Lystig Fritchie
8eb9cc9700 Fix "HEY, machi_pb_translate:852 got {error,bad_csum}" errors
s/bad_csum/bad_checksum/ as needed in in machi_file_proxy.erl
2015-10-14 14:26:46 +09:00
Scott Lystig Fritchie
ed112bfb52 Argument fix for read_chunk() when write_chunk() says 'written' 2015-10-14 14:16:51 +09:00
Scott Lystig Fritchie
6dbf52db6f Remove some debugging verbosity 2015-10-14 12:50:10 +09:00
Scott Lystig Fritchie
8cd41a7bf2 Clean up projection-related tests in machi_proxy_flu1_client:api_smoke_test 2015-10-14 12:49:48 +09:00
UENISHI Kota
63612c8823 Merge pull request #8 from basho/ku/trim-high-proto
Add surface of trim chunks to scrub
2015-10-14 12:45:07 +09:00
UENISHI Kota
1b612bd969 Fix typo in comment 2015-10-14 12:40:56 +09:00
Mark Allen
ec9682520a Fix tests with bad file names.
Either catch the {error, bad_arg} tuple or modify the file name to
conform to the machi conventions of prefix^uuid^seqno.
2015-10-13 21:13:12 -05:00
Mark Allen
fe71b72494 Add filename parse and validation functions 2015-10-13 21:12:14 -05:00
Mark Allen
f8707c61c0 Choose new filename when epoch changes
The filename manager needs to choose a new file name
for a prefix when the epoch number changes. This helps
ensure safety of file merges across the cluster.
(Prevents conflicts across divergent cluster members.)
2015-10-13 21:09:31 -05:00
Mark Allen
161e6cd9f9 Pass epoch id to append operations
Needed to handle a filename change when epoch changes.
2015-10-13 21:08:48 -05:00
Mark Allen
85e1e5a26d Handle {error, bad_arg} on read 2015-10-13 21:08:24 -05:00
UENISHI Kota
e113f6ffdd Reach the trim stub to CR client 2015-10-13 17:25:59 +09:00
UENISHI Kota
dfe953b7d8 Add surface of trim to scrub 2015-10-13 17:14:44 +09:00
Scott Lystig Fritchie
2724960eaf TODO MARK: added clarification to test/machi_flu_psup_test.erl 2015-10-12 15:43:45 +09:00
Scott Lystig Fritchie
5131ebdd16 Change eunit expectations from change to using psup 2015-10-12 15:38:47 +09:00
Scott Lystig Fritchie
777909b0f5 TODO MARK todo comment and bugfix for machi_cr_client_test 2015-10-12 15:30:37 +09:00
Scott Lystig Fritchie
cbf773215e TODO MARK add comment for machi_cr_client_test:smoke_test2/0 failure 2015-10-12 15:29:54 +09:00
Scott Lystig Fritchie
8a8c4dcede Adapt machi_cr_client_test:smoke_test2/0 to change in FLU semantics: partial_write -> unwritten 2015-10-12 14:22:47 +09:00
Mark Allen
f3e6d46e36 Fix chain manager failures disabling active mode
The FLU psup starts the chain manager in active mode by default
(as it should for normal run-time operation.) By adding the
{active_mode, false} tuple to the options list, we can
tell the chain manager that it should be explicitly manipulated
during tests.
2015-10-11 23:05:44 -05:00
Mark Allen
da0b331936 WIP 2015-10-11 23:05:27 -05:00
Mark Allen
855f94925c Validate semantics on partial reads 2015-10-11 23:05:00 -05:00
Mark Allen
8187e01fe0 Use psup startup 2015-10-11 23:04:43 -05:00
Mark Allen
289b2bcc7c Debug WIP 2015-10-11 23:04:29 -05:00
Mark Allen
5926cef44a Make test start up more reliable 2015-10-08 15:49:22 -05:00
Mark Allen
d9ede473dd Bump lager to 2.2.0
That brings it up to the latest release of 2.x; should consider
using lager 3 though.
2015-10-08 15:48:04 -05:00
Mark Allen
c1b9038447 The return value of ets is generally 'true' 2015-10-08 15:47:11 -05:00
Mark Allen
aca3759e45 Bug fixes found during testing runs 2015-10-08 15:46:40 -05:00
Mark Allen
1ecbb5cffe Fixed order of start_link parameters 2015-10-08 15:45:04 -05:00
Mark Allen
303aad97e9 Use {error, bad_checksum} directly
We previously copied {error, bad_csum} as it was used in the main
FLU code.  The protobufs stuff expects the full atom bad_checksum
though.
2015-10-08 15:43:54 -05:00
Scott Lystig Fritchie
4d0019f141 Merge branch 'slf/pb-checksum-expansion' 2015-10-08 20:42:15 +09:00
Scott Lystig Fritchie
952d2fa508 Change flag_checksum -> flag_no_checksum for consistency 2015-10-08 20:41:59 +09:00
Scott Lystig Fritchie
2bfc199294 Merge branch 'master' of github.com:basho/machi 2015-10-08 14:19:54 +09:00
Scott Lystig Fritchie
7912e77e9e Avoid making 'make pulse' easy to use 2015-10-08 14:19:44 +09:00
Mark Allen
679046600f Merge remote-tracking branch 'origin/bug/from-bp-request-error' into mra/write-once-clean 2015-10-07 23:02:03 -05:00
Mark Allen
ed5dec1cd6 Merge pull request #7 from basho/bug/from-bp-request-error
Add LL generic error PB response decoding
2015-10-07 22:56:43 -05:00
Scott Lystig Fritchie
796937fe75 Add LL generic error PB response decoding 2015-10-08 12:33:55 +09:00
Scott Lystig Fritchie
0054445f13 Delete spammy message from fitness servers every 5 seconds 2015-10-07 18:52:24 +09:00
Mark Allen
d627f238bf Cache generated names until disk files are written 2015-10-06 22:44:31 -05:00
Mark Allen
f83b0973f2 Have to call filename mgr with FluName 2015-10-06 22:43:19 -05:00
Mark Allen
7a6999465a Make sure we use '^' as filename separators 2015-10-06 22:02:31 -05:00
Mark Allen
2d0c03ef35 Integration with current FLU implementation 2015-10-05 22:18:29 -05:00
Mark Allen
36c11e7d08 Add a metadata manager supervisor 2015-10-05 16:37:53 -05:00
Mark Allen
d3fe7ee181 Pull write-once files over to clean branch
I am treating the original write-once branch as a prototype
which I am now throwing away. I had too much work interleved
in there, so I felt like the best thing to do would be to cut
a new clean branch and pull the files over and start over
against a recent-ish master.

We will have to refactor the other things in FLU in a more
piecemeal fashion.
2015-10-02 16:29:09 -05:00
Scott Lystig Fritchie
d7daf203fb Update TODO-shortterm.org for completion of fitness work 2015-09-22 16:44:49 +09:00
Scott Lystig Fritchie
3fb3890788 Merge branch 'slf/cp-mode-adjustments' to 'master' 2015-09-22 16:19:58 +09:00
Scott Lystig Fritchie
6d5b61f747 Tweaks to sleep_ranked_order() call in C200 2015-09-21 21:47:25 +09:00
Scott Lystig Fritchie
5eecb2b935 Change to P_current_calc epoch @ C100 2015-09-21 21:44:03 +09:00
Scott Lystig Fritchie
6425cca13f Fix broken eunit test 2015-09-21 21:44:03 +09:00
Scott Lystig Fritchie
340af05f0f WIP: server-side of CP mode repairing-as-witness 2015-09-21 21:44:03 +09:00
Scott Lystig Fritchie
d9b9397e75 Avoid some projection churn in C100's sanity check 2015-09-21 21:44:03 +09:00
Scott Lystig Fritchie
5010d03677 Call manage_last_down_list() at C220 and C310 2015-09-21 15:36:54 +09:00
Scott Lystig Fritchie
69a304102e Write public proj in all_members order only 2015-09-21 15:09:16 +09:00
Scott Lystig Fritchie
58b19e76be Merge branch temp integration branch 'slf/tmp/merge0920' 2015-09-20 22:45:21 +09:00
Scott Lystig Fritchie
41836b01e6 Merge branch 'slf/chain-manager/remove-inner' into slf/tmp/merge0920 2015-09-20 20:19:00 +09:00
Scott Lystig Fritchie
83e878eb07 More verbosity, whee 2015-09-20 14:06:55 +09:00
Scott Lystig Fritchie
6b4ed1c061 Verbose debugging cruft 2015-09-19 14:25:07 +09:00
Scott Lystig Fritchie
72bfa163ba Small test bugfixes & verbose/debugging cruft 2015-09-19 14:16:54 +09:00
Scott Lystig Fritchie
d695f30e4f Avoid using host/port combo for machi_fitness (ab)use of machi_projection 2015-09-17 16:43:08 +09:00
Scott Lystig Fritchie
09ae2db0ba Bugfix: double-check local private projection write with a read 2015-09-16 16:31:10 +09:00
Scott Lystig Fritchie
79b1d156c4 Add backlog option to gen_tcp:listen 2015-09-16 13:52:36 +09:00
Scott Lystig Fritchie
778bd015ee Bugfix: pattern matching error in C110 2015-09-16 12:41:53 +09:00
Scott Lystig Fritchie
d3b116bd9e Bugfix: CP mode: ignore P_latest if it has UPI or down server in my down list 2015-09-15 17:55:18 +09:00
Scott Lystig Fritchie
5001406499 Add proplist-based configuration for TCP port and tmp dir for converge demo 2015-09-15 17:54:27 +09:00
Scott Lystig Fritchie
75c94420e0 Add test_ets_table to give programmatic slowdown 2015-09-14 22:52:41 +09:00
Scott Lystig Fritchie
7bf1132142 Bugfix: IsRelevantToMe_p adjustment for P_latest.upi == [] 2015-09-14 17:28:50 +09:00
Scott Lystig Fritchie
b4f8bc8058 Add pretty_time(). Add CONFIRM verbose logging for none proj 2015-09-14 17:00:09 +09:00
Scott Lystig Fritchie
4e11cdd50f Bugfix: derp, pattern match for UniqueHistoryTrigger_p 2015-09-14 16:59:58 +09:00
Scott Lystig Fritchie
a036f119a6 Add send_spam_to_everyone(), add 1% chance of using it 2015-09-14 16:01:26 +09:00
Scott Lystig Fritchie
6c543dfc18 Re-use the flapping criteria for a different use (more)
Hooray, very early I ended up with a simulator example which kicked
in and tested this change.  (A deterministice fault injection method
for testing would also be valuable, probably.)

    machi_chain_manager1_converge_demo:t(7, [{private_write_verbose,true}]).

We switched partitions in the simulator like this:

    SET partitions = [{b,f},{c,f},{d,e},{f,e}] (2 of 90252) at {14,37,5}
    ...
    Stable projection at epoch 1429 upi=[b,c,g,a,d],repairing=[]
    ...
    SET partitions = [{b,d},{c,b},{d,c},{f,a}] (3 of 90252) at {14,37,44}

Part of the chain reassembled quickly from the following UPIs: [g], then
[g,e], then [g,e,f] via a series of successful simulated repairs.  For
the first two repairs, all parties (e & f & g) are unanimous about the
projections.  For the final repair, very strange, not all three adopt
[g,e,f] chain: e says nothing, f & g use it.

Also weird, then g immediately moves f!  upi=[g,e],repairing=[f].
Then e also adopts this chain of 2.  From that point forward, f keeps
trying to use upi=[g,e,f],[] and the others try using only upi=[g,e],[f].
There are lots of messages from g saying that it's insane (correctly!)
to try calc=1487:[g,e],[f] -> 1494:[g,e,f],[] without a valid repair
author.

It's worth checking why g dropped from [g,e,f] -> [g,e].  But even
still, this new use for the flapping counter & reset via C103 is
working.  ... Ah, now I understand.  The very occasional undefined
socket bug in machi_flu1_client appears to be the cause: g had a
one-time problem talking with f and so decided f was down long enough to
make the shorter UPI.  The other participants didn't have any such
problem with f and so kept f in the UPI.  This would have been a
deadlock/infinite loop case without someone deciding to reset state.
2015-09-14 15:41:48 +09:00
Scott Lystig Fritchie
23554ffccc Handle timeout/paritition failures in C110 2015-09-14 13:54:47 +09:00
Scott Lystig Fritchie
fdf78bdbbc Tweak IsRelevantToMe_p in B10 (more)
Last night we hit a rare case of failed convergence.

f was out of sync with the rest of the world.
f: upi=[b,g,f] repairing=[a,c]
The "rest of the world" used a larger chain at:
*: upi=[c,b,g,a], repairing=[f]

And f refused to join the larger chain because of the way that
IsRelevantToMe_p was being calculated before this commit.

Hrrrm, though, I'm not convinced that this particular problem
is fixed 100% by this patch.  What if the chain lengths were
the same but also UPI incompatible?  e.g. if I remove 'a' from
the "real world (in the partition simulator)" example above:

f: upi=[b,g,f] repairing=[c]
*: upi=[c,b,g], repairing=[f]

Hrmmmmm, I may need to reintroduce the my-recent-adopted-projection-
flapping-like-counter thingie to try to break this kind of
incompatible deadlock.
2015-09-14 13:40:34 +09:00
Scott Lystig Fritchie
62186395ed Hooray! The weekend's CP work hasn't broken AP, I believe. 2015-09-14 00:04:53 +09:00
Scott Lystig Fritchie
f5901c6cd3 Hey, appears to work for CP mode chain len=3, hooray! 2015-09-13 21:51:20 +09:00
Scott Lystig Fritchie
4fba6c0d33 Adjust converge test conditions slightly 2015-09-13 21:07:54 +09:00
Scott Lystig Fritchie
89f57616a8 Avoid some churn when both latest & newprop are none proj 2015-09-13 17:44:23 +09:00
Scott Lystig Fritchie
04369673b0 MaxFiles static file deletion isn't good for make_zerf(). Add some no-partition scenarios 2015-09-13 16:59:08 +09:00
Scott Lystig Fritchie
f3a0ee91cf WIP: thread P_calc_current all the way to C100 for CP mode assist 2015-09-13 15:58:45 +09:00
Scott Lystig Fritchie
0a20417682 Adjustments for CP mode (still slightly experimental) 2015-09-13 14:56:28 +09:00
Scott Lystig Fritchie
32c4d39156 Bugfix: set consistency_mode at set_chain_members 2015-09-13 14:16:02 +09:00
Scott Lystig Fritchie
b3ce9f9ab8 A bit less verbose output 2015-09-11 23:08:47 +09:00
Scott Lystig Fritchie
5efec1b6cd Add upi_unanimous annotation to AP mode 2015-09-11 21:47:05 +09:00
Scott Lystig Fritchie
fe8ff6033d Make better state transition choices in AP mode 2015-09-11 19:14:41 +09:00
Scott Lystig Fritchie
68f1ff68ee Bugfix: broken eunit test 2015-09-11 17:52:40 +09:00
Scott Lystig Fritchie
a0c129c16d Bugfix: wow, a chain state transition sanity check bug 2015-09-11 17:32:52 +09:00
Scott Lystig Fritchie
8df7d58365 Add partition simulator support to fitness service 2015-09-11 16:45:29 +09:00
Scott Lystig Fritchie
efe6ce7894 WIP: small refactoring to prepare for fitness server 'use' of partition simulator 2015-09-11 16:03:49 +09:00
Scott Lystig Fritchie
35e8efeb96 Add timer:sleep() to accomodate machi_chain_manager1_converge_demo 2015-09-11 15:56:02 +09:00
Scott Lystig Fritchie
bbf925d132 Add fault injection method via C100 to test C103 admin down cycle 2015-09-10 18:05:55 +09:00
Scott Lystig Fritchie
41737ae62a Add delete_admin_down API implementation, oops! 2015-09-10 18:05:18 +09:00
Scott Lystig Fritchie
d45c249e89 Add admin down status API to fitness server 2015-09-10 17:30:11 +09:00
Scott Lystig Fritchie
c14b9ce50f Minor cleanup, add more partitions to converge demo 2015-09-10 16:39:15 +09:00
Scott Lystig Fritchie
af94d1c1c3 Bugfix: ExpectedUPI error in A40 2015-09-10 02:15:49 +09:00
Scott Lystig Fritchie
daf3a3d65a Remove some verbose debugging cruft 2015-09-10 01:47:46 +09:00
Scott Lystig Fritchie
329a5e0682 Bugfix: damn, no idea how many problems this 5 month old bug caused 2015-09-10 01:33:55 +09:00
Scott Lystig Fritchie
5943494d54 Add ExpectedUPI to A40's AmHosedP clause 2015-09-10 00:43:37 +09:00
Scott Lystig Fritchie
10c655ebfe WIP: fix one source of problems, now shift back to 'TODO this clause needs more review' 2015-09-09 23:59:40 +09:00
Scott Lystig Fritchie
b7aa33c617 Yeah, nearly there. AP fails occasionally in multiple-asymmetric-partition sequence 2015-09-09 23:10:39 +09:00
Scott Lystig Fritchie
72141c8ecb WIP: split A30 into A30/A31 based on AllHosed 2015-09-09 21:06:40 +09:00
Scott Lystig Fritchie
5029911b52 WIP: remove verbose goop 2015-09-09 20:46:52 +09:00
Scott Lystig Fritchie
38ea36fc1c WIP: Stand back, I'm going to try math! ... It works, {redacted}! 2015-09-09 20:45:57 +09:00
Scott Lystig Fritchie
27891bc5e9 WIP: 'broadcast'/spam works! async reminder ticks remain! 2015-09-09 19:14:52 +09:00
Scott Lystig Fritchie
dd095f117f Derp, fix smoke_test() for machi_fitness:map_set() 2015-09-09 16:49:27 +09:00
Scott Lystig Fritchie
21015efcbb WIP: Stand back, I'm going to try CRDTs! 2015-09-08 19:13:03 +09:00
Scott Lystig Fritchie
7af863d840 Add stubs of machi_fitness server 2015-09-08 16:13:07 +09:00
Scott Lystig Fritchie
185c9eb313 WIP: add failing eunit placeholder for spam 2015-09-07 15:38:23 +09:00
Scott Lystig Fritchie
c7684f660c WIP: Friday evening/Monday morning, laying groundwork for spam "broadcast" 2015-09-07 15:20:10 +09:00
Scott Lystig Fritchie
4376ce9ec1 Remove all flap counting and inner projection stuff 2015-09-04 17:17:49 +09:00
Scott Lystig Fritchie
97d44ad1e6 Fix minor compilation warnings 2015-09-04 15:40:11 +09:00
Scott Lystig Fritchie
1312cf93f5 Merge partial work of slf/chain-manager/cp-mode4 into tmp-mergeit 2015-09-04 15:24:58 +09:00
Scott Lystig Fritchie
42aeecd9db Fix machi_projection_store_test error 2015-09-04 15:24:16 +09:00
Scott Lystig Fritchie
3c1026da28 WIP: too tired to continue tonight 2015-09-01 22:10:45 +09:00
Scott Lystig Fritchie
4378ef7b54 Bugfix: inner->outer proj @ A30 2015-09-01 00:51:46 +09:00
Scott Lystig Fritchie
2e2f5f44c4 Another tweak to private_projections_are_stable() 2015-09-01 00:51:12 +09:00
Scott Lystig Fritchie
e79265228e Bugfix: more correct for inner->outer sanity transition 2015-08-31 22:14:28 +09:00
Scott Lystig Fritchie
1e5d58b22d Bugfix: more to ignore in make_basic_comparison_stable() 2015-08-31 17:57:37 +09:00
Scott Lystig Fritchie
bce225a200 Bugfix: a30_make_inner_projection() ignore newprop down list if none proj 2015-08-31 17:03:12 +09:00
Scott Lystig Fritchie
a095e0cfc3 Bugfix: ignore creation_time in make_comparison_stable() 2015-08-31 15:40:19 +09:00
Scott Lystig Fritchie
c637939cc2 Bugfix: A29 should trigger if EpochID (not Epoch# alone) differs 2015-08-31 15:21:17 +09:00
Scott Lystig Fritchie
5422dc45c2 Bugfix: derp in A29 revival 2015-08-31 14:44:05 +09:00
Scott Lystig Fritchie
004c686c8c WIP: remove make_zerf() from calc_projection(); add make_zerf() to resurrected A29. Status: broken, needs work 2015-08-30 20:39:58 +09:00
Scott Lystig Fritchie
a449025e8b Bugfix: epoch handling around none proj: epoch 0 only at first bootstrap! 2015-08-30 19:53:47 +09:00
Scott Lystig Fritchie
823b47bef3 Bugfix: convergence property for CP mode, again 2015-08-30 19:52:31 +09:00
Scott Lystig Fritchie
ec2e7b5669 Sunday experiment: all-but-remove A29, feels right but definitely not sure yet 2015-08-30 16:08:14 +09:00
Scott Lystig Fritchie
0dc53274d1 Get more aggressive about AllHosed+down nodes for inner proj 2015-08-30 02:22:59 +09:00
Scott Lystig Fritchie
771164b82f Bugfix: Flapping manifesto, leaving #2: only if not me 2015-08-30 00:50:23 +09:00
Scott Lystig Fritchie
4b83893047 Bugfix: minor flap count bookeeping error 2015-08-30 00:50:03 +09:00
Scott Lystig Fritchie
a7db3a26c6 Bugfix: a30_make_inner_projection() compatible inner if not none proj 2015-08-30 00:04:13 +09:00
Scott Lystig Fritchie
764708f3ef Fix private_projections_are_stable() for long CP mode chains 2015-08-30 00:03:51 +09:00
Scott Lystig Fritchie
53d865b247 Bugfix: serious derp fix for A30's inner->outer 2015-08-29 23:42:47 +09:00
Scott Lystig Fritchie
5c8b255da9 Bugfix: first new CP experiments with chain len=5 2015-08-29 22:40:18 +09:00
Scott Lystig Fritchie
94394d3429 Bugfix: allow none proj to re-emerge from flapping (more)
See comments added in this commit at A40.

So far, I've been doing CP mode testing with a handful of (very useful)
network partition combinations using:

    machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]).

Next steps:

* Expand number & types of partitions
* Expand to chain lengths of 5 and beyond
2015-08-29 21:36:53 +09:00
Scott Lystig Fritchie
ee19a0856b WIP: justincase 2015-08-29 19:59:46 +09:00
Scott Lystig Fritchie
6b84cd6e6a Reduce poll sleep time when running with partition simulator 2015-08-29 18:30:53 +09:00
Scott Lystig Fritchie
dc5ae4047a Bugfix: react_to_env_A30 inner->norm fix, make_zerf() none proj derp fix 2015-08-29 18:01:13 +09:00
Scott Lystig Fritchie
85eb3567a3 Bugfix: convergence property for CP mode 2015-08-29 15:57:23 +09:00
Scott Lystig Fritchie
c9340a662d Bugfix: force stable creation_time on inner none proj 2015-08-29 15:06:57 +09:00
Scott Lystig Fritchie
6d9526b379 Add more ?REACT() 2015-08-29 13:13:31 +09:00
Scott Lystig Fritchie
f21fcdd7be Bugfix: none proj must flap, undo previous commits, which may cause mess later 2015-08-29 13:13:23 +09:00
Scott Lystig Fritchie
af0ade9840 Bugfix: projection checksum fix in A30 2015-08-29 12:33:41 +09:00
Scott Lystig Fritchie
582f9e5eab Bugfix: fix effectively-none-projection transition to C100. Still buggy 2015-08-28 23:08:38 +09:00
Scott Lystig Fritchie
403cb5b7a6 WIP: improvements, but now flapping inner epoch keeps increasing {sigh} 2015-08-28 21:13:54 +09:00
Scott Lystig Fritchie
9edd91f48e Bugfixes for a->b column transition & flap dampening 2015-08-28 20:06:09 +09:00
Scott Lystig Fritchie
18aac6e489 WIP: undo AmFlappingNow_p condition added at commit 3dfe5c2 2015-08-28 18:39:18 +09:00
Scott Lystig Fritchie
3dfe5c2677 WIP: fix annotation history on disk 2015-08-28 18:37:11 +09:00
Scott Lystig Fritchie
8ca1ffdb13 WIP: bugfixes and lots of verbose goop added 2015-08-28 01:55:31 +09:00
Scott Lystig Fritchie
deb2cdee2c Bugfix: correct epoch number checking when inner proj 2015-08-27 22:22:15 +09:00
Scott Lystig Fritchie
93b9b948fc WIP: debugging, uff da 2015-08-27 22:02:23 +09:00
Scott Lystig Fritchie
efb89efb0d Reduce verbosity 2015-08-27 20:27:33 +09:00
Scott Lystig Fritchie
0eaa008810 Change checksum algorithm to exclude 'flap' also 2015-08-27 20:27:24 +09:00
Scott Lystig Fritchie
12b74a52fd WIP: pre-dinner paranoid checkin 2015-08-27 18:45:27 +09:00
Scott Lystig Fritchie
65cd18939c WIP: changes to annotation management 2015-08-27 17:58:43 +09:00
Scott Lystig Fritchie
8a61a85ae0 WIP: rewrite make_zerf() to use new annotation scheme 2015-08-27 16:19:22 +09:00
Scott Lystig Fritchie
28335a1310 Add CP mode unwedge. All eunit tests are passing again. 2015-08-26 18:47:39 +09:00
Scott Lystig Fritchie
9222881689 Oops, bugfixes 2015-08-26 17:51:43 +09:00
Scott Lystig Fritchie
568e165f4f Allow pstore -> FLU unwedge only in ap_mode, machi_cr_client_test broken (uses cp_mode) 2015-08-26 15:51:14 +09:00
Scott Lystig Fritchie
e8f3ab381d Add set_consistency_mode() to projection store API, use it 2015-08-26 14:57:51 +09:00
Scott Lystig Fritchie
833463f20d Merge branch 'master' into slf/chain-manager/cp-mode4 2015-08-26 14:39:42 +09:00
Scott Lystig Fritchie
27656eafaa Fix (via sleep, egadz) race condition in machi_flu_psup_test 2015-08-26 14:38:56 +09:00
Scott Lystig Fritchie
c12231c7b6 Fix other tests to accomodate new semantics 2015-08-25 19:45:31 +09:00
Scott Lystig Fritchie
c0ee323637 Our new unit test works, yay 2015-08-25 19:42:33 +09:00
Scott Lystig Fritchie
83f49472db WIP: intermediate refactoring 2015-08-25 19:31:05 +09:00
Scott Lystig Fritchie
0a4c0f963e Add failing test case for annotating private projections via dbg2 list 2015-08-25 19:12:23 +09:00
Scott Lystig Fritchie
e6f8e3516e Merge branch 'slf/chain-manager/cp-mode3' 2015-08-25 18:50:24 +09:00
Scott Lystig Fritchie
6dbe887298 Remove old cruft, including hugly HTTP server hack 2015-08-25 18:49:48 +09:00
Scott Lystig Fritchie
1c5a17b708 WIP: adjust throttle of flapping 'shut up' 2015-08-25 17:01:14 +09:00
Scott Lystig Fritchie
9a86453753 WIP: half-baked idea, stopping for the night (more)
So, I'm 50% sure this is a good idea for CP mode: if there's
a later public projection than P_current, then who knows what
we might have missed.  So, call make_zerf() to find out the
absolute latest.  Problem: flapping state appears to be lost,
booo.
2015-08-24 21:54:30 +09:00
Scott Lystig Fritchie
ea61fe78bf Add flap disabler for 3 seconds after up/down change 2015-08-24 20:38:54 +09:00
Scott Lystig Fritchie
2f82fe0487 WIP: cp_mode improvements 2015-08-24 19:04:26 +09:00
Scott Lystig Fritchie
66cafe066e Remove proj_i_history, tweak AllAreFlapping_and_IamBad_and_NotRelevant_p in B10 2015-08-23 20:47:43 +09:00
Scott Lystig Fritchie
f6e81e6cd0 Add damper check for flapping of *inner* projections, whee! 2015-08-23 20:01:44 +09:00
Scott Lystig Fritchie
70022d11ce Add damper check for flapping of *inner* projections, whee! 2015-08-23 20:00:19 +09:00
Scott Lystig Fritchie
561e60a7ac WIP: start adding support to detect flapping of inner projections (ha!) 2015-08-23 17:50:25 +09:00
Scott Lystig Fritchie
0136fccff7 CP mode fix a30_make_inner_projection 2015-08-23 16:43:15 +09:00
Scott Lystig Fritchie
2d050ff7a6 Fix ?REACT() FSM names: a30->a40 2015-08-23 15:46:57 +09:00
Scott Lystig Fritchie
34d35fab63 Shorten the verbose output of private_write_verbose 2015-08-22 23:30:30 +09:00
Scott Lystig Fritchie
51a06844d5 Fix epoch number reuse bug when transiting C103 2015-08-22 21:40:21 +09:00
Scott Lystig Fritchie
0414da783a Fix repairs when everyone is in stable flapping state 2015-08-22 21:27:01 +09:00
Scott Lystig Fritchie
2b2facaba2 Add more FLU choices to converge demo 2015-08-22 14:56:26 +09:00
Scott Lystig Fritchie
a0477d62c0 WIP: bugfix for checking latest proj's flap count 2015-08-22 14:50:10 +09:00
Scott Lystig Fritchie
95437c2f0b Add missing parenthesis (suggested by PR #3) 2015-08-21 12:17:47 +09:00
Scott Lystig Fritchie
8a493a6610 Merge pull request #5 from cmeiklejohn/fix-typo
Fix typo of actor.
2015-08-21 12:15:15 +09:00
Scott Lystig Fritchie
4857774f41 Merge pull request #2 from cmeiklejohn/fix-quotations
Fix incorrect Latex quotations.
2015-08-21 11:09:09 +09:00
Christopher Meiklejohn
54c2158d1f Fix typo of actor. 2015-08-20 13:25:26 -07:00
Christopher Meiklejohn
d9a3b8a08f Fix incorrect Latex quotations. 2015-08-20 10:34:33 -07:00
Scott Lystig Fritchie
0278d7254b Add A29 state for shouting circuit breaker for long long loops 2015-08-20 23:04:27 +09:00
Scott Lystig Fritchie
b46730eb2c WIP: adjust the flapping manifest: delete clause 3 2015-08-20 21:28:56 +09:00
Scott Lystig Fritchie
4c3051ddf1 Oops, forgot to add include dir changes 2015-08-20 21:27:17 +09:00
Scott Lystig Fritchie
71decc5dc0 WIP: AP mode less bad again 2015-08-20 18:47:50 +09:00
Scott Lystig Fritchie
4e7d1f2310 WIP: egadz, a refactoring mess, but finally AP mode not sucky 2015-08-20 17:32:46 +09:00
Scott Lystig Fritchie
a71e9543fe WIP: refactoring inner handling, but ... (more)
There are a couple of weird things in the snippet below (AP mode):

    22:32:58.209 b uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,115},{1439,904777,11627}} 29 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},29}},{c,{{{epk,121},{1439,904777,134392}},26}}])

    22:32:58.224 c uses inner: [{epoch,136},{author,c},{mode,ap_mode},{witnesses,[]},{upi,[b,c]},{repair,[]},{down,[a]},{flap,undefined},{d,[d_foo1,{ps,[{a,b}]},{nodes_up,[b,c]}]},{d2,[]}] (outer flap epoch 136: {flap_i,{{{epk,115},{1439,904777,11627}},28},[a,{a,problem_with,b},{b,problem_with,a}],[{a,{{{epk,126},{1439,904777,149865}},16}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},15}}]}) (my flap {{epk,121},{1439,904777,134392}} 28 [{a,{{{epk,126},{1439,904777,149865}},28}},{b,{{{epk,115},{1439,904777,11627}},28}},{c,{{{epk,121},{1439,904777,134392}},28}}])

    CONFIRM by epoch inner 136 <<103,64,252,...>> at [b,c] []

    Priv1 [{a,{{132,<<"Cï|ÿzKX:Á"...>>},[a],[c],[b],[],false}},
           {b,{{127,<<185,139,3,2,96,189,...>>},[b,c],[],[a],[],false}},
           {c,{{133,<<145,71,223,6,177,...>>},[b,c],[a],[],[],false}}] agree false
    Pubs: [{a,136},{b,136},{c,136}]
    DoIt,

1. Both the "uses inner" messages and also the "CONFIRM by epoch inner 136"
   show that B & C are using the same inner projection.

   However, the 'Priv1' output shows b & c on different epochs, 127 & 133.
   Weird.

2. I've added an infinite loop, probably in this commit.  :-(
2015-08-18 22:35:57 +09:00
Scott Lystig Fritchie
9bf0eedb64 WIP: add the flapping manifesto, much is muchmuch better now 2015-08-18 20:49:36 +09:00
Scott Lystig Fritchie
e9268080af Finish/catchup commit from end of last week, silly me 2015-08-17 20:14:29 +09:00
Scott Lystig Fritchie
48e82ac1a4 WIP: use digraph to calculate better AllHosed 2015-08-14 22:29:20 +09:00
Scott Lystig Fritchie
20f2bf4b92 WIP: more ?REACT() tracing 2015-08-14 22:28:50 +09:00
Scott Lystig Fritchie
d2ce8f8447 Fix repair bug that has survived witness additions, oops 2015-08-14 19:30:36 +09:00
Scott Lystig Fritchie
9e02a1ea73 Add more ?REACT() tracing 2015-08-14 19:30:05 +09:00
Scott Lystig Fritchie
5aff775383 WIP: it's ugly, but CP+witnesses is mostly working? 2015-08-14 17:05:16 +09:00
Scott Lystig Fritchie
4e66d7bd91 WIP: keep CMode propagation consistent, but still violating CP transition safety 2015-08-14 00:12:13 +09:00
Scott Lystig Fritchie
14fad2d704 End-to-end chain state checking is still broken (more)
If we use verbose output from:

    machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]).

And use:

    tail -f typescript_file | egrep --line-buffered 'SET|attempted|CONFIRM'

... then we can clearly see a chain safety violation when moving from
epoch 81 -> 83.  I need to add more smarts to the safety checking,
both at the individual transition sanity check and at the converge_demo
overall rolling sanity check.

Key to output: CONFIRM by epoch {num} {csum} at {UPI} {Repairing}

    SET # of FLUs = 3 members [a,b,c]).
    CONFIRM by epoch 1 <<96,161,96,...>> at [a,b] [c]
    CONFIRM by epoch 5 <<134,243,175,...>> at [b,c] []
    CONFIRM by epoch 7 <<207,93,225,...>> at [b,c] []
    CONFIRM by epoch 47 <<60,142,248,...>> at [b,c] []
    SET partitions = [{c,b},{c,a}] (1 of 2) at {22,3,34}
    CONFIRM by epoch 81 <<223,58,184,...>> at [a,b] []
    SET partitions = [{b,c},{b,a}] (2 of 2) at {22,3,38}
    CONFIRM by epoch 83 <<33,208,224,...>> at [a,c] []
    SET partitions = []
    CONFIRM by epoch 85 <<173,179,149,...>> at [a,c] [b]
2015-08-13 22:16:28 +09:00
Scott Lystig Fritchie
e956c0b534 Fix (yet again) converge demo stable criteria 2015-08-13 21:26:07 +09:00
Scott Lystig Fritchie
f7121f8845 Witness + flapping seems to mostly work, yay! 2015-08-13 21:24:56 +09:00
Scott Lystig Fritchie
425b9c8f60 Merge slf/projection-conditional-write branch 2015-08-13 19:10:48 +09:00
Scott Lystig Fritchie
dcbc3b45ff C110: handle proj store private write failure when conditional fails 2015-08-13 18:45:15 +09:00
Scott Lystig Fritchie
9768f3c035 Projection store private write returns bad_arg if max_public_epochid is greater 2015-08-13 18:44:25 +09:00
Scott Lystig Fritchie
58d840ef7e Minor react changes, minor fix for return val of A50 2015-08-13 18:43:41 +09:00
Scott Lystig Fritchie
eecf5479ed Tweak stability criteria for converge demo 2015-08-13 16:18:33 +09:00
Scott Lystig Fritchie
d4275e5460 WIP: zerf_find_last_common() fix, eunit passes & very basic len=3 converge demo works 2015-08-13 15:41:18 +09:00
Scott Lystig Fritchie
0b8de235a9 WIP: zerf_find_last_common(), but is confused/broken by partial write @ private 2015-08-13 14:21:31 +09:00
Scott Lystig Fritchie
054397d187 WIP: find last common majority epoch 2015-08-12 17:53:39 +09:00
Scott Lystig Fritchie
d340b6a706 WIP: Duh, fix think-o in a40_latest_author_down() 2015-08-12 17:37:45 +09:00
Scott Lystig Fritchie
8e2a688526 WIP: cp_mode code from last Friday 2015-08-11 15:24:26 +09:00
Scott Lystig Fritchie
30a5652299 WIP: refining stable success for machi_chain_manager1_converge_demo, even better 2015-08-07 15:06:23 +09:00
Scott Lystig Fritchie
512251ac55 Adjust flap_limit constant 2015-08-07 12:29:10 +09:00
Scott Lystig Fritchie
c8ddce103e WIP: refining stable success for machi_chain_manager1_converge_demo 2015-08-07 12:28:51 +09:00
Scott Lystig Fritchie
3ca0f4491d WIP: always start chain manager with none projection 2015-08-06 19:24:14 +09:00
Scott Lystig Fritchie
0d7f6c8d7e WIP: chain transitions are now fully (?) aware of witness servers 2015-08-06 17:48:31 +09:00
Scott Lystig Fritchie
e9c4e2f98d WIP: rearrange CP mode projection calc 2015-08-06 15:22:04 +09:00
Scott Lystig Fritchie
82b6726261 Revert UPI [] -> [FirstRepairing] to commit 91496c6 2015-08-06 15:21:44 +09:00
Scott Lystig Fritchie
01da7a7046 TODO WTF was I thinking here??.... 2015-08-06 14:13:19 +09:00
Scott Lystig Fritchie
dcf532bafd WIP: Witness test expansion 2015-08-05 18:23:44 +09:00
Scott Lystig Fritchie
0f18ab8d20 Add better (?) timeout handling to machi_cr_client.erl gen_server calls 2015-08-05 17:48:06 +09:00
Scott Lystig Fritchie
e3d9ba2b83 WIP: Witness test expansion 2015-08-05 17:17:25 +09:00
Scott Lystig Fritchie
b21803a6c6 Fix witness calculation projections, part II 2015-08-05 16:05:03 +09:00
Scott Lystig Fritchie
f43a5ca96d Fix witness calculation projections, part I 2015-08-05 15:50:32 +09:00
Scott Lystig Fritchie
91496c656b Oops, fix PB stuff to add witnesses 2015-08-05 12:53:20 +09:00
Scott Lystig Fritchie
f8a09b233d Clarify name-game-sketch.org's use of K (placement key) 2015-08-04 16:25:20 +09:00
Scott Lystig Fritchie
3f51357577 WIP: pre-travel code, not sure if good, check in for history 2015-07-30 13:12:08 -07:00
Scott Lystig Fritchie
aa1a31982a Add 'witnesses' to machi_projection:make_summary() 2015-07-30 13:11:43 -07:00
Scott Lystig Fritchie
6e521700bd WIP: Adding witness_smoke_test_ but it's broken (more)
So, the problem is that the chain manager isn't finishing repair
because UPI=[a], and a is a witness, and a can't do the list files etc etc
repair stuff that repairer FLUs need to do.

The best (?) way forward is to add some advance smarts to the
chain manager so that it doesn't propose a UPI of 100% witnesses?
2015-07-21 19:05:04 +09:00
Scott Lystig Fritchie
432190435e Add witness_mode to FLU 2015-07-21 17:29:33 +09:00
Scott Lystig Fritchie
6ed5767e06 Merge branch 'slf/chain-manager/cp-mode2' 2015-07-21 14:24:08 +09:00
Scott Lystig Fritchie
52dc40e1fe converge demo: converged iff all private projs are stable and all inner/outer 2015-07-21 14:19:08 +09:00
Scott Lystig Fritchie
88d3228a4c Fix various problems with repair not being aware of inner projections 2015-07-20 16:25:42 +09:00
Scott Lystig Fritchie
319397ecd2 machi_chain_manager1_pulse.erl tweaks 2015-07-20 15:08:03 +09:00
Scott Lystig Fritchie
9ae4afa58e Reduce chmgr verbosity a bit 2015-07-20 14:58:21 +09:00
Scott Lystig Fritchie
e14493373b Bugfix: add missing reset of not_sanes dictionary, fix comments 2015-07-20 14:04:25 +09:00
Scott Lystig Fritchie
f7ef8c54f5 Reduce # of assumptions made by ch_mgr + simulator for 'repair_airquote_done' 2015-07-19 13:32:55 +09:00
Scott Lystig Fritchie
b8c642aaa7 WIP: bugfix for rare flapping infinite loop (done^2 fix I hope)
How can even computer?

So, there's a flavor of the flapping infinite loop problem that
can happen without flapping being detected (by the existing
flapping detector, that is).  That detector relies on a series of
accepted projections to converge to a single projection repeated
X times.  However, it's possible to have a race with a simulated
repair "finishing" that causes a problem so that no more
projections are ever accepted.  Oops.

See also: new comments in do_react_to_env().
2015-07-19 00:43:10 +09:00
Scott Lystig Fritchie
57b7122035 Fix bug found by PULSE that's not directly chain manager-related (more)
PULSE managed to create a situation where machi_proxy_flu_client1
would appear to fail a remote attempt to write_projection.  The
client would retry, but the 1st attempt really did get through to
the server.  So, if we hit this case, we try to read the projection,
and if it's exactly equal to what we tried to write, we consider the
op a success.

Ditto for write_chunk.

Fix up eunit test to accomodate the change of semantics.
2015-07-18 23:22:14 +09:00
Scott Lystig Fritchie
87867f8f2e WIP: bugfix for rare flapping infinite loop (done fix I hope)
{sigh} This is a correction to a think-o error in the
"WIP: bugfix for rare flapping infinite loop (better fix I hope)"
bugfix that I thought I had finished in the slf/chain-manager/cp-mode
branch.

Silly me, the test for myself as the author of the not_sane transition was
wrong: we don't do that kind of insanity, other nodes might, though.  ^_^
2015-07-18 17:53:17 +09:00
Scott Lystig Fritchie
c5052c4f11 More verbose dump_state() in PULSE test 2015-07-17 20:32:36 +09:00
Scott Lystig Fritchie
7a28d9ac73 Fix partial_stop_restart2() (more)
Due to changes by slf/chain-manager/cp-mode branch, there are
no longer extraneous epoch changes by "larger" authors that
re-suggest the same UPI+Repairing just because their author rank
is very slightly higher than the current epoch.  Thus the
partial_stop_restart2() test only needs to deal with one epoch
change instead of the original two.
2015-07-17 17:47:19 +09:00
Scott Lystig Fritchie
4e1e6e3e83 Derp, delete mistakenly-added patch goop 2015-07-17 17:47:19 +09:00
Scott Lystig Fritchie
19ce841471 Merge slf/chain-manager/cp-mode (fix conflicts) 2015-07-17 16:39:37 +09:00
Scott Lystig Fritchie
b295c7f374 Log more info on private projection write failure 2015-07-17 16:20:54 +09:00
Scott Lystig Fritchie
41a29a6f17 Add Seed to verbose PULSE output 2015-07-17 14:55:42 +09:00
Scott Lystig Fritchie
f4d16881c0 WIP: bugfix for rare flapping infinite loop (better fix I hope)
%% So, I'd tried this kind of "if everyone is doing it, then we
        %% 'agree' and we can do something different" strategy before,
        %% and it didn't work then.  Silly me.  Distributed systems
        %% lesson #823: do not forget the past.  In a situation created
        %% by PULSE, of all=[a,b,c,d,e], b & d & e were scheduled
        %% completely unfairly.  So a & c were the only authors ever to
        %% suceessfully write a suggested projection to a public store.
        %% Oops.
        %%
        %% So, we're going to keep track in #ch_mgr state for the number
        %% of times that this insane judgement has happened.
2015-07-17 14:51:39 +09:00
Scott Lystig Fritchie
50b2a28ca4 Fix derp mistakes in noshrink env handling for PULSE test 2015-07-17 14:45:40 +09:00
Scott Lystig Fritchie
0a8821a1c6 WIP: bugfix for rare flapping infinite loop (fixed I hope)
I'll run a set of PULSE tests (Cmd_e of the 'regression' style)
to try to confirm a fix for this pernicious little thing.

Final (?) part of the fix: add myself to SeenFlappers in
react_to_env_A30().
2015-07-16 23:23:30 +09:00
Scott Lystig Fritchie
b4d9ac5fe0 Hooray, PULSE things look stable; remove debugging verbose cruft 2015-07-16 21:57:34 +09:00
Scott Lystig Fritchie
c10200138c Hooray??! Fix the damn PULSE hangs by using infinity supervisor shutdown times 2015-07-16 21:17:46 +09:00
Scott Lystig Fritchie
dbbb6e8b14 Try to pinpoint a hang with even more verbosity (more)
Run via:

    env PULSE_NOSHRINK=yes PULSE_SKIP_NEW=yes PULSE_TIME=900 make pulse

So, this one hangs here:

    tick-<0.991.0>,dump_state(){prop,machi_chain_manager1_pulse,358,<0.891.0>}

At machi_chain_manager1_pulse.erl line 358, that's after the return
of run_commands().  The next verbose message should come from line
362, after the return of pulse:run(), but that message never appears.
My laptop CPU is really busy (fans running, case is hot), but both
console & disterl aren't available now, so no idea why, alas.

Ah, when I run with a console available and then run Redbug, there is
zero activity calling both machi_chain_manager1_pulse:'_' and
machi_chain_manager1:'_'

This may be related to a bad/ugly shutdown?  In both hang cases,
I see at least one SASL error message such as the one below ...
BUT!  There should be erlang:display() messages from the shutdown_hard()
function, which does some exit(Pid, kill) calls, but there is no output
from them!  So, the killing is coming from some kind of PULSE-initiated
process shutdown/cleanup/??

    =SUPERVISOR REPORT==== 16-Jul-2015::20:24:31 ===
         Supervisor: {local,machi_sup}
         Context:    shutdown_error
         Reason:     killed
         Offender:   [{pid,<0.200.0>},
                      {name,machi_flu_sup},
                      {mfargs,{machi_flu_sup,start_link,[]}},
                      {restart_type,permanent},
                      {shutdown,5000},
                      {child_type,supervisor}]
2015-07-16 20:40:51 +09:00
Scott Lystig Fritchie
3a4624ab06 Hrm, fewer deadlocks, but lots of !@#$! mystery hangs @ startup & teardown 2015-07-16 20:13:48 +09:00
Scott Lystig Fritchie
d331e09923 Hrm, fewer deadlocks, but sometimes unreliable shutdown 2015-07-16 17:59:02 +09:00
Scott Lystig Fritchie
f2fc5b91c2 Add more PULSE instrumentation -> more deadlocks 2015-07-16 16:25:38 +09:00
Scott Lystig Fritchie
73ac220d75 Add machi_verbose.hrl 2015-07-16 16:01:53 +09:00
Scott Lystig Fritchie
197687064b Add PULSE_NOSHRINK environment variable 2015-07-16 15:26:35 +09:00
Scott Lystig Fritchie
0ead97093b WIP: bugfix for rare flapping infinite loop (unfinished) part ... 2015-07-16 00:18:42 +09:00
Scott Lystig Fritchie
18c92c98f8 WIP: bugfix for rare flapping infinite loop (unfinished) part IV 2015-07-15 18:42:59 +09:00
Scott Lystig Fritchie
517e77dc4a WIP: bugfix for rare flapping infinite loop (unfinished) part III 2015-07-15 17:35:12 +09:00
Scott Lystig Fritchie
402720d301 WIP: bugfix for rare flapping infinite loop (unfinished) part II 2015-07-15 17:23:17 +09:00
Scott Lystig Fritchie
e41e76062c Add predictable types of variety to PULSE model partitions 2015-07-15 17:22:07 +09:00
Scott Lystig Fritchie
6f9a603e99 WIP: bugfix for rare flapping infinite loop (unfinished) 2015-07-15 12:44:56 +09:00
Scott Lystig Fritchie
0f667c4356 WIP: add more debugging/react info 2015-07-15 11:25:06 +09:00
Scott Lystig Fritchie
7c970d90a6 Bugfix: use correct updated #state in react_to_env_A30() {sigh} 2015-07-15 00:44:07 +09:00
Scott Lystig Fritchie
7fa5849669 Add new regresssion PULSE test case 2015-07-14 17:18:54 +09:00
Scott Lystig Fritchie
5eb6ebc874 Bugfix: add missing remember_partition_hack() calls in perhaps_call path 2015-07-14 17:17:14 +09:00
Scott Lystig Fritchie
fd66fe46b5 Move react logging in react_to_env_A30() 2015-07-14 17:16:23 +09:00
Scott Lystig Fritchie
0089af0a86 Bugfix: moving inner -> outer projection, use calc_projection() for sanity 2015-07-10 21:11:34 +09:00
Scott Lystig Fritchie
8d76cfe0db Robust'ify the testing of projection stability 2015-07-10 21:04:34 +09:00
Scott Lystig Fritchie
f746b75254 Bugfix: A30: if Kicker_p only true if we actually have an inner proj! 2015-07-10 20:25:44 +09:00
Scott Lystig Fritchie
e9e4c54b25 Bugfix: undo the jump directly from A30 -> C100. 2015-07-10 20:24:44 +09:00
Scott Lystig Fritchie
ed7dcd14db Avoid putting inner_summary in dbg proplist 2015-07-10 17:47:33 +09:00
Scott Lystig Fritchie
4d41c59e19 Bugfix: machi_projection:new/6 derp: argument order mistake 2015-07-10 16:41:28 +09:00
Scott Lystig Fritchie
cf9ae5b555 WIP: correct calc of All_UPI_Repairing_were_unanimous, but now infinite loop in long chains?? 2015-07-10 15:30:31 +09:00
Scott Lystig Fritchie
2060b80830 Keep good refactorings from commit a8390ee2
Also, add more misc details to the 'react' breadcrumb trail.  Also,
save get(react) results into dbg2 whenever we write a private projection,
very valuable for debugging.

Also: cleanup PULSE code, add regression commands as option and
controls with some new environment variables.  These regression
sequences were responsbile for several fruitful debugging sessions,
so we keep them for posterity and for their ability (with new seeds
and PULSE) to find new interleavings.
2015-07-10 15:04:50 +09:00
Scott Lystig Fritchie
297d29c79b Finish fixups to the chmgr state transition checking 2015-07-07 23:03:14 +09:00
Scott Lystig Fritchie
3aa3e00806 WIP: major fixups to the chmgr state transition checking (more below)
So, the PULSE test is failing, which is good.  However, I believe
that the failures are all due to the model now being *too strict*.
The model is now catching failures which are now benign, I think.

    {bummer_NOT_DISJOINT,{[a,b,b,c,d],
                          [{a,not_in_this_epoch},
                           {b,not_in_this_epoch},
                           {c,"[{epoch,1546},{author,c},{upi,[c]},{repair,[b]},{down,[a,d]},{d,[{ps,[{a,c},{c,a},{a,d},{b,d},{c,d}]},{nodes_up,[b,c]}]},{d2,[]}]"},
                           {d,"[{epoch,1546},{author,d},{upi,[d]},{repair,[a,b]},{down,[c]},{d,[{ps,[{c,b},{d,c}]},{nodes_up,[a,b,d]}]},{d2,[]}]"}]}}},

In this and all other examples, the UPIs are disjoint but the
repairs are not disjoint.  I believe the model ought to be
ignoring the repair list.

    {bummer_NOT_DISJOINT,{[a,a,b],
                          [{a,"[{epoch,1174},{author,a},{upi,[a]},{repair,[]},{down,[b]},{d,[{ps,[{a,b},{b,a}]},{nodes_up,[a]}]},{d2,[]}]"},
                           {b,"[{epoch,1174},{author,b},{upi,[b]},{repair,[a]},{down,[]},{d,[{ps,[]},{nodes_up,[a,b]}]},{d2,[]}]"}]}}},

or

    {bummer_NOT_DISJOINT,{[c,c,e],
                          [{a,not_in_this_epoch},
                           {b,not_in_this_epoch},
                           {c,"[{epoch,1388},{author,c},{upi,[c]},{repair,[]},{down,[a,b,d,e]},{d,[{ps,[{a,b},{a,c},{c,a},{a,d},{d,a},{e,a},{c,b},{b,e},{e,b},{c,d},{e,c},{e,d}]},{nodes_up,[c]}]},{d2,[]}]"},
                           {d,not_in_this_epoch},
                           {e,"[{epoch,1388},{author,e},{upi,[e]},{repair,[c]},{down,[a,b,d]},{d,[{ps,[{a,b},{b,a},{a,c},{c,a},{a,d},{d,a},{a,e},{e,a},{b,c},{c,b},{b,d},{b,e},{e,b},{c,d},{d,c},{d,e},{e,d}]},{nodes_up,[c,e]}]},{d2,[]}]"}]}}},
2015-07-07 22:11:19 +09:00
Scott Lystig Fritchie
c8ce99023e WIP: model checking refactoring TODO 2015-07-07 18:32:04 +09:00
Scott Lystig Fritchie
d5f521f2bd Various test updates 2015-07-07 15:02:29 +09:00
Scott Lystig Fritchie
009b3f44af Fix eunit test broken by 3f8982cb 2015-07-07 15:01:50 +09:00
Scott Lystig Fritchie
badcfa3064 Remove comment cruft 2015-07-07 14:32:02 +09:00
Scott Lystig Fritchie
0f3d11e1bf Bugfix (part II) rare race between just-finished repair and flapping ending
The prior commit wasn't sufficient: the range of transitions is wider than
assumed by that commit.  So, we take one of two options, with a TODO task
of researching the other option.
2015-07-07 14:30:21 +09:00
Scott Lystig Fritchie
96ca7b7082 Bugfix for rare race between just-finished repair and flapping ending
Fix for today: We are going to game the system.  We know that
    C100 is going to be checking authorship relative to P_current's
    UPI's tail.  Therefore, we're just going to set it here.
    Why???  Because we have been using this projection safely for
    the entire flapping period!  ... The only other way I see is to
    allow C100 to carve out an exception if the repair finished
    PLUS author_server check fails PLUS if we came from here, but
    that feels a bit fragile to me: if some code factoring happens
    in projection_transition_is_saneprojection_transition_is_sane()
    or elsewhere that causes the author_server check to be
    something-other-than-the-final-thing-checked, then such a
    refactoring would likely cause an even harder bug to find &
    fix.  Conditions tested: 5 FLUs plus alternating partitions of:
    [
     [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [],
     [{b,a},{d,e}],
     [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], [], [{a,b}], []
    ].
2015-07-07 01:29:37 +09:00
Scott Lystig Fritchie
54b5014446 WIP: bugfix in transition, just-in-case commit 2015-07-06 23:56:29 +09:00
Scott Lystig Fritchie
9d4b4b1df6 Bugfix: update inner projection based on *previous inner* projection 2015-07-06 17:38:15 +09:00
Scott Lystig Fritchie
3f8982cbe1 MAJOR WIP: set author's rank to constant 0? Worthwhile?? 2015-07-06 16:12:15 +09:00
Scott Lystig Fritchie
471cde1f2c WIP: debugging fmt shuffle 2015-07-06 16:11:14 +09:00
Scott Lystig Fritchie
8ee3377fa7 Fix a state transition bug (chain manager infinite loop, oops)
%% We have a small problem for state transition sanity checking in the
    %% case where we are flapping *and* a repair has finished.  One of the
    %% sanity checks in simple_chain_state_transition_is_sane(() is that
    %% the author of P2 in this case must be the tail of P1's UPI: i.e.,
    %% it's the tail's responsibility to perform repair, therefore the tail
    %% must damn well be the author of any transition that says a repair
    %% finished successfully.
    %%
    %% The problem is that author_server of the inner projection does not
    %% reflect the actual author!  See the comment with the text
    %% "The inner projection will have a fake author" in
    %react_to_env_A30().
    %%
    %% So, there's a special return value that tells us to try to check for
    %% the correct authorship here.
2015-07-05 14:52:50 +09:00
Scott Lystig Fritchie
920c0fc610 WIP: much better structure for inner projection sanity checking 2015-07-04 16:46:02 +09:00
Scott Lystig Fritchie
8241d1f600 WIP: cruft, needs refactoring 2015-07-04 14:57:38 +09:00
Scott Lystig Fritchie
65ee0c23ec Adjust author of inner projections to yield same checksum 2015-07-04 01:58:00 +09:00
Scott Lystig Fritchie
cd026303a0 Unused var cleanup 2015-07-04 00:35:05 +09:00
Scott Lystig Fritchie
9b0a5a1dc3 WIP: 1st part of moving old chain state transtion code to new
Ha, famous last words, amirite?

    %% The chain sequence/order checks at the bottom of this function aren't
    %% as easy-to-read as they ought to be.  However, I'm moderately confident
    %% that it isn't buggy.  TODO: refactor them for clarity.

So, now machi_chain_manager1:projection_transition_is_sane() is using
newer, far less buggy code to make sanity decisions.

TODO: Add support for Retrospective mode. TODO is it really needed?

Examples of how the old code sucks and the new code sucks less.

    138> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
    xxxxxxxxxxxx..x.xxxxxx..x.x....x..xx........................................................Failed! After 69 tests.
    [a,b,c]
    {c,[a,b,c],[c,b],b,[b,a],[b,a,c]}
    Old_res ([335,192,166,160,153,139]): true
    New_res: false (why line [1936])
    Shrinking xxxxxxxxxxxx.xxxxxxx.xxx.xxxxxxxxxxxxxxxxx(3 times)
    [a,b,c]
 %% {Author1,UPI1,   Repair1,Author2,UPI2, Repair2} %%
    {c,      [a,b,c],[],     a,      [b,a],[]}
    Old_res ([338,185,160,153,147]): true
    New_res: false (why line [1936])
    false

Old code is wrong: we've swapped order of a & b, which is bad.

    139> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
    xxxxxxxxxx..x...xx..........xxx..x..............x......x............................................(x10)...(x1)........Failed! After 120 tests.
    [b,c,a]
    {c,[c,a],[c],a,[a,b],[b,a]}
    Old_res ([335,192,185,160,153,123]): true
    New_res: false (why line [1936])
    Shrinking xx.xxxxxx.x.xxxxxxxx.xxxxxxxxxxx(4 times)
    [b,a,c]
 %% {Author1,UPI1,Repair1,Author2,UPI2, Repair2} %%
    {a,      [c], [],     c,      [c,b],[]}
    Old_res ([338,185,160,153,147]): true
    New_res: false (why line [1936])
    false

Old code is wrong: b wasn't repairing in the previous state.

    150> eqc:quickcheck(eqc:testing_time(10, machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check(whole))).
    xxxxxxxxxxx....x...xxxxx..xx.....x.......xxx..x.......xxx...................x................x......(x10).....(x1)........xFailed! After 130 tests.
    [c,a,b]
    {b,[c],[b,a,c],c,[c,a,b],[b]}
    Old_res ([335,214,185,160,153,147]): true
    New_res: false (why line [1936])
    Shrinking xxxx.x.xxx.xxxxxxx.xxxxxxxxx(4 times)
    [c,b,a]
 %% {Author1,UPI1,Repair1,Author2,UPI2,   Repair2} %%
    {c,      [c], [a,b],  c,      [c,b,a],[]}
    Old_res ([335,328,185,160,153,111]): true
    New_res: false (why line [1981,1679])
    false

Old code is wrong: a & b were repairing but UPI2 has a & b in the wrong order.
2015-07-04 00:32:28 +09:00
Scott Lystig Fritchie
42fb6dd002 WIP: it's clear that the legacy state transition check is broken, II 2015-07-03 23:37:36 +09:00
Scott Lystig Fritchie
caeb322725 WIP: it's clear that the legacy state transition check is broken 2015-07-03 23:17:34 +09:00
Scott Lystig Fritchie
83015c319d WIP: yeah, now we're going places 2015-07-03 22:05:35 +09:00
Scott Lystig Fritchie
6a706cbfeb WIP: Refactoring and prototyping goop, broken test 2015-07-03 19:21:41 +09:00
Scott Lystig Fritchie
4a09bfa2d1 Merge branch 'slf/flu-cleanup1' 2015-07-03 16:19:10 +09:00
Scott Lystig Fritchie
9b3cd9056a Un-TEST'ify testr_react_to_env() everywhere 2015-07-03 16:18:40 +09:00
Scott Lystig Fritchie
78c81f93b7 Make machi_chain_manager1_pulse max commands length longer 2015-07-03 16:06:33 +09:00
Scott Lystig Fritchie
2b64028bbd Add kick_projection_reaction, implement yo:tell_author_yo() 2015-07-03 04:30:05 +09:00
Scott Lystig Fritchie
c6870a1c86 If FLU is wedged by a newer client epoch ID, kick the chain manager to react 2015-07-03 02:17:01 +09:00
Scott Lystig Fritchie
ff66638eb3 Sequencer changes file sequence number when epoch_id change is detected 2015-07-03 02:04:04 +09:00
Scott Lystig Fritchie
9cf77f4406 WIP: Refactoring and prototyping goop, broken test 2015-07-03 00:59:04 +09:00
Scott Lystig Fritchie
8820a71152 Clean up comment cruft & line wrap yak shaving 2015-07-02 14:44:47 +09:00
Scott Lystig Fritchie
039fd5fb78 Merge branch 'slf/pb-api-experiment3' 2015-07-01 18:33:33 +09:00
108 changed files with 17197 additions and 5373 deletions

23
.gitignore vendored
View file

@ -2,11 +2,32 @@ prototype/chain-manager/patch.*
.eqc-info
.eunit
deps
*.plt
dev
erl_crash.dump
eqc
.concrete/DEV_MODE
.rebar
edoc
# Dialyzer stuff
.dialyzer-last-run.txt
.ebin.native
.local_dialyzer_plt
dialyzer_unhandled_warnings
dialyzer_warnings
*.plt
# PB artifacts for Erlang
include/machi_pb.hrl
# Release packaging
rel/machi
rel/vars/dev*vars.config
# Misc Scott cruft
*.patch
current_counterexample.eqc
foo*
RUNLOG*
typescript*
*.swp

7
.travis.yml Normal file
View file

@ -0,0 +1,7 @@
language: erlang
notifications:
email: scott@basho.com
script: "priv/test-for-gh-pr.sh"
otp_release:
- 17.5
## No, Dialyzer is too different between 17 & 18: - 18.1

230
FAQ.md
View file

@ -11,14 +11,14 @@
+ [1 Questions about Machi in general](#n1)
+ [1.1 What is Machi?](#n1.1)
+ [1.2 What is a Machi "cluster of clusters"?](#n1.2)
+ [1.2.1 This "cluster of clusters" idea needs a better name, don't you agree?](#n1.2.1)
+ [1.3 What is Machi like when operating in "eventually consistent"/"AP mode"?](#n1.3)
+ [1.4 What is Machi like when operating in "strongly consistent"/"CP mode"?](#n1.4)
+ [1.5 What does Machi's API look like?](#n1.5)
+ [1.6 What licensing terms are used by Machi?](#n1.6)
+ [1.7 Where can I find the Machi source code and documentation? Can I contribute?](#n1.7)
+ [1.8 What is Machi's expected release schedule, packaging, and operating system/OS distribution support?](#n1.8)
+ [1.2 What is a Machi chain?](#n1.2)
+ [1.3 What is a Machi cluster?](#n1.3)
+ [1.4 What is Machi like when operating in "eventually consistent" mode?](#n1.4)
+ [1.5 What is Machi like when operating in "strongly consistent" mode?](#n1.5)
+ [1.6 What does Machi's API look like?](#n1.6)
+ [1.7 What licensing terms are used by Machi?](#n1.7)
+ [1.8 Where can I find the Machi source code and documentation? Can I contribute?](#n1.8)
+ [1.9 What is Machi's expected release schedule, packaging, and operating system/OS distribution support?](#n1.9)
+ [2 Questions about Machi relative to {{something else}}](#n2)
+ [2.1 How is Machi better than Hadoop?](#n2.1)
+ [2.2 How does Machi differ from HadoopFS/HDFS?](#n2.2)
@ -28,13 +28,15 @@
+ [3 Machi's specifics](#n3)
+ [3.1 What technique is used to replicate Machi's files? Can other techniques be used?](#n3.1)
+ [3.2 Does Machi have a reliance on a coordination service such as ZooKeeper or etcd?](#n3.2)
+ [3.3 Is it true that there's an allegory written to describe humming consensus?](#n3.3)
+ [3.4 How is Machi tested?](#n3.4)
+ [3.5 Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks](#n3.5)
+ [3.6 Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device?](#n3.6)
+ [3.7 What language(s) is Machi written in?](#n3.7)
+ [3.8 Does Machi use the Erlang/OTP network distribution system (aka "disterl")?](#n3.8)
+ [3.9 Can I use HTTP to write/read stuff into/from Machi?](#n3.9)
+ [3.3 Are there any presentations available about Humming Consensus](#n3.3)
+ [3.4 Is it true that there's an allegory written to describe Humming Consensus?](#n3.4)
+ [3.5 How is Machi tested?](#n3.5)
+ [3.6 Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks](#n3.6)
+ [3.7 Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device?](#n3.7)
+ [3.8 What language(s) is Machi written in?](#n3.8)
+ [3.9 Can Machi run on Windows? Can Machi run on 32-bit platforms?](#n3.9)
+ [3.10 Does Machi use the Erlang/OTP network distribution system (aka "disterl")?](#n3.10)
+ [3.11 Can I use HTTP to write/read stuff into/from Machi?](#n3.11)
<!-- ENDOUTLINE -->
@ -44,13 +46,13 @@
<a name="n1.1">
### 1.1. What is Machi?
Very briefly, Machi is a very simple append-only file store.
Very briefly, Machi is a very simple append-only blob/file store.
Machi is
"dumber" than many other file stores (i.e., lacking many features
found in other file stores) such as HadoopFS or simple NFS or CIFS file
found in other file stores) such as HadoopFS or a simple NFS or CIFS file
server.
However, Machi is a distributed file store, which makes it different
However, Machi is a distributed blob/file store, which makes it different
(and, in some ways, more complicated) than a simple NFS or CIFS file
server.
@ -82,45 +84,39 @@ For a much longer answer, please see the
[Machi high level design doc](https://github.com/basho/machi/tree/master/doc/high-level-machi.pdf).
<a name="n1.2">
### 1.2. What is a Machi "cluster of clusters"?
### 1.2. What is a Machi chain?
Machi's design is based on using small, well-understood and provable
(mathematically) techniques to maintain multiple file copies without
data loss or data corruption. At its lowest level, Machi contains no
support for distribution/partitioning/sharding of files across many
servers. A typical, fully-functional Machi cluster will likely be two
or three machines.
A Machi chain is a small number of machines that maintain a common set
of replicated files. A typical chain is of length 2 or 3. For
critical data that must be available despite several simultaneous
server failures, a chain length of 6 or 7 might be used.
However, Machi is designed to be an excellent building block for
building larger systems. A deployment of Machi "cluster of clusters"
will use the "random slicing" technique for partitioning files across
multiple Machi clusters that, as individuals, are unaware of the
larger cluster-of-clusters scheme.
<a name="n1.3">
### 1.3. What is a Machi cluster?
The cluster-of-clusters management service will be fully decentralized
A Machi cluster is a collection of Machi chains that
partitions/shards/distributes files (based on file name) across the
collection of chains. Machi uses the "random slicing" algorithm (a
variation of consistent hashing) to define the mapping of file name to
chain name.
The cluster management service will be fully decentralized
and run as a separate software service installed on each Machi
cluster. This manager will appear to the local Machi server as simply
another Machi file client. The cluster-of-clusters managers will take
another Machi file client. The cluster managers will take
care of file migration as the cluster grows and shrinks in capacity
and in response to day-to-day changes in workload.
Though the cluster-of-clusters manager has not yet been implemented,
Though the cluster manager has not yet been implemented,
its design is fully decentralized and capable of operating despite
multiple partial failure of its member clusters. We expect this
multiple partial failure of its member chains. We expect this
design to scale easily to at least one thousand servers.
Please see the
[Machi source repository's 'doc' directory for more details](https://github.com/basho/machi/tree/master/doc/).
<a name="n1.2.1">
#### 1.2.1. This "cluster of clusters" idea needs a better name, don't you agree?
Yes. Please help us: we are bad at naming things.
For proof that naming things is hard, see
[http://martinfowler.com/bliki/TwoHardThings.html](http://martinfowler.com/bliki/TwoHardThings.html)
<a name="n1.3">
### 1.3. What is Machi like when operating in "eventually consistent"/"AP mode"?
<a name="n1.4">
### 1.4. What is Machi like when operating in "eventually consistent" mode?
Machi's operating mode dictates how a Machi cluster will react to
network partitions. A network partition may be caused by:
@ -130,37 +126,30 @@ network partitions. A network partition may be caused by:
* An extreme server software "hang" or "pause", e.g. caused by OS
scheduling problems such as a failing/stuttering disk device.
"AP mode" refers to the "A" and "P" properties of the "CAP
conjecture", meaning that the cluster will be "Available" and
"Partition tolerant".
The consistency semantics of file operations while in "AP mode" are
eventually consistent during and after network partitions:
The consistency semantics of file operations while in eventual
consistency mode during and after network partitions are:
* File write operations are permitted by any client on the "same side"
of the network partition.
* File read operations are successful for any file contents where the
client & server are on the "same side" of the network partition.
* File read operations will probably fail for any file contents where the
client & server are on "different sides" of the network partition.
* After the network partition(s) is resolved, files are merged
together from "all sides" of the partition(s).
* Unique files are copied in their entirety.
* Byte ranges within the same file are merged. This is possible
due to Machi's restrictions on file naming (files names are
alwoys assigned by Machi servers) and file offset assignments
(byte offsets are also always chosen by Machi servers according
to rules which guarantee safe mergeability.).
due to Machi's restrictions on file naming and file offset
assignment. Both file names and file offsets are always chosen
by Machi servers according to rules which guarantee safe
mergeability. Server-assigned names are a characteristic of a
"blob store".
<a name="n1.4">
### 1.4. What is Machi like when operating in "strongly consistent"/"CP mode"?
<a name="n1.5">
### 1.5. What is Machi like when operating in "strongly consistent" mode?
Machi's operating mode dictates how a Machi cluster will react to
network partitions.
"CP mode" refers to the "C" and "P" properties of the "CAP
conjecture", meaning that the cluster will be "Consistent" and
"Partition tolerant".
The consistency semantics of file operations while in "CP mode" are
strongly consistent during and after network partitions:
The consistency semantics of file operations while in strongly
consistency mode during and after network partitions are:
* File write operations are permitted by any client on the "same side"
of the network partition if and only if a quorum majority of Machi servers
@ -175,19 +164,19 @@ strongly consistent during and after network partitions:
Machi's design can provide the illusion of quorum minority write
availability if the cluster is configured to operate with "witness
servers". (This feaure is not implemented yet, as of June 2015.)
servers". (This feaure partially implemented, as of December 2015.)
See Section 11 of
[Machi chain manager high level design doc](https://github.com/basho/machi/tree/master/doc/high-level-chain-mgr.pdf)
for more details.
<a name="n1.5">
### 1.5. What does Machi's API look like?
<a name="n1.6">
### 1.6. What does Machi's API look like?
The Machi API only contains a handful of API operations. The function
arguments shown below use Erlang-style type annotations.
arguments shown below (in simplifed form) use Erlang-style type annotations.
append_chunk(Prefix:binary(), Chunk:binary()).
append_chunk_extra(Prefix:binary(), Chunk:binary(), ExtraSpace:non_neg_integer()).
append_chunk(Prefix:binary(), Chunk:binary(), CheckSum:binary()).
append_chunk_extra(Prefix:binary(), Chunk:binary(), CheckSum:binary(), ExtraSpace:non_neg_integer()).
read_chunk(File:binary(), Offset:non_neg_integer(), Size:non_neg_integer()).
checksum_list(File:binary()).
@ -205,17 +194,22 @@ Internally, there is a more complex protocol used by individual
cluster members to manage file contents and to repair damaged/missing
files. See Figure 3 in
[Machi high level design doc](https://github.com/basho/machi/tree/master/doc/high-level-machi.pdf)
for more details.
for more description.
<a name="n1.6">
### 1.6. What licensing terms are used by Machi?
The definitions of both the "high level" external protocol and "low
level" internal protocol are in a
[Protocol Buffers](https://developers.google.com/protocol-buffers/docs/overview)
definition at [./src/machi.proto](./src/machi.proto).
<a name="n1.7">
### 1.7. What licensing terms are used by Machi?
All Machi source code and documentation is licensed by
[Basho Technologies, Inc.](http://www.basho.com/)
under the [Apache Public License version 2](https://github.com/basho/machi/tree/master/LICENSE).
<a name="n1.7">
### 1.7. Where can I find the Machi source code and documentation? Can I contribute?
<a name="n1.8">
### 1.8. Where can I find the Machi source code and documentation? Can I contribute?
All Machi source code and documentation can be found at GitHub:
[https://github.com/basho/machi](https://github.com/basho/machi).
@ -229,11 +223,11 @@ ideas for improvement, please see our contributing & collaboration
guidelines at
[https://github.com/basho/machi/blob/master/CONTRIBUTING.md](https://github.com/basho/machi/blob/master/CONTRIBUTING.md).
<a name="n1.8">
### 1.8. What is Machi's expected release schedule, packaging, and operating system/OS distribution support?
<a name="n1.9">
### 1.9. What is Machi's expected release schedule, packaging, and operating system/OS distribution support?
Basho expects that Machi's first release will take place near the end
of calendar year 2015.
Basho expects that Machi's first major product release will take place
during the 2nd quarter of 2016.
Basho's official support for operating systems (e.g. Linux, FreeBSD),
operating system packaging (e.g. CentOS rpm/yum package management,
@ -308,15 +302,15 @@ file's writable phase).
<tr>
<td> Does not have any file distribution/partitioning/sharding across
Machi clusters: in a single Machi cluster, all files are replicated by
all servers in the cluster. The "cluster of clusters" concept is used
Machi chains: in a single Machi chain, all files are replicated by
all servers in the chain. The "random slicing" technique is used
to distribute/partition/shard files across multiple Machi clusters.
<td> File distribution/partitioning/sharding is performed
automatically by the HDFS "name node".
<tr>
<td> Machi requires no central "name node" for single cluster use.
Machi requires no central "name node" for "cluster of clusters" use
<td> Machi requires no central "name node" for single chain use or
for multi-chain cluster use.
<td> Requires a single "namenode" server to maintain file system contents
and file content mapping. (May be deployed with a "secondary
namenode" to reduce unavailability when the primary namenode fails.)
@ -482,8 +476,8 @@ difficult to adapt to Machi's design goals:
* Both protocols use quorum majority consensus, which requires a
minimum of *2F + 1* working servers to tolerate *F* failures. For
example, to tolerate 2 server failures, quorum majority protocols
require a minium of 5 servers. To tolerate the same number of
failures, Chain replication requires only 3 servers.
require a minimum of 5 servers. To tolerate the same number of
failures, Chain Replication requires a minimum of only 3 servers.
* Machi's use of "humming consensus" to manage internal server
metadata state would also (probably) require conversion to Paxos or
Raft. (Or "outsourced" to a service such as ZooKeeper.)
@ -500,7 +494,17 @@ Humming consensus is described in the
[Machi chain manager high level design doc](https://github.com/basho/machi/tree/master/doc/high-level-chain-mgr.pdf).
<a name="n3.3">
### 3.3. Is it true that there's an allegory written to describe humming consensus?
### 3.3. Are there any presentations available about Humming Consensus
Scott recently (November 2015) gave a presentation at the
[RICON 2015 conference](http://ricon.io) about one of the techniques
used by Machi; "Managing Chain Replication Metadata with
Humming Consensus" is available online now.
* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf)
* [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q)
<a name="n3.4">
### 3.4. Is it true that there's an allegory written to describe Humming Consensus?
Yes. In homage to Leslie Lamport's original paper about the Paxos
protocol, "The Part-time Parliamant", there is an allegorical story
@ -511,8 +515,8 @@ The full story, full of wonder and mystery, is called
There is also a
[short followup blog posting](http://www.snookles.com/slf-blog/2015/03/20/on-humming-consensus-an-allegory-part-2/).
<a name="n3.4">
### 3.4. How is Machi tested?
<a name="n3.5">
### 3.5. How is Machi tested?
While not formally proven yet, Machi's implementation of Chain
Replication and of humming consensus have been extensively tested with
@ -537,16 +541,20 @@ change several times during any single test case) and a random series
of cluster operations, an event trace of all cluster activity is used
to verify that no safety-critical rules have been violated.
<a name="n3.5">
### 3.5. Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks
All test code is available in the [./test](./test) subdirectory.
Modules that use QuickCheck will use a file suffix of `_eqc`, for
example, [./test/machi_ap_repair_eqc.erl](./test/machi_ap_repair_eqc.erl).
<a name="n3.6">
### 3.6. Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks
No, Machi's design assumes that each Machi server is a fully
independent hardware and assumes only standard local disks (Winchester
and/or SSD style) with local-only interfaces (e.g. SATA, SCSI, PCI) in
each machine.
<a name="n3.6">
### 3.6. Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device?
<a name="n3.7">
### 3.7. Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device?
No. When used with servers with multiple disks, the intent is to
deploy multiple Machi servers per machine: one Machi server per disk.
@ -564,10 +572,13 @@ deploy multiple Machi servers per machine: one Machi server per disk.
placement relative to 12 servers is smaller than a placement problem
of managing 264 seprate disks (if each of 12 servers has 22 disks).
<a name="n3.7">
### 3.7. What language(s) is Machi written in?
<a name="n3.8">
### 3.8. What language(s) is Machi written in?
So far, Machi is written in 100% Erlang.
So far, Machi is written in Erlang, mostly. Machi uses at least one
library, [ELevelDB](https://github.com/basho/eleveldb), that is
implemented both in C++ and in Erlang, using Erlang NIFs (Native
Interface Functions) to allow Erlang code to call C++ functions.
In the event that we encounter a performance problem that cannot be
solved within the Erlang/OTP runtime environment, all of Machi's
@ -576,8 +587,16 @@ in C, Java, or other "gotta go fast fast FAST!!" programming
language. We expect that the Chain Replication manager and other
critical "control plane" software will remain in Erlang.
<a name="n3.8">
### 3.8. Does Machi use the Erlang/OTP network distribution system (aka "disterl")?
<a name="n3.9">
### 3.9. Can Machi run on Windows? Can Machi run on 32-bit platforms?
The ELevelDB NIF does not compile or run correctly on Erlang/OTP
Windows platforms, nor does it compile correctly on 32-bit platforms.
Machi should support all 64-bit UNIX-like platforms that are supported
by Erlang/OTP and ELevelDB.
<a name="n3.10">
### 3.10. Does Machi use the Erlang/OTP network distribution system (aka "disterl")?
No, Machi doesn't use Erlang/OTP's built-in distributed message
passing system. The code would be *much* simpler if we did use
@ -588,19 +607,16 @@ bit-twiddling magicSPEED ... without also having to find a replacement
for disterl. (Or without having to re-invent disterl's features in
another language.)
<a name="artisanal-protocol">
In the first drafts of the Machi code, the inter-node communication
uses a hand-crafted, artisanal, mostly ASCII protocol as part of a
"demo day" quick & dirty prototype. Work is underway (summer of 2015)
to replace that protocol gradually with a well-structured,
well-documented protocol based on Protocol Buffers data serialization.
All wire protocols used by Machi are defined & implemented using
[Protocol Buffers](https://developers.google.com/protocol-buffers/docs/overview).
The definition file can be found at [./src/machi.proto](./src/machi.proto).
<a name="n3.9">
### 3.9. Can I use HTTP to write/read stuff into/from Machi?
<a name="n3.11">
### 3.11. Can I use HTTP to write/read stuff into/from Machi?
Yes, sort of. For as long as the legacy of
Machi's first internal protocol & code still
survives, it's possible to use a
Short answer: No, not yet.
Longer answer: No, but it was possible as a hack, many months ago, see
[primitive/hack'y HTTP interface that is described in this source code commit log](https://github.com/basho/machi/commit/6cebf397232cba8e63c5c9a0a8c02ba391b20fef).
Please note that commit `6cebf397232cba8e63c5c9a0a8c02ba391b20fef` is
required to try using this feature: the code has since bit-rotted and

108
Makefile
View file

@ -1,54 +1,94 @@
REBAR_BIN := $(shell which rebar)
ifeq ($(REBAR_BIN),)
REBAR_BIN = ./rebar
REPO ?= machi
PKG_REVISION ?= $(shell git describe --tags)
PKG_BUILD = 1
BASE_DIR = $(shell pwd)
ERLANG_BIN = $(shell dirname $(shell which erl))
REBAR := $(shell which rebar)
ifeq ($(REBAR),)
REBAR = $(BASE_DIR)/rebar
endif
OVERLAY_VARS ?=
EUNIT_OPTS = -v
.PHONY: rel deps package pkgclean edoc
.PHONY: rel stagedevrel deps package pkgclean edoc
all: deps compile
compile:
$(REBAR_BIN) compile
$(REBAR) compile
## Make reltool happy by creating a fake entry in the deps dir for
## machi, because reltool really wants to have a path with
## "machi/ebin" at the end, but we also don't want infinite recursion
## if we just symlink "deps/machi" -> ".."
generate:
rm -rf deps/machi
mkdir deps/machi
ln -s ../../ebin deps/machi
ln -s ../../src deps/machi
$(REBAR) generate $(OVERLAY_VARS) 2>&1 | grep -v 'command does not apply to directory'
deps:
$(REBAR_BIN) get-deps
$(REBAR) get-deps
clean:
$(REBAR_BIN) -r clean
test: deps compile eunit
eunit:
$(REBAR_BIN) -v skip_deps=true eunit
$(REBAR) -r clean
edoc: edoc-clean
$(REBAR_BIN) skip_deps=true doc
$(REBAR) skip_deps=true doc
edoc-clean:
rm -f edoc/*.png edoc/*.html edoc/*.css edoc/edoc-info
pulse: compile
env USE_PULSE=1 $(REBAR_BIN) skip_deps=true clean compile
env USE_PULSE=1 $(REBAR_BIN) skip_deps=true -D PULSE eunit -v
@echo Sorry, PULSE test needs maintenance. -SLF
#env USE_PULSE=1 $(REBAR) skip_deps=true clean compile
#env USE_PULSE=1 $(REBAR) skip_deps=true -D PULSE eunit -v
APPS = kernel stdlib sasl erts ssl compiler eunit crypto
##
## Release targets
##
rel: deps compile generate
relclean:
rm -rf rel/$(REPO)
stage : rel
$(foreach dep,$(wildcard deps/*), rm -rf rel/$(REPO)/lib/$(shell basename $(dep))* && ln -sf $(abspath $(dep)) rel/$(REPO)/lib;)
##
## Developer targets
##
## devN - Make a dev build for node N
## stagedevN - Make a stage dev build for node N (symlink libraries)
## devrel - Make a dev build for 1..$DEVNODES
## stagedevrel Make a stagedev build for 1..$DEVNODES
##
## Example, make a 68 node devrel cluster
## make stagedevrel DEVNODES=68
.PHONY : stagedevrel devrel
DEVNODES ?= 3
# 'seq' is not available on all *BSD, so using an alternate in awk
SEQ = $(shell awk 'BEGIN { for (i = 1; i < '$(DEVNODES)'; i++) printf("%i ", i); print i ;exit(0);}')
$(eval stagedevrel : $(foreach n,$(SEQ),stagedev$(n)))
$(eval devrel : $(foreach n,$(SEQ),dev$(n)))
dev% : all
mkdir -p dev
rel/gen_dev $@ rel/vars/dev_vars.config.src rel/vars/$@_vars.config
(cd rel && ../rebar generate target_dir=../dev/$@ overlay_vars=vars/$@_vars.config)
stagedev% : dev%
$(foreach dep,$(wildcard deps/*), rm -rf dev/$^/lib/$(shell basename $(dep))* && ln -sf $(abspath $(dep)) dev/$^/lib;)
devclean: clean
rm -rf dev
DIALYZER_APPS = kernel stdlib sasl erts ssl compiler eunit crypto public_key syntax_tools
PLT = $(HOME)/.machi_dialyzer_plt
build_plt: deps compile
dialyzer --build_plt --output_plt $(PLT) --apps $(APPS) deps/*/ebin
DIALYZER_DEP_APPS = ebin/machi_pb.beam deps/protobuffs/ebin
DIALYZER_FLAGS = -Wno_return -Wrace_conditions -Wunderspecs
dialyzer: deps compile
dialyzer $(DIALYZER_FLAGS) --plt $(PLT) ebin $(DIALYZER_DEP_APPS) | \
egrep -v -f ./filter-dialyzer-dep-warnings
dialyzer-test: deps compile
echo Force rebar to recompile .eunit dir w/o running tests > /dev/null
rebar skip_deps=true eunit suite=lamport_clock
dialyzer $(DIALYZER_FLAGS) --plt $(PLT) .eunit $(DIALYZER_DEP_APPS) | \
egrep -v -f ./filter-dialyzer-dep-warnings
clean_plt:
rm $(PLT)
include tools.mk

227
README.md
View file

@ -1,59 +1,136 @@
# Machi
# Machi: a distributed, decentralized blob/large file store
Our goal is a robust & reliable, distributed, highly available(*),
large file store based upon write-once registers, append-only files,
Chain Replication, and client-server style architecture. All members
of the cluster store all of the files. Distributed load
balancing/sharding of files is __outside__ of the scope of this
system. However, it is a high priority that this system be able to
integrate easily into systems that do provide distributed load
balancing, e.g., Riak Core. Although strong consistency is a major
feature of Chain Replication, first use cases will focus mainly on
eventual consistency features --- strong consistency design will be
discussed in a separate design document (read more below).
[Travis-CI](http://travis-ci.org/basho/machi) :: ![Travis-CI](https://secure.travis-ci.org/basho/machi.png)
The ability for Machi to maintain strong consistency will make it
attractive as a toolkit for building things like CORFU and Tango as
well as better-known open source software such as Kafka's file
replication. (See the bibliography of the [Machi high level design
doc](./doc/high-level-machi.pdf) for further references.)
Outline
(*) Capable of operating in "AP mode" or "CP mode" relative to the
CAP Theorem.
1. [Why another blob/file store?](#sec1)
2. [Where to learn more about Machi](#sec2)
3. [Development status summary](#sec3)
4. [Contributing to Machi's development](#sec4)
## Status: mid-June 2015: work is underway
<a name="sec1">
## 1. Why another blob/file store?
The two major design documents for Machi are now ready or nearly ready
for internal Basho and external party review. Please see the
[doc](./doc) directory's [README](./doc) for details
Our goal is a robust & reliable, distributed, highly available, large
file and blob store. Such stores already exist, both in the open source world
and in the commercial world. Why reinvent the wheel? We believe
there are three reasons, ordered by decreasing rarity.
* Machi high level design
* Machi chain self-management design
1. We want end-to-end checksums for all file data, from the initial
file writer to every file reader, anywhere, all the time.
2. We need flexibility to trade consistency for availability:
e.g. weak consistency in exchange for being available in cases
of partial system failure.
3. We want to manage file replicas in a way that's provably correct
and also easy to test.
The work of implementing first draft of Machi is now underway. The
code from the [prototype/demo-day-hack](prototype/demo-day-hack/) directory is
being used as the initial scaffolding.
Criteria #3 is difficult to find in the open source world but perhaps
not impossible.
* The chain manager is ready for "AP mode" use in eventual
consistency use cases.
If we have app use cases where availability is more important than
consistency, then systems that meet criteria #2 are also rare.
Most file stores provide only strong consistency and therefore
have unavoidable, unavailable behavior when parts of the system
fail.
What if we want a file store that is always available to write new
file data and attempts best-effort file reads?
* All Machi client/server protocols are based on
[Protocol Buffers](https://developers.google.com/protocol-buffers/docs/overview).
* The current specification for Machi's protocols can be found at
[https://github.com/basho/machi/blob/master/src/machi.proto](https://github.com/basho/machi/blob/master/src/machi.proto).
* The Machi PB protocol is not yet stable. Expect change!
* The Erlang language client implementation of the high-level
protocol flavor is very brittle (e.g., very little error
handling yet).
* The Erlang language client implementation of the low-level
protocol flavor are still a work-in-progress ... but they are
more robust than the high-level library's implementation.
If we really do care about data loss and/or data corruption, then we
really want both #3 and #1. Unfortunately, systems that meet
criteria #1 are _very rare_. (Nonexistant?)
Why? This is 2015. We have decades of research that shows
that computer hardware can (and
indeed does) corrupt data at nearly every level of the modern
client/server application stack. Systems with end-to-end data
corruption detection should be ubiquitous today. Alas, they are not.
If you'd like to work on a protocol such as Thrift, UBF,
msgpack over UDP, or some other protocol, let us know by
[opening an issue](./issues/new) to discuss it.
Machi is an effort to change the deplorable state of the world, one
Erlang function at a time.
## Contributing to Machi: source code, documentation, etc.
<a name="sec2">
## 2. Where to learn more about Machi
The two major design documents for Machi are now mostly stable.
Please see the [doc](./doc) directory's [README](./doc) for details.
We also have a
[Frequently Asked Questions (FAQ) list](./FAQ.md).
Scott recently (November 2015) gave a presentation at the
[RICON 2015 conference](http://ricon.io) about one of the techniques
used by Machi; "Managing Chain Replication Metadata with
Humming Consensus" is available online now.
* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf)
* [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q)
See later in this document for how to run the Humming Consensus demos,
including the network partition simulator.
<a name="sec3">
## 3. Development status summary
Mid-March 2016: The Machi development team has been downsized in
recent months, and the pace of development has slowed. Here is a
summary of the status of Machi's major components.
* Humming Consensus and the chain manager
* No new safety bugs have been found by model-checking tests.
* A new document,
[Hands-on experiments with Machi and Humming Consensus](doc/humming-consensus-demo.md)
is now available. It is a tutorial for setting up a 3 virtual
machine Machi cluster and how to demonstrate the chain manager's
reactions to server stops & starts, crashes & restarts, and pauses
(simulated by `SIGSTOP` and `SIGCONT`).
* The chain manager can still make suboptimal-but-safe choices for
chain transitions when a server hangs/pauses temporarily.
* Recent chain manager changes have made the instability window
much shorter when the slow/paused server resumes execution.
* Scott believes that a modest change to the chain manager's
calculation of a new projection can reduce flapping in this (and
many other cases) less likely. Currently, the new local
projection is calculated using only local state (i.e., the chain
manager's internal state + the fitness server's state).
However, if the "latest" projection read from the public
projection stores were also input to the new projection
calculation function, then many obviously bad projections can be
avoided without needing rounds of Humming Consensus to
demonstrate that a bad projection is bad.
* FLU/data server process
* All known correctness bugs have been fixed.
* Performance has not yet been measured. Performance measurement
and enhancements are scheduled to start in the middle of March 2016.
(This will include a much-needed update to the `basho_bench` driver.)
* Access protocols and client libraries
* The protocol used by both external clients and internally (instead
of using Erlang's native message passing mechanisms) is based on
Protocol Buffers.
* (Machi PB protocol specification: ./src/machi.proto)[./src/machi.proto]
* At the moment, the PB specification contains two protocols.
Sometime in the near future, the spec will be split to separate
the external client API (the "high" protocol) from the internal
communication API (the "low" protocol).
* Recent conference talks about Machi
* Erlang Factory San Francisco 2016
[the slides and video recording](http://www.erlang-factory.com/sfbay2016/scott-lystig-fritchie)
will be available a few weeks after the conference ends on March
11, 2016.
* Ricon 2015
* [The slides](http://ricon.io/archive/2015/slides/Scott_Fritchie_Ricon_2015.pdf)
* and the [video recording](https://www.youtube.com/watch?v=yR5kHL1bu1Q&index=13&list=PL9Jh2HsAWHxIc7Tt2M6xez_TOP21GBH6M)
are now available.
* If you would like to run the Humming Consensus code (with or without
the network partition simulator) as described in the RICON 2015
presentation, please see the
[Humming Consensus demo doc](./doc/humming_consensus_demo.md).
<a name="sec4">
## 4. Contributing to Machi's development
### 4.1 License
Basho Technologies, Inc. as committed to licensing all work for Machi
under the
@ -69,57 +146,29 @@ We invite all contributors to review the
[CONTRIBUTING.md](./CONTRIBUTING.md) document for guidelines for
working with the Basho development team.
## A brief survey of this directories in this repository
* A list of Frequently Asked Questions, a.k.a.
[the Machi FAQ](./FAQ.md).
* The [doc](./doc/) directory: home for major documents about Machi:
high level design documents as well as exploration of features still
under design & review within Basho.
* The `ebin` directory: used for compiled application code
* The `include`, `src`, and `test` directories: contain the header
files, source files, and test code for Machi, respectively.
* The [prototype](./prototype/) directory: contains proof of concept
code, scaffolding libraries, and other exploratory code. Curious
readers should see the [prototype/README.md](./prototype/README.md)
file for more explanation of the small sub-projects found here.
## Development environment requirements
### 4.2 Development environment requirements
All development to date has been done with Erlang/OTP version 17 on OS
X. The only known limitations for using R16 are minor type
specification difference between R16 and 17, but we strongly suggest
continuing development using version 17.
We also assume that you have the standard UNIX/Linux developers
tool chain for C and C++ applications. Specifically, we assume `make`
is available. The utility used to compile the Machi source code,
We also assume that you have the standard UNIX/Linux developer
tool chain for C and C++ applications. Also, we assume
that Git and GNU Make are available.
The utility used to compile the Machi source code,
`rebar`, is pre-compiled and included in the repo.
For more details, please see the
[Machi development environment prerequisites doc](./doc/dev-prerequisites.md).
There are no known OS limits at this time: any platform that supports
Erlang/OTP should be sufficient for Machi. This may change over time
(e.g., adding NIFs which can make full portability to Windows OTP
environments difficult), but it hasn't happened yet.
Machi has a dependency on the
[ELevelDB](https://github.com/basho/eleveldb) library. ELevelDB only
supports UNIX/Linux OSes and 64-bit versions of Erlang/OTP only; we
apologize to Windows-based and 32-bit-based Erlang developers for this
restriction.
## Contributions
Basho encourages contributions to Riak from the community. Heres how
to get started.
* Fork the appropriate sub-projects that are affected by your change.
* Create a topic branch for your change and checkout that branch.
git checkout -b some-topic-branch
* Make your changes and run the test suite if one is provided. (see below)
* Commit your changes and push them to your fork.
* Open pull-requests for the appropriate projects.
* Contributors will review your pull request, suggest changes, and merge it when its ready and/or offer feedback.
* To report a bug or issue, please open a new issue against this repository.
-The Machi team at Basho,
[Scott Lystig Fritchie](mailto:scott@basho.com), technical lead, and
[Matt Brender](mailto:mbrender@basho.com), your developer advocate.
### 4.3 New protocols and features
If you'd like to work on a protocol such as Thrift, UBF,
msgpack over UDP, or some other protocol, let us know by
[opening an issue to discuss it](./issues/new).

View file

@ -55,3 +55,60 @@ func, and pattern match Erlang style in that func.
** TODO Move the FLU server to gen_server behavior?
* DONE Chain manager CP mode, Plan B
** SKIP Maybe? Change ch_mgr to use middleworker
**** DONE Is it worthwhile? Is the parallelism so important? No, probably.
**** SKIP Move middleworker func to utility module?
** DONE Add new proc to psup group
*** DONE Name: machi_fitness
** DONE ch_mgr keeps its current proc struct: i.e. same 1 proc as today
** NO chmgr asks hosed mgr for hosed list @ start of react_to_env
** DONE For all hosed, do *async*: try to read latest proj.
*** NO If OK, inform hosed mgr: status change will be used by next HC iter.
*** NO If fail, no change, because that server is already known to be hosed
*** DONE For all non-hosed, continue as the chain manager code does today
*** DONE Any new errors are added to UpNodes/DownNodes tracking as used today
*** DONE At end of react loop, if UpNodes list differs, inform hosed mgr.
* DONE fitness_mon, the fitness monitor
** DONE Map key & val sketch
Logical sketch:
Map key: ObservingServerName::atom()
Map val: { ObservingServerLastModTime::now(),
UnfitList::list(ServerName::atom()),
AdminDownList::list(ServerName::atom()),
Props::proplist() }
Implementation sketch:
1. Use CRDT map.
2. If map key is not atom, then atom->string or atom->binary is fine.
3. For map value, is it possible CRDT LWW type?
** DONE Investigate riak_dt data structure definition, manipulating, etc.
** DONE Add dependency on riak_dt
** DONE Update is an entire dict from Observer O
*** DONE Merge my pending map + update map + my last mod time + my unfit list
*** DONE if merged /= pending:
**** DONE Schedule async tick (more)
Tick message contains list of servers with differing state as of this
instant in time... we want to avoid triggering decisions about
fitness/unfitness for other servers where we might have received less
than a full time period's worth of waiting.
**** DONE Spam merged map to All_list -- [Me]
**** DONE Set pending <- merged
*** DONE When we receive an async tick
**** DONE set active map <- pending map for all servers in ticks list
**** DONE Send ch_mgr a react_to_env tick trigger
*** DONE react_to_env tick trigger actions
**** DONE Filter active map to remove stale entries (i.e. no update in 1 hour)
**** DONE If time since last map spam is too long, spam our *pending* map
**** DONE Proceed with normal react processing, using *active* map for AllHosed!

15
dialyzer.ignore-warnings Normal file
View file

@ -0,0 +1,15 @@
### The auto-generated code of machi_pb.beam has some complaints, not fixed yet.
machi_pb.erl:0:
##################################################
######## Specific types #####################
##################################################
Unknown types:
basho_bench_config:get/2
machi_partition_simulator:get/1
hamcrest:matchspec/0
##################################################
######## Specific messages #####################
##################################################
machi_chain_manager1.erl:2473: The created fun has no local return
machi_chain_manager1.erl:2184: The pattern <_P1, P2, Else = {'expected_author2', UPI1_tail, _}> can never match the type <#projection_v1{epoch_number::'undefined' | non_neg_integer(),epoch_csum::'undefined' | binary(),author_server::atom(),chain_name::atom(),all_members::'undefined' | [atom()],witnesses::[atom()],creation_time::'undefined' | {non_neg_integer(),non_neg_integer(),non_neg_integer()},mode::'ap_mode' | 'cp_mode',upi::'undefined' | [atom()],repairing::'undefined' | [atom()],down::'undefined' | [atom()],dbg::'undefined' | [any()],dbg2::'undefined' | [any()],members_dict::'undefined' | [{_,_}]},#projection_v1{epoch_number::'undefined' | non_neg_integer(),epoch_csum::binary(),author_server::atom(),chain_name::atom(),all_members::'undefined' | [atom()],witnesses::[atom()],creation_time::'undefined' | {non_neg_integer(),non_neg_integer(),non_neg_integer()},mode::'ap_mode' | 'cp_mode',upi::'undefined' | [atom()],repairing::'undefined' | [atom()],down::'undefined' | [atom()],dbg::'undefined' | [any()],dbg2::'undefined' | [any()],members_dict::'undefined' | [{_,_}]},'true'>
machi_chain_manager1.erl:2233: The pattern <_P1 = {'projection_v1', _, _, _, _, _, _, _, 'cp_mode', UPI1, Repairing1, _, _, _, _}, _P2 = {'projection_v1', _, _, _, _, _, _, _, 'cp_mode', UPI2, Repairing2, _, _, _, _}, Else = {'epoch_not_si', EpochX, 'not_gt', EpochY}> can never match the type <#projection_v1{epoch_number::'undefined' | non_neg_integer(),epoch_csum::'undefined' | binary(),author_server::atom(),chain_name::atom(),all_members::'undefined' | [atom()],witnesses::[atom()],creation_time::'undefined' | {non_neg_integer(),non_neg_integer(),non_neg_integer()},mode::'ap_mode' | 'cp_mode',upi::'undefined' | [atom()],repairing::'undefined' | [atom()],down::'undefined' | [atom()],dbg::'undefined' | [any()],dbg2::'undefined' | [any()],members_dict::'undefined' | [{_,_}]},#projection_v1{epoch_number::'undefined' | non_neg_integer(),epoch_csum::binary(),author_server::atom(),chain_name::atom(),all_members::'undefined' | [atom()],witnesses::[atom()],creation_time::'undefined' | {non_neg_integer(),non_neg_integer(),non_neg_integer()},mode::'ap_mode' | 'cp_mode',upi::'undefined' | [atom()],repairing::'undefined' | [atom()],down::'undefined' | [atom()],dbg::'undefined' | [any()],dbg2::'undefined' | [any()],members_dict::'undefined' | [{_,_}]},'true'>

View file

@ -6,20 +6,6 @@ Erlang documentation, please use this link:
## Documents in this directory
### chain-self-management-sketch.org
[chain-self-management-sketch.org](chain-self-management-sketch.org)
is a mostly-deprecated draft of
an introduction to the
self-management algorithm proposed for Machi. Most material has been
moved to the [high-level-chain-mgr.pdf](high-level-chain-mgr.pdf) document.
### cluster-of-clusters (directory)
This directory contains the sketch of the "cluster of clusters" design
strawman for partitioning/distributing/sharding files across a large
number of independent Machi clusters.
### high-level-machi.pdf
[high-level-machi.pdf](high-level-machi.pdf)
@ -50,9 +36,9 @@ introduction to the Humming Consensus algorithm. Its abstract:
> of file updates to all replica servers in a Machi cluster. Chain
> Replication is a variation of primary/backup replication where the
> order of updates between the primary server and each of the backup
> servers is strictly ordered into a single ``chain''. Management of
> Chain Replication's metadata, e.g., ``What is the current order of
> servers in the chain?'', remains an open research problem. The
> servers is strictly ordered into a single "chain". Management of
> Chain Replication's metadata, e.g., "What is the current order of
> servers in the chain?", remains an open research problem. The
> current state of the art for Chain Replication metadata management
> relies on an external oracle (e.g., ZooKeeper) or the Elastic
> Replication algorithm.
@ -60,7 +46,7 @@ introduction to the Humming Consensus algorithm. Its abstract:
> This document describes the Machi chain manager, the component
> responsible for managing Chain Replication metadata state. The chain
> manager uses a new technique, based on a variation of CORFU, called
> ``humming consensus''.
> "humming consensus".
> Humming consensus does not require active participation by all or even
> a majority of participants to make decisions. Machi's chain manager
> bases its logic on humming consensus to make decisions about how to
@ -71,3 +57,18 @@ introduction to the Humming Consensus algorithm. Its abstract:
> decision during that epoch. When a differing decision is discovered,
> new time epochs are proposed in which a new consensus is reached and
> disseminated to all available participants.
### chain-self-management-sketch.org
[chain-self-management-sketch.org](chain-self-management-sketch.org)
is a mostly-deprecated draft of
an introduction to the
self-management algorithm proposed for Machi. Most material has been
moved to the [high-level-chain-mgr.pdf](high-level-chain-mgr.pdf) document.
### cluster (directory)
This directory contains the sketch of the cluster design
strawman for partitioning/distributing/sharding files across a large
number of independent Machi chains.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.8 KiB

View file

@ -1,435 +0,0 @@
-*- mode: org; -*-
#+TITLE: Machi cluster-of-clusters "name game" sketch
#+AUTHOR: Scott
#+STARTUP: lognotedone hidestars indent showall inlineimages
#+SEQ_TODO: TODO WORKING WAITING DONE
#+COMMENT: M-x visual-line-mode
#+COMMENT: Also, disable auto-fill-mode
* 1. "Name Games" with random-slicing style consistent hashing
Our goal: to distribute lots of files very evenly across a cluster of
Machi clusters (hereafter called a "cluster of clusters" or "CoC").
* 2. Assumptions
** Basic familiarity with Machi high level design and Machi's "projection"
The [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] contains all of the basic
background assumed by the rest of this document.
** Familiarity with the Machi cluster-of-clusters/CoC concept
This isn't yet well-defined (April 2015). However, it's clear from
the [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] that Machi alone does not support
any kind of file partitioning/distribution/sharding across multiple
small Machi clusters. There must be another layer above a Machi cluster to
provide such partitioning services.
The name "cluster of clusters" orignated within Basho to avoid
conflicting use of the word "cluster". A Machi cluster is usually
synonymous with a single Chain Replication chain and a single set of
machines (e.g. 2-5 machines). However, in the not-so-far future, we
expect much more complicated patterns of Chain Replication to be used
in real-world deployments.
"Cluster of clusters" is clunky and long, but we haven't found a good
substitute yet. If you have a good suggestion, please contact us!
~^_^~
Using the [[https://github.com/basho/machi/tree/master/prototype/demo-day-hack][cluster-of-clusters quick-and-dirty prototype]] as an
architecture sketch, let's now assume that we have ~N~ independent Machi
clusters. We wish to provide partitioned/distributed file storage
across all ~N~ clusters. We call the entire collection of ~N~ Machi
clusters a "cluster of clusters", or abbreviated "CoC".
** Continue CoC prototype's assumption: a Machi cluster is unaware of CoC
Let's continue with an assumption that an individual Machi cluster
inside of the cluster-of-clusters is completely unaware of the
cluster-of-clusters layer.
We may need to break this assumption sometime in the future? It isn't
quite clear yet, sorry.
** Analogy: "neighborhood : city :: Machi : cluster-of-clusters"
Analogy: The word "machi" in Japanese means small town or
neighborhood. As the Tokyo Metropolitan Area is built from many
machis and smaller cities, therefore a big, partitioned file store can
be built out of many small Machi clusters.
** The reader is familiar with the random slicing technique
I'd done something very-very-nearly-identical for the Hibari database
6 years ago. But the Hibari technique was based on stuff I did at
Sendmail, Inc, so it felt old news to me. {shrug}
The Hibari documentation has a brief photo illustration of how random
slicing works, see [[http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration][Hibari Sysadmin Guide, chain migration]]
For a comprehensive description, please see these two papers:
#+BEGIN_QUOTE
Reliable and Randomized Data Distribution Strategies for Large Scale Storage Systems
Alberto Miranda et al.
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.226.5609
(short version, HIPC'11)
Random Slicing: Efficient and Scalable Data Placement for Large-Scale
Storage Systems
Alberto Miranda et al.
DOI: http://dx.doi.org/10.1145/2632230 (long version, ACM Transactions
on Storage, Vol. 10, No. 3, Article 9, 2014)
#+END_QUOTE
** We use random slicing to map CoC file names -> Machi cluster ID/name
We will use a single random slicing map. This map (called ~Map~ in
the descriptions below), together with the random slicing hash
function (called ~rs_hash()~ below), will be used to map:
#+BEGIN_QUOTE
CoC client-visible file name -> Machi cluster ID/name/thingie
#+END_QUOTE
** Machi cluster ID/name management: TBD, but, really, should be simple
The mapping from:
#+BEGIN_QUOTE
Machi CoC member ID/name/thingie -> ???
#+END_QUOTE
... remains To Be Determined. But, really, this is going to be pretty
simple. The ID/name/thingie will probably be a human-friendly,
printable ASCII string, and the "???" will probably be a single Machi
cluster projection data structure.
The Machi projection is enough information to contact any member of
that cluster and, if necessary, request the most up-to-date projection
information required to use that cluster.
It's likely that the projection given by this map will be out-of-date,
so the client must be ready to use the standard Machi procedure to
request the cluster's current projection, in any case.
* 3. A simple illustration
I'm borrowing an illustration from the HibariDB documentation here,
but it fits my purposes quite well. (And I originally created that
image, and the use license is OK.)
#+CAPTION: Illustration of 'Map', using four Machi clusters
[[./migration-4.png]]
Assume that we have a random slicing map called ~Map~. This particular
~Map~ maps the unit interval onto 4 Machi clusters:
| Hash range | Cluster ID |
|-------------+------------|
| 0.00 - 0.25 | Cluster1 |
| 0.25 - 0.33 | Cluster4 |
| 0.33 - 0.58 | Cluster2 |
| 0.58 - 0.66 | Cluster4 |
| 0.66 - 0.91 | Cluster3 |
| 0.91 - 1.00 | Cluster4 |
Then, if we had CoC file name "~foo~", the hash ~SHA("foo")~ maps to about
0.05 on the unit interval. So, according to ~Map~, the value of
~rs_hash("foo",Map) = Cluster1~. Similarly, ~SHA("hello")~ is about
0.67 on the unit interval, so ~rs_hash("hello",Map) = Cluster3~.
* 4. An additional assumption: clients will want some control over file placement
We will continue to use the 4-cluster diagram from the previous
section.
When a client wishes to append data to a Machi file, the Machi server
chooses the file name & byte offset for storing that data. This
feature is why Machi's eventual consistency operating mode is so
nifty: it allows us to merge together files safely at any time because
any two client append operations will always write to different files
& different offsets.
** Our new assumption: client control over initial file placement
The CoC management scheme may decide that files need to migrate to
other clusters. The reason could be for storage load or I/O load
balancing reasons. It could be because a cluster is being
decomissioned by its owners. There are many legitimate reasons why a
file that is initially created on cluster ID X has been moved to
cluster ID Y.
However, there are also legitimate reasons for why the client would want
control over the choice of Machi cluster when the data is first
written. The single biggest reason is load balancing. Assuming that
the client (or the CoC management layer acting on behalf of the CoC
client) knows the current utilization across the participating Machi
clusters, then it may be very helpful to send new append() requests to
under-utilized clusters.
** Cool! Except for a couple of problems...
If the client wants to store some data
on Cluster2 and therefore sends an ~append("foo",CoolData)~ request to
the head of Cluster2 (which the client magically knows how to
contact), then the result will look something like
~{ok,"foo.s923.z47",ByteOffset}~.
Therefore, the file name "~foo.s923.z47~" must be used by any Machi
CoC client in order to retrieve the CoolData bytes.
*** Problem #1: "foo.s923.z47" doesn't always map via random slicing to Cluster2
... if we ignore the problem of "CoC files may be redistributed in the
future", then we still have a problem.
In fact, the value of ~ps_hash("foo.s923.z47",Map)~ is Cluster1.
*** Problem #2: We want CoC files to move around automatically
If the CoC client stores two pieces of information, the file name
"~foo.s923.z47~" and the Cluster ID Cluster2, then what happens when the
cluster-of-clusters system decides to rebalance files across all
machines? The CoC manager may decide to move our file to Cluster66.
How will a future CoC client wishes to retrieve CoolData when Cluster2
no longer stores the required file?
**** When migrating the file, we could put a "pointer" on Cluster2 that points to the new location, Cluster66.
This scheme is a bit brittle, even if all of the pointers are always
created 100% correctly. Also, if Cluster2 is ever unavailable, then
we cannot fetch our CoolData, even though the file moved away from
Cluster2 several years ago.
The scheme would also introduce extra round-trips to the servers
whenever we try to read a file where we do not know the most
up-to-date cluster ID for.
**** We could store a pointer to file "foo.s923.z47"'s location in an LDAP database!
Or we could store it in Riak. Or in another, external database. We'd
rather not create such an external dependency, however. Furthermore,
we would also have the same problem of updating this external database
each time that a file is moved/rebalanced across the CoC.
* 5. Proposal: Break the opacity of Machi file names, slightly
Assuming that Machi keeps the scheme of creating file names (in
response to ~append()~ and ~sequencer_new_range()~ calls) based on a
predictable client-supplied prefix and an opaque suffix, e.g.,
~append("foo",CoolData) -> {ok,"foo.s923.z47",ByteOffset}.~
... then we propose that all CoC and Machi parties be aware of this
naming scheme, i.e. that Machi assigns file names based on:
~ClientSuppliedPrefix ++ "." ++ SomeOpaqueFileNameSuffix~
The Machi system doesn't care about the file name -- a Machi server
will treat the entire file name as an opaque thing. But this document
is called the "Name Game" for a reason!
What if the CoC client could peek inside of the opaque file name
suffix in order to remove (or add) the CoC location information that
we need?
** The details: legend
- ~T~ = the target CoC member/Cluster ID chosen at the time of ~append()~
- ~p~ = file prefix, chosen by the CoC client (This is exactly the Machi client-chosen file prefix).
- ~s.z~ = the Machi file server opaque file name suffix (Which we
happen to know is a combination of sequencer ID plus file serial
number. This implementation may change, for example, to use a
standard GUID string (rendered into ASCII hexadecimal digits) instead.)
- ~K~ = the CoC placement key
We use a variation of ~rs_hash()~, called ~rs_hash_with_float()~. The
former uses a string as its 1st argument; the latter uses a floating
point number as its 1st argument. Both return a cluster ID name
thingie.
#+BEGIN_SRC erlang
%% type specs, Erlang style
-spec rs_hash(string(), rs_hash:map()) -> rs_hash:cluster_id().
-spec rs_hash_with_float(float(), rs_hash:map()) -> rs_hash:cluster_id().
#+END_SRC
NOTE: Use of floating point terms is not required. For example,
integer arithmetic could be used, if using a sufficiently large
interval to create an even & smooth distribution of hashes across the
expected maximum number of clusters.
For example, if the maximum CoC cluster size would be 4,000 individual
Machi clusters, then a minimum of 12 bits of integer space is required
to assign one integer per Machi cluster. However, for load balancing
purposes, a finer grain of (for example) 100 integers per Machi
cluster would permit file migration to move increments of
approximately 1% of single Machi cluster's storage capacity. A
minimum of 19 bits of hash space would be necessary to accomodate
these constraints.
** The details: CoC file write
1. CoC client chooses ~p~ and ~T~ (i.e., the file prefix & target cluster)
2. CoC client knows the CoC ~Map~
3. CoC client calculates a value ~K~ such that ~rs_hash_with_float(K,Map) = T~, using the method described below.
4. CoC client requests @ cluster ~T~: ~append_chunk(p,...) -> {ok,p.K.s.z,ByteOffset}~
5. CoC stores/uses the file name ~p.K.s.z~.
** The details: CoC file read
1. CoC client knows the file name ~p.K.s.z~ and parses it to find
~K~'s value.
2. CoC client knows the CoC ~Map~
3. Coc calculates ~rs_hash_with_float(K,Map) = T~
4. CoC client requests @ cluster ~T~: ~read_chunk(p.K.s.z,...) ->~ ... success!
** The details: calculating 'K', the CoC placement key
1. We know ~Map~, the current CoC mapping.
2. We look inside of ~Map~, and we find all of the unit interval ranges
that map to our desired target cluster ~T~. Let's call this list
~MapList = [Range1=(start,end],Range2=(start,end],...]~.
3. In our example, ~T=Cluster2~. The example ~Map~ contains a single
unit interval range for ~Cluster2~, ~[(0.33,0.58]]~.
4. Choose a uniformally random number ~r~ on the unit interval.
5. Calculate placement key ~K~ by mapping ~r~ onto the concatenation
of the CoC hash space range intervals in ~MapList~. For example,
if ~r=0.5~, then ~K = 0.33 + 0.5*(0.58-0.33) = 0.455~, which is
exactly in the middle of the ~(0.33,0.58]~ interval.
6. If necessary, encode ~K~ in a file name-friendly manner, e.g., convert it to hexadecimal ASCII digits to create file name ~p.K.s.z~.
** The details: calculating 'K', an alternative method
If the Law of Large Numbers and our random number generator do not create the kind of smooth & even distribution of files across the CoC as we wish, an alternative method of calculating ~K~ follows.
If each server in each Machi cluster keeps track of the CoC ~Map~ and also of all values of ~K~ for all files that it stores, then we can simply ask a cluster member to recommend a value of ~K~ that is least represented by existing files.
* 6. File migration (aka rebalancing/reparitioning/redistribution)
** What is "file migration"?
As discussed in section 5, the client can have good reason for wanting
to have some control of the initial location of the file within the
cluster. However, the cluster manager has an ongoing interest in
balancing resources throughout the lifetime of the file. Disks will
get full, hardware will change, read workload will fluctuate,
etc etc.
This document uses the word "migration" to describe moving data from
one CoC cluster to another. In other systems, this process is
described with words such as rebalancing, repartitioning, and
resharding. For Riak Core applications, the mechanisms are "handoff"
and "ring resizing". See the [[http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsUserGuide.html#Balancer][Hadoop file balancer]] for another example.
A simple variation of the Random Slicing hash algorithm can easily
accomodate Machi's need to migrate files without interfering with
availability. Machi's migration task is much simpler due to the
immutable nature of Machi file data.
** Change to Random Slicing
The map used by the Random Slicing hash algorithm needs a few simple
changes to make file migration straightforward.
- Add a "generation number", a strictly increasing number (similar to
a Machi cluster's "epoch number") that reflects the history of
changes made to the Random Slicing map
- Use a list of Random Slicing maps instead of a single map, one map
per possibility that files may not have been migrated yet out of
that map.
As an example:
#+CAPTION: Illustration of 'Map', using four Machi clusters
[[./migration-3to4.png]]
And the new Random Slicing map might look like this:
| Generation number | 7 |
|-------------------+------------|
| SubMap | 1 |
|-------------------+------------|
| Hash range | Cluster ID |
|-------------------+------------|
| 0.00 - 0.33 | Cluster1 |
| 0.33 - 0.66 | Cluster2 |
| 0.66 - 1.00 | Cluster3 |
|-------------------+------------|
| SubMap | 2 |
|-------------------+------------|
| Hash range | Cluster ID |
|-------------------+------------|
| 0.00 - 0.25 | Cluster1 |
| 0.25 - 0.33 | Cluster4 |
| 0.33 - 0.58 | Cluster2 |
| 0.58 - 0.66 | Cluster4 |
| 0.66 - 0.91 | Cluster3 |
| 0.91 - 1.00 | Cluster4 |
When a new Random Slicing map contains a single submap, then its use
is identical to the original Random Slicing algorithm. If the map
contains multiple submaps, then the access rules change a bit:
- Write operations always go to the latest/largest submap.
- Read operations attempt to read from all unique submaps.
- Skip searching submaps that refer to the same cluster ID.
- In this example, unit interval value 0.10 is mapped to Cluster1
by both submaps.
- Read from latest/largest submap to oldest/smallest submap.
- If not found in any submap, search a second time (to handle races
with file copying between submaps).
- If the requested data is found, optionally copy it directly to the
latest submap (as a variation of read repair which really simply
accelerates the migration process and can reduce the number of
operations required to query servers in multiple submaps).
The cluster-of-clusters manager is responsible for:
- Managing the various generations of the CoC Random Slicing maps,
including distributing them to CoC clients.
- Managing the processes that are responsible for copying "cold" data,
i.e., files data that is not regularly accessed, to its new submap
location.
- When migration of a file to its new cluster is confirmed successful,
delete it from the old cluster.
In example map #7, the CoC manager will copy files with unit interval
assignments in ~(0.25,0.33]~, ~(0.58,0.66]~, and ~(0.91,1.00]~ from their
old locations in cluster IDs Cluster1/2/3 to their new cluster,
Cluster4. When the CoC manager is satisfied that all such files have
been copied to Cluster4, then the CoC manager can create and
distribute a new map, such as:
| Generation number | 8 |
|-------------------+------------|
| SubMap | 1 |
|-------------------+------------|
| Hash range | Cluster ID |
|-------------------+------------|
| 0.00 - 0.25 | Cluster1 |
| 0.25 - 0.33 | Cluster4 |
| 0.33 - 0.58 | Cluster2 |
| 0.58 - 0.66 | Cluster4 |
| 0.66 - 0.91 | Cluster3 |
| 0.91 - 1.00 | Cluster4 |
One limitation of HibariDB that I haven't fixed is not being able to
perform more than one migration at a time. The trade-off is that such
migration is difficult enough across two submaps; three or more
submaps becomes even more complicated.
Fortunately for Machi, its file data is immutable and therefore can
easily manage many migrations in parallel, i.e., its submap list may
be several maps long, each one for an in-progress file migration.
* Acknowledgements
The source for the "migration-4.png" and "migration-3to4.png" images
come from the [[http://hibari.github.io/hibari-doc/images/migration-3to4.png][HibariDB documentation]].

View file

@ -0,0 +1,103 @@
#FIG 3.2 Produced by xfig version 3.2.5b
Landscape
Center
Inches
Letter
94.00
Single
-2
1200 2
6 7425 2700 8700 3300
4 0 0 50 -1 2 18 0.0000 4 195 645 7425 2895 After\001
4 0 0 50 -1 2 18 0.0000 4 255 1215 7425 3210 Migration\001
-6
6 7425 450 8700 1050
4 0 0 50 -1 2 18 0.0000 4 195 780 7425 675 Before\001
4 0 0 50 -1 2 18 0.0000 4 255 1215 7425 990 Migration\001
-6
6 75 1425 6900 2325
6 4875 1425 6900 2325
6 5400 1575 6375 2175
4 0 0 50 -1 2 14 0.0000 4 165 390 5400 1800 Not\001
4 0 0 50 -1 2 14 0.0000 4 225 945 5400 2100 migrated\001
-6
2 2 1 2 0 7 50 -1 -1 6.000 0 0 -1 0 0 5
4950 1500 6825 1500 6825 2250 4950 2250 4950 1500
-6
6 2475 1425 4500 2325
6 3000 1575 3975 2175
4 0 0 50 -1 2 14 0.0000 4 165 390 3000 1800 Not\001
4 0 0 50 -1 2 14 0.0000 4 225 945 3000 2100 migrated\001
-6
2 2 1 2 0 7 50 -1 -1 6.000 0 0 -1 0 0 5
2550 1500 4425 1500 4425 2250 2550 2250 2550 1500
-6
6 75 1425 2100 2325
6 600 1575 1575 2175
4 0 0 50 -1 2 14 0.0000 4 165 390 600 1800 Not\001
4 0 0 50 -1 2 14 0.0000 4 225 945 600 2100 migrated\001
-6
2 2 1 2 0 7 50 -1 -1 6.000 0 0 -1 0 0 5
150 1500 2025 1500 2025 2250 150 2250 150 1500
-6
-6
2 1 0 2 0 7 50 -1 -1 6.000 0 0 -1 1 0 2
1 1 3.00 60.00 120.00
150 4200 150 3750
2 1 0 2 0 7 50 -1 -1 6.000 0 0 -1 1 0 2
1 1 3.00 60.00 120.00
3750 4200 3750 3750
2 1 0 2 0 7 50 -1 -1 6.000 0 0 -1 1 0 2
1 1 3.00 60.00 120.00
2025 4200 2025 3750
2 1 0 2 0 7 50 -1 -1 6.000 0 0 -1 1 0 2
1 1 3.00 60.00 120.00
7350 4200 7350 3750
2 1 0 2 0 7 50 -1 -1 6.000 0 0 -1 1 0 2
1 1 3.00 60.00 120.00
5550 4200 5550 3750
2 2 0 3 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
2550 0 2550 1500 150 1500 150 0 2550 0
2 2 0 3 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
4950 0 4950 1500 2550 1500 2550 0 4950 0
2 2 0 3 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
7350 0 7350 1500 4950 1500 4950 0 7350 0
2 2 0 3 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
150 2250 2025 2250 2025 3750 150 3750 150 2250
2 2 0 3 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
4425 2250 4950 2250 4950 3750 4425 3750 4425 2250
2 2 0 3 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
4950 2250 6825 2250 6825 3750 4950 3750 4950 2250
2 2 0 3 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
6825 2250 7350 2250 7350 3750 6825 3750 6825 2250
2 2 0 3 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
2025 2250 2550 2250 2550 3750 2025 3750 2025 2250
2 2 0 3 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
2550 2250 4425 2250 4425 3750 2550 3750 2550 2250
4 0 0 50 -1 2 18 0.0000 4 195 480 75 4500 0.00\001
4 0 0 50 -1 2 18 0.0000 4 195 480 6825 4500 1.00\001
4 0 0 50 -1 2 18 0.0000 4 195 480 1725 4500 0.25\001
4 0 0 50 -1 2 18 0.0000 4 195 480 3525 4500 0.50\001
4 0 0 50 -1 2 18 0.0000 4 195 480 5250 4500 0.75\001
4 0 0 50 -1 2 14 0.0000 4 240 1710 450 1275 ~33% total keys\001
4 0 0 50 -1 2 14 0.0000 4 240 1710 2925 1275 ~33% total keys\001
4 0 0 50 -1 2 14 0.0000 4 240 1710 5250 1275 ~33% total keys\001
4 0 0 50 -1 2 14 0.0000 4 180 495 2025 3525 ~8%\001
4 0 0 50 -1 2 14 0.0000 4 240 1710 300 3525 ~25% total keys\001
4 0 0 50 -1 2 14 0.0000 4 240 1710 2625 3525 ~25% total keys\001
4 0 0 50 -1 2 14 0.0000 4 180 495 4425 3525 ~8%\001
4 0 0 50 -1 2 14 0.0000 4 240 1710 5025 3525 ~25% total keys\001
4 0 0 50 -1 2 14 0.0000 4 180 495 6825 3525 ~8%\001
4 0 0 50 -1 2 24 0.0000 4 270 195 2175 3075 4\001
4 0 0 50 -1 2 24 0.0000 4 270 195 4575 3075 4\001
4 0 0 50 -1 2 24 0.0000 4 270 195 6975 3075 4\001
4 0 0 50 -1 2 24 0.0000 4 270 1245 600 600 Chain1\001
4 0 0 50 -1 2 24 0.0000 4 270 1245 3000 600 Chain2\001
4 0 0 50 -1 2 24 0.0000 4 270 1245 5400 600 Chain3\001
4 0 0 50 -1 2 24 0.0000 4 270 285 2100 2625 C\001
4 0 0 50 -1 2 24 0.0000 4 270 285 4500 2625 C\001
4 0 0 50 -1 2 24 0.0000 4 270 285 6900 2625 C\001
4 0 0 50 -1 2 24 0.0000 4 270 1245 525 2850 Chain1\001
4 0 0 50 -1 2 24 0.0000 4 270 1245 2925 2850 Chain2\001
4 0 0 50 -1 2 24 0.0000 4 270 1245 5325 2850 Chain3\001
4 0 0 50 -1 2 18 0.0000 4 240 4350 1350 4875 Cluster locator, on the unit interval\001

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.6 KiB

BIN
doc/cluster/migration-4.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.4 KiB

View file

@ -0,0 +1,481 @@
-*- mode: org; -*-
#+TITLE: Machi cluster "name game" sketch
#+AUTHOR: Scott
#+STARTUP: lognotedone hidestars indent showall inlineimages
#+SEQ_TODO: TODO WORKING WAITING DONE
#+COMMENT: M-x visual-line-mode
#+COMMENT: Also, disable auto-fill-mode
* 1. "Name Games" with random-slicing style consistent hashing
Our goal: to distribute lots of files very evenly across a large
collection of individual, small Machi chains.
* 2. Assumptions
** Basic familiarity with Machi high level design and Machi's "projection"
The [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] contains all of the basic
background assumed by the rest of this document.
** Analogy: "neighborhood : city :: Machi chain : Machi cluster"
Analogy: The word "machi" in Japanese means small town or
neighborhood. As the Tokyo Metropolitan Area is built from many
machis and smaller cities, therefore a big, partitioned file store can
be built out of many small Machi chains.
** Familiarity with the Machi chain concept
It's clear (I hope!) from
the [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] that Machi alone does not support
any kind of file partitioning/distribution/sharding across multiple
small Machi chains. There must be another layer above a Machi chain to
provide such partitioning services.
Using the [[https://github.com/basho/machi/tree/master/prototype/demo-day-hack][cluster quick-and-dirty prototype]] as an
architecture sketch, let's now assume that we have ~n~ independent Machi
chains. We assume that each of these chains has the same
chain length in the nominal case, e.g. chain length of 3.
We wish to provide partitioned/distributed file storage
across all ~n~ chains. We call the entire collection of ~n~ Machi
chains a "cluster".
We may wish to have several types of Machi clusters. For example:
+ Chain length of 1 for "don't care if it gets lost,
store stuff very very cheaply" data.
+ Chain length of 2 for normal data.
+ Equivalent to quorum replication's reliability with 3 copies.
+ Chain length of 7 for critical, unreplaceable data.
+ Equivalent to quorum replication's reliability with 15 copies.
Each of these types of chains will have a name ~N~ in the
namespace. The role of the cluster namespace will be demonstrated in
Section 3 below.
** Continue an early assumption: a Machi chain is unaware of clustering
Let's continue with an assumption that an individual Machi chain
inside of a cluster is completely unaware of the cluster layer.
** The reader is familiar with the random slicing technique
I'd done something very-very-nearly-like-this for the Hibari database
6 years ago. But the Hibari technique was based on stuff I did at
Sendmail, Inc, in 2000, so this technique feels like old news to me.
{shrug}
The following section provides an illustrated example.
Very quickly, the random slicing algorithm is:
- Hash a string onto the unit interval [0.0, 1.0)
- Calculate h(unit interval point, Map) -> bin, where ~Map~ divides
the unit interval into bins (or partitions or shards).
Machi's adaptation is in step 1: we do not hash any strings. Instead, we
simply choose a number on the unit interval. This number is called
the "cluster locator number".
As described later in this doc, Machi file names are structured into
several components. One component of the file name contains the cluster
locator number; we use the number as-is for step 2 above.
*** For more information about Random Slicing
For a comprehensive description of random slicing, please see the
first two papers. For a quicker summary, please see the third
reference.
#+BEGIN_QUOTE
Reliable and Randomized Data Distribution Strategies for Large Scale Storage Systems
Alberto Miranda et al.
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.226.5609
(short version, HIPC'11)
Random Slicing: Efficient and Scalable Data Placement for Large-Scale
Storage Systems
Alberto Miranda et al.
DOI: http://dx.doi.org/10.1145/2632230 (long version, ACM Transactions
on Storage, Vol. 10, No. 3, Article 9, 2014)
[[http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration][Hibari Sysadmin Guide, chain migration section]].
http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration
#+END_QUOTE
* 3. A simple illustration
We use a variation of the Random Slicing hash that we will call
~rs_hash_with_float()~. The Erlang-style function type is shown
below.
#+BEGIN_SRC erlang
%% type specs, Erlang-style
-spec rs_hash_with_float(float(), rs_hash:map()) -> rs_hash:chain_id().
#+END_SRC
I'm borrowing an illustration from the HibariDB documentation here,
but it fits my purposes quite well. (I am the original creator of that
image, and also the use license is compatible.)
#+CAPTION: Illustration of 'Map', using four Machi chains
[[./migration-4.png]]
Assume that we have a random slicing map called ~Map~. This particular
~Map~ maps the unit interval onto 4 Machi chains:
| Hash range | Chain ID |
|-------------+----------|
| 0.00 - 0.25 | Chain1 |
| 0.25 - 0.33 | Chain4 |
| 0.33 - 0.58 | Chain2 |
| 0.58 - 0.66 | Chain4 |
| 0.66 - 0.91 | Chain3 |
| 0.91 - 1.00 | Chain4 |
Assume that the system chooses a cluster locator of 0.05.
According to ~Map~, the value of
~rs_hash_with_float(0.05,Map) = Chain1~.
Similarly, ~rs_hash_with_float(0.26,Map) = Chain4~.
This example should look very similar to Hibari's technique.
The Hibari documentation has a brief photo illustration of how random
slicing works, see [[http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration][Hibari Sysadmin Guide, chain migration]].
* 4. Use of the cluster namespace: name separation plus chain type
Let us assume that the cluster framework provides several different types
of chains:
| Chain length | Namespace | Consistency Mode | Comment |
|--------------+--------------+------------------+----------------------------------|
| 3 | ~normal~ | eventual | Normal storage redundancy & cost |
| 2 | ~reduced~ | eventual | Reduced cost storage |
| 1 | ~risky~ | eventual | Really, really cheap storage |
| 7 | ~paranoid~ | eventual | Safety-critical storage |
| 3 | ~sequential~ | strong | Strong consistency |
|--------------+--------------+------------------+----------------------------------|
The client may want to choose the amount of redundancy that its
application requires: normal, reduced cost, or perhaps even a single
copy. The cluster namespace is used by the client to signal this
intention.
Further, the cluster administrators may wish to use the namespace to
provide separate storage for different applications. Jane's
application may use the namespace "jane-normal" and Bob's app uses
"bob-reduced". Administrators may definine separate groups of
chains on separate servers to serve these two applications.
* 5. In its lifetime, a file may be moved to different chains
The cluster management scheme may decide that files need to migrate to
other chains -- i.e., file that is initially created on chain ID ~X~
has been moved to chain ID ~Y~.
+ For storage load or I/O load balancing reasons.
+ Because a chain is being decommissioned by the sysadmin.
* 6. Floating point is not required ... it is merely convenient for explanation
NOTE: Use of floating point terms is not required. For example,
integer arithmetic could be used, if using a sufficiently large
interval to create an even & smooth distribution of hashes across the
expected maximum number of chains.
For example, if the maximum cluster size would be 4,000 individual
Machi chains, then a minimum of 12 bits of integer space is required
to assign one integer per Machi chain. However, for load balancing
purposes, a finer grain of (for example) 100 integers per Machi
chain would permit file migration to move increments of
approximately 1% of single Machi chain's storage capacity. A
minimum of 12+7=19 bits of hash space would be necessary to accommodate
these constraints.
It is likely that Machi's final implementation will choose a 24 bit
integer (or perhaps 32 bits) to represent the cluster locator.
* 7. Proposal: Break the opacity of Machi file names, slightly.
Machi assigns file names based on:
~ClientSuppliedPrefix ++ "^" ++ SomeOpaqueFileNameSuffix~
What if some parts of the system could peek inside of the opaque file name
suffix in order to look at the cluster location information that we might
code in the filename suffix?
We break the system into parts that speak two levels of protocols,
"high" and "low".
+ The high level protocol is used outside of the Machi cluster
+ The low level protocol is used inside of the Machi cluster
Both protocols are based on a Protocol Buffers specification and
implementation. Other protocols, such as HTTP, will be added later.
#+BEGIN_SRC
+-----------------------+
| Machi external client |
| e.g. Riak CS |
+-----------------------+
^
| Machi "high" API
| ProtoBuffs protocol Machi cluster boundary: outside
.........................................................................
| Machi cluster boundary: inside
v
+--------------------------+ +------------------------+
| Machi "high" API service | | Machi HTTP API service |
+--------------------------+ +------------------------+
^ |
| +------------------------+
v v
+------------------------+
| Cluster bridge service |
+------------------------+
^
| Machi "low" API
| ProtoBuffs protocol
+----------------------------------------+----+----+
| | | |
v v v v
+-------------------------+ ... other chains...
| Chain C1 (logical view) |
| +--------------+ |
| | FLU server 1 | |
| | +--------------+ |
| +--| FLU server 2 | |
| +--------------+ | In reality, API bridge talks directly
+-------------------------+ to each FLU server in a chain.
#+END_SRC
** The notation we use
- ~N~ = the cluster namespace, chosen by the client.
- ~p~ = file prefix, chosen by the client.
- ~L~ = the cluster locator (a number, type is implementation-dependent)
- ~Map~ = a mapping of cluster locators to chains
- ~T~ = the target chain ID/name
- ~u~ = a unique opaque file name suffix, e.g. a GUID string
- ~F~ = a Machi file name, i.e., a concatenation of ~p^L^N^u~
** The details: cluster file append
0. Cluster client chooses ~N~ and ~p~ (i.e., cluster namespace and
file prefix) and sends the append request to a Machi cluster member
via the Protocol Buffers "high" API.
1. Cluster bridge chooses ~T~ (i.e., target chain), based on criteria
such as disk utilization percentage.
2. Cluster bridge knows the cluster ~Map~ for namespace ~N~.
3. Cluster bridge choose some cluster locator value ~L~ such that
~rs_hash_with_float(L,Map) = T~ (see algorithm below).
4. Cluster bridge sends its request to chain
~T~: ~append_chunk(p,L,N,...) -> {ok,p^L^N^u,ByteOffset}~
5. Cluster bridge forwards the reply tuple to the client.
6. Client stores/uses the file name ~F = p^L^N^u~.
** The details: Cluster file read
0. Cluster client sends the read request to a Machi cluster member via
the Protocol Buffers "high" API.
1. Cluster bridge parses the file name ~F~ to find
the values of ~L~ and ~N~ (recall, ~F = p^L^N^u~).
2. Cluster bridge knows the Cluster ~Map~ for type ~N~.
3. Cluster bridge calculates ~rs_hash_with_float(L,Map) = T~
4. Cluster bridge sends request to chain ~T~:
~read_chunk(F,...) ->~ ... reply
5. Cluster bridge forwards the reply to the client.
** The details: calculating 'L' (the cluster locator number) to match a desired target chain
1. We know ~Map~, the current cluster mapping for a cluster namespace ~N~.
2. We look inside of ~Map~, and we find all of the unit interval ranges
that map to our desired target chain ~T~. Let's call this list
~MapList = [Range1=(start,end],Range2=(start,end],...]~.
3. In our example, ~T=Chain2~. The example ~Map~ contains a single
unit interval range for ~Chain2~, ~[(0.33,0.58]]~.
4. Choose a uniformly random number ~r~ on the unit interval.
5. Calculate the cluster locator ~L~ by mapping ~r~ onto the concatenation
of the cluster hash space range intervals in ~MapList~. For example,
if ~r=0.5~, then ~L = 0.33 + 0.5*(0.58-0.33) = 0.455~, which is
exactly in the middle of the ~(0.33,0.58]~ interval.
** A bit more about the cluster namespaces's meaning and use
For use by Riak CS, for example, we'd likely start with the following
namespaces ... working our way down the list as we add new features
and/or re-implement existing CS features.
- "standard" = Chain length = 3, eventually consistency mode
- "reduced" = Chain length = 2, eventually consistency mode.
- "stanchion7" = Chain length = 7, strong consistency mode. Perhaps
use this namespace for the metadata required to re-implement the
operations that are performed by today's Stanchion application.
We want the cluster framework to:
- provide means of creating and managing
chains of different types, e.g., chain length, consistency mode.
- manage the mapping of cluster namespace
names to the chains in the system.
- provide query functions to map a cluster
namespace name to a cluster map,
e.g. ~get_cluster_latest_map("reduced") -> Map{generation=7,...}~.
* 8. File migration (a.k.a. rebalancing/reparitioning/resharding/redistribution)
** What is "migration"?
This section describes Machi's file migration. Other storage systems
call this process as "rebalancing", "repartitioning", "resharding" or
"redistribution".
For Riak Core applications, it is called "handoff" and "ring resizing"
(depending on the context).
See also the [[http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsUserGuide.html#Balancer][Hadoop file balancer]] for another example of a data
migration process.
As discussed in section 5, the client can have good reason for wanting
to have some control of the initial location of the file within the
chain. However, the chain manager has an ongoing interest in
balancing resources throughout the lifetime of the file. Disks will
get full, hardware will change, read workload will fluctuate,
etc etc.
This document uses the word "migration" to describe moving data from
one Machi chain to another chain within a cluster system.
A simple variation of the Random Slicing hash algorithm can easily
accommodate Machi's need to migrate files without interfering with
availability. Machi's migration task is much simpler due to the
immutable nature of Machi file data.
** Change to Random Slicing
The map used by the Random Slicing hash algorithm needs a few simple
changes to make file migration straightforward.
- Add a "generation number", a strictly increasing number (similar to
a Machi chain's "epoch number") that reflects the history of
changes made to the Random Slicing map
- Use a list of Random Slicing maps instead of a single map, one map
per chance that files may not have been migrated yet out of
that map.
As an example:
#+CAPTION: Illustration of 'Map', using four Machi chains
[[./migration-3to4.png]]
And the new Random Slicing map for some cluster namespace ~N~ might look
like this:
| Generation number / Namespace | 7 / reduced |
|-------------------------------+-------------|
| SubMap | 1 |
|-------------------------------+-------------|
| Hash range | Chain ID |
|-------------------------------+-------------|
| 0.00 - 0.33 | Chain1 |
| 0.33 - 0.66 | Chain2 |
| 0.66 - 1.00 | Chain3 |
|-------------------------------+-------------|
| SubMap | 2 |
|-------------------------------+-------------|
| Hash range | Chain ID |
|-------------------------------+-------------|
| 0.00 - 0.25 | Chain1 |
| 0.25 - 0.33 | Chain4 |
| 0.33 - 0.58 | Chain2 |
| 0.58 - 0.66 | Chain4 |
| 0.66 - 0.91 | Chain3 |
| 0.91 - 1.00 | Chain4 |
When a new Random Slicing map contains a single submap, then its use
is identical to the original Random Slicing algorithm. If the map
contains multiple submaps, then the access rules change a bit:
- Write operations always go to the newest/largest submap.
- Read operations attempt to read from all unique submaps.
- Skip searching submaps that refer to the same chain ID.
- In this example, unit interval value 0.10 is mapped to Chain1
by both submaps.
- Read from newest/largest submap to oldest/smallest submap.
- If not found in any submap, search a second time (to handle races
with file copying between submaps).
- If the requested data is found, optionally copy it directly to the
newest submap. (This is a variation of read repair (RR). RR here
accelerates the migration process and can reduce the number of
operations required to query servers in multiple submaps).
The cluster manager is responsible for:
- Managing the various generations of the cluster Random Slicing maps for
all namespaces.
- Distributing namespace maps to cluster bridges.
- Managing the processes that are responsible for copying "cold" data,
i.e., files data that is not regularly accessed, to its new submap
location.
- When migration of a file to its new chain is confirmed successful,
delete it from the old chain.
In example map #7, the cluster manager will copy files with unit interval
assignments in ~(0.25,0.33]~, ~(0.58,0.66]~, and ~(0.91,1.00]~ from their
old locations in chain IDs Chain1/2/3 to their new chain,
Chain4. When the cluster manager is satisfied that all such files have
been copied to Chain4, then the cluster manager can create and
distribute a new map, such as:
| Generation number / Namespace | 8 / reduced |
|-------------------------------+-------------|
| SubMap | 1 |
|-------------------------------+-------------|
| Hash range | Chain ID |
|-------------------------------+-------------|
| 0.00 - 0.25 | Chain1 |
| 0.25 - 0.33 | Chain4 |
| 0.33 - 0.58 | Chain2 |
| 0.58 - 0.66 | Chain4 |
| 0.66 - 0.91 | Chain3 |
| 0.91 - 1.00 | Chain4 |
The HibariDB system performs data migrations in almost exactly this
manner. However, one important
limitation of HibariDB is not being able to
perform more than one migration at a time. HibariDB's data is
mutable. Mutation causes many problems when migrating data
across two submaps; three or more submaps was too complex to implement
quickly and correctly.
Fortunately for Machi, its file data is immutable and therefore can
easily manage many migrations in parallel, i.e., its submap list may
be several maps long, each one for an in-progress file migration.
* 9. Other considerations for FLU/sequencer implementations
** Append to existing file when possible
The sequencer should always assign new offsets to the latest/newest
file for any prefix, as long as all prerequisites are also true,
- The epoch has not changed. (In AP mode, epoch change -> mandatory
file name suffix change.)
- The cluster locator number is stable.
- The latest file for prefix ~p~ is smaller than maximum file size for
a FLU's configuration.
The stability of the cluster locator number is an implementation detail that
must be managed by the cluster bridge.
Reuse of the same file is not possible if the bridge always chooses a
different cluster locator number ~L~ or if the client always uses a unique
file prefix ~p~. The latter is a sign of a misbehaved client; the
former is a poorly-implemented bridge.
* 10. Acknowledgments
The original source for the "migration-4.png" and "migration-3to4.png" images
come from the [[http://hibari.github.io/hibari-doc/images/migration-3to4.png][HibariDB documentation]].

30
doc/dev-clone-compile.md Normal file
View file

@ -0,0 +1,30 @@
# Clone and compile Machi
Clone the Machi source repo and compile the source and test code. Run
the following commands at your login shell:
cd /tmp
git clone https://github.com/basho/machi.git
cd machi
git checkout master
make # or 'gmake' if GNU make uses an alternate name
Then run the unit test suite. This may take up to two minutes or so
to finish.
make test
At the end, the test suite should report that all tests passed. The
actual number of tests shown in the "All `X` tests passed" line may be
different than the example below.
[... many lines omitted ...]
module 'event_logger'
module 'chain_mgr_legacy'
=======================================================
All 90 tests passed.
If you had a test failure, a likely cause may be a limit on the number
of file descriptors available to your user process. (Recent releases
of OS X have a limit of 1024 file descriptors, which may be too slow.)
The output of the `limit -n` will tell you your file descriptor limit.

38
doc/dev-prerequisites.md Normal file
View file

@ -0,0 +1,38 @@
## Machi developer environment prerequisites
1. Machi requires an 64-bit variant of UNIX: OS X, FreeBSD, Linux, or
Solaris machine is a standard developer environment for C and C++
applications (64-bit versions).
2. You'll need the `git` source management utility.
3. You'll need the 64-bit Erlang/OTP 17 runtime environment. Please
don't use earlier or later versions until we have a chance to fix
the compilation warnings that versions R16B and 18 will trigger.
Also, please verify that you are not using a 32-bit Erlang/OTP
runtime package.
For `git` and the Erlang runtime, please use your OS-specific
package manager to install these. If your package manager doesn't
have 64-bit Erlang/OTP version 17 available, then we recommend using the
[precompiled packages available at Erlang Solutions](https://www.erlang-solutions.com/resources/download.html).
Also, please verify that you have enough file descriptors available to
your user processes. The output of `ulimit -n` should report at least
4,000 file descriptors available. If your limit is lower (a frequent
problem for OS X users), please increase it to at least 4,000.
# Using Vagrant to set up a developer environment for Machi
The Machi source directory contains a `Vagrantfile` for creating an
Ubuntu Linux-based virtual machine for compiling and running Machi.
This file is in the
[$SRC_TOP/priv/humming-consensus-demo.vagrant](../priv/humming-consensus-demo.vagrant)
directory.
If used as-is, the virtual machine specification is modest.
* 1 virtual CPU
* 512MB virtual memory
* 768MB swap space
* 79GB sparse virtual disk image. After installing prerequisites and
compiling Machi, the root file system uses approximately 2.7 GBytes.

View file

@ -0,0 +1,617 @@
FLU and Chain Life Cycle Management -*- mode: org; -*-
#+STARTUP: lognotedone hidestars indent showall inlineimages
#+COMMENT: To generate the outline section: egrep '^\*[*]* ' doc/flu-and-chain-lifecycle.org | egrep -v '^\* Outline' | sed -e 's/^\*\*\* / + /' -e 's/^\*\* / + /' -e 's/^\* /+ /'
* FLU and Chain Life Cycle Management
In an ideal world, we (the Machi development team) would have a full
vision of how Machi would be managed, down to the last detail of
beautiful CLI character and network protocol bit. Our vision isn't
complete yet, so we are working one small step at a time.
* Outline
+ FLU and Chain Life Cycle Management
+ Terminology review
+ Terminology: Machi run-time components/services/thingies
+ Terminology: Machi chain data structures
+ Terminology: Machi cluster data structures
+ Overview of administrative life cycles
+ Cluster administrative life cycle
+ Chain administrative life cycle
+ FLU server administrative life cycle
+ Quick admin: declarative management of Machi FLU and chain life cycles
+ Quick admin uses the "rc.d" config scheme for life cycle management
+ Quick admin's declarative "language": an Erlang-flavored AST
+ Term 'host': define a new host for FLU services
+ Term 'flu': define a new FLU
+ Term 'chain': define or reconfigure a chain
+ Executing quick admin AST files via the 'machi-admin' utility
+ Checking the syntax of an AST file
+ Executing an AST file
+ Using quick admin to manage multiple machines
+ The "rc.d" style configuration file scheme
+ Riak had a similar configuration file editing problem (and its solution)
+ Machi's "rc.d" file scheme.
+ FLU life cycle management using "rc.d" style files
+ The key configuration components of a FLU
+ Chain life cycle management using "rc.d" style files
+ The key configuration components of a chain
* Terminology review
** Terminology: Machi run-time components/services/thingies
+ FLU: a basic Machi server, responsible for managing a collection of
files.
+ Chain: a small collection of FLUs that maintain replicas of the same
collection of files. A chain is usually small, 1-3 servers, where
more than 3 would be used only in cases when availability of
certain data is critical despite failures of several machines.
+ The length of a chain is directly proportional to its
replication factor, e.g., a chain length=3 will maintain
(nominally) 3 replicas of each file.
+ To maintain file availability when ~F~ failures have occurred, a
chain must be at least ~F+1~ members long. (In comparison, the
quorum replication technique requires ~2F+1~ members in the
general case.)
+ Cluster: A collection of Machi chains that are used to store files
in a horizontally partitioned/sharded/distributed manner.
** Terminology: Machi data structures
+ Projection: used to define a single chain: the chain's consistency
mode (strong or eventual consistency), all members (from an
administrative point of view), all active members (from a runtime,
automatically-managed point of view), repairing/file-syncing members
(also runtime, auto-managed), and so on
+ Epoch: A version number of a projection. The epoch number is used
by both clients & servers to manage transitions from one projection
to another, e.g., when the chain is temporarily shortened by the
failure of a member FLU server.
** Terminology: Machi cluster data structures
+ Namespace: A collection of human-friendly names that are mapped to
groups of Machi chains that provide the same type of storage
service: consistency mode, replication policy, etc.
+ A single namespace name, e.g. ~normal-ec~, is paired with a single
cluster map (see below).
+ Example: ~normal-ec~ might be a collection of Machi chains in
eventually-consistent mode that are of length=3.
+ Example: ~risky-ec~ might be a collection of Machi chains in
eventually-consistent mode that are of length=1.
+ Example: ~mgmt-critical~ might be a collection of Machi chains in
strongly-consistent mode that are of length=7.
+ Cluster map: Encodes the rules which partition/shard/distribute
the files stored in a particular namespace across a group of chains
that collectively store the namespace's files.
+ Chain weight: A value assigned to each chain within a cluster map
structure that defines the relative storage capacity of a chain
within the namespace. For example, a chain weight=150 has 50% more
capacity than a chain weight=100.
+ Cluster map epoch: The version number assigned to a cluster map.
* Overview of administrative life cycles
** Cluster administrative life cycle
+ Cluster is first created
+ Adds namespaces (e.g. consistency policy + chain length policy) to
the cluster
+ Chains are added to/removed from a namespace to increase/decrease the
namespace's storage capacity.
+ Adjust chain weights within a namespace, e.g., to shift files
within the namespace to chains with greater storage capacity
resources and/or runtime I/O resources.
A cluster "file migration" is the process of moving files from one
namespace member chain to another for purposes of shifting &
re-balancing storage capacity and/or runtime I/O capacity.
** Chain administrative life cycle
+ A chain is created with an initial FLU membership list.
+ Chain may be administratively modified zero or more times to
add/remove member FLU servers.
+ A chain may be decommissioned.
See also: http://basho.github.io/machi/edoc/machi_lifecycle_mgr.html
** FLU server administrative life cycle
+ A FLU is created after an administrator chooses the FLU's runtime
location is selected by the administrator: which machine/virtual
machine, IP address and TCP port allocation, etc.
+ An unassigned FLU may be added to a chain by chain administrative
policy.
+ A FLU that is assigned to a chain may be removed from that chain by
chain administrative policy.
+ In the current implementation, the FLU's Erlang processes will be
halted. Then the FLU's data and metadata files will be moved to
another area of the disk for safekeeping. Later, a "garbage
collection" process can be used for reclaiming disk space used by
halted FLU servers.
See also: http://basho.github.io/machi/edoc/machi_lifecycle_mgr.html
* Quick admin: declarative management of Machi FLU and chain life cycles
The "quick admin" scheme is a temporary (?) tool for managing Machi
FLU server and chain life cycles in a declarative manner. The API is
described in this section.
** Quick admin uses the "rc.d" config scheme for life cycle management
As described at the top of
http://basho.github.io/machi/edoc/machi_lifecycle_mgr.html, the "rc.d"
config files do not manage "policy". "Policy" is doing the right
thing with a Machi cluster from a systems administrator's
point of view. The "rc.d" config files can only implement decisions
made according to policy.
The "quick admin" tool is a first attempt at automating policy
decisions in a safe way (we hope) that is also easy to implement (we
hope) with a variety of systems management tools, e.g. Chef, Puppet,
Ansible, Saltstack, or plain-old-human-at-a-keyboard.
** Quick admin's declarative "language": an Erlang-flavored AST
The "language" that an administrator uses to express desired policy
changes is not (yet) a true language. As a quick implementation hack,
the current language is an Erlang-flavored abstract syntax tree
(AST). The tree isn't very deep, either, frequently just one
element tall. (Not much of a tree, is it?)
There are three terms in the language currently:
+ ~host~, define a new host that can execute FLU servers
+ ~flu~, define a new FLU
+ ~chain~, define a new chain or re-configure an existing chain with
the same name
*** Term 'host': define a new host for FLU services
In this context, a host is a machine, virtual machine, or container
that can execute the Machi application and can therefore provide FLU
services, i.e. file service, Humming Consensus management.
Two formats may be used to define a new host:
#+BEGIN_SRC
{host, Name, Props}.
{host, Name, AdminI, ClientI, Props}.
#+END_SRC
The shorter tuple is shorthand notation for the latter. If the
shorthand form is used, then it will be converted automatically to the
long form as:
#+BEGIN_SRC
{host, Name, AdminI=Name, ClientI=Name, Props}.
#+END_SRC
Type information, description, and restrictions:
+ ~Name::string()~ The ~Name~ attribute must be unique. Note that it
is possible to define two different hosts, one using a DNS hostname
and one using an IP address. The user must avoid this
double-definition because it is not enforced by quick admin.
+ The ~Name~ field is used for cross-reference purposes with other
terms, e.g., ~flu~ and ~chain~.
+ There is no syntax yet for removing a host definition.
+ ~AdminI::string()~ A DNS hostname or IP address for cluster
administration purposes, e.g. SSH access.
+ This field is unused at the present time.
+ ~ClientI::string()~ A DNS hostname or IP address for Machi's client
protocol access, e.g., Protocol Buffers network API service.
+ This field is unused at the present time.
+ ~props::proplist()~ is an Erlang-style property list for specifying
additional configuration options, debugging information, sysadmin
comments, etc.
+ A full-featured admin tool should also include managing several
other aspects of configuration related to a "host". For example,
for any single IP address, quick admin assumes that there will be
exactly one Erlang VM that is running the Machi application. Of
course, it is possible to have dozens of Erlang VMs on the same
(let's assume for clarity) hardware machine and all running Machi
... but there are additional aspects of such a machine that quick
admin does not account for
+ multiple IP addresses per machine
+ multiple Machi package installation paths
+ multiple Machi config files (e.g. cuttlefish config, ~etc.conf~,
~vm.args~)
+ multiple data directories/file system mount points
+ This is also a management problem for quick admin for a single
Machi package on a machine to take advantage of bulk data
storage using multiple multiple file system mount points.
+ multiple Erlang VM host names, required for distributed Erlang,
which is used for communication with ~machi~ and ~machi-admin~
command line utilities.
+ and others....
*** Term 'flu': define a new FLU
A new FLU is defined relative to a previously-defined ~host~ entities;
an exception will be thrown if the ~host~ cannot be cross-referenced.
#+BEGIN_SRC
{flu, Name, HostName, Port, Props}
#+END_SRC
Type information, description, and restrictions:
+ ~Name::atom()~ The name of the FLU, as a human-friendly name and
also for internal management use; please note the ~atom()~ type.
This name must be unique.
+ The ~Name~ field is used for cross-reference purposes with the
~chain~ term.
+ There is no syntax yet for removing a FLU definition.
+ ~Hostname::string()~ The cross-reference name of the ~host~ that
this FLU should run on.
+ ~Port::non_neg_integer()~ The TCP port used by this FLU server's
Protocol Buffers network API listener service
+ ~props::proplist()~ is an Erlang-style property list for specifying
additional configuration options, debugging information, sysadmin
comments, etc.
*** Term 'chain': define or reconfigure a chain
A chain is defined relative to zero or more previously-defined ~flu~
entities; an exception will be thrown if any ~flu~ cannot be
cross-referenced.
Two formats may be used to define/reconfigure a chain:
#+BEGIN_SRC
{chain, Name, FullList, Props}.
{chain, Name, CMode, FullList, Witnesses, Props}.
#+END_SRC
The shorter tuple is shorthand notation for the latter. If the
shorthand form is used, then it will be converted automatically to the
long form as:
#+BEGIN_SRC
{chain, Name, ap_mode, FullList, [], Props}.
#+END_SRC
Type information, description, and restrictions:
+ ~Name::atom()~ The name of the chain, as a human-friendly name and
also for internal management use; please note the ~atom()~ type.
This name must be unique.
+ There is no syntax yet for removing a chain definition.
+ ~CMode::'ap_mode'|'cp_mode'~ Defines the consistency mode of the
chain, either eventual consistency or strong consistency,
respectively.
+ A chain cannot change consistency mode, e.g., from
strong~->~eventual consistency.
+ ~FullList::list(atom())~ Specifies the list of full-service FLU
servers, i.e. servers that provide file data & metadata services as
well as Humming Consensus. Each atom in the list must
cross-reference with a previously defined ~chain~; an exception will
be thrown if any ~flu~ cannot be cross-referenced.
+ ~Witnesses::list(atom())~ Specifies the list of witness-only
servers, i.e. servers that only participate in Humming Consensus.
Each atom in the list must cross-reference with a previously defined
~chain~; an exception will be thrown if any ~flu~ cannot be
cross-referenced.
+ This list must be empty for eventual consistency chains.
+ ~props::proplist()~ is an Erlang-style property list for specifying
additional configuration options, debugging information, sysadmin
comments, etc.
+ If this term specifies a new ~chain~ name, then all of the member
FLU servers (full & witness types) will be bootstrapped to a
starting configuration.
+ If this term specifies a previously-defined ~chain~ name, then all
of the member FLU servers (full & witness types, respectively) will
be adjusted to add or remove members, as appropriate.
+ Any FLU servers added to either list must not be assigned to any
other chain, or they must be a member of this specific chain.
+ Any FLU servers removed from either list will be halted.
(See the "FLU server administrative life cycle" section above.)
** Executing quick admin AST files via the 'machi-admin' utility
Examples of quick admin AST files can be found in the
~priv/quick-admin/examples~ directory. Below is an example that will
define a new host ( ~"localhost"~ ), three new FLU servers ( ~f1~ & ~f2~
and ~f3~ ), and an eventually consistent chain ( ~c1~ ) that uses the new
FLU servers:
#+BEGIN_SRC
{host, "localhost", []}.
{flu,f1,"localhost",20401,[]}.
{flu,f2,"localhost",20402,[]}.
{flu,f3,"localhost",20403,[]}.
{chain,c1,[f1,f2,f3],[]}.
#+END_SRC
*** Checking the syntax of an AST file
Given an AST config file, ~/path/to/ast/file~, its basic syntax and
correctness can be checked without executing it.
#+BEGIN_SRC
./rel/machi/bin/machi-admin quick-admin-check /path/to/ast/file
#+END_SRC
+ The utility will exit with status zero and output ~ok~ if the syntax
and proposed configuration appears to be correct.
+ If there is an error, the utility will exit with status one, and an
error message will be printed.
*** Executing an AST file
Given an AST config file, ~/path/to/ast/file~, it can be executed
using the command:
#+BEGIN_SRC
./rel/machi/bin/machi-admin quick-admin-apply /path/to/ast/file RelativeHost
#+END_SRC
... where the last argument, ~RelativeHost~, should be the exact
spelling of one of the previously defined AST ~host~ entities,
*and also* is the same host that the ~machi-admin~ utility is being
executed on.
Restrictions and warnings:
+ This is alpha quality software.
+ There is no "undo".
+ Of course there is, but you need to resort to doing things like
using ~machi attach~ to attach to the server's CLI to then execute
magic Erlang incantations to stop FLUs, unconfigure chains, etc.
+ Oh, and delete some files with magic paths, also.
** Using quick admin to manage multiple machines
A quick sketch follows:
1. Create the AST file to specify all of the changes that you wish to
make to all hosts, FLUs, and/or chains, e.g., ~/tmp/ast.txt~.
2. Check the basic syntax with the ~quick-admin-check~ argument to
~machi-admin~.
3. If the syntax is good, then copy ~/tmp/ast.txt~ to all hosts in the
cluster, using the same path, ~/tmp/ast.txt~.
4. For each machine in the cluster, run:
#+BEGIN_SRC
./rel/machi/bin/machi-admin quick-admin-apply /tmp/ast.txt RelativeHost
#+END_SRC
... where RelativeHost is the AST ~host~ name of the machine that you
are executing the ~machi-admin~ command on. The command should be
successful, with exit status 0 and outputting the string ~ok~.
Finally, for each machine in the cluster, a listing of all files in
the directory ~rel/machi/etc/quick-admin-archive~ should show exactly
the same files, one for each time that ~quick-admin-apply~ has been
run successfully on that machine.
* The "rc.d" style configuration file scheme
This configuration scheme is inspired by BSD UNIX's ~init(8)~ process
manager's configuration style, called "rc.d" after the name of the
directory where these files are stored, ~/etc/rc.d~. The ~init~
process is responsible for (among other things) starting UNIX
processes at machine boot time and stopping them when the machine is
shut down.
The original scheme used by ~init~ to start processes at boot time was
a single Bourne shell script called ~/etc/rc~. When a new software
package was installed that required a daemon to be started at boot
time, text was added to the ~/etc/rc~ file. Uninstalling packages was
much trickier, because it meant removing lines from a file that
*is a computer program (run by the Bourne shell, a Turing-complete
programming language)*. Error-free editing of the ~/etc/rc~ script
was impossible in all cases.
Later, ~init~'s configuration was split into a few master Bourne shell
scripts and a subdirectory, ~/etc/rc.d~. The subdirectory contained
shell scripts that were responsible for boot time starting of a single
daemon or service, e.g. NFS or an HTTP server. When a new software
package was added, a new file was added to the ~rc.d~ subdirectory.
When a package was removed, the corresponding file in ~rc.d~ was
removed. With this simple scheme, addition & removal of boot time
scripts was vastly simplified.
** Riak had a similar configuration file editing problem (and its solution)
Another software product from Basho Technologies, Riak, had a similar
configuration file editing problem. One file in particular,
~app.config~, had a syntax that made it difficult both for human
systems administrators and also computer programs to edit the file in
a syntactically correct manner.
Later releases of Riak switched to an alternative configuration file
format, one inspired by the BSD UNIX ~sysctl(8)~ utility and
~sysctl.conf(5)~ file syntax. The ~sysctl.conf~ format is much easier
to manage by computer programs to add items. Removing items is not
100% simple, however: the correct lines must be identified and then
removed (e.g. with Perl or a text editor or combination of ~grep -v~
and ~mv~), but removing any comment lines that "belong" to the removed
config item(s) is not any easy for a 1-line shell script to do 100%
correctly.
Machi will use the ~sysctl.conf~ style configuration for some
application configuration variables. However, adding & removing FLUs
and chains will be managed using the "rc.d" style because of the
"rc.d" scheme's simplicity and tolerance of mistakes by administrators
(human or computer).
** Machi's "rc.d" file scheme.
Machi will use a single subdirectory that will contain configuration
files for some life cycle management task, e.g. a single FLU or a
single chain.
The contents of the file should be a single Erlang term, serialized in
ASCII form as Erlang source code statement, i.e. a single Erlang term
~T~ that is formatted by ~io:format("~w.",[T]).~. This file must be
parseable by the Erlang function ~file:consult()~.
Later versions of Machi may change the file format to be more familiar
to administrators who are unaccustomed to Erlang language syntax.
** FLU life cycle management using "rc.d" style files
*** The key configuration components of a FLU
1. The machine (or virtual machine) to run it on.
2. The Machi software package's artifacts to execute.
3. The disk device(s) used to store Machi file data & metadata, "rc.d"
style config files, etc.
4. The name, IP address and TCP port assigned to the FLU service.
5. Its chain assignment.
Notes:
+ Items 1-3 are currently outside of the scope of this life cycle
document. We assume that human administrators know how to do these
things.
+ Item 4's properties are explicitly managed by a FLU-defining "rc.d"
style config file.
+ Item 5 is managed by the chain life cycle management system.
Here is an example of a properly formatted FLU config file:
#+BEGIN_SRC
{p_srvr,f1,machi_flu1_client,"192.168.72.23",20401,[]}.
#+END_SRC
... which corresponds to the following Erlang record definition:
#+BEGIN_SRC
-record(p_srvr, {
name :: atom(),
proto_mod = 'machi_flu1_client' :: atom(), % Module name
address :: term(), % Protocol-specific
port :: term(), % Protocol-specific
props = [] :: list() % proplist for other related info
}).
#+END_SRC
+ ~name~ is ~f1~. This is name of the FLU. This name should be
unique over the lifetime of the administrative domain and thus
managed by external policy. This name must be the same as the name
of the config file that defines the FLU.
+ ~proto_mod~ is used for internal management purposes and should be
considered a mandatory constant.
+ ~address~ is "192.168.72.23". The DNS hostname or IP address used
by other servers to communicate with this FLU. This must be a valid
IP address, previously assigned to this machine/VM using the
appropriate operating system-specific procedure.
+ ~port~ is TCP port 20401. The TCP port number that the FLU listens
to for incoming Protocol Buffers-serialized communication. This TCP
port must not be in use (now or in the future) by another Machi FLU
or any other process running on this machine/VM.
+ ~props~ is an Erlang-style property list for specifying additional
configuration options, debugging information, sysadmin comments,
etc.
** Chain life cycle management using "rc.d" style files
Unlike FLUs, chains have a self-management aspect that makes a chain
life cycle different from a single FLU server. Machi's chains are
self-managing, via Humming Consensus; see the
https://github.com/basho/machi/tree/master/doc/ directory for much
more detail about Humming Consensus. After FLUs have received their
initial chain configuration for Humming Consensus, the FLUs will
manage the chain (and each other) by themselves.
However, Humming Consensus does not handle three chain management
problems:
1. Specifying the very first chain configuration,
2. Altering the membership of the chain (i.e. adding/removing FLUs
from the chain),
3. Stopping the chain permanently.
A chain "rc.d" file will only be used to bootstrap a newly-defined FLU
server. It's like a piece of glue information to introduce the new
FLU to the Humming Consensus group that is managing the chain's
dynamic state (e.g. which members are up or down). In all other
respects, chain config files are ignored by life cycle management code.
However, to mimic the life cycle of the FLU server's "rc.d" config
files, a chain "rc.d" files is not deleted until the chain has been
decommissioned (i.e. defined with length=0).
*** The key configuration components of a chain
1. The name of the chain.
2. Consistency mode: eventually consistent or strongly consistent.
3. The membership list of all FLU servers in the chain.
+ Remember, all servers in a single chain will manage full replicas
of the same collection of Machi files.
4. If the chain is defined to use strongly consistent mode, then a
list of "witness servers" may also be defined. See the
[https://github.com/basho/machi/tree/master/doc/] documentation for
more information on witness servers.
+ The witness list must be empty for all chains in eventual
consistency mode.
Here is an example of a properly formatted chain config file:
#+BEGIN_SRC
{chain_def_v1,c1,ap_mode,
[{p_srvr,f1,machi_flu1_client,"localhost",20401,[]},
{p_srvr,f2,machi_flu1_client,"localhost",20402,[]},
{p_srvr,f3,machi_flu1_client,"localhost",20403,[]}],
[],[],[],
[f1,f2,f3],
[],[]}.
#+END_SRC
... which corresponds to the following Erlang record definition:
#+BEGIN_SRC
-record(chain_def_v1, {
name :: atom(), % chain name
mode :: 'ap_mode' | 'cp_mode',
full = [] :: [p_srvr()],
witnesses = [] :: [p_srvr()],
old_full = [] :: [atom()], % guard against some races
old_witnesses=[] :: [atom()], % guard against some races
local_run = [] :: [atom()], % must be tailored to each machine!
local_stop = [] :: [atom()], % must be tailored to each machine!
props = [] :: list() % proplist for other related info
}).
#+END_SRC
+ ~name~ is ~c1~, the name of the chain. This name should be unique
over the lifetime of the administrative domain and thus managed by
external policy. This name must be the same as the name of the
config file that defines the chain.
+ ~mode~ is ~ap_mode~, an internal code symbol for eventual
consistency mode.
+ ~full~ is a list of Erlang ~#p_srvr{}~ records for full-service
members of the chain, i.e., providing Machi file data & metadata
storage services.
+ ~witnesses~ is a list of Erlang ~#p_srvr{}~ records for witness-only
FLU servers, i.e., providing only Humming Consensus service.
+ The next four fields are used for internal management only.
+ ~props~ is an Erlang-style property list for specifying additional
configuration options, debugging information, sysadmin comments,
etc.

Binary file not shown.

View file

@ -0,0 +1,372 @@
# Table of contents
* [Hands-on experiments with Machi and Humming Consensus](#hands-on)
* [Using the network partition simulator and convergence demo test code](#partition-simulator)
<a name="hands-on">
# Hands-on experiments with Machi and Humming Consensus
## Prerequisites
Please refer to the
[Machi development environment prerequisites doc](./dev-prerequisites.md)
for Machi developer environment prerequisites.
If you do not have an Erlang/OTP runtime system available, but you do
have [the Vagrant virtual machine](https://www.vagrantup.com/) manager
available, then please refer to the instructions in the prerequisites
doc for using Vagrant.
<a name="clone-compile">
## Clone and compile the code
Please see the
[Machi 'clone and compile' doc](./dev-clone-compile.md)
for the short list of steps required to fetch the Machi source code
from GitHub and to compile &amp; test Machi.
## Running three Machi instances on a single machine
All of the commands that should be run at your login shell (e.g. Bash,
c-shell) can be cut-and-pasted from this document directly to your
login shell prompt.
Run the following command:
make stagedevrel
This will create a directory structure like this:
|-dev1-|... stand-alone Machi app + subdirectories
|-dev-|-dev2-|... stand-alone Machi app + directories
|-dev3-|... stand-alone Machi app + directories
Each of the `dev/dev1`, `dev/dev2`, and `dev/dev3` are stand-alone
application instances of Machi and can be run independently of each
other on the same machine. This demo will use all three.
The lifecycle management utilities for Machi are a bit immature,
currently. They assume that each Machi server runs on a host with a
unique hostname -- there is no flexibility built-in yet to easily run
multiple Machi instances on the same machine. To continue with the
demo, we need to use `sudo` or `su` to obtain superuser privileges to
edit the `/etc/hosts` file.
Please add the following line to `/etc/hosts`, using this command:
sudo sh -c 'echo "127.0.0.1 machi1 machi2 machi3" >> /etc/hosts'
Next, we will use a shell script to finish setting up our cluster. It
will do the following for us:
* Verify that the new line that was added to `/etc/hosts` is correct.
* Modify the `etc/app.config` files to configure the Humming Consensus
chain manager's actions logged to the `log/console.log` file.
* Start the three application instances.
* Verify that the three instances are running correctly.
* Configure a single chain, with one FLU server per application
instance.
Please run this script using this command:
./priv/humming-consensus-demo.setup.sh
If the output looks like this (and exits with status zero), then the
script was successful.
Step: Verify that the required entries in /etc/hosts are present
Step: add a verbose logging option to app.config
Step: start three three Machi application instances
pong
pong
pong
Step: configure one chain to start a Humming Consensus group with three members
Result: ok
Result: ok
Result: ok
We have now created a single replica chain, called `c1`, that has
three file servers participating in the chain. Thanks to the
hostnames that we added to `/etc/hosts`, all are using the localhost
network interface.
| App instance | Pseudo | FLU name | TCP port |
| directory | Hostname | | number |
|--------------+----------+----------+----------|
| dev1 | machi1 | flu1 | 20401 |
| dev2 | machi2 | flu2 | 20402 |
| dev3 | machi3 | flu3 | 20403 |
The log files for each application instance can be found in the
`./dev/devN/log/console.log` file, where the `N` is the instance
number: 1, 2, or 3.
## Understanding the chain manager's log file output
After running the `./priv/humming-consensus-demo.setup.sh` script,
let's look at the last few lines of the `./dev/dev1/log/console.log`
log file for Erlang VM process #1.
2016-03-09 10:16:35.676 [info] <0.105.0>@machi_lifecycle_mgr:process_pending_flu:422 Started FLU f1 with supervisor pid <0.128.0>
2016-03-09 10:16:35.676 [info] <0.105.0>@machi_lifecycle_mgr:move_to_flu_config:540 Creating FLU config file f1
2016-03-09 10:16:35.790 [info] <0.105.0>@machi_lifecycle_mgr:bootstrap_chain2:312 Configured chain c1 via FLU f1 to mode=ap_mode all=[f1,f2,f3] witnesses=[]
2016-03-09 10:16:35.790 [info] <0.105.0>@machi_lifecycle_mgr:move_to_chain_config:546 Creating chain config file c1
2016-03-09 10:16:44.139 [info] <0.132.0> CONFIRM epoch 1141 <<155,42,7,221>> upi [] rep [] auth f1 by f1
2016-03-09 10:16:44.271 [info] <0.132.0> CONFIRM epoch 1148 <<57,213,154,16>> upi [f1] rep [] auth f1 by f1
2016-03-09 10:16:44.864 [info] <0.132.0> CONFIRM epoch 1151 <<239,29,39,70>> upi [f1] rep [f3] auth f1 by f1
2016-03-09 10:16:45.235 [info] <0.132.0> CONFIRM epoch 1152 <<173,17,66,225>> upi [f2] rep [f1,f3] auth f2 by f1
2016-03-09 10:16:47.343 [info] <0.132.0> CONFIRM epoch 1154 <<154,231,224,149>> upi [f2,f1,f3] rep [] auth f2 by f1
Let's pick apart some of these lines. We have started all three
servers at about the same time. We see some race conditions happen,
and some jostling and readjustment happens pretty quickly in the first
few seconds.
* `Started FLU f1 with supervisor pid <0.128.0>`
* This VM, #1,
started a FLU (Machi data server) with the name `f1`. In the Erlang
process supervisor hierarchy, the process ID of the top supervisor
is `<0.128.0>`.
* `Configured chain c1 via FLU f1 to mode=ap_mode all=[f1,f2,f3] witnesses=[]`
* A bootstrap configuration for a chain named `c1` has been created.
* The FLUs/data servers that are eligible for participation in the
chain have names `f1`, `f2`, and `f3`.
* The chain will operate in eventual consistency mode (`ap_mode`)
* The witness server list is empty. Witness servers are never used
in eventual consistency mode.
* `CONFIRM epoch 1141 <<155,42,7,221>> upi [] rep [] auth f1 by f1`
* All participants in epoch 1141 are unanimous in adopting epoch
1141's projection. All active membership lists are empty, so
there is no functional chain replication yet, at least as far as
server `f1` knows
* The epoch's abbreviated checksum is `<<155,42,7,221>>`.
* The UPI list, i.e. the replicas whose data is 100% in sync is
`[]`, the empty list. (UPI = Update Propagation Invariant)
* The list of servers that are under data repair (`rep`) is also
empty, `[]`.
* This projection was authored by server `f1`.
* The log message was generated by server `f1`.
* `CONFIRM epoch 1148 <<57,213,154,16>> upi [f1] rep [] auth f1 by f1`
* Now the server `f1` has created a chain of length 1, `[f1]`.
* Chain repair/file re-sync is not required when the UPI server list
changes from length 0 -> 1.
* `CONFIRM epoch 1151 <<239,29,39,70>> upi [f1] rep [f3] auth f1 by f1`
* Server `f1` has noticed that server `f3` is alive. Apparently it
has not yet noticed that server `f2` is also running.
* Server `f3` is in the repair list.
* `CONFIRM epoch 1152 <<173,17,66,225>> upi [f2] rep [f1,f3] auth f2 by f1`
* Server `f2` is apparently now aware that all three servers are running.
* The previous configuration used by `f2` was `upi [f2]`, i.e., `f2`
was running in a chain of one. `f2` noticed that `f1` and `f3`
were now available and has started adding them to the chain.
* All new servers are always added to the tail of the chain in the
repair list.
* In eventual consistency mode, a UPI change like this is OK.
* When performing a read, a client must read from both tail of the
UPI list and also from all repairing servers.
* When performing a write, the client writes to both the UPI
server list and also the repairing list, in that order.
* I.e., the client concatenates both lists,
`UPI ++ Repairing`, for its chain configuration for the write.
* Server `f2` will trigger file repair/re-sync shortly.
* The waiting time for starting repair has been configured to be
extremely short, 1 second. The default waiting time is 10
seconds, in case Humming Consensus remains unstable.
* `CONFIRM epoch 1154 <<154,231,224,149>> upi [f2,f1,f3] rep [] auth f2 by f1`
* File repair/re-sync has finished. All file data on all servers
are now in sync.
* The UPI/in-sync part of the chain is now `[f2,f1,f3]`, and there
are no servers under repair.
## Let's create some failures
Here are some suggestions for creating failures.
* Use the `./dev/devN/bin/machi stop` and `./dev/devN/bin/machi start`
commands to stop & start VM #`N`.
* Stop a VM abnormally by using `kill`. The OS process name to look
for is `beam.smp`.
* Suspend and resume a VM, using the `SIGSTOP` and `SIGCONT` signals.
* E.g. `kill -STOP 9823` and `kill -CONT 9823`
The network partition simulator is not (yet) available when running
Machi in this mode. Please see the next section for instructions on
how to use partition simulator.
<a name="partition-simulator">
# Using the network partition simulator and convergence demo test code
This is the demo code mentioned in the presentation that Scott Lystig
Fritchie gave at the
[RICON 2015 conference](http://ricon.io).
* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf)
* [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q)
## A complete example of all input and output
If you don't have an Erlang/OTP 17 runtime environment available,
please see this file for full input and output of a strong consistency
length=3 chain test:
https://gist.github.com/slfritchie/8352efc88cc18e62c72c
This file contains all commands input and all simulator output from a
sample run of the simulator.
To help interpret the output of the test, please skip ahead to the
"The test output is very verbose" section.
## Prerequisites
If you don't have `git` and/or the Erlang 17 runtime system available
on your OS X, FreeBSD, Linux, or Solaris machine, please take a look
at the [Prerequisites section](#prerequisites) first. When you have
installed the prerequisite software, please return back here.
## Clone and compile the code
Please briefly visit the [Clone and compile the code](#clone-compile)
section. When finished, please return back here.
## Run an interactive Erlang CLI shell
Run the following command at your login shell:
erl -pz .eunit ebin deps/*/ebin
If you are using Erlang/OTP version 17, you should see some CLI output
that looks like this:
Erlang/OTP 17 [erts-6.4] [source] [64-bit] [smp:8:8] [async-threads:10] [hipe] [kernel-poll:false] [dtrace]
Eshell V6.4 (abort with ^G)
1>
## The test output is very verbose ... what are the important parts?
The output of the Erlang command
`machi_chain_manager1_converge_demo:help()` will display the following
guide to the output of the tests.
A visualization of the convergence behavior of the chain self-management
algorithm for Machi.
1. Set up some server and chain manager pairs.
2. Create a number of different network partition scenarios, where
(simulated) partitions may be symmetric or asymmetric. Then stop changing
the partitions and keep the simulated network stable (and perhaps broken).
3. Run a number of iterations of the algorithm in parallel by poking each
of the manager processes on a random'ish basis.
4. Afterward, fetch the chain transition changes made by each FLU and
verify that no transition was unsafe.
During the iteration periods, the following is a cheatsheet for the output.
See the internal source for interpreting the rest of the output.
'SET partitions = '
A pair-wise list of actors which cannot send messages. The
list is uni-directional. If there are three servers (a,b,c),
and if the partitions list is '[{a,b},{b,c}]' then all
messages from a->b and b->c will be dropped, but any other
sender->recipient messages will be delivered successfully.
'x uses:'
The FLU x has made an internal state transition and is using
this epoch's projection as operating chain configuration. The
rest of the line is a summary of the projection.
'CONFIRM epoch {N}'
This message confirms that all of the servers listed in the
UPI and repairing lists of the projection at epoch {N} have
agreed to use this projection because they all have written
this projection to their respective private projection stores.
The chain is now usable by/available to all clients.
'Sweet, private projections are stable'
This report announces that this iteration of the test cycle
has passed successfully. The report that follows briefly
summarizes the latest private projection used by each
participating server. For example, when in strong consistency
mode with 'a' as a witness and 'b' and 'c' as real servers:
%% Legend:
%% server name, epoch ID, UPI list, repairing list, down list, ...
%% ... witness list, 'false' (a constant value)
[{a,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}},
{b,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}]
Both servers 'a' and 'b' agree on epoch 1116 with epoch ID
{1116,<<23,143,246,55>>} where UPI=[a,b], repairing=[],
down=[c], and witnesses=[a].
Server 'c' is not shown because 'c' has wedged itself OOS (out
of service) by configuring a chain length of zero.
If no servers are listed in the report (i.e. only '[]' is
displayed), then all servers have wedged themselves OOS, and
the chain is unavailable.
'DoIt,'
This marks a group of tick events which trigger the manager
processes to evaluate their environment and perhaps make a
state transition.
A long chain of 'DoIt,DoIt,DoIt,' means that the chain state has
(probably) settled to a stable configuration, which is the goal of the
algorithm.
Press control-c to interrupt the test....".
## Run a test in eventual consistency mode
Run the following command at the Erlang CLI prompt:
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}]).
The first argument, `3`, is the number of servers to participate in
the chain. Please note:
* Chain lengths as short as 1 or 2 are valid, but the results are a
bit boring.
* Chain lengths as long as 7 or 9 can be used, but they may
suffer from longer periods of churn/instability before all chain
managers reach agreement via humming consensus. (It is future work
to shorten the worst of the unstable churn latencies.)
* In eventual consistency mode, chain lengths may be even numbers,
e.g. 2, 4, or 6.
* The simulator will choose partition events from the permutations of
all 1, 2, and 3 node partition pairs. The total runtime will
increase *dramatically* with chain length.
* Chain length 2: about 3 partition cases
* Chain length 3: about 35 partition cases
* Chain length 4: about 230 partition cases
* Chain length 5: about 1100 partition cases
## Run a test in strong consistency mode (with witnesses):
*NOTE:* Due to a bug in the test code, please do not try to run the
convergence test in strong consistency mode and also without the
correct minority number of witness servers! If in doubt, please run
the commands shown below exactly.
Run the following command at the Erlang CLI prompt:
machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]).
The first argument, `3`, is the number of servers to participate in
the chain. Chain lengths as long as 7 or 9 can be used, but they may
suffer from longer periods of churn/instability before all chain
managers reach agreement via humming consensus.
Due to the bug mentioned above, please use the following
commands when running with chain lengths of 5 or 7, respectively.
machi_chain_manager1_converge_demo:t(5, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b]}]).
machi_chain_manager1_converge_demo:t(7, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b,c]}]).

View file

@ -1,170 +0,0 @@
@title Machi: a small village of replicated files
@doc
== About This EDoc Documentation ==
This EDoc-style documentation will concern itself only with Erlang
function APIs and function &amp; data types. Higher-level design and
commentary will remain outside of the Erlang EDoc system; please see
the "Pointers to Other Machi Documentation" section below for more
details.
Readers should beware that this documentation may be out-of-sync with
the source code. When in doubt, use the `make edoc' command to
regenerate all HTML pages.
It is the developer's responsibility to re-generate the documentation
periodically and commit it to the Git repo.
== Machi Code Overview ==
=== Chain Manager ===
The Chain Manager is responsible for managing the state of Machi's
"Chain Replication" state. This role is roughly analogous to the
"Riak Core" application inside of Riak, which takes care of
coordinating replica placement and replica repair.
For each primitive data server in the cluster, a Machi FLU, there is a
Chain Manager process that manages its FLU's role within the Machi
cluster's Chain Replication scheme. Each Chain Manager process
executes locally and independently to manage the distributed state of
a single Machi Chain Replication chain.
<ul>
<li> To contrast with Riak Core ... Riak Core's claimant process is
solely responsible for managing certain critical aspects of
Riak Core distributed state. Machi's Chain Manager process
performs similar tasks as Riak Core's claimant. However, Machi
has several active Chain Manager processes, one per FLU server,
instead of a single active process like Core's claimant. Each
Chain Manager process acts independently; each is constrained
so that it will reach consensus via independent computation
&amp; action.
Full discussion of this distributed consensus is outside the
scope of this document; see the "Pointers to Other Machi
Documentation" section below for more information.
</li>
<li> Machi differs from a Riak Core application because Machi's
replica placement policy is simply, "All Machi servers store
replicas of all Machi files".
Machi is intended to be a primitive building block for creating larger
cluster-of-clusters where files are
distributed/fragmented/sharded across a large pool of
independent Machi clusters.
</li>
<li> See
[https://www.usenix.org/legacy/events/osdi04/tech/renesse.html]
for a copy of the paper, "Chain Replication for Supporting High
Throughput and Availability" by Robbert van Renesse and Fred
B. Schneider.
</li>
</ul>
=== FLU ===
The FLU is the basic storage server for Machi.
<ul>
<li> The name FLU is taken from "flash storage unit" from the paper
"CORFU: A Shared Log Design for Flash Clusters" by
Balakrishnan, Malkhi, Prabhakaran, and Wobber. See
[https://www.usenix.org/conference/nsdi12/technical-sessions/presentation/balakrishnan]
</li>
<li> In CORFU, the sequencer step is a prerequisite step that is
performed by a separate component, the Sequencer.
In Machi, the `append_chunk()' protocol message has
an implicit "sequencer" operation applied by the "head" of the
Machi Chain Replication chain. If a client wishes to write
data that has already been assigned a sequencer position, then
the `write_chunk()' API function is used.
</li>
</ul>
For each FLU, there are three independent tasks that are implemented
using three different Erlang processes:
<ul>
<li> A FLU server, implemented primarily by `machi_flu.erl'.
</li>
<li> A projection store server, implemented primarily by
`machi_projection_store.erl'.
</li>
<li> A chain state manager server, implemented primarily by
`machi_chain_manager1.erl'.
</li>
</ul>
From the perspective of failure detection, it is very convenient that
all three FLU-related services (file server, sequencer server, and
projection server) are accessed using the same single TCP port.
=== Projection (data structure) ===
The projection is a data structure that specifies the current state
of the Machi cluster: all FLUs, which FLUS are considered
up/running or down/crashed/stopped, which FLUs are actively
participants in the Chain Replication protocol, and which FLUs are
under "repair" (i.e., having their data resyncronized when
newly-added to a cluster or when restarting after a crash).
=== Projection Store (server) ===
The projection store is a storage service that is implemented by an
Erlang/OTP `gen_server' process that is associated with each
FLU. Conceptually, the projection store is an array of
write-once registers. For each projection store register, the
key is a 2-tuple of an epoch number (`non_neg_integer()' type)
and a projection type (`public' or `private' type); the value is
a projection data structure (`projection_v1()' type).
=== Client and Proxy Client ===
Machi is intentionally avoiding using distributed Erlang for Machi's
communication. This design decision makes Erlang-side code more
difficult &amp; complex but allows us the freedom of implementing
parts of Machi in other languages without major
protocol&amp;API&amp;glue code changes later in the product's
lifetime.
There are two layers of interface for Machi clients.
<ul>
<li> The `machi_flu1_client' module implements an API that uses a
TCP socket directly.
</li>
<li> The `machi_proxy_flu1_client' module implements an API that
uses a local, long-lived `gen_server' process as a proxy for
the remote, perhaps disconnected-or-crashed Machi FLU server.
</li>
</ul>
The types for both modules ought to be the same. However, due to
rapid code churn, some differences might exist. Any major difference
is (almost by definition) a bug: please open a GitHub issue to request
a correction.
== TODO notes ==
Any use of the string "TODO" in upper/lower/mixed case, anywhere in
the code, is a reminder signal of unfinished work.
== Pointers to Other Machi Documentation ==
<ul>
<li> If you are viewing this document locally, please look in the
`../doc/' directory,
</li>
<li> If you are viewing this document via the Web, please find the
documentation via this link:
[http://github.com/basho/machi/tree/master/doc/]
Please be aware that this link points to the `master' branch
of the Machi source repository and therefore may be
out-of-sync with non-`master' branch code.
</li>
</ul>

Binary file not shown.

After

Width:  |  Height:  |  Size: 115 KiB

File diff suppressed because it is too large Load diff

View file

@ -1489,7 +1489,7 @@ In Usenix ATC 2009.
{\tt https://www.usenix.org/legacy/event/usenix09/ tech/full\_papers/terrace/terrace.pdf}
\bibitem{chain-replication}
van Renesse, Robbert et al.
van Renesse, Robbert and Schneider, Fred.
Chain Replication for Supporting High Throughput and Availability.
Proceedings of the 6th Conference on Symposium on Operating Systems
Design \& Implementation (OSDI'04) - Volume 6, 2004.

View file

@ -1,12 +0,0 @@
####################### patterns for general errors in dep modules:
^protobuffs\.erl:
^protobuffs_[a-z_]*\.erl:
^leexinc\.hrl:[0-9][0-9]*:
^machi_chain_manager1.erl:[0-9][0-9]*: Guard test RetrospectiveP::'false' =:= 'true' can never succeed
^machi_pb\.erl:[0-9][0-9]*:
^pokemon_pb\.erl:[0-9][0-9]*:
####################### patterns for unknown functions:
^ basho_bench_config:get/2
^ erl_prettypr:format/1
^ erl_syntax:form_list/1
^ machi_partition_simulator:get/1

View file

@ -18,10 +18,9 @@
%%
%% -------------------------------------------------------------------
-define(MAX_FILE_SIZE, 256*1024*1024). % 256 MBytes
-define(MAX_CHUNK_SIZE, ((1 bsl 32) - 1)).
%% -define(DATA_DIR, "/Volumes/SAM1/seq-tests/data").
-define(DATA_DIR, "./data").
%% @doc Now 4GiBytes, could be up to 64bit due to PB message limit of
%% chunk size
-define(DEFAULT_MAX_FILE_SIZE, ((1 bsl 32) - 1)).
-define(MINIMUM_OFFSET, 1024).
%% 0th draft of checksum typing with 1st byte.
@ -30,6 +29,35 @@
-define(CSUM_TAG_SERVER_SHA, 2). % Server-genereated SHA1
-define(CSUM_TAG_SERVER_REGEN_SHA, 3). % Server-regenerated SHA1
-define(CSUM_TAG_NONE_ATOM, none).
-define(CSUM_TAG_CLIENT_SHA_ATOM, client_sha).
-define(CSUM_TAG_SERVER_SHA_ATOM, server_sha).
-define(CSUM_TAG_SERVER_REGEN_SHA_ATOM, server_regen_sha).
%% Protocol Buffers goop
-define(PB_MAX_MSG_SIZE, (33*1024*1024)).
-define(PB_PACKET_OPTS, [{packet, 4}, {packet_size, ?PB_MAX_MSG_SIZE}]).
%% TODO: it's used in flu_sup and elsewhere, change this to suitable name
-define(TEST_ETS_TABLE, test_ets_table).
-define(DEFAULT_COC_NAMESPACE, "").
-define(DEFAULT_COC_LOCATOR, 0).
-record(ns_info, {
version = 0 :: machi_dt:namespace_version(),
name = <<>> :: machi_dt:namespace(),
locator = 0 :: machi_dt:locator()
}).
-record(append_opts, {
chunk_extra = 0 :: machi_dt:chunk_size(),
preferred_file_name :: 'undefined' | machi_dt:file_name_s(),
flag_fail_preferred = false :: boolean()
}).
-record(read_opts, {
no_checksum = false :: boolean(),
no_chunk = false :: boolean(),
needs_trimmed = false :: boolean()
}).

View file

@ -0,0 +1,20 @@
%% machi merkle tree records
-record(naive, {
chunk_size = 1048576 :: pos_integer(), %% default 1 MB
recalc = true :: boolean(),
root :: 'undefined' | binary(),
lvl1 = [] :: [ binary() ],
lvl2 = [] :: [ binary() ],
lvl3 = [] :: [ binary() ],
leaves = [] :: [ { Offset :: pos_integer(),
Size :: pos_integer(),
Csum :: binary()} ]
}).
-record(mt, {
filename :: string(),
tree :: #naive{},
backend = 'naive' :: 'naive'
}).

View file

@ -1,6 +1,6 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2014 Basho Technologies, Inc. All Rights Reserved.
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
@ -22,10 +22,11 @@
-define(MACHI_PROJECTION_HRL, true).
-type pv1_consistency_mode() :: 'ap_mode' | 'cp_mode'.
-type pv1_chain_name():: atom().
-type pv1_csum() :: binary().
-type pv1_epoch() :: {pv1_epoch_n(), pv1_csum()}.
-type pv1_epoch_n() :: non_neg_integer().
-type pv1_server() :: atom() | binary().
-type pv1_server() :: atom().
-type pv1_timestamp() :: {non_neg_integer(), non_neg_integer(), non_neg_integer()}.
-record(p_srvr, {
@ -40,7 +41,7 @@
flap_count :: {term(), term()},
all_hosed :: list(),
all_flap_counts :: list(),
bad :: list()
my_unique_prop_count :: non_neg_integer()
}).
-type p_srvr() :: #p_srvr{}.
@ -48,18 +49,21 @@
-define(DUMMY_PV1_EPOCH, {0,<<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>>}).
%% Kludge for spam gossip. TODO: replace me
-define(SPAM_PROJ_EPOCH, ((1 bsl 32) - 7)).
-record(projection_v1, {
epoch_number :: pv1_epoch_n(),
epoch_number :: pv1_epoch_n() | ?SPAM_PROJ_EPOCH,
epoch_csum :: pv1_csum(),
author_server :: pv1_server(),
chain_name = ch_not_def_yet :: pv1_chain_name(),
all_members :: [pv1_server()],
witnesses = [] :: [pv1_server()],
creation_time :: pv1_timestamp(),
mode = ap_mode :: pv1_consistency_mode(),
upi :: [pv1_server()],
repairing :: [pv1_server()],
down :: [pv1_server()],
flap :: 'undefined' | #flap_i{}, % flapping information
inner :: 'undefined' | #projection_v1{},
dbg :: list(), %proplist(), is checksummed
dbg2 :: list(), %proplist(), is not checksummed
members_dict :: p_srvr_dict()
@ -73,4 +77,16 @@
%% create a consistent projection ranking score.
-define(MAX_CHAIN_LENGTH, 64).
-record(chain_def_v1, {
name :: atom(), % chain name
mode :: pv1_consistency_mode(),
full = [] :: [p_srvr()],
witnesses = [] :: [p_srvr()],
old_full = [] :: [pv1_server()], % guard against some races
old_witnesses=[] :: [pv1_server()], % guard against some races
local_run = [] :: [pv1_server()], % must be tailored to each machine!
local_stop = [] :: [pv1_server()], % must be tailored to each machine!
props = [] :: list() % proplist for other related info
}).
-endif. % !MACHI_PROJECTION_HRL

31
include/machi_verbose.hrl Normal file
View file

@ -0,0 +1,31 @@
%% -------------------------------------------------------------------
%%
%% Machi: a small village of replicated files
%%
%% Copyright (c) 2014-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-ifdef(PULSE).
-define(V(Fmt, Args), pulse:format(Fmt, Args)).
-else. % PULSE
-define(V(Fmt, Args), io:format(user, Fmt, Args)).
-endif. % PULSE
-define(D(X), ?V("~s ~p\n", [??X, X])).
-define(Dw(X), ?V("~s ~w\n", [??X, X])).

View file

@ -0,0 +1,56 @@
#!/bin/sh
echo "Step: Verify that the required entries in /etc/hosts are present"
for i in 1 2 3; do
grep machi$i /etc/hosts | egrep -s '^127.0.0.1' > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo ""
echo "'grep -s machi$i' failed. Aborting, sorry."
exit 1
fi
ping -c 1 machi$i > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo ""
echo "Ping attempt on host machi$i failed. Aborting."
echo ""
ping -c 1 machi$i
exit 1
fi
done
echo "Step: add a verbose logging option to app.config"
for i in 1 2 3; do
ed ./dev/dev$i/etc/app.config <<EOF > /dev/null 2>&1
/verbose_confirm
a
{chain_manager_opts, [{private_write_verbose_confirm,true}]},
{stability_time, 1},
.
w
q
EOF
done
echo "Step: start three three Machi application instances"
for i in 1 2 3; do
./dev/dev$i/bin/machi start
./dev/dev$i/bin/machi ping
if [ $? -ne 0 ]; then
echo "Sorry, a 'ping' check for instance dev$i failed. Aborting."
exit 1
fi
done
echo "Step: configure one chain to start a Humming Consensus group with three members"
# Note: $CWD of each Machi proc is two levels below the source code root dir.
LIFECYCLE000=../../priv/quick-admin-examples/demo-000
for i in 3 2 1; do
./dev/dev$i/bin/machi-admin quick-admin-apply $LIFECYCLE000 machi$i
if [ $? -ne 0 ]; then
echo "Sorry, 'machi-admin quick-admin-apply failed' on machi$i. Aborting."
exit 1
fi
done
exit 0

View file

@ -0,0 +1,93 @@
# -*- mode: ruby -*-
# vi: set ft=ruby :
# All Vagrant configuration is done below. The "2" in Vagrant.configure
# configures the configuration version (we support older styles for
# backwards compatibility). Please don't change it unless you know what
# you're doing.
Vagrant.configure(2) do |config|
# The most common configuration options are documented and commented below.
# For a complete reference, please see the online documentation at
# https://docs.vagrantup.com.
# Every Vagrant development environment requires a box. You can search for
# boxes at https://atlas.hashicorp.com/search.
# If this Vagrant box has not been downloaded before (e.g. using "vagrant box add"),
# then Vagrant will automatically download the VM image from HashiCorp.
config.vm.box = "hashicorp/precise64"
# If using a FreeBSD box, Bash may not be installed.
# Use the config.ssh.shell setting to specify an alternate shell.
# Note, however, that any code in the 'config.vm.provision' section
# would then have to use this shell's syntax!
# config.ssh.shell = "/bin/csh -l"
# Disable automatic box update checking. If you disable this, then
# boxes will only be checked for updates when the user runs
# `vagrant box outdated`. This is not recommended.
# config.vm.box_check_update = false
# Create a forwarded port mapping which allows access to a specific port
# within the machine from a port on the host machine. In the example below,
# accessing "localhost:8080" will access port 80 on the guest machine.
# config.vm.network "forwarded_port", guest: 80, host: 8080
# Create a private network, which allows host-only access to the machine
# using a specific IP.
# config.vm.network "private_network", ip: "192.168.33.10"
# Create a public network, which generally matched to bridged network.
# Bridged networks make the machine appear as another physical device on
# your network.
# config.vm.network "public_network"
# Share an additional folder to the guest VM. The first argument is
# the path on the host to the actual folder. The second argument is
# the path on the guest to mount the folder. And the optional third
# argument is a set of non-required options.
# config.vm.synced_folder "../data", "/vagrant_data"
# Provider-specific configuration so you can fine-tune various
# backing providers for Vagrant. These expose provider-specific options.
# Example for VirtualBox:
#
config.vm.provider "virtualbox" do |vb|
# Display the VirtualBox GUI when booting the machine
# vb.gui = true
# Customize the amount of memory on the VM:
vb.memory = "512"
end
#
# View the documentation for the provider you are using for more
# information on available options.
# Define a Vagrant Push strategy for pushing to Atlas. Other push strategies
# such as FTP and Heroku are also available. See the documentation at
# https://docs.vagrantup.com/v2/push/atlas.html for more information.
# config.push.define "atlas" do |push|
# push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME"
# end
# Enable provisioning with a shell script. Additional provisioners such as
# Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the
# documentation for more information about their specific syntax and use.
config.vm.provision "shell", inline: <<-SHELL
# Install prerequsites
# Support here for FreeBSD is experimental
apt-get update ; sudo apt-get install -y git sudo rsync ; # Ubuntu Linux
env ASSUME_ALWAYS_YES=yes pkg install -f git sudo rsync ; # FreeBSD 10
# Install dependent packages, using slf-configurator
git clone https://github.com/slfritchie/slf-configurator.git
chown -R vagrant ./slf-configurator
(cd slf-configurator ; sudo sh -x ./ALL.sh)
echo 'export PATH=${PATH}:/usr/local/erlang/17.5/bin' >> ~vagrant/.bashrc
export PATH=${PATH}:/usr/local/erlang/17.5/bin
## echo 'set path = ( $path /usr/local/erlang/17.5/bin )' >> ~vagrant/.cshrc
## setenv PATH /usr/local/erlang/17.5/bin:$PATH
git clone https://github.com/basho/machi.git
(cd machi ; git checkout master ; make && make test )
chown -R vagrant ./machi
SHELL
end

View file

@ -36,7 +36,7 @@ while (<I>) {
$indent = " " x ($count * 4);
s/^#*\s*[0-9. ]*//;
$anchor = "n$label";
printf T1 "%s+ [%s %s](#%s)\n", $indent, $label, $_, $anchor;
printf T1 "%s+ [%s. %s](#%s)\n", $indent, $label, $_, $anchor;
printf T2 "<a name=\"%s\">\n", $anchor;
$line =~ s/(#+)\s*[0-9. ]*/$1 $label. /;
print T2 $line;

View file

@ -0,0 +1 @@
{host, "localhost", []}.

View file

@ -0,0 +1,4 @@
{flu,f1,"localhost",20401,[]}.
{flu,f2,"localhost",20402,[]}.
{flu,f3,"localhost",20403,[]}.
{chain,c1,[f1,f2,f3],[]}.

View file

@ -0,0 +1,4 @@
{flu,f4,"localhost",20404,[]}.
{flu,f5,"localhost",20405,[]}.
{flu,f6,"localhost",20406,[]}.
{chain,c2,[f4,f5,f6],[]}.

View file

@ -0,0 +1,7 @@
{host, "machi1", []}.
{host, "machi2", []}.
{host, "machi3", []}.
{flu,f1,"machi1",20401,[]}.
{flu,f2,"machi2",20402,[]}.
{flu,f3,"machi3",20403,[]}.
{chain,c1,[f1,f2,f3],[]}.

10
priv/test-for-gh-pr.sh Executable file
View file

@ -0,0 +1,10 @@
#!/bin/sh
if [ "${TRAVIS_PULL_REQUEST}" = "false" ]; then
echo '$TRAVIS_PULL_REQUEST is false, skipping tests'
exit 0
else
echo '$TRAVIS_PULL_REQUEST is not false ($TRAVIS_PULL_REQUEST), running tests'
make test
make dialyzer
fi

View file

@ -1,11 +1,21 @@
{require_otp_vsn, "17"}.
{require_otp_vsn, "17|18"}.
%%% {erl_opts, [warnings_as_errors, {parse_transform, lager_transform}, debug_info]}.
{erl_opts, [{parse_transform, lager_transform}, debug_info]}.
{edoc_opts, [{dir, "./edoc"}]}.
{deps, [
{lager, ".*", {git, "git://github.com/basho/lager.git", {tag, "2.0.1"}}},
{protobuffs, "0.8.*", {git, "git://github.com/basho/erlang_protobuffs.git", {tag, "0.8.1p4"}}}
{cuttlefish, ".*", {git, "git://github.com/basho/cuttlefish.git", {branch, "develop"}}},
{sext, ".*", {git, "git://github.com/basho/sext.git", {branch, "master"}}},
{eleveldb, ".*", {git, "git://github.com/basho/eleveldb.git", {branch, "develop"}}},
{lager, ".*", {git, "git://github.com/basho/lager.git", {tag, "2.2.0"}}},
{protobuffs, "0.8.*", {git, "git://github.com/basho/erlang_protobuffs.git", {tag, "0.8.1p4"}}},
{riak_dt, ".*", {git, "git://github.com/basho/riak_dt.git", {branch, "develop"}}},
{ranch, ".*", {git, "git://github.com/ninenines/ranch.git", {branch, "master"}}},
{node_package, ".*", {git, "git://github.com/basho/node_package.git", {branch, "develop"}}},
{eper, ".*", {git, "git://github.com/basho/eper.git", {tag, "0.92-basho1"}}},
{cluster_info, ".*", {git, "git://github.com/basho/cluster_info", {branch, "develop"}}}
]}.
{sub_dirs, ["rel", "apps/machi"]}.
{lib_dirs, ["apps/machi"]}.

View file

@ -8,15 +8,24 @@ case PulseBuild of
true ->
PulseOpts =
[{pulse_no_side_effect,
[{erlang,display,1}
]},
[{erlang,display,1},
{os,getenv,1},
{io,format,2},
{io,format,3}
]},
{pulse_side_effect,
[ {does_not_exist_yet, some_func, '_'}
, {machi_flu1_client, '_', '_'}
, {machi_projection_store, '_', '_'}
, {machi_proxy_flu1_client, '_', '_'}
, {machi_pb_translate, '_', '_'}
, {prim_file, '_', '_'}
, {file, '_', '_'}
, {filelib, '_', '_'}
, {os, '_', '_'} ]},
%% , {os, '_', '_'}
]},
{pulse_replace_module,
[ {gen_server, pulse_gen_server}

35
rel/files/app.config Normal file
View file

@ -0,0 +1,35 @@
[
{machi, [
%% Data directory for all FLUs.
{flu_data_dir, "{{platform_data_dir}}/flu"},
%% FLU config directory
{flu_config_dir, "{{platform_etc_dir}}/flu-config"},
%% Chain config directory
{chain_config_dir, "{{platform_etc_dir}}/chain-config"},
%% FLUs to start at app start.
%% This task has moved to machi_flu_sup and machi_lifecycle_mgr.
%% Number of metadata manager processes to run per FLU.
%% Default = 10
%% {metadata_manager_count, 2},
%% Default options for chain manager processes.
%% {chain_manager_opts, [{private_write_verbose,true},
%% {private_write_verbose_confirm,true}]},
%% Platform vars (mirror of reltool packaging)
{platform_data_dir, "{{platform_data_dir}}"},
{platform_etc_dir, "{{platform_etc_dir}}"},
%% Do not delete, do not put Machi config items after this line.
{final_comma_stopper, do_not_delete}
]
},
{lager, [
{error_logger_hwm, 5000} % lager's default of 50/sec is too low
]
}
].

84
rel/files/machi-admin Executable file
View file

@ -0,0 +1,84 @@
#!/bin/sh
# -*- tab-width:4;indent-tabs-mode:nil -*-
# ex: ts=4 sw=4 et
# Pull environment for this install
. "{{runner_base_dir}}/lib/env.sh"
# Make sure the user running this script is the owner and/or su to that user
check_user "$@"
ES=$?
if [ "$ES" -ne 0 ]; then
exit $ES
fi
# Keep track of where script was invoked
ORIGINAL_DIR=$(pwd)
# Make sure CWD is set to runner run dir
cd $RUNNER_BASE_DIR
# Identify the script name
SCRIPT=`basename $0`
usage() {
echo "Usage: $SCRIPT { quick-admin-check | quick-admin-apply | "
echo " top }"
}
case "$1" in
quick-admin-check)
# Make sure the local node IS running
node_up_check
shift
NODE_NAME=${NAME_ARG#* } # target machi server node name
IN_FILE="$1"
$ERTS_PATH/erl -noshell -noinput $NAME_PARAM machi_test$NAME_HOST $COOKIE_ARG \
-remsh $NODE_NAME \
-eval "Me = self(), spawn('"$NODE_NAME"', fun() -> X = (catch(machi_lifecycle_mgr:quick_admin_sanity_check(\"$IN_FILE\"))), Me ! {res, X} end), XX = receive {res, Res} -> Res after 10*1000 -> timeout end, io:format(user, \"Result: ~p\n\", [XX]), case XX of \
ok -> init:stop(); \
_ -> init:stop(1) \
end."
;;
quick-admin-apply)
# Make sure the local node IS running
node_up_check
shift
NODE_NAME=${NAME_ARG#* } # target machi server node name
IN_FILE="$1"
RELATIVE_HOST="$2"
$ERTS_PATH/erl -noshell -noinput $NAME_PARAM machi_test$NAME_HOST $COOKIE_ARG \
-remsh $NODE_NAME \
-eval "Me = self(), spawn('"$NODE_NAME"', fun() -> X = (catch(machi_lifecycle_mgr:quick_admin_apply(\"$IN_FILE\", \"$RELATIVE_HOST\"))), Me ! {res, X} end), XX = receive {res, Res} -> Res after 10*1000 -> timeout end, io:format(user, \"Result: ~p\n\", [XX]), case XX of \
ok -> init:stop(); \
_ -> init:stop(1) \
end."
;;
top)
# Make sure the local node IS running
node_up_check
shift
MYPID=$$
NODE_NAME=${NAME_ARG#* }
$ERTS_PATH/erl -noshell -noinput \
-pa $RUNNER_LIB_DIR/basho-patches \
-hidden $NAME_PARAM machi_etop$MYPID$NAME_HOST $COOKIE_ARG \
-s etop -s erlang halt -output text \
-node $NODE_NAME \
$* -tracing off
;;
*)
usage
exit 1
;;
esac

220
rel/files/machi.schema Normal file
View file

@ -0,0 +1,220 @@
%%-*- mode: erlang -*-
%% @doc Where to emit the default log messages (typically at 'info'
%% severity):
%% off: disabled
%% file: the file specified by log.console.file
%% console: to standard output (seen when using `machi attach-direct`)
%% both: log.console.file and standard out.
{mapping, "log.console", "lager.handlers", [
{default, {{console_log_default}} },
{datatype, {enum, [off, file, console, both]}}
]}.
%% @doc The severity level of the console log, default is 'info'.
{mapping, "log.console.level", "lager.handlers", [
{default, info},
{datatype, {enum, [debug, info, notice, warning, error, critical, alert, emergency, none]}}
]}.
%% @doc When 'log.console' is set to 'file' or 'both', the file where
%% console messages will be logged.
{mapping, "log.console.file", "lager.handlers", [
{default, "$(platform_log_dir)/console.log"},
{datatype, file}
]}.
%% @doc The file where error messages will be logged.
{mapping, "log.error.file", "lager.handlers", [
{default, "$(platform_log_dir)/error.log"},
{datatype, file}
]}.
%% @doc When set to 'on', enables log output to syslog.
{mapping, "log.syslog", "lager.handlers", [
{default, off},
{datatype, flag}
]}.
%% @doc When set to 'on', enables log output to syslog.
{mapping, "log.syslog.ident", "lager.handlers", [
{default, "machi"},
hidden
]}.
%% @doc Syslog facility to log entries from Riak.
{mapping, "log.syslog.facility", "lager.handlers", [
{default, daemon},
{datatype, {enum,[kern, user, mail, daemon, auth, syslog,
lpr, news, uucp, clock, authpriv, ftp,
cron, local0, local1, local2, local3,
local4, local5, local6, local7]}},
hidden
]}.
%% @doc The severity level at which to log entries to syslog, default is 'info'.
{mapping, "log.syslog.level", "lager.handlers", [
{default, info},
{datatype, {enum, [debug, info, notice, warning, error, critical, alert, emergency, none]}},
hidden
]}.
{translation,
"lager.handlers",
fun(Conf) ->
SyslogHandler = case cuttlefish:conf_get("log.syslog", Conf) of
true ->
Ident = cuttlefish:conf_get("log.syslog.ident", Conf),
Facility = cuttlefish:conf_get("log.syslog.facility", Conf),
LogLevel = cuttlefish:conf_get("log.syslog.level", Conf),
[{lager_syslog_backend, [Ident, Facility, LogLevel]}];
_ -> []
end,
ErrorHandler = case cuttlefish:conf_get("log.error.file", Conf) of
undefined -> [];
ErrorFilename -> [{lager_file_backend, [{file, ErrorFilename},
{level, error},
{size, 10485760},
{date, "$D0"},
{count, 5}]}]
end,
ConsoleLogLevel = cuttlefish:conf_get("log.console.level", Conf),
ConsoleLogFile = cuttlefish:conf_get("log.console.file", Conf),
ConsoleHandler = {lager_console_backend, ConsoleLogLevel},
ConsoleFileHandler = {lager_file_backend, [{file, ConsoleLogFile},
{level, ConsoleLogLevel},
{size, 10485760},
{date, "$D0"},
{count, 5}]},
ConsoleHandlers = case cuttlefish:conf_get("log.console", Conf) of
off -> [];
file -> [ConsoleFileHandler];
console -> [ConsoleHandler];
both -> [ConsoleHandler, ConsoleFileHandler];
_ -> []
end,
SyslogHandler ++ ConsoleHandlers ++ ErrorHandler
end
}.
%% @doc Whether to enable Erlang's built-in error logger.
{mapping, "sasl", "sasl.sasl_error_logger", [
{default, off},
{datatype, flag},
hidden
]}.
%% @doc Whether to enable the crash log.
{mapping, "log.crash", "lager.crash_log", [
{default, on},
{datatype, flag}
]}.
%% @doc If the crash log is enabled, the file where its messages will
%% be written.
{mapping, "log.crash.file", "lager.crash_log", [
{default, "$(platform_log_dir)/crash.log"},
{datatype, file}
]}.
{translation,
"lager.crash_log",
fun(Conf) ->
case cuttlefish:conf_get("log.crash", Conf) of
false -> undefined;
_ ->
cuttlefish:conf_get("log.crash.file", Conf, "{{platform_log_dir}}/crash.log")
end
end}.
%% @doc Maximum size in bytes of individual messages in the crash log
{mapping, "log.crash.maximum_message_size", "lager.crash_log_msg_size", [
{default, "64KB"},
{datatype, bytesize}
]}.
%% @doc Maximum size of the crash log in bytes, before it is rotated
{mapping, "log.crash.size", "lager.crash_log_size", [
{default, "10MB"},
{datatype, bytesize}
]}.
%% @doc The schedule on which to rotate the crash log. For more
%% information see:
%% https://github.com/basho/lager/blob/master/README.md#internal-log-rotation
{mapping, "log.crash.rotation", "lager.crash_log_date", [
{default, "$D0"}
]}.
%% @doc The number of rotated crash logs to keep. When set to
%% 'current', only the current open log file is kept.
{mapping, "log.crash.rotation.keep", "lager.crash_log_count", [
{default, 5},
{datatype, [integer, {atom, current}]},
{validators, ["rotation_count"]}
]}.
{validator,
"rotation_count",
"must be 'current' or a positive integer",
fun(current) -> true;
(Int) when is_integer(Int) andalso Int >= 0 -> true;
(_) -> false
end}.
{translation,
"lager.crash_log_count",
fun(Conf) ->
case cuttlefish:conf_get("log.crash.rotation.keep", Conf) of
current -> 0;
Int -> Int
end
end}.
%% @doc Whether to redirect error_logger messages into lager -
%% defaults to true
{mapping, "log.error.redirect", "lager.error_logger_redirect", [
{default, on},
{datatype, flag},
hidden
]}.
%% @doc Maximum number of error_logger messages to handle in a second
{mapping, "log.error.messages_per_second", "lager.error_logger_hwm", [
{default, 100},
{datatype, integer},
hidden
]}.
%% @doc Cookie for distributed node communication. All nodes in the
%% same cluster should use the same cookie or they will not be able to
%% communicate.
{mapping, "distributed_cookie", "vm_args.-setcookie", [
{default, "machi"}
]}.
%% override zdbbl from 1mb to 32mb
{mapping, "erlang.distribution_buffer_size", "vm_args.+zdbbl", [
{default, "32MB"},
merge
]}.
%% VM scheduler collapse, part 1 of 2
{mapping, "erlang.schedulers.force_wakeup_interval", "vm_args.+sfwi", [
{default, 500},
{datatype, integer},
merge
]}.
%% VM scheduler collapse, part 2 of 2
{mapping, "erlang.schedulers.compaction_of_load", "vm_args.+scl", [
{default, false},
merge
]}.

27
rel/files/vm.args Normal file
View file

@ -0,0 +1,27 @@
## Name of the riak node
-name {{node}}
## Cookie for distributed erlang. All nodes in the same cluster
## should use the same cookie or they will not be able to communicate.
-setcookie machi
## Heartbeat management; auto-restarts VM if it dies or becomes unresponsive
## (Disabled by default..use with caution!)
##-heart
## Enable kernel poll and a few async threads
+K true
+A 64
## Treat error_logger warnings as warnings
+W w
## Increase number of concurrent ports/sockets
-env ERL_MAX_PORTS 4096
## ## Tweak GC to run more often
## -env ERL_FULLSWEEP_AFTER 0
## Set the location of crash dumps
-env ERL_CRASH_DUMP {{crash_dump}}

16
rel/gen_dev Executable file
View file

@ -0,0 +1,16 @@
#! /bin/sh
#
# Example usage: gen_dev dev4 vars.src vars
#
# Generate an overlay config for devNNN from vars.src and write to vars
#
NAME=$1
TEMPLATE=$2
VARFILE=$3
NODE="$NAME@127.0.0.1"
echo "Generating $NAME - node='$NODE'"
sed -e "s/@NODE@/$NODE/" \
< $TEMPLATE > $VARFILE

4
rel/rebar.config Normal file
View file

@ -0,0 +1,4 @@
{lib_dirs, ["../deps"]}.
%% {plugin_dir, "../deps/cuttlefish/src"}.
%% {plugins, [cuttlefish_rebar_plugin]}.
%% {cuttlefish_filename, "machi.conf"}.

113
rel/reltool.config Normal file
View file

@ -0,0 +1,113 @@
%% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*-
%% ex: ft=erlang ts=4 sw=4 et
{sys, [
{lib_dirs, ["../deps"]},
{rel, "machi", "0.0.0",
[
kernel,
stdlib,
lager,
sasl,
public_key,
ssl,
%% riak_sysmon,
%% os_mon,
crypto,
runtime_tools,
machi
%% cluster_info,
%% exometer_core,
]},
{rel, "start_clean", "",
[
kernel,
stdlib
]},
{boot_rel, "machi"},
{profile, embedded},
{excl_sys_filters, ["^bin/.*",
"^erts.*/bin/(dialyzer|typer)",
"^erts.*/doc",
"^erts.*/man"]},
{excl_archive_filters, [".*"]},
%% {app, cuttlefish, [{incl_cond, include}]},
%% {app, cluster_info, [{incl_cond, include}]},
{app, eper, [{incl_cond, include}]},
{app, sasl, [{incl_cond, include}]},
%% {app, syslog, [{incl_cond, include}]},
%% {app, lager_syslog, [{incl_cond, include}]},
{app, lager, [{incl_cond, include}]}
%% {app, exometer_core, [{incl_cond, include}]},
]}.
{target_dir, "machi"}.
{overlay_vars, "vars.config"}.
{overlay, [
{mkdir, "data"},
{mkdir, "data/^PRESERVE"},
{mkdir, "log"},
%% Copy base files for starting and interacting w/ node
{copy, "../deps/node_package/priv/base/erl",
"{{erts_vsn}}/bin/erl"},
{copy, "../deps/node_package/priv/base/nodetool",
"{{erts_vsn}}/bin/nodetool"},
%% {copy, "../deps/cuttlefish/cuttlefish",
%% "{{erts_vsn}}/bin/cuttlefish"},
{template, "../deps/node_package/priv/base/runner",
"bin/machi"},
{template, "../deps/node_package/priv/base/env.sh",
"lib/env.sh"},
{template, "../deps/node_package/priv/base/app_epath.sh",
"lib/app_epath.sh"},
%% Copy config files
%% Cuttlefish Schema Files have a priority order.
%% Anything in a file prefixed with 00- will override
%% anything in a file with a higher numbered prefix.
%% Please only use 0[0-9]-*.schema for development purposes
%% NOTHING PERMANENT
%% {template, "files/riak.schema", "lib/10-riak.schema"},
%% {template, "../deps/cuttlefish/priv/erlang_vm.schema", "lib/11-erlang_vm.schema"},
%% {template, "../deps/riak_core/priv/riak_core.schema", "lib/12-riak_core.schema"},
%% {template, "../deps/riak_api/priv/riak_api.schema", "lib/13-riak_api.schema"},
%% {template, "../deps/riak_kv/priv/riak_kv.schema", "lib/14-riak_kv.schema"},
%% {template, "../deps/riak_sysmon/priv/riak_sysmon.schema", "lib/15-riak_sysmon.schema"},
%% {template, "../deps/bitcask/priv/bitcask.schema", "lib/16-bitcask.schema"},
%% {template, "../deps/bitcask/priv/bitcask_multi.schema", "lib/17-bitcask_multi.schema"},
%% {template, "../deps/riak_control/priv/riak_control.schema", "lib/18-riak_control.schema"},
%% {template, "../deps/riak_kv/priv/multi_backend.schema", "lib/20-multi_backend.schema"},
%% {template, "../deps/eleveldb/priv/eleveldb.schema", "lib/21-leveldb.schema"},
%% {template, "../deps/eleveldb/priv/eleveldb_multi.schema", "lib/22-leveldb_multi.schema"},
%% {template, "../deps/yokozuna/priv/yokozuna.schema", "lib/30-yokozuna.schema"},
%% Copy additional bin scripts
{template, "files/machi-admin", "bin/machi-admin"},
{template, "files/vm.args", "etc/vm.args"},
{template, "files/app.config", "etc/app.config"},
{mkdir, "etc/chain-config"},
{mkdir, "etc/flu-config"},
{mkdir, "etc/pending"},
{mkdir, "etc/rejected"},
%% Experiment: quick-admin
{mkdir, "etc/quick-admin-archive"},
{mkdir, "priv"},
{mkdir, "priv/quick-admin-examples"},
{copy, "../priv/quick-admin-examples/000", "priv/quick-admin-examples"},
{copy, "../priv/quick-admin-examples/001", "priv/quick-admin-examples"},
{copy, "../priv/quick-admin-examples/002", "priv/quick-admin-examples"},
{copy, "../priv/quick-admin-examples/demo-000", "priv/quick-admin-examples/demo-000"},
{mkdir, "lib/basho-patches"}
%% {copy, "../apps/machi/ebin/etop_txt.beam", "lib/basho-patches"}
]}.

48
rel/vars.config Normal file
View file

@ -0,0 +1,48 @@
%% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*-
%% ex: ft=erlang ts=4 sw=4 et
%% NOTE: When modifying this file, also keep its near cousin
%% config file rel/vars/dev_vars.config.src in sync!
%% Platform-specific installation paths
{platform_bin_dir, "./bin"}.
{platform_data_dir, "./data"}.
{platform_etc_dir, "./etc"}.
{platform_lib_dir, "./lib"}.
{platform_log_dir, "./log"}.
%%
%% etc/app.config
%%
{sasl_error_log, "{{platform_log_dir}}/sasl-error.log"}.
{sasl_log_dir, "{{platform_log_dir}}/sasl"}.
%% lager
{console_log_default, file}.
%%
%% etc/vm.args
%%
{node, "machi@127.0.0.1"}.
{crash_dump, "{{platform_log_dir}}/erl_crash.dump"}.
%%
%% bin/machi
%%
{runner_script_dir, "\`cd \\`dirname $0\\` 1>/dev/null && /bin/pwd\`"}.
{runner_base_dir, "{{runner_script_dir}}/.."}.
{runner_etc_dir, "$RUNNER_BASE_DIR/etc"}.
{runner_log_dir, "$RUNNER_BASE_DIR/log"}.
{runner_lib_dir, "$RUNNER_BASE_DIR/lib"}.
{runner_patch_dir, "$RUNNER_BASE_DIR/lib/basho-patches"}.
{pipe_dir, "/tmp/$RUNNER_BASE_DIR/"}.
{runner_user, ""}.
{runner_wait_process, "machi_flu_sup"}.
{runner_ulimit_warn, 65536}.
%%
%% cuttlefish
%%
{cuttlefish, ""}. % blank = off
{cuttlefish_conf, "machi.conf"}.

View file

@ -0,0 +1,48 @@
%% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*-
%% ex: ft=erlang ts=4 sw=4 et
%% NOTE: When modifying this file, also keep its near cousin
%% config file rel/vars/dev_vars.config.src in sync!
%% Platform-specific installation paths
{platform_bin_dir, "./bin"}.
{platform_data_dir, "./data"}.
{platform_etc_dir, "./etc"}.
{platform_lib_dir, "./lib"}.
{platform_log_dir, "./log"}.
%%
%% etc/app.config
%%
{sasl_error_log, "{{platform_log_dir}}/sasl-error.log"}.
{sasl_log_dir, "{{platform_log_dir}}/sasl"}.
%% lager
{console_log_default, file}.
%%
%% etc/vm.args
%%
{node, "@NODE@"}.
{crash_dump, "{{platform_log_dir}}/erl_crash.dump"}.
%%
%% bin/machi
%%
{runner_script_dir, "\`cd \\`dirname $0\\` 1>/dev/null && /bin/pwd\`"}.
{runner_base_dir, "{{runner_script_dir}}/.."}.
{runner_etc_dir, "$RUNNER_BASE_DIR/etc"}.
{runner_log_dir, "$RUNNER_BASE_DIR/log"}.
{runner_lib_dir, "$RUNNER_BASE_DIR/lib"}.
{runner_patch_dir, "$RUNNER_BASE_DIR/lib/basho-patches"}.
{pipe_dir, "/tmp/$RUNNER_BASE_DIR/"}.
{runner_user, ""}.
{runner_wait_process, "machi_flu_sup"}.
{runner_ulimit_warn, 65536}.
%%
%% cuttlefish
%%
{cuttlefish, ""}. % blank = off
{cuttlefish_conf, "machi.conf"}.

View file

@ -1,13 +1,10 @@
{application, machi, [
{description, "A village of write-once files."},
{vsn, "0.0.0"},
{applications, [kernel, stdlib, sasl, crypto]},
{vsn, "0.0.1"},
{applications, [kernel, stdlib, crypto, cluster_info, ranch]},
{mod,{machi_app,[]}},
{registered, []},
{env, [
{flu_list,
[
%%%%%% {flu_a, 32900, "./data.flu_a"}
]}
%% Don't use this static env for defaults, or we will fall into config hell.
]}
]}.

View file

@ -40,12 +40,18 @@ enum Mpb_GeneralStatusCode {
BAD_ARG = 1;
WEDGED = 2;
BAD_CHECKSUM = 3;
/*
** There is no timout error code, only PARTITION. If the client
** wants to know if a lot of time has elapsed, then the client
** can do its own timekeeping.
*/
PARTITION = 4;
NOT_WRITTEN = 5;
WRITTEN = 6;
NO_SUCH_FILE = 7;
PARTIAL_READ = 8;
BAD_EPOCH = 9;
TRIMMED = 7; // The whole file was trimmed
NO_SUCH_FILE = 8;
PARTIAL_READ = 9;
BAD_EPOCH = 10;
BAD_JOSS = 255; // Only for testing by the Taipan
}
@ -81,6 +87,14 @@ message Mpb_ChunkCSum {
optional bytes csum = 2;
}
message Mpb_Chunk {
required uint64 offset = 1;
required string file_name = 2;
required bytes chunk = 3;
// TODO: must be required, in future?
optional Mpb_ChunkCSum csum = 4;
}
// epoch_id() type
message Mpb_EpochID {
required uint32 epoch_number = 1;
@ -125,6 +139,7 @@ message Mpb_ErrorResp {
// append_chunk() : Mpb_AppendChunkReq and Mpb_AppendChunkResp
// write_chunk() : Mpb_WriteChunkReq and Mpb_WriteChunkResp
// read_chunk() : Mpb_ReadChunkReq and Mpb_ReadChunkResp
// trim_chunk() : Mpb_TrimChunkReq and Mpb_TrimChunkResp
// checksum_list() : Mpb_ChecksumListReq and Mpb_ChecksumListResp
// list_files() : Mpb_ListFilesReq and Mpb_ListFilesResp
//
@ -155,11 +170,18 @@ message Mpb_AuthResp {
// High level API: append_chunk() request & response
message Mpb_AppendChunkReq {
optional bytes placement_key = 1;
required string prefix = 2;
required bytes chunk = 3;
required Mpb_ChunkCSum csum = 4;
optional uint32 chunk_extra = 5;
// General namespace arguments
/* In single chain/non-clustered environment, use namespace="" */
required string namespace = 1;
required string prefix = 10;
required bytes chunk = 11;
required Mpb_ChunkCSum csum = 12;
optional uint32 chunk_extra = 20;
optional string preferred_file_name = 21;
/* Fail the operation if our preferred file name is not available */
optional bool flag_fail_preferred = 22 [default=false];
}
message Mpb_AppendChunkResp {
@ -171,10 +193,7 @@ message Mpb_AppendChunkResp {
// High level API: write_chunk() request & response
message Mpb_WriteChunkReq {
required string file = 1;
required uint64 offset = 2;
required bytes chunk = 3;
required Mpb_ChunkCSum csum = 4;
required Mpb_Chunk chunk = 10;
}
message Mpb_WriteChunkResp {
@ -184,22 +203,38 @@ message Mpb_WriteChunkResp {
// High level API: read_chunk() request & response
message Mpb_ReadChunkReq {
required string file = 1;
required uint64 offset = 2;
required uint32 size = 3;
// No namespace arguments are required because NS is embedded
// inside of the file name.
// Use flag_checksum=non-zero to request the chunk's checksum also
optional uint32 flag_checksum = 4 [default=0];
required Mpb_ChunkPos chunk_pos = 10;
// Use flag_no_checksum=non-zero to skip returning the chunk's checksum.
// TODO: not implemented yet.
optional bool flag_no_checksum = 20 [default=false];
// Use flag_no_chunk=non-zero to skip returning the chunk (which
// only makes sense if flag_checksum is set).
optional uint32 flag_no_chunk = 5 [default=0];
// only makes sense if flag_no_checksum is not set).
// TODO: not implemented yet.
optional bool flag_no_chunk = 21 [default=false];
// TODO: not implemented yet.
optional bool flag_needs_trimmed = 22 [default=false];
}
message Mpb_ReadChunkResp {
required Mpb_GeneralStatusCode status = 1;
optional bytes chunk = 2;
optional Mpb_ChunkCSum csum = 3;
repeated Mpb_Chunk chunks = 2;
repeated Mpb_ChunkPos trimmed = 3;
}
// High level API: trim_chunk() request & response
message Mpb_TrimChunkReq {
required Mpb_ChunkPos chunk_pos = 1;
}
message Mpb_TrimChunkResp {
required Mpb_GeneralStatusCode status = 1;
}
// High level API: checksum_list() request & response
@ -219,6 +254,8 @@ message Mpb_ChecksumListResp {
// High level API: list_files() request & response
message Mpb_ListFilesReq {
// TODO: Add flag for file glob/regexp/other filter type
// TODO: What else could go wrong?
}
message Mpb_ListFilesResp {
@ -251,8 +288,9 @@ message Mpb_Request {
optional Mpb_AppendChunkReq append_chunk = 112;
optional Mpb_WriteChunkReq write_chunk = 113;
optional Mpb_ReadChunkReq read_chunk = 114;
optional Mpb_ChecksumListReq checksum_list = 115;
optional Mpb_ListFilesReq list_files = 116;
optional Mpb_TrimChunkReq trim_chunk = 115;
optional Mpb_ChecksumListReq checksum_list = 116;
optional Mpb_ListFilesReq list_files = 117;
}
message Mpb_Response {
@ -274,8 +312,9 @@ message Mpb_Response {
optional Mpb_AppendChunkResp append_chunk = 12;
optional Mpb_WriteChunkResp write_chunk = 13;
optional Mpb_ReadChunkResp read_chunk = 14;
optional Mpb_ChecksumListResp checksum_list = 15;
optional Mpb_ListFilesResp list_files = 16;
optional Mpb_TrimChunkResp trim_chunk = 15;
optional Mpb_ChecksumListResp checksum_list = 16;
optional Mpb_ListFilesResp list_files = 17;
}
//////////////////////////////////////////
@ -303,14 +342,14 @@ message Mpb_ProjectionV1 {
required uint32 epoch_number = 1;
required bytes epoch_csum = 2;
required string author_server = 3;
repeated string all_members = 4;
required Mpb_Now creation_time = 5;
required Mpb_Mode mode = 6;
repeated string upi = 7;
repeated string repairing = 8;
repeated string down = 9;
optional bytes opaque_flap = 10;
optional bytes opaque_inner = 11;
required string chain_name = 4;
repeated string all_members = 5;
repeated string witnesses = 6;
required Mpb_Now creation_time = 7;
required Mpb_Mode mode = 8;
repeated string upi = 9;
repeated string repairing = 10;
repeated string down = 11;
required bytes opaque_dbg = 12;
required bytes opaque_dbg2 = 13;
repeated Mpb_MembersDictEntry members_dict = 14;
@ -328,6 +367,7 @@ message Mpb_ProjectionV1 {
// append_chunk()
// write_chunk()
// read_chunk()
// trim_chunk()
// checksum_list()
// list_files()
// wedge_status()
@ -348,12 +388,20 @@ message Mpb_ProjectionV1 {
// Low level API: append_chunk()
message Mpb_LL_AppendChunkReq {
required Mpb_EpochID epoch_id = 1;
optional bytes placement_key = 2;
required string prefix = 3;
required bytes chunk = 4;
required Mpb_ChunkCSum csum = 5;
optional uint32 chunk_extra = 6;
// General namespace arguments
required uint32 namespace_version = 1;
required string namespace = 2;
required uint32 locator = 3;
required Mpb_EpochID epoch_id = 10;
required string prefix = 11;
required bytes chunk = 12;
required Mpb_ChunkCSum csum = 13;
optional uint32 chunk_extra = 20;
optional string preferred_file_name = 21;
/* Fail the operation if our preferred file name is not available */
optional bool flag_fail_preferred = 22 [default=false];
}
message Mpb_LL_AppendChunkResp {
@ -365,11 +413,12 @@ message Mpb_LL_AppendChunkResp {
// Low level API: write_chunk()
message Mpb_LL_WriteChunkReq {
required Mpb_EpochID epoch_id = 1;
required string file = 2;
required uint64 offset = 3;
required bytes chunk = 4;
required Mpb_ChunkCSum csum = 5;
// General namespace arguments
required uint32 namespace_version = 1;
required string namespace = 2;
required Mpb_EpochID epoch_id = 10;
required Mpb_Chunk chunk = 11;
}
message Mpb_LL_WriteChunkResp {
@ -379,30 +428,54 @@ message Mpb_LL_WriteChunkResp {
// Low level API: read_chunk()
message Mpb_LL_ReadChunkReq {
required Mpb_EpochID epoch_id = 1;
required string file = 2;
required uint64 offset = 3;
required uint32 size = 4;
// General namespace arguments
required uint32 namespace_version = 1;
required string namespace = 2;
// Use flag_checksum=non-zero to request the chunk's checksum also
optional uint32 flag_get_checksum = 5 [default=0];
required Mpb_EpochID epoch_id = 10;
required Mpb_ChunkPos chunk_pos = 11;
// Use flag_no_checksum=non-zero to skip returning the chunk's checksum.
// TODO: not implemented yet.
optional bool flag_no_checksum = 20 [default=false];
// Use flag_no_chunk=non-zero to skip returning the chunk (which
// only makes sense if flag_checksum is set).
optional uint32 flag_no_chunk = 6 [default=0];
// only makes sense if flag_checksum is not set).
// TODO: not implemented yet.
optional bool flag_no_chunk = 21 [default=false];
optional bool flag_needs_trimmed = 22 [default=false];
}
message Mpb_LL_ReadChunkResp {
required Mpb_GeneralStatusCode status = 1;
optional bytes chunk = 2;
optional Mpb_ChunkCSum csum = 3;
repeated Mpb_Chunk chunks = 2;
repeated Mpb_ChunkPos trimmed = 3;
}
// Low level API: trim_chunk()
message Mpb_LL_TrimChunkReq {
// General namespace arguments
required uint32 namespace_version = 1;
required string namespace = 2;
required Mpb_EpochID epoch_id = 10;
required string file = 11;
required uint64 offset = 12;
required uint32 size = 13;
optional bool trigger_gc = 20 [default=false];
}
message Mpb_LL_TrimChunkResp {
required Mpb_GeneralStatusCode status = 1;
}
// Low level API: checksum_list()
message Mpb_LL_ChecksumListReq {
required Mpb_EpochID epoch_id = 1;
required string file = 2;
required string file = 1;
}
message Mpb_LL_ChecksumListResp {
@ -433,7 +506,9 @@ message Mpb_LL_WedgeStatusReq {
message Mpb_LL_WedgeStatusResp {
required Mpb_GeneralStatusCode status = 1;
optional Mpb_EpochID epoch_id = 2;
optional uint32 wedged_flag = 3;
optional bool wedged_flag = 3;
optional uint32 namespace_version = 4;
optional string namespace = 5;
}
// Low level API: delete_migration()
@ -525,6 +600,11 @@ message Mpb_LL_ListAllProjectionsResp {
repeated uint32 epochs = 2;
}
// Low level API: kick_projection_reaction request, NO RESPONSE!
message Mpb_LL_KickProjectionReactionReq {
}
//////////////////////////////////////////
//
// Low API request & response wrapper
@ -552,15 +632,17 @@ message Mpb_LL_Request {
optional Mpb_LL_WriteProjectionReq proj_wp = 15;
optional Mpb_LL_GetAllProjectionsReq proj_ga = 16;
optional Mpb_LL_ListAllProjectionsReq proj_la = 17;
optional Mpb_LL_KickProjectionReactionReq proj_kp = 18;
optional Mpb_LL_AppendChunkReq append_chunk = 30;
optional Mpb_LL_WriteChunkReq write_chunk = 31;
optional Mpb_LL_ReadChunkReq read_chunk = 32;
optional Mpb_LL_ChecksumListReq checksum_list = 33;
optional Mpb_LL_ListFilesReq list_files = 34;
optional Mpb_LL_WedgeStatusReq wedge_status = 35;
optional Mpb_LL_DeleteMigrationReq delete_migration = 36;
optional Mpb_LL_TruncHackReq trunc_hack = 37;
optional Mpb_LL_TrimChunkReq trim_chunk = 33;
optional Mpb_LL_ChecksumListReq checksum_list = 34;
optional Mpb_LL_ListFilesReq list_files = 35;
optional Mpb_LL_WedgeStatusReq wedge_status = 36;
optional Mpb_LL_DeleteMigrationReq delete_migration = 37;
optional Mpb_LL_TruncHackReq trunc_hack = 38;
}
message Mpb_LL_Response {
@ -585,13 +667,15 @@ message Mpb_LL_Response {
optional Mpb_LL_WriteProjectionResp proj_wp = 15;
optional Mpb_LL_GetAllProjectionsResp proj_ga = 16;
optional Mpb_LL_ListAllProjectionsResp proj_la = 17;
// No reponse to Mpb_LL_KickProjectionReactionReq = 18;
optional Mpb_LL_AppendChunkResp append_chunk = 30;
optional Mpb_LL_WriteChunkResp write_chunk = 31;
optional Mpb_LL_ReadChunkResp read_chunk = 32;
optional Mpb_LL_ChecksumListResp checksum_list = 33;
optional Mpb_LL_ListFilesResp list_files = 34;
optional Mpb_LL_WedgeStatusResp wedge_status = 35;
optional Mpb_LL_DeleteMigrationResp delete_migration = 36;
optional Mpb_LL_TruncHackResp trunc_hack = 37;
optional Mpb_LL_TrimChunkResp trim_chunk = 33;
optional Mpb_LL_ChecksumListResp checksum_list = 34;
optional Mpb_LL_ListFilesResp list_files = 35;
optional Mpb_LL_WedgeStatusResp wedge_status = 36;
optional Mpb_LL_DeleteMigrationResp delete_migration = 37;
optional Mpb_LL_TruncHackResp trunc_hack = 38;
}

View file

@ -73,8 +73,13 @@ verify_file_checksums_local2(Sock1, EpochID, Path0) ->
{ok, FH} ->
File = re:replace(Path, ".*/", "", [{return, binary}]),
try
ReadChunk = fun(_File, Offset, Size) ->
file:pread(FH, Offset, Size)
ReadChunk = fun(F, Offset, Size) ->
case file:pread(FH, Offset, Size) of
{ok, Bin} ->
{ok, {[{F, Offset, Bin, undefined}], []}};
Err ->
Err
end
end,
verify_file_checksums_common(Sock1, EpochID, File, ReadChunk)
after
@ -85,17 +90,18 @@ verify_file_checksums_local2(Sock1, EpochID, Path0) ->
end.
verify_file_checksums_remote2(Sock1, EpochID, File) ->
NSInfo = undefined,
ReadChunk = fun(File_name, Offset, Size) ->
?FLU_C:read_chunk(Sock1, EpochID,
File_name, Offset, Size)
?FLU_C:read_chunk(Sock1, NSInfo, EpochID,
File_name, Offset, Size, undefined)
end,
verify_file_checksums_common(Sock1, EpochID, File, ReadChunk).
verify_file_checksums_common(Sock1, EpochID, File, ReadChunk) ->
verify_file_checksums_common(Sock1, _EpochID, File, ReadChunk) ->
try
case ?FLU_C:checksum_list(Sock1, EpochID, File) of
case ?FLU_C:checksum_list(Sock1, File) of
{ok, InfoBin} ->
{Info, _} = machi_flu1:split_checksum_list_blob_decode(InfoBin),
Info = machi_csum_table:split_checksum_list_blob_decode(InfoBin),
Res = lists:foldl(verify_chunk_checksum(File, ReadChunk),
[], Info),
{ok, Res};
@ -110,9 +116,11 @@ verify_file_checksums_common(Sock1, EpochID, File, ReadChunk) ->
end.
verify_chunk_checksum(File, ReadChunk) ->
fun({Offset, Size, <<_Tag:1/binary, CSum/binary>>}, Acc) ->
fun({0, ?MINIMUM_OFFSET, none}, []) ->
[];
({Offset, Size, <<_Tag:1/binary, CSum/binary>>}, Acc) ->
case ReadChunk(File, Offset, Size) of
{ok, Chunk} ->
{ok, {[{_, Offset, Chunk, _}], _}} ->
CSum2 = machi_util:checksum_chunk(Chunk),
if CSum == CSum2 ->
Acc;

View file

@ -27,16 +27,17 @@
-behaviour(application).
-ifdef(PULSE).
-compile({parse_transform, pulse_instrument}).
-include_lib("pulse_otp/include/pulse_otp.hrl").
-endif.
%% Application callbacks
-export([start/2, stop/1]).
start(_StartType, _StartArgs) ->
case machi_sup:start_link() of
{ok, Pid} ->
{ok, Pid};
Error ->
Error
end.
machi_cinfo:register(),
machi_sup:start_link().
stop(_State) ->
ok.

View file

@ -1,6 +1,6 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%% Copyright (c) 2007-2016 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
@ -43,23 +43,25 @@
%% could add new entries to this ETS table.
%%
%% Now we can use various integer-centric key generators that are
%% already bundled with basho_bench.
%% already bundled with basho_bench. NOTE: this scheme does not allow
%% mixing of 'append' and 'read' operations in the same config. Basho
%% Bench does not support different key generators for different
%% operations, unfortunately. The work-around is to run two different
%% Basho Bench instances: on for 'append' ops with a key generator for
%% the desired prefix(es), and the other for 'read' ops with an
%% integer key generator.
%%
%% TODO: Add CRC checking, when feasible and when supported on the
%% server side.
%%
%% TODO: As an alternate idea, if we know that the chunks written are
%% always the same size, and if we don't care about CRC checking, then
%% all we need to know are the file names &amp; file sizes on the server:
%% we can then pick any valid offset within that file. That would
%% certainly be more scalable than the zillion-row-ETS-table, which is
%% definitely RAM-hungry.
%% TODO: The 'read' operator will always read chunks at exactly the
%% byte offset & size as the original append/write ops. If reads are
%% desired at any arbitrary offset & size, then a new strategy is
%% required.
-module(machi_basho_bench_driver).
-export([new/1, run/4]).
-record(m, {
id,
conn,
max_key
}).
@ -81,7 +83,7 @@ new(Id) ->
{read_concurrency, true}]),
ets:insert(ETS, {max_key, 0}),
ets:insert(ETS, {total_bytes, 0}),
MaxKeys = load_ets_table(Conn, ETS),
MaxKeys = load_ets_table_maybe(Conn, ETS),
?INFO("Key preload: finished, ~w keys loaded", [MaxKeys]),
Bytes = ets:lookup_element(ETS, total_bytes, 2),
?INFO("Key preload: finished, chunk list specifies ~s MBytes of chunks",
@ -90,12 +92,14 @@ new(Id) ->
true ->
ok
end,
{ok, #m{conn=Conn}}.
{ok, #m{id=Id, conn=Conn}}.
run(append, KeyGen, ValueGen, #m{conn=Conn}=S) ->
Prefix = KeyGen(),
Value = ValueGen(),
case machi_cr_client:append_chunk(Conn, Prefix, Value, ?THE_TIMEOUT) of
CSum = machi_util:make_client_csum(Value),
AppendOpts = {append_opts,0,undefined,false}, % HACK FIXME
case machi_cr_client:append_chunk(Conn, undefined, Prefix, Value, CSum, AppendOpts, ?THE_TIMEOUT) of
{ok, Pos} ->
EtsKey = ets:update_counter(?ETS_TAB, max_key, 1),
true = ets:insert(?ETS_TAB, {EtsKey, Pos}),
@ -112,9 +116,26 @@ run(read, KeyGen, _ValueGen, #m{conn=Conn, max_key=MaxKey}=S) ->
Idx = KeyGen() rem MaxKey,
%% {File, Offset, Size, _CSum} = ets:lookup_element(?ETS_TAB, Idx, 2),
{File, Offset, Size} = ets:lookup_element(?ETS_TAB, Idx, 2),
case machi_cr_client:read_chunk(Conn, File, Offset, Size, ?THE_TIMEOUT) of
{ok, _Chunk} ->
{ok, S};
ReadOpts = {read_opts,false,false,false}, % HACK FIXME
case machi_cr_client:read_chunk(Conn, undefined, File, Offset, Size, ReadOpts, ?THE_TIMEOUT) of
{ok, {Chunks, _Trimmed}} ->
%% io:format(user, "Chunks ~P\n", [Chunks, 15]),
%% {ok, S};
case lists:all(fun({File2, Offset2, Chunk, CSum}) ->
{_Tag, CS} = machi_util:unmake_tagged_csum(CSum),
CS2 = machi_util:checksum_chunk(Chunk),
if CS == CS2 ->
true;
CS /= CS2 ->
?ERROR("Client-side checksum error for file ~p offset ~p expected ~p got ~p\n", [File2, Offset2, CS, CS2]),
false
end
end, Chunks) of
true ->
{ok, S};
false ->
{error, bad_checksum, S}
end;
{error, _}=Err ->
?ERROR("read file ~p offset ~w size ~w: ~w\n",
[File, Offset, Size, Err]),
@ -132,21 +153,40 @@ find_server_info(_Id) ->
Ps
end.
load_ets_table_maybe(Conn, ETS) ->
case basho_bench_config:get(operations, undefined) of
undefined ->
?ERROR("The 'operations' key is missing from the config file, aborting", []),
exit(bad_config);
Ops when is_list(Ops) ->
case lists:keyfind(read, 1, Ops) of
{read,_} ->
load_ets_table(Conn, ETS);
false ->
?INFO("No 'read' op in the 'operations' list ~p, skipping ETS table load.", [Ops]),
0
end
end.
load_ets_table(Conn, ETS) ->
{ok, Fs} = machi_cr_client:list_files(Conn),
[begin
{ok, InfoBin} = machi_cr_client:checksum_list(Conn, File),
{PosList, _} = machi_flu1:split_checksum_list_blob_decode(InfoBin),
{ok, InfoBin} = machi_cr_client:checksum_list(Conn, File, ?THE_TIMEOUT),
PosList = machi_csum_table:split_checksum_list_blob_decode(InfoBin),
?INFO("File ~s len PosList ~p\n", [File, length(PosList)]),
StartKey = ets:update_counter(ETS, max_key, 0),
%% _EndKey = lists:foldl(fun({Off,Sz,CSum}, K) ->
%% V = {File, Off, Sz, CSum},
{_, Bytes} = lists:foldl(fun({Off,Sz,_CSum}, {K, Bs}) ->
V = {File, Off, Sz},
ets:insert(ETS, {K, V}),
{K + 1, Bs + Sz}
end, {StartKey, 0}, PosList),
ets:update_counter(ETS, max_key, length(PosList)),
ets:update_counter(ETS, total_bytes, Bytes)
{_, C, Bytes} = lists:foldl(fun({_Off,0,_CSum}, {_K, _C, _Bs}=Acc) ->
Acc;
({0,_Sz,_CSum}, {_K, _C, _Bs}=Acc) ->
Acc;
({Off,Sz,_CSum}, {K, C, Bs}) ->
V = {File, Off, Sz},
ets:insert(ETS, {K, V}),
{K + 1, C + 1, Bs + Sz}
end, {StartKey, 0, 0}, PosList),
_ = ets:update_counter(ETS, max_key, C),
_ = ets:update_counter(ETS, total_bytes, Bytes),
ok
end || {_Size, File} <- Fs],
ets:update_counter(?ETS_TAB, max_key, 0).

File diff suppressed because it is too large Load diff

View file

@ -96,11 +96,6 @@
-export([repair/7]).
repair_cp(_Src, _Dst, _MembersDict, _Opts) ->
%% TODO: add missing function: wipe away any trace of chunks
%% are present on Dst but missing on Src.
exit(todo_cp_mode).
repair(ap_mode=ConsistencyMode, Src, Repairing, UPI, MembersDict, ETS, Opts) ->
%% Use process dict so that 'after' clause can always quit all
%% proxy pids.
@ -108,12 +103,13 @@ repair(ap_mode=ConsistencyMode, Src, Repairing, UPI, MembersDict, ETS, Opts) ->
Add = fun(Name, Pid) -> put(proxies_dict, orddict:store(Name, Pid, get(proxies_dict))) end,
OurFLUs = lists:usort([Src] ++ Repairing ++ UPI), % AP assumption!
RepairMode = proplists:get_value(repair_mode, Opts, repair),
Verb = proplists:get_value(verbose, Opts, true),
Verb = proplists:get_value(verbose, Opts, false),
RepairId = proplists:get_value(repair_id, Opts, id1),
Res = try
[begin
{ok, Proxy} = machi_proxy_flu1_client:start_link(P),
Add(FLU, Proxy)
end || {FLU,P} <- MembersDict, lists:member(FLU, OurFLUs)],
_ = [begin
{ok, Proxy} = machi_proxy_flu1_client:start_link(P),
Add(FLU, Proxy)
end || {FLU,P} <- MembersDict, lists:member(FLU, OurFLUs)],
ProxiesDict = get(proxies_dict),
D = dict:new(),
@ -121,31 +117,39 @@ repair(ap_mode=ConsistencyMode, Src, Repairing, UPI, MembersDict, ETS, Opts) ->
get_file_lists(Proxy, FLU, Dict)
end, D, ProxiesDict),
MissingFileSummary = make_missing_file_summary(D2, OurFLUs),
?VERB("MissingFileSummary ~p\n", [MissingFileSummary]),
%% ?VERB("~w MissingFileSummary ~p\n",[RepairId,MissingFileSummary]),
lager:info("Repair ~w MissingFileSummary ~p\n",
[RepairId, MissingFileSummary]),
[ets:insert(ETS, {{directive_bytes, FLU}, 0}) || FLU <- OurFLUs],
%% Repair files from perspective of Src, i.e. tail(UPI).
SrcProxy = orddict:fetch(Src, ProxiesDict),
{ok, EpochID} = machi_proxy_flu1_client:get_epoch_id(
SrcProxy, ?SHORT_TIMEOUT),
?VERB("Make repair directives: "),
%% ?VERB("Make repair directives: "),
Ds =
[{File, make_repair_directives(
ConsistencyMode, RepairMode, File, Size, EpochID,
Verb,
Src, OurFLUs, ProxiesDict, ETS)} ||
{File, {Size, _MissingList}} <- MissingFileSummary],
?VERB(" done\n"),
%% ?VERB(" done\n"),
lager:info("Repair ~w repair directives finished\n", [RepairId]),
[begin
[{_, Bytes}] = ets:lookup(ETS, {directive_bytes, FLU}),
?VERB("Out-of-sync data for FLU ~p: ~s MBytes\n",
[FLU, mbytes(Bytes)])
%% ?VERB("Out-of-sync data for FLU ~p: ~s MBytes\n",
%% [FLU, mbytes(Bytes)]),
lager:info("Repair ~w "
"Out-of-sync data for FLU ~p: ~s MBytes\n",
[RepairId, FLU, mbytes(Bytes)]),
ok
end || FLU <- OurFLUs],
?VERB("Execute repair directives: "),
%% ?VERB("Execute repair directives: "),
ok = execute_repair_directives(ConsistencyMode, Ds, Src, EpochID,
Verb, OurFLUs, ProxiesDict, ETS),
?VERB(" done\n"),
%% ?VERB(" done\n"),
lager:info("Repair ~w repair directives finished\n", [RepairId]),
ok
catch
What:Why ->
@ -155,7 +159,10 @@ repair(ap_mode=ConsistencyMode, Src, Repairing, UPI, MembersDict, ETS, Opts) ->
[(catch machi_proxy_flu1_client:quit(Pid)) ||
Pid <- orddict:to_list(get(proxies_dict))]
end,
Res.
Res;
repair(cp_mode=_ConsistencyMode, Src, Repairing, UPI, MembersDict, ETS, Opts) ->
io:format(user, "\n\nTODO! cp_mode repair is not fully implemented!\n\n", []),
repair(ap_mode, Src, Repairing, UPI, MembersDict, ETS, Opts).
%% Create a list of servers where the file is completely missing.
%% In the "demo day" implementation and in an early integration WIP,
@ -200,7 +207,7 @@ make_repair_compare_fun(SrcFLU) ->
T_a =< T_b
end.
make_repair_directives(ConsistencyMode, RepairMode, File, Size, EpochID,
make_repair_directives(ConsistencyMode, RepairMode, File, Size, _EpochID,
Verb, Src, FLUs0, ProxiesDict, ETS) ->
true = (Size < ?MAX_OFFSET),
FLUs = lists:usort(FLUs0),
@ -209,11 +216,9 @@ make_repair_directives(ConsistencyMode, RepairMode, File, Size, EpochID,
Proxy = orddict:fetch(FLU, ProxiesDict),
OffSzCs =
case machi_proxy_flu1_client:checksum_list(
Proxy, EpochID, File, ?LONG_TIMEOUT) of
Proxy, File, ?LONG_TIMEOUT) of
{ok, InfoBin} ->
{Info, _} =
machi_flu1:split_checksum_list_blob_decode(InfoBin),
Info;
machi_csum_table:split_checksum_list_blob_decode(InfoBin);
{error, no_such_file} ->
[]
end,
@ -231,7 +236,6 @@ make_repair_directives(ConsistencyMode, RepairMode, File, Size, EpochID,
make_repair_directives2(C2, ConsistencyMode, RepairMode,
File, Verb, Src, FLUs, ProxiesDict, ETS) ->
?VERB("."),
make_repair_directives3(C2, ConsistencyMode, RepairMode,
File, Verb, Src, FLUs, ProxiesDict, ETS, []).
@ -261,7 +265,18 @@ make_repair_directives3([{Offset, Size, CSum, _FLU}=A|Rest0],
%% byte range from all FLUs
%% 3b. Log big warning about data loss.
%% 4. Log any other checksum discrepencies as they are found.
exit({todo_repair_sanity_check, ?LINE, File, Offset, As})
QQ = [begin
Pxy = orddict:fetch(FLU, ProxiesDict),
{ok, EpochID} = machi_proxy_flu1_client:get_epoch_id(
Pxy, ?SHORT_TIMEOUT),
NSInfo = undefined,
XX = machi_proxy_flu1_client:read_chunk(
Pxy, NSInfo, EpochID, File, Offset, Size, undefined,
?SHORT_TIMEOUT),
{FLU, XX}
end || {__Offset, __Size, __CSum, FLU} <- As],
exit({todo_repair_sanity_check, ?LINE, File, Offset, {as,As}, {qq,QQ}})
end,
%% List construction guarantees us that there's at least one ?MAX_OFFSET
%% item remains. Sort order + our "taking" of all exact Offset+Size
@ -282,15 +297,16 @@ make_repair_directives3([{Offset, Size, CSum, _FLU}=A|Rest0],
true -> Src;
false -> hd(Gots)
end,
[ets:update_counter(ETS, {directive_bytes, FLU_m}, Size) ||
FLU_m <- Missing],
_ = [ets:update_counter(ETS, {directive_bytes, FLU_m}, Size) ||
FLU_m <- Missing],
if Missing == [] ->
noop;
true ->
{copy, A, Missing}
end;
ConsistencyMode == cp_mode ->
exit({todo_cp_mode, ?MODULE, ?LINE})
end
%% end;
%% ConsistencyMode == cp_mode ->
%% exit({todo_cp_mode, ?MODULE, ?LINE})
end,
Acc2 = if Do == noop -> Acc;
true -> [Do|Acc]
@ -313,38 +329,42 @@ execute_repair_directives(ap_mode=_ConsistencyMode, Ds, _Src, EpochID, Verb,
{ProxiesDict, EpochID, Verb, ETS}, Ds),
ok.
execute_repair_directive({File, Cmds}, {ProxiesDict, EpochID, Verb, ETS}=Acc) ->
execute_repair_directive({File, Cmds}, {ProxiesDict, EpochID, _Verb, ETS}=Acc) ->
EtsKeys = [{in_files, t_in_files}, {in_chunks, t_in_chunks},
{in_bytes, t_in_bytes}, {out_files, t_out_files},
{out_chunks, t_out_chunks}, {out_bytes, t_out_bytes}],
[ets:insert(ETS, {L_K, 0}) || {L_K, _T_K} <- EtsKeys],
F = fun({copy, {Offset, Size, TaggedCSum, MySrc}, MyDsts}, Acc2) ->
SrcP = orddict:fetch(MySrc, ProxiesDict),
case ets:lookup_element(ETS, in_chunks, 2) rem 100 of
0 -> ?VERB(".", []);
_ -> ok
end,
%% case ets:lookup_element(ETS, in_chunks, 2) rem 100 of
%% 0 -> ?VERB(".2", []);
%% _ -> ok
%% end,
_T1 = os:timestamp(),
{ok, Chunk} = machi_proxy_flu1_client:read_chunk(
SrcP, EpochID, File, Offset, Size,
?SHORT_TIMEOUT),
%% TODO: support case multiple written or trimmed chunks returned
NSInfo = undefined,
{ok, {[{_, Offset, Chunk, _ReadCSum}|OtherChunks], []=_TrimmedList}} =
machi_proxy_flu1_client:read_chunk(
SrcP, NSInfo, EpochID, File, Offset, Size, undefined,
?SHORT_TIMEOUT),
[] = OtherChunks,
_T2 = os:timestamp(),
<<_Tag:1/binary, CSum/binary>> = TaggedCSum,
case machi_util:checksum_chunk(Chunk) of
CSum_now when CSum_now == CSum ->
[begin
DstP = orddict:fetch(DstFLU, ProxiesDict),
_T3 = os:timestamp(),
ok = machi_proxy_flu1_client:write_chunk(
DstP, EpochID, File, Offset, Chunk,
?SHORT_TIMEOUT),
_T4 = os:timestamp()
end || DstFLU <- MyDsts],
ets:update_counter(ETS, in_chunks, 1),
ets:update_counter(ETS, in_bytes, Size),
_ = [begin
DstP = orddict:fetch(DstFLU, ProxiesDict),
_T3 = os:timestamp(),
ok = machi_proxy_flu1_client:write_chunk(
DstP, NSInfo, EpochID, File, Offset, Chunk, TaggedCSum,
?SHORT_TIMEOUT),
_T4 = os:timestamp()
end || DstFLU <- MyDsts],
_ = ets:update_counter(ETS, in_chunks, 1),
_ = ets:update_counter(ETS, in_bytes, Size),
N = length(MyDsts),
ets:update_counter(ETS, out_chunks, N),
ets:update_counter(ETS, out_bytes, N*Size),
_ = ets:update_counter(ETS, out_chunks, N),
_ = ets:update_counter(ETS, out_bytes, N*Size),
Acc2;
CSum_now ->
error_logger:error_msg(
@ -363,8 +383,8 @@ execute_repair_directive({File, Cmds}, {ProxiesDict, EpochID, Verb, ETS}=Acc) ->
end,
ok = lists:foldl(F, ok, Cmds),
%% Copy this file's stats to the total counts.
[ets:update_counter(ETS, T_K, ets:lookup_element(ETS, L_K, 2)) ||
{L_K, T_K} <- EtsKeys],
_ = [ets:update_counter(ETS, T_K, ets:lookup_element(ETS, L_K, 2)) ||
{L_K, T_K} <- EtsKeys],
Acc.
mbytes(N) ->

104
src/machi_cinfo.erl Normal file
View file

@ -0,0 +1,104 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc cluster_info callback module for machi specific information
%% gathering.
-module(machi_cinfo).
%% cluster_info callbacks
-export([register/0, cluster_info_init/0, cluster_info_generator_funs/0]).
%% for debug in interactive shell
-export([dump/0,
public_projection/1, private_projection/1,
chain_manager/1, fitness/1, flu1/1]).
-include("machi_projection.hrl").
-spec register() -> ok.
register() ->
ok = cluster_info:register_app(?MODULE).
-spec cluster_info_init() -> ok.
cluster_info_init() ->
ok.
-spec cluster_info_generator_funs() -> [{string(), fun((pid()) -> ok)}].
cluster_info_generator_funs() ->
FluNames = [Name || {Name, _, _, _} <- supervisor:which_children(machi_flu_sup)],
lists:flatten([generator_funs_package(Name) || Name <- FluNames]).
generator_funs_package(FluName) ->
[{"Public projection of FLU " ++ atom_to_list(FluName),
cinfo_wrapper(fun public_projection/1, FluName)},
{"Private projection of FLU " ++ atom_to_list(FluName),
cinfo_wrapper(fun private_projection/1, FluName)},
{"Chain manager status of FLU " ++ atom_to_list(FluName),
cinfo_wrapper(fun chain_manager/1, FluName)},
{"Fitness server status of FLU " ++ atom_to_list(FluName),
cinfo_wrapper(fun fitness/1, FluName)},
{"FLU1 status of FLU " ++ atom_to_list(FluName),
cinfo_wrapper(fun flu1/1, FluName)}].
dump() ->
{{Y,M,D},{HH,MM,SS}} = calendar:local_time(),
Filename = lists:flatten(io_lib:format(
"machi-ci-~4..0B~2..0B~2..0B-~2..0B~2..0B~2..0B.html",
[Y,M,D,HH,MM,SS])),
cluster_info:dump_local_node(Filename).
-spec public_projection(atom()) -> [{atom(), term()}].
public_projection(FluName) ->
projection(FluName, public).
-spec private_projection(atom()) -> [{atom(), term()}].
private_projection(FluName) ->
projection(FluName, private).
-spec chain_manager(atom()) -> term().
chain_manager(FluName) ->
Mgr = machi_flu_psup:make_mgr_supname(FluName),
sys:get_status(Mgr).
-spec fitness(atom()) -> term().
fitness(FluName) ->
Fitness = machi_flu_psup:make_fitness_regname(FluName),
sys:get_status(Fitness).
-spec flu1(atom()) -> [{atom(), term()}].
flu1(FluName) ->
State = machi_flu1_append_server:current_state(FluName),
machi_flu1_append_server:format_state(State).
%% Internal functions
projection(FluName, Kind) ->
ProjStore = machi_flu1:make_projection_server_regname(FluName),
{ok, Projection} = machi_projection_store:read_latest_projection(
whereis(ProjStore), Kind),
Fields = record_info(fields, projection_v1),
[_Name | Values] = tuple_to_list(Projection),
lists:zip(Fields, Values).
cinfo_wrapper(Fun, FluName) ->
fun(C) ->
cluster_info:format(C, "~p", [Fun(FluName)])
end.

43
src/machi_config.erl Normal file
View file

@ -0,0 +1,43 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc Configuration consulting utilities. Some conventions:
%% - The function name should match with exact configuration
%% name in `app.config' or `advanced.config' of `machi' section.
%% - The default value of that configuration is expected to be in
%% cuttlefish schema file. Otherwise some macro in headers may
%% be chosen.
%% - Documentation of the configuration is supposed to be written
%% in cuttlefish schema file, rather than @doc section of the function.
%% - spec of the function should be written.
%% - Returning `undefined' is strongly discouraged. Return some default
%% value instead.
%% - `application:get_env/3' is recommended. See `max_file_size/0' for
%% example.
-module(machi_config).
-include("machi.hrl").
-export([max_file_size/0]).
-spec max_file_size() -> pos_integer().
max_file_size() ->
application:get_env(machi, max_file_size, ?DEFAULT_MAX_FILE_SIZE).

File diff suppressed because it is too large Load diff

329
src/machi_csum_table.erl Normal file
View file

@ -0,0 +1,329 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2016 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(machi_csum_table).
-export([open/2,
find/3,
write/6, write/4, trim/5,
find_leftneighbor/2, find_rightneighbor/2,
all_trimmed/3, any_trimmed/3,
all_trimmed/2,
calc_unwritten_bytes/1,
split_checksum_list_blob_decode/1,
all/1,
close/1, delete/1,
foldl_chunks/3]).
-include("machi.hrl").
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-endif.
-record(machi_csum_table,
{file :: string(),
table :: eleveldb:db_ref()}).
-type table() :: #machi_csum_table{}.
-type byte_sequence() :: { Offset :: non_neg_integer(),
Size :: pos_integer()|infinity }.
-type chunk() :: {Offset :: machi_dt:file_offset(),
Size :: machi_dt:chunk_size(),
machi_dt:chunk_csum() | trimmed | none}.
-export_type([table/0]).
-spec open(string(), proplists:proplist()) ->
{ok, table()} | {error, file:posix()}.
open(CSumFilename, _Opts) ->
LevelDBOptions = [{create_if_missing, true},
%% Keep this table small so as not to interfere
%% operating system's file cache, which is for
%% Machi's main read efficiency
{total_leveldb_mem_percent, 10}],
{ok, T} = eleveldb:open(CSumFilename, LevelDBOptions),
%% Dummy entry for reserved headers
ok = eleveldb:put(T,
sext:encode({0, ?MINIMUM_OFFSET}),
sext:encode(?CSUM_TAG_NONE_ATOM),
[{sync, true}]),
C0 = #machi_csum_table{
file=CSumFilename,
table=T},
{ok, C0}.
-spec split_checksum_list_blob_decode(binary())-> [chunk()].
split_checksum_list_blob_decode(Bin) ->
erlang:binary_to_term(Bin).
-define(has_overlap(LeftOffset, LeftSize, RightOffset, RightSize),
((LeftOffset - (RightOffset+RightSize)) * (LeftOffset+LeftSize - RightOffset) < 0)).
-spec find(table(), machi_dt:file_offset(), machi_dt:chunk_size())
-> [chunk()].
find(#machi_csum_table{table=T}, Offset, Size) ->
{ok, I} = eleveldb:iterator(T, [], keys_only),
EndKey = sext:encode({Offset+Size, 0}),
StartKey = sext:encode({Offset, Size}),
{ok, FirstKey} = case eleveldb:iterator_move(I, StartKey) of
{error, invalid_iterator} ->
try
%% Assume that the invalid_iterator is because
%% we tried to move to the end via StartKey.
%% Instead, move there directly.
{ok, _} = eleveldb:iterator_move(I, last),
{ok, _} = eleveldb:iterator_move(I, prev)
catch
_:_ ->
{ok, _} = eleveldb:iterator_move(I, first)
end;
{ok, _} = R0 ->
case eleveldb:iterator_move(I, prev) of
{error, invalid_iterator} ->
R0;
{ok, _} = R1 ->
R1
end
end,
_ = eleveldb:iterator_close(I),
FoldFun = fun({K, V}, Acc) ->
{TargetOffset, TargetSize} = sext:decode(K),
case ?has_overlap(TargetOffset, TargetSize, Offset, Size) of
true ->
[{TargetOffset, TargetSize, sext:decode(V)}|Acc];
false ->
Acc
end;
(_K, Acc) ->
lager:error("~p wrong option", [_K]),
Acc
end,
lists:reverse(eleveldb_fold(T, FirstKey, EndKey, FoldFun, [])).
%% @doc Updates all chunk info, by deleting existing entries if exists
%% and putting new chunk info
-spec write(table(),
machi_dt:file_offset(), machi_dt:chunk_size(),
machi_dt:chunk_csum()|'none'|'trimmed',
undefined|chunk(), undefined|chunk()) ->
ok | {error, term()}.
write(#machi_csum_table{table=T} = CsumT, Offset, Size, CSum,
LeftUpdate, RightUpdate) ->
PutOps =
[{put,
sext:encode({Offset, Size}),
sext:encode(CSum)}]
++ case LeftUpdate of
{LO, LS, LCsum} when LO + LS =:= Offset ->
[{put,
sext:encode({LO, LS}),
sext:encode(LCsum)}];
undefined ->
[]
end
++ case RightUpdate of
{RO, RS, RCsum} when RO =:= Offset + Size ->
[{put,
sext:encode({RO, RS}),
sext:encode(RCsum)}];
undefined ->
[]
end,
Chunks = find(CsumT, Offset, Size),
DeleteOps = lists:map(fun({O, L, _}) ->
{delete, sext:encode({O, L})}
end, Chunks),
%% io:format(user, "PutOps: ~P\n", [PutOps, 20]),
%% io:format(user, "DelOps: ~P\n", [DeleteOps, 20]),
eleveldb:write(T, DeleteOps ++ PutOps, [{sync, true}]).
-spec find_leftneighbor(table(), non_neg_integer()) ->
undefined | chunk().
find_leftneighbor(CsumT, Offset) ->
case find(CsumT, Offset, 1) of
[] -> undefined;
[{Offset, _, _}] -> undefined;
[{LOffset, _, CsumOrTrimmed}] -> {LOffset, Offset - LOffset, CsumOrTrimmed}
end.
-spec find_rightneighbor(table(), non_neg_integer()) ->
undefined | chunk().
find_rightneighbor(CsumT, Offset) ->
case find(CsumT, Offset, 1) of
[] -> undefined;
[{Offset, _, _}] -> undefined;
[{ROffset, RSize, CsumOrTrimmed}] ->
{Offset, ROffset + RSize - Offset, CsumOrTrimmed}
end.
-spec write(table(), machi_dt:file_offset(), machi_dt:file_size(),
machi_dt:chunk_csum()|none|trimmed) ->
ok | {error, trimmed|file:posix()}.
write(CsumT, Offset, Size, CSum) ->
write(CsumT, Offset, Size, CSum, undefined, undefined).
trim(CsumT, Offset, Size, LeftUpdate, RightUpdate) ->
write(CsumT, Offset, Size,
trimmed, %% Should this be much smaller like $t or just 't'
LeftUpdate, RightUpdate).
%% @doc returns whether all bytes in a specific window is continously
%% trimmed or not
-spec all_trimmed(table(), non_neg_integer(), non_neg_integer()) -> boolean().
all_trimmed(#machi_csum_table{table=T}, Left, Right) ->
FoldFun = fun({_, _}, false) ->
false;
({K, V}, Pos) when is_integer(Pos) andalso Pos =< Right ->
case {sext:decode(K), sext:decode(V)} of
{{Pos, Size}, trimmed} ->
Pos + Size;
{{Offset, Size}, _}
when Offset + Size =< Left ->
Left;
_Eh ->
false
end
end,
case eleveldb:fold(T, FoldFun, Left, [{verify_checksums, true}]) of
false -> false;
Right -> true;
LastTrimmed when LastTrimmed < Right -> false;
_ -> %% LastTrimmed > Pos0, which is a irregular case but ok
true
end.
%% @doc returns whether all bytes 0-Pos0 is continously trimmed or
%% not, including header.
-spec all_trimmed(table(), non_neg_integer()) -> boolean().
all_trimmed(CsumT, Pos0) ->
all_trimmed(CsumT, 0, Pos0).
-spec any_trimmed(table(),
pos_integer(),
machi_dt:chunk_size()) -> boolean().
any_trimmed(CsumT, Offset, Size) ->
Chunks = find(CsumT, Offset, Size),
lists:any(fun({_, _, State}) -> State =:= trimmed end, Chunks).
-spec calc_unwritten_bytes(table()) -> [byte_sequence()].
calc_unwritten_bytes(#machi_csum_table{table=_} = CsumT) ->
case lists:sort(all(CsumT)) of
[] ->
[{?MINIMUM_OFFSET, infinity}];
Sorted ->
{LastOffset, _, _} = hd(Sorted),
build_unwritten_bytes_list(Sorted, LastOffset, [])
end.
all(CsumT) ->
FoldFun = fun(E, Acc) -> [E|Acc] end,
lists:reverse(foldl_chunks(FoldFun, [], CsumT)).
-spec close(table()) -> ok.
close(#machi_csum_table{table=T}) ->
ok = eleveldb:close(T).
-spec delete(table()) -> ok.
delete(#machi_csum_table{table=T, file=F}) ->
catch eleveldb:close(T),
%% TODO change this to directory walk
case os:cmd("rm -rf " ++ F) of
"" -> ok;
E -> E
end.
-spec foldl_chunks(fun((chunk(), Acc0 :: term()) -> Acc :: term()),
Acc0 :: term(), table()) -> Acc :: term().
foldl_chunks(Fun, Acc0, #machi_csum_table{table=T}) ->
FoldFun = fun({K, V}, Acc) ->
{Offset, Len} = sext:decode(K),
Fun({Offset, Len, sext:decode(V)}, Acc);
(_K, Acc) ->
_ = lager:error("~p: wrong option?", [_K]),
Acc
end,
eleveldb:fold(T, FoldFun, Acc0, [{verify_checksums, true}]).
-spec build_unwritten_bytes_list( CsumData :: [{ Offset :: non_neg_integer(),
Size :: pos_integer(),
Checksum :: binary() }],
LastOffset :: non_neg_integer(),
Acc :: list() ) -> [byte_sequence()].
% @private Given a <b>sorted</b> list of checksum data tuples, return a sorted
% list of unwritten byte ranges. The output list <b>always</b> has at least one
% entry: the last tuple in the list is guaranteed to be the current end of
% bytes written to a particular file with the special space moniker
% `infinity'.
build_unwritten_bytes_list([], Last, Acc) ->
NewAcc = [ {Last, infinity} | Acc ],
lists:reverse(NewAcc);
build_unwritten_bytes_list([{CurrentOffset, CurrentSize, _Csum}|Rest], LastOffset, Acc) when
CurrentOffset /= LastOffset ->
Hole = CurrentOffset - LastOffset,
build_unwritten_bytes_list(Rest, (CurrentOffset+CurrentSize), [{LastOffset, Hole}|Acc]);
build_unwritten_bytes_list([{CO, CS, _Ck}|Rest], _LastOffset, Acc) ->
build_unwritten_bytes_list(Rest, CO + CS, Acc).
%% @doc If you want to find an overlap among two areas [x, y] and [a,
%% b] where x &lt; y and a &lt; b; if (a-y)*(b-x) &lt; 0 then there's a
%% overlap, else, > 0 then there're no overlap. border condition = 0
%% is not overlap in this offset-size case.
%% inclusion_match_spec(Offset, Size) ->
%% {'>', 0,
%% {'*',
%% {'-', Offset + Size, '$1'},
%% {'-', Offset, {'+', '$1', '$2'}}}}.
-spec eleveldb_fold(eleveldb:db_ref(), binary(), binary(),
fun(({binary(), binary()}, AccType::term()) -> AccType::term()),
AccType0::term()) ->
AccType::term().
eleveldb_fold(Ref, Start, End, FoldFun, InitAcc) ->
{ok, Iterator} = eleveldb:iterator(Ref, []),
try
eleveldb_do_fold(eleveldb:iterator_move(Iterator, Start),
Iterator, End, FoldFun, InitAcc)
catch throw:IteratorClosed ->
{error, IteratorClosed}
after
eleveldb:iterator_close(Iterator)
end.
-spec eleveldb_do_fold({ok, binary(), binary()}|{error, iterator_closed|invalid_iterator}|{ok,binary()},
eleveldb:itr_ref(), binary(),
fun(({binary(), binary()}, AccType::term()) -> AccType::term()),
AccType::term()) ->
AccType::term().
eleveldb_do_fold({ok, Key, Value}, _, End, FoldFun, Acc)
when End < Key ->
FoldFun({Key, Value}, Acc);
eleveldb_do_fold({ok, Key, Value}, Iterator, End, FoldFun, Acc) ->
eleveldb_do_fold(eleveldb:iterator_move(Iterator, next),
Iterator, End, FoldFun,
FoldFun({Key, Value}, Acc));
eleveldb_do_fold({error, iterator_closed}, _, _, _, Acc) ->
%% It's really an error which is not expected
throw({iterator_closed, Acc});
eleveldb_do_fold({error, invalid_iterator}, _, _, _, Acc) ->
%% Probably reached to end
Acc.

View file

@ -20,15 +20,24 @@
-module(machi_dt).
-include("machi.hrl").
-include("machi_projection.hrl").
-type chunk() :: chunk_bin() | {chunk_csum(), chunk_bin()}.
-type chunk_bin() :: binary() | iolist(). % client can use either
-type chunk_csum() :: binary(). % 1 byte tag, N-1 bytes checksum
-type chunk_summary() :: {file_offset(), chunk_size(), binary()}.
-type chunk_s() :: binary(). % server always uses binary()
-type append_opts() :: #append_opts{}.
-type chunk() :: chunk_bin() | iolist(). % client can choose either rep.
-type chunk_bin() :: binary(). % server returns binary() only.
-type chunk_csum() :: <<>> | chunk_csum_bin() | {csum_tag(), binary()}.
-type chunk_csum_bin() :: binary(). % 1 byte tag, N-1 bytes checksum
-type chunk_cstrm() :: 'trimmed' | chunk_csum().
-type chunk_summary() :: {file_offset(), chunk_size(), chunk_bin(), chunk_cstrm()}.
-type chunk_pos() :: {file_offset(), chunk_size(), file_name_s()}.
-type chunk_size() :: non_neg_integer().
%% Tags that stand for how that checksum was generated. See
%% machi_util:make_tagged_csum/{1,2} for further documentation and
%% implementation.
-type csum_tag() :: none | client_sha | server_sha | server_regen_sha.
-type error_general() :: 'bad_arg' | 'wedged' | 'bad_checksum'.
-type epoch_csum() :: binary().
-type epoch_num() :: -1 | non_neg_integer().
@ -41,17 +50,26 @@
-type file_prefix() :: binary() | list().
-type inet_host() :: inet:ip_address() | inet:hostname().
-type inet_port() :: inet:port_number().
-type locator() :: number().
-type namespace() :: binary().
-type namespace_version() :: non_neg_integer().
-type ns_info() :: #ns_info{}.
-type projection() :: #projection_v1{}.
-type projection_type() :: 'public' | 'private'.
-type read_opts() :: #read_opts{}.
-type read_opts_x() :: 'undefined' | 'noopt' | 'none' | #read_opts{}.
-export_type([
append_opts/0,
chunk/0,
chunk_bin/0,
chunk_csum/0,
chunk_csum_bin/0,
chunk_cstrm/0,
chunk_summary/0,
chunk_s/0,
chunk_pos/0,
chunk_size/0,
csum_tag/0,
error_general/0,
epoch_csum/0,
epoch_num/0,
@ -64,7 +82,13 @@
file_prefix/0,
inet_host/0,
inet_port/0,
locator/0,
namespace/0,
namespace_version/0,
ns_info/0,
projection/0,
projection_type/0
projection_type/0,
read_opts/0,
read_opts_x/0
]).

908
src/machi_file_proxy.erl Normal file
View file

@ -0,0 +1,908 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc This is a proxy process which mediates access to Machi FLU
%% controlled files. In particular, it manages the "write-once register"
%% conceit at the heart of Machi's design.
%%
%% Read, write and append requests for a single file will be managed
%% through this proxy. Clients can also request syncs for specific
%% types of filehandles.
%%
%% As operations are requested, the proxy keeps track of how many
%% operations it has performed (and how many errors were generated.)
%% After a sufficient number of inactivity, the server terminates
%% itself.
%%
%% TODO:
%% 1. Some way to transition the proxy into a wedged state that
%% doesn't rely on message delivery.
%%
%% 2. Check max file size on appends. Writes we take on faith we can
%% and should handle.
%%
%% 3. Async checksum reads on startup.
-module(machi_file_proxy).
-behaviour(gen_server).
-include("machi.hrl").
%% public API
-export([
start_link/3,
stop/1,
sync/1,
sync/2,
read/3,
read/4,
write/3,
write/4,
trim/4,
append/2,
append/4,
checksum_list/1
]).
%% gen_server callbacks
-export([
init/1,
handle_call/3,
handle_cast/2,
handle_info/2,
terminate/2,
code_change/3
]).
-define(TICK, 5*1000).
-define(TICK_THRESHOLD, 5). %% After this + 1 more quiescent ticks, shutdown
-define(TIMEOUT, 10*1000).
-define(TOO_MANY_ERRORS_RATIO, 50).
-type op_stats() :: { Total :: non_neg_integer(),
Errors :: non_neg_integer() }.
-record(state, {
fluname :: atom(),
data_dir :: string() | undefined,
filename :: string() | undefined,
data_path :: string() | undefined,
wedged = false :: boolean(),
csum_file :: string()|undefined,
csum_path :: string()|undefined,
data_filehandle :: file:io_device(),
csum_table :: machi_csum_table:table(),
eof_position = 0 :: non_neg_integer(),
max_file_size = ?DEFAULT_MAX_FILE_SIZE :: pos_integer(),
rollover = false :: boolean(),
tref :: reference(), %% timer ref
ticks = 0 :: non_neg_integer(), %% ticks elapsed with no new operations
ops = 0 :: non_neg_integer(), %% sum of all ops
reads = {0, 0} :: op_stats(),
writes = {0, 0} :: op_stats(),
appends = {0, 0} :: op_stats(),
trims = {0, 0} :: op_stats()
}).
%% Public API
% @doc Start a new instance of the file proxy service. Takes the filename
% and data directory as arguments. This function is typically called by the
% `machi_file_proxy_sup:start_proxy/2' function.
-spec start_link(FluName :: atom(), Filename :: string(), DataDir :: string()) -> any().
start_link(FluName, Filename, DataDir) ->
gen_server:start_link(?MODULE, {FluName, Filename, DataDir}, []).
% @doc Request to stop an instance of the file proxy service.
-spec stop(Pid :: pid()) -> ok.
stop(Pid) when is_pid(Pid) ->
gen_server:call(Pid, {stop}, ?TIMEOUT).
% @doc Force a sync of all filehandles
-spec sync(Pid :: pid()) -> ok|{error, term()}.
sync(Pid) when is_pid(Pid) ->
sync(Pid, all);
sync(_Pid) ->
lager:warning("Bad pid to sync"),
{error, bad_arg}.
% @doc Force a sync of a specific filehandle type. Valid types are `all', `csum' and `data'.
-spec sync(Pid :: pid(), Type :: all|data|csum) -> ok|{error, term()}.
sync(Pid, Type) when is_pid(Pid) andalso
( Type =:= all orelse Type =:= csum orelse Type =:= data ) ->
gen_server:call(Pid, {sync, Type}, ?TIMEOUT);
sync(_Pid, Type) ->
lager:warning("Bad arg to sync: Type ~p", [Type]),
{error, bad_arg}.
% @doc Read file at offset for length. This returns a sequence of all
% written and trimmed (optional) bytes that overlaps with requested
% offset and length. Borders are not aligned.
-spec read(Pid :: pid(),
Offset :: non_neg_integer(),
Length :: non_neg_integer()) ->
{ok, [{Filename::string(), Offset :: non_neg_integer(),
Data :: binary(), Checksum :: binary()}]} |
{error, Reason :: term()}.
read(Pid, Offset, Length) ->
read(Pid, Offset, Length, #read_opts{}).
-spec read(Pid :: pid(),
Offset :: non_neg_integer(),
Length :: non_neg_integer(),
machi_dt:read_opts_x()) ->
{ok, [{Filename::string(), Offset :: non_neg_integer(),
Data :: binary(), Checksum :: binary()}]} |
{error, Reason :: term()}.
read(Pid, Offset, Length, #read_opts{}=Opts)
when is_pid(Pid) andalso is_integer(Offset) andalso Offset >= 0
andalso is_integer(Length) andalso Length > 0 ->
gen_server:call(Pid, {read, Offset, Length, Opts}, ?TIMEOUT);
read(_Pid, Offset, Length, Opts) ->
lager:warning("Bad args to read: Offset ~p, Length ~p, Options ~p", [Offset, Length, Opts]),
{error, bad_arg}.
% @doc Write data at offset
-spec write(Pid :: pid(), Offset :: non_neg_integer(), Data :: binary()) -> ok|{error, term()}.
write(Pid, Offset, Data) when is_pid(Pid) andalso is_integer(Offset) andalso Offset >= 0
andalso is_binary(Data) ->
write(Pid, Offset, [], Data);
write(_Pid, Offset, _Data) ->
lager:warning("Bad arg to write: Offset ~p", [Offset]),
{error, bad_arg}.
% @doc Write data at offset, including the client metadata. ClientMeta is a proplist
% that expects the following keys and values:
% <ul>
% <li>`client_csum_tag' - the type of checksum from the client as defined in the machi.hrl file</li>
% <li>`client_csum' - the checksum value from the client</li>
% </ul>
-spec write(Pid :: pid(), Offset :: non_neg_integer(), ClientMeta :: proplists:proplist(),
Data :: binary()) -> ok|{error, term()}.
write(Pid, Offset, ClientMeta, Data) when is_pid(Pid) andalso is_integer(Offset) andalso Offset >= 0
andalso is_list(ClientMeta) andalso is_binary(Data) ->
gen_server:call(Pid, {write, Offset, ClientMeta, Data}, ?TIMEOUT);
write(_Pid, Offset, ClientMeta, _Data) ->
lager:warning("Bad arg to write: Offset ~p, ClientMeta: ~p", [Offset, ClientMeta]),
{error, bad_arg}.
trim(Pid, Offset, Size, TriggerGC) when is_pid(Pid),
is_integer(Offset) andalso Offset >= 0,
is_integer(Size) andalso Size > 0,
is_boolean(TriggerGC) ->
gen_server:call(Pid, {trim ,Offset, Size, TriggerGC}, ?TIMEOUT).
% @doc Append data
-spec append(Pid :: pid(), Data :: binary()) -> {ok, File :: string(), Offset :: non_neg_integer()}
|{error, term()}.
append(Pid, Data) when is_pid(Pid) andalso is_binary(Data) ->
append(Pid, [], 0, Data);
append(_Pid, _Data) ->
lager:warning("Bad arguments to append/2"),
{error, bad_arg}.
% @doc Append data to file, supplying client metadata and (if desired) a
% reservation for additional space. ClientMeta is a proplist and expects the
% same keys as write/4.
-spec append(Pid :: pid(), ClientMeta :: proplists:proplist(),
Extra :: non_neg_integer(), Data :: binary()) -> {ok, File :: string(), Offset :: non_neg_integer()}
|{error, term()}.
append(Pid, ClientMeta, Extra, Data) when is_pid(Pid) andalso is_list(ClientMeta)
andalso is_integer(Extra) andalso Extra >= 0
andalso is_binary(Data) ->
gen_server:call(Pid, {append, ClientMeta, Extra, Data}, ?TIMEOUT);
append(_Pid, ClientMeta, Extra, _Data) ->
lager:warning("Bad arg to append: ClientMeta ~p, Extra ~p", [ClientMeta, Extra]),
{error, bad_arg}.
-spec checksum_list(pid()) -> {ok, list()}.
checksum_list(Pid) ->
gen_server:call(Pid, {checksum_list}, ?TIMEOUT).
%% gen_server callbacks
% @private
init({FluName, Filename, DataDir}) ->
CsumFile = machi_util:make_checksum_filename(DataDir, Filename),
{_, DPath} = machi_util:make_data_filename(DataDir, Filename),
ok = filelib:ensure_dir(CsumFile),
ok = filelib:ensure_dir(DPath),
{ok, CsumTable} = machi_csum_table:open(CsumFile, []),
UnwrittenBytes = machi_csum_table:calc_unwritten_bytes(CsumTable),
{Eof, infinity} = lists:last(UnwrittenBytes),
{ok, FHd} = file:open(DPath, [read, write, binary, raw]),
%% Reserve for EC and stuff, to prevent eof when read
ok = file:pwrite(FHd, 0, binary:copy(<<"so what?">>, ?MINIMUM_OFFSET div 8)),
Tref = schedule_tick(),
St = #state{
fluname = FluName,
filename = Filename,
data_dir = DataDir,
data_path = DPath,
csum_file = CsumFile,
data_filehandle = FHd,
csum_table = CsumTable,
tref = Tref,
eof_position = erlang:max(Eof, ?MINIMUM_OFFSET),
max_file_size = machi_config:max_file_size()},
lager:debug("Starting file proxy ~p for filename ~p, state = ~p, Eof = ~p",
[self(), Filename, St, Eof]),
{ok, St}.
% @private
handle_call({stop}, _From, State) ->
lager:debug("Requested to stop."),
{stop, normal, State};
handle_call({sync, data}, _From, State = #state{ data_filehandle = FHd }) ->
R = file:sync(FHd),
{reply, R, State};
handle_call({sync, csum}, _From, State) ->
%% machi_csum_table always writes in {sync, true} option, so here
%% explicit sync isn't actually needed.
{reply, ok, State};
handle_call({sync, all}, _From, State = #state{filename = F,
data_filehandle = FHd,
csum_table = _T
}) ->
Resp = case file:sync(FHd) of
ok ->
ok;
Error ->
lager:error("Got ~p syncing all files for file ~p",
[Error, F]),
Error
end,
{reply, Resp, State};
%%% READS
handle_call({read, _Offset, _Length, _}, _From,
State = #state{wedged = true,
reads = {T, Err}
}) ->
{reply, {error, wedged}, State#state{writes = {T + 1, Err + 1}}};
handle_call({read, Offset, Length, _Opts}, _From,
State = #state{eof_position = Eof,
reads = {T, Err}
}) when Offset > Eof ->
%% make sure [Offset, Offset+Length) has an overlap with file range
lager:error("Read request at offset ~p for ~p bytes is past the last write offset of ~p",
[Offset, Length, Eof]),
{reply, {error, not_written}, State#state{reads = {T + 1, Err + 1}}};
handle_call({read, Offset, Length, Opts}, _From,
State = #state{filename = F,
data_filehandle = FH,
csum_table = CsumTable,
reads = {T, Err}
}) ->
%% TODO: use these options - NoChunk prevents reading from disks
%% NoChecksum doesn't check checksums
#read_opts{no_checksum=NoChecksum, no_chunk=NoChunk,
needs_trimmed=NeedsTrimmed} = Opts,
{Resp, NewErr} =
case do_read(FH, F, CsumTable, Offset, Length, NoChunk, NoChecksum) of
{ok, {[], []}} ->
{{error, not_written}, Err + 1};
{ok, {Chunks0, Trimmed0}} ->
Chunks = slice_both_side(Chunks0, Offset, Offset+Length),
Trimmed = case NeedsTrimmed of
true -> Trimmed0;
false -> []
end,
{{ok, {Chunks, Trimmed}}, Err};
Error ->
lager:error("Can't read ~p, ~p at File ~p", [Offset, Length, F]),
{Error, Err + 1}
end,
{reply, Resp, State#state{reads = {T+1, NewErr}}};
%%% WRITES
handle_call({write, _Offset, _ClientMeta, _Data}, _From,
State = #state{wedged = true,
writes = {T, Err}
}) ->
{reply, {error, wedged}, State#state{writes = {T + 1, Err + 1}}};
handle_call({write, Offset, ClientMeta, Data}, _From,
State = #state{filename = F,
writes = {T, Err},
data_filehandle = FHd,
csum_table = CsumTable}) ->
ClientCsumTag = proplists:get_value(client_csum_tag, ClientMeta, ?CSUM_TAG_NONE),
ClientCsum = proplists:get_value(client_csum, ClientMeta, <<>>),
{Resp, NewErr} =
case check_or_make_tagged_csum(ClientCsumTag, ClientCsum, Data) of
{error, {bad_csum, Bad}} ->
lager:error("Bad checksum on write; client sent ~p, we computed ~p",
[ClientCsum, Bad]),
{{error, bad_checksum}, Err + 1};
TaggedCsum ->
case handle_write(FHd, CsumTable, F, TaggedCsum, Offset, Data) of
ok ->
{ok, Err};
Error ->
{Error, Err + 1}
end
end,
{NewEof, infinity} = lists:last(machi_csum_table:calc_unwritten_bytes(CsumTable)),
lager:debug("Wrote ~p bytes at ~p of file ~p, NewEOF = ~p~n",
[iolist_size(Data), Offset, F, NewEof]),
{reply, Resp, State#state{writes = {T+1, NewErr},
eof_position = NewEof}};
%%% TRIMS
handle_call({trim, _Offset, _ClientMeta, _Data}, _From,
State = #state{wedged = true,
writes = {T, Err}
}) ->
{reply, {error, wedged}, State#state{writes = {T + 1, Err + 1}}};
handle_call({trim, Offset, Size, _TriggerGC}, _From,
State = #state{data_filehandle=FHd,
ops = Ops,
trims = {T, Err},
csum_table = CsumTable}) ->
case machi_csum_table:all_trimmed(CsumTable, Offset, Offset+Size) of
true ->
NewState = State#state{ops=Ops+1, trims={T, Err+1}},
%% All bytes of that range was already trimmed returns ok
%% here, not {error, trimmed}, which means the whole file
%% was trimmed
maybe_gc(ok, NewState);
false ->
LUpdate = maybe_regenerate_checksum(
FHd,
machi_csum_table:find_leftneighbor(CsumTable, Offset)),
RUpdate = maybe_regenerate_checksum(
FHd,
machi_csum_table:find_rightneighbor(CsumTable, Offset+Size)),
case machi_csum_table:trim(CsumTable, Offset, Size, LUpdate, RUpdate) of
ok ->
{NewEof, infinity} = lists:last(machi_csum_table:calc_unwritten_bytes(CsumTable)),
NewState = State#state{ops=Ops+1,
trims={T+1, Err},
eof_position=NewEof},
maybe_gc(ok, NewState);
Error ->
{reply, Error, State#state{ops=Ops+1, trims={T, Err+1}}}
end
end;
%% APPENDS
handle_call({append, _ClientMeta, _Extra, _Data}, _From,
State = #state{wedged = true,
appends = {T, Err}
}) ->
{reply, {error, wedged}, State#state{appends = {T+1, Err+1}}};
handle_call({append, ClientMeta, Extra, Data}, _From,
State = #state{eof_position = EofP,
filename = F,
appends = {T, Err},
data_filehandle = FHd,
csum_table = CsumTable
}) ->
ClientCsumTag = proplists:get_value(client_csum_tag, ClientMeta, ?CSUM_TAG_NONE),
ClientCsum = proplists:get_value(client_csum, ClientMeta, <<>>),
{Resp, NewErr} =
case check_or_make_tagged_csum(ClientCsumTag, ClientCsum, Data) of
{error, {bad_csum, Bad}} ->
lager:error("Bad checksum; client sent ~p, we computed ~p",
[ClientCsum, Bad]),
{{error, bad_checksum}, Err + 1};
TaggedCsum ->
case handle_write(FHd, CsumTable, F, TaggedCsum, EofP, Data) of
ok ->
{{ok, F, EofP}, Err};
Error ->
{Error, Err + 1}
end
end,
NewEof = EofP + byte_size(Data) + Extra,
lager:debug("appended ~p bytes at ~p file ~p. NewEofP = ~p",
[iolist_size(Data), EofP, F, NewEof]),
{reply, Resp, State#state{appends = {T+1, NewErr},
eof_position = NewEof}};
handle_call({checksum_list}, _FRom, State = #state{csum_table=T}) ->
All = machi_csum_table:all(T),
{reply, {ok, All}, State};
handle_call(Req, _From, State) ->
lager:warning("Unknown call: ~p", [Req]),
{reply, whoaaaaaaaaaaaa, State}.
% @private
handle_cast(Cast, State) ->
lager:warning("Unknown cast: ~p", [Cast]),
{noreply, State}.
% @private
handle_info(tick, State = #state{fluname = FluName,
filename = F,
eof_position = Eof,
max_file_size = MaxFileSize}) when Eof >= MaxFileSize ->
%% Older code halted here with {stop, file_rollover, State}.
%% However, there may be other requests in our mailbox already
%% and/or not yet delivered but in a race with the
%% machi_flu_metadata_mgr. So we close our eleveldb instance (to
%% avoid double-open attempt by a new file proxy proc), tell
%% machi_flu_metadata_mgr that we request a rollover, then stop.
%% terminate() will take care of forwarding messages that are
%% caught in the race.
lager:notice("Eof ~s position ~p >= max file size ~p. Shutting down.",
[F, Eof, MaxFileSize]),
State2 = close_files(State),
machi_flu_metadata_mgr:stop_proxy_pid_rollover(FluName, {file, F}),
{stop, normal, State2#state{rollover = true}};
%% XXX Is this a good idea? Need to think this through a bit.
handle_info(tick, State = #state{wedged = true}) ->
{stop, wedged, State};
%% I dunno. This may not be a good idea, but it seems like if we're throwing lots of
%% errors, we ought to shut down and give up our file descriptors.
handle_info(tick, State = #state{
ops = Ops,
reads = {RT, RE},
writes = {WT, WE},
appends = {AT, AE}
}) when Ops > 100 andalso
trunc(((RE+WE+AE) / (RT+WT+AT)) * 100) > ?TOO_MANY_ERRORS_RATIO ->
Errors = RE + WE + AE,
lager:notice("Got ~p errors. Shutting down.", [Errors]),
{stop, too_many_errors, State};
handle_info(tick, State = #state{
ticks = Ticks,
ops = Ops,
reads = {RT, _RE},
writes = {WT, _WE},
appends = {AT, _AE}}) when Ops == RT + WT + AT, Ticks == ?TICK_THRESHOLD ->
lager:debug("Got 5 ticks with no new activity. Shutting down."),
{stop, normal, State};
handle_info(tick, State = #state{
ticks = Ticks,
ops = Ops,
reads = {RT, _RE},
writes = {WT, _WE},
appends = {AT, _AE}}) when Ops == RT + WT + AT ->
lager:debug("No new activity since last tick. Incrementing tick counter."),
Tref = schedule_tick(),
{noreply, State#state{tref = Tref, ticks = Ticks + 1}};
handle_info(tick, State = #state{
reads = {RT, _RE},
writes = {WT, _WE},
appends = {AT, _AE}
}) ->
Ops = RT + WT + AT,
lager:debug("Setting ops counter to ~p", [Ops]),
Tref = schedule_tick(),
{noreply, State#state{tref = Tref, ops = Ops}};
%handle_info({wedged, EpochId} State = #state{epoch = E}) when E /= EpochId ->
% lager:notice("Wedge epoch ~p but ignoring because our epoch id is ~p", [EpochId, E]),
% {noreply, State};
%handle_info({wedged, EpochId}, State = #state{epoch = E}) when E == EpochId ->
% lager:notice("Wedge epoch ~p same as our epoch id ~p; we are wedged. Bummer.", [EpochId, E]),
% {noreply, State#state{wedged = true}};
% flu1.erl:
% ProxyPid = get_proxy_pid(Filename),
% Are we wedged? if not
% machi_file_proxy:read(Pid, Offset, Length)
% otherwise -> error,wedged
%
% get_proxy_pid(Filename) ->
% Pid = lookup_pid(Filename)
% is_pid_alive(Pid)
% Pid
% if not alive then start one
handle_info(Req, State) ->
lager:warning("Unknown info message: ~p", [Req]),
{noreply, State}.
% @private
terminate(Reason, State = #state{fluname = FluName,
filename = F,
rollover = Rollover_p,
reads = {RT, RE},
writes = {WT, WE},
appends = {AT, AE}
}) ->
lager:info("Shutting down proxy for file ~p because ~p", [F, Reason]),
lager:info(" Op Tot/Error", []),
lager:info(" Reads: ~p/~p", [RT, RE]),
lager:info(" Writes: ~p/~p", [WT, WE]),
lager:info("Appends: ~p/~p", [AT, AE]),
close_files(State),
if Rollover_p ->
forward_late_messages(FluName, F, 500);
true ->
ok
end,
ok.
% @private
code_change(_OldVsn, State, _Extra) ->
{ok, State}.
%% Private functions
-spec schedule_tick() -> reference().
schedule_tick() ->
erlang:send_after(?TICK, self(), tick).
-spec check_or_make_tagged_csum(Type :: non_neg_integer(),
Checksum :: binary(),
Data :: binary() ) -> binary() |
{error, {bad_csum, Bad :: binary()}}.
check_or_make_tagged_csum(?CSUM_TAG_NONE, _Csum, Data) ->
%% We are making a checksum here
Csum = machi_util:checksum_chunk(Data),
machi_util:make_tagged_csum(server_sha, Csum);
check_or_make_tagged_csum(Tag, InCsum, Data) when Tag == ?CSUM_TAG_CLIENT_SHA;
Tag == ?CSUM_TAG_SERVER_SHA ->
Csum = machi_util:checksum_chunk(Data),
case Csum =:= InCsum of
true ->
machi_util:make_tagged_csum(server_sha, Csum);
false ->
{error, {bad_csum, Csum}}
end;
check_or_make_tagged_csum(?CSUM_TAG_SERVER_REGEN_SHA,
InCsum, Data) ->
Csum = machi_util:checksum_chunk(Data),
case Csum =:= InCsum of
true ->
machi_util:make_tagged_csum(server_regen_sha, Csum);
false ->
{error, {bad_csum, Csum}}
end;
check_or_make_tagged_csum(OtherTag, _ClientCsum, _Data) ->
lager:warning("Unknown checksum tag ~p", [OtherTag]),
{error, bad_checksum}.
-spec do_read(FHd :: file:io_device(),
Filename :: string(),
CsumTable :: machi_csum_table:table(),
Offset :: non_neg_integer(),
Size :: non_neg_integer(),
NoChunk :: boolean(),
NoChecksum :: boolean()
) -> {ok, {Chunks :: [{string(), Offset::non_neg_integer(), binary(), Csum :: binary()}],
Trimmed :: [{string(), Offset::non_neg_integer(), Size::non_neg_integer()}]}} |
{error, bad_checksum} |
{error, partial_read} |
{error, file:posix()} |
{error, Other :: term() }.
% @private Attempt a read operation on the given offset and length.
% <li>
% <ul> If the byte range is not yet written, `{error, not_written}' is
% returned.</ul>
% <ul> If the checksum given does not match what comes off the disk,
% `{error, bad_checksum}' is returned.</ul>
% <ul> If the number of bytes that comes off the disk is not the requested length,
% `{error, partial_read}' is returned.</ul>
% <ul> If the offset is at or beyond the current file boundary, `eof' is returned.</ul>
% <ul> If some kind of POSIX error occurs, the OTP version of that POSIX error
% tuple is returned.</ul>
% </li>
%
do_read(FHd, Filename, CsumTable, Offset, Size, _, _) ->
%% Note that find/3 only returns overlapping chunks, both borders
%% are not aligned to original Offset and Size.
ChunkCsums = machi_csum_table:find(CsumTable, Offset, Size),
read_all_ranges(FHd, Filename, ChunkCsums, [], []).
-spec read_all_ranges(file:io_device(), string(),
[{non_neg_integer(),non_neg_integer(),trimmed|binary()}],
Chunks :: [{string(), Offset::non_neg_integer(), binary(), Csum::binary()}],
Trimmed :: [{string(), Offset::non_neg_integer(), Size::non_neg_integer()}]) ->
{ok, {
Chunks :: [{string(), Offset::non_neg_integer(), binary(), Csum::binary()}],
Trimmed :: [{string(), Offset::non_neg_integer(), Size::non_neg_integer()}]}} |
{erorr, term()|partial_read}.
read_all_ranges(_, _, [], ReadChunks, TrimmedChunks) ->
%% TODO: currently returns empty list of trimmed chunks
{ok, {lists:reverse(ReadChunks), lists:reverse(TrimmedChunks)}};
read_all_ranges(FHd, Filename, [{Offset, Size, trimmed}|T], ReadChunks, TrimmedChunks) ->
read_all_ranges(FHd, Filename, T, ReadChunks, [{Filename, Offset, Size}|TrimmedChunks]);
read_all_ranges(FHd, Filename, [{Offset, Size, TaggedCsum}|T], ReadChunks, TrimmedChunks) ->
case file:pread(FHd, Offset, Size) of
eof ->
read_all_ranges(FHd, Filename, T, ReadChunks, TrimmedChunks);
{ok, Bytes} when byte_size(Bytes) == Size, TaggedCsum =:= none ->
read_all_ranges(FHd, Filename, T,
[{Filename, Offset, Bytes,
machi_util:make_tagged_csum(none, <<>>)}|ReadChunks],
TrimmedChunks);
{ok, Bytes} when byte_size(Bytes) == Size ->
{Tag, Ck} = machi_util:unmake_tagged_csum(TaggedCsum),
case check_or_make_tagged_csum(Tag, Ck, Bytes) of
{error, Bad} ->
lager:error("Bad checksum; got ~p, expected ~p",
[Bad, Ck]),
{error, bad_checksum};
TaggedCsum ->
read_all_ranges(FHd, Filename, T,
[{Filename, Offset, Bytes, TaggedCsum}|ReadChunks],
TrimmedChunks);
OtherCsum when Tag =:= ?CSUM_TAG_NONE ->
%% XXX FIXME: Should we return something other than
%% {ok, ....} in this case?
read_all_ranges(FHd, Filename, T,
[{Filename, Offset, Bytes, OtherCsum}|ReadChunks],
TrimmedChunks)
end;
{ok, Partial} ->
lager:error("In file ~p, offset ~p, wanted to read ~p bytes, but got ~p",
[Filename, Offset, Size, byte_size(Partial)]),
{error, partial_read};
Other ->
lager:error("While reading file ~p, offset ~p, length ~p, got ~p",
[Filename, Offset, Size, Other]),
{error, Other}
end.
-spec handle_write( FHd :: file:io_device(),
CsumTable :: machi_csum_table:table(),
Filename :: string(),
TaggedCsum :: binary(),
Offset :: non_neg_integer(),
Data :: binary()
) -> ok |
{error, written} |
{error, Reason :: term()}.
% @private Implements the write and append operation. The first task is to
% determine if the offset and data size has been written. If not, the write
% is allowed proceed. A special case is made when an offset and data size
% match a checksum. In that case we read the data off the disk, validate the
% checksum and return a "fake" ok response as if the write had been performed
% when it hasn't really.
%
% If a write proceeds, the offset, size and checksum are written to a
% metadata file, and the internal list of unwritten bytes is modified
% to reflect the just-performed write. This is then returned to the
% caller as `ok'
handle_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Data) ->
Size = iolist_size(Data),
case machi_csum_table:find(CsumTable, Offset, Size) of
[] -> %% Nothing should be there
try
do_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Size, Data)
catch
%% XXX FIXME: be more specific on badmatch that might
%% occur around line 593 when we write the checksum
%% file entry for the data blob we just put on the disk
error:Reason ->
{error, Reason}
end;
[{Offset, Size, TaggedCsum}] ->
case do_read(FHd, Filename, CsumTable, Offset, Size, false, false) of
{error, _} = E ->
lager:warning("This should never happen: got ~p while reading"
" at offset ~p in file ~p that's supposedly written",
[E, Offset, Filename]),
{error, server_insanity};
{ok, {[{_, Offset, Data, TaggedCsum}], _}} ->
%% TODO: what if different checksum got from do_read()?
ok;
{ok, _Other} ->
%% TODO: leave some debug/warning message here?
{error, written}
end;
[{Offset, Size, OtherCsum}] ->
%% Got a checksum, but it doesn't match the data block's
lager:error("During a potential write at offset ~p in file ~p,"
" a check for unwritten bytes gave us checksum ~p"
" but the data we were trying to write has checksum ~p",
[Offset, Filename, OtherCsum, TaggedCsum]),
{error, written};
_Chunks ->
%% TODO: Do we try to read all continuous chunks to see
%% wether its total checksum matches client-provided checksum?
case machi_csum_table:any_trimmed(CsumTable, Offset, Size) of
true ->
%% More than a byte is trimmed, besides, do we
%% have to return exact written bytes? No. Clients
%% must issue read_chunk() with needs_trimmed
%% option as true
{error, trimmed};
false ->
%% No byte is trimmed, but at least one byte is written
{error, written}
end
end.
% @private Implements the disk writes for both the write and append
% operation.
-spec do_write( FHd :: file:io_device(),
CsumTable :: machi_csum_table:table(),
Filename :: string(),
TaggedCsum :: binary(),
Offset :: non_neg_integer(),
Size :: non_neg_integer(),
Data :: binary()
) -> ok | {error, Reason :: term()}.
do_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Size, Data) ->
case file:pwrite(FHd, Offset, Data) of
ok ->
lager:debug("Successful write in file ~p at offset ~p, length ~p",
[Filename, Offset, Size]),
%% Overlapping chunk; calculate checksum
%% read {LOffset, Offset - LOffset} and make csum
%% as server_sha
LUpdate = maybe_regenerate_checksum(
FHd,
machi_csum_table:find_leftneighbor(CsumTable, Offset)),
RUpdate = maybe_regenerate_checksum(
FHd,
machi_csum_table:find_rightneighbor(CsumTable, Offset+Size)),
ok = machi_csum_table:write(CsumTable, Offset, Size,
TaggedCsum, LUpdate, RUpdate),
lager:debug("Successful write to checksum file for ~p",
[Filename]),
ok;
Other ->
lager:error("Got ~p during write to file ~p at offset ~p, length ~p",
[Other, Filename, Offset, Size]),
{error, Other}
end.
%% @doc Trim both right and left border of chunks to fit in to given
%% range [LeftPos, RightPos]. TODO: write unit tests for this function.
%% Dialyzer 'can never match': slice_both_side([], _, _) ->
%% [];
slice_both_side([], _, _) ->
[];
slice_both_side([{F, Offset, Chunk, _Csum}|L], LeftPos, RightPos)
when Offset < LeftPos andalso LeftPos < RightPos ->
TrashLen = (LeftPos - Offset),
<<_:TrashLen/binary, NewChunk/binary>> = Chunk,
NewChecksum = machi_util:make_tagged_csum(?CSUM_TAG_SERVER_REGEN_SHA_ATOM, Chunk),
NewH = {F, LeftPos, NewChunk, NewChecksum},
slice_both_side([NewH|L], LeftPos, RightPos);
slice_both_side(Chunks, LeftPos, RightPos) when LeftPos =< RightPos ->
%% TODO: optimize
[{F, Offset, Chunk, _Csum}|L] = lists:reverse(Chunks),
Size = iolist_size(Chunk),
if RightPos < Offset + Size ->
NewSize = RightPos - Offset,
<<NewChunk:NewSize/binary, _/binary>> = Chunk,
NewChecksum = machi_util:make_tagged_csum(?CSUM_TAG_SERVER_REGEN_SHA_ATOM, Chunk),
lists:reverse([{F, Offset, NewChunk, NewChecksum}|L]);
true ->
Chunks
end.
maybe_regenerate_checksum(_, undefined) ->
undefined;
maybe_regenerate_checksum(_, {_, _, trimmed} = Change) ->
Change;
maybe_regenerate_checksum(FHd, {Offset, Size, _Csum}) ->
case file:pread(FHd, Offset, Size) of
eof ->
error({eof, Offset, Size});
{ok, Bytes} when byte_size(Bytes) =:= Size ->
TaggedCsum = machi_util:make_tagged_csum(server_regen_sha,
machi_util:checksum_chunk(Bytes)),
{Offset, Size, TaggedCsum};
Error ->
throw(Error)
end.
%% GC: make sure unwritten bytes = [{Eof, infinity}] and Eof is > max
%% file size walk through the checksum table and make sure all chunks
%% trimmed Then unlink the file
-spec maybe_gc(term(), #state{}) ->
{reply, term(), #state{}} | {stop, normal, term(), #state{}}.
maybe_gc(Reply, S = #state{eof_position = Eof,
max_file_size = MaxFileSize}) when Eof < MaxFileSize ->
lager:debug("The file is still small; not trying GC (Eof, MaxFileSize) = (~p, ~p)~n",
[Eof, MaxFileSize]),
{reply, Reply, S};
maybe_gc(Reply, S = #state{fluname=FluName,
data_filehandle = FHd,
data_dir = DataDir,
filename = Filename,
eof_position = Eof,
csum_table=CsumTable}) ->
case machi_csum_table:all_trimmed(CsumTable, ?MINIMUM_OFFSET, Eof) of
true ->
lager:debug("GC? Let's do it: ~p.~n", [Filename]),
%% Before unlinking a file, it should inform
%% machi_flu_filename_mgr that this file is
%% deleted and mark it as "trimmed" to avoid
%% filename reuse and resurrection. Maybe garbage
%% will remain if a process crashed but it also
%% should be recovered at filename_mgr startup.
%% Also, this should be informed *before* file proxy
%% deletes files.
ok = machi_flu_metadata_mgr:trim_file(FluName, {file, Filename}),
ok = file:close(FHd),
{_, DPath} = machi_util:make_data_filename(DataDir, Filename),
ok = file:delete(DPath),
machi_csum_table:delete(CsumTable),
{stop, normal, Reply,
S#state{data_filehandle=undefined,
csum_table=undefined}};
false ->
{reply, Reply, S}
end.
close_files(State = #state{data_filehandle = FHd,
csum_table = T}) ->
case FHd of
undefined ->
noop; %% file deleted
_ ->
ok = file:sync(FHd),
ok = file:close(FHd)
end,
case T of
undefined ->
noop; %% file deleted
_ ->
ok = machi_csum_table:close(T)
end,
State#state{data_filehandle = undefined, csum_table = undefined}.
forward_late_messages(FluName, F, Timeout) ->
receive
M ->
case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, F}) of
{ok, Pid} ->
Pid ! M;
{error, trimmed} ->
lager:error("TODO: FLU ~p file ~p reports trimmed status "
"when forwarding ~P\n",
[FluName, F, M, 20])
end,
forward_late_messages(FluName, F, Timeout)
after Timeout ->
ok
end.

View file

@ -0,0 +1,57 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc This is the main supervisor for the file proxies.
-module(machi_file_proxy_sup).
-behaviour(supervisor).
%% public API
-export([
child_spec/1,
start_link/1,
start_proxy/3
]).
%% supervisor callback
-export([
init/1
]).
child_spec(FluName) ->
Name = make_proxy_name(FluName),
{Name,
{?MODULE, start_link, [FluName]},
permanent, 5000, supervisor, [?MODULE]}.
start_link(FluName) ->
supervisor:start_link({local, make_proxy_name(FluName)}, ?MODULE, []).
start_proxy(FluName, DataDir, Filename) ->
supervisor:start_child(make_proxy_name(FluName),
[FluName, Filename, DataDir]).
init([]) ->
SupFlags = {simple_one_for_one, 1000, 10},
ChildSpec = {unused, {machi_file_proxy, start_link, []},
temporary, 2000, worker, [machi_file_proxy]},
{ok, {SupFlags, [ChildSpec]}}.
make_proxy_name(FluName) when is_atom(FluName) ->
list_to_atom(atom_to_list(FluName) ++ "_file_proxy_sup").

473
src/machi_fitness.erl Normal file
View file

@ -0,0 +1,473 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(machi_fitness).
-behaviour(gen_server).
-include("machi.hrl").
-include("machi_projection.hrl").
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-endif. % TEST
-define(LWWREG, riak_dt_lwwreg).
-define(MAP, riak_dt_map).
-define(DELAY_TIME_MS, 300). % TODO make configurable!
%% API
-export([start_link/1,
get_unfit_list/1, update_local_down_list/3,
add_admin_down/3, delete_admin_down/2,
send_fitness_update_spam/3,
send_spam_to_everyone/1,
trigger_early_adjustment/2]).
%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3, format_status/2]).
-record(state, {
my_flu_name :: atom() | binary(),
reg_name :: atom(),
local_down=[] :: list(),
admin_down=[] :: list({term(),term()}),
members_dict=orddict:new() :: orddict:orddict(),
proxies_dict=orddict:new() :: orddict:orddict(),
active_unfit=[] :: list(),
pending_map=?MAP:new() :: ?MAP:riak_dt_map(),
partition_simulator_p :: boolean()
}).
start_link(Args) ->
gen_server:start_link(?MODULE, Args, []).
get_unfit_list(PidSpec) ->
gen_server:call(PidSpec, {get_unfit_list}, infinity).
update_local_down_list(PidSpec, Down, MembersDict) ->
gen_server:call(PidSpec, {update_local_down_list, Down, MembersDict},
infinity).
add_admin_down(PidSpec, DownFLU, DownProps) ->
gen_server:call(PidSpec, {add_admin_down, DownFLU, DownProps},
infinity).
delete_admin_down(PidSpec, DownFLU) ->
gen_server:call(PidSpec, {delete_admin_down, DownFLU},
infinity).
send_fitness_update_spam(Pid, FromName, Dict) ->
gen_server:call(Pid, {incoming_spam, FromName, Dict}, infinity).
send_spam_to_everyone(Pid) ->
gen_server:call(Pid, {send_spam_to_everyone}, infinity).
%% @doc For testing purposes, we don't want a test to wait for
%% wall-clock time to elapse before the fitness server makes a
%% down->up status decision.
trigger_early_adjustment(Pid, FLU) ->
Pid ! {adjust_down_list, FLU}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
init([{MyFluName}|Args]) ->
RegName = machi_flu_psup:make_fitness_regname(MyFluName),
register(RegName, self()),
{ok, _} = timer:send_interval(5000, debug_dump),
UseSimulatorP = proplists:get_value(use_partition_simulator, Args, false),
{ok, #state{my_flu_name=MyFluName, reg_name=RegName,
partition_simulator_p=UseSimulatorP,
local_down=[complete_bogus_here_to_trigger_initial_spam]
}}.
handle_call({get_unfit_list}, _From, #state{active_unfit=ActiveUnfit}=S) ->
Reply = ActiveUnfit,
{reply, Reply, S};
handle_call({update_local_down_list, Down, MembersDict}, _From,
#state{my_flu_name=MyFluName, pending_map=OldMap,
local_down=OldDown, members_dict=OldMembersDict,
admin_down=AdminDown}=S) ->
verbose("FITNESS: ~w has down suspect ~w\n", [MyFluName, Down]),
NewMap = store_in_map(OldMap, MyFluName, erlang:now(), Down,
AdminDown, [props_yo]),
S2 = if Down == OldDown, MembersDict == OldMembersDict ->
%% Do nothing only if both are equal. If members_dict is
%% changing, that's sufficient reason to spam.
S;
true ->
do_map_change(NewMap, [MyFluName], MembersDict, S)
end,
{reply, ok, S2#state{local_down=Down}};
handle_call({add_admin_down, DownFLU, DownProps}, _From,
#state{my_flu_name=MyFluName,
local_down=OldDown, admin_down=AdminDown}=S) ->
verbose("FITNESS: ~w add admin down ~w\n", [MyFluName, DownFLU]),
NewAdminDown = [{DownFLU,DownProps}|lists:keydelete(DownFLU, 1, AdminDown)],
S3 = finish_admin_down(erlang:now(), OldDown, NewAdminDown,
[props_yo], S),
{reply, ok, S3};
handle_call({delete_admin_down, DownFLU}, _From,
#state{my_flu_name=MyFluName,
local_down=OldDown, admin_down=AdminDown}=S) ->
verbose("FITNESS: ~w delete admin down ~w\n", [MyFluName, DownFLU]),
NewAdminDown = lists:keydelete(DownFLU, 1, AdminDown),
S3 = finish_admin_down(erlang:now(), OldDown, NewAdminDown,
[props_yo], S),
{reply, ok, S3};
handle_call({incoming_spam, Author, Dict}, _From, S) ->
{Res, S2} = do_incoming_spam(Author, Dict, S),
{reply, Res, S2};
handle_call({send_spam_to_everyone}, _From, S) ->
{Res, S2} = do_send_spam_to_everyone(S),
{reply, Res, S2};
handle_call(_Request, _From, S) ->
Reply = whhhhhhhhhhhhhhaaaaaaaaaaaaaaa,
{reply, Reply, S}.
handle_cast(_Msg, S) ->
{noreply, S}.
handle_info({adjust_down_list, FLU}, #state{my_flu_name=MyFluName,
active_unfit=ActiveUnfit}=S) ->
NewUnfit = make_unfit_list(S),
Added_to_new = NewUnfit -- ActiveUnfit,
Dropped_from_new = ActiveUnfit -- NewUnfit,
%% io:format(user, "adjust_down_list: ~w: adjust ~w: add ~p drop ~p\n", [S#state.my_flu_name, FLU, Added_to_new, Dropped_from_new]),
%% We need to schedule a new round of adjustment messages. They might
%% be redundant, or they might not. Here's a case where the current
%% code needs the extra:
%%
%% SET partitions = [{a,c},{b,c},{c,b}] (11 of 26) at {23,37,44}
%% We are stable spam/gossip at:
%% [{a,problem_with,b},{b,problem_with,c},
%% {c,problem_with,a},{c,problem_with,b}]
%% So everyone agrees unfit=[c].
%%
%% SET partitions = [{c,a}] (12 of 26) at {23,37,48}
%% We are stable spam/gossip at:
%% [{a,problem_with,c},{c,problem_with,a}]
%% So everyone *ought* to agree that unfit=[a].
%%
%% In this case, when the partition list changes to [{c,a}],
%% then we will discover via spam gossip that reports by B & C will
%% change. However, our calc_unfit() via
%% make_unfit_list() algorithm will decide that *a* is the bad guy
%% and needs to go into our active_unfit list! And the only way
%% to get added is via an {adjust_down_list,...} message. The
%% usual place for generating them isn't wise enough because it
%% doesn't call make_unfit_list().
%%
%% The cost is that there will (at least) a 2x delay to the
%% ?DELAY_TIME_MS waiting period to detect all partitions.
%%
%% Aside: for all I know right now, there may be a corner case
%% hiding where we need this extra round of messages to *remove* a
%% FLU from the active_unfit list?
_ = schedule_adjust_messages(lists:usort(Added_to_new ++ Dropped_from_new)),
case {lists:member(FLU,Added_to_new), lists:member(FLU,Dropped_from_new)} of
{true, true} ->
error({bad, ?MODULE, ?LINE, FLU, ActiveUnfit, NewUnfit});
{true, false} ->
NewActive = wrap_active(MyFluName,lists:usort(ActiveUnfit++[FLU])),
{noreply, S#state{active_unfit=NewActive}};
{false, true} ->
NewActive = wrap_active(MyFluName,ActiveUnfit--[FLU]),
{noreply, S#state{active_unfit=NewActive}};
{false, false} ->
{noreply, S}
end;
handle_info(debug_dump, #state{my_flu_name=_MyFluName,active_unfit=_ActiveUnfit,
pending_map=_Map}=S) ->
%% io:format(user, "DUMP: ~w/~w: ~p ~W\n", [_MyFluName, self(), _ActiveUnfit, map_value(_Map), 13]),
%% io:format(user, "DUMP ~w: ~w, ", [MyFluName, ActiveUnfit]),
{noreply, S};
handle_info(_Info, S) ->
{noreply, S}.
terminate(_Reason, _S) ->
ok.
format_status(_Opt, [_PDict, Status]) ->
Fields = record_info(fields, state),
[_Name | Values] = tuple_to_list(Status),
lists:zip(Fields, Values).
code_change(_OldVsn, S, _Extra) ->
{ok, S}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
make_unfit_list(#state{members_dict=MembersDict}=S) ->
Now = erlang:now(),
F = fun({Server, {UpdateTime, DownList, AdminDown, _Props}},
{ProblemAcc, AdminAcc}) ->
case timer:now_diff(Now, UpdateTime) div (1000*1000) of
N when N > 900 -> % TODO make configurable
{ProblemAcc, AdminAcc};
_ ->
Probs = [{Server,problem_with,D} || D <- DownList],
{[Probs|ProblemAcc], AdminDown++AdminAcc}
end
end,
{Problems0, AdminDown} = map_fold(F, {[], []}, S#state.pending_map),
Problems = lists:flatten(Problems0),
All_list = [K || {K,_V} <- orddict:to_list(MembersDict)],
Unfit = calc_unfit(All_list, Problems),
lists:usort(Unfit ++ AdminDown).
store_in_map(Map, Name, Now, Down, AdminDown, Props) ->
{AdminDownServers, AdminDownProps0} = lists:unzip(AdminDown),
AdminDownProps = lists:append(AdminDownProps0), % flatten one level
Val = {Now, Down, AdminDownServers, Props ++ AdminDownProps},
map_set(Name, Map, Name, Val).
send_spam(NewMap, DontSendList, MembersDict, #state{my_flu_name=MyFluName}=S) ->
Send = fun(FLU, #p_srvr{address=Host, port=TcpPort}) ->
SpamProj = machi_projection:update_checksum(
#projection_v1{epoch_number=?SPAM_PROJ_EPOCH,
author_server=MyFluName,
dbg=[NewMap],
%% stuff only to make PB happy
all_members=[],
witnesses=[],
creation_time={1,2,3},
mode=ap_mode,
upi=[], repairing=[], down=[],
dbg2=[],
members_dict=[] }),
%% Best effort, don't care about failure.
spawn(fun() ->
send_projection(FLU, Host, TcpPort, SpamProj, S)
end)
end,
F = fun(FLU, P_srvr, Acc) ->
case lists:member(FLU, DontSendList) of
true ->
Acc;
false ->
Send(FLU, P_srvr),
[FLU|Acc]
end
end,
_Sent = orddict:fold(F, [], MembersDict),
ok.
send_projection(FLU, _Host, _TcpPort, SpamProj,
#state{my_flu_name=MyFluName, members_dict=MembersDict,
partition_simulator_p=SimulatorP}=S) ->
%% At the moment, we're using utterly-temporary-hack method of tunneling
%% our messages through the write_projection API. Eventually the PB
%% API should be expanded to accomodate this new fitness service.
%% This is "best effort" only, use catch to ignore failures.
ProxyPid = (catch proxy_pid(FLU, S)),
DoIt = fun(_ArgIgnored) ->
machi_proxy_flu1_client:write_projection(ProxyPid,
public, SpamProj)
end,
ProxyPidPlaceholder = proxy_pid_unused,
if SimulatorP ->
AllMembers = [K || {K,_V} <- orddict:to_list(MembersDict)],
{Partitions, _Islands} = machi_partition_simulator:get(AllMembers),
machi_chain_manager1:init_remember_down_list(),
Res = (catch machi_chain_manager1:perhaps_call(ProxyPidPlaceholder,
MyFluName,
Partitions, FLU, DoIt)),
%% case machi_chain_manager1:get_remember_down_list() of
%% [] ->
%% ok;
%% _ ->
%% io:format(user, "fitness error ~w -> ~w\n",
%% [MyFluName, FLU])
%% end,
Res;
true ->
(catch DoIt(ProxyPidPlaceholder))
end.
proxy_pid(Name, #state{proxies_dict=ProxiesDict}) ->
orddict:fetch(Name, ProxiesDict).
calc_unfit(All_list, HosedAnnotations) ->
G = digraph:new(),
_ = [digraph:add_vertex(G, V) || V <- All_list],
_ = [digraph:add_edge(G, V1, V2) || {V1, problem_with, V2} <- HosedAnnotations],
calc_unfit2(lists:sort(digraph:vertices(G)), G).
calc_unfit2([], G) ->
digraph:delete(G),
[];
calc_unfit2([H|T], G) ->
case digraph:in_degree(G, H) of
0 ->
calc_unfit2(T, G);
1 ->
Neighbors = digraph:in_neighbours(G, H),
case [V || V <- Neighbors, digraph:in_degree(G, V) == 1] of
[AlsoOne|_] ->
%% TODO: be smarter here about the choice of which is down.
[H|calc_unfit2(T -- [AlsoOne], G)];
[] ->
%% H is "on the end", e.g. 1-2-1, so it's OK.
calc_unfit2(T, G)
end;
N when N > 1 ->
[H|calc_unfit2(T, G)]
end.
do_incoming_spam(_Author, Map,
#state{my_flu_name=MyFluName,pending_map=OldMap,
members_dict=MembersDict}=S) ->
OldMapV = map_value(OldMap),
MapV = map_value(Map),
if MapV == OldMapV ->
{ok, S};
true ->
%% io:format(user, "YY1 ~p\n", [OldMapV]),
%% io:format(user, "YY2 ~p\n", [MapV]),
NewMap = map_merge(OldMap, Map),
%% NewMapV = map_value(NewMap),
%% io:format(user, "YY3 ~p\n", [NewMapV]),
%% Hrm, we may have changes that are interesting to the
%% Author of this update, so perhaps we shouldn't exclude
%% Author from our update, right?
S2 = do_map_change(NewMap, [MyFluName], MembersDict, S),
{ok, S2}
end.
do_send_spam_to_everyone(#state{my_flu_name=MyFluName,
pending_map=Map,members_dict=MembersDict}=S) ->
_ = send_spam(Map, [MyFluName], MembersDict, S),
{ok, S}.
do_map_change(NewMap, DontSendList, MembersDict,
#state{my_flu_name=_MyFluName, pending_map=OldMap}=S) ->
send_spam(NewMap, DontSendList, MembersDict, S),
ChangedServers = find_changed_servers(OldMap, NewMap, _MyFluName),
_ = schedule_adjust_messages(ChangedServers),
%% _OldMapV = map_value(OldMap),
%% _MapV = map_value(NewMap),
%% io:format(user, "TODO: ~w async tick trigger/scheduling... ~w for:\n"
%% " ~p\n ~p\n",[_MyFluName,ChangedServers,_OldMapV,_MapV]),
S2 = perhaps_adjust_members_proxies_dicts(MembersDict, S),
S2#state{pending_map=NewMap}.
perhaps_adjust_members_proxies_dicts(SameMembersDict,
#state{members_dict=SameMembersDict}=S) ->
S;
perhaps_adjust_members_proxies_dicts(MembersDict,
#state{proxies_dict=OldProxiesDict}=S) ->
_ = machi_proxy_flu1_client:stop_proxies(OldProxiesDict),
ProxiesDict = machi_proxy_flu1_client:start_proxies(MembersDict),
S#state{members_dict=MembersDict, proxies_dict=ProxiesDict}.
find_changed_servers(OldMap, NewMap, _MyFluName) ->
AddBad = fun({_Who, {_Time, BadList, AdminDown, _Props}}, Acc) ->
BadList ++ AdminDown ++ Acc
end,
OldBad = map_fold(AddBad, [], OldMap),
NewBad = map_fold(AddBad, [], NewMap),
lists:usort((OldBad -- NewBad) ++ (NewBad -- OldBad)).
schedule_adjust_messages(FLU_list) ->
[erlang:send_after(?DELAY_TIME_MS, self(), {adjust_down_list, FLU}) ||
FLU <- FLU_list].
finish_admin_down(Time, Down, NewAdminDown, Props,
#state{my_flu_name=MyFluName, local_down=Down,
pending_map=OldMap, members_dict=MembersDict}=S) ->
NewMap = store_in_map(OldMap, MyFluName, Time, Down, NewAdminDown, Props),
S2 = S#state{admin_down=NewAdminDown},
do_map_change(NewMap, [MyFluName], MembersDict, S2).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
map_set(Actor, Map, Key, ValTerm) ->
Field = {Key, ?LWWREG},
Val = term_to_binary(ValTerm),
{ok, Map2} = ?MAP:update({update, [{update, Field, {assign, Val}}]},
Actor, Map),
Map2.
-ifdef(TEST).
map_get(Map, Key) ->
Field = {Key, ?LWWREG},
case lists:keyfind(Field, 1, ?MAP:value(Map)) of
false ->
error;
{Field, ValBin} ->
{ok, binary_to_term(ValBin)}
end.
-endif. % TEST
map_fold(Fun, Acc, Map) ->
Vs = map_value(Map),
lists:foldl(Fun, Acc, lists:sort(Vs)).
map_value(Map) ->
lists:sort([{K, binary_to_term(V)} || {{K, _Type}, V} <- ?MAP:value(Map)]).
map_merge(Map1, Map2) ->
?MAP:merge(Map1, Map2).
wrap_active(MyFluName, L) ->
verbose("FITNESS: ~w has new down list ~w\n", [MyFluName, L]),
L.
verbose(Fmt, Args) ->
case application:get_env(machi, fitness_verbose) of
{ok, true} ->
error_logger:info_msg(Fmt, Args);
_ ->
ok
end.
-ifdef(TEST).
dt_understanding_test() ->
F1 = {'X', riak_dt_lwwreg},
F2 = {'Y', riak_dt_lwwreg},
{ok, Map1} = ?MAP:update({update, [{update, F1, {assign, <<"A">>}}]}, a, ?MAP:new()),
{ok, Map2} = ?MAP:update({update, [{update, F2, {assign, <<"B2">>}}]}, b, ?MAP:new()),
%% io:format(user, "\n", []),
%% io:format(user, "Merge comparison: ~p\n", [?MAP:merge(Map1, Map2) == ?MAP:merge(Map2, Map1)]),
%% io:format(user, "M12 Val: ~p\n", [?MAP:value(?MAP:merge(Map1, Map2))]),
%% io:format(user, "M21 Val: ~p\n", [?MAP:value(?MAP:merge(Map2, Map1))]),
?MAP:merge(Map1, Map2) == ?MAP:merge(Map2, Map1).
smoke_test() ->
Map1 = map_set(a, ?MAP:new(), k1, val1),
Map2 = map_set(a, Map1, k2, val2),
{ok, val1} = map_get(Map2, k1),
{ok, val2} = map_get(Map2, k2),
error = map_get(Map2, does_not_exist),
Map3 = map_set(a, Map2, k3, val3),
[{k3,1},{k2,1},{k1,1}] = map_fold(fun({K,_}, Acc) -> [{K,1}|Acc] end,
[], Map3),
ok.
-endif. % TEST

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,193 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc Machi FLU1 append serialization server process
-module(machi_flu1_append_server).
-behavior(gen_server).
-include("machi.hrl").
-include("machi_projection.hrl").
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-endif. % TEST
-export([start_link/4]).
-export([init/1]).
-export([handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3]).
-export([int_update_wedge_state/3, int_wedge_myself/2]).
-export([current_state/1, format_state/1]).
-record(state, {
flu_name :: atom(),
witness = false :: boolean(),
wedged = true :: boolean(),
etstab :: ets:tid(),
epoch_id :: 'undefined' | machi_dt:epoch_id()
}).
-define(INIT_TIMEOUT, 60*1000).
-define(CALL_TIMEOUT, 60*1000).
-spec start_link(pv1_server(), boolean(), boolean(),
undefined | machi_dt:epoch_id()) -> {ok, pid()}.
start_link(Fluname, Witness_p, Wedged_p, EpochId) ->
%% Reminder: Name is the "main" name of the FLU, i.e., no suffix
gen_server:start_link({local, Fluname},
?MODULE, [Fluname, Witness_p, Wedged_p, EpochId],
[{timeout, ?INIT_TIMEOUT}]).
-spec current_state(atom() | pid()) -> term().
current_state(PidSpec) ->
gen_server:call(PidSpec, current_state, ?CALL_TIMEOUT).
format_state(State) ->
Fields = record_info(fields, state),
[_Name | Values] = tuple_to_list(State),
lists:zip(Fields, Values).
int_update_wedge_state(PidSpec, Boolean, EpochId)
when is_boolean(Boolean), is_tuple(EpochId) ->
gen_server:cast(PidSpec, {wedge_state_change, Boolean, EpochId}).
int_wedge_myself(PidSpec, EpochId)
when is_tuple(EpochId) ->
gen_server:cast(PidSpec, {wedge_myself, EpochId}).
init([Fluname, Witness_p, Wedged_p, EpochId]) ->
TID = ets:new(machi_flu1:ets_table_name(Fluname),
[set, protected, named_table, {read_concurrency, true}]),
ets:insert(TID, {epoch, {Wedged_p, EpochId}}),
{ok, #state{flu_name=Fluname, witness=Witness_p, wedged=Wedged_p,
etstab=TID, epoch_id=EpochId}}.
handle_call({seq_append, _From2, _NSInfo, _EpochID, _Prefix, _Chunk, _TCSum, _Opts},
_From, #state{witness=true}=S) ->
%% The FLU's machi_flu1_net_server process ought to filter all
%% witness states, but we'll keep this clause for extra
%% paranoia.
{reply, witness, S};
handle_call({seq_append, _From2, _NSInfo, _EpochID, _Prefix, _Chunk, _TCSum, _Opts},
_From, #state{wedged=true}=S) ->
{reply, wedged, S};
handle_call({seq_append, _From2, NSInfo, EpochID,
Prefix, Chunk, TCSum, Opts},
From, #state{flu_name=FluName, epoch_id=OldEpochId}=S) ->
%% Old is the one from our state, plain old 'EpochID' comes
%% from the client.
_ = case OldEpochId of
EpochID ->
spawn(fun() ->
append_server_dispatch(From, NSInfo,
Prefix, Chunk, TCSum, Opts,
FluName, EpochID)
end),
{noreply, S};
_ ->
{reply, {error, bad_epoch}, S}
end;
%% TODO: Who sends this message?
handle_call(wedge_status, _From,
#state{wedged=Wedged_p, epoch_id=EpochId} = S) ->
{reply, {wedge_status_reply, Wedged_p, EpochId}, S};
handle_call(current_state, _From, S) ->
{reply, S, S};
handle_call(Else, From, S) ->
io:format(user, "~s:handle_call: WHA? from=~w ~w\n", [?MODULE, From, Else]),
{noreply, S}.
handle_cast({wedge_myself, WedgeEpochId},
#state{flu_name=FluName, wedged=Wedged_p, epoch_id=OldEpochId}=S) ->
if not Wedged_p andalso WedgeEpochId == OldEpochId ->
true = ets:insert(S#state.etstab,
{epoch, {true, OldEpochId}}),
%% Tell my chain manager that it might want to react to
%% this new world.
Chmgr = machi_chain_manager1:make_chmgr_regname(FluName),
spawn(fun() ->
catch machi_chain_manager1:trigger_react_to_env(Chmgr)
end),
{noreply, S#state{wedged=true}};
true ->
{noreply, S}
end;
handle_cast({wedge_state_change, Boolean, {NewEpoch, _}=NewEpochId},
#state{epoch_id=OldEpochId}=S) ->
OldEpoch = case OldEpochId of {OldE, _} -> OldE;
undefined -> -1
end,
if NewEpoch >= OldEpoch ->
true = ets:insert(S#state.etstab,
{epoch, {Boolean, NewEpochId}}),
{noreply, S#state{wedged=Boolean, epoch_id=NewEpochId}};
true ->
{noreply, S}
end;
handle_cast(Else, S) ->
io:format(user, "~s:handle_cast: WHA? ~p\n", [?MODULE, Else]),
{noreply, S}.
handle_info(Else, S) ->
io:format(user, "~s:handle_info: WHA? ~p\n", [?MODULE, Else]),
{noreply, S}.
terminate(normal, _S) ->
ok;
terminate(Reason, _S) ->
lager:warning("~s:terminate: ~w", [?MODULE, Reason]),
ok.
code_change(_OldVsn, S, _Extra) ->
{ok, S}.
append_server_dispatch(From, NSInfo,
Prefix, Chunk, TCSum, Opts, FluName, EpochId) ->
Result = case handle_append(NSInfo,
Prefix, Chunk, TCSum, Opts, FluName, EpochId) of
{ok, File, Offset} ->
{assignment, Offset, File};
Other ->
Other
end,
_ = gen_server:reply(From, Result),
ok.
handle_append(NSInfo,
Prefix, Chunk, TCSum, Opts, FluName, EpochId) ->
Res = machi_flu_filename_mgr:find_or_make_filename_from_prefix(
FluName, EpochId, {prefix, Prefix}, NSInfo),
case Res of
{file, F} ->
case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, F}) of
{ok, Pid} ->
{Tag, CS} = machi_util:unmake_tagged_csum(TCSum),
Meta = [{client_csum_tag, Tag}, {client_csum, CS}],
Extra = Opts#append_opts.chunk_extra,
machi_file_proxy:append(Pid, Meta, Extra, Chunk);
{error, trimmed} = E ->
E
end;
Error ->
Error
end.

View file

@ -38,6 +38,71 @@
%% TODO This EDoc was written first, and the EDoc and also `-type' and
%% `-spec' definitions for {@link machi_proxy_flu1_client} and {@link
%% machi_cr_client} must be improved.
%%
%% == Client API implementation notes ==
%%
%% At the moment, there are several modules that implement various
%% subsets of the Machi API. The table below attempts to show how and
%% why they differ.
%%
%% ```
%% |--------------------------+-------+-----+------+------+-------+----------------|
%% | | PB | | # | | Conn | Epoch & NS |
%% | Module name | Level | CR? | FLUS | Impl | Life? | version aware? |
%% |--------------------------+-------+-----+------+------+-------+----------------|
%% | machi_pb_high_api_client | high | yes | many | proc | long | no |
%% | machi_cr_client | low | yes | many | proc | long | no |
%% | machi_proxy_flu1_client | low | no | 1 | proc | long | yes |
%% | machi_flu1_client | low | no | 1 | lib | short | yes |
%% |--------------------------+-------+-----+------+------+-------+----------------|
%% '''
%%
%% In terms of use and API layering, the table rows are in highest`->'lowest
%% order: each level calls the layer immediately below it.
%%
%% <dl>
%% <dt> <b> PB Level</b> </dt>
%% <dd> The Protocol Buffers API is divided logically into two levels,
%% "low" and "high". The low-level protocol is used for intra-chain
%% communication. The high-level protocol is used for clients outside
%% of a Machi chain or Machi cluster of chains.
%% </dd>
%% <dt> <b> CR?</b> </dt>
%% <dd> Does this API support (directly or indirectly) Chain
%% Replication? If `no', then the API has no awareness of multiple
%% replicas of any file or file chunk; unaware clients can only
%% perform operations at a single Machi FLU's file service or
%% projection store service.
%% </dd>
%% <dt> <b> # FLUs</b> </dt>
%% <dd> Now many FLUs does this API layer communicate with
%% simultaneously? Note that there is a one-to-one correspondence
%% between this value and the "CR?" column's value.
%% </dd>
%% <dt> <b> Impl</b> </dt>
%% <dd> Implementation: library-only or an Erlang process,
%% e.g., `gen_server'.
%% </dd>
%% <dt> <b> Conn Life?</b> </dt>
%% <dd> Expected TCP session connection life: short or long. At the
%% lowest level, the {@link machi_flu1_client} API implementation takes
%% no effort to reconnect to a remote FLU when its single TCP session
%% is broken. For long-lived connection life APIs, the server side will
%% automatically attempt to reconnect to remote FLUs when a TCP session
%% is broken.
%% </dd>
%% <dt> <b> Epoch &amp; NS version aware?</b> </dt>
%% <dd> Are clients of this API responsible for knowing a chain's EpochID
%% and namespace version numbers? If `no', then the server side of the
%% API will automatically attempt to discover/re-discover the EpochID and
%% namespace version numbers whenever they change.
%% </dd>
%% </dl>
%%
%% The only protocol that we expect to be used by entities outside of
%% a single Machi chain or a multi-chain cluster is the "high"
%% Protocol Buffers API. The {@link riak_pb_high_api_client} module
%% is an Erlang reference implementation of this PB API.
-module(machi_flu1_client).
@ -45,14 +110,20 @@
-include("machi_pb.hrl").
-include("machi_projection.hrl").
-define(HARD_TIMEOUT, 2500).
-ifdef(PULSE).
-compile({parse_transform, pulse_instrument}).
-include_lib("pulse_otp/include/pulse_otp.hrl").
-endif.
-define(SHORT_TIMEOUT, 2500).
-define(LONG_TIMEOUT, (60*1000)).
-export([
%% File API
append_chunk/4, append_chunk/5,
append_chunk_extra/5, append_chunk_extra/6,
read_chunk/5, read_chunk/6,
checksum_list/3, checksum_list/4,
append_chunk/6, append_chunk/7,
append_chunk/8, append_chunk/9,
read_chunk/7, read_chunk/8,
checksum_list/2, checksum_list/3,
list_files/2, list_files/3,
wedge_status/1, wedge_status/2,
@ -63,6 +134,7 @@
write_projection/3, write_projection/4,
get_all_projections/2, get_all_projections/3,
list_all_projections/2, list_all_projections/3,
kick_projection_reaction/2, kick_projection_reaction/3,
%% Common API
echo/2, echo/3,
@ -73,34 +145,48 @@
]).
%% For "internal" replication only.
-export([
write_chunk/5, write_chunk/6,
write_chunk/7, write_chunk/8,
trim_chunk/6,
delete_migration/3, delete_migration/4,
trunc_hack/3, trunc_hack/4
]).
-type port_wrap() :: {w,atom(),term()}.
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
-spec append_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_prefix(), machi_dt:chunk()) ->
-spec append_chunk(port_wrap(),
'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(),
machi_dt:file_prefix(), machi_dt:chunk(),
machi_dt:chunk_csum()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk(Sock, EpochID, Prefix, Chunk) ->
append_chunk2(Sock, EpochID, Prefix, Chunk, 0).
append_chunk(Sock, NSInfo, EpochID, Prefix, Chunk, CSum) ->
append_chunk(Sock, NSInfo, EpochID, Prefix, Chunk, CSum,
#append_opts{}, ?LONG_TIMEOUT).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
%% with `Prefix' and also request an additional `Extra' bytes.
%%
%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then
%% the file offsets that follow `Chunk''s position for the following 4K will
%% be reserved by the file sequencer for later write(s) by the
%% `write_chunk()' API.
-spec append_chunk(machi_dt:inet_host(), machi_dt:inet_port(),
machi_dt:epoch_id(), machi_dt:file_prefix(), machi_dt:chunk()) ->
'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(),
machi_dt:file_prefix(), machi_dt:chunk(),
machi_dt:chunk_csum()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk(Host, TcpPort, EpochID, Prefix, Chunk) ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
try
append_chunk2(Sock, EpochID, Prefix, Chunk, 0)
after
disconnect(Sock)
end.
append_chunk(Host, TcpPort, NSInfo, EpochID, Prefix, Chunk, CSum) ->
append_chunk(Host, TcpPort, NSInfo, EpochID, Prefix, Chunk, CSum,
#append_opts{}, ?LONG_TIMEOUT).
-spec append_chunk(port_wrap(),
'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(),
machi_dt:file_prefix(), machi_dt:chunk(),
machi_dt:chunk_csum(), machi_dt:append_opts(), timeout()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk(Sock, NSInfo0, EpochID, Prefix, Chunk, CSum, Opts, Timeout) ->
NSInfo = machi_util:ns_info_default(NSInfo0),
append_chunk2(Sock, NSInfo, EpochID, Prefix, Chunk, CSum, Opts, Timeout).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix' and also request an additional `Extra' bytes.
@ -110,66 +196,62 @@ append_chunk(Host, TcpPort, EpochID, Prefix, Chunk) ->
%% be reserved by the file sequencer for later write(s) by the
%% `write_chunk()' API.
-spec append_chunk_extra(port_wrap(), machi_dt:epoch_id(), machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) ->
-spec append_chunk(machi_dt:inet_host(), machi_dt:inet_port(),
'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(),
machi_dt:file_prefix(), machi_dt:chunk(),
machi_dt:chunk_csum(), machi_dt:append_opts(), timeout()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk_extra(Sock, EpochID, Prefix, Chunk, ChunkExtra)
when is_integer(ChunkExtra), ChunkExtra >= 0 ->
append_chunk2(Sock, EpochID, Prefix, Chunk, ChunkExtra).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix' and also request an additional `Extra' bytes.
%%
%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then
%% the file offsets that follow `Chunk''s position for the following 4K will
%% be reserved by the file sequencer for later write(s) by the
%% `write_chunk()' API.
-spec append_chunk_extra(machi_dt:inet_host(), machi_dt:inet_port(),
machi_dt:epoch_id(), machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) ->
{ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}.
append_chunk_extra(Host, TcpPort, EpochID, Prefix, Chunk, ChunkExtra)
when is_integer(ChunkExtra), ChunkExtra >= 0 ->
append_chunk(Host, TcpPort, NSInfo0, EpochID,
Prefix, Chunk, CSum, Opts, Timeout) ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
try
append_chunk2(Sock, EpochID, Prefix, Chunk, ChunkExtra)
NSInfo = machi_util:ns_info_default(NSInfo0),
append_chunk2(Sock, NSInfo, EpochID,
Prefix, Chunk, CSum, Opts, Timeout)
after
disconnect(Sock)
end.
%% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
-spec read_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size()) ->
{ok, machi_dt:chunk_s()} |
-spec read_chunk(port_wrap(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size(),
machi_dt:read_opts_x()) ->
{ok, {[machi_dt:chunk_summary()], [machi_dt:chunk_pos()]}} |
{error, machi_dt:error_general() | 'not_written' | 'partial_read'} |
{error, term()}.
read_chunk(Sock, EpochID, File, Offset, Size)
read_chunk(Sock, NSInfo0, EpochID, File, Offset, Size, Opts0)
when Offset >= ?MINIMUM_OFFSET, Size >= 0 ->
read_chunk2(Sock, EpochID, File, Offset, Size).
NSInfo = machi_util:ns_info_default(NSInfo0),
Opts = machi_util:read_opts_default(Opts0),
read_chunk2(Sock, NSInfo, EpochID, File, Offset, Size, Opts).
%% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
-spec read_chunk(machi_dt:inet_host(), machi_dt:inet_port(), machi_dt:epoch_id(),
machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size()) ->
{ok, machi_dt:chunk_s()} |
-spec read_chunk(machi_dt:inet_host(), machi_dt:inet_port(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(),
machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size(),
machi_dt:read_opts_x()) ->
{ok, [machi_dt:chunk_summary()]} |
{error, machi_dt:error_general() | 'not_written' | 'partial_read'} |
{error, term()}.
read_chunk(Host, TcpPort, EpochID, File, Offset, Size)
read_chunk(Host, TcpPort, NSInfo0, EpochID, File, Offset, Size, Opts0)
when Offset >= ?MINIMUM_OFFSET, Size >= 0 ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
NSInfo = machi_util:ns_info_default(NSInfo0),
Opts = machi_util:read_opts_default(Opts0),
try
read_chunk2(Sock, EpochID, File, Offset, Size)
read_chunk2(Sock, NSInfo, EpochID, File, Offset, Size, Opts)
after
disconnect(Sock)
end.
%% @doc Fetch the list of chunk checksums for `File'.
-spec checksum_list(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name()) ->
-spec checksum_list(port_wrap(), machi_dt:file_name()) ->
{ok, binary()} |
{error, machi_dt:error_general() | 'no_such_file' | 'partial_read'} |
{error, term()}.
checksum_list(Sock, EpochID, File) ->
checksum_list2(Sock, EpochID, File).
checksum_list(Sock, File) ->
checksum_list2(Sock, File).
%% @doc Fetch the list of chunk checksums for `File'.
%%
@ -193,13 +275,13 @@ checksum_list(Sock, EpochID, File) ->
%% Details of the encoding used inside the `binary()' blog can be found
%% in the EDoc comments for {@link machi_flu1:decode_csum_file_entry/1}.
-spec checksum_list(machi_dt:inet_host(), machi_dt:inet_port(), machi_dt:epoch_id(), machi_dt:file_name()) ->
-spec checksum_list(machi_dt:inet_host(), machi_dt:inet_port(), machi_dt:file_name()) ->
{ok, binary()} |
{error, machi_dt:error_general() | 'no_such_file'} | {error, term()}.
checksum_list(Host, TcpPort, EpochID, File) when is_integer(TcpPort) ->
checksum_list(Host, TcpPort, File) when is_integer(TcpPort) ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
try
checksum_list2(Sock, EpochID, File)
checksum_list2(Sock, File)
after
disconnect(Sock)
end.
@ -226,7 +308,7 @@ list_files(Host, TcpPort, EpochID) when is_integer(TcpPort) ->
%% @doc Fetch the wedge status from the remote FLU.
-spec wedge_status(port_wrap()) ->
{ok, {boolean(), machi_dt:epoch_id()}} | {error, term()}.
{ok, {boolean(), machi_dt:epoch_id(), machi_dt:namespace_version(),machi_dt:namespace()}} | {error, term()}.
wedge_status(Sock) ->
wedge_status2(Sock).
@ -234,7 +316,7 @@ wedge_status(Sock) ->
%% @doc Fetch the wedge status from the remote FLU.
-spec wedge_status(machi_dt:inet_host(), machi_dt:inet_port()) ->
{ok, {boolean(), machi_dt:epoch_id()}} | {error, term()}.
{ok, {boolean(), machi_dt:epoch_id(), machi_dt:namespace_version(),machi_dt:namespace()}} | {error, term()}.
wedge_status(Host, TcpPort) when is_integer(TcpPort) ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
try
@ -374,6 +456,27 @@ list_all_projections(Host, TcpPort, ProjType)
disconnect(Sock)
end.
%% @doc Kick (politely) the remote chain manager to react to a
%% projection change.
-spec kick_projection_reaction(port_wrap(), list()) ->
ok.
kick_projection_reaction(Sock, Options) ->
kick_projection_reaction2(Sock, Options).
%% @doc Kick (politely) the remote chain manager to react to a
%% projection change.
-spec kick_projection_reaction(machi_dt:inet_host(), machi_dt:inet_port(), list()) ->
ok.
kick_projection_reaction(Host, TcpPort, Options) ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
try
kick_projection_reaction2(Sock, Options)
after
disconnect(Sock)
end.
%% @doc Echo -- test protocol round-trip.
-spec echo(port_wrap(), string()) ->
@ -424,27 +527,46 @@ disconnect(_) ->
%% @doc Restricted API: Write a chunk of already-sequenced data to
%% `File' at `Offset'.
-spec write_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk()) ->
-spec write_chunk(port_wrap(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk(), machi_dt:chunk_csum()) ->
ok | {error, machi_dt:error_general()} | {error, term()}.
write_chunk(Sock, EpochID, File, Offset, Chunk)
write_chunk(Sock, NSInfo0, EpochID, File, Offset, Chunk, CSum)
when Offset >= ?MINIMUM_OFFSET ->
write_chunk2(Sock, EpochID, File, Offset, Chunk).
NSInfo = machi_util:ns_info_default(NSInfo0),
write_chunk2(Sock, NSInfo, EpochID, File, Offset, Chunk, CSum).
%% @doc Restricted API: Write a chunk of already-sequenced data to
%% `File' at `Offset'.
-spec write_chunk(machi_dt:inet_host(), machi_dt:inet_port(),
machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk()) ->
'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk(), machi_dt:chunk_csum()) ->
ok | {error, machi_dt:error_general()} | {error, term()}.
write_chunk(Host, TcpPort, EpochID, File, Offset, Chunk)
write_chunk(Host, TcpPort, NSInfo0, EpochID, File, Offset, Chunk, CSum)
when Offset >= ?MINIMUM_OFFSET ->
Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}),
try
write_chunk2(Sock, EpochID, File, Offset, Chunk)
NSInfo = machi_util:ns_info_default(NSInfo0),
write_chunk2(Sock, NSInfo, EpochID, File, Offset, Chunk, CSum)
after
disconnect(Sock)
end.
%% @doc Restricted API: Write a chunk of already-sequenced data to
%% `File' at `Offset'.
-spec trim_chunk(port_wrap(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size()) ->
ok | {error, machi_dt:error_general()} | {error, term()}.
trim_chunk(Sock, NSInfo0, EpochID, File0, Offset, Size)
when Offset >= ?MINIMUM_OFFSET ->
ReqID = <<"id">>,
NSInfo = machi_util:ns_info_default(NSInfo0),
#ns_info{version=NSVersion, name=NS} = NSInfo,
File = machi_util:make_binary(File0),
true = (Offset >= ?MINIMUM_OFFSET),
Req = machi_pb_translate:to_pb_request(
ReqID,
{low_trim_chunk, NSVersion, NS, EpochID, File, Offset, Size, 0}),
do_pb_request_common(Sock, ReqID, Req).
%% @doc Restricted API: Delete a file after it has been successfully
%% migrated.
@ -489,83 +611,88 @@ trunc_hack(Host, TcpPort, EpochID, File) when is_integer(TcpPort) ->
%%%%%%%%%%%%%%%%%%%%%%%%%%%
read_chunk2(Sock, EpochID, File0, Offset, Size) ->
read_chunk2(Sock, NSInfo, EpochID, File0, Offset, Size, Opts) ->
ReqID = <<"id">>,
#ns_info{version=NSVersion, name=NS} = NSInfo,
File = machi_util:make_binary(File0),
Req = machi_pb_translate:to_pb_request(
ReqID,
{low_read_chunk, EpochID, File, Offset, Size, []}),
{low_read_chunk, NSVersion, NS, EpochID, File, Offset, Size, Opts}),
do_pb_request_common(Sock, ReqID, Req).
append_chunk2(Sock, EpochID, Prefix0, Chunk0, ChunkExtra) ->
append_chunk2(Sock, NSInfo, EpochID,
Prefix0, Chunk, CSum0, Opts, Timeout) ->
ReqID = <<"id">>,
{Chunk, CSum_tag, CSum} =
case Chunk0 of
X when is_binary(X) ->
{Chunk0, ?CSUM_TAG_NONE, <<>>};
{ChunkCSum, Chk} ->
{Tag, CS} = machi_util:unmake_tagged_csum(ChunkCSum),
{Chk, Tag, CS}
end,
PKey = <<>>, % TODO
Prefix = machi_util:make_binary(Prefix0),
{CSum_tag, CSum} = case CSum0 of
<<>> ->
{?CSUM_TAG_NONE, <<>>};
{_Tag, _CS} ->
CSum0;
B when is_binary(B) ->
machi_util:unmake_tagged_csum(CSum0)
end,
#ns_info{version=NSVersion, name=NS, locator=NSLocator} = NSInfo,
%% NOTE: The tuple position of NSLocator is a bit odd, because EpochID
%% _must_ be in the 4th position (as NSV & NS must be in 2nd & 3rd).
Req = machi_pb_translate:to_pb_request(
ReqID,
{low_append_chunk, EpochID, PKey, Prefix, Chunk, CSum_tag, CSum,
ChunkExtra}),
do_pb_request_common(Sock, ReqID, Req).
{low_append_chunk, NSVersion, NS, EpochID, NSLocator,
Prefix, Chunk, CSum_tag, CSum, Opts}),
do_pb_request_common(Sock, ReqID, Req, true, Timeout).
write_chunk2(Sock, EpochID, File0, Offset, Chunk0) ->
write_chunk2(Sock, NSInfo, EpochID, File0, Offset, Chunk, CSum0) ->
ReqID = <<"id">>,
#ns_info{version=NSVersion, name=NS} = NSInfo,
File = machi_util:make_binary(File0),
true = (Offset >= ?MINIMUM_OFFSET),
{Chunk, CSum_tag, CSum} =
case Chunk0 of
X when is_binary(X) ->
{Chunk0, ?CSUM_TAG_NONE, <<>>};
{ChunkCSum, Chk} ->
{Tag, CS} = machi_util:unmake_tagged_csum(ChunkCSum),
{Chk, Tag, CS}
end,
{CSum_tag, CSum} = case CSum0 of
<<>> ->
{?CSUM_TAG_NONE, <<>>};
{_Tag, _CS} ->
CSum0;
B when is_binary(B) ->
machi_util:unmake_tagged_csum(CSum0)
end,
Req = machi_pb_translate:to_pb_request(
ReqID,
{low_write_chunk, EpochID, File, Offset, Chunk, CSum_tag, CSum}),
{low_write_chunk, NSVersion, NS, EpochID, File, Offset, Chunk, CSum_tag, CSum}),
do_pb_request_common(Sock, ReqID, Req).
list2(Sock, EpochID) ->
ReqID = <<"id">>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_list_files, EpochID}),
ReqID, {low_skip_wedge, {low_list_files, EpochID}}),
do_pb_request_common(Sock, ReqID, Req).
wedge_status2(Sock) ->
ReqID = <<"id">>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_wedge_status, undefined}),
ReqID, {low_skip_wedge, {low_wedge_status}}),
do_pb_request_common(Sock, ReqID, Req).
echo2(Sock, Message) ->
ReqID = <<"id">>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_echo, undefined, Message}),
ReqID, {low_skip_wedge, {low_echo, Message}}),
do_pb_request_common(Sock, ReqID, Req).
checksum_list2(Sock, EpochID, File) ->
checksum_list2(Sock, File) ->
ReqID = <<"id">>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_checksum_list, EpochID, File}),
ReqID, {low_skip_wedge, {low_checksum_list, File}}),
do_pb_request_common(Sock, ReqID, Req).
delete_migration2(Sock, EpochID, File) ->
ReqID = <<"id">>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_delete_migration, EpochID, File}),
ReqID, {low_skip_wedge, {low_delete_migration, EpochID, File}}),
do_pb_request_common(Sock, ReqID, Req).
trunc_hack2(Sock, EpochID, File) ->
ReqID = <<"id-trunc">>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_trunc_hack, EpochID, File}),
ReqID, {low_skip_wedge, {low_trunc_hack, EpochID, File}}),
do_pb_request_common(Sock, ReqID, Req).
get_latest_epochid2(Sock, ProjType) ->
@ -604,19 +731,32 @@ list_all_projections2(Sock, ProjType) ->
ReqID, {low_proj, {list_all_projections, ProjType}}),
do_pb_request_common(Sock, ReqID, Req).
kick_projection_reaction2(Sock, _Options) ->
ReqID = <<42>>,
Req = machi_pb_translate:to_pb_request(
ReqID, {low_proj, {kick_projection_reaction}}),
do_pb_request_common(Sock, ReqID, Req, false, ?LONG_TIMEOUT).
do_pb_request_common(Sock, ReqID, Req) ->
do_pb_request_common(Sock, ReqID, Req, true, ?LONG_TIMEOUT).
do_pb_request_common(Sock, ReqID, Req, GetReply_p, Timeout) ->
erase(bad_sock),
try
ReqBin = list_to_binary(machi_pb:encode_mpb_ll_request(Req)),
ok = w_send(Sock, ReqBin),
case w_recv(Sock, 0) of
{ok, RespBin} ->
Resp = machi_pb:decode_mpb_ll_response(RespBin),
{ReqID2, Reply} = machi_pb_translate:from_pb_response(Resp),
true = (ReqID == ReqID2 orelse ReqID2 == <<>>),
Reply;
{error, _}=Err ->
throw(Err)
if GetReply_p ->
case w_recv(Sock, 0, Timeout) of
{ok, RespBin} ->
Resp = machi_pb:decode_mpb_ll_response(RespBin),
{ReqID2, Reply} = machi_pb_translate:from_pb_response(Resp),
true = (ReqID == ReqID2 orelse ReqID2 == <<>>),
Reply;
{error, _}=Err ->
throw(Err)
end;
not GetReply_p ->
ok
end
catch
throw:Error ->
@ -627,7 +767,17 @@ do_pb_request_common(Sock, ReqID, Req) ->
{error, {badmatch, Noo, erlang:get_stacktrace()}};
error:{badmatch,_}=BadMatch ->
put(bad_sock, Sock),
{error, {badmatch, BadMatch, erlang:get_stacktrace()}}
{error, {badmatch, BadMatch, erlang:get_stacktrace()}};
error:Whoa ->
put(bad_sock, Sock),
%% TODO: The machi_chain_manager1_converge_demo:t() test can
%% create a large number of these errors when moving from
%% no partitions to many partitions:
%% Whoa undefined: function_clause
%% In theory this is harmless, because the client will retry
%% with a new socket. But, fix it anyway.
io:format(user, "DBG Whoa ~w: ~w at ~w ~P\n", [Sock, Whoa, time(), erlang:get_stacktrace(), 25]), timer:sleep(250),
{error, {whoa, Whoa, erlang:get_stacktrace()}}
end.
filter_sock_error_result({error, closed}) ->
@ -637,11 +787,13 @@ filter_sock_error_result(Error) ->
%%%%%%%%%%%%%%%%%%%%%%%%%%%
w_connect(#p_srvr{proto_mod=?MODULE, address=Host, port=Port, props=Props})->
w_connect(#p_srvr{proto_mod=?MODULE, address=Host, port=Port, props=Props}=_P)->
try
case proplists:get_value(session_proto, Props, tcp) of
tcp ->
Sock = machi_util:connect(Host, Port, ?HARD_TIMEOUT),
put(xxx, goofus),
Sock = machi_util:connect(Host, Port, ?SHORT_TIMEOUT),
put(xxx, Sock),
ok = inet:setopts(Sock, ?PB_PACKET_OPTS),
{w,tcp,Sock};
%% sctp ->
@ -655,7 +807,8 @@ w_connect(#p_srvr{proto_mod=?MODULE, address=Host, port=Port, props=Props})->
{w,ssl,SslSock}
end
catch
_:_ ->
_X:_Y ->
%% io:format(user, "DBG Whoa ~w w_connect port ~w sock ~w err ~w ~w\n", [time(), Port, get(xxx), _X, _Y]),
undefined
end.
@ -663,8 +816,8 @@ w_close({w,tcp,Sock}) ->
catch gen_tcp:close(Sock),
ok.
w_recv({w,tcp,Sock}, Amt) ->
gen_tcp:recv(Sock, Amt, ?HARD_TIMEOUT).
w_recv({w,tcp,Sock}, Amt, Timeout) ->
gen_tcp:recv(Sock, Amt, Timeout).
w_send({w,tcp,Sock}, IoData) ->
gen_tcp:send(Sock, IoData).

View file

@ -0,0 +1,634 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc Ranch protocol callback module to handle PB protocol over
%% transport, including both high and low modes.
%% TODO
%% - Two modes, high and low should be separated at listener level?
-module(machi_flu1_net_server).
-behaviour(gen_server).
-behaviour(ranch_protocol).
-export([start_link/4]).
-export([init/1]).
-export([handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3]).
-include_lib("kernel/include/file.hrl").
-include("machi.hrl").
-include("machi_pb.hrl").
-include("machi_projection.hrl").
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-endif. % TEST
-record(state, {
%% Ranch's transport management stuff
ref :: ranch:ref(),
socket :: socket(),
transport :: module(),
%% Machi FLU configurations, common for low and high
data_dir :: string(),
witness :: boolean(),
pb_mode :: undefined | high | low,
%% - Used in projection related requests in low mode
%% - Used in spawning CR client in high mode
proj_store :: pid(),
%% Low mode only items
%% Current best knowledge, used for wedge_self / bad_epoch check
epoch_id :: undefined | machi_dt:epoch_id(),
%% Used in dispatching append_chunk* reqs to the
%% append serializing process
flu_name :: pv1_server(),
%% Used in server_wedge_status to lookup the table
epoch_tab :: ets:tab(),
%% Clustering: cluster map version number
namespace_version = 0 :: machi_dt:namespace_version(),
%% Clustering: my (and my chain's) assignment to a specific namespace
namespace = <<>> :: machi_dt:namespace(),
%% High mode only
high_clnt :: pid(),
%% anything you want
props = [] :: proplists:proplist()
}).
-type socket() :: any().
-type state() :: #state{}.
-spec start_link(ranch:ref(), socket(), module(), [term()]) -> {ok, pid()}.
start_link(Ref, Socket, Transport, [FluName, Witness, DataDir, EpochTab, ProjStore, Props]) ->
NS = proplists:get_value(namespace, Props, <<>>),
true = is_binary(NS),
proc_lib:start_link(?MODULE, init, [#state{ref=Ref,
socket=Socket,
transport=Transport,
flu_name=FluName,
witness=Witness,
data_dir=DataDir,
epoch_tab=EpochTab,
proj_store=ProjStore,
namespace=NS,
props=Props}]).
-spec init(state()) -> no_return().
init(#state{ref=Ref, socket=Socket, transport=Transport}=State) ->
ok = proc_lib:init_ack({ok, self()}),
ok = ranch:accept_ack(Ref),
{_Wedged_p, CurrentEpochID} = lookup_epoch(State),
ok = Transport:setopts(Socket, [{active, once}|?PB_PACKET_OPTS]),
gen_server:enter_loop(?MODULE, [], State#state{epoch_id=CurrentEpochID}).
handle_call(Request, _From, S) ->
lager:warning("~s:handle_call UNKNOWN message: ~w", [?MODULE, Request]),
Reply = {error, {unknown_message, Request}},
{reply, Reply, S}.
handle_cast(_Msg, S) ->
lager:warning("~s:handle_cast UNKNOWN message: ~w", [?MODULE, _Msg]),
{noreply, S}.
%% TODO: Other transport support needed?? TLS/SSL, SCTP
handle_info({tcp, Socket, Data}=_Info, #state{socket=Socket}=S) ->
lager:debug("~s:handle_info: ~w", [?MODULE, _Info]),
transport_received(Socket, Data, S);
handle_info({tcp_closed, Socket}=_Info, #state{socket=Socket}=S) ->
lager:debug("~s:handle_info: ~w", [?MODULE, _Info]),
transport_closed(Socket, S);
handle_info({tcp_error, Socket, Reason}=_Info, #state{socket=Socket}=S) ->
lager:warning("~s:handle_info (socket=~w) tcp_error: ~w", [?MODULE, Socket, Reason]),
transport_error(Socket, Reason, S);
handle_info(_Info, S) ->
lager:warning("~s:handle_info UNKNOWN message: ~w", [?MODULE, _Info]),
{noreply, S}.
terminate(normal, #state{socket=undefined}=_S) ->
ok;
terminate(Reason, #state{socket=undefined}=_S) ->
lager:warning("~s:terminate (socket=undefined): ~w", [?MODULE, Reason]),
ok;
terminate(normal, #state{socket=Socket}=_S) ->
(catch gen_tcp:close(Socket)),
ok;
terminate(Reason, #state{socket=Socket}=_S) ->
lager:warning("~s:terminate (socket=Socket): ~w", [?MODULE, Reason]),
(catch gen_tcp:close(Socket)),
ok.
code_change(_OldVsn, S, _Extra) ->
{ok, S}.
%% -- private
%%%% Common transport handling
-spec transport_received(socket(), machi_dt:chunk(), state()) ->
{noreply, state()}.
transport_received(Socket, <<"QUIT\n">>, #state{socket=Socket}=S) ->
{stop, normal, S};
transport_received(Socket, Bin, #state{transport=Transport}=S) ->
{RespBin, S2} =
case machi_pb:decode_mpb_ll_request(Bin) of
LL_req when LL_req#mpb_ll_request.do_not_alter == 2 ->
{R, NewS} = do_pb_ll_request(LL_req, S),
{maybe_encode_response(R), set_mode(low, NewS)};
_ ->
HL_req = machi_pb:decode_mpb_request(Bin),
1 = HL_req#mpb_request.do_not_alter,
{R, NewS} = do_pb_hl_request(HL_req, make_high_clnt(S)),
{machi_pb:encode_mpb_response(R), set_mode(high, NewS)}
end,
case RespBin of
async_no_response ->
Transport:setopts(Socket, [{active, once}]),
{noreply, S2};
_ ->
case Transport:send(Socket, RespBin) of
ok ->
Transport:setopts(Socket, [{active, once}]),
{noreply, S2};
{error, Reason} ->
transport_error(Socket, Reason, S2)
end
end.
-spec transport_closed(socket(), state()) -> {stop, term(), state()}.
transport_closed(_Socket, S) ->
{stop, normal, S}.
-spec transport_error(socket(), term(), state()) -> no_return().
transport_error(Socket, Reason, #state{transport=Transport}=_S) ->
Msg = io_lib:format("Socket error ~w", [Reason]),
R = #mpb_ll_response{req_id= <<>>,
generic=#mpb_errorresp{code=1, msg=Msg}},
_Resp = machi_pb:encode_mpb_ll_response(R),
%% TODO for TODO comments: comments below with four %s are copy-n-paste'd,
%% then it should be considered they are still open and should be addressed.
%%%% TODO: Weird that sometimes neither catch nor try/catch
%%%% can prevent OTP's SASL from logging an error here.
%%%% Error in process <0.545.0> with exit value: {badarg,[{erlang,port_command,.......
%%%% TODO: is this what causes the intermittent PULSE deadlock errors?
%%%% _ = (catch gen_tcp:send(Sock, _Resp)), timer:sleep(1000),
(catch Transport:close(Socket)),
_ = lager:warning("Socket error (~w -> ~w): ~w",
[Transport:sockname(Socket), Transport:peername(Socket), Reason]),
%% TODO: better to exit with `Reason' without logging?
exit(normal).
maybe_encode_response(async_no_response=R) ->
R;
maybe_encode_response(R) ->
machi_pb:encode_mpb_ll_response(R).
set_mode(Mode, #state{pb_mode=undefined}=S) ->
S#state{pb_mode=Mode};
set_mode(_, S) ->
S.
%%%% Low PB mode %%%%
do_pb_ll_request(#mpb_ll_request{req_id=ReqID}, #state{pb_mode=high}=S) ->
Result = {high_error, 41, "Low protocol request while in high mode"},
{machi_pb_translate:to_pb_response(ReqID, unused, Result), S};
do_pb_ll_request(PB_request, S) ->
Req = machi_pb_translate:from_pb_request(PB_request),
{ReqID, Cmd, Result, S2} =
case Req of
{RqID, {low_skip_wedge, LowSubCmd}=Cmd0} ->
%% Skip wedge check for these unprivileged commands
{Rs, NewS} = do_pb_ll_request3(LowSubCmd, S),
{RqID, Cmd0, Rs, NewS};
{RqID, {low_proj, _LowSubCmd}=Cmd0} ->
{Rs, NewS} = do_pb_ll_request3(Cmd0, S),
{RqID, Cmd0, Rs, NewS};
{RqID, Cmd0} ->
%% All remaining must have NSVersion, NS, & EpochID at next pos
NSVersion = element(2, Cmd0),
NS = element(3, Cmd0),
EpochID = element(4, Cmd0),
{Rs, NewS} = do_pb_ll_request2(NSVersion, NS, EpochID, Cmd0, S),
{RqID, Cmd0, Rs, NewS}
end,
{machi_pb_translate:to_pb_response(ReqID, Cmd, Result), S2}.
%% do_pb_ll_request2(): Verification of epoch details & namespace details.
do_pb_ll_request2(NSVersion, NS, EpochID, CMD, S) ->
{Wedged_p, CurrentEpochID} = lookup_epoch(S),
if not is_tuple(EpochID) orelse tuple_size(EpochID) /= 2 ->
exit({bad_epoch_id, EpochID, for, CMD});
Wedged_p == true ->
{{error, wedged}, S#state{epoch_id=CurrentEpochID}};
EpochID /= CurrentEpochID ->
{Epoch, _} = EpochID,
{CurrentEpoch, _} = CurrentEpochID,
if Epoch < CurrentEpoch ->
{{error, bad_epoch}, S};
true ->
_ = machi_flu1:wedge_myself(S#state.flu_name, CurrentEpochID),
{{error, wedged}, S#state{epoch_id=CurrentEpochID}}
end;
true ->
#state{namespace_version=MyNSVersion, namespace=MyNS} = S,
if NSVersion /= MyNSVersion ->
{{error, bad_epoch}, S};
NS /= MyNS ->
{{error, bad_arg}, S};
true ->
do_pb_ll_request3(CMD, S)
end
end.
lookup_epoch(#state{epoch_tab=T}) ->
%% TODO: race in shutdown to access ets table after owner dies
ets:lookup_element(T, epoch, 2).
%% Witness status does not matter below.
do_pb_ll_request3({low_echo, Msg}, S) ->
{Msg, S};
do_pb_ll_request3({low_auth, _User, _Pass}, S) ->
{-6, S};
do_pb_ll_request3({low_wedge_status}, S) ->
{do_server_wedge_status(S), S};
do_pb_ll_request3({low_proj, PCMD}, S) ->
{do_server_proj_request(PCMD, S), S};
%% Witness status *matters* below
do_pb_ll_request3({low_append_chunk, NSVersion, NS, EpochID, NSLocator,
Prefix, Chunk, CSum_tag,
CSum, Opts},
#state{witness=false}=S) ->
NSInfo = #ns_info{version=NSVersion, name=NS, locator=NSLocator},
{do_server_append_chunk(NSInfo, EpochID,
Prefix, Chunk, CSum_tag, CSum,
Opts, S), S};
do_pb_ll_request3({low_write_chunk, _NSVersion, _NS, _EpochID, File, Offset, Chunk, CSum_tag,
CSum},
#state{witness=false}=S) ->
{do_server_write_chunk(File, Offset, Chunk, CSum_tag, CSum, S), S};
do_pb_ll_request3({low_read_chunk, _NSVersion, _NS, _EpochID, File, Offset, Size, Opts},
#state{witness=false} = S) ->
{do_server_read_chunk(File, Offset, Size, Opts, S), S};
do_pb_ll_request3({low_trim_chunk, _NSVersion, _NS, _EpochID, File, Offset, Size, TriggerGC},
#state{witness=false}=S) ->
{do_server_trim_chunk(File, Offset, Size, TriggerGC, S), S};
do_pb_ll_request3({low_checksum_list, File},
#state{witness=false}=S) ->
{do_server_checksum_listing(File, S), S};
do_pb_ll_request3({low_list_files, _EpochID},
#state{witness=false}=S) ->
{do_server_list_files(S), S};
do_pb_ll_request3({low_delete_migration, _EpochID, File},
#state{witness=false}=S) ->
{do_server_delete_migration(File, S),
#state{witness=false}=S};
do_pb_ll_request3({low_trunc_hack, _EpochID, File},
#state{witness=false}=S) ->
{do_server_trunc_hack(File, S), S};
do_pb_ll_request3(_, #state{witness=true}=S) ->
{{error, bad_arg}, S}. % TODO: new status code??
do_server_proj_request({get_latest_epochid, ProjType},
#state{proj_store=ProjStore}) ->
machi_projection_store:get_latest_epochid(ProjStore, ProjType);
do_server_proj_request({read_latest_projection, ProjType},
#state{proj_store=ProjStore}) ->
machi_projection_store:read_latest_projection(ProjStore, ProjType);
do_server_proj_request({read_projection, ProjType, Epoch},
#state{proj_store=ProjStore}) ->
machi_projection_store:read(ProjStore, ProjType, Epoch);
do_server_proj_request({write_projection, ProjType, Proj},
#state{flu_name=FluName, proj_store=ProjStore}) ->
if Proj#projection_v1.epoch_number == ?SPAM_PROJ_EPOCH ->
%% io:format(user, "DBG ~s ~w ~P\n", [?MODULE, ?LINE, Proj, 5]),
Chmgr = machi_flu_psup:make_fitness_regname(FluName),
[Map] = Proj#projection_v1.dbg,
catch machi_fitness:send_fitness_update_spam(
Chmgr, Proj#projection_v1.author_server, Map);
true ->
catch machi_projection_store:write(ProjStore, ProjType, Proj)
end;
do_server_proj_request({get_all_projections, ProjType},
#state{proj_store=ProjStore}) ->
machi_projection_store:get_all_projections(ProjStore, ProjType);
do_server_proj_request({list_all_projections, ProjType},
#state{proj_store=ProjStore}) ->
machi_projection_store:list_all_projections(ProjStore, ProjType);
do_server_proj_request({kick_projection_reaction},
#state{flu_name=FluName}) ->
%% Tell my chain manager that it might want to react to
%% this new world.
Chmgr = machi_chain_manager1:make_chmgr_regname(FluName),
spawn(fun() ->
catch machi_chain_manager1:trigger_react_to_env(Chmgr)
end),
async_no_response.
do_server_append_chunk(NSInfo, EpochID,
Prefix, Chunk, CSum_tag, CSum,
Opts, S) ->
case sanitize_prefix(Prefix) of
ok ->
do_server_append_chunk2(NSInfo, EpochID,
Prefix, Chunk, CSum_tag, CSum,
Opts, S);
_ ->
{error, bad_arg}
end.
do_server_append_chunk2(NSInfo, EpochID,
Prefix, Chunk, CSum_tag, Client_CSum,
Opts, #state{flu_name=FluName,
epoch_id=EpochID}=_S) ->
%% TODO: Do anything with PKey?
try
TaggedCSum = check_or_make_tagged_checksum(CSum_tag, Client_CSum,Chunk),
R = {seq_append, self(), NSInfo, EpochID,
Prefix, Chunk, TaggedCSum, Opts},
case gen_server:call(FluName, R, 10*1000) of
{assignment, Offset, File} ->
Size = iolist_size(Chunk),
{ok, {Offset, Size, File}};
witness ->
{error, bad_arg};
wedged ->
{error, wedged};
{error, timeout} ->
{error, partition}
end
catch
throw:{bad_csum, _CS} ->
{error, bad_checksum};
error:badarg ->
lager:error("badarg at ~w:do_server_append_chunk2:~w ~w",
[?MODULE, ?LINE, erlang:get_stacktrace()]),
{error, bad_arg}
end.
do_server_write_chunk(File, Offset, Chunk, CSum_tag, CSum, #state{flu_name=FluName}) ->
case sanitize_file_string(File) of
ok ->
case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, File}) of
{ok, Pid} ->
Meta = [{client_csum_tag, CSum_tag}, {client_csum, CSum}],
machi_file_proxy:write(Pid, Offset, Meta, Chunk);
{error, trimmed} = Error ->
Error
end;
_ ->
{error, bad_arg}
end.
do_server_read_chunk(File, Offset, Size, Opts, #state{flu_name=FluName})->
case sanitize_file_string(File) of
ok ->
case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, File}) of
{ok, Pid} ->
case machi_file_proxy:read(Pid, Offset, Size, Opts) of
%% XXX FIXME
%% For now we are omiting the checksum data because it blows up
%% protobufs.
{ok, ChunksAndTrimmed} -> {ok, ChunksAndTrimmed};
Other -> Other
end;
{error, trimmed} = Error ->
Error
end;
_ ->
{error, bad_arg}
end.
do_server_trim_chunk(File, Offset, Size, TriggerGC, #state{flu_name=FluName}) ->
lager:debug("Hi there! I'm trimming this: ~s, (~p, ~p), ~p~n",
[File, Offset, Size, TriggerGC]),
case sanitize_file_string(File) of
ok ->
case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, File}) of
{ok, Pid} ->
machi_file_proxy:trim(Pid, Offset, Size, TriggerGC);
{error, trimmed} = Trimmed ->
%% Should be returned back to (maybe) trigger repair
Trimmed
end;
_ ->
{error, bad_arg}
end.
do_server_checksum_listing(File, #state{flu_name=FluName, data_dir=DataDir}=_S) ->
case sanitize_file_string(File) of
ok ->
case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, File}) of
{ok, Pid} ->
{ok, List} = machi_file_proxy:checksum_list(Pid),
Bin = erlang:term_to_binary(List),
if byte_size(Bin) > (?PB_MAX_MSG_SIZE - 1024) ->
%% TODO: Fix this limitation by streaming the
%% binary in multiple smaller PB messages.
%% Also, don't read the file all at once. ^_^
error_logger:error_msg("~s:~w oversize ~s\n",
[?MODULE, ?LINE, DataDir]),
{error, bad_arg};
true ->
{ok, Bin}
end;
{error, trimmed} ->
{error, trimmed}
end;
_ ->
{error, bad_arg}
end.
do_server_list_files(#state{data_dir=DataDir}=_S) ->
{_, WildPath} = machi_util:make_data_filename(DataDir, ""),
Files = filelib:wildcard("*", WildPath),
{ok, [begin
{ok, FI} = file:read_file_info(WildPath ++ "/" ++ File),
Size = FI#file_info.size,
{Size, File}
end || File <- Files]}.
do_server_wedge_status(#state{namespace_version=NSVersion, namespace=NS}=S) ->
{Wedged_p, CurrentEpochID0} = lookup_epoch(S),
CurrentEpochID = if CurrentEpochID0 == undefined ->
?DUMMY_PV1_EPOCH;
true ->
CurrentEpochID0
end,
{Wedged_p, CurrentEpochID, NSVersion, NS}.
do_server_delete_migration(File, #state{data_dir=DataDir}=_S) ->
case sanitize_file_string(File) of
ok ->
{_, Path} = machi_util:make_data_filename(DataDir, File),
case file:delete(Path) of
ok ->
ok;
{error, enoent} ->
{error, no_such_file};
_ ->
{error, bad_arg}
end;
_ ->
{error, bad_arg}
end.
do_server_trunc_hack(File, #state{data_dir=DataDir}=_S) ->
case sanitize_file_string(File) of
ok ->
{_, Path} = machi_util:make_data_filename(DataDir, File),
case file:open(Path, [read, write, binary, raw]) of
{ok, FH} ->
try
{ok, ?MINIMUM_OFFSET} = file:position(FH,
?MINIMUM_OFFSET),
ok = file:truncate(FH),
ok
after
file:close(FH)
end;
{error, enoent} ->
{error, no_such_file};
_ ->
{error, bad_arg}
end;
_ ->
{error, bad_arg}
end.
sanitize_file_string(Str) ->
case has_no_prohibited_chars(Str) andalso machi_util:is_valid_filename(Str) of
true -> ok;
false -> error
end.
has_no_prohibited_chars(Str) ->
case re:run(Str, "/") of
nomatch ->
true;
_ ->
true
end.
sanitize_prefix(Prefix) ->
%% We are using '^' as our component delimiter
case re:run(Prefix, "/|\\^") of
nomatch ->
ok;
_ ->
error
end.
check_or_make_tagged_checksum(?CSUM_TAG_NONE, _Client_CSum, Chunk) ->
%% TODO: If the client was foolish enough to use
%% this type of non-checksum, then the client gets
%% what it deserves wrt data integrity, alas. In
%% the client-side Chain Replication method, each
%% server will calculated this independently, which
%% isn't exactly what ought to happen for best data
%% integrity checking. In server-side CR, the csum
%% should be calculated by the head and passed down
%% the chain together with the value.
CS = machi_util:checksum_chunk(Chunk),
machi_util:make_tagged_csum(server_sha, CS);
check_or_make_tagged_checksum(?CSUM_TAG_CLIENT_SHA, Client_CSum, Chunk) ->
CS = machi_util:checksum_chunk(Chunk),
if CS == Client_CSum ->
machi_util:make_tagged_csum(server_sha,
Client_CSum);
true ->
throw({bad_csum, CS})
end.
%%%% High PB mode %%%%
do_pb_hl_request(#mpb_request{req_id=ReqID}, #state{pb_mode=low}=S) ->
Result = {low_error, 41, "High protocol request while in low mode"},
{machi_pb_translate:to_pb_response(ReqID, unused, Result), S};
do_pb_hl_request(PB_request, S) ->
{ReqID, Cmd} = machi_pb_translate:from_pb_request(PB_request),
{Result, S2} = do_pb_hl_request2(Cmd, S),
{machi_pb_translate:to_pb_response(ReqID, Cmd, Result), S2}.
do_pb_hl_request2({high_echo, Msg}, S) ->
{Msg, S};
do_pb_hl_request2({high_auth, _User, _Pass}, S) ->
{-77, S};
do_pb_hl_request2({high_append_chunk=Op, NS, Prefix, Chunk, TaggedCSum, Opts},
#state{high_clnt=Clnt}=S) ->
NSInfo = #ns_info{name=NS}, % TODO populate other fields
todo_perhaps_remind_ns_locator_not_chosen(Op),
Res = machi_cr_client:append_chunk(Clnt, NSInfo,
Prefix, Chunk, TaggedCSum, Opts),
{Res, S};
do_pb_hl_request2({high_write_chunk=Op, File, Offset, Chunk, CSum},
#state{high_clnt=Clnt}=S) ->
NSInfo = undefined,
todo_perhaps_remind_ns_locator_not_chosen(Op),
Res = machi_cr_client:write_chunk(Clnt, NSInfo, File, Offset, Chunk, CSum),
{Res, S};
do_pb_hl_request2({high_read_chunk=Op, File, Offset, Size, Opts},
#state{high_clnt=Clnt}=S) ->
NSInfo = undefined,
todo_perhaps_remind_ns_locator_not_chosen(Op),
Res = machi_cr_client:read_chunk(Clnt, NSInfo, File, Offset, Size, Opts),
{Res, S};
do_pb_hl_request2({high_trim_chunk=Op, File, Offset, Size},
#state{high_clnt=Clnt}=S) ->
NSInfo = undefined,
todo_perhaps_remind_ns_locator_not_chosen(Op),
Res = machi_cr_client:trim_chunk(Clnt, NSInfo, File, Offset, Size),
{Res, S};
do_pb_hl_request2({high_checksum_list, File}, #state{high_clnt=Clnt}=S) ->
Res = machi_cr_client:checksum_list(Clnt, File),
{Res, S};
do_pb_hl_request2({high_list_files}, #state{high_clnt=Clnt}=S) ->
Res = machi_cr_client:list_files(Clnt),
{Res, S}.
make_high_clnt(#state{high_clnt=undefined}=S) ->
{ok, Proj} = machi_projection_store:read_latest_projection(
S#state.proj_store, private),
Ps = [P_srvr || {_, P_srvr} <- orddict:to_list(
Proj#projection_v1.members_dict)],
{ok, Clnt} = machi_cr_client:start_link(Ps),
S#state{high_clnt=Clnt};
make_high_clnt(S) ->
S.
todo_perhaps_remind_ns_locator_not_chosen(Op) ->
Key = {?MODULE, Op},
case get(Key) of
undefined ->
io:format(user, "TODO op ~w is using default locator value\n",
[Op]),
put(Key, true);
_ ->
ok
end.

118
src/machi_flu1_subsup.erl Normal file
View file

@ -0,0 +1,118 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc A supervisor to hold dynamic processes inside single
%% FLU service, ranch listener and append server.
%% TODO: This supervisor is maybe useless. First introduced for
%% workaround to start listener dynamically in flu1 initialization
%% phase. Because `machi_flu_psup' is being blocked in flu1
%% initialization time, adding a child to the supervisor leads to
%% deadlock. If initialization can be done only by static arguments,
%% then this supervisor should be removed and added as a direct child
%% of `machi_flu_psup'.
-module(machi_flu1_subsup).
-behaviour(supervisor).
%% public API
-export([start_link/1,
start_append_server/4,
stop_append_server/1,
start_listener/7,
stop_listener/1,
subsup_name/1,
listener_name/1]).
%% supervisor callback
-export([init/1]).
-include("machi_projection.hrl").
-define(SHUTDOWN, 5000).
-define(BACKLOG, 8192).
-spec start_link(pv1_server()) -> {ok, pid()}.
start_link(FluName) ->
supervisor:start_link({local, subsup_name(FluName)}, ?MODULE, []).
-spec start_append_server(pv1_server(), boolean(), boolean(),
undefined | machi_dt:epoch_id()) ->
{ok, pid()}.
start_append_server(FluName, Witness_p, Wedged_p, EpochId) ->
supervisor:start_child(subsup_name(FluName),
append_server_spec(FluName, Witness_p, Wedged_p, EpochId)).
-spec stop_append_server(pv1_server()) -> ok.
stop_append_server(FluName) ->
SubSup = listener_name(FluName),
ok = supervisor:terminate_child(SubSup, FluName),
ok = supervisor:delete_child(SubSup, FluName).
-spec start_listener(pv1_server(), inet:port_number(), boolean(),
string(), ets:tab(), atom() | pid(),
proplists:proplist()) -> {ok, pid()}.
start_listener(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore,
Props) ->
supervisor:start_child(subsup_name(FluName),
listener_spec(FluName, TcpPort, Witness, DataDir,
EpochTab, ProjStore, Props)).
-spec stop_listener(pv1_server()) -> ok.
stop_listener(FluName) ->
SupName = subsup_name(FluName),
ListenerName = listener_name(FluName),
ok = supervisor:terminate_child(SupName, ListenerName),
ok = supervisor:delete_child(SupName, ListenerName).
-spec subsup_name(pv1_server()) -> atom().
subsup_name(FluName) when is_atom(FluName) ->
list_to_atom(atom_to_list(FluName) ++ "_flu1_subsup").
-spec listener_name(pv1_server()) -> atom().
listener_name(FluName) ->
list_to_atom(atom_to_list(FluName) ++ "_listener").
%% Supervisor callback
init([]) ->
SupFlags = {one_for_all, 1000, 10},
{ok, {SupFlags, []}}.
%% private
-spec listener_spec(pv1_server(), inet:port_number(), boolean(),
string(), ets:tab(), atom() | pid(),
proplists:proplist()) -> supervisor:child_spec().
listener_spec(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore, Props) ->
ListenerName = listener_name(FluName),
NbAcceptors = 10,
TcpOpts = [{port, TcpPort}, {backlog, ?BACKLOG}],
NetServerOpts = [FluName, Witness, DataDir, EpochTab, ProjStore, Props],
ranch:child_spec(ListenerName, NbAcceptors,
ranch_tcp, TcpOpts,
machi_flu1_net_server, NetServerOpts).
-spec append_server_spec(pv1_server(), boolean(), boolean(),
undefined | machi_dt:epoch_id()) -> supervisor:child_spec().
append_server_spec(FluName, Witness_p, Wedged_p, EpochId) ->
{FluName, {machi_flu1_append_server, start_link,
[FluName, Witness_p, Wedged_p, EpochId]},
permanent, ?SHUTDOWN, worker, [machi_flu1_append_server]}.

View file

@ -0,0 +1,232 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%%
%% @doc This process is responsible for managing filenames assigned to
%% prefixes. It's started out of `machi_flu_psup'.
%%
%% Supported operations include finding the "current" filename assigned to
%% a prefix. Incrementing the sequence number and returning a new file name
%% and listing all data files assigned to a given prefix.
%%
%% All prefixes should have the form of `{prefix, P}'. Single filename
%% return values have the form of `{file, F}'.
%%
%% <h2>Finding the current file associated with a sequence</h2>
%% First it looks up the sequence number from the prefix name. If
%% no sequence file is found, it uses 0 as the sequence number and searches
%% for a matching file with the prefix and 0 as the sequence number.
%% If no file is found, the it generates a new filename by incorporating
%% the given prefix, a randomly generated (v4) UUID and 0 as the
%% sequence number.
%%
%% If the sequence number is > 0, then the process scans the filesystem
%% looking for a filename which matches the prefix and given sequence number and
%% returns that.
-module(machi_flu_filename_mgr).
-behavior(gen_server).
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-compile(export_all).
-endif.
-export([
child_spec/2,
start_link/2,
find_or_make_filename_from_prefix/4,
increment_prefix_sequence/3,
list_files_by_prefix/2
]).
%% gen_server callbacks
-export([
init/1,
handle_cast/2,
handle_call/3,
handle_info/2,
terminate/2,
code_change/3
]).
-define(TIMEOUT, 10 * 1000).
-include("machi.hrl"). %% included for #ns_info record
-include("machi_projection.hrl"). %% included for pv1_epoch type
-record(state, {fluname :: atom(),
tid :: ets:tid(),
datadir :: string(),
epoch :: pv1_epoch()
}).
%% public API
child_spec(FluName, DataDir) ->
Name = make_filename_mgr_name(FluName),
{Name,
{?MODULE, start_link, [FluName, DataDir]},
permanent, 5000, worker, [?MODULE]}.
start_link(FluName, DataDir) when is_atom(FluName) andalso is_list(DataDir) ->
N = make_filename_mgr_name(FluName),
gen_server:start_link({local, N}, ?MODULE, [FluName, DataDir], []).
-spec find_or_make_filename_from_prefix( FluName :: atom(),
EpochId :: pv1_epoch(),
Prefix :: {prefix, string()},
machi_dt:ns_info()) ->
{file, Filename :: string()} | {error, Reason :: term() } | timeout.
% @doc Find the latest available or make a filename from a prefix. A prefix
% should be in the form of a tagged tuple `{prefix, P}'. Returns a tagged
% tuple in the form of `{file, F}' or an `{error, Reason}'
find_or_make_filename_from_prefix(FluName, EpochId,
{prefix, Prefix},
#ns_info{}=NSInfo)
when is_atom(FluName) ->
N = make_filename_mgr_name(FluName),
gen_server:call(N, {find_filename, FluName, EpochId, NSInfo, Prefix}, ?TIMEOUT);
find_or_make_filename_from_prefix(_FluName, _EpochId, Other, Other2) ->
lager:error("~p is not a valid prefix/locator ~p", [Other, Other2]),
error(badarg).
-spec increment_prefix_sequence( FluName :: atom(), NSInfo :: machi_dt:ns_info(), Prefix :: {prefix, string()} ) ->
ok | {error, Reason :: term() } | timeout.
% @doc Increment the sequence counter for a given prefix. Prefix should
% be in the form of `{prefix, P}'.
increment_prefix_sequence(FluName, #ns_info{}=NSInfo, {prefix, Prefix}) when is_atom(FluName) ->
gen_server:call(make_filename_mgr_name(FluName), {increment_sequence, NSInfo, Prefix}, ?TIMEOUT);
increment_prefix_sequence(_FluName, _NSInfo, Other) ->
lager:error("~p is not a valid prefix.", [Other]),
error(badarg).
-spec list_files_by_prefix( FluName :: atom(), Prefix :: {prefix, string()} ) ->
[ file:name() ] | timeout | {error, Reason :: term() }.
% @doc Given a prefix in the form of `{prefix, P}' return
% all the data files associated with that prefix. Returns
% a list.
list_files_by_prefix(FluName, {prefix, Prefix}) when is_atom(FluName) ->
gen_server:call(make_filename_mgr_name(FluName), {list_files, Prefix}, ?TIMEOUT);
list_files_by_prefix(_FluName, Other) ->
lager:error("~p is not a valid prefix.", [Other]),
error(badarg).
%% gen_server API
init([FluName, DataDir]) ->
Tid = ets:new(make_filename_mgr_name(FluName), [named_table, {read_concurrency, true}]),
{ok, #state{fluname = FluName,
epoch = ?DUMMY_PV1_EPOCH,
datadir = DataDir,
tid = Tid}}.
handle_cast(Req, State) ->
lager:warning("Got unknown cast ~p", [Req]),
{noreply, State}.
%% Important assumption: by the time we reach here the EpochId is kosher.
%% the FLU has already validated that the caller's epoch id and the FLU's epoch id
%% are the same. So we *assume* that remains the case here - that is to say, we
%% are not wedged.
handle_call({find_filename, FluName, EpochId, NSInfo, Prefix}, _From,
S = #state{ datadir = DataDir, epoch = EpochId, tid = Tid }) ->
%% Our state and the caller's epoch ids are the same. Business as usual.
File = handle_find_file(FluName, Tid, NSInfo, Prefix, DataDir),
{reply, {file, File}, S};
handle_call({find_filename, _FluName, EpochId, NSInfo, Prefix}, _From, S = #state{ datadir = DataDir, tid = Tid }) ->
%% If the epoch id in our state and the caller's epoch id were the same, it would've
%% matched the above clause. Since we're here, we know that they are different.
%% If epoch ids between our state and the caller's are different, we must increment the
%% sequence number, generate a filename and then cache it.
File = increment_and_cache_filename(Tid, DataDir, NSInfo, Prefix),
{reply, {file, File}, S#state{epoch = EpochId}};
handle_call({increment_sequence, #ns_info{name=NS, locator=NSLocator}, Prefix}, _From, S = #state{ datadir = DataDir, tid=Tid }) ->
NSInfo = #ns_info{name=NS, locator=NSLocator},
_File = increment_and_cache_filename(Tid, DataDir, NSInfo, Prefix),
{reply, ok, S};
handle_call({list_files, Prefix}, From, S = #state{ datadir = DataDir }) ->
spawn(fun() ->
L = list_files(DataDir, Prefix),
gen_server:reply(From, L)
end),
{noreply, S};
handle_call(Req, From, State) ->
lager:warning("Got unknown call ~p from ~p", [Req, From]),
{reply, hoge, State}.
handle_info(Info, State) ->
lager:warning("Got unknown info ~p", [Info]),
{noreply, State}.
terminate(Reason, _State) ->
lager:info("Shutting down because ~p", [Reason]),
ok.
code_change(_OldVsn, State, _Extra) ->
{ok, State}.
%% private
%% Quoted from https://github.com/afiskon/erlang-uuid-v4/blob/master/src/uuid.erl
%% MIT License
generate_uuid_v4_str() ->
<<A:32, B:16, C:16, D:16, E:48>> = crypto:strong_rand_bytes(16),
io_lib:format("~8.16.0b-~4.16.0b-4~3.16.0b-~4.16.0b-~12.16.0b",
[A, B, C band 16#0fff, D band 16#3fff bor 16#8000, E]).
list_files(DataDir, Prefix) ->
{F_bin, Path} = machi_util:make_data_filename(DataDir, "*^" ++ Prefix ++ "^*"),
filelib:wildcard(binary_to_list(F_bin), filename:dirname(Path)).
make_filename_mgr_name(FluName) when is_atom(FluName) ->
list_to_atom(atom_to_list(FluName) ++ "_filename_mgr").
handle_find_file(_FluName, Tid, #ns_info{name=NS, locator=NSLocator}, Prefix, DataDir) ->
case ets:lookup(Tid, {NS, NSLocator, Prefix}) of
[] ->
N = machi_util:read_max_filenum(DataDir, NS, NSLocator, Prefix),
F = generate_filename(DataDir, NS, NSLocator, Prefix, N),
true = ets:insert(Tid, {{NS, NSLocator, Prefix}, F}),
F;
[{_Key, File}] ->
File
end.
generate_filename(DataDir, NS, NSLocator, Prefix, N) ->
{F, _Q} = machi_util:make_data_filename(
DataDir,
NS, NSLocator, Prefix,
generate_uuid_v4_str(),
N),
binary_to_list(F).
increment_and_cache_filename(Tid, DataDir, #ns_info{name=NS,locator=NSLocator}, Prefix) ->
ok = machi_util:increment_max_filenum(DataDir, NS, NSLocator, Prefix),
N = machi_util:read_max_filenum(DataDir, NS, NSLocator, Prefix),
F = generate_filename(DataDir, NS, NSLocator, Prefix, N),
true = ets:insert(Tid, {{NS, NSLocator, Prefix}, F}),
F.
-ifdef(TEST).
-endif.

View file

@ -0,0 +1,297 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc This is a metadata service for the machi FLU which currently
%% tracks the mappings between filenames and file proxies.
%%
%% The service takes a given hash space and spreads it out over a
%% pool of N processes which are responsible for 1/Nth the hash
%% space. When a user requests an operation on a particular file
%% the filename is hashed into the hash space and the request
%% forwarded to a particular manager responsible for that slice
%% of the hash space.
%%
%% The current hash implementation is `erlang:phash2/1' which has
%% a range between 0..2^27-1 or 134,217,727.
-module(machi_flu_metadata_mgr).
-behaviour(gen_server).
-include("machi.hrl").
-define(MAX_MGRS, 10). %% number of managers to start by default.
-define(HASH(X), erlang:phash2(X)). %% hash algorithm to use
-define(TIMEOUT, 10 * 1000). %% 10 second timeout
-define(KNOWN_FILES_LIST_PREFIX, "known_files_").
-record(state, {fluname :: atom(),
datadir :: string(),
tid :: ets:tid(),
cnt :: non_neg_integer(),
trimmed_files :: machi_plist:plist()
}).
%% This record goes in the ets table where filename is the key
-record(md, {filename :: string(),
proxy_pid :: undefined|pid(),
mref :: undefined|reference() %% monitor ref for file proxy
}).
%% public api
-export([
child_spec/4,
start_link/4,
lookup_manager_pid/2,
lookup_proxy_pid/2,
start_proxy_pid/2,
stop_proxy_pid/2,
stop_proxy_pid_rollover/2,
build_metadata_mgr_name/2,
trim_file/2
]).
%% gen_server callbacks
-export([
init/1,
handle_cast/2,
handle_call/3,
handle_info/2,
terminate/2,
code_change/3
]).
%% Public API
build_metadata_mgr_name(FluName, N) when is_atom(FluName) andalso is_integer(N) ->
list_to_atom(atom_to_list(FluName) ++ "_metadata_mgr_" ++ integer_to_list(N)).
child_spec(FluName, C, DataDir, N) ->
Name = build_metadata_mgr_name(FluName, C),
{Name,
{?MODULE, start_link, [FluName, Name, DataDir, N]},
permanent, 5000, worker, [?MODULE]}.
start_link(FluName, Name, DataDir, Num) when is_atom(Name) andalso is_list(DataDir) ->
gen_server:start_link({local, Name}, ?MODULE, [FluName, Name, DataDir, Num], []).
lookup_manager_pid(FluName, {file, Filename}) ->
whereis(get_manager_atom(FluName, Filename)).
lookup_proxy_pid(FluName, {file, Filename}) ->
gen_server:call(get_manager_atom(FluName, Filename), {proxy_pid, Filename}, ?TIMEOUT).
start_proxy_pid(FluName, {file, Filename}) ->
gen_server:call(get_manager_atom(FluName, Filename), {start_proxy_pid, Filename}, ?TIMEOUT).
stop_proxy_pid(FluName, {file, Filename}) ->
gen_server:call(get_manager_atom(FluName, Filename), {stop_proxy_pid, false, Filename}, ?TIMEOUT).
stop_proxy_pid_rollover(FluName, {file, Filename}) ->
gen_server:call(get_manager_atom(FluName, Filename), {stop_proxy_pid, true, Filename}, ?TIMEOUT).
trim_file(FluName, {file, Filename}) ->
gen_server:call(get_manager_atom(FluName, Filename), {trim_file, Filename}, ?TIMEOUT).
%% gen_server callbacks
init([FluName, Name, DataDir, Num]) ->
%% important: we'll need another persistent storage to
%% remember deleted (trimmed) file, to prevent resurrection after
%% flu restart and append.
FileListFileName =
filename:join([DataDir, ?KNOWN_FILES_LIST_PREFIX ++ atom_to_list(FluName)]),
{ok, PList} = machi_plist:open(FileListFileName, []),
%% TODO make sure all files non-existent, if any remaining files
%% here, just delete it. They're in the list *because* they're all
%% trimmed.
Tid = ets:new(Name, [{keypos, 2}, {read_concurrency, true}, {write_concurrency, true}]),
{ok, #state{fluname = FluName, datadir = DataDir, tid = Tid, cnt = Num,
trimmed_files=PList}}.
handle_cast(Req, State) ->
lager:warning("Got unknown cast ~p", [Req]),
{noreply, State}.
handle_call({proxy_pid, Filename}, _From, State = #state{ tid = Tid }) ->
Reply = case lookup_md(Tid, Filename) of
not_found -> undefined;
R -> R#md.proxy_pid
end,
{reply, Reply, State};
handle_call({start_proxy_pid, Filename}, _From,
State = #state{ fluname = N, tid = Tid, datadir = D,
trimmed_files=TrimmedFiles}) ->
case machi_plist:find(TrimmedFiles, Filename) of
false ->
NewR = case lookup_md(Tid, Filename) of
not_found ->
start_file_proxy(N, D, Filename);
#md{ proxy_pid = undefined } = R0 ->
start_file_proxy(N, D, R0);
#md{ proxy_pid = _Pid } = R1 ->
R1
end,
update_ets(Tid, NewR),
{reply, {ok, NewR#md.proxy_pid}, State};
true ->
{reply, {error, trimmed}, State}
end;
handle_call({stop_proxy_pid, Rollover_p, Filename}, _From, State = #state{ tid = Tid }) ->
case lookup_md(Tid, Filename) of
not_found ->
ok;
#md{ proxy_pid = undefined } ->
ok;
#md{ proxy_pid = Pid, mref = M } = R ->
demonitor(M, [flush]),
if Rollover_p ->
do_rollover(Filename, State);
true ->
machi_file_proxy:stop(Pid),
update_ets(Tid, R#md{ proxy_pid = undefined,
mref = undefined })
end
end,
{reply, ok, State};
handle_call({trim_file, Filename}, _,
S = #state{trimmed_files = TrimmedFiles }) ->
case machi_plist:add(TrimmedFiles, Filename) of
{ok, TrimmedFiles2} ->
{reply, ok, S#state{trimmed_files=TrimmedFiles2}};
Error ->
{reply, Error, S}
end;
handle_call(Req, From, State) ->
lager:warning("Got unknown call ~p from ~p", [Req, From]),
{reply, hoge, State}.
handle_info({'DOWN', Mref, process, Pid, normal}, State = #state{ tid = Tid }) ->
lager:debug("file proxy ~p shutdown normally", [Pid]),
clear_ets(Tid, Mref),
{noreply, State};
handle_info({'DOWN', Mref, process, Pid, wedged}, State = #state{ tid = Tid }) ->
lager:error("file proxy ~p shutdown because it's wedged", [Pid]),
clear_ets(Tid, Mref),
{noreply, State};
handle_info({'DOWN', _Mref, process, Pid, trimmed}, State = #state{ tid = _Tid }) ->
lager:debug("file proxy ~p shutdown because the file was trimmed", [Pid]),
{noreply, State};
handle_info({'DOWN', Mref, process, Pid, Error}, State = #state{ tid = Tid }) ->
lager:error("file proxy ~p shutdown because ~p", [Pid, Error]),
clear_ets(Tid, Mref),
{noreply, State};
handle_info(Info, State) ->
lager:warning("Got unknown info ~p", [Info]),
{noreply, State}.
terminate(Reason, _State = #state{trimmed_files=TrimmedFiles}) ->
lager:info("Shutting down because ~p", [Reason]),
machi_plist:close(TrimmedFiles),
ok.
code_change(_OldVsn, State, _Extra) ->
{ok, State}.
%% Private functions
compute_hash(Data) ->
?HASH(Data).
compute_worker(Hash) ->
MgrCount = get_env(metadata_manager_count, ?MAX_MGRS),
(Hash rem MgrCount) + 1.
%% TODO MARK: Hrm, the intermittent failures of both of the tests
%% in machi_cr_client_test.erl were due to this func returning 0.
%% But machi_flu_metadata_mgr_sup:init() doesn't make a child with N=0.
%%
%% The remaining puzzle for me, which I'm now punting to you, sorry,
%% is why those two tests would sometimes pass, which implies to me that
%% sometimes the code path used by those tests never chooses worker 0
%% and occasionally it does choose worker 0.
get_manager_atom(FluName, Data) ->
build_metadata_mgr_name(FluName, compute_worker(compute_hash(Data))).
lookup_md(Tid, Data) ->
case ets:lookup(Tid, Data) of
[] -> not_found;
[R] -> R
end.
start_file_proxy(FluName, D, R = #md{filename = F} ) ->
{ok, Pid} = machi_file_proxy_sup:start_proxy(FluName, D, F),
Mref = monitor(process, Pid),
R#md{ proxy_pid = Pid, mref = Mref };
start_file_proxy(FluName, D, Filename) ->
start_file_proxy(FluName, D, #md{ filename = Filename }).
update_ets(Tid, R) ->
ets:insert(Tid, R).
clear_ets(Tid, Mref) ->
R = get_md_record_by_mref(Tid, Mref),
update_ets(Tid, R#md{ proxy_pid = undefined, mref = undefined }).
purge_ets(Tid, R) ->
true = ets:delete_object(Tid, R).
get_md_record_by_mref(Tid, Mref) ->
[R] = ets:match_object(Tid, {md, '_', '_', Mref}),
R.
get_md_record_by_filename(Tid, Filename) ->
[R] = ets:lookup(Tid, Filename),
R.
get_env(Setting, Default) ->
case application:get_env(machi, Setting) of
undefined -> Default;
{ok, V} -> V
end.
do_rollover(Filename, _State = #state{ fluname = FluName,
tid = Tid }) ->
R = get_md_record_by_filename(Tid, Filename),
lager:info("file ~p proxy ~p shutdown because of file rollover",
[Filename, R#md.proxy_pid]),
{Prefix, NS, NSLocator, _, _} =
machi_util:parse_filename(R#md.filename),
%% We only increment the counter here. The filename will be generated on the
%% next append request to that prefix and since the filename will have a new
%% sequence number it probably will be associated with a different metadata
%% manager. That's why we don't want to generate a new file name immediately
%% and use it to start a new file proxy.
NSInfo = #ns_info{name=NS, locator=NSLocator},
lager:warning("INCR: ~p ~p\n", [FluName, Prefix]),
ok = machi_flu_filename_mgr:increment_prefix_sequence(FluName, NSInfo, {prefix, Prefix}),
%% purge our ets table of this entry completely since it is likely the
%% new filename (whenever it comes) will be in a different manager than
%% us.
purge_ets(Tid, R),
ok.

View file

@ -0,0 +1,59 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc This is the supervisor for the collection of metadata
%% managers. It's started out of `machi_flu_psup'. It reads an
%% application environment variable named `metadata_manager_count'
%% with a default of 10 if it is not set.
-module(machi_flu_metadata_mgr_sup).
-behaviour(supervisor).
%% public API
-export([
child_spec/3,
start_link/3
]).
%% supervisor callback
-export([init/1]).
child_spec(FluName, DataDir, N) ->
{make_sup_name(FluName),
{?MODULE, start_link, [FluName, DataDir, N]},
permanent, 5000, supervisor, [?MODULE]}.
start_link(FluName, DataDir, N) ->
supervisor:start_link({local, make_sup_name(FluName)}, ?MODULE, [FluName, DataDir, N]).
init([FluName, DataDir, N]) ->
Restart = one_for_one,
MaxRestarts = 1000,
SecondsBetween = 3600,
SupFlags = {Restart, MaxRestarts, SecondsBetween},
Children =
[ machi_flu_metadata_mgr:child_spec(FluName, C, DataDir, N) || C <- lists:seq(1,N) ],
{ok, {SupFlags, Children}}.
make_sup_name(FluName) when is_atom(FluName) ->
list_to_atom(atom_to_list(FluName) ++ "_metadata_mgr_sup").

View file

@ -61,19 +61,43 @@
-behaviour(supervisor).
-include("machi_projection.hrl").
-include("machi_verbose.hrl").
-ifdef(PULSE).
-compile({parse_transform, pulse_instrument}).
-include_lib("pulse_otp/include/pulse_otp.hrl").
-define(SHUTDOWN, infinity).
-else.
-define(SHUTDOWN, 5000).
-endif.
%% External API
-export([make_package_spec/4, start_flu_package/4, stop_flu_package/1]).
-export([make_package_spec/1, make_package_spec/4,
start_flu_package/1, start_flu_package/4, stop_flu_package/1]).
%% Internal API
-export([start_link/4,
make_p_regname/1, make_mgr_supname/1, make_proj_supname/1]).
make_flu_regname/1, make_p_regname/1, make_mgr_supname/1,
make_proj_supname/1, make_fitness_regname/1]).
%% Supervisor callbacks
-export([init/1]).
make_package_spec(#p_srvr{name=FluName, port=TcpPort, props=Props}) when is_list(Props) ->
make_package_spec({FluName, TcpPort, Props});
make_package_spec({FluName, TcpPort, Props}) when is_list(Props) ->
FluDataDir = get_env(flu_data_dir, undefined_is_invalid),
MyDataDir = filename:join(FluDataDir, atom_to_list(FluName)),
make_package_spec(FluName, TcpPort, MyDataDir, Props).
make_package_spec(FluName, TcpPort, DataDir, Props) ->
{FluName, {machi_flu_psup, start_link,
[FluName, TcpPort, DataDir, Props]},
permanent, 5000, supervisor, []}.
permanent, ?SHUTDOWN, supervisor, []}.
start_flu_package(#p_srvr{name=FluName, port=TcpPort, props=Props}) ->
DataDir = get_data_dir(FluName, Props),
start_flu_package(FluName, TcpPort, DataDir, Props).
start_flu_package(FluName, TcpPort, DataDir, Props) ->
Spec = make_package_spec(FluName, TcpPort, DataDir, Props),
@ -103,16 +127,40 @@ init([FluName, TcpPort, DataDir, Props0]) ->
ProjSpec = {ProjRegName,
{machi_projection_store, start_link,
[ProjRegName, DataDir, FluName]},
permanent, 5000, worker, []},
permanent, ?SHUTDOWN, worker, []},
FitnessRegName = make_fitness_regname(FluName),
FitnessSpec = {FitnessRegName,
{machi_fitness, start_link,
[ [{FluName}|Props] ]},
permanent, ?SHUTDOWN, worker, []},
MgrSpec = {make_mgr_supname(FluName),
{machi_chain_manager1, start_link,
[FluName, [], Props]},
permanent, 5000, worker, []},
permanent, ?SHUTDOWN, worker, []},
FNameMgrSpec = machi_flu_filename_mgr:child_spec(FluName, DataDir),
MetaMgrCnt = get_env(metadata_manager_count, 10),
MetaSupSpec = machi_flu_metadata_mgr_sup:child_spec(FluName, DataDir, MetaMgrCnt),
FProxySupSpec = machi_file_proxy_sup:child_spec(FluName),
Flu1SubSupSpec = {machi_flu1_subsup:subsup_name(FluName),
{machi_flu1_subsup, start_link, [FluName]},
permanent, ?SHUTDOWN, supervisor, []},
FluSpec = {FluName,
{machi_flu1, start_link,
[ [{FluName, TcpPort, DataDir}|Props] ]},
permanent, 5000, worker, []},
{ok, {SupFlags, [ProjSpec, MgrSpec, FluSpec]}}.
permanent, ?SHUTDOWN, worker, []},
{ok, {SupFlags, [
ProjSpec, FitnessSpec, MgrSpec,
FProxySupSpec, FNameMgrSpec, MetaSupSpec,
Flu1SubSupSpec, FluSpec]}}.
make_flu_regname(FluName) when is_atom(FluName) ->
FluName.
make_p_regname(FluName) when is_atom(FluName) ->
list_to_atom("flusup_" ++ atom_to_list(FluName)).
@ -122,3 +170,21 @@ make_mgr_supname(MgrName) when is_atom(MgrName) ->
make_proj_supname(ProjName) when is_atom(ProjName) ->
list_to_atom(atom_to_list(ProjName) ++ "_pstore").
make_fitness_regname(FluName) when is_atom(FluName) ->
list_to_atom(atom_to_list(FluName) ++ "_fitness").
get_env(Setting, Default) ->
case application:get_env(machi, Setting) of
undefined -> Default;
{ok, V} -> V
end.
get_data_dir(FluName, Props) ->
case proplists:get_value(data_dir, Props) of
Path when is_list(Path) ->
Path;
undefined ->
{ok, Dir} = application:get_env(machi, flu_data_dir),
Dir ++ "/" ++ atom_to_list(FluName)
end.

View file

@ -21,6 +21,9 @@
%% @doc Supervisor for Machi FLU servers and their related support
%% servers.
%%
%% Responsibility for managing FLU and chain lifecycle after the initial
%% application startup is delegated to {@link machi_lifecycle_mgr}.
%%
%% See {@link machi_flu_psup} for an illustration of the entire Machi
%% application process structure.
@ -28,8 +31,25 @@
-behaviour(supervisor).
-include("machi.hrl").
-include("machi_projection.hrl").
-include("machi_verbose.hrl").
-ifdef(TEST).
-compile(export_all).
-ifdef(PULSE).
-compile({parse_transform, pulse_instrument}).
-include_lib("pulse_otp/include/pulse_otp.hrl").
-define(SHUTDOWN, infinity).
-else.
-define(SHUTDOWN, 5000).
-endif.
-endif. %TEST
%% API
-export([start_link/0]).
-export([start_link/0,
get_initial_flus/0, load_rc_d_files_from_dir/1,
sanitize_p_srvr_records/1]).
%% Supervisor callbacks
-export([init/1]).
@ -45,10 +65,79 @@ init([]) ->
MaxSecondsBetweenRestarts = 3600,
SupFlags = {RestartStrategy, MaxRestarts, MaxSecondsBetweenRestarts},
Ps = application:get_env(machi, initial_flus, []),
FLU_specs = [machi_flu_psup:make_package_spec(FluName, TcpPort,
DataDir, Props) ||
{FluName, TcpPort, DataDir, Props} <- Ps],
_Tab = ets:new(?TEST_ETS_TABLE, [named_table, public, ordered_set,
{read_concurrency,true}]),
Ps = get_initial_flus(),
FLU_specs = [machi_flu_psup:make_package_spec(P) || P <- Ps],
{ok, {SupFlags, FLU_specs}}.
-ifdef(PULSE).
get_initial_flus() ->
[].
-else. % PULSE
get_initial_flus() ->
DoesNotExist = "/tmp/does/not/exist",
ConfigDir = case application:get_env(machi, flu_config_dir, DoesNotExist) of
DoesNotExist ->
DoesNotExist;
Dir ->
Dir
end,
Ps = [P || {_File, P} <- load_rc_d_files_from_dir(ConfigDir)],
sanitize_p_srvr_records(Ps).
-endif. % PULSE
load_rc_d_files_from_dir(Dir) ->
Files = filelib:wildcard(Dir ++ "/*"),
[case file:consult(File) of
{ok, [X]} ->
{File, X};
_ ->
lager:warning("Error parsing file '~s', ignoring",
[File]),
{File, []}
end || File <- Files].
sanitize_p_srvr_records(Ps) ->
{Sane, _} = lists:foldl(fun sanitize_p_srvr_rec/2, {[], dict:new()}, Ps),
Sane.
sanitize_p_srvr_rec(Whole, {Acc, D}) ->
try
#p_srvr{name=Name,
proto_mod=PMod,
address=Address,
port=Port,
props=Props} = Whole,
true = is_atom(Name),
NameK = {name, Name},
error = dict:find(NameK, D),
true = is_atom(PMod),
case code:is_loaded(PMod) of
{file, _} ->
ok;
_ ->
{module, _} = code:load_file(PMod),
ok
end,
if is_list(Address) -> ok;
is_tuple(Address) -> ok % Erlang-style IPv4 or IPv6
end,
true = is_integer(Port) andalso Port >= 1024 andalso Port =< 65534,
PortK = {port, Port},
error = dict:find(PortK, D),
true = is_list(Props),
%% All is sane enough.
D2 = dict:store(NameK, Name,
dict:store(PortK, Port, D)),
{[Whole|Acc], D2}
catch _:_ ->
_ = lager:log(error, self(),
"~s: Bad (or duplicate name/port) p_srvr record, "
"skipping: ~P\n",
[?MODULE, Whole, 15]),
{Acc, D}
end.

1015
src/machi_lifecycle_mgr.erl Normal file

File diff suppressed because it is too large Load diff

156
src/machi_merkle_tree.erl Normal file
View file

@ -0,0 +1,156 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc Creates a Merkle tree per file based on the checksum data for
%% a given data file.
%%
%% The `naive' implementation representation is:
%%
%% `<<Length:64, Offset:32, 0>>' for unwritten bytes
%% `<<Length:64, Offset:32, 1>>' for trimmed bytes
%% `<<Length:64, Offset:32, Csum/binary>>' for written bytes
%%
%% The tree feeds these leaf nodes into hashes representing chunks of a minimum
%% size of at least 1024 KB (1 MB), but if the file size is larger, we will try
%% to get about 100 chunks for the first rollup "Level 1." We aim for around 10
%% hashes at level 2, and then 2 hashes level 3 and finally the root.
-module(machi_merkle_tree).
-include("machi.hrl").
-include("machi_merkle_tree.hrl").
-ifdef(TEST).
-compile(export_all).
-else.
-export([
open/2,
open/3,
tree/1,
filename/1,
diff/2
]).
-endif.
-define(TRIMMED, <<1>>).
-define(UNWRITTEN, <<0>>).
-define(NAIVE_ENCODE(Offset, Size, Data), <<Offset:64/unsigned-big, Size:32/unsigned-big, Data/binary>>).
-define(MINIMUM_CHUNK, 1048576). %% 1024 * 1024
-define(LEVEL_SIZE, 10).
-define(H, sha).
%% public API
open(Filename, DataDir) ->
open(Filename, DataDir, naive).
open(Filename, DataDir, Type) ->
Tree = load_filename(Filename, DataDir, Type),
{ok, #mt{ filename = Filename, tree = Tree, backend = Type}}.
tree(#mt{ tree = T, backend = naive }) ->
case T#naive.recalc of
true -> build_tree(T);
false -> T
end.
filename(#mt{ filename = F }) -> F.
diff(#mt{backend = naive, tree = T1}, #mt{backend = naive, tree = T2}) ->
case T1#naive.root == T2#naive.root of
true -> same;
false -> naive_diff(T1, T2)
end;
diff(_, _) -> error(badarg).
%% private
% @private
load_filename(Filename, DataDir, naive) ->
{Last, M} = do_load(Filename, DataDir, fun insert_csum_naive/2, []),
ChunkSize = max(?MINIMUM_CHUNK, Last div 100),
T = #naive{ leaves = lists:reverse(M), chunk_size = ChunkSize, recalc = true },
build_tree(T).
do_load(Filename, DataDir, FoldFun, AccInit) ->
CsumFile = machi_util:make_checksum_filename(DataDir, Filename),
{ok, T} = machi_csum_table:open(CsumFile, []),
Acc = machi_csum_table:foldl_chunks(FoldFun, {0, AccInit}, T),
ok = machi_csum_table:close(T),
Acc.
% @private
insert_csum_naive({Last, Size, _Csum}=In, {Last, MT}) ->
%% no gap
{Last+Size, update_acc(In, MT)};
insert_csum_naive({Offset, Size, _Csum}=In, {Last, MT}) ->
Hole = Offset - Last,
MT0 = update_acc({Last, Hole, unwritten}, MT),
{Offset+Size, update_acc(In, MT0)}.
% @private
update_acc({Offset, Size, unwritten}, MT) ->
[ {Offset, Size, ?NAIVE_ENCODE(Offset, Size, ?UNWRITTEN)} | MT ];
update_acc({Offset, Size, trimmed}, MT) ->
[ {Offset, Size, ?NAIVE_ENCODE(Offset, Size, ?TRIMMED)} | MT ];
update_acc({Offset, Size, <<_Tag:8, Csum/binary>>}, MT) ->
[ {Offset, Size, ?NAIVE_ENCODE(Offset, Size, Csum)} | MT ].
build_tree(MT = #naive{ leaves = L, chunk_size = ChunkSize }) ->
Lvl1s = build_level_1(ChunkSize, L, 1, [ crypto:hash_init(?H) ]),
Mod2 = length(Lvl1s) div ?LEVEL_SIZE,
Lvl2s = build_int_level(Mod2, Lvl1s, 1, [ crypto:hash_init(?H) ]),
Mod3 = length(Lvl2s) div 2,
Lvl3s = build_int_level(Mod3, Lvl2s, 1, [ crypto:hash_init(?H) ]),
Root = build_root(Lvl3s, crypto:hash_init(?H)),
MT#naive{ root = Root, lvl1 = Lvl1s, lvl2 = Lvl2s, lvl3 = Lvl3s, recalc = false }.
build_root([], Ctx) ->
crypto:hash_final(Ctx);
build_root([H|T], Ctx) ->
build_root(T, crypto:hash_update(Ctx, H)).
build_int_level(_Mod, [], _Cnt, [ Ctx | Rest ]) ->
lists:reverse( [ crypto:hash_final(Ctx) | Rest ] );
build_int_level(Mod, [H|T], Cnt, [ Ctx | Rest ]) when Cnt rem Mod == 0 ->
NewCtx = crypto:hash_init(?H),
build_int_level(Mod, T, Cnt + 1, [ crypto:hash_update(NewCtx, H), crypto:hash_final(Ctx) | Rest ]);
build_int_level(Mod, [H|T], Cnt, [ Ctx | Rest ]) ->
build_int_level(Mod, T, Cnt+1, [ crypto:hash_update(Ctx, H) | Rest ]).
build_level_1(_Size, [], _Multiple, [ Ctx | Rest ]) ->
lists:reverse([ crypto:hash_final(Ctx) | Rest ]);
build_level_1(Size, [{Pos, Len, Hash}|T], Multiple, [ Ctx | Rest ])
when ( Pos + Len ) > ( Size * Multiple ) ->
NewCtx = crypto:hash_init(?H),
build_level_1(Size, T, Multiple+1,
[ crypto:hash_update(NewCtx, Hash), crypto:hash_final(Ctx) | Rest ]);
build_level_1(Size, [{Pos, Len, Hash}|T], Multiple, [ Ctx | Rest ])
when ( Pos + Len ) =< ( Size * Multiple ) ->
build_level_1(Size, T, Multiple, [ crypto:hash_update(Ctx, Hash) | Rest ]).
naive_diff(#naive{lvl1 = L1}, #naive{lvl1=L2, chunk_size=CS2}) ->
Set1 = gb_sets:from_list(lists:zip(lists:seq(1, length(L1)), L1)),
Set2 = gb_sets:from_list(lists:zip(lists:seq(1, length(L2)), L2)),
%% The byte ranges in list 2 that do not match in list 1
%% Or should we do something else?
[ {(X-1)*CS2, CS2, SHA} || {X, SHA} <- gb_sets:to_list(gb_sets:subtract(Set1, Set2)) ].

View file

@ -25,6 +25,10 @@
%% to a single socket connection, and there is no code to deal with
%% multiple connections/load balancing/error handling to several/all
%% Machi cluster servers.
%%
%% Please see {@link machi_flu1_client} the "Client API implemntation notes"
%% section for how this module relates to the rest of the client API
%% implementation.
-module(machi_pb_high_client).
@ -40,7 +44,8 @@
auth/3, auth/4,
append_chunk/6, append_chunk/7,
write_chunk/5, write_chunk/6,
read_chunk/4, read_chunk/5,
read_chunk/5, read_chunk/6,
trim_chunk/4, trim_chunk/5,
checksum_list/2, checksum_list/3,
list_files/1, list_files/2
]).
@ -57,48 +62,114 @@
count=0 :: non_neg_integer()
}).
%% Official error types that is specific in Machi
-type machi_client_error_reason() :: bad_arg | wedged | bad_checksum |
partition | not_written | written |
trimmed | no_such_file | partial_read |
bad_epoch | inet:posix().
%% @doc Creates a client process
-spec start_link(p_srvr_dict()) -> {ok, pid()} | {error, machi_client_error_reason()}.
start_link(P_srvr_list) ->
gen_server:start_link(?MODULE, [P_srvr_list], []).
%% @doc Stops a client process.
-spec quit(pid()) -> ok.
quit(PidSpec) ->
gen_server:call(PidSpec, quit, infinity).
connected_p(PidSpec) ->
gen_server:call(PidSpec, connected_p, infinity).
-spec echo(pid(), string()) -> {ok, string()} | {error, machi_client_error_reason()}.
echo(PidSpec, String) ->
echo(PidSpec, String, ?DEFAULT_TIMEOUT).
-spec echo(pid(), string(), non_neg_integer()) -> {ok, string()} | {error, machi_client_error_reason()}.
echo(PidSpec, String, Timeout) ->
send_sync(PidSpec, {echo, String}, Timeout).
%% TODO: auth() is not implemented. Auth requires SSL, and this client
%% doesn't support SSL yet. This is just a placeholder and reminder.
-spec auth(pid(), string(), string()) -> ok | {error, machi_client_error_reason()}.
auth(PidSpec, User, Pass) ->
auth(PidSpec, User, Pass, ?DEFAULT_TIMEOUT).
-spec auth(pid(), string(), string(), non_neg_integer()) -> ok | {error, machi_client_error_reason()}.
auth(PidSpec, User, Pass, Timeout) ->
send_sync(PidSpec, {auth, User, Pass}, Timeout).
append_chunk(PidSpec, PlacementKey, Prefix, Chunk, CSum, ChunkExtra) ->
append_chunk(PidSpec, PlacementKey, Prefix, Chunk, CSum, ChunkExtra, ?DEFAULT_TIMEOUT).
-spec append_chunk(pid(),
NS::machi_dt:namespace(), Prefix::machi_dt:file_prefix(),
Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum(),
Opts::machi_dt:append_opts()) ->
{ok, Filename::string(), Offset::machi_dt:file_offset()} |
{error, machi_client_error_reason()}.
append_chunk(PidSpec, NS, Prefix, Chunk, CSum, Opts) ->
append_chunk(PidSpec, NS, Prefix, Chunk, CSum, Opts, ?DEFAULT_TIMEOUT).
append_chunk(PidSpec, PlacementKey, Prefix, Chunk, CSum, ChunkExtra, Timeout) ->
send_sync(PidSpec, {append_chunk, PlacementKey, Prefix, Chunk, CSum, ChunkExtra}, Timeout).
-spec append_chunk(pid(),
NS::machi_dt:namespace(), Prefix::machi_dt:file_prefix(),
Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum(),
Opts::machi_dt:append_opts(),
Timeout::non_neg_integer()) ->
{ok, Filename::string(), Offset::machi_dt:file_offset()} |
{error, machi_client_error_reason()}.
append_chunk(PidSpec, NS, Prefix, Chunk, CSum, Opts, Timeout) ->
send_sync(PidSpec, {append_chunk, NS, Prefix, Chunk, CSum, Opts}, Timeout).
-spec write_chunk(pid(), File::string(), machi_dt:file_offset(),
Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum()) ->
ok | {error, machi_client_error_reason()}.
write_chunk(PidSpec, File, Offset, Chunk, CSum) ->
write_chunk(PidSpec, File, Offset, Chunk, CSum, ?DEFAULT_TIMEOUT).
-spec write_chunk(pid(), File::string(), machi_dt:file_offset(),
Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum(), Timeout::non_neg_integer()) ->
ok | {error, machi_client_error_reason()}.
write_chunk(PidSpec, File, Offset, Chunk, CSum, Timeout) ->
send_sync(PidSpec, {write_chunk, File, Offset, Chunk, CSum}, Timeout).
read_chunk(PidSpec, File, Offset, Size) ->
read_chunk(PidSpec, File, Offset, Size, ?DEFAULT_TIMEOUT).
%% @doc Tries to read a chunk of a specified file. It returns `{ok,
%% {Chunks, TrimmedChunks}}' for live file while it returns `{error,
%% trimmed}' if all bytes of the file was trimmed.
-spec read_chunk(pid(), File::string(), machi_dt:file_offset(), machi_dt:chunk_size(),
machi_dt:read_opts_x()) ->
{ok, {Chunks::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), binary()}],
Trimmed::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size()}]}} |
{error, machi_client_error_reason()}.
read_chunk(PidSpec, File, Offset, Size, Opts) ->
read_chunk(PidSpec, File, Offset, Size, Opts, ?DEFAULT_TIMEOUT).
read_chunk(PidSpec, File, Offset, Size, Timeout) ->
send_sync(PidSpec, {read_chunk, File, Offset, Size}, Timeout).
-spec read_chunk(pid(), File::string(), machi_dt:file_offset(), machi_dt:chunk_size(),
machi_dt:read_opts_x(),
Timeout::non_neg_integer()) ->
{ok, {Chunks::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), binary()}],
Trimmed::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size()}]}} |
{error, machi_client_error_reason()}.
read_chunk(PidSpec, File, Offset, Size, Opts0, Timeout) ->
Opts = machi_util:read_opts_default(Opts0),
send_sync(PidSpec, {read_chunk, File, Offset, Size, Opts}, Timeout).
%% @doc Trims arbitrary binary range of any file. If a specified range
%% has any byte trimmed, it fails and returns `{error, trimmed}'.
%% Otherwise it trims all bytes in that range. If there are
%% overlapping chunks with client-specified checksum, they will cut
%% off and checksum are re-calculated in server side. TODO: Add
%% option specifying whether to trigger GC.
-spec trim_chunk(pid(), string(), non_neg_integer(), machi_dt:chunk_size()) ->
ok | {error, machi_client_error_reason()}.
trim_chunk(PidSpec, File, Offset, Size) ->
trim_chunk(PidSpec, File, Offset, Size, ?DEFAULT_TIMEOUT).
trim_chunk(PidSpec, File, Offset, Size, Timeout) ->
send_sync(PidSpec, {trim_chunk, File, Offset, Size}, Timeout).
%% @doc Returns a binary that has checksums and chunks encoded inside
%% (This is because encoding-decoding them are inefficient). TODO:
%% return a structured list of them.
-spec checksum_list(pid(), string()) -> {ok, binary()} | {error, machi_client_error_reason()}.
checksum_list(PidSpec, File) ->
checksum_list(PidSpec, File, ?DEFAULT_TIMEOUT).
@ -218,19 +289,19 @@ do_send_sync2({auth, User, Pass}, #state{sock=Sock}=S) ->
Res = {bummer, {X, Y, erlang:get_stacktrace()}},
{Res, S}
end;
do_send_sync2({append_chunk, PlacementKey, Prefix, Chunk, CSum, ChunkExtra},
do_send_sync2({append_chunk, NS, Prefix, Chunk, CSum, Opts},
#state{sock=Sock, sock_id=Index, count=Count}=S) ->
try
ReqID = <<Index:64/big, Count:64/big>>,
PK = if PlacementKey == <<>> -> undefined;
true -> PlacementKey
end,
CSumT = convert_csum_req(CSum, Chunk),
Req = #mpb_appendchunkreq{placement_key=PK,
{ChunkExtra, Pref, FailPref} = machi_pb_translate:conv_from_append_opts(Opts),
Req = #mpb_appendchunkreq{namespace=NS,
prefix=Prefix,
chunk=Chunk,
csum=CSumT,
chunk_extra=ChunkExtra},
chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref},
R1a = #mpb_request{req_id=ReqID, do_not_alter=1,
append_chunk=Req},
Bin1a = machi_pb:encode_mpb_request(R1a),
@ -253,10 +324,11 @@ do_send_sync2({write_chunk, File, Offset, Chunk, CSum},
try
ReqID = <<Index:64/big, Count:64/big>>,
CSumT = convert_csum_req(CSum, Chunk),
Req = #mpb_writechunkreq{file=File,
offset=Offset,
chunk=Chunk,
csum=CSumT},
Req = #mpb_writechunkreq{chunk=
#mpb_chunk{chunk=Chunk,
file_name=File,
offset=Offset,
csum=CSumT}},
R1a = #mpb_request{req_id=ReqID, do_not_alter=1,
write_chunk=Req},
Bin1a = machi_pb:encode_mpb_request(R1a),
@ -274,13 +346,19 @@ do_send_sync2({write_chunk, File, Offset, Chunk, CSum},
Res = {bummer, {X, Y, erlang:get_stacktrace()}},
{Res, S#state{count=Count+1}}
end;
do_send_sync2({read_chunk, File, Offset, Size},
do_send_sync2({read_chunk, File, Offset, Size, Opts},
#state{sock=Sock, sock_id=Index, count=Count}=S) ->
try
ReqID = <<Index:64/big, Count:64/big>>,
Req = #mpb_readchunkreq{file=File,
offset=Offset,
size=Size},
#read_opts{no_checksum=FlagNoChecksum,
no_chunk=FlagNoChunk,
needs_trimmed=NeedsTrimmed} = Opts,
Req = #mpb_readchunkreq{chunk_pos=#mpb_chunkpos{file_name=File,
offset=Offset,
chunk_size=Size},
flag_no_checksum=machi_util:bool2int(FlagNoChecksum),
flag_no_chunk=machi_util:bool2int(FlagNoChunk),
flag_needs_trimmed=machi_util:bool2int(NeedsTrimmed)},
R1a = #mpb_request{req_id=ReqID, do_not_alter=1,
read_chunk=Req},
Bin1a = machi_pb:encode_mpb_request(R1a),
@ -298,6 +376,30 @@ do_send_sync2({read_chunk, File, Offset, Size},
Res = {bummer, {X, Y, erlang:get_stacktrace()}},
{Res, S#state{count=Count+1}}
end;
do_send_sync2({trim_chunk, File, Offset, Size},
#state{sock=Sock, sock_id=Index, count=Count}=S) ->
try
ReqID = <<Index:64/big, Count:64/big>>,
Req = #mpb_trimchunkreq{chunk_pos=#mpb_chunkpos{file_name=File,
offset=Offset,
chunk_size=Size}},
R1a = #mpb_request{req_id=ReqID, do_not_alter=1,
trim_chunk=Req},
Bin1a = machi_pb:encode_mpb_request(R1a),
ok = gen_tcp:send(Sock, Bin1a),
{ok, Bin1B} = gen_tcp:recv(Sock, 0),
case (catch machi_pb:decode_mpb_response(Bin1B)) of
#mpb_response{req_id=ReqID, trim_chunk=R} when R /= undefined ->
Result = convert_trim_chunk_resp(R),
{Result, S#state{count=Count+1}};
#mpb_response{req_id=ReqID, generic=G} when G /= undefined ->
#mpb_errorresp{code=Code, msg=Msg, extra=Extra} = G,
{{error, {Code, Msg, Extra}}, S#state{count=Count+1}}
end
catch X:Y ->
Res = {bummer, {X, Y, erlang:get_stacktrace()}},
{Res, S#state{count=Count+1}}
end;
do_send_sync2({checksum_list, File},
#state{sock=Sock, sock_id=Index, count=Count}=S) ->
try
@ -343,9 +445,15 @@ do_send_sync2({list_files},
{Res, S#state{count=Count+1}}
end.
%% We only convert the checksum types that make sense here:
%% none or client_sha. None of the other types should be sent
%% to us via the PB high protocol.
convert_csum_req(none, Chunk) ->
#mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA',
csum=machi_util:checksum_chunk(Chunk)};
convert_csum_req(<<>>, Chunk) ->
convert_csum_req(none, Chunk);
convert_csum_req({client_sha, CSumBin}, _Chunk) ->
#mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA',
csum=CSumBin}.
@ -370,6 +478,8 @@ convert_general_status_code('NOT_WRITTEN') ->
{error, not_written};
convert_general_status_code('WRITTEN') ->
{error, written};
convert_general_status_code('TRIMMED') ->
{error, trimmed};
convert_general_status_code('NO_SUCH_FILE') ->
{error, no_such_file};
convert_general_status_code('PARTIAL_READ') ->
@ -384,11 +494,29 @@ convert_write_chunk_resp(#mpb_writechunkresp{status='OK'}) ->
convert_write_chunk_resp(#mpb_writechunkresp{status=Status}) ->
convert_general_status_code(Status).
convert_read_chunk_resp(#mpb_readchunkresp{status='OK', chunk=Chunk}) ->
{ok, Chunk};
convert_read_chunk_resp(#mpb_readchunkresp{status='OK', chunks=PB_Chunks, trimmed=PB_Trimmed}) ->
Chunks = lists:map(fun(#mpb_chunk{offset=Offset,
file_name=File,
chunk=Chunk,
csum=#mpb_chunkcsum{type=T, csum=Ck}}) ->
%% TODO: cleanup export
Csum = <<(machi_pb_translate:conv_to_csum_tag(T)):8, Ck/binary>>,
{list_to_binary(File), Offset, Chunk, Csum}
end, PB_Chunks),
Trimmed = lists:map(fun(#mpb_chunkpos{file_name=File,
offset=Offset,
chunk_size=Size}) ->
{list_to_binary(File), Offset, Size}
end, PB_Trimmed),
{ok, {Chunks, Trimmed}};
convert_read_chunk_resp(#mpb_readchunkresp{status=Status}) ->
convert_general_status_code(Status).
convert_trim_chunk_resp(#mpb_trimchunkresp{status='OK'}) ->
ok;
convert_trim_chunk_resp(#mpb_trimchunkresp{status=Status}) ->
convert_general_status_code(Status).
convert_checksum_list_resp(#mpb_checksumlistresp{status='OK', chunk=Chunk}) ->
{ok, Chunk};
convert_checksum_list_resp(#mpb_checksumlistresp{status=Status}) ->

View file

@ -15,8 +15,8 @@
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%%
-module(machi_pb_translate).
@ -26,88 +26,123 @@
-include("machi_pb.hrl").
-include("machi_projection.hrl").
-ifdef(PULSE).
-compile({parse_transform, pulse_instrument}).
-include_lib("pulse_otp/include/pulse_otp.hrl").
-endif.
-export([from_pb_request/1,
from_pb_response/1,
to_pb_request/2,
to_pb_response/3
to_pb_response/3,
conv_from_append_opts/1,
conv_to_append_opts/1
]).
%% TODO: fixme cleanup
-export([conv_to_csum_tag/1]).
from_pb_request(#mpb_ll_request{
req_id=ReqID,
echo=#mpb_echoreq{message=Msg}}) ->
{ReqID, {low_echo, undefined, Msg}};
{ReqID, {low_skip_wedge, {low_echo, Msg}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
auth=#mpb_authreq{user=User, password=Pass}}) ->
{ReqID, {low_auth, undefined, User, Pass}};
{ReqID, {low_skip_wedge, {low_auth, User, Pass}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
append_chunk=#mpb_ll_appendchunkreq{
append_chunk=IR=#mpb_ll_appendchunkreq{
namespace_version=NSVersion,
namespace=NS_str,
locator=NSLocator,
epoch_id=PB_EpochID,
placement_key=PKey,
prefix=Prefix,
chunk=Chunk,
csum=#mpb_chunkcsum{type=CSum_type, csum=CSum},
chunk_extra=ChunkExtra}}) ->
csum=#mpb_chunkcsum{type=CSum_type, csum=CSum}}}) ->
NS = list_to_binary(NS_str),
EpochID = conv_to_epoch_id(PB_EpochID),
CSum_tag = conv_to_csum_tag(CSum_type),
{ReqID, {low_append_chunk, EpochID, PKey, Prefix, Chunk, CSum_tag, CSum,
ChunkExtra}};
Opts = conv_to_append_opts(IR),
%% NOTE: The tuple position of NSLocator is a bit odd, because EpochID
%% _must_ be in the 4th position (as NSV & NS must be in 2nd & 3rd).
{ReqID, {low_append_chunk, NSVersion, NS, EpochID, NSLocator,
Prefix, Chunk, CSum_tag, CSum, Opts}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
write_chunk=#mpb_ll_writechunkreq{
namespace_version=NSVersion,
namespace=NS_str,
epoch_id=PB_EpochID,
file=File,
offset=Offset,
chunk=Chunk,
csum=#mpb_chunkcsum{type=CSum_type, csum=CSum}}}) ->
chunk=#mpb_chunk{file_name=File,
offset=Offset,
chunk=Chunk,
csum=#mpb_chunkcsum{type=CSum_type, csum=CSum}}}}) ->
NS = list_to_binary(NS_str),
EpochID = conv_to_epoch_id(PB_EpochID),
CSum_tag = conv_to_csum_tag(CSum_type),
{ReqID, {low_write_chunk, EpochID, File, Offset, Chunk, CSum_tag, CSum}};
{ReqID, {low_write_chunk, NSVersion, NS, EpochID, File, Offset, Chunk, CSum_tag, CSum}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
read_chunk=#mpb_ll_readchunkreq{
namespace_version=NSVersion,
namespace=NS_str,
epoch_id=PB_EpochID,
chunk_pos=ChunkPos,
flag_no_checksum=PB_GetNoChecksum,
flag_no_chunk=PB_GetNoChunk,
flag_needs_trimmed=PB_NeedsTrimmed}}) ->
NS = list_to_binary(NS_str),
EpochID = conv_to_epoch_id(PB_EpochID),
Opts = #read_opts{no_checksum=PB_GetNoChecksum,
no_chunk=PB_GetNoChunk,
needs_trimmed=PB_NeedsTrimmed},
#mpb_chunkpos{file_name=File,
offset=Offset,
chunk_size=Size} = ChunkPos,
{ReqID, {low_read_chunk, NSVersion, NS, EpochID, File, Offset, Size, Opts}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
trim_chunk=#mpb_ll_trimchunkreq{
namespace_version=NSVersion,
namespace=NS_str,
epoch_id=PB_EpochID,
file=File,
offset=Offset,
size=Size,
flag_get_checksum=PB_GetChecksum,
flag_no_chunk=PB_GetNoChunk}}) ->
trigger_gc=TriggerGC}}) ->
NS = list_to_binary(NS_str),
EpochID = conv_to_epoch_id(PB_EpochID),
Opts = [{get_checksum, conv_to_boolean(PB_GetChecksum)},
{no_chunk, conv_to_boolean(PB_GetNoChunk)}],
{ReqID, {low_read_chunk, EpochID, File, Offset, Size, Opts}};
{ReqID, {low_trim_chunk, NSVersion, NS, EpochID, File, Offset, Size, TriggerGC}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
checksum_list=#mpb_ll_checksumlistreq{
epoch_id=PB_EpochID,
file=File}}) ->
EpochID = conv_to_epoch_id(PB_EpochID),
{ReqID, {low_checksum_list, EpochID, File}};
{ReqID, {low_skip_wedge, {low_checksum_list, File}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
list_files=#mpb_ll_listfilesreq{
epoch_id=PB_EpochID}}) ->
EpochID = conv_to_epoch_id(PB_EpochID),
{ReqID, {low_list_files, EpochID}};
{ReqID, {low_skip_wedge, {low_list_files, EpochID}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
wedge_status=#mpb_ll_wedgestatusreq{}}) ->
{ReqID, {low_wedge_status, undefined}};
{ReqID, {low_skip_wedge, {low_wedge_status}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
delete_migration=#mpb_ll_deletemigrationreq{
epoch_id=PB_EpochID,
file=File}}) ->
EpochID = conv_to_epoch_id(PB_EpochID),
{ReqID, {low_delete_migration, EpochID, File}};
{ReqID, {low_skip_wedge, {low_delete_migration, EpochID, File}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
trunc_hack=#mpb_ll_trunchackreq{
epoch_id=PB_EpochID,
file=File}}) ->
EpochID = conv_to_epoch_id(PB_EpochID),
{ReqID, {low_trunc_hack, EpochID, File}};
{ReqID, {low_skip_wedge, {low_trunc_hack, EpochID, File}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
proj_gl=#mpb_ll_getlatestepochidreq{type=ProjType}}) ->
@ -135,6 +170,10 @@ from_pb_request(#mpb_ll_request{
req_id=ReqID,
proj_la=#mpb_ll_listallprojectionsreq{type=ProjType}}) ->
{ReqID, {low_proj, {list_all_projections, conv_to_type(ProjType)}}};
from_pb_request(#mpb_ll_request{
req_id=ReqID,
proj_kp=#mpb_ll_kickprojectionreactionreq{}}) ->
{ReqID, {low_proj, {kick_projection_reaction}}};
%%qqq
from_pb_request(#mpb_request{req_id=ReqID,
echo=#mpb_echoreq{message=Msg}}) ->
@ -144,28 +183,40 @@ from_pb_request(#mpb_request{req_id=ReqID,
{ReqID, {high_auth, User, Pass}};
from_pb_request(#mpb_request{req_id=ReqID,
append_chunk=IR=#mpb_appendchunkreq{}}) ->
#mpb_appendchunkreq{placement_key=__todoPK,
#mpb_appendchunkreq{namespace=NS_str,
prefix=Prefix,
chunk=Chunk,
csum=CSum,
chunk_extra=ChunkExtra} = IR,
csum=CSum} = IR,
NS = list_to_binary(NS_str),
TaggedCSum = make_tagged_csum(CSum, Chunk),
{ReqID, {high_append_chunk, __todoPK, Prefix, Chunk, TaggedCSum,
ChunkExtra}};
Opts = conv_to_append_opts(IR),
{ReqID, {high_append_chunk, NS, Prefix, Chunk, TaggedCSum, Opts}};
from_pb_request(#mpb_request{req_id=ReqID,
write_chunk=IR=#mpb_writechunkreq{}}) ->
#mpb_writechunkreq{file=File,
offset=Offset,
chunk=Chunk,
csum=CSum} = IR,
TaggedCSum = make_tagged_csum(CSum, Chunk),
{ReqID, {high_write_chunk, File, Offset, Chunk, TaggedCSum}};
#mpb_writechunkreq{chunk=#mpb_chunk{file_name=File,
offset=Offset,
chunk=Chunk,
csum=CSumRec}} = IR,
CSum = make_tagged_csum(CSumRec, Chunk),
{ReqID, {high_write_chunk, File, Offset, Chunk, CSum}};
from_pb_request(#mpb_request{req_id=ReqID,
read_chunk=IR=#mpb_readchunkreq{}}) ->
#mpb_readchunkreq{file=File,
offset=Offset,
size=Size} = IR,
{ReqID, {high_read_chunk, File, Offset, Size}};
#mpb_readchunkreq{chunk_pos=#mpb_chunkpos{file_name=File,
offset=Offset,
chunk_size=Size},
flag_no_checksum=FlagNoChecksum,
flag_no_chunk=FlagNoChunk,
flag_needs_trimmed=NeedsTrimmed} = IR,
Opts = #read_opts{no_checksum=FlagNoChecksum,
no_chunk=FlagNoChunk,
needs_trimmed=NeedsTrimmed},
{ReqID, {high_read_chunk, File, Offset, Size, Opts}};
from_pb_request(#mpb_request{req_id=ReqID,
trim_chunk=IR=#mpb_trimchunkreq{}}) ->
#mpb_trimchunkreq{chunk_pos=#mpb_chunkpos{file_name=File,
offset=Offset,
chunk_size=Size}} = IR,
{ReqID, {high_trim_chunk, File, Offset, Size}};
from_pb_request(#mpb_request{req_id=ReqID,
checksum_list=IR=#mpb_checksumlistreq{}}) ->
#mpb_checksumlistreq{file=File} = IR,
@ -176,15 +227,22 @@ from_pb_request(#mpb_request{req_id=ReqID,
from_pb_request(#mpb_request{req_id=ReqID}) ->
{ReqID, {high_error, 999966, "Unknown request"}};
from_pb_request(_Else) ->
io:format(user, "\nRRR from_pb_request(~p)\n", [_Else]), timer:sleep(2000),
io:format(user, "\nRRR from_pb_request(~p)\n", [_Else]), %%timer:sleep(2000),
{<<>>, {high_error, 999667, "Unknown PB request"}}.
from_pb_response(#mpb_ll_response{
req_id=ReqID,
% There is no separate LL error response record
generic=#mpb_errorresp{code=Code, msg=Msg}}) ->
{ReqID, {error, {Code, Msg}}};
from_pb_response(#mpb_ll_response{
req_id=ReqID,
% There is no separate LL echo response record
echo=#mpb_echoresp{message=Msg}}) ->
{ReqID, Msg};
from_pb_response(#mpb_ll_response{
req_id=ReqID,
% There is no separate LL auth response record
auth=#mpb_authresp{code=Code}}) ->
{ReqID, Code};
from_pb_response(#mpb_ll_response{
@ -207,13 +265,30 @@ from_pb_response(#mpb_ll_response{
from_pb_response(#mpb_ll_response{
req_id=ReqID,
read_chunk=#mpb_ll_readchunkresp{status=Status,
chunk=Chunk}}) ->
chunks=PB_Chunks,
trimmed=PB_Trimmed}}) ->
case Status of
'OK' ->
{ReqID, {ok, Chunk}};
Chunks = lists:map(fun(#mpb_chunk{file_name=File,
offset=Offset,
chunk=Bytes,
csum=#mpb_chunkcsum{type=T,csum=Ck}}) ->
Csum = <<(conv_to_csum_tag(T)):8, Ck/binary>>,
{list_to_binary(File), Offset, Bytes, Csum}
end, PB_Chunks),
Trimmed = lists:map(fun(#mpb_chunkpos{file_name=File,
offset=Offset,
chunk_size=Size}) ->
{list_to_binary(File), Offset, Size}
end, PB_Trimmed),
{ReqID, {ok, {Chunks, Trimmed}}};
_ ->
{ReqID, machi_pb_high_client:convert_general_status_code(Status)}
end;
from_pb_response(#mpb_ll_response{
req_id=ReqID,
trim_chunk=#mpb_ll_trimchunkresp{status=Status}}) ->
{ReqID, machi_pb_high_client:convert_general_status_code(Status)};
from_pb_response(#mpb_ll_response{
req_id=ReqID,
checksum_list=#mpb_ll_checksumlistresp{
@ -240,12 +315,16 @@ from_pb_response(#mpb_ll_response{
from_pb_response(#mpb_ll_response{
req_id=ReqID,
wedge_status=#mpb_ll_wedgestatusresp{
epoch_id=PB_EpochID, wedged_flag=PB_Wedged}}) ->
status=Status,
epoch_id=PB_EpochID, wedged_flag=Wedged_p,
namespace_version=NSVersion, namespace=NS_str}}) ->
GeneralStatus = case machi_pb_high_client:convert_general_status_code(Status) of
ok -> ok;
_Else -> {yukky, _Else}
end,
EpochID = conv_to_epoch_id(PB_EpochID),
Wedged_p = if PB_Wedged == 1 -> true;
PB_Wedged == 0 -> false
end,
{ReqID, {ok, {Wedged_p, EpochID}}};
NS = list_to_binary(NS_str),
{ReqID, {GeneralStatus, {Wedged_p, EpochID, NSVersion, NS}}};
from_pb_response(#mpb_ll_response{
req_id=ReqID,
delete_migration=#mpb_ll_deletemigrationresp{
@ -311,73 +390,100 @@ from_pb_response(#mpb_ll_response{
'OK' ->
{ReqID, {ok, Epochs}};
_ ->
{ReqID< machi_pb_high_client:convert_general_status_code(Status)}
{ReqID, machi_pb_high_client:convert_general_status_code(Status)}
end.
%% No response for proj_kp/kick_projection_reaction
%% TODO: move the #mbp_* record making code from
%% machi_pb_high_client:do_send_sync() clauses into to_pb_request().
to_pb_request(ReqID, {low_echo, _BogusEpochID, Msg}) ->
to_pb_request(ReqID, {low_skip_wedge, {low_echo, Msg}}) ->
#mpb_ll_request{
req_id=ReqID, do_not_alter=2,
echo=#mpb_echoreq{message=Msg}};
to_pb_request(ReqID, {low_auth, _BogusEpochID, User, Pass}) ->
to_pb_request(ReqID, {low_skip_wedge, {low_auth, User, Pass}}) ->
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
auth=#mpb_authreq{user=User, password=Pass}};
to_pb_request(ReqID, {low_append_chunk, EpochID, PKey, Prefix, Chunk,
CSum_tag, CSum, ChunkExtra}) ->
%% NOTE: The tuple position of NSLocator is a bit odd, because EpochID
%% _must_ be in the 4th position (as NSV & NS must be in 2nd & 3rd).
to_pb_request(ReqID, {low_append_chunk, NSVersion, NS, EpochID, NSLocator,
Prefix, Chunk, CSum_tag, CSum, Opts}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
CSum_type = conv_from_csum_tag(CSum_tag),
PB_CSum = #mpb_chunkcsum{type=CSum_type, csum=CSum},
{ChunkExtra, Pref, FailPref} = conv_from_append_opts(Opts),
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
append_chunk=#mpb_ll_appendchunkreq{
namespace_version=NSVersion,
namespace=NS,
locator=NSLocator,
epoch_id=PB_EpochID,
placement_key=PKey,
prefix=Prefix,
chunk=Chunk,
csum=PB_CSum,
chunk_extra=ChunkExtra}};
to_pb_request(ReqID, {low_write_chunk, EpochID, File, Offset, Chunk, CSum_tag, CSum}) ->
chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref}};
to_pb_request(ReqID, {low_write_chunk, NSVersion, NS, EpochID, File, Offset, Chunk, CSum_tag, CSum}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
CSum_type = conv_from_csum_tag(CSum_tag),
PB_CSum = #mpb_chunkcsum{type=CSum_type, csum=CSum},
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
write_chunk=#mpb_ll_writechunkreq{
namespace_version=NSVersion,
namespace=NS,
epoch_id=PB_EpochID,
file=File,
offset=Offset,
chunk=Chunk,
csum=PB_CSum}};
to_pb_request(ReqID, {low_read_chunk, EpochID, File, Offset, Size, _Opts}) ->
%% TODO: stop ignoring Opts ^_^
chunk=#mpb_chunk{file_name=File,
offset=Offset,
chunk=Chunk,
csum=PB_CSum}}};
to_pb_request(ReqID, {low_read_chunk, NSVersion, NS, EpochID, File, Offset, Size, Opts}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
#read_opts{no_checksum=FNChecksum,
no_chunk=FNChunk,
needs_trimmed=NeedsTrimmed} = Opts,
#mpb_ll_request{
req_id=ReqID, do_not_alter=2,
read_chunk=#mpb_ll_readchunkreq{
epoch_id=PB_EpochID,
file=File,
offset=Offset,
size=Size}};
to_pb_request(ReqID, {low_checksum_list, EpochID, File}) ->
namespace_version=NSVersion,
namespace=NS,
epoch_id=PB_EpochID,
chunk_pos=#mpb_chunkpos{
file_name=File,
offset=Offset,
chunk_size=Size},
flag_no_checksum=FNChecksum,
flag_no_chunk=FNChunk,
flag_needs_trimmed=NeedsTrimmed}};
to_pb_request(ReqID, {low_trim_chunk, NSVersion, NS, EpochID, File, Offset, Size, TriggerGC}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
trim_chunk=#mpb_ll_trimchunkreq{
namespace_version=NSVersion,
namespace=NS,
epoch_id=PB_EpochID,
file=File,
offset=Offset,
size=Size,
trigger_gc=TriggerGC}};
to_pb_request(ReqID, {low_skip_wedge, {low_checksum_list, File}}) ->
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
checksum_list=#mpb_ll_checksumlistreq{
epoch_id=PB_EpochID,
file=File}};
to_pb_request(ReqID, {low_list_files, EpochID}) ->
to_pb_request(ReqID, {low_skip_wedge, {low_list_files, EpochID}}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
list_files=#mpb_ll_listfilesreq{epoch_id=PB_EpochID}};
to_pb_request(ReqID, {low_wedge_status, _BogusEpochID}) ->
to_pb_request(ReqID, {low_skip_wedge, {low_wedge_status}}) ->
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
wedge_status=#mpb_ll_wedgestatusreq{}};
to_pb_request(ReqID, {low_delete_migration, EpochID, File}) ->
to_pb_request(ReqID, {low_skip_wedge, {low_delete_migration, EpochID, File}}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
delete_migration=#mpb_ll_deletemigrationreq{
epoch_id=PB_EpochID,
file=File}};
to_pb_request(ReqID, {low_trunc_hack, EpochID, File}) ->
to_pb_request(ReqID, {low_skip_wedge, {low_trunc_hack, EpochID, File}}) ->
PB_EpochID = conv_from_epoch_id(EpochID),
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
trunc_hack=#mpb_ll_trunchackreq{
@ -403,20 +509,25 @@ to_pb_request(ReqID, {low_proj, {get_all_projections, ProjType}}) ->
proj_ga=#mpb_ll_getallprojectionsreq{type=conv_from_type(ProjType)}};
to_pb_request(ReqID, {low_proj, {list_all_projections, ProjType}}) ->
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
proj_la=#mpb_ll_listallprojectionsreq{type=conv_from_type(ProjType)}}.
proj_la=#mpb_ll_listallprojectionsreq{type=conv_from_type(ProjType)}};
to_pb_request(ReqID, {low_proj, {kick_projection_reaction}}) ->
#mpb_ll_request{req_id=ReqID, do_not_alter=2,
proj_kp=#mpb_ll_kickprojectionreactionreq{}}.
%%qqq
to_pb_response(_ReqID, _, async_no_response=X) ->
X;
to_pb_response(ReqID, _, {low_error, ErrCode, ErrMsg}) ->
make_ll_error_resp(ReqID, ErrCode, ErrMsg);
to_pb_response(ReqID, {low_echo, _BogusEpochID, _Msg}, Resp) ->
to_pb_response(ReqID, {low_skip_wedge, {low_echo, _Msg}}, Resp) ->
#mpb_ll_response{
req_id=ReqID,
echo=#mpb_echoresp{message=Resp}};
to_pb_response(ReqID, {low_auth, _, _, _}, __TODO_Resp) ->
to_pb_response(ReqID, {low_skip_wedge, {low_auth, _, _}}, __TODO_Resp) ->
#mpb_ll_response{req_id=ReqID,
generic=#mpb_errorresp{code=1,
msg="AUTH not implemented"}};
to_pb_response(ReqID, {low_append_chunk, _EID, _PKey, _Pfx, _Ch, _CST, _CS, _CE}, Resp)->
to_pb_response(ReqID, {low_append_chunk, _NSV, _NS, _EID, _NSL, _Pfx, _Ch, _CST, _CS, _O}, Resp)->
case Resp of
{ok, {Offset, Size, File}} ->
Where = #mpb_chunkpos{offset=Offset,
@ -432,18 +543,30 @@ to_pb_response(ReqID, {low_append_chunk, _EID, _PKey, _Pfx, _Ch, _CST, _CS, _CE}
_Else ->
make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {low_write_chunk, _EID, _Fl, _Off, _Ch, _CST, _CS},Resp)->
to_pb_response(ReqID, {low_write_chunk, _NSV, _NS, _EID, _Fl, _Off, _Ch, _CST, _CS},Resp)->
Status = conv_from_status(Resp),
#mpb_ll_response{req_id=ReqID,
write_chunk=#mpb_ll_writechunkresp{status=Status}};
to_pb_response(ReqID, {low_read_chunk, _EID, _Fl, _Off, _Sz, _Opts}, Resp)->
to_pb_response(ReqID, {low_read_chunk, _NSV, _NS, _EID, _Fl, _Off, _Sz, _Opts}, Resp)->
case Resp of
{ok, Chunk} ->
CSum = undefined, % TODO not implemented
{ok, {Chunks, Trimmed}} ->
PB_Chunks = lists:map(fun({File, Offset, Bytes, Csum}) ->
{Tag, Ck} = machi_util:unmake_tagged_csum(Csum),
#mpb_chunk{file_name=File,
offset=Offset,
chunk=Bytes,
csum=#mpb_chunkcsum{type=conv_from_csum_tag(Tag),
csum=Ck}}
end, Chunks),
PB_Trimmed = lists:map(fun({File, Offset, Size}) ->
#mpb_chunkpos{file_name=File,
offset=Offset,
chunk_size=Size}
end, Trimmed),
#mpb_ll_response{req_id=ReqID,
read_chunk=#mpb_ll_readchunkresp{status='OK',
chunk=Chunk,
csum=CSum}};
chunks=PB_Chunks,
trimmed=PB_Trimmed}};
{error, _}=Error ->
Status = conv_from_status(Error),
#mpb_ll_response{req_id=ReqID,
@ -451,7 +574,19 @@ to_pb_response(ReqID, {low_read_chunk, _EID, _Fl, _Off, _Sz, _Opts}, Resp)->
_Else ->
make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {low_checksum_list, _EpochID, _File}, Resp) ->
to_pb_response(ReqID, {low_trim_chunk, _, _, _, _, _, _, _}, Resp) ->
case Resp of
ok ->
#mpb_ll_response{req_id=ReqID,
trim_chunk=#mpb_ll_trimchunkresp{status='OK'}};
{error, _}=Error ->
Status = conv_from_status(Error),
#mpb_ll_response{req_id=ReqID,
trim_chunk=#mpb_ll_trimchunkresp{status=Status}};
_Else ->
make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {low_skip_wedge, {low_checksum_list, _File}}, Resp) ->
case Resp of
{ok, Chunk} ->
#mpb_ll_response{req_id=ReqID,
@ -464,7 +599,7 @@ to_pb_response(ReqID, {low_checksum_list, _EpochID, _File}, Resp) ->
_Else ->
make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {low_list_files, _EpochID}, Resp) ->
to_pb_response(ReqID, {low_skip_wedge, {low_list_files, _EpochID}}, Resp) ->
case Resp of
{ok, FileInfo} ->
PB_Files = [#mpb_fileinfo{file_size=Size, file_name=Name} ||
@ -479,26 +614,28 @@ to_pb_response(ReqID, {low_list_files, _EpochID}, Resp) ->
_Else ->
make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {low_wedge_status, _BogusEpochID}, Resp) ->
to_pb_response(ReqID, {low_skip_wedge, {low_wedge_status}}, Resp) ->
case Resp of
{error, _}=Error ->
Status = conv_from_status(Error),
#mpb_ll_response{req_id=ReqID,
wedge_status=#mpb_ll_wedgestatusresp{status=Status}};
{Wedged_p, EpochID} ->
PB_Wedged = conv_from_boolean(Wedged_p),
{Wedged_p, EpochID, NSVersion, NS} ->
PB_EpochID = conv_from_epoch_id(EpochID),
#mpb_ll_response{req_id=ReqID,
wedge_status=#mpb_ll_wedgestatusresp{
status='OK',
epoch_id=PB_EpochID,
wedged_flag=PB_Wedged}}
wedged_flag=Wedged_p,
namespace_version=NSVersion,
namespace=NS
}}
end;
to_pb_response(ReqID, {low_delete_migration, _EID, _Fl}, Resp)->
to_pb_response(ReqID, {low_skip_wedge, {low_delete_migration, _EID, _Fl}}, Resp)->
Status = conv_from_status(Resp),
#mpb_ll_response{req_id=ReqID,
delete_migration=#mpb_ll_deletemigrationresp{status=Status}};
to_pb_response(ReqID, {low_trunc_hack, _EID, _Fl}, Resp)->
to_pb_response(ReqID, {low_skip_wedge, {low_trunc_hack, _EID, _Fl}}, Resp)->
Status = conv_from_status(Resp),
#mpb_ll_response{req_id=ReqID,
trunc_hack=#mpb_ll_trunchackresp{status=Status}};
@ -567,6 +704,7 @@ to_pb_response(ReqID, {low_proj, {list_all_projections, _ProjType}}, Resp)->
proj_la=#mpb_ll_listallprojectionsresp{
status=Status}}
end;
%% No response for {kick_projection_reaction}!
%%qqq
to_pb_response(ReqID, _, {high_error, ErrCode, ErrMsg}) ->
make_error_resp(ReqID, ErrCode, ErrMsg);
@ -578,7 +716,7 @@ to_pb_response(ReqID, {high_auth, _User, _Pass}, _Resp) ->
#mpb_response{req_id=ReqID,
generic=#mpb_errorresp{code=1,
msg="AUTH not implemented"}};
to_pb_response(ReqID, {high_append_chunk, _TODO, _Prefix, _Chunk, _TSum, _CE}, Resp)->
to_pb_response(ReqID, {high_append_chunk, _NS, _Prefix, _Chunk, _TSum, _O}, Resp)->
case Resp of
{ok, {Offset, Size, File}} ->
Where = #mpb_chunkpos{offset=Offset,
@ -594,7 +732,7 @@ to_pb_response(ReqID, {high_append_chunk, _TODO, _Prefix, _Chunk, _TSum, _CE}, R
_Else ->
make_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {high_write_chunk, _File, _Offset, _Chunk, _TaggedCSum}, Resp) ->
to_pb_response(ReqID, {high_write_chunk, _File, _Offset, _Chunk, _CSum}, Resp) ->
case Resp of
{ok, {_,_,_}} ->
%% machi_cr_client returns ok 2-tuple, convert to simple ok.
@ -607,12 +745,26 @@ to_pb_response(ReqID, {high_write_chunk, _File, _Offset, _Chunk, _TaggedCSum}, R
_Else ->
make_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {high_read_chunk, _File, _Offset, _Size}, Resp) ->
to_pb_response(ReqID, {high_read_chunk, _File, _Offset, _Size, _}, Resp) ->
case Resp of
{ok, Chunk} ->
{ok, {Chunks, Trimmed}} ->
PB_Chunks = lists:map(fun({File, Offset, Bytes, Csum}) ->
{Tag, Ck} = machi_util:unmake_tagged_csum(Csum),
#mpb_chunk{
offset=Offset,
file_name=File,
chunk=Bytes,
csum=#mpb_chunkcsum{type=conv_from_csum_tag(Tag), csum=Ck}}
end, Chunks),
PB_Trimmed = lists:map(fun({File, Offset, Size}) ->
#mpb_chunkpos{file_name=File,
offset=Offset,
chunk_size=Size}
end, Trimmed),
#mpb_response{req_id=ReqID,
read_chunk=#mpb_readchunkresp{status='OK',
chunk=Chunk}};
chunks=PB_Chunks,
trimmed=PB_Trimmed}};
{error, _}=Error ->
Status = conv_from_status(Error),
#mpb_response{req_id=ReqID,
@ -620,6 +772,18 @@ to_pb_response(ReqID, {high_read_chunk, _File, _Offset, _Size}, Resp) ->
_Else ->
make_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {high_trim_chunk, _File, _Offset, _Size}, Resp) ->
case Resp of
ok ->
#mpb_response{req_id=ReqID,
trim_chunk=#mpb_trimchunkresp{status='OK'}};
{error, _}=Error ->
Status = conv_from_status(Error),
#mpb_response{req_id=ReqID,
trim_chunk=#mpb_trimchunkresp{status=Status}};
_Else ->
make_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else]))
end;
to_pb_response(ReqID, {high_checksum_list, _File}, Resp) ->
case Resp of
{ok, Chunk} ->
@ -658,12 +822,12 @@ make_tagged_csum(#mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA', csum=CSum}, _CB) ->
make_ll_error_resp(ReqID, Code, Msg) ->
#mpb_ll_response{req_id=ReqID,
generic=#mpb_errorresp{code=Code,
msg=Msg}}.
msg=Msg}}.
make_error_resp(ReqID, Code, Msg) ->
#mpb_response{req_id=ReqID,
generic=#mpb_errorresp{code=Code,
msg=Msg}}.
msg=Msg}}.
conv_from_epoch_id({Epoch, EpochCSum}) ->
#mpb_epochid{epoch_number=Epoch,
@ -676,28 +840,28 @@ conv_to_epoch_id(#mpb_epochid{epoch_number=Epoch,
conv_to_projection_v1(#mpb_projectionv1{epoch_number=Epoch,
epoch_csum=CSum,
author_server=Author,
chain_name=ChainName,
all_members=AllMembers,
witnesses=Witnesses,
creation_time=CTime,
mode=Mode,
upi=UPI,
repairing=Repairing,
down=Down,
opaque_flap=Flap,
opaque_inner=Inner,
opaque_dbg=Dbg,
opaque_dbg2=Dbg2,
members_dict=MembersDict}) ->
#projection_v1{epoch_number=Epoch,
epoch_csum=CSum,
author_server=to_atom(Author),
chain_name=to_atom(ChainName),
all_members=[to_atom(X) || X <- AllMembers],
witnesses=[to_atom(X) || X <- Witnesses],
creation_time=conv_to_now(CTime),
mode=conv_to_mode(Mode),
upi=[to_atom(X) || X <- UPI],
repairing=[to_atom(X) || X <- Repairing],
down=[to_atom(X) || X <- Down],
flap=dec_optional_sexp(Flap),
inner=dec_optional_sexp(Inner),
dbg=dec_sexp(Dbg),
dbg2=dec_sexp(Dbg2),
members_dict=conv_to_members_dict(MembersDict)}.
@ -708,16 +872,6 @@ enc_sexp(T) ->
dec_sexp(Bin) when is_binary(Bin) ->
binary_to_term(Bin).
enc_optional_sexp(undefined) ->
undefined;
enc_optional_sexp(T) ->
enc_sexp(T).
dec_optional_sexp(undefined) ->
undefined;
dec_optional_sexp(T) ->
dec_sexp(T).
conv_from_members_dict(D) ->
%% Use list_to_binary() here to "flatten" the serialized #p_srvr{}
[#mpb_membersdictentry{key=to_list(K), val=conv_from_p_srvr(V)} ||
@ -821,6 +975,8 @@ conv_from_status({error, not_written}) ->
'NOT_WRITTEN';
conv_from_status({error, written}) ->
'WRITTEN';
conv_from_status({error, trimmed}) ->
'TRIMMED';
conv_from_status({error, no_such_file}) ->
'NO_SUCH_FILE';
conv_from_status({error, partial_read}) ->
@ -828,46 +984,55 @@ conv_from_status({error, partial_read}) ->
conv_from_status({error, bad_epoch}) ->
'BAD_EPOCH';
conv_from_status(_OOPS) ->
io:format(user, "HEY, ~s:~w got ~w\n", [?MODULE, ?LINE, _OOPS]),
io:format(user, "HEY, ~s:~w got ~p\n", [?MODULE, ?LINE, _OOPS]),
'BAD_JOSS'.
conv_to_boolean(undefined) ->
false;
conv_to_boolean(0) ->
false;
conv_to_boolean(N) when is_integer(N) ->
true.
conv_from_append_opts(#append_opts{chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref}) ->
{ChunkExtra, Pref, FailPref}.
conv_from_boolean(false) ->
0;
conv_from_boolean(true) ->
1.
conv_to_append_opts(#mpb_appendchunkreq{
chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref}) ->
#append_opts{chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref};
conv_to_append_opts(#mpb_ll_appendchunkreq{
chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref}) ->
#append_opts{chunk_extra=ChunkExtra,
preferred_file_name=Pref,
flag_fail_preferred=FailPref}.
conv_from_projection_v1(#projection_v1{epoch_number=Epoch,
epoch_csum=CSum,
author_server=Author,
chain_name=ChainName,
all_members=AllMembers,
witnesses=Witnesses,
creation_time=CTime,
mode=Mode,
upi=UPI,
repairing=Repairing,
down=Down,
flap=Flap,
inner=Inner,
dbg=Dbg,
dbg2=Dbg2,
members_dict=MembersDict}) ->
#mpb_projectionv1{epoch_number=Epoch,
epoch_csum=CSum,
author_server=to_list(Author),
chain_name=to_list(ChainName),
all_members=[to_list(X) || X <- AllMembers],
witnesses=[to_list(X) || X <- Witnesses],
creation_time=conv_from_now(CTime),
mode=conv_from_mode(Mode),
upi=[to_list(X) || X <- UPI],
repairing=[to_list(X) || X <- Repairing],
down=[to_list(X) || X <- Down],
opaque_flap=enc_optional_sexp(Flap),
opaque_inner=enc_optional_sexp(Inner),
opaque_dbg=enc_sexp(Dbg),
opaque_dbg2=enc_sexp(Dbg2),
members_dict=conv_from_members_dict(MembersDict)}.

89
src/machi_plist.erl Normal file
View file

@ -0,0 +1,89 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2016 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(machi_plist).
%%% @doc persistent list of binaries
-export([open/2, close/1, find/2, add/2]).
-ifdef(TEST).
-export([all/1]).
-endif.
-record(machi_plist,
{filename :: file:filename_all(),
fd :: file:io_device(),
list = [] :: list(string)}).
-type plist() :: #machi_plist{}.
-export_type([plist/0]).
-spec open(file:filename_all(), proplists:proplist()) ->
{ok, plist()} | {error, file:posix()}.
open(Filename, _Opt) ->
%% TODO: This decode could fail if the file didn't finish writing
%% whole contents, which should be fixed by some persistent
%% solution.
List = case file:read_file(Filename) of
{ok, <<>>} -> [];
{ok, Bin} -> binary_to_term(Bin);
{error, enoent} -> []
end,
case file:open(Filename, [read, write, raw, binary, sync]) of
{ok, Fd} ->
{ok, #machi_plist{filename=Filename,
fd=Fd,
list=List}};
Error ->
Error
end.
-spec close(plist()) -> ok.
close(#machi_plist{fd=Fd}) ->
_ = file:close(Fd).
-spec find(plist(), string()) -> boolean().
find(#machi_plist{list=List}, Name) ->
lists:member(Name, List).
-spec add(plist(), string()) -> {ok, plist()} | {error, file:posix()}.
add(Plist = #machi_plist{list=List0, fd=Fd}, Name) ->
case find(Plist, Name) of
true ->
{ok, Plist};
false ->
List = lists:append(List0, [Name]),
%% TODO: partial write could break the file with other
%% persistent info (even lose data of trimmed states);
%% needs a solution.
case file:pwrite(Fd, 0, term_to_binary(List)) of
ok ->
{ok, Plist#machi_plist{list=List}};
Error ->
Error
end
end.
-ifdef(TEST).
-spec all(plist()) -> [file:filename()].
all(#machi_plist{list=List}) ->
List.
-endif.

View file

@ -29,13 +29,15 @@
update_checksum/1,
update_dbg2/2,
compare/2,
get_epoch_id/1,
make_summary/1,
make_members_dict/1
make_members_dict/1,
make_epoch_id/1
]).
%% @doc Create a new projection record.
new(MyName, MemberDict, UPI_list, Down_list, Repairing_list, Ps) ->
new(MyName, MemberDict, Down_list, UPI_list, Repairing_list, Ps) ->
new(0, MyName, MemberDict, Down_list, UPI_list, Repairing_list, Ps).
%% @doc Create a new projection record.
@ -75,7 +77,6 @@ new(EpochNum, MyName, [_|_] = MembersDict0, Down_list, UPI_list, Repairing_list,
RepairingSet = ordsets:from_list(Repairing_list),
true = ordsets:is_element(MyName, AllSet),
true = (AllSet == ordsets:union([DownSet, UPISet, RepairingSet])),
true = ordsets:is_disjoint(DownSet, UPISet),
true = ordsets:is_disjoint(DownSet, RepairingSet),
true = ordsets:is_disjoint(UPISet, RepairingSet),
@ -110,8 +111,25 @@ new(EpochNum, MyName, [] = _MembersDict0, _Down_list, _UPI_list,_Repairing_list,
%% @doc Update the checksum element of a projection record.
update_checksum(P) ->
%% Fields that we ignore when calculating checksum:
%% * epoch_csum
%% * dbg2: humming consensus participants may modify this at will without
%% voiding the identity of the projection as a whole.
%% * flap: In some cases in CP mode, coode upstream of C120 may have
%% updated the flapping information. That's OK enough: we aren't
%% going to violate chain replication safety rules (or
%% accidentally encourage someone else sometime later) by
%% replacing flapping information with our own local view at
%% this instant in time.
%% * creation_time: With CP mode & inner projections, it's damn annoying
%% to have to copy this around 100% correctly. {sigh}
%% That's a negative state of the code. However, there
%% isn't a safety violation if the creation_time is
%% altered for any reason: it's there only for human
%% benefit for debugging.
CSum = crypto:hash(sha,
term_to_binary(P#projection_v1{epoch_csum= <<>>,
creation_time=undefined,
dbg2=[]})),
P#projection_v1{epoch_csum=CSum}.
@ -138,18 +156,28 @@ compare(#projection_v1{epoch_number=E1},
E1 > E2 -> 1
end.
%% @doc Return the epoch_id of the projection.
get_epoch_id(#projection_v1{epoch_number=Epoch, epoch_csum=CSum}) ->
{Epoch, CSum}.
%% @doc Create a proplist-style summary of a projection record.
make_summary(#projection_v1{epoch_number=EpochNum,
epoch_csum= <<_CSum4:4/binary, _/binary>>,
all_members=_All_list,
mode=CMode,
witnesses=Witness_list,
down=Down_list,
author_server=Author,
upi=UPI_list,
repairing=Repairing_list,
dbg=Dbg, dbg2=Dbg2}) ->
[{epoch,EpochNum},{author,Author},
{upi,UPI_list},{repair,Repairing_list},{down,Down_list},
{d,Dbg}, {d2,Dbg2}].
[{epoch,EpochNum}, {csum,_CSum4},
{all, _All_list},
{author,Author}, {mode,CMode},{witnesses, Witness_list},
{upi,UPI_list},{repair,Repairing_list},{down,Down_list}] ++
[{d,Dbg}, {d2,Dbg2}].
%% @doc Make a `p_srvr_dict()' out of a list of `p_srvr()' or out of a
%% `p_srvr_dict()'.
@ -183,3 +211,6 @@ make_members_dict(Ps) ->
exit({badarg, {make_members_dict, lists:filter(F_neither, Ps)}})
end
end.
make_epoch_id(#projection_v1{epoch_number=Epoch, epoch_csum=CSum}) ->
{Epoch, CSum}.

View file

@ -40,7 +40,15 @@
-module(machi_projection_store).
-include("machi.hrl").
-include("machi_projection.hrl").
-define(V(X,Y), ok).
%% -include("machi_verbose.hrl").
%% -ifdef(PULSE).
%% -compile({parse_transform, pulse_instrument}).
%% -include_lib("pulse_otp/include/pulse_otp.hrl").
%% -endif.
%% API
-export([
@ -52,7 +60,8 @@
get_all_projections/2, get_all_projections/3,
list_all_projections/2, list_all_projections/3
]).
-export([set_wedge_notify_pid/2]).
-export([set_wedge_notify_pid/2, get_wedge_notify_pid/1,
set_consistency_mode/2]).
%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
@ -65,7 +74,8 @@
private_dir = "" :: string(),
wedge_notify_pid :: pid() | atom(),
max_public_epochid = ?NO_EPOCH :: {-1 | non_neg_integer(), binary()},
max_private_epochid = ?NO_EPOCH :: {-1 | non_neg_integer(), binary()}
max_private_epochid = ?NO_EPOCH :: {-1 | non_neg_integer(), binary()},
consistency_mode=ap_mode :: 'ap_mode' | 'cp_mode'
}).
%% @doc Start a new projection store server.
@ -127,6 +137,7 @@ write(PidSpec, ProjType, Proj, Timeout)
is_record(Proj, projection_v1),
is_integer(Proj#projection_v1.epoch_number),
Proj#projection_v1.epoch_number >= 0 ->
testing_sleep_perhaps(),
g_call(PidSpec, {write, ProjType, Proj}, Timeout).
%% @doc Fetch all projection records of type `ProjType'.
@ -152,7 +163,16 @@ list_all_projections(PidSpec, ProjType, Timeout)
g_call(PidSpec, {list_all_projections, ProjType}, Timeout).
set_wedge_notify_pid(PidSpec, NotifyWedgeStateChanges) ->
gen_server:call(PidSpec, {set_wedge_notify_pid, NotifyWedgeStateChanges}).
gen_server:call(PidSpec, {set_wedge_notify_pid, NotifyWedgeStateChanges},
infinity).
get_wedge_notify_pid(PidSpec) ->
gen_server:call(PidSpec, {get_wedge_notify_pid},
infinity).
set_consistency_mode(PidSpec, CMode)
when CMode == ap_mode; CMode == cp_mode ->
gen_server:call(PidSpec, {set_consistency_mode, CMode}, infinity).
%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -217,6 +237,10 @@ handle_call({{list_all_projections, ProjType}, LC1}, _From, S) ->
{reply, {{ok, find_all(Dir)}, LC2}, S};
handle_call({set_wedge_notify_pid, NotifyWedgeStateChanges}, _From, S) ->
{reply, ok, S#state{wedge_notify_pid=NotifyWedgeStateChanges}};
handle_call({get_wedge_notify_pid}, _From, S) ->
{reply, {ok, S#state.wedge_notify_pid}, S};
handle_call({set_consistency_mode, CMode}, _From, S) ->
{reply, ok, S#state{consistency_mode=CMode}};
handle_call(_Request, _From, S) ->
Reply = {whaaaaaaaaaaaaazz, _Request},
{reply, Reply, S}.
@ -254,52 +278,90 @@ do_proj_read(ProjType, Epoch, S_or_Dir) ->
{{error, Else}, S_or_Dir}
end.
do_proj_write(ProjType, #projection_v1{epoch_number=Epoch}=Proj, S) ->
do_proj_write(ProjType, Proj, S) ->
do_proj_write2(ProjType, Proj, S).
do_proj_write2(ProjType, #projection_v1{epoch_csum=CSum}=Proj, S) ->
case (machi_projection:update_checksum(Proj))#projection_v1.epoch_csum of
CSum2 when CSum2 == CSum ->
do_proj_write3(ProjType, Proj, S);
_Else ->
{{error, bad_arg}, S}
end.
do_proj_write3(ProjType, #projection_v1{epoch_number=Epoch,
epoch_csum=CSum}=Proj, S) ->
%% TODO: We probably ought to check the projection checksum for sanity, eh?
Dir = pick_path(ProjType, S),
Path = filename:join(Dir, epoch2name(Epoch)),
case file:read_file_info(Path) of
{ok, _FI} ->
case file:read_file(Path) of
{ok, _Bin} when ProjType == public ->
{{error, written}, S};
{ok, Bin} when ProjType == private ->
#projection_v1{epoch_number=CurEpoch,
epoch_csum=CurCSum} = _CurProj = binary_to_term(Bin),
%% We've already checked that CSum is correct matches the
%% contents of this new projection version. If the epoch_csum
%% values match, and if we trust the value on disk (TODO paranoid
%% check that, also), then the only difference must be the dbg2
%% list, which is ok.
if CurEpoch == Epoch, CurCSum == CSum ->
do_proj_write4(ProjType, Proj, Path, Epoch, S);
true ->
%% io:format(user, "OUCH: on disk: ~w\n", [machi_projection:make_summary(binary_to_term(Bin))]),
%% io:format(user, "OUCH: clobber: ~w\n", [machi_projection:make_summary(Proj)]),
%% io:format(user, "OUCH: clobber: ~p\n", [Proj#projection_v1.dbg2]),
%% {{error, written, CurEpoch, Epoch, CurCSum, CSum}, S}
{{error, written}, S}
end;
{error, enoent} ->
{ok, FH} = file:open(Path, [write, raw, binary]),
ok = file:write(FH, term_to_binary(Proj)),
ok = file:sync(FH),
ok = file:close(FH),
EffectiveProj = machi_chain_manager1:inner_projection_or_self(Proj),
EffectiveEpoch = EffectiveProj#projection_v1.epoch_number,
EpochId = {Epoch, Proj#projection_v1.epoch_csum},
EffectiveEpochId = {EffectiveEpoch, EffectiveProj#projection_v1.epoch_csum},
%%
NewS = if ProjType == public,
Epoch > element(1, S#state.max_public_epochid) ->
if Epoch == EffectiveEpoch ->
%% This is a regular projection, i.e.,
%% does not have an inner proj.
update_wedge_state(
S#state.wedge_notify_pid, true,
EffectiveEpochId);
Epoch /= EffectiveEpoch ->
%% This projection has an inner proj.
%% The outer proj is flapping, so we do
%% not bother wedging.
ok
end,
S#state{max_public_epochid=EpochId};
ProjType == private,
Epoch > element(1, S#state.max_private_epochid) ->
update_wedge_state(
S#state.wedge_notify_pid, false,
EffectiveEpochId),
S#state{max_private_epochid=EpochId};
true ->
S
end,
{ok, NewS};
do_proj_write4(ProjType, Proj, Path, Epoch, S);
{error, Else} ->
{{error, Else}, S}
end.
do_proj_write4(ProjType, Proj, Path, Epoch, #state{consistency_mode=CMode}=S) ->
{{ok, FH}, Epoch, Path} = {file:open(Path, [write, raw, binary]), Epoch, Path},
ok = file:write(FH, term_to_binary(Proj)),
ok = file:sync(FH),
ok = file:close(FH),
EffectiveProj = Proj,
EffectiveEpoch = EffectiveProj#projection_v1.epoch_number,
EpochId = machi_projection:get_epoch_id(Proj),
EffectiveEpochId = machi_projection:get_epoch_id(EffectiveProj),
%%
NewS = if ProjType == public,
Epoch > element(1, S#state.max_public_epochid) ->
if Epoch == EffectiveEpoch ->
%% This is a regular projection, i.e.,
%% does not have an inner proj.
update_wedge_state(
S#state.wedge_notify_pid, true,
EffectiveEpochId);
Epoch /= EffectiveEpoch ->
%% This projection has an inner proj.
%% The outer proj is flapping, so we do
%% not bother wedging.
ok
end,
S#state{max_public_epochid=EpochId};
ProjType == private,
Epoch > element(1, S#state.max_private_epochid) ->
if CMode == ap_mode ->
update_wedge_state(
S#state.wedge_notify_pid, false,
EffectiveEpochId);
true ->
%% If ProjType == private and CMode == cp_mode, then
%% the unwedge action is not performed here!
ok
end,
S#state{max_private_epochid=EpochId};
true ->
S
end,
{ok, NewS}.
update_wedge_state(PidSpec, Boolean, {0,_}=EpochId) ->
%% Epoch #0 is a special case: no projection has been written yet.
%% However, given the way that machi_flu_psup starts the
@ -325,7 +387,6 @@ wait_for_liveness(PidSpec, StartTime, WaitTime) ->
undefined ->
case timer:now_diff(os:timestamp(), StartTime) div 1000 of
X when X < WaitTime ->
io:format(user, "\nYOO ~p ~p\n", [PidSpec, lists:sort(registered())]),
timer:sleep(1),
wait_for_liveness(PidSpec, StartTime, WaitTime)
end;
@ -371,6 +432,18 @@ lclock_get() ->
lclock_update(LC) ->
lamport_clock:update(LC).
testing_sleep_perhaps() ->
try
[{_,Max}] = ets:lookup(?TEST_ETS_TABLE, projection_store_sleep_time),
MSec = random:uniform(Max),
io:format(user, "{", []),
timer:sleep(MSec),
io:format(user, "}", []),
ok
catch _X:_Y ->
ok
end.
-else. % TEST
lclock_init() ->
@ -382,4 +455,7 @@ lclock_get() ->
lclock_update(_LC) ->
ok.
testing_sleep_perhaps() ->
ok.
-endif. % TEST

View file

@ -22,6 +22,10 @@
%% proxy-process style API for hiding messy details such as TCP
%% connection/disconnection with the remote Machi server.
%%
%% Please see {@link machi_flu1_client} the "Client API implemntation notes"
%% section for how this module relates to the rest of the client API
%% implementation.
%%
%% Machi is intentionally avoiding using distributed Erlang for
%% Machi's communication. This design decision makes Erlang-side code
%% more difficult &amp; complex, but it's the price to pay for some
@ -47,16 +51,19 @@
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-ifdef(PULSE).
-compile({parse_transform, pulse_instrument}).
-include_lib("pulse_otp/include/pulse_otp.hrl").
-endif.
-endif. % TEST.
-export([start_link/1]).
%% FLU1 API
-export([
%% File API
append_chunk/4, append_chunk/5,
append_chunk_extra/5, append_chunk_extra/6,
read_chunk/5, read_chunk/6,
checksum_list/3, checksum_list/4,
append_chunk/6, append_chunk/8,
read_chunk/7, read_chunk/8,
checksum_list/2, checksum_list/3,
list_files/2, list_files/3,
wedge_status/1, wedge_status/2,
@ -68,12 +75,14 @@
write_projection/3, write_projection/4,
get_all_projections/2, get_all_projections/3,
list_all_projections/2, list_all_projections/3,
kick_projection_reaction/2, kick_projection_reaction/3,
%% Common API
quit/1,
%% Internal API
write_chunk/5, write_chunk/6,
write_chunk/7, write_chunk/8,
trim_chunk/6, trim_chunk/7,
%% Helpers
stop_proxies/1, start_proxies/1
@ -98,51 +107,39 @@ start_link(#p_srvr{}=I) ->
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
append_chunk(PidSpec, EpochID, Prefix, Chunk) ->
append_chunk(PidSpec, EpochID, Prefix, Chunk, infinity).
append_chunk(PidSpec, NSInfo, EpochID, Prefix, Chunk, CSum) ->
append_chunk(PidSpec, NSInfo, EpochID, Prefix, Chunk, CSum,
#append_opts{}, infinity).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
append_chunk(PidSpec, EpochID, Prefix, Chunk, Timeout) ->
gen_server:call(PidSpec, {req, {append_chunk, EpochID, Prefix, Chunk}},
Timeout).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
append_chunk_extra(PidSpec, EpochID, Prefix, Chunk, ChunkExtra)
when is_integer(ChunkExtra), ChunkExtra >= 0 ->
append_chunk_extra(PidSpec, EpochID, Prefix, Chunk, ChunkExtra, infinity).
%% @doc Append a chunk (binary- or iolist-style) of data to a file
%% with `Prefix'.
append_chunk_extra(PidSpec, EpochID, Prefix, Chunk, ChunkExtra, Timeout) ->
gen_server:call(PidSpec, {req, {append_chunk_extra, EpochID, Prefix,
Chunk, ChunkExtra}},
append_chunk(PidSpec, NSInfo, EpochID, Prefix, Chunk, CSum, Opts,
Timeout) ->
gen_server:call(PidSpec, {req, {append_chunk, NSInfo, EpochID,
Prefix, Chunk, CSum, Opts, Timeout}},
Timeout).
%% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
read_chunk(PidSpec, EpochID, File, Offset, Size) ->
read_chunk(PidSpec, EpochID, File, Offset, Size, infinity).
read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, Opts) ->
read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, Opts, infinity).
%% @doc Read a chunk of data of size `Size' from `File' at `Offset'.
read_chunk(PidSpec, EpochID, File, Offset, Size, Timeout) ->
gen_server:call(PidSpec, {req, {read_chunk, EpochID, File, Offset, Size}},
read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, Opts, Timeout) ->
gen_server:call(PidSpec, {req, {read_chunk, NSInfo, EpochID, File, Offset, Size, Opts}},
Timeout).
%% @doc Fetch the list of chunk checksums for `File'.
checksum_list(PidSpec, EpochID, File) ->
checksum_list(PidSpec, EpochID, File, infinity).
checksum_list(PidSpec, File) ->
checksum_list(PidSpec, File, infinity).
%% @doc Fetch the list of chunk checksums for `File'.
checksum_list(PidSpec, EpochID, File, Timeout) ->
gen_server:call(PidSpec, {req, {checksum_list, EpochID, File}},
checksum_list(PidSpec, File, Timeout) ->
gen_server:call(PidSpec, {req, {checksum_list, File}},
Timeout).
%% @doc Fetch the list of all files on the remote FLU.
@ -218,8 +215,26 @@ write_projection(PidSpec, ProjType, Proj) ->
%% @doc Write a projection `Proj' of type `ProjType'.
write_projection(PidSpec, ProjType, Proj, Timeout) ->
gen_server:call(PidSpec, {req, {write_projection, ProjType, Proj}},
Timeout).
case gen_server:call(PidSpec, {req, {write_projection, ProjType, Proj}},
Timeout) of
{error, written}=Err ->
Epoch = Proj#projection_v1.epoch_number,
case read_projection(PidSpec, ProjType, Epoch, Timeout) of
{ok, Proj2} when Proj2 == Proj ->
%% The proxy made (at least) two attempts to write
%% this projection. An earlier one appeared to
%% have failed, so the proxy retried. The later
%% attempt returned to us {error,written} because
%% the earlier attempt was actually received &
%% processed by the server. So, we consider this
%% a successful write.
ok;
_ ->
Err
end;
Else ->
Else
end.
%% @doc Get all projections from the FLU's projection store.
@ -243,6 +258,19 @@ list_all_projections(PidSpec, ProjType, Timeout) ->
gen_server:call(PidSpec, {req, {list_all_projections, ProjType}},
Timeout).
%% @doc Kick (politely) the remote chain manager to react to a
%% projection change.
kick_projection_reaction(PidSpec, Options) ->
kick_projection_reaction(PidSpec, Options, infinity).
%% @doc Kick (politely) the remote chain manager to react to a
%% projection change.
kick_projection_reaction(PidSpec, Options, Timeout) ->
gen_server:call(PidSpec, {req, {kick_projection_reaction, Options}},
Timeout).
%% @doc Quit &amp; close the connection to remote FLU and stop our
%% proxy process.
@ -252,14 +280,38 @@ quit(PidSpec) ->
%% @doc Write a chunk (binary- or iolist-style) of data to a file
%% with `Prefix' at `Offset'.
write_chunk(PidSpec, EpochID, File, Offset, Chunk) ->
write_chunk(PidSpec, EpochID, File, Offset, Chunk, infinity).
write_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, CSum) ->
write_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, CSum, infinity).
%% @doc Write a chunk (binary- or iolist-style) of data to a file
%% with `Prefix' at `Offset'.
write_chunk(PidSpec, EpochID, File, Offset, Chunk, Timeout) ->
gen_server:call(PidSpec, {req, {write_chunk, EpochID, File, Offset, Chunk}},
write_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, CSum, Timeout) ->
case gen_server:call(PidSpec, {req, {write_chunk, NSInfo, EpochID, File, Offset, Chunk, CSum}},
Timeout) of
{error, written}=Err ->
Size = byte_size(Chunk),
case read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, undefined, Timeout) of
{ok, {[{File, Offset, Chunk2, _}], []}} when Chunk2 == Chunk ->
%% See equivalent comment inside write_projection().
ok;
_ ->
Err
end;
Else ->
Else
end.
trim_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size) ->
trim_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, infinity).
%% @doc Write a chunk (binary- or iolist-style) of data to a file
%% with `Prefix' at `Offset'.
trim_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, Timeout) ->
gen_server:call(PidSpec,
{req, {trim_chunk, NSInfo, EpochID, File, Offset, Chunk}},
Timeout).
%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -323,21 +375,24 @@ do_req_retry(_Req, 2, Err, S) ->
do_req_retry(Req, Depth, _Err, S) ->
do_req(Req, Depth + 1, try_connect(disconnect(S))).
make_req_fun({append_chunk, EpochID, Prefix, Chunk},
make_req_fun({append_chunk, NSInfo, EpochID,
Prefix, Chunk, CSum, Opts, Timeout},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:append_chunk(Sock, EpochID, Prefix, Chunk) end;
make_req_fun({append_chunk_extra, EpochID, Prefix, Chunk, ChunkExtra},
fun() -> Mod:append_chunk(Sock, NSInfo, EpochID,
Prefix, Chunk, CSum, Opts, Timeout)
end;
make_req_fun({read_chunk, NSInfo, EpochID, File, Offset, Size, Opts},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:append_chunk_extra(Sock, EpochID, Prefix, Chunk, ChunkExtra) end;
make_req_fun({read_chunk, EpochID, File, Offset, Size},
fun() -> Mod:read_chunk(Sock, NSInfo, EpochID, File, Offset, Size, Opts) end;
make_req_fun({write_chunk, NSInfo, EpochID, File, Offset, Chunk, CSum},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:read_chunk(Sock, EpochID, File, Offset, Size) end;
make_req_fun({write_chunk, EpochID, File, Offset, Chunk},
fun() -> Mod:write_chunk(Sock, NSInfo, EpochID, File, Offset, Chunk, CSum) end;
make_req_fun({trim_chunk, NSInfo, EpochID, File, Offset, Size},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:write_chunk(Sock, EpochID, File, Offset, Chunk) end;
make_req_fun({checksum_list, EpochID, File},
fun() -> Mod:trim_chunk(Sock, NSInfo, EpochID, File, Offset, Size) end;
make_req_fun({checksum_list, File},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:checksum_list(Sock, EpochID, File) end;
fun() -> Mod:checksum_list(Sock, File) end;
make_req_fun({list_files, EpochID},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:list_files(Sock, EpochID) end;
@ -349,8 +404,7 @@ make_req_fun({get_epoch_id},
fun() -> case Mod:read_latest_projection(Sock, private) of
{ok, P} ->
#projection_v1{epoch_number=Epoch,
epoch_csum=CSum} =
machi_chain_manager1:inner_projection_or_self(P),
epoch_csum=CSum} = P,
{ok, {Epoch, CSum}};
Error ->
Error
@ -373,7 +427,10 @@ make_req_fun({get_all_projections, ProjType},
fun() -> Mod:get_all_projections(Sock, ProjType) end;
make_req_fun({list_all_projections, ProjType},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:list_all_projections(Sock, ProjType) end.
fun() -> Mod:list_all_projections(Sock, ProjType) end;
make_req_fun({kick_projection_reaction, Options},
#state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) ->
fun() -> Mod:kick_projection_reaction(Sock, Options) end.
connected_p(#state{sock=SockMaybe,
i=#p_srvr{proto_mod=Mod}=_I}=_S) ->

View file

@ -1,194 +0,0 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% @doc "Mothballed" sequencer code, perhaps to be reused sometime in
%% the future?
-module(machi_sequencer).
-compile(export_all).
-include_lib("kernel/include/file.hrl").
-define(CONFIG_DIR, "./config").
-define(DATA_DIR, "./data").
seq(Server, Prefix, Size) when is_binary(Prefix), is_integer(Size), Size > -1 ->
Server ! {seq, self(), Prefix, Size},
receive
{assignment, File, Offset} ->
{File, Offset}
after 1*1000 ->
bummer
end.
seq_direct(Prefix, Size) when is_binary(Prefix), is_integer(Size), Size > -1 ->
RegName = make_regname(Prefix),
seq(RegName, Prefix, Size).
start_server() ->
start_server(?MODULE).
start_server(Name) ->
spawn_link(fun() -> run_server(Name) end).
run_server(Name) ->
register(Name, self()),
ets:new(?MODULE, [named_table, public, {write_concurrency, true}]),
server_loop().
server_loop() ->
receive
{seq, From, Prefix, Size} ->
spawn(fun() -> server_dispatch(From, Prefix, Size) end),
server_loop()
end.
server_dispatch(From, Prefix, Size) ->
RegName = make_regname(Prefix),
case whereis(RegName) of
undefined ->
start_prefix_server(Prefix),
timer:sleep(1),
server_dispatch(From, Prefix, Size);
Pid ->
Pid ! {seq, From, Prefix, Size}
end,
exit(normal).
start_prefix_server(Prefix) ->
spawn(fun() -> run_prefix_server(Prefix) end).
run_prefix_server(Prefix) ->
true = register(make_regname(Prefix), self()),
ok = filelib:ensure_dir(?CONFIG_DIR ++ "/unused"),
ok = filelib:ensure_dir(?DATA_DIR ++ "/unused"),
FileNum = read_max_filenum(Prefix) + 1,
ok = increment_max_filenum(Prefix),
prefix_server_loop(Prefix, FileNum).
prefix_server_loop(Prefix, FileNum) ->
File = make_data_filename(Prefix, FileNum),
prefix_server_loop(Prefix, File, FileNum, 0).
prefix_server_loop(Prefix, File, FileNum, Offset) ->
receive
{seq, From, Prefix, Size} ->
From ! {assignment, File, Offset},
prefix_server_loop(Prefix, File, FileNum, Offset + Size)
after 30*1000 ->
io:format("timeout: ~p server stopping\n", [Prefix]),
exit(normal)
end.
make_regname(Prefix) ->
erlang:binary_to_atom(Prefix, latin1).
make_config_filename(Prefix) ->
lists:flatten(io_lib:format("~s/~s", [?CONFIG_DIR, Prefix])).
make_data_filename(Prefix, FileNum) ->
erlang:iolist_to_binary(io_lib:format("~s/~s.~w",
[?DATA_DIR, Prefix, FileNum])).
read_max_filenum(Prefix) ->
case file:read_file_info(make_config_filename(Prefix)) of
{error, enoent} ->
0;
{ok, FI} ->
FI#file_info.size
end.
increment_max_filenum(Prefix) ->
{ok, FH} = file:open(make_config_filename(Prefix), [append]),
ok = file:write(FH, "x"),
%% ok = file:sync(FH),
ok = file:close(FH).
%%%%%%%%%%%%%%%%%
%% basho_bench callbacks
-define(SEQ, ?MODULE).
new(1) ->
start_server(),
timer:sleep(100),
{ok, unused};
new(_Id) ->
{ok, unused}.
run(null, _KeyGen, _ValgueGen, State) ->
{ok, State};
run(keygen_then_null, KeyGen, _ValgueGen, State) ->
_Prefix = KeyGen(),
{ok, State};
run(seq, KeyGen, _ValgueGen, State) ->
Prefix = KeyGen(),
{_, _} = ?SEQ:seq(?SEQ, Prefix, 1),
{ok, State};
run(seq_direct, KeyGen, _ValgueGen, State) ->
Prefix = KeyGen(),
Name = ?SEQ:make_regname(Prefix),
case get(Name) of
undefined ->
case whereis(Name) of
undefined ->
{_, _} = ?SEQ:seq(?SEQ, Prefix, 1);
Pid ->
put(Name, Pid),
{_, _} = ?SEQ:seq(Pid, Prefix, 1)
end;
Pid ->
{_, _} = ?SEQ:seq(Pid, Prefix, 1)
end,
{ok, State};
run(seq_ets, KeyGen, _ValgueGen, State) ->
Tab = ?MODULE,
Prefix = KeyGen(),
Res = try
BigNum = ets:update_counter(Tab, Prefix, 1),
BigBin = <<BigNum:80/big>>,
<<FileNum:32/big, Offset:48/big>> = BigBin,
%% if Offset rem 1000 == 0 ->
%% io:format("~p,~p ", [FileNum, Offset]);
%% true ->
%% ok
%% end,
{fakefake, FileNum, Offset}
catch error:badarg ->
FileNum2 = 1, Offset2 = 0,
FileBin = <<FileNum2:32/big>>,
OffsetBin = <<Offset2:48/big>>,
Glop = <<FileBin/binary, OffsetBin/binary>>,
<<Base:80/big>> = Glop,
%% if Prefix == <<"42">> -> io:format("base:~w\n", [Base]); true -> ok end,
%% Base = 0,
case ets:insert_new(Tab, {Prefix, Base}) of
true ->
{<<"fakefakefake">>, Base};
false ->
Result2 = ets:update_counter(Tab, Prefix, 1),
{<<"fakefakefake">>, Result2}
end
end,
Res = Res,
{ok, State}.

View file

@ -27,6 +27,14 @@
-behaviour(supervisor).
-ifdef(PULSE).
-compile({parse_transform, pulse_instrument}).
-include_lib("pulse_otp/include/pulse_otp.hrl").
-define(SHUTDOWN, infinity).
-else.
-define(SHUTDOWN, 5000).
-endif.
%% API
-export([start_link/0]).
@ -46,15 +54,22 @@ init([]) ->
SupFlags = {RestartStrategy, MaxRestarts, MaxSecondsBetweenRestarts},
Restart = permanent,
Shutdown = 5000,
Shutdown = ?SHUTDOWN,
Type = supervisor,
ServerSup =
{machi_flu_sup, {machi_flu_sup, start_link, []},
Restart, Shutdown, Type, []},
{ok, {SupFlags, [ServerSup]}}.
%% AChild = {'AName', {'AModule', start_link, []},
%% Restart, Shutdown, Type, ['AModule']},
%% {ok, {SupFlags, [AChild]}}.
RanchSup = {ranch_sup, {ranch_sup, start_link, []},
Restart, Shutdown, supervisor, [ranch_sup]},
LifecycleMgr =
{machi_lifecycle_mgr, {machi_lifecycle_mgr, start_link, []},
Restart, Shutdown, worker, []},
RunningApps = [A || {A,_D,_V} <- application:which_applications()],
Specs = case lists:member(ranch, RunningApps) of
true ->
[ServerSup, LifecycleMgr];
false ->
[ServerSup, RanchSup, LifecycleMgr]
end,
{ok, {SupFlags, Specs}}.

View file

@ -25,22 +25,35 @@
-export([
checksum_chunk/1,
make_tagged_csum/1, make_tagged_csum/2,
make_client_csum/1,
unmake_tagged_csum/1,
hexstr_to_bin/1, bin_to_hexstr/1,
hexstr_to_int/1, int_to_hexstr/2, int_to_hexbin/2,
make_binary/1, make_string/1,
make_regname/1,
make_config_filename/2,
make_config_filename/4, make_config_filename/2,
make_checksum_filename/4, make_checksum_filename/2,
make_data_filename/4, make_data_filename/2,
make_projection_filename/2,
read_max_filenum/2, increment_max_filenum/2,
make_data_filename/6, make_data_filename/2,
make_projection_filename/2,
is_valid_filename/1,
parse_filename/1,
read_max_filenum/4, increment_max_filenum/4,
info_msg/2, verb/1, verb/2,
mbytes/1,
mbytes/1,
pretty_time/0, pretty_time/2,
%% TCP protocol helpers
connect/2, connect/3
connect/2, connect/3,
%% List twiddling
permutations/1, perms/1,
combinations/1, ordered_combinations/1,
mk_order/2,
%% Other
wait_for_death/2, wait_for_life/2,
bool2int/1,
int2bool/1,
read_opts_default/1,
ns_info_default/1
]).
-compile(export_all).
-include("machi.hrl").
-include("machi_projection.hrl").
@ -58,17 +71,27 @@ make_regname(Prefix) when is_list(Prefix) ->
%% @doc Calculate a config file path, by common convention.
-spec make_config_filename(string(), machi_dt:namespace(), machi_dt:locator(), string()) ->
string().
make_config_filename(DataDir, NS, NSLocator, Prefix) ->
NSLocator_str = int_to_hexstr(NSLocator, 32),
lists:flatten(io_lib:format("~s/config/~s^~s^~s",
[DataDir, Prefix, NS, NSLocator_str])).
%% @doc Calculate a config file path, by common convention.
-spec make_config_filename(string(), string()) ->
string().
make_config_filename(DataDir, Prefix) ->
lists:flatten(io_lib:format("~s/config/~s", [DataDir, Prefix])).
make_config_filename(DataDir, Filename) ->
lists:flatten(io_lib:format("~s/config/~s",
[DataDir, Filename])).
%% @doc Calculate a checksum file path, by common convention.
-spec make_checksum_filename(string(), string(), atom()|string()|binary(), integer()) ->
string().
make_checksum_filename(DataDir, Prefix, SequencerName, FileNum) ->
lists:flatten(io_lib:format("~s/config/~s.~s.~w.csum",
lists:flatten(io_lib:format("~s/config/~s^~s^~w.csum",
[DataDir, Prefix, SequencerName, FileNum])).
%% @doc Calculate a checksum file path, by common convention.
@ -82,11 +105,22 @@ make_checksum_filename(DataDir, FileName) ->
%% @doc Calculate a file data file path, by common convention.
-spec make_data_filename(string(), string(), atom()|string()|binary(), integer()) ->
-spec make_data_filename(string(), machi_dt:namespace(), machi_dt:locator(), string(), atom()|string()|binary(), integer()|string()) ->
{binary(), string()}.
make_data_filename(DataDir, Prefix, SequencerName, FileNum) ->
File = erlang:iolist_to_binary(io_lib:format("~s.~s.~w",
[Prefix, SequencerName, FileNum])),
make_data_filename(DataDir, NS, NSLocator, Prefix, SequencerName, FileNum)
when is_integer(FileNum) ->
NSLocator_str = int_to_hexstr(NSLocator, 32),
File = erlang:iolist_to_binary(io_lib:format("~s^~s^~s^~s^~w",
[Prefix, NS, NSLocator_str, SequencerName, FileNum])),
make_data_filename2(DataDir, File);
make_data_filename(DataDir, NS, NSLocator, Prefix, SequencerName, String)
when is_list(String) ->
NSLocator_str = int_to_hexstr(NSLocator, 32),
File = erlang:iolist_to_binary(io_lib:format("~s^~s^~s^~s^~s",
[Prefix, NS, NSLocator_str, SequencerName, string])),
make_data_filename2(DataDir, File).
make_data_filename2(DataDir, File) ->
FullPath = lists:flatten(io_lib:format("~s/data/~s", [DataDir, File])),
{File, FullPath}.
@ -110,13 +144,49 @@ make_projection_filename(DataDir, "") ->
make_projection_filename(DataDir, File) ->
lists:flatten(io_lib:format("~s/projection/~s", [DataDir, File])).
%% @doc Given a filename, return true if it is a valid machi filename,
%% false otherwise.
-spec is_valid_filename( Filename :: string() ) -> true | false.
is_valid_filename(Filename) ->
case parse_filename(Filename) of
{} -> false;
{_,_,_,_,_} -> true
end.
%% @doc Given a machi filename, return a set of components in a list.
%% The components will be:
%% <ul>
%% <li>Prefix</li>
%% <li>Cluster namespace</li>
%% <li>Cluster locator</li>
%% <li>UUID</li>
%% <li>Sequence number</li>
%% </ul>
%%
%% Invalid filenames will return an empty list.
-spec parse_filename( Filename :: string() ) -> {} | {string(), machi_dt:namespace(), machi_dt:locator(), string(), string() }.
parse_filename(Filename) ->
case string:tokens(Filename, "^") of
[Prefix, NS, NSLocator, UUID, SeqNo] ->
{Prefix, NS, list_to_integer(NSLocator), UUID, SeqNo};
[Prefix, NSLocator, UUID, SeqNo] ->
%% string:tokens() doesn't consider "foo^^bar" as 3 tokens {sigh}
case re:replace(Filename, "[^^]+", "x", [global,{return,binary}]) of
<<"x^^x^x^x">> ->
{Prefix, <<"">>, list_to_integer(NSLocator), UUID, SeqNo};
_ ->
{}
end;
_ -> {}
end.
%% @doc Read the file size of a config file, which is used as the
%% basis for a minimum sequence number.
-spec read_max_filenum(string(), string()) ->
-spec read_max_filenum(string(), machi_dt:namespace(), machi_dt:locator(), string()) ->
non_neg_integer().
read_max_filenum(DataDir, Prefix) ->
case file:read_file_info(make_config_filename(DataDir, Prefix)) of
read_max_filenum(DataDir, NS, NSLocator, Prefix) ->
case file:read_file_info(make_config_filename(DataDir, NS, NSLocator, Prefix)) of
{error, enoent} ->
0;
{ok, FI} ->
@ -126,13 +196,13 @@ read_max_filenum(DataDir, Prefix) ->
%% @doc Increase the file size of a config file, which is used as the
%% basis for a minimum sequence number.
-spec increment_max_filenum(string(), string()) ->
-spec increment_max_filenum(string(), machi_dt:namespace(), machi_dt:locator(), string()) ->
ok | {error, term()}.
increment_max_filenum(DataDir, Prefix) ->
increment_max_filenum(DataDir, NS, NSLocator, Prefix) ->
try
{ok, FH} = file:open(make_config_filename(DataDir, Prefix), [append]),
{ok, FH} = file:open(make_config_filename(DataDir, NS, NSLocator, Prefix), [append]),
ok = file:write(FH, "x"),
%% ok = file:sync(FH),
ok = file:sync(FH),
ok = file:close(FH)
catch
error:{badmatch,_}=Error ->
@ -219,22 +289,48 @@ int_to_hexbin(I, I_size) ->
checksum_chunk(Chunk) when is_binary(Chunk); is_list(Chunk) ->
crypto:hash(sha, Chunk).
convert_csum_tag(A) when is_atom(A)->
A;
convert_csum_tag(?CSUM_TAG_NONE) ->
?CSUM_TAG_NONE_ATOM;
convert_csum_tag(?CSUM_TAG_CLIENT_SHA) ->
?CSUM_TAG_CLIENT_SHA_ATOM;
convert_csum_tag(?CSUM_TAG_SERVER_SHA) ->
?CSUM_TAG_SERVER_SHA_ATOM;
convert_csum_tag(?CSUM_TAG_SERVER_REGEN_SHA) ->
?CSUM_TAG_SERVER_REGEN_SHA_ATOM.
%% @doc Create a tagged checksum
make_tagged_csum(none) ->
<<?CSUM_TAG_NONE:8>>;
make_tagged_csum({Tag, CSum}) ->
make_tagged_csum(Tag, CSum).
make_tagged_csum(none, _SHA) ->
make_tagged_csum(<<>>) ->
<<?CSUM_TAG_NONE:8>>;
make_tagged_csum(client_sha, SHA) ->
make_tagged_csum({Tag, CSum}) ->
make_tagged_csum(convert_csum_tag(Tag), CSum).
%% @doc Makes tagged csum. Each meanings are:
%% none / ?CSUM_TAG_NONE
%% - a suspicious and nonsense checksum
%% client_sha / ?CSUM_TAG_CLIENT_SHA
%% - a valid checksum given by client and stored in server
%% server_sha / ?CSUM_TAG_SERVER_SHA
%% - a valid checksum generated by and stored in server
%% server_regen_sha / ?CSUM_TAG_SERVER_REGEN_SHA
%% - a valid checksum generated by server in an ad hoc manner, not stored in server
-spec make_tagged_csum(machi_dt:csum_tag(), binary()) -> machi_dt:chunk_csum().
make_tagged_csum(?CSUM_TAG_NONE_ATOM, _SHA) ->
<<?CSUM_TAG_NONE:8>>;
make_tagged_csum(?CSUM_TAG_CLIENT_SHA_ATOM, SHA) ->
<<?CSUM_TAG_CLIENT_SHA:8, SHA/binary>>;
make_tagged_csum(server_sha, SHA) ->
make_tagged_csum(?CSUM_TAG_SERVER_SHA_ATOM, SHA) ->
<<?CSUM_TAG_SERVER_SHA:8, SHA/binary>>;
make_tagged_csum(server_regen_sha, SHA) ->
make_tagged_csum(?CSUM_TAG_SERVER_REGEN_SHA_ATOM, SHA) ->
<<?CSUM_TAG_SERVER_REGEN_SHA:8, SHA/binary>>.
make_client_csum(BinOrList) ->
make_tagged_csum(?CSUM_TAG_CLIENT_SHA_ATOM, checksum_chunk(BinOrList)).
unmake_tagged_csum(<<Tag:8, Rest/binary>>) ->
{Tag, Rest}.
@ -258,6 +354,15 @@ mbytes(0) ->
mbytes(Size) ->
lists:flatten(io_lib:format("~.1.0f", [max(0.1, Size / (1024*1024))])).
pretty_time() ->
{_,_,C} = os:timestamp(),
MSec = trunc(C / 1000),
pretty_time(time(), MSec).
pretty_time({HH,MM,SS}, MSec) ->
lists:flatten(
io_lib:format("~2..0w:~2..0w:~2..0w.~3..0w", [HH, MM, SS, MSec])).
%% @doc Log an 'info' level message.
-spec info_msg(string(), list()) -> term().
@ -272,11 +377,22 @@ wait_for_death(Pid, Iters) when is_pid(Pid) ->
case erlang:is_process_alive(Pid) of
false ->
ok;
true ->
timer:sleep(1),
true ->
timer:sleep(10),
wait_for_death(Pid, Iters-1)
end.
wait_for_life(Reg, 0) ->
exit({not_alive_yet, Reg});
wait_for_life(Reg, Iters) when is_atom(Reg) ->
case erlang:whereis(Reg) of
Pid when is_pid(Pid) ->
{ok, Pid};
_ ->
timer:sleep(1),
wait_for_life(Reg, Iters-1)
end.
%%%%%%%%%%%%%%%%%
%% @doc Create a TCP connection to a remote Machi server.
@ -301,3 +417,49 @@ escript_connect(Host, Port, Timeout) when is_integer(Port) ->
{ok, Sock} = gen_tcp:connect(Host, Port, [{active,false}, {mode,binary},
{packet, raw}], Timeout),
Sock.
permutations(L) ->
perms(L).
perms([]) -> [[]];
perms(L) -> [[H|T] || H <- L, T <- perms(L--[H])].
combinations(L) ->
lists:usort(perms(L) ++ lists:append([ combinations(L -- [X]) || X <- L])).
ordered_combinations(Master) ->
[L || L <- combinations(Master), is_ordered(L, Master)].
is_ordered(L, Reference) ->
L_order = mk_order(L, Reference),
lists:all(fun(X) -> is_integer(X) end, L_order) andalso
L_order == lists:sort(L_order).
mk_order(UPI2, Repair1) ->
R1 = length(Repair1),
Repair1_order_d = orddict:from_list(lists:zip(Repair1, lists:seq(1, R1))),
UPI2_order = [case orddict:find(X, Repair1_order_d) of
{ok, Idx} -> Idx;
error -> error
end || X <- UPI2],
UPI2_order.
%% C-style conversion for PB usage.
bool2int(true) -> 1;
bool2int(false) -> 0.
int2bool(0) -> false;
int2bool(I) when is_integer(I) -> true.
read_opts_default(#read_opts{}=NSInfo) ->
NSInfo;
read_opts_default(A) when A == 'undefined'; A == 'noopt'; A == 'none' ->
#read_opts{};
read_opts_default(A) when is_atom(A) ->
#read_opts{}.
ns_info_default(#ns_info{}=NSInfo) ->
NSInfo;
ns_info_default(A) when is_atom(A) ->
#ns_info{}.

View file

@ -22,6 +22,8 @@
-module(machi_yessir_client).
-ifdef(TODO_refactoring_deferred).
-include("machi.hrl").
-include("machi_projection.hrl").
@ -30,7 +32,7 @@
append_chunk/4, append_chunk/5,
append_chunk_extra/5, append_chunk_extra/6,
read_chunk/5, read_chunk/6,
checksum_list/3, checksum_list/4,
checksum_list/2, checksum_list/3,
list_files/2, list_files/3,
wedge_status/1, wedge_status/2,
@ -173,24 +175,24 @@ read_chunk(_Host, _TcpPort, EpochID, File, Offset, Size)
%% @doc Fetch the list of chunk checksums for `File'.
checksum_list(#yessir{name=Name,chunk_size=ChunkSize}, _EpochID, File) ->
checksum_list(#yessir{name=Name,chunk_size=ChunkSize}, File) ->
case get({Name,offset,File}) of
undefined ->
{error, no_such_file};
MaxOffset ->
C = machi_util:make_tagged_csum(client_sha,
make_csum(Name, ChunkSize)),
Cs = [machi_flu1:encode_csum_file_entry_bin(Offset, ChunkSize, C) ||
Offset <- lists:seq(?MINIMUM_OFFSET, MaxOffset, ChunkSize)],
{ok, Cs}
Cs = [{Offset, ChunkSize, C} ||
Offset <- lists:seq(?MINIMUM_OFFSET, MaxOffset, ChunkSize)],
{ok, term_to_binary(Cs)}
end.
%% @doc Fetch the list of chunk checksums for `File'.
checksum_list(_Host, _TcpPort, EpochID, File) ->
checksum_list(_Host, _TcpPort, File) ->
Sock = connect(#p_srvr{proto_mod=?MODULE}),
try
checksum_list(Sock, EpochID, File)
checksum_list(Sock, File)
after
disconnect(Sock)
end.
@ -450,18 +452,18 @@ connect(#p_srvr{name=Name, props=Props})->
chunk_size=ChunkSize
},
%% Add fake dict entries for these files
[begin
Prefix = list_to_binary(io_lib:format("fake~w", [X])),
{ok, _} = append_chunk_extra(Sock, {1,<<"unused">>}, Prefix, <<>>, FileSize)
end || X <- lists:seq(1, NumFiles)],
_ = [begin
Prefix = list_to_binary(io_lib:format("fake~w", [X])),
{ok, _} = append_chunk_extra(Sock, {1,<<"unused">>}, Prefix, <<>>, FileSize)
end || X <- lists:seq(1, NumFiles)],
Sock.
disconnect(#yessir{name=Name}) ->
[erase(K) || {{N,offset,_}=K, _V} <- get(), N == Name],
[erase(K) || {{N,chunk,_}=K, _V} <- get(), N == Name],
[erase(K) || {{N,csum,_}=K, _V} <- get(), N == Name],
[erase(K) || {{N,proj,_,_}=K, _V} <- get(), N == Name],
_ = [erase(K) || {{N,offset,_}=K, _V} <- get(), N == Name],
_ = [erase(K) || {{N,chunk,_}=K, _V} <- get(), N == Name],
_ = [erase(K) || {{N,csum,_}=K, _V} <- get(), N == Name],
_ = [erase(K) || {{N,proj,_,_}=K, _V} <- get(), N == Name],
ok.
%% Example use:
@ -509,3 +511,5 @@ disconnect(#yessir{name=Name}) ->
%% =INFO REPORT==== 17-May-2015::18:57:52 ===
%% Repair success: tail a of [a] finished ap_mode repair ID {a,{1431,856671,140404}}: ok
%% Stats [{t_in_files,0},{t_in_chunks,10413},{t_in_bytes,682426368},{t_out_files,0},{t_out_chunks,10413},{t_out_bytes,682426368},{t_bad_chunks,0},{t_elapsed_seconds,1.591}]
-endif. % TODO_refactoring_deferred

View file

@ -0,0 +1,357 @@
%% -------------------------------------------------------------------
%%
%% Machi: a small village of replicated files
%%
%% Copyright (c) 2014-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(chain_mgr_legacy).
-compile(export_all).
-include("machi_projection.hrl").
-include("machi_chain_manager.hrl").
-define(RETURN1(X), begin put(why1, [?LINE|get(why1)]), X end).
%% This is old code, and it's broken. We keep it around in case
%% someone wants to execute the QuickCheck property
%% machi_chain_manager1_test:prop_compare_legacy_with_v2_chain_transition_check().
%% In all counterexamples, this legacy code returns 'true' (i.e., the
%% state transition is OK) where the v2 new code correcly returns
%% 'false' (i.e. the state transition is BAD). Fun, good times.
%% Hooray about more systematic/mathematical reasoning, code
%% structure, and property-based testing.
projection_transition_is_sane(
#projection_v1{epoch_number=Epoch1,
epoch_csum=CSum1,
creation_time=CreationTime1,
author_server=AuthorServer1,
all_members=All_list1,
down=Down_list1,
upi=UPI_list1,
repairing=Repairing_list1,
dbg=Dbg1} = P1,
#projection_v1{epoch_number=Epoch2,
epoch_csum=CSum2,
creation_time=CreationTime2,
author_server=AuthorServer2,
all_members=All_list2,
down=Down_list2,
upi=UPI_list2,
repairing=Repairing_list2,
dbg=Dbg2} = P2,
RelativeToServer, RetrospectiveP) ->
try
put(why1, []),
%% General notes:
%%
%% I'm making no attempt to be "efficient" here. All of these data
%% structures are small, and they're not called zillions of times per
%% second.
%%
%% The chain sequence/order checks at the bottom of this function aren't
%% as easy-to-read as they ought to be. However, I'm moderately confident
%% that it isn't buggy. TODO: refactor them for clarity.
true = is_integer(Epoch1) andalso is_integer(Epoch2),
true = is_binary(CSum1) andalso is_binary(CSum2),
{_,_,_} = CreationTime1,
{_,_,_} = CreationTime2,
true = is_atom(AuthorServer1) andalso is_atom(AuthorServer2), % todo type may change?
true = is_list(All_list1) andalso is_list(All_list2),
true = is_list(Down_list1) andalso is_list(Down_list2),
true = is_list(UPI_list1) andalso is_list(UPI_list2),
true = is_list(Repairing_list1) andalso is_list(Repairing_list2),
true = is_list(Dbg1) andalso is_list(Dbg2),
true = Epoch2 > Epoch1,
All_list1 = All_list2, % todo will probably change
%% No duplicates
true = lists:sort(Down_list2) == lists:usort(Down_list2),
true = lists:sort(UPI_list2) == lists:usort(UPI_list2),
true = lists:sort(Repairing_list2) == lists:usort(Repairing_list2),
%% Disjoint-ness
true = lists:sort(All_list2) == lists:sort(Down_list2 ++ UPI_list2 ++
Repairing_list2),
[] = [X || X <- Down_list2, not lists:member(X, All_list2)],
[] = [X || X <- UPI_list2, not lists:member(X, All_list2)],
[] = [X || X <- Repairing_list2, not lists:member(X, All_list2)],
DownS2 = sets:from_list(Down_list2),
UPIS2 = sets:from_list(UPI_list2),
RepairingS2 = sets:from_list(Repairing_list2),
true = sets:is_disjoint(DownS2, UPIS2),
true = sets:is_disjoint(DownS2, RepairingS2),
true = sets:is_disjoint(UPIS2, RepairingS2),
%% Additions to the UPI chain may only be at the tail
UPI_common_prefix = find_common_prefix(UPI_list1, UPI_list2),
true =
if UPI_common_prefix == [] ->
if UPI_list1 == [] orelse UPI_list2 == [] ->
%% If the common prefix is empty, then one of the
%% inputs must be empty.
?RETURN1(true);
true ->
%% Otherwise, we have a case of UPI changing from
%% one of these two situations:
%%
%% UPI_list1 -> UPI_list2
%% -------------------------------------------------
%% [d,c,b,a] -> [c,a]
%% [d,c,b,a] -> [c,a,repair_finished_added_to_tail].
NotUPI2 = (Down_list2 ++ Repairing_list2),
case lists:prefix(UPI_list1 -- NotUPI2, UPI_list2) of
true ->
?RETURN1(true);
false ->
%% Here's a possible failure scenario:
%% UPI_list1 -> UPI_list2
%% Repairing_list1 -> Repairing_list2
%% -----------------------------------
%% [a,b,c] author=a -> [c,a] author=c
%% [] [b]
%%
%% ... where RelativeToServer=b. In this case, b
%% has been partitioned for a while and has only
%% now just learned of several epoch transitions.
%% If the author of both is also in the UPI of
%% both, then those authors would not have allowed
%% a bad transition, so we will assume this
%% transition is OK.
?RETURN1(
lists:member(AuthorServer1, UPI_list1)
andalso
lists:member(AuthorServer2, UPI_list2)
)
end
end;
true ->
?RETURN1(true)
end,
true = lists:prefix(UPI_common_prefix, UPI_list1),
true = lists:prefix(UPI_common_prefix, UPI_list2),
UPI_1_suffix = UPI_list1 -- UPI_common_prefix,
UPI_2_suffix = UPI_list2 -- UPI_common_prefix,
_ = ?RETURN1(yo),
MoreCheckingP =
RelativeToServer == undefined
orelse
not (lists:member(RelativeToServer, Down_list2) orelse
lists:member(RelativeToServer, Repairing_list2)),
_ = ?RETURN1(yo),
UPIs_are_disjointP = ordsets:is_disjoint(ordsets:from_list(UPI_list1),
ordsets:from_list(UPI_list2)),
case UPI_2_suffix -- UPI_list1 of
[] ->
?RETURN1(true);
[_|_] = _Added_by_2 ->
if RetrospectiveP ->
%% Any servers added to the UPI must be added from the
%% repairing list ... but in retrospective mode (where
%% we're checking only the transitions where all
%% UPI+repairing participants have unanimous private
%% projections!), and if we're under asymmetric
%% partition/churn, then we may not see the repairing
%% list. So we will not check that condition here.
?RETURN1(true);
not RetrospectiveP ->
%% We're not retrospective. So, if some server was
%% added by to the UPI, then that means that it was
%% added by repair. And repair is coordinated by the
%% UPI tail/last.
%io:format(user, "g: UPI_list1=~w, UPI_list2=~w, UPI_2_suffix=~w, ",
% [UPI_list1, UPI_list2, UPI_2_suffix]),
%io:format(user, "g", []),
?RETURN1(true = UPI_list1 == [] orelse
UPIs_are_disjointP orelse
(lists:last(UPI_list1) == AuthorServer2) )
end
end,
if not MoreCheckingP ->
?RETURN1(ok);
MoreCheckingP ->
%% Where did elements in UPI_2_suffix come from?
%% Only two sources are permitted.
Oops_check_UPI_2_suffix =
[lists:member(X, Repairing_list1) % X added after repair done
orelse
lists:member(X, UPI_list1) % X in UPI_list1 after common pref
|| X <- UPI_2_suffix],
%% Grrrrr, ok, so this check isn't good, at least at bootstrap time.
%% TODO: false = lists:member(false, Oops_check_UPI_2_suffix),
%% The UPI_2_suffix must exactly be equal to: ordered items from
%% UPI_list1 concat'ed with ordered items from Repairing_list1.
%% Both temp vars below preserve relative order!
UPI_2_suffix_from_UPI1 = [X || X <- UPI_1_suffix,
lists:member(X, UPI_list2)],
UPI_2_suffix_from_Repairing1 = [X || X <- UPI_2_suffix,
lists:member(X, Repairing_list1)],
%% true?
UPI_2_concat = (UPI_2_suffix_from_UPI1 ++ UPI_2_suffix_from_Repairing1),
if UPI_2_suffix == UPI_2_concat ->
?RETURN1(ok);
true ->
%% 'make dialyzer' will believe that this can never succeed.
%% 'make dialyzer-test' will not complain, however.
if RetrospectiveP ->
%% We are in retrospective mode. But there are
%% some transitions that are difficult to find
%% when standing outside of all of the FLUs and
%% examining their behavior. (In contrast to
%% this same function being called "in the path"
%% of a projection transition by a particular FLU
%% which knows exactly its prior projection and
%% exactly what it intends to do.) Perhaps this
%% exception clause here can go away with
%% better/more clever retrospection analysis?
%%
%% Here's a case that PULSE found:
%% FLU B:
%% E=257: UPI=[c,a], REPAIRING=[b]
%% E=284: UPI=[c,a], REPAIRING=[b]
%% FLU a:
%% E=251: UPI=[c], REPAIRING=[a,b]
%% E=284: UPI=[c,a], REPAIRING=[b]
%% FLU c:
%% E=282: UPI=[c], REPAIRING=[a,b]
%% E=284: UPI=[c,a], REPAIRING=[b]
%%
%% From the perspective of each individual FLU,
%% the unanimous transition at epoch #284 is
%% good. The repair that is done by FLU c -> a
%% is likewise good.
%%
%% From a retrospective point of view (and the
%% current implementation), there's a bad-looking
%% transition from epoch #269 to #284. This is
%% from the point of view of the last two
%% unanimous private projection store epochs:
%%
%% E=269: UPI=[c], REPAIRING=[], DOWN=[a,b]
%% E=284: UPI=[c,a], REPAIRING=[b]
%%
%% The retrospective view by
%% machi_chain_manager1_pulse.erl just can't
%% reason correctly about this situation. We
%% will instead rely on the non-retrospective
%% sanity checking that each FLU does before it
%% writes to its private projection store and
%% then adopts that projection (and unwedges
%% itself, etc etc).
if UPIs_are_disjointP ->
?RETURN1(true);
true ->
?RETURN1(todo),
exit({todo, revisit, ?MODULE, ?LINE,
[
{oops_check_UPI_2_suffix, Oops_check_UPI_2_suffix},
{upi_2_suffix, UPI_2_suffix},
{upi_2_concat, UPI_2_concat},
{retrospectivep, RetrospectiveP}
]}),
io:format(user, "|~p,~p TODO revisit|",
[?MODULE, ?LINE]),
ok
end;
true ->
%% The following is OK: We're shifting from a
%% normal projection to an inner one. The old
%% normal has a UPI that has nothing to do with
%% RelativeToServer a.k.a. me.
%% Or else the UPI_list1 is empty, and I'm
%% the only member of UPI_list2
%% But the new/suffix is definitely me.
%% from:
%% {epoch,847},{author,c},{upi,[c]},{repair,[]},
%% {down,[a,b,d]}
%% to:
%% {epoch,848},{author,a},{upi,[a]},{repair,[]},
%% {down,[b,c,d]}
FirstCase_p = (UPI_2_suffix == [AuthorServer2])
andalso
((inner_projection_exists(P1) == false
andalso
inner_projection_exists(P2) == true)
orelse UPI_list1 == []),
%% Here's another case that's alright:
%%
%% {a,{err,exit,
%% {upi_2_suffix_error,[c]}, ....
%%
%% from:
%% {epoch,937},{author,a},{upi,[a,b]},{repair,[]},
%% {down,[c]}
%% to:
%% {epoch,943},{author,a},{upi,{a,b,c},{repair,[]},
%% {down,[]}
%% The author server doesn't matter. However,
%% there were two other epochs in between, 939
%% and 941, where there wasn't universal agreement
%% of private projections. The repair controller
%% at the tail, 'b', had decided that the repair
%% of 'c' was finished @ epoch 941.
SecondCase_p = ((UPI_2_suffix -- Repairing_list1)
== []),
if FirstCase_p ->
?RETURN1(true);
SecondCase_p ->
?RETURN1(true);
UPIs_are_disjointP ->
%% If there's no overlap at all between
%% UPI_list1 & UPI_list2, then we're OK
%% here.
?RETURN1(true);
true ->
exit({upi_2_suffix_error, UPI_2_suffix})
end
end
end
end,
?RETURN1(true)
catch
_Type:_Err ->
?RETURN1(oops),
S1 = machi_projection:make_summary(P1),
S2 = machi_projection:make_summary(P2),
Trace = erlang:get_stacktrace(),
{err, _Type, _Err, from, S1, to, S2, relative_to, RelativeToServer,
history, (catch lists:sort([no_history])),
stack, Trace}
end.
find_common_prefix([], _) ->
[];
find_common_prefix(_, []) ->
[];
find_common_prefix([H|L1], [H|L2]) ->
[H|find_common_prefix(L1, L2)];
find_common_prefix(_, _) ->
[].
inner_projection_exists(_) ->
false.

View file

@ -33,49 +33,59 @@
-define(FLU_C, machi_flu1_client).
verify_file_checksums_test_() ->
{timeout, 60, fun() -> verify_file_checksums_test2() end}.
{setup,
fun() -> os:cmd("rm -rf ./data") end,
fun(_) -> os:cmd("rm -rf ./data") end,
{timeout, 60, fun() -> verify_file_checksums_test2() end}
}.
verify_file_checksums_test2() ->
Host = "localhost",
TcpPort = 32958,
DataDir = "./data",
W_props = [{initial_wedged, false}],
FLU1 = machi_flu1_test:setup_test_flu(verify1_flu, TcpPort, DataDir,
W_props),
Sock1 = ?FLU_C:connect(#p_srvr{address=Host, port=TcpPort}),
NSInfo = undefined,
NoCSum = <<>>,
try
Prefix = <<"verify_prefix">>,
NumChunks = 10,
[{ok, _} = ?FLU_C:append_chunk(Sock1, ?DUMMY_PV1_EPOCH,
Prefix, <<X:(X*8)/big>>) ||
X <- lists:seq(1, NumChunks)],
{ok, [{_FileSize,File}]} = ?FLU_C:list_files(Sock1, ?DUMMY_PV1_EPOCH),
{ok, []} = machi_admin_util:verify_file_checksums_remote(
Host, TcpPort, ?DUMMY_PV1_EPOCH, File),
machi_test_util:start_flu_package(verify1_flu, TcpPort, DataDir,
W_props),
Sock1 = ?FLU_C:connect(#p_srvr{address=Host, port=TcpPort}),
try
Prefix = <<"verify_prefix">>,
NumChunks = 10,
[{ok, _} = ?FLU_C:append_chunk(Sock1, NSInfo, ?DUMMY_PV1_EPOCH,
Prefix, <<X:(X*8)/big>>, NoCSum) ||
X <- lists:seq(1, NumChunks)],
{ok, [{_FileSize,File}]} = ?FLU_C:list_files(Sock1, ?DUMMY_PV1_EPOCH),
?assertEqual({ok, []},
machi_admin_util:verify_file_checksums_remote(
Host, TcpPort, ?DUMMY_PV1_EPOCH, File)),
%% Clobber the first 3 chunks, which are sizes 1/2/3.
{_, Path} = machi_util:make_data_filename(DataDir,binary_to_list(File)),
{ok, FH} = file:open(Path, [read,write]),
{ok, _} = file:position(FH, ?MINIMUM_OFFSET),
ok = file:write(FH, "y"),
ok = file:write(FH, "yo"),
ok = file:write(FH, "yo!"),
ok = file:close(FH),
%% Clobber the first 3 chunks, which are sizes 1/2/3.
{_, Path} = machi_util:make_data_filename(DataDir,binary_to_list(File)),
{ok, FH} = file:open(Path, [read,write]),
{ok, _} = file:position(FH, ?MINIMUM_OFFSET),
ok = file:write(FH, "y"),
ok = file:write(FH, "yo"),
ok = file:write(FH, "yo!"),
ok = file:close(FH),
%% Check the local flavor of the API: should be 3 bad checksums
{ok, Res1} = machi_admin_util:verify_file_checksums_local(
Host, TcpPort, ?DUMMY_PV1_EPOCH, Path),
3 = length(Res1),
%% Check the local flavor of the API: should be 3 bad checksums
{ok, Res1} = machi_admin_util:verify_file_checksums_local(
Host, TcpPort, ?DUMMY_PV1_EPOCH, Path),
3 = length(Res1),
%% Check the remote flavor of the API: should be 3 bad checksums
{ok, Res2} = machi_admin_util:verify_file_checksums_remote(
Host, TcpPort, ?DUMMY_PV1_EPOCH, File),
3 = length(Res2),
%% Check the remote flavor of the API: should be 3 bad checksums
{ok, Res2} = machi_admin_util:verify_file_checksums_remote(
Host, TcpPort, ?DUMMY_PV1_EPOCH, File),
3 = length(Res2),
ok
ok
after
catch ?FLU_C:quit(Sock1)
end
after
catch ?FLU_C:quit(Sock1),
ok = ?FLU:stop(FLU1)
catch machi_test_util:stop_flu_package()
end.
-endif. % !PULSE

View file

@ -0,0 +1,616 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%% EQC single-threaded and concurrent test for file operations and repair
%% under simulated network partition.
%% The main purpose is to confirm no dataloss, i.e. every chunk that
%% has been successfully written (ACK received) by append/write
%% opration will be read after partition heals.
%%
%% All updating -- append, write and trim -- operations are executed
%% through CR client, not directly by flu1 client, in order to be
%% end-to-end test (in single chain point of veiw.) There may be churn
%% for projections by simulated network partition.
%%
%% Test steps
%% 1. Setup single chain.
%% 2. Execute updating operations and simulated partition (by eqc_statem).
%% Every updating results are recorded in ETS tables.
%% 3. When {error, timeout|partition} happens, trigger management tick for
%% every chain manager process.
%% 4. After commands are executed, remove patition and wait for the chain
%% without down nodes nor repairing nodes.
%% 5. Asserting written results so that each record be read from the
%% chain and data be the same with written one.
%% Improvements to-do's
%% - Use higher concurrency, e.g. 10+
%% - Random length for binary to write
%% - Operations other than append, write, trim
%% - Use checksum instead of binary to save memory
%% - More variety for partitioning pattern: non-constant failure
%% - Stop and restart
%% - Suspend and resume of some erlang processes
-module(machi_ap_repair_eqc).
-ifdef(TEST).
-ifdef(EQC).
-compile(export_all).
-include("machi.hrl").
-include("machi_projection.hrl").
-include("machi_verbose.hrl").
-include_lib("eqc/include/eqc.hrl").
-include_lib("eqc/include/eqc_statem.hrl").
-include_lib("eunit/include/eunit.hrl").
-record(target, {verbose=false,
flu_names,
mgr_names}).
-record(state, {num,
verbose=false,
flu_names,
mgr_names,
cr_count}).
%% ETS table names
-define(WRITTEN_TAB, written). % Successfully written data
-define(ACCPT_TAB, accpt). % Errors with no harm, e.g. timeout
-define(FAILED_TAB, failed). % Uncategorized errors, when happenes
% it should be re-categorized to accept or critical
-define(CRITICAL_TAB, critical). % Critical errors, e.g. double write to the same key
-define(QC_OUT(P),
eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)).
%% EUNIT TEST DEFINITION
prop_repair_test_() ->
{PropTO, EUnitTO} = eqc_timeout(60),
Verbose = eqc_verbose(),
{spawn,
[{timeout, EUnitTO,
?_assertEqual(
true,
eqc:quickcheck(eqc:testing_time(
PropTO, ?QC_OUT(noshrink(prop_repair(Verbose))))))}]}.
prop_repair_par_test_() ->
{PropTO, EUnitTO} = eqc_timeout(60),
Verbose = eqc_verbose(),
{spawn,
[{timeout, EUnitTO,
?_assertEqual(
true,
eqc:quickcheck(eqc:testing_time(
PropTO, ?QC_OUT(noshrink(prop_repair_par(Verbose))))))}]}.
%% Model
weight(_S, change_partition) -> 20;
weight(_S, _) -> 100.
%% Append
append_args(#state{cr_count=CRCount}=S) ->
[choose(1, CRCount), chunk(), S].
append(CRIndex, Bin, #state{verbose=V}=S) ->
CRList = cr_list(),
{_SimSelfName, C} = lists:nth(CRIndex, CRList),
Prefix = <<"pre">>,
Len = byte_size(Bin),
NSInfo = #ns_info{},
NoCSum = <<>>,
Opts1 = #append_opts{},
Res = (catch machi_cr_client:append_chunk(C, NSInfo, Prefix, Bin, NoCSum, Opts1, sec(1))),
case Res of
{ok, {_Off, Len, _FileName}=Key} ->
case ets:insert_new(?WRITTEN_TAB, {Key, Bin}) of
true ->
[?V("<o>", []) || V],
ok;
false ->
%% The Key is alread written, WHY!!!????
case ets:lookup(?WRITTEN_TAB, Key) of
[{Key, Bin}] ->
%% TODO: The identical binary is alread inserted in
%% written table. Is this acceptable??? Hmm, maybe NO...
[?V("<dws:~w>", [Key]) || V],
true = ets:insert_new(?ACCPT_TAB,
{make_ref(), double_write_same, Key}),
{acceptable_error, doublewrite_the_same};
[{Key, OtherBin}] ->
[?V("<dwd:~w:~w>", [Key, {OtherBin, Bin}]) || V],
true = ets:insert_new(?CRITICAL_TAB,
{make_ref(), double_write_diff, Key}),
R = {critical_error,
{doublewrite_diff, Key, {OtherBin, Bin}}},
%% TODO: when double write happens, it seems that
%% repair process got stack with endless loop. To
%% avoit it, return error here.
%% If this error/1 will be removed, one can possibly
%% know double write frequency/rate.
error(R)
end
end;
{error, partition} ->
[?V("<pt>", []) || V],
true = ets:insert_new(?ACCPT_TAB, {make_ref(), timeout}),
_ = tick(S),
{acceptable_error, partition};
{'EXIT', {timeout, _}} ->
[?V("<to:~w:~w>", [_SimSelfName, C]) || V],
true = ets:insert_new(?ACCPT_TAB, {make_ref(), timeout}),
_ = tick(S),
{acceptable_error, timeout};
{ok, {_Off, UnexpectedLen, _FileName}=Key} ->
[?V("<XX>", []) || V],
true = ets:insert_new(?CRITICAL_TAB, {make_ref(), unexpected_len, Key}),
{critical_error, {unexpected_len, Key, Len, UnexpectedLen}};
{error, _Reason} = Error ->
[?V("<er>", []) || V],
true = ets:insert_new(?FAILED_TAB, {make_ref(), Error}),
{other_error, Error};
Other ->
[?V("<er>", []) || V],
true = ets:insert_new(?FAILED_TAB, {make_ref(), Other}),
{other_error, Other}
end.
%% Change partition
change_partition_args(#state{flu_names=FLUNames}=S) ->
%% [partition(FLUNames), S].
[partition_sym(FLUNames), S].
change_partition(Partition,
#state{verbose=Verbose, flu_names=FLUNames}=S) ->
[case Partition of
[] -> ?V("## Turn OFF partition: ~w~n", [Partition]);
_ -> ?V("## Turn ON partition: ~w~n", [Partition])
end || Verbose],
machi_partition_simulator:always_these_partitions(Partition),
_ = machi_partition_simulator:get(FLUNames),
%% Don't wait for stable chain, tick will be executed on demand
%% in append oprations
_ = tick(S),
ok.
%% Generators
num() ->
choose(2, 5).
cr_count(Num) ->
Num * 3.
%% Returns a list like
%% `[{#p_srvr{name=a, port=7501, ..}, "./eqc/data.eqc.a/"}, ...]'
all_list_extra(Num) ->
{PortBase, DirBase} = get_port_dir_base(),
[begin
FLUNameStr = [$a + I - 1],
FLUName = list_to_atom(FLUNameStr),
MgrName = machi_flu_psup:make_mgr_supname(FLUName),
{#p_srvr{name=FLUName, address="localhost", port=PortBase+I,
props=[{chmgr, MgrName}]},
DirBase ++ "/data.eqc." ++ FLUNameStr}
end || I <- lists:seq(1, Num)].
sublist(L) ->
?LET(K, nat(),
?LET(L2, eqc_gen:vector(K, eqc_gen:oneof(L)),
lists:usort(L2))).
%% Generator for possibly assymmetric partition information
partition(FLUNames) ->
frequency([{10, return([])},
{20, non_empty(sublist(flu_ordered_pairs(FLUNames)))}]).
%% Generator for symmetric partition information
partition_sym(FLUNames) ->
?LET(Pairs, non_empty(sublist(flu_pairs(FLUNames))),
lists:flatmap(fun({One, Another}) -> [{One, Another}, {Another, One}] end,
Pairs)).
flu_ordered_pairs(FLUNames) ->
[{From, To} || From <- FLUNames, To <- FLUNames, From =/= To].
flu_pairs(FLUNames) ->
[{One, Another} || One <- FLUNames, Another <- FLUNames, One > Another].
chunk() ->
non_empty(binary(10)).
%% Properties
prop_repair(Verbose) ->
error_logger:tty(false),
application:load(sasl),
application:set_env(sasl, sasl_error_logger, false),
Seed = {1445,935441,287549},
?FORALL(Num, num(),
?FORALL(Cmds, commands(?MODULE, initial_state(Num, Verbose)),
begin
Target = setup_target(Num, Seed, Verbose),
{H, S1, Res0} = run_commands(?MODULE, Cmds),
%% ?V("S1=~w~n", [S1]),
?V("==== Start post operations, stabilize and confirm results~n", []),
_ = stabilize(commands_len(Cmds), Target),
{Dataloss, Critical} = confirm_result(Target),
_ = cleanup(Target),
pretty_commands(
?MODULE, Cmds, {H, S1, Res0},
aggregate(with_title(cmds), command_names(Cmds),
collect(with_title(length5), (length(Cmds) div 5) * 5,
{Dataloss, Critical} =:= {0, 0})))
end)).
prop_repair_par(Verbose) ->
error_logger:tty(false),
application:load(sasl),
application:set_env(sasl, sasl_error_logger, false),
Seed = {1445,935441,287549},
?FORALL(Num, num(),
?FORALL(Cmds,
%% Now try-and-err'ing, how to control command length and concurrency?
?SUCHTHAT(Cmds0, ?SIZED(Size, resize(Size,
parallel_commands(?MODULE, initial_state(Num, Verbose)))),
commands_len(Cmds0) > 20
andalso
concurrency(Cmds0) > 2),
begin
CmdsLen= commands_len(Cmds),
Target = setup_target(Num, Seed, Verbose),
{Seq, Par, Res0} = run_parallel_commands(?MODULE, Cmds),
%% ?V("Seq=~w~n", [Seq]),
%% ?V("Par=~w~n", [Par]),
?V("==== Start post operations, stabilize and confirm results~n", []),
{FinalRes, {Dataloss, Critical}} =
case Res0 of
ok ->
Res1 = stabilize(CmdsLen, Target),
{Res1, confirm_result(Target)};
_ ->
?V("Res0=~w~n", [Res0]),
{Res0, {undefined, undefined}}
end,
_ = cleanup(Target),
%% Process is leaking? This log line can be removed after fix.
[?V("process_count=~w~n", [erlang:system_info(process_count)]) || Verbose],
pretty_commands(
?MODULE, Cmds, {Seq, Par, Res0},
aggregate(with_title(cmds), command_names(Cmds),
collect(with_title(length5), (CmdsLen div 5) * 5,
collect(with_title(conc), concurrency(Cmds),
{FinalRes, {Dataloss, Critical}} =:= {ok, {0, 0}})))
)
end)).
%% Initilization / setup
%% Fake initialization function for debugging in shell like:
%% > eqc_gen:sample(eqc_statem:commands(machi_ap_repair_eqc)).
%% but not so helpful.
initial_state() ->
#state{cr_count=3}.
initial_state(Num, Verbose) ->
AllListE = all_list_extra(Num),
FLUNames = [P#p_srvr.name || {P, _Dir} <- AllListE],
MgrNames = [{Name, machi_flu_psup:make_mgr_supname(Name)} || Name <- FLUNames],
#state{num=Num, verbose=Verbose,
flu_names=FLUNames, mgr_names=MgrNames,
cr_count=cr_count(Num)}.
setup_target(Num, Seed, Verbose) ->
%% ?V("setup_target(Num=~w, Seed=~w~nn", [Num, Seed]),
AllListE = all_list_extra(Num),
FLUNames = [P#p_srvr.name || {P, _Dir} <- AllListE],
MgrNames = [{Name, machi_flu_psup:make_mgr_supname(Name)} || Name <- FLUNames],
Dict = orddict:from_list([{P#p_srvr.name, P} || {P, _Dir} <- AllListE]),
setup_chain(Seed, AllListE, FLUNames, MgrNames, Dict),
_ = setup_cpool(AllListE, FLUNames, Dict),
Target = #target{flu_names=FLUNames, mgr_names=MgrNames,
verbose=Verbose},
%% Don't wait for complete chain. Even partialy completed, the chain
%% should work fine. Right?
wait_until_stable(chain_state_all_ok(FLUNames), FLUNames, MgrNames,
20, Verbose),
Target.
setup_chain(Seed, AllListE, FLUNames, MgrNames, Dict) ->
ok = shutdown_hard(),
[begin
machi_test_util:clean_up_dir(Dir),
filelib:ensure_dir(Dir ++ "/not-used")
end || {_P, Dir} <- AllListE],
[catch ets:delete(T) || T <- tabs()],
[ets:new(T, [set, public, named_table,
{write_concurrency, true}, {read_concurrency, true}]) ||
T <- tabs()],
{ok, _} = application:ensure_all_started(machi),
SimSpec = {part_sim,
{machi_partition_simulator, start_link, [{0,0,0}, 0, 100]},
permanent, 500, worker, []},
{ok, _PSimPid} = supervisor:start_child(machi_sup, SimSpec),
ok = machi_partition_simulator:set_seed(Seed),
_Partitions = machi_partition_simulator:get(FLUNames),
%% Start FLUs and setup the chain
FLUOpts = [{use_partition_simulator, true},
%% {private_write_verbose, true},
{active_mode, false},
{simulate_repair, false}],
[{ok, _} = machi_flu_psup:start_flu_package(Name, Port, Dir, FLUOpts) ||
{#p_srvr{name=Name, port=Port}, Dir} <- AllListE],
[machi_chain_manager1:set_chain_members(MgrName, Dict) || {_, MgrName} <- MgrNames],
ok.
setup_cpool(AllListE, FLUNames, Dict) ->
Num = length(AllListE),
FCList = [begin
{ok, PCPid} = machi_proxy_flu1_client:start_link(P),
{Name, PCPid}
end || {_, #p_srvr{name=Name}=P} <- Dict],
%% CR clients are pooled, each has "name" which is interpreted "From"
%% side of simulated partition.
SimSelfNames = lists:append(lists:duplicate(cr_count(Num), FLUNames)),
CRList = [begin
{ok, C} = machi_cr_client:start_link(
[P || {_, P} <- Dict],
[{use_partition_simulator, true},
{simulator_self_name, SimSelfName},
{simulator_members, FLUNames}]),
{SimSelfName, C}
end || SimSelfName <- SimSelfNames],
catch ets:delete(cpool),
ets:new(cpool, [set, protected, named_table, {read_concurrency, true}]),
ets:insert(cpool, {fc_list, FCList}),
ets:insert(cpool, {cr_list, CRList}),
{CRList, FCList}.
fc_list() ->
[{fc_list, FCList}] = ets:lookup(cpool, fc_list),
FCList.
cr_list() ->
[{cr_list, CRList}] = ets:lookup(cpool, cr_list),
CRList.
%% Post run_commands
stabilize(0, _T) ->
ok;
stabilize(_CmdsLen, #target{flu_names=FLUNames, mgr_names=MgrNames,
verbose=Verbose}) ->
machi_partition_simulator:no_partitions(),
true = wait_until_stable(chain_state_all_ok(FLUNames), FLUNames, MgrNames,
100, Verbose),
ok.
chain_state_all_ok(FLUNames) ->
[{FLUName, {FLUNames, [], []}} || FLUName <- FLUNames].
confirm_result(_T) ->
[{_, C} | _] = cr_list(),
[{written, _Written}, {accpt, Accpt},
{failed, Failed}, {critical, Critical}] = tab_counts(),
{OK, Dataloss} = confirm_written(C),
?V(" Written=~w, DATALOSS=~w, Acceptable=~w~n", [OK, Dataloss, Accpt]),
?V(" Failed=~w, Critical=~w~n~n", [Failed, Critical]),
DirBase = get_dir_base(),
Suffix = dump_file_suffix(),
case Failed of
0 -> ok;
_ ->
DumpFailed = filename:join(DirBase, "dump-failed-" ++ Suffix),
?V("Dump failed ETS tab to: ~s~n", [DumpFailed]),
ets:tab2file(?FAILED_TAB, DumpFailed)
end,
case Critical of
0 -> ok;
_ ->
DumpCritical = filename:join(DirBase, "dump-critical-" ++ Suffix),
?V("Dump critical ETS tab to: ~w~n", [DumpCritical]),
ets:tab2file(?CRITICAL_TAB, DumpCritical)
end,
{Dataloss, Critical}.
confirm_written(C) ->
ets:foldl(
fun({Key, Bin}, {OK, NG}) ->
case assert_chunk(C, Key, Bin) of
ok -> {OK+1, NG};
{error, _} -> {OK, NG+1}
end
end, {0, 0}, ?WRITTEN_TAB).
assert_chunk(C, {Off, Len, FileName}=Key, Bin) ->
%% TODO: This probably a bug, read_chunk respnds with filename of `string()' type
%% TODO : Use CSum instead of binary (after disuccsion about CSum is calmed down?)
NSInfo = undefined,
case (catch machi_cr_client:read_chunk(C, NSInfo, FileName, Off, Len, undefined, sec(3))) of
{ok, {[{FileName, Off, Bin, _}], []}} ->
ok;
{ok, Got} ->
?V("read_chunk got different binary for Key=~p~n", [Key]),
?V(" Expected: ~p~n", [{[{FileName, Off, Bin, <<"CSum-NYI">>}], []}]),
?V(" Got: ~p~n", [Got]),
{error, different_binary};
{error, Reason} ->
?V("read_chunk error for Key=~p: ~p~n", [Key, Reason]),
{error, Reason};
Other ->
?V("read_chunk other error for Key=~p: ~p~n", [Key, Other]),
{error, Other}
end.
cleanup(_Target) ->
[begin unlink(FC), catch exit(FC, kill) end || {_, FC} <- fc_list()],
[begin unlink(CR), catch exit(CR, kill) end || {_, CR} <- cr_list()],
_ = shutdown_hard().
%% Internal misc utilities
eqc_verbose() ->
os:getenv("EQC_VERBOSE") =:= "true".
eqc_timeout(Default) ->
PropTimeout = case os:getenv("EQC_TIME") of
false -> Default;
V -> list_to_integer(V)
end,
{PropTimeout, PropTimeout * 300}.
get_port_dir_base() ->
I = case os:getenv("EQC_BASE_PORT") of
false -> 0;
V -> list_to_integer(V)
end,
D = get_dir_base(),
{7400 + (I * 100), D ++ "/" ++ integer_to_list(I)}.
get_dir_base() ->
case os:getenv("EQC_BASE_DIR") of
false -> "./eqc";
DD -> DD
end.
shutdown_hard() ->
_STOP = application:stop(machi),
timer:sleep(100).
tick(#state{flu_names=FLUNames, mgr_names=MgrNames,
verbose=Verbose}) ->
tick(FLUNames, MgrNames, Verbose).
tick(FLUNames, MgrNames, Verbose) ->
tick(FLUNames, MgrNames, 2, 100, Verbose).
tick(FLUNames, MgrNames, Iter, SleepMax, Verbose) ->
TickFun = tick_fun(FLUNames, MgrNames, self()),
TickFun(Iter, 0, SleepMax),
FCList = fc_list(),
[?V("## Chain state after tick()=~w~n", [chain_state(FCList)]) || Verbose].
tick_fun(FLUNames, MgrNames, Parent) ->
fun(Iters, SleepMin, SleepMax) ->
%% ?V("^", []),
Trigger =
fun(FLUName, MgrName) ->
random:seed(now()),
[begin
erlang:yield(),
SleepMaxRand = random:uniform(SleepMax + 1),
%% io:format(user, "{t}", []),
Elapsed = machi_chain_manager1:sleep_ranked_order(
SleepMin, SleepMaxRand,
FLUName, FLUNames),
MgrName ! tick_check_environment,
%% Be more unfair by not sleeping here.
timer:sleep(max(SleepMax - Elapsed, 1)),
ok
end || _ <- lists:seq(1, Iters)],
Parent ! {done, self()}
end,
Pids = [{spawn(fun() -> Trigger(FLUName, MgrName) end), FLUName} ||
{FLUName, MgrName} <- MgrNames ],
[receive
{done, ThePid} ->
ok
after 120*1000 ->
exit({icky_timeout, M_name})
end || {ThePid, M_name} <- Pids]
end.
wait_until_stable(ExpectedChainState, FLUNames, MgrNames, Verbose) ->
wait_until_stable(ExpectedChainState, FLUNames, MgrNames, 20, Verbose).
wait_until_stable(ExpectedChainState, FLUNames, MgrNames, Retries, Verbose) ->
TickFun = tick_fun(FLUNames, MgrNames, self()),
FCList = fc_list(),
wait_until_stable1(ExpectedChainState, TickFun, FCList, Retries, Verbose).
wait_until_stable1(ExpectedChainState, _TickFun, FCList, 0, _Verbose) ->
?V(" [ERROR] _ExpectedChainState ~p\n", [ExpectedChainState]),
?V(" [ERROR] wait_until_stable failed.... : ~p~n", [chain_state(FCList)]),
?V(" [ERROR] norm.... : ~p~n", [normalize_chain_state(chain_state(FCList))]),
false;
wait_until_stable1(ExpectedChainState, TickFun, FCList, Reties, Verbose) ->
[TickFun(3, 0, 100) || _ <- lists:seq(1, 3)],
Normalized = normalize_chain_state(chain_state(FCList)),
case Normalized of
ExpectedChainState ->
[?V(" Got stable chain: ~w~n", [chain_state(FCList)]) || Verbose],
true;
_ ->
[?V(" NOT YET stable chain: ~w~n", [chain_state(FCList)]) || Verbose],
wait_until_stable1(ExpectedChainState, TickFun, FCList, Reties-1, Verbose)
end.
normalize_chain_state(ChainState) ->
lists:usort([{FLUName,
{lists:usort(UPI), lists:usort(Repairing), lists:usort(Down)}} ||
{FLUName, {_EpochNo, UPI, Repairing, Down}} <- ChainState]).
chain_state(FCList) ->
lists:usort(
[case (catch machi_proxy_flu1_client:read_latest_projection(C, private, sec(5))) of
{ok, #projection_v1{epoch_number=EpochNo, upi=UPI,
repairing=Repairing, down=Down}} ->
{FLUName, {EpochNo, UPI, Repairing, Down}};
Other ->
{FLUName, Other}
end || {FLUName, C} <- FCList]).
tabs() -> [?WRITTEN_TAB, ?ACCPT_TAB, ?FAILED_TAB, ?CRITICAL_TAB].
tab_counts() ->
[{T, ets:info(T, size)} || T <- tabs()].
sec(Sec) ->
timer:seconds(Sec).
commands_len({SeqCmds, ParCmdsList} = _Cmds) ->
lists:sum([length(SeqCmds) | [length(P) || P <- ParCmdsList]]);
commands_len(Cmds) ->
length(Cmds).
concurrency({_SeqCmds, ParCmdsList} = _Cmds) -> length(ParCmdsList);
concurrency(_) -> 1.
dump_file_suffix() ->
{{Year, Month, Day}, {Hour, Min, Sec}} = calendar:local_time(),
lists:flatten(
io_lib:format("~4.10.0B-~2.10.0B-~2.10.0BT~2.10.0B:~2.10.0B:~2.10.0B.000Z",
[Year, Month, Day, Hour, Min, Sec])).
-endif. % EQC
-endif. % TEST

File diff suppressed because it is too large Load diff

View file

@ -2,7 +2,7 @@
%%
%% Machi: a small village of replicated files
%%
%% Copyright (c) 2014 Basho Technologies, Inc. All Rights Reserved.
%% Copyright (c) 2014-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
@ -27,6 +27,7 @@
-compile(export_all).
-include("machi_projection.hrl").
-include("machi_verbose.hrl").
-include_lib("eqc/include/eqc.hrl").
-include_lib("eqc/include/eqc_statem.hrl").
@ -38,7 +39,9 @@
-compile({pulse_replace_module, [{application, pulse_application}]}).
%% The following functions contains side_effects but are run outside
%% PULSE, i.e. PULSE needs to leave them alone
-compile({pulse_skip,[{prop_pulse_test_,0}, {shutdown_hard,0}]}).
-compile({pulse_skip,[{prop_pulse_test_,0}, {prop_pulse_regression_test_,0},
{prop_pulse,1},
{shutdown_hard,0}]}).
-compile({pulse_no_side_effect,[{file,'_','_'}, {erlang, now, 0}]}).
%% Used for output within EUnit...
@ -70,11 +73,58 @@ gen_seed() ->
noshrink({choose(1, 10000), choose(1, 10000), choose(1, 10000)}).
gen_old_threshold() ->
noshrink(choose(1, 100)).
noshrink(frequency([
{10, {keep}},
{10, choose(1, 100)},
{10, oneof([{island1}])},
{10, oneof([{asymm1}, {asymm2}, {asymm3}])}
])).
gen_no_partition_threshold() ->
noshrink(choose(1, 100)).
gen_commands(new) ->
non_empty(commands(?MODULE));
gen_commands(regression) ->
%% These regression tests include only few, very limited command
%% sequences that have been helpful in turning up bugs in the past.
%% For this style test, QuickCheck is basically just choosing random
%% seeds + PULSE execution to see if one of the oldies-but-goodies can
%% find another execution/interleaving that still shows a problem.
Cmd_a = [{set,{var,1},
{call,machi_chain_manager1_pulse,setup,[3,{846,1222,4424}]}},
{set,{var,2},
{call,machi_chain_manager1_pulse,do_ticks,[6,{var,1},13,48]}}],
Cmd_b = [{set,{var,1},
{call,machi_chain_manager1_pulse,setup,[4,{354,7401,1237}]}},
{set,{var,2},
{call,machi_chain_manager1_pulse,do_ticks,[10,{var,1},15,77]}},
{set,{var,3},
{call,machi_chain_manager1_pulse,do_ticks,[7,{var,1},92,39]}}],
Cmd_c = [{set,{var,1},
{call,machi_chain_manager1_pulse,setup,[2,{5202,467,3157}]}},
{set,{var,2},
{call,machi_chain_manager1_pulse,do_ticks,[8,{var,1},98,3]}},
{set,{var,3},
{call,machi_chain_manager1_pulse,do_ticks,[5,{var,1},56,49]}},
{set,{var,4},
{call,machi_chain_manager1_pulse,do_ticks,[10,{var,1},33,72]}},
{set,{var,5},
{call,machi_chain_manager1_pulse,do_ticks,[10,{var,1},88,20]}},
{set,{var,6},
{call,machi_chain_manager1_pulse,do_ticks,[8,{var,1},67,10]}},
{set,{var,7},
{call,machi_chain_manager1_pulse,do_ticks,[5,{var,1},86,25]}},
{set,{var,8},
{call,machi_chain_manager1_pulse,do_ticks,[6,{var,1},74,88]}},
{set,{var,9},
{call,machi_chain_manager1_pulse,do_ticks,[8,{var,1},78,39]}}],
Cmd_d = [{set,{var,1},
{call,machi_chain_manager1_pulse,setup,[5,{436,5950,9085}]}},
{set,{var,2},
{call,machi_chain_manager1_pulse,do_ticks,[7,{var,1},19,80]}}],
noshrink(oneof([Cmd_a, Cmd_b, Cmd_c, Cmd_d])).
command(#state{step=0}) ->
{call, ?MODULE, setup, [gen_num_pids(), gen_seed()]};
command(S) ->
@ -82,7 +132,7 @@ command(S) ->
{ 1, {call, ?MODULE, change_partitions,
[gen_old_threshold(), gen_no_partition_threshold()]}},
{50, {call, ?MODULE, do_ticks,
[choose(5, 200), S#state.pids,
[choose(1, 100), S#state.pids,
gen_old_threshold(), gen_no_partition_threshold()]}}
]).
@ -103,38 +153,50 @@ postcondition(_S, {call, _, _Func, _Args}, _Res) ->
true.
all_list_extra() ->
{PortBase, DirBase} = get_port_dir_base(),
[ %% Genenerators assume that this list is at least 2 items
{#p_srvr{name=a, address="localhost", port=7400,
props=[{chmgr, a_chmgr}]}, "./data.pulse.a"}
, {#p_srvr{name=b, address="localhost", port=7401,
props=[{chmgr, b_chmgr}]}, "./data.pulse.b"}
, {#p_srvr{name=c, address="localhost", port=7402,
props=[{chmgr, c_chmgr}]}, "./data.pulse.c"}
, {#p_srvr{name=d, address="localhost", port=7403,
props=[{chmgr, d_chmgr}]}, "./data.pulse.d"}
, {#p_srvr{name=e, address="localhost", port=7404,
props=[{chmgr, e_chmgr}]}, "./data.pulse.e"}
{#p_srvr{name=a, address="localhost", port=PortBase+0,
props=[{chmgr, a_chmgr}]}, DirBase ++ "/data.pulse.a"}
, {#p_srvr{name=b, address="localhost", port=PortBase+1,
props=[{chmgr, b_chmgr}]}, DirBase ++ "/data.pulse.b"}
, {#p_srvr{name=c, address="localhost", port=PortBase+2,
props=[{chmgr, c_chmgr}]}, DirBase ++ "//data.pulse.c"}
, {#p_srvr{name=d, address="localhost", port=PortBase+3,
props=[{chmgr, d_chmgr}]}, DirBase ++ "/data.pulse.d"}
, {#p_srvr{name=e, address="localhost", port=PortBase+4,
props=[{chmgr, e_chmgr}]}, DirBase ++ "/data.pulse.e"}
].
all_list() ->
[P#p_srvr.name || {P, _Dir} <- all_list_extra()].
setup(Num, Seed) ->
?QC_FMT("\nsetup(~w", [Num]),
error_logger:tty(false),
?V("\nsetup(~w,~w", [Num, Seed]),
[catch erlang:garbage_collect(P) || P <- processes()],
All_list = lists:sublist(all_list(), Num),
All_listE = lists:sublist(all_list_extra(), Num),
%% shutdown_hard() has taken care of killing all relevant procs.
[machi_flu1_test:clean_up_data_dir(Dir) || {_P, Dir} <- All_listE],
?QC_FMT(",z~w", [?LINE]),
[begin
machi_flu1_test:clean_up_data_dir(Dir),
filelib:ensure_dir(Dir ++ "/not-used")
end || {_P, Dir} <- All_listE],
?V(",z~w", [?LINE]),
%% Start partition simulator
{ok, PSimPid} = machi_partition_simulator:start_link(Seed, 0, 100),
%% GRRR, not PULSE: {ok, _} = application:ensure_all_started(machi),
[begin
_QQ = (catch application:start(App))
end || App <- [machi] ],
?V(",z~w", [?LINE]),
SimSpec = {part_sim, {machi_partition_simulator, start_link,
[{0,0,0}, 0, 100]},
permanent, 500, worker, []},
{ok, PSimPid} = supervisor:start_child(machi_sup, SimSpec),
ok = machi_partition_simulator:set_seed(Seed),
_Partitions = machi_partition_simulator:get(All_list),
?QC_FMT(",z~w", [?LINE]),
?V(",z~w", [?LINE]),
%% Start FLUs and their associated procs
{ok, SupPid} = machi_flu_sup:start_link(),
FluOpts = [{use_partition_simulator, true}, {active_mode, false}],
[begin
#p_srvr{name=Name, port=Port} = P,
@ -142,31 +204,49 @@ setup(Num, Seed) ->
end || {P, Dir} <- All_listE],
%% Set up the chain
Dict = orddict:from_list([{P#p_srvr.name, P} || {P, _Dir} <- All_listE]),
?QC_FMT(",z~w", [?LINE]),
[machi_chain_manager1:set_chain_members(get_chmgr(P), Dict) ||
{P, _Dir} <- All_listE],
?V(",z~w", [?LINE]),
%% Trigger some environment reactions for humming consensus: first
%% do all the same server first, then round-robin evenly across
%% servers.
[begin
_QQa = machi_chain_manager1:test_react_to_env(get_chmgr(P))
_QQa = machi_chain_manager1:trigger_react_to_env(get_chmgr(P))
end || {P, _Dir} <- All_listE, _I <- lists:seq(1,20), _Repeat <- [1,2]],
?QC_FMT(",z~w", [?LINE]),
[begin
_QQa = machi_chain_manager1:test_react_to_env(get_chmgr(P))
_QQa = machi_chain_manager1:trigger_react_to_env(get_chmgr(P))
end || _I <- lists:seq(1,20), {P, _Dir} <- All_listE, _Repeat <- [1,2]],
?QC_FMT(",z~w", [?LINE]),
?V(",z~w", [?LINE]),
ProxiesDict = ?FLU_PC:start_proxies(Dict),
Res = {PSimPid, SupPid, ProxiesDict, All_listE},
Res = {PSimPid, 'machi_flu_sup', ProxiesDict, All_listE},
put(manager_pids_hack, Res),
?QC_FMT("),", []),
?V("),", []),
Res.
change_partitions(OldThreshold, NoPartitionThreshold) ->
change_partitions(OldThreshold, NoPartitionThreshold)
when is_integer(OldThreshold) ->
machi_partition_simulator:reset_thresholds(OldThreshold,
NoPartitionThreshold).
NoPartitionThreshold);
change_partitions({keep}, _NoPartitionThreshold) ->
ok;
change_partitions({island1}, _NoPartitionThreshold) ->
AB = [a,b],
NotAB = all_list() -- AB,
Partitions = lists:usort([{X, Y} || X <- AB, Y <- NotAB] ++
[{X, Y} || X <- NotAB, Y <- AB]),
machi_partition_simulator:always_these_partitions(Partitions);
change_partitions({asymm1}, _NoPartitionThreshold) ->
Partitions = [{a,b}],
machi_partition_simulator:always_these_partitions(Partitions);
change_partitions({asymm2}, _NoPartitionThreshold) ->
Partitions = [{a,b},{a,c},{a,d},{a,e},{b,a},{b,c},{b,e},{c,a},{c,b},{c,d},{c,e},{d,a},{d,c},{d,e},{e,a},{e,b},{e,c},{e,d}],
machi_partition_simulator:always_these_partitions(Partitions);
change_partitions({asymm3}, _NoPartitionThreshold) ->
Partitions = [{a,b},{a,c},{a,d},{a,e},{b,a},{b,d},{b,e},{c,d},{d,a},{d,c},{d,e},{e,a},{e,b},{e,d}],
machi_partition_simulator:always_these_partitions(Partitions).
always_last_partitions() ->
machi_partition_simulator:always_last_partitions().
@ -175,29 +255,28 @@ private_stable_check() ->
{_PSimPid, _SupPid, ProxiesDict, All_listE} = get(manager_pids_hack),
Res = private_projections_are_stable_check(ProxiesDict, All_listE),
if not Res ->
io:format(user, "BUMMER: private stable check failed!\n", []);
?QC_FMT("BUMMER: private stable check failed!\n", []);
true ->
ok
end,
Res.
do_ticks(Num, PidsMaybe, OldThreshold, NoPartitionThreshold) ->
io:format(user, "~p,~p,~p|", [Num, OldThreshold, NoPartitionThreshold]),
?V("~p,~p,~p|", [Num, OldThreshold, NoPartitionThreshold]),
{_PSimPid, _SupPid, ProxiesDict, All_listE} =
case PidsMaybe of
undefined -> get(manager_pids_hack);
_ -> PidsMaybe
end,
if is_integer(OldThreshold) ->
machi_partition_simulator:reset_thresholds(OldThreshold,
NoPartitionThreshold);
if is_atom(OldThreshold) ->
?V("{e=~w},", [get_biggest_private_epoch_number(ProxiesDict)]),
machi_partition_simulator:no_partitions();
true ->
?QC_FMT("{e=~w},", [get_biggest_private_epoch_number(ProxiesDict)]),
machi_partition_simulator:no_partitions()
change_partitions(OldThreshold, NoPartitionThreshold)
end,
Res = exec_ticks(Num, All_listE),
if not is_integer(OldThreshold) ->
?QC_FMT("{e=~w},", [get_biggest_private_epoch_number(ProxiesDict)]);
?V("{e=~w},", [get_biggest_private_epoch_number(ProxiesDict)]);
true ->
ok
end,
@ -214,73 +293,37 @@ get_biggest_private_epoch_number(ProxiesDict) ->
dump_state() ->
try
?QC_FMT("dump_state(", []),
?V("dump_state(", []),
{_PSimPid, _SupPid, ProxiesDict, _AlE} = get(manager_pids_hack),
Report = ?MGRTEST:unanimous_report(ProxiesDict),
?V(",z~w", [?LINE]),
Namez = ProxiesDict,
%% ?QC_FMT("Report ~p\n", [Report]),
%% Diag1 = [begin
%% {ok, Ps} = ?FLU_PC:get_all_projections(Proxy, Type),
%% [io_lib:format("~p ~p ~p: ~w\n", [FLUName, Type, P#projection_v1.epoch_number, machi_projection:make_summary(P)]) || P <- Ps]
%% end || {FLUName, Proxy} <- orddict:to_list(ProxiesDict),
%% Type <- [public] ],
UniquePrivateEs =
lists:usort(lists:flatten(
[element(2,?FLU_PC:list_all_projections(Proxy,private)) ||
{_FLUName, Proxy} <- orddict:to_list(ProxiesDict)])),
P_lists0 = [{FLUName, Type,
element(2,?FLU_PC:get_all_projections(Proxy, Type))} ||
{FLUName, Proxy} <- orddict:to_list(ProxiesDict),
Type <- [public,private]],
P_lists = [{FLUName, Type, P} || {FLUName, Type, Ps} <- P_lists0,
P <- Ps],
AllDict = lists:foldl(fun({FLU, Type, P}, D) ->
K = {FLU, Type, P#projection_v1.epoch_number},
dict:store(K, P, D)
end, dict:new(), lists:flatten(P_lists)),
DumbFinderBackward =
fun(FLUName) ->
fun(E, error_unwritten) ->
case dict:find({FLUName, private, E}, AllDict) of
{ok, T} -> T;
error -> error_unwritten
end;
(_E, Acc) ->
Acc
end
end,
%% Diag2 = [[
%% io_lib:format("~p private: ~w\n",
%% [FLUName,
%% machi_projection:make_summary(
%% lists:foldl(DumbFinderBackward(FLUName),
%% error_unwritten,
%% lists:seq(Epoch, 0, -1)))])
%% || {FLUName, _FLU} <- Namez]
%% || Epoch <- UniquePrivateEs],
PrivProjs = [{Name, begin
{ok, Ps} = ?FLU_PC:get_all_projections(Proxy,
private),
?V(",z~w", [?LINE]),
[P || P <- Ps,
P#projection_v1.epoch_number /= 0]
end} || {Name, Proxy} <- ProxiesDict],
?QC_FMT(")", []),
?V(",z~w", [?LINE]),
?V(",~w", [catch application:stop(machi)]),
?V(")", []),
Diag1 = Diag2 = "skip_diags",
{Report, PrivProjs, lists:flatten([Diag1, Diag2])}
catch XX:YY ->
?QC_FMT("OUCH: ~p ~p @ ~p\n", [XX, YY, erlang:get_stacktrace()]),
?QC_FMT("Exiting now to move to manual post-mortem....\n", []),
erlang:halt(0),
?V("OUCH: ~p ~p @ ~p\n", [XX, YY, erlang:get_stacktrace()]),
?V("Exiting now to move to manual post-mortem....\n", []),
erlang:halt(66),
false
end.
prop_pulse() ->
?FORALL({Cmds0, Seed}, {non_empty(commands(?MODULE)), pulse:seed()},
?IMPLIES(1 < length(Cmds0) andalso length(Cmds0) < 5,
prop_pulse(new).
prop_pulse(Style) when Style == new; Style == regression ->
_ = application:start(crypto),
?FORALL({Cmds0, Seed}, {gen_commands(Style), pulse:seed()},
?IMPLIES(length(Cmds0) < 11,
begin
ok = shutdown_hard(),
%% PULSE can be really unfair, of course, including having exec_ticks
@ -294,32 +337,32 @@ prop_pulse() ->
{call, ?MODULE, private_stable_check, []}}],
LastTriggerTicks = {set,{var,99999997},
{call, ?MODULE, do_ticks, [123, undefined, no, no]}},
Cmds1 = lists:duplicate(2, LastTriggerTicks),
%% Cmds1 = lists:duplicate(length(all_list())*2, LastTriggerTicks),
Cmds1 = lists:duplicate(4, LastTriggerTicks),
Cmds = Cmds0 ++
Stabilize1 ++
Cmds1 ++
Stabilize2 ++
[{set,{var,99999999}, {call, ?MODULE, dump_state, []}}],
error_logger:tty(false),
pulse:verbose([format]),
{_H2, S2, Res} = pulse:run(
fun() ->
{_H, _S, _R} = run_commands(?MODULE, Cmds)
end, [{seed, Seed},
{strategy, unfair}]),
%% ?QC_FMT("S2 ~p\n", [S2]),
case S2#state.dump_state of
undefined ->
?QC_FMT("BUMMER Cmds = ~p\n", [Cmds]);
_ ->
ok
end,
ok = shutdown_hard(),
{Report, PrivProjs, Diag} = S2#state.dump_state,
%% Report is ordered by Epoch. For each private projection
%% written during any given epoch, confirm that all chain
%% members appear in only one unique chain, i.e., the sets of
%% unique chains are disjoint.
AllDisjointP = ?MGRTEST:all_reports_are_disjoint(Report),
{AllDisjointP, AllDisjointDetail} =
case ?MGRTEST:all_reports_are_disjoint(Report) of
true -> {true, true};
Else -> {false, Else}
end,
%% For each chain transition experienced by a particular FLU,
%% confirm that each state transition is OK.
@ -328,40 +371,82 @@ prop_pulse() ->
Ps, FLU)} ||
{FLU, Ps} <- PrivProjs],
SaneP = lists:all(fun({_FLU, SaneRes}) -> SaneRes == true end, Sane),
%% The final report item should say that all are agreed_membership.
%% On a really bad day, this could trigger a badmatch exception....
{_LastEpoch, {ok_disjoint, LastRepXs}} = lists:last(Report),
AgreedOrNot = lists:usort([element(1, X) || X <- LastRepXs]),
%% TODO: Check that we've converged to a single chain with no repairs.
SingleChainNoRepair = case LastRepXs of
[{agreed_membership,{_UPI,[]}}] ->
true;
_ ->
LastRepXs
end,
{SingleChainNoRepair_p, SingleChainNoRepairDetail} =
case LastRepXs of
[LastUPI] when length(LastUPI) == S2#state.num_pids ->
{true, true};
_ ->
{false, LastRepXs}
end,
ok = shutdown_hard(),
?WHENFAIL(
begin
%% ?QC_FMT("PrivProjs = ~P\n", [PrivProjs, 50]),
?QC_FMT("Report = ~p\n", [Report]),
?QC_FMT("Cmds = ~p\n", [Cmds]),
?QC_FMT("Res = ~p\n", [Res]),
?QC_FMT("Diag = ~s\n", [Diag]),
?QC_FMT("Report = ~p\n", [Report]),
?QC_FMT("PrivProjs = ~p\n", [PrivProjs]),
?QC_FMT("Sane = ~p\n", [Sane]),
?QC_FMT("SingleChainNoRepair failure =\n ~p\n", [SingleChainNoRepair])
,erlang:halt(0)
?QC_FMT("AllDisjointDetail = ~p\n", [AllDisjointDetail]),
?QC_FMT("SingleChainNoRepair failure = ~p\n", [SingleChainNoRepairDetail])
,?QC_FMT("\n\nHalting now!!!!!!!!!!\n\n", []),timer:sleep(500),erlang:halt(1)
end,
conjunction([{res, Res == true orelse Res == ok},
{all_disjoint, AllDisjointP},
{sane, SaneP},
{all_agreed_at_end, AgreedOrNot == [agreed_membership]},
{single_chain_no_repair, SingleChainNoRepair}
{single_chain_no_repair, SingleChainNoRepair_p}
]))
end)).
prop_pulse_test_() ->
-define(FIXTURE(TIMEOUT, EXTRATO, FUN), {timeout, (Timeout+ExtraTO+600), FUN}).
prop_pulse_new_test_() ->
{Timeout, ExtraTO} = get_timeouts(),
DoShrink = get_do_shrink(),
F = fun() ->
?assert(do_quickcheck(DoShrink, Timeout, new))
end,
case os:getenv("PULSE_SKIP_NEW") of
false ->
?FIXTURE(Timeout, ExtraTO, F);
_ ->
{timeout, 5,
fun() -> timer:sleep(200),
io:format(user, " (skip new style) ", []) end}
end.
%% See gen_commands() for more detail on the regression tests.
prop_pulse_regression_test_() ->
{Timeout, ExtraTO} = get_timeouts(),
DoShrink = get_do_shrink(),
F = fun() ->
?assert(do_quickcheck(DoShrink, Timeout, regression))
end,
case os:getenv("PULSE_SKIP_REGRESSION") of
false ->
?FIXTURE(Timeout, ExtraTO, F);
_ ->
{timeout, 5,
fun() -> timer:sleep(200),
io:format(user, " (skip regression style) ", []) end}
end.
do_quickcheck(Timeout, Style) ->
do_quickcheck(true, Timeout, Style).
do_quickcheck(true, Timeout, Style) ->
eqc:quickcheck(eqc:testing_time(Timeout,
?QC_OUT(prop_pulse(Style))));
do_quickcheck(false, Timeout, Style) ->
eqc:quickcheck(eqc:testing_time(Timeout,
?QC_OUT(noshrink(prop_pulse(Style))))).
get_timeouts() ->
Timeout = case os:getenv("PULSE_TIME") of
false -> 60;
Val -> list_to_integer(Val)
@ -370,39 +455,34 @@ prop_pulse_test_() ->
false -> 0;
Val2 -> list_to_integer(Val2)
end,
{timeout, (Timeout+ExtraTO+600), % 600 = a bit more fudge time
fun() ->
?assert(eqc:quickcheck(eqc:testing_time(Timeout,
?QC_OUT(prop_pulse()))))
end}.
{Timeout, ExtraTO}.
get_do_shrink() ->
case os:getenv("PULSE_NOSHRINK") of
false ->
true;
_ ->
false
end.
shutdown_hard() ->
?QC_FMT("shutdown(", []),
(catch unlink(whereis(machi_partition_simulator))),
[begin
Pid = whereis(X),
spawn(fun() -> (catch X:stop()) end),
timer:sleep(50),
(catch unlink(Pid)),
timer:sleep(10),
(catch exit(Pid, shutdown)),
timer:sleep(1),
(catch exit(Pid, kill))
end || X <- [machi_partition_simulator, machi_flu_sup] ],
timer:sleep(1),
?QC_FMT(")", []),
_STOP = application:stop(App)
end || App <- [machi] ],
timer:sleep(100),
ok.
exec_ticks(Num, All_listE) ->
Parent = self(),
Pids = [spawn_link(fun() ->
[begin
erlang:yield(),
M_name = P#p_srvr.name,
Max = 10,
%% Max = 10,
Max = 25,
Elapsed =
?MGR:sleep_ranked_order(1, Max, M_name, all_list()),
Res = ?MGR:test_react_to_env(get_chmgr(P)),
Res = ?MGR:trigger_react_to_env(get_chmgr(P)),
timer:sleep(erlang:max(0, Max - Elapsed)),
Res=Res %% ?D({self(), Res})
end || _ <- lists:seq(1,Num)],
@ -421,10 +501,15 @@ private_projections_are_stable_check(ProxiesDict, All_listE) ->
%% also check for flapping, and if yes, to see if all_hosed are
%% all exactly equal.
_ = exec_ticks(40, All_listE),
%% gobble_calls() workaround: many small exec_ticks() calls +
%% sleep after each.
[begin
_ = exec_ticks(10, All_listE),
timer:sleep(10)
end|| _ <- lists:seq(1, 40)],
Private1 = [?FLU_PC:get_latest_epochid(Proxy, private) ||
{_FLU, Proxy} <- orddict:to_list(ProxiesDict)],
_ = exec_ticks(5, All_listE),
_ = exec_ticks(3*20, All_listE),
Private2 = [?FLU_PC:get_latest_epochid(Proxy, private) ||
{_FLU, Proxy} <- orddict:to_list(ProxiesDict)],
@ -433,4 +518,20 @@ private_projections_are_stable_check(ProxiesDict, All_listE) ->
get_chmgr(#p_srvr{props=Ps}) ->
proplists:get_value(chmgr, Ps).
%% {PortBase, DirBase} = get_port_dir_base(),
get_port_dir_base() ->
I = case os:getenv("PULSE_BASE_PORT") of
false ->
0;
II ->
list_to_integer(II)
end,
D = case os:getenv("PULSE_BASE_DIR") of
false ->
"/tmp/c/";
DD ->
DD
end,
{7400 + (I * 100), D ++ "/" ++ integer_to_list(I)}.
-endif. % PULSE

View file

@ -45,172 +45,308 @@
-include_lib("eunit/include/eunit.hrl").
-compile(export_all).
%% @doc Create a summary report of all of the *private* projections of
%% each of the FLUs in the chain, and create a summary for each
%% epoch number.
%%
%% Report format: list({EpochNumber:non_neg_integer(), Report:rpt()})
%% rpt(): {'ok_disjoint', unique_upi_repair_lists()} |
%% {'bummer_NOT_DISJOINT', {flat(), summaries()}
%% unique_upi_repair_lists(): list(upi_and_repair_lists_concatenated())
%% flat(): debugging term; any duplicate in this list is an invalid FLU.
%% summaries(): list({FLU, ProjectionSummary:string() | 'not_in_this_epoch'})
%%
%% Example:
%% [{1,{ok_disjoint,[{agreed_membership,{[a],[b,c]}}]}},
%% {3,{ok_disjoint,[{agreed_membership,{[a],[b,c]}}]}},
%% {8,
%% {ok_disjoint,[{not_agreed,{[a],[b,c]},
%% [{b,not_in_this_epoch},
%% <<65,159,66,113,232,15,156,244,197,
%% 210,39,82,229,84,192,19,27,45,161,38>>]}]}},
%% {10,{ok_disjoint,[{agreed_membership,{[c],[]}}]}},
%% ...]
%%
%% [{1,{ok_disjoint,[[a,b,c]]}},
%% {4,{ok_disjoint,[[a,b,c]]}},
%% {6,{ok_disjoint,[[a,b,c]]}},
%% {16,{ok_disjoint,[[a,b,c]]}},
%% {22,{ok_disjoint,[[b]]}},
%% {1174,
%% {bummer_NOT_DISJOINT,{[a,a,b],
%% [{a,"[{epoch,1174},{author,a},{upi,[a]},{repair,[]},{down,[b]},{d,[{ps,[{a,b},{b,a}]},{nodes_up,[a]}]},{d2,[]}]"},
%% {b,"[{epoch,1174},{author,b},{upi,[b]},{repair,[a]},{down,[]},{d,[{ps,[]},{nodes_up,[a,b]}]},{d2,[]}]"}]}}},
%% ...]
unanimous_report(Namez) ->
UniquePrivateEs =
lists:usort(lists:flatten(
[element(2, ?FLU_PC:list_all_projections(FLU, private)) ||
{_FLUName, FLU} <- Namez])),
[unanimous_report(Epoch, Namez) || Epoch <- UniquePrivateEs,
Epoch /= 0].
[{Epoch, unanimous_report(Epoch, Namez)} || Epoch <- UniquePrivateEs,
Epoch /= 0].
unanimous_report(Epoch, Namez) ->
Projs = [{FLUName,
case ?FLU_PC:read_projection(FLU, private, Epoch) of
{ok, T} ->
machi_chain_manager1:inner_projection_or_self(T);
_Else ->
{FLUName, not_in_this_epoch}
end} || {FLUName, FLU} <- Namez],
FLU_Projs = [{FLUName,
case ?FLU_PC:read_projection(FLU, private, Epoch) of
{ok, T} ->
T;
_Else ->
not_in_this_epoch
end} || {FLUName, FLU} <- Namez],
unanimous_report2(FLU_Projs).
unanimous_report2(FLU_Projs) ->
ProjsSumms = [{FLU, if is_tuple(P) ->
Summ = machi_projection:make_summary(P),
lists:flatten(io_lib:format("~w", [Summ]));
is_atom(P) ->
P
end} || {FLU, P} <- FLU_Projs],
UPI_R_Sums = [{Proj#projection_v1.upi, Proj#projection_v1.repairing,
Proj#projection_v1.epoch_csum} ||
{_FLUname, Proj} <- Projs,
{_FLUname, Proj} <- FLU_Projs,
is_record(Proj, projection_v1)],
UniqueUPIs = lists:usort([UPI || {UPI, _Repairing, _CSum} <- UPI_R_Sums]),
Res =
[begin
case lists:usort([CSum || {U, _Repairing, CSum} <- UPI_R_Sums,
U == UPI]) of
[_1CSum] ->
%% Yay, there's only 1 checksum. Let's check
%% that all FLUs are in agreement.
{UPI, Repairing, _CSum} =
lists:keyfind(UPI, 1, UPI_R_Sums),
Tmp = [{FLU, case proplists:get_value(FLU, Projs) of
P when is_record(P, projection_v1) ->
P#projection_v1.epoch_csum;
Else ->
Else
end} || FLU <- UPI ++ Repairing],
case lists:usort([CSum || {_FLU, CSum} <- Tmp]) of
[_] ->
{agreed_membership, {UPI, Repairing}};
Else2 ->
{not_agreed, {UPI, Repairing}, Else2}
end;
_Else ->
{not_agreed, {undefined, undefined}, Projs}
end
end || UPI <- UniqueUPIs],
AgreedResUPI_Rs = [UPI++Repairing ||
{agreed_membership, {UPI, Repairing}} <- Res],
Tag = case lists:usort(lists:flatten(AgreedResUPI_Rs)) ==
lists:sort(lists:flatten(AgreedResUPI_Rs)) of
true ->
ok_disjoint;
false ->
bummer_NOT_DISJOINT
end,
{Epoch, {Tag, Res}}.
if length(UniqueUPIs) =< 1 ->
{ok_disjoint, UniqueUPIs};
true ->
Flat = lists:flatten(UniqueUPIs),
case lists:usort(Flat) == lists:sort(Flat) of
true ->
{ok_disjoint, UniqueUPIs};
false ->
{bummer_NOT_DISJOINT, {lists:sort(Flat), ProjsSumms}}
end
end.
all_reports_are_disjoint(Report) ->
[] == [X || {_Epoch, Tuple}=X <- Report,
element(1, Tuple) /= ok_disjoint].
extract_chains_relative_to_flu(FLU, Report) ->
{FLU, [{Epoch, UPI, Repairing} ||
{Epoch, {ok_disjoint, Es}} <- Report,
{agreed_membership, {UPI, Repairing}} <- Es,
lists:member(FLU, UPI) orelse lists:member(FLU, Repairing)]}.
chain_to_projection(MyName, Epoch, UPI_list, Repairing_list, All_list) ->
MemberDict = orddict:from_list([{FLU, #p_srvr{name=FLU}} ||
FLU <- All_list]),
machi_projection:new(Epoch, MyName, MemberDict,
All_list -- (UPI_list ++ Repairing_list),
UPI_list, Repairing_list, [{artificial_by, ?MODULE}]).
case [X || {_Epoch, Tuple}=X <- Report,
element(1, Tuple) /= ok_disjoint] of
[] ->
true;
Else ->
Else
end.
-ifndef(PULSE).
simple_chain_state_transition_is_sane_test_() ->
{timeout, 60, fun() -> simple_chain_state_transition_is_sane_test2() end}.
simple_chain_state_transition_is_sane_test2() ->
%% All: A list of all FLUS for a particular test
%% UPI1: some combination of All that represents UPI1
%% Repair1: Some combination of (All -- UP1) that represents Repairing1
%% ... then we test check_simple_chain_state_transition_is_sane() with all
%% possible UPI1 and Repair1.
[true = check_simple_chain_state_transition_is_sane(UPI1, Repair1) ||
%% The five elements below runs on my MacBook Pro in about 4.8 seconds
%% All <- [ [a], [a,b], [a,b,c], [a,b,c,d], [a,b,c,d,e] ],
%% For elements on the same MBP is about 0.15 seconds.
All <- [ [a], [a,b], [a,b,c], [a,b,c,d] ],
UPI1 <- machi_util:combinations(All),
Repair1 <- machi_util:combinations(All -- UPI1)].
%% Given a UPI1 and Repair1 list, we calculate all possible good UPI2
%% lists. For all good {UPI1, Repair1} -> UPI2 transitions, then the
%% simple_chain_state_transition_is_sane() function must be true. For
%% all other UPI2 transitions, simple_chain_state_transition_is_sane()
%% must be false.
%%
%% We include adding an extra possible participant, 'bogus', to the
%% list of all possible UPI2 transitions, just to demonstrate that
%% adding an extra element/participant/thingie is never sane.
check_simple_chain_state_transition_is_sane([], []) ->
true;
check_simple_chain_state_transition_is_sane(UPI1, Repair1) ->
Good_UPI2s = [ X ++ Y || X <- machi_util:ordered_combinations(UPI1),
Y <- machi_util:ordered_combinations(Repair1)],
All_UPI2s = machi_util:combinations(lists:usort(UPI1 ++ Repair1) ++
[bogus]),
[true = ?MGR:simple_chain_state_transition_is_sane(UPI1, Repair1, UPI2) ||
UPI2 <- Good_UPI2s],
[false = ?MGR:simple_chain_state_transition_is_sane(UPI1, Repair1, UPI2) ||
UPI2 <- (All_UPI2s -- Good_UPI2s)],
true.
-ifdef(EQC).
%% This QuickCheck property is crippled: because the old chain state
%% transition check, chain_mgr_legacy:projection_transition_is_sane(),
%% is so buggy and the new check is (apparently) so much better, I
%% have changed the ?WHENFAIL() criteria to check for either agreement
%% _or_ a case where the legacy check says true but the new check says
%% false.
%%
%% On my MacBook Pro, less than 1000 tests are required to find at
%% least one problem case for the legacy check that the new check is
%% correct for. Running for two seconds can do about 3,500 test
%% cases.
compare_eqc_setup_test_() ->
%% Silly QuickCheck can take a long time to start up, check its
%% license, etcetc.
%% machi_chain_manager1_test: compare_eqc_setup_test...[1.788 s] ok
{timeout, 30,
fun() -> eqc:quickcheck(eqc:testing_time(0.1, true)) end}.
-define(COMPARE_TIMEOUT, 1.2).
%% -define(COMPARE_TIMEOUT, 4.8).
compare_legacy_with_v2_chain_transition_check1_test() ->
eqc:quickcheck(
?QC_OUT(
eqc:testing_time(
?COMPARE_TIMEOUT,
prop_compare_legacy_with_v2_chain_transition_check(primitive)))).
compare_legacy_with_v2_chain_transition_check2_test() ->
eqc:quickcheck(
?QC_OUT(
eqc:testing_time(
?COMPARE_TIMEOUT,
prop_compare_legacy_with_v2_chain_transition_check(primitive)))).
prop_compare_legacy_with_v2_chain_transition_check() ->
prop_compare_legacy_with_v2_chain_transition_check(primitive).
prop_compare_legacy_with_v2_chain_transition_check(Style) ->
%% ?FORALL(All, nonempty(list([a,b,c,d,e])),
?FORALL(All, non_empty(some([a,b,c,d])),
?FORALL({Author1, UPI1, Repair1x, Author2, UPI2, Repair2x},
{elements(All),some(All),some(All),elements(All),some(All),some(All)},
?IMPLIES(length(lists:usort(UPI1 ++ Repair1x)) > 0 andalso
length(lists:usort(UPI2 ++ Repair2x)) > 0,
begin
MembersDict = orddict:from_list([{X, #p_srvr{name=X}} || X <- All]),
Repair1 = Repair1x -- UPI1,
Down1 = All -- (UPI1 ++ Repair1),
Repair2 = Repair2x -- UPI2,
Down2 = All -- (UPI2 ++ Repair2),
P1 = machi_projection:new(1, Author1, MembersDict,
Down1, UPI1, Repair1, []),
P2 = machi_projection:new(2, Author2, MembersDict,
Down2, UPI2, Repair2, []),
Old_res = chain_mgr_legacy:projection_transition_is_sane(
P1, P2, Author1, false),
Old_p = case Old_res of true -> true;
_ -> false
end,
case Style of
primitive ->
New_res = ?MGR:chain_state_transition_is_sane(
Author1, UPI1, Repair1, Author2, UPI2, Author2),
New_p = case New_res of true -> true;
_ -> false
end;
whole ->
New_res = machi_chain_manager1:projection_transition_is_sane(
P1, P2, Author1, false),
New_p = case New_res of true -> true;
_ -> false
end
end,
(catch ets:insert(count,
{{Author1, UPI1, Repair1, Author2, UPI2, Repair2}, true})),
?WHENFAIL(io:format(user,
"Old_res: ~p/~p (~p)\nNew_res: ~p/~p (why line ~P)\n",
[Old_p, Old_res, catch get(why1),
New_p, New_res, catch get(why2), 30]),
%% Old_p == New_p)
Old_p == New_p orelse (Old_p == true andalso New_p == false))
end))).
some(L) ->
?LET(L2, list(oneof(L)),
dedupe(L2)).
dedupe(L) ->
dedupe(L, []).
dedupe([H|T], Seen) ->
case lists:member(H, Seen) of
false ->
[H|dedupe(T, [H|Seen])];
true ->
dedupe(T, Seen)
end;
dedupe([], _) ->
[].
make_prop_ets() ->
ets:new(count, [named_table, set, public]).
-endif. % EQC
make_advance_fun(FitList, FLUList, MgrList, Num) ->
fun() ->
[begin
[catch machi_fitness:trigger_early_adjustment(Fit, Tgt) ||
Fit <- FitList,
Tgt <- FLUList ],
[catch ?MGR:trigger_react_to_env(Mgr) || Mgr <- MgrList],
ok
end || _ <- lists:seq(1, Num)]
end.
smoke0_test() ->
{ok, _} = machi_partition_simulator:start_link({1,2,3}, 50, 50),
Host = "localhost",
TcpPort = 6623,
{ok, FLUa} = machi_flu1:start_link([{a,TcpPort,"./data.a"}]),
Pa = #p_srvr{name=a, address=Host, port=TcpPort},
Members_Dict = machi_projection:make_members_dict([Pa]),
%% Egadz, more racing on startup, yay. TODO fix.
timer:sleep(1),
{[Pa], [M0], _Dirs} = machi_test_util:start_flu_packages(
1, TcpPort, "./data.", []),
{ok, FLUaP} = ?FLU_PC:start_link(Pa),
{ok, M0} = ?MGR:start_link(a, Members_Dict, [{active_mode, false}]),
_SockA = machi_util:connect(Host, TcpPort),
try
pong = ?MGR:ping(M0)
after
ok = ?MGR:stop(M0),
ok = machi_flu1:stop(FLUa),
ok = ?FLU_PC:quit(FLUaP),
ok = machi_partition_simulator:stop()
machi_test_util:stop_flu_packages()
end.
smoke1_test() ->
machi_partition_simulator:start_link({1,2,3}, 100, 0),
TcpPort = 62777,
FluInfo = [{a,TcpPort+0,"./data.a"}, {b,TcpPort+1,"./data.b"}, {c,TcpPort+2,"./data.c"}],
P_s = [#p_srvr{name=Name, address="localhost", port=Port} ||
{Name,Port,_Dir} <- FluInfo],
smoke1_test_() ->
{timeout, 1*60, fun() -> smoke1_test2() end}.
[machi_flu1_test:clean_up_data_dir(Dir) || {_,_,Dir} <- FluInfo],
FLUs = [element(2, machi_flu1:start_link([{Name,Port,Dir}])) ||
{Name,Port,Dir} <- FluInfo],
MembersDict = machi_projection:make_members_dict(P_s),
{ok, M0} = ?MGR:start_link(a, MembersDict, [{active_mode,false}]),
smoke1_test2() ->
TcpPort = 62777,
MgrOpts = [{active_mode,false}],
try
{ok, P1} = ?MGR:test_calc_projection(M0, false),
{Ps, MgrNames, _Dirs} = machi_test_util:start_flu_packages(
3, TcpPort, "./data.", MgrOpts),
MembersDict = machi_projection:make_members_dict(Ps),
[machi_chain_manager1:set_chain_members(M, MembersDict) || M <- MgrNames],
Ma = hd(MgrNames),
{ok, P1} = ?MGR:test_calc_projection(Ma, false),
% DERP! Check for race with manager's proxy vs. proj listener
case ?MGR:test_read_latest_public_projection(M0, false) of
{error, partition} -> timer:sleep(500);
_ -> ok
end,
{local_write_result, ok,
{remote_write_results, [{b,ok},{c,ok}]}} =
?MGR:test_write_public_projection(M0, P1),
{unanimous, P1, Extra1} = ?MGR:test_read_latest_public_projection(M0, false),
ok = lists:foldl(
fun(_, {_,{true,[{c,ok},{b,ok},{a,ok}]}}) ->
ok; % Short-circuit remaining attempts
(_, ok) ->
ok; % Skip remaining!
(_, _Else) ->
timer:sleep(10),
?MGR:test_write_public_projection(Ma, P1)
end, not_ok, lists:seq(1, 1000)),
%% Writing the exact same projection multiple times returns ok:
%% no change!
{_,{true,[{c,ok},{b,ok},{a,ok}]}} = ?MGR:test_write_public_projection(Ma, P1),
{unanimous, P1, Extra1} = ?MGR:test_read_latest_public_projection(Ma, false),
ok
after
ok = ?MGR:stop(M0),
[ok = machi_flu1:stop(X) || X <- FLUs],
ok = machi_partition_simulator:stop()
machi_test_util:stop_flu_packages()
end.
nonunanimous_setup_and_fix_test() ->
%% TODO attack list:
%% __ Add start option to chain manager to be "passive" only, i.e.,
%% not immediately go to work on
%% 1. Start FLUs with full complement of FLU+proj+chmgr.
%% 2. Put each of them under a supervisor?
%% - Sup proc could be a created-specifically-for-test thing, perhaps?
%% Rather than relying on a supervisor with reg name + OTP app started
%% plus plus more more yaddayadda?
%% 3. Add projection catalog/orddict of #p_srvr records??
%% 4. Fix this test, etc etc.
machi_partition_simulator:start_link({1,2,3}, 100, 0),
nonunanimous_setup_and_fix_test_() ->
os:cmd("rm -f /tmp/moomoo.*"),
{timeout, 1*60, fun() -> nonunanimous_setup_and_fix_test2() end}.
nonunanimous_setup_and_fix_test2() ->
TcpPort = 62877,
FluInfo = [{a,TcpPort+0,"./data.a"}, {b,TcpPort+1,"./data.b"}],
P_s = [#p_srvr{name=Name, address="localhost", port=Port} ||
{Name,Port,_Dir} <- FluInfo],
[machi_flu1_test:clean_up_data_dir(Dir) || {_,_,Dir} <- FluInfo],
FLUs = [element(2, machi_flu1:start_link([{Name,Port,Dir}])) ||
{Name,Port,Dir} <- FluInfo],
[Proxy_a, Proxy_b] = Proxies =
[element(2,?FLU_PC:start_link(P)) || P <- P_s],
MembersDict = machi_projection:make_members_dict(P_s),
XX = [],
%% XX = [{private_write_verbose,true}],
{ok, Ma} = ?MGR:start_link(a, MembersDict, [{active_mode, false}]++XX),
{ok, Mb} = ?MGR:start_link(b, MembersDict, [{active_mode, false}]++XX),
MgrOpts = [{active_mode,false}],
{Ps, [Ma,Mb,Mc], Dirs} = machi_test_util:start_flu_packages(
3, TcpPort, "./data.", MgrOpts),
MembersDict = machi_projection:make_members_dict(Ps),
ChainName = my_little_chain,
[machi_chain_manager1:set_chain_members(M, ChainName, 0, ap_mode,
MembersDict, []) || M <- [Ma, Mb]],
[Proxy_a, Proxy_b, Proxy_c] = Proxies =
[element(2, ?FLU_PC:start_link(P)) || P <- Ps],
try
{ok, P1} = ?MGR:test_calc_projection(Ma, false),
@ -224,38 +360,187 @@ nonunanimous_setup_and_fix_test() ->
ok = ?FLU_PC:write_projection(Proxy_b, public, P1b),
%% ?D(x),
{not_unanimous,_,_}=_XX = ?MGR:test_read_latest_public_projection(Ma, false),
{not_unanimous,_,_}=_XX = ?MGR:test_read_latest_public_projection(
Ma, false),
%% ?Dw(_XX),
{not_unanimous,_,_}=_YY = ?MGR:test_read_latest_public_projection(Ma, true),
{not_unanimous,_,_}=_YY = ?MGR:test_read_latest_public_projection(
Ma, true),
%% The read repair here doesn't automatically trigger the creation of
%% a new projection (to try to create a unanimous projection). So
%% we expect nothing to change when called again.
{not_unanimous,_,_}=_YY = ?MGR:test_read_latest_public_projection(Ma, true),
{not_unanimous,_,_}=_YY = ?MGR:test_read_latest_public_projection(
Ma, true),
{now_using, _, EpochNum_a} = ?MGR:test_react_to_env(Ma),
{no_change, _, EpochNum_a} = ?MGR:test_react_to_env(Ma),
{now_using, _, EpochNum_a} = ?MGR:trigger_react_to_env(Ma),
{no_change, _, EpochNum_a} = ?MGR:trigger_react_to_env(Ma),
{unanimous,P2,_E2} = ?MGR:test_read_latest_public_projection(Ma, false),
{ok, P2pa} = ?FLU_PC:read_latest_projection(Proxy_a, private),
P2 = P2pa#projection_v1{dbg2=[]},
%% FLUb should have nothing written to private because it hasn't
%% reacted yet.
{error, not_written} = ?FLU_PC:read_latest_projection(Proxy_b, private),
%% Poke FLUb to react ... should be using the same private proj
%% as FLUa.
{now_using, _, EpochNum_a} = ?MGR:test_react_to_env(Mb),
{now_using, _, EpochNum_a} = ?MGR:trigger_react_to_env(Mb),
{ok, P2pb} = ?FLU_PC:read_latest_projection(Proxy_b, private),
P2 = P2pb#projection_v1{dbg2=[]},
Mgrs = [a_chmgr, b_chmgr, c_chmgr],
Advance = make_advance_fun([a_fitness,b_fitness,c_fitness],
[a,b,c],
Mgrs,
3),
Advance(),
{_, _, TheEpoch_3} = ?MGR:trigger_react_to_env(Ma),
{_, _, TheEpoch_3} = ?MGR:trigger_react_to_env(Mb),
{_, _, TheEpoch_3} = ?MGR:trigger_react_to_env(Mc),
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Remove 'a' from the chain.\n", []),
MembersDict4 = machi_projection:make_members_dict(tl(Ps)),
ok = machi_chain_manager1:set_chain_members(
Mb, ChainName, TheEpoch_3, ap_mode, MembersDict4, []),
Advance(),
{ok, {true, _,_,_}} = ?FLU_PC:wedge_status(Proxy_a),
{_, _, TheEpoch_4} = ?MGR:trigger_react_to_env(Mb),
{_, _, TheEpoch_4} = ?MGR:trigger_react_to_env(Mc),
[{ok, #projection_v1{upi=[b,c], repairing=[]}} =
?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- tl(Proxies)],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Add a to the chain again (a is running).\n", []),
MembersDict5 = machi_projection:make_members_dict(Ps),
ok = machi_chain_manager1:set_chain_members(
Mb, ChainName, TheEpoch_4, ap_mode, MembersDict5, []),
Advance(),
{_, _, TheEpoch_5} = ?MGR:trigger_react_to_env(Ma),
{_, _, TheEpoch_5} = ?MGR:trigger_react_to_env(Mb),
{_, _, TheEpoch_5} = ?MGR:trigger_react_to_env(Mc),
[{ok, #projection_v1{upi=[b,c], repairing=[a]}} =
?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- Proxies],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Stop a while a chain member, advance b&c.\n", []),
ok = machi_flu_psup:stop_flu_package(a),
Advance(),
{_, _, TheEpoch_6} = ?MGR:trigger_react_to_env(Mb),
{_, _, TheEpoch_6} = ?MGR:trigger_react_to_env(Mc),
[{ok, #projection_v1{upi=[b,c], repairing=[]}} =
?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- tl(Proxies)],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Remove 'a' from the chain.\n", []),
MembersDict7 = machi_projection:make_members_dict(tl(Ps)),
ok = machi_chain_manager1:set_chain_members(
Mb, ChainName, TheEpoch_6, ap_mode, MembersDict7, []),
Advance(),
{_, _, TheEpoch_7} = ?MGR:trigger_react_to_env(Mb),
{_, _, TheEpoch_7} = ?MGR:trigger_react_to_env(Mc),
[{ok, #projection_v1{upi=[b,c], repairing=[]}} =
?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- tl(Proxies)],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Start a, advance.\n", []),
Opts = [{active_mode, false}, {initial_wedged, true}],
#p_srvr{name=NameA} = hd(Ps),
{ok,_}=machi_flu_psup:start_flu_package(NameA, TcpPort+1, hd(Dirs), Opts),
Advance(),
{ok, {true, _,_,_}} = ?FLU_PC:wedge_status(Proxy_a),
{ok, {false, EpochID_8,_,_}} = ?FLU_PC:wedge_status(Proxy_b),
{ok, {false, EpochID_8,_,_}} = ?FLU_PC:wedge_status(Proxy_c),
[{ok, #projection_v1{upi=[b,c], repairing=[]}} =
?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- tl(Proxies)],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Stop a, delete a's data, leave it stopped\n", []),
ok = machi_flu_psup:stop_flu_package(a),
Advance(),
machi_flu1_test:clean_up_data_dir(hd(Dirs)),
{ok, {false, _,_,_}} = ?FLU_PC:wedge_status(Proxy_b),
{ok, {false, _,_,_}} = ?FLU_PC:wedge_status(Proxy_c),
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Add a to the chain again (a is stopped).\n", []),
MembersDict9 = machi_projection:make_members_dict(Ps),
{_, _, TheEpoch_9} = ?MGR:trigger_react_to_env(Mb),
ok = machi_chain_manager1:set_chain_members(
Mb, ChainName, TheEpoch_9, ap_mode, MembersDict9, []),
Advance(),
{_, _, TheEpoch_9b} = ?MGR:trigger_react_to_env(Mb),
true = (TheEpoch_9b > TheEpoch_9),
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("STEP: Start a, and it joins like it ought to\n", []),
{ok,_}=machi_flu_psup:start_flu_package(NameA, TcpPort+1, hd(Dirs), Opts),
Advance(),
{ok, {false, {TheEpoch10,_},_,_}} = ?FLU_PC:wedge_status(Proxy_a),
{ok, {false, {TheEpoch10,_},_,_}} = ?FLU_PC:wedge_status(Proxy_b),
{ok, {false, {TheEpoch10,_},_,_}} = ?FLU_PC:wedge_status(Proxy_c),
[{ok, #projection_v1{upi=[b,c], repairing=[a]}} =
?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- Proxies],
ok
after
ok = ?MGR:stop(Ma),
ok = ?MGR:stop(Mb),
[ok = ?FLU_PC:quit(X) || X <- Proxies],
[ok = machi_flu1:stop(X) || X <- FLUs],
ok = machi_partition_simulator:stop()
machi_test_util:stop_flu_packages()
end.
unanimous_report_test() ->
TcpPort = 63877,
FluInfo = [{a,TcpPort+0,"./data.a"}, {b,TcpPort+1,"./data.b"}],
P_s = [#p_srvr{name=Name, address="localhost", port=Port} ||
{Name,Port,_Dir} <- FluInfo],
MembersDict = machi_projection:make_members_dict(P_s),
E5 = 5,
UPI5 = [a,b],
Rep5 = [],
Report5 = [UPI5],
P5 = machi_projection:new(E5, a, MembersDict, [], UPI5, Rep5, []),
{ok_disjoint, Report5} =
unanimous_report2([{a, P5}, {b, P5}]),
{ok_disjoint, Report5} =
unanimous_report2([{a, not_in_this_epoch}, {b, P5}]),
{ok_disjoint, Report5} =
unanimous_report2([{a, P5}, {b, not_in_this_epoch}]),
UPI5_b = [a],
Rep5_b = [],
P5_b = machi_projection:new(E5, b, MembersDict, [b], UPI5_b, Rep5_b, []),
{bummer_NOT_DISJOINT, _} = unanimous_report2([{a, P5}, {b, P5_b}]),
UPI5_c = [b],
Rep5_c = [a],
P5_c = machi_projection:new(E5, b, MembersDict, [], UPI5_c, Rep5_c, []),
{bummer_NOT_DISJOINT, _} =
unanimous_report2([{a, P5}, {b, P5_c}]),
P_s3 = [#p_srvr{name=Name, address="localhost", port=Port} ||
{Name,Port,_Dir} <- FluInfo ++ [{c,TcpPort+0,"./data.c"}]],
MembersDict3 = machi_projection:make_members_dict(P_s3),
UPI5_d = [c],
Rep5_d = [a],
Report5d = [UPI5, UPI5_d],
P5_d = machi_projection:new(E5, b, MembersDict3, [b], UPI5_d, Rep5_d, []),
{ok_disjoint, Report5d} = unanimous_report2([{a, P5}, {b, P5_d}]),
UPI5_e = [b],
Rep5_e = [c],
Report5be = [UPI5_b, UPI5_e],
P5_e = machi_projection:new(E5, b, MembersDict3, [a], UPI5_e, Rep5_e, []),
{bummer_NOT_DISJOINT, _} = unanimous_report2([{a, P5}, {b, P5_e}]),
{ok_disjoint, Report5be} = unanimous_report2([{a, P5_b}, {b, P5_e}]),
ok.
-endif. % !PULSE
-endif. % TEST

68
test/machi_cinfo_test.erl Normal file
View file

@ -0,0 +1,68 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(machi_cinfo_test).
-ifdef(TEST).
-ifndef(PULSE).
-include_lib("eunit/include/eunit.hrl").
-include("machi_projection.hrl").
%% smoke_test() will try just dump cluster_info and call each functions
smoke_test_() ->
{setup,
fun setup/0,
fun cleanup/1,
[
fun() -> machi_cinfo:public_projection(a) end,
fun() -> machi_cinfo:private_projection(a) end,
fun() -> machi_cinfo:fitness(a) end,
fun() -> machi_cinfo:chain_manager(a) end,
fun() -> machi_cinfo:flu1(a) end,
fun() -> machi_cinfo:dump() end
]}.
setup() ->
machi_cinfo:register(),
Ps = [{a,#p_srvr{name=a, address="localhost", port=5555, props="./data.a"}},
{b,#p_srvr{name=b, address="localhost", port=5556, props="./data.b"}},
{c,#p_srvr{name=c, address="localhost", port=5557, props="./data.c"}}
],
[os:cmd("rm -rf " ++ P#p_srvr.props) || {_,P} <- Ps],
{ok, SupPid} = machi_sup:start_link(),
%% Only run a, don't run b & c so we have 100% failures talking to them
[begin
#p_srvr{name=Name, port=Port, props=Dir} = P,
{ok, _} = machi_flu_psup:start_flu_package(Name, Port, Dir, [])
end || {_,P} <- [hd(Ps)]],
machi_chain_manager1:set_chain_members(a_chmgr, orddict:from_list(Ps)),
{SupPid, Ps}.
cleanup({SupPid, Ps}) ->
exit(SupPid, normal),
[os:cmd("rm -rf " ++ P#p_srvr.props) || {_,P} <- Ps],
machi_util:wait_for_death(SupPid, 100),
ok.
-endif. % !PULSE
-endif. % TEST

View file

@ -30,135 +30,263 @@
smoke_test_() -> {timeout, 1*60, fun() -> smoke_test2() end}.
smoke_test2() ->
setup_smoke_test(Host, PortBase, Os, Witness_list) ->
os:cmd("rm -rf ./data.a ./data.b ./data.c"),
{ok, SupPid} = machi_flu_sup:start_link(),
{ok, _} = machi_util:wait_for_life(machi_flu_sup, 100),
F = fun(X) -> case lists:member(X, Witness_list) of
true ->
[{witness_mode, true}|Os];
false ->
Os
end
end,
{ok,_}=machi_flu_psup:start_flu_package(a, PortBase+0, "./data.a", F(a)),
{ok,_}=machi_flu_psup:start_flu_package(b, PortBase+1, "./data.b", F(b)),
{ok,_}=machi_flu_psup:start_flu_package(c, PortBase+2, "./data.c", F(c)),
D = orddict:from_list(
[{a,{p_srvr,a,machi_flu1_client,"localhost",PortBase+0,[]}},
{b,{p_srvr,b,machi_flu1_client,"localhost",PortBase+1,[]}},
{c,{p_srvr,c,machi_flu1_client,"localhost",PortBase+2,[]}}]),
%% Force the chain to repair & fully assemble as quickly as possible.
%% 1. Use set_chain_members() on all 3
%% 2. Force a to run repair in a tight loop
%% 3. Stop as soon as we see UPI=[a,b,c] and also author=c.
%% Otherwise, we can have a race with later, minor
%% projection changes which will change our understanding of
%% the epoch id. (C is the author with highest weight.)
%% 4. Wait until all others are using epoch id from #3.
%%
%% Damn, this is a pain to make 100% deterministic, bleh.
CMode = if Witness_list == [] -> ap_mode;
Witness_list /= [] -> cp_mode
end,
ok = machi_chain_manager1:set_chain_members(a_chmgr, ch0, 0, CMode,
D, Witness_list),
ok = machi_chain_manager1:set_chain_members(b_chmgr, ch0, 0, CMode,
D, Witness_list),
ok = machi_chain_manager1:set_chain_members(c_chmgr, ch0, 0, CMode,
D, Witness_list),
run_ticks([a_chmgr,b_chmgr,c_chmgr]),
%% Everyone is settled on the same damn epoch id.
{ok, EpochID} = machi_flu1_client:get_latest_epochid(Host, PortBase+0,
private),
{ok, EpochID} = machi_flu1_client:get_latest_epochid(Host, PortBase+1,
private),
{ok, EpochID} = machi_flu1_client:get_latest_epochid(Host, PortBase+2,
private),
{D, EpochID}.
run_ticks(MgrList) ->
TickAll = fun() -> [begin
Pid ! tick_check_environment,
timer:sleep(50)
end || Pid <- MgrList]
end,
_ = lists:foldl(
fun(_, [{_,[a,b,c]}]=Acc) -> Acc;
(_, [{_,[b,c]}]=Acc) -> Acc; %% Fragile, but for witness=[a]
(_, [{_,[a,c]},{yy,b_pstore}]=Acc) -> Acc; %% Fragile, but for witness=[a]
(_, _Acc) ->
TickAll(), % has some sleep time inside
Xs = [try
{ok, Prj} = machi_projection_store:read_latest_projection(PStore, private),
{Prj#projection_v1.author_server,
Prj#projection_v1.upi}
catch _:_ ->
{yy, PStore}
end || PStore <- [a_pstore,b_pstore,c_pstore] ],
lists:usort(Xs)
end, undefined, lists:seq(1,10000)),
ok.
smoke_test2() ->
{ok, SupPid} = machi_sup:start_link(),
error_logger:tty(false),
try
Prefix = <<"pre">>,
Chunk1 = <<"yochunk">>,
NSInfo = undefined,
NoCSum = <<>>,
Host = "localhost",
PortBase = 64444,
PortBase = 64454,
Os = [{ignore_stability_time, true}, {active_mode, false}],
{ok,_}=machi_flu_psup:start_flu_package(a, PortBase+0, "./data.a", Os),
{ok,_}=machi_flu_psup:start_flu_package(b, PortBase+1, "./data.b", Os),
{ok,_}=machi_flu_psup:start_flu_package(c, PortBase+2, "./data.c", Os),
D = orddict:from_list(
[{a,{p_srvr,a,machi_flu1_client,"localhost",PortBase+0,[]}},
{b,{p_srvr,b,machi_flu1_client,"localhost",PortBase+1,[]}},
{c,{p_srvr,c,machi_flu1_client,"localhost",PortBase+2,[]}}]),
%% Force the chain to repair & fully assemble as quickly as possible.
%% 1. Use set_chain_members() on all 3
%% 2. Force a to run repair in a tight loop
%% 3. Stop as soon as we see UPI=[a,b,c] and also author=c.
%% Otherwise, we can have a race with later, minor
%% projection changes which will change our understanding of
%% the epoch id. (C is the author with highest weight.)
%% 4. Wait until all others are using epoch id from #3.
%%
%% Damn, this is a pain to make 100% deterministic, bleh.
ok = machi_chain_manager1:set_chain_members(a_chmgr, D),
ok = machi_chain_manager1:set_chain_members(b_chmgr, D),
ok = machi_chain_manager1:set_chain_members(c_chmgr, D),
TickAll = fun() -> [begin
Pid ! tick_check_environment,
timer:sleep(50)
end || Pid <- [a_chmgr,b_chmgr,c_chmgr] ]
end,
_ = lists:foldl(
fun(_, [{c,[a,b,c]}]=Acc) -> Acc;
(_, _Acc) ->
TickAll(), % has some sleep time inside
Xs = [begin
{ok, Prj} = machi_projection_store:read_latest_projection(PStore, private),
{Prj#projection_v1.author_server,
Prj#projection_v1.upi}
end || PStore <- [a_pstore,b_pstore,c_pstore] ],
lists:usort(Xs)
end, undefined, lists:seq(1,10000)),
%% Everyone is settled on the same damn epoch id.
{ok, EpochID} = machi_flu1_client:get_latest_epochid(Host, PortBase+0,
private),
{ok, EpochID} = machi_flu1_client:get_latest_epochid(Host, PortBase+1,
private),
{ok, EpochID} = machi_flu1_client:get_latest_epochid(Host, PortBase+2,
private),
{D, EpochID} = setup_smoke_test(Host, PortBase, Os, []),
%% Whew ... ok, now start some damn tests.
{ok, C1} = machi_cr_client:start_link([P || {_,P}<-orddict:to_list(D)]),
machi_cr_client:append_chunk(C1, Prefix, Chunk1),
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum),
{ok, {Off1,Size1,File1}} =
machi_cr_client:append_chunk(C1, Prefix, Chunk1),
Chunk1_badcs = {<<?CSUM_TAG_CLIENT_SHA:8, 0:(8*20)>>, Chunk1},
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum),
BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, "foo")},
{error, bad_checksum} =
machi_cr_client:append_chunk(C1, Prefix, Chunk1_badcs),
{ok, Chunk1} = machi_cr_client:read_chunk(C1, File1, Off1, Size1),
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, BadCSum),
{ok, {[{_, Off1, Chunk1, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, Size1, undefined),
{ok, PPP} = machi_flu1_client:read_latest_projection(Host, PortBase+0,
private),
%% Verify that the client's CR wrote to all of them.
[{ok, Chunk1} = machi_flu1_client:read_chunk(
Host, PortBase+X, EpochID, File1, Off1, Size1) ||
[{ok, {[{_, Off1, Chunk1, _}], []}} =
machi_flu1_client:read_chunk(
Host, PortBase+X, NSInfo, EpochID, File1, Off1, Size1, undefined) ||
X <- [0,1,2] ],
%% Test read repair: Manually write to head, then verify that
%% read-repair fixes all.
FooOff1 = Off1 + (1024*1024),
[{error, not_written} = machi_flu1_client:read_chunk(
Host, PortBase+X, EpochID,
File1, FooOff1, Size1) || X <- [0,1,2] ],
ok = machi_flu1_client:write_chunk(Host, PortBase+0, EpochID,
File1, FooOff1, Chunk1),
{ok, Chunk1} = machi_cr_client:read_chunk(C1, File1, FooOff1, Size1),
[{X,{ok, Chunk1}} = {X,machi_flu1_client:read_chunk(
Host, PortBase+X, EpochID,
File1, FooOff1, Size1)} || X <- [0,1,2] ],
Host, PortBase+X, NSInfo, EpochID,
File1, FooOff1, Size1, undefined) || X <- [0,1,2] ],
ok = machi_flu1_client:write_chunk(Host, PortBase+0, NSInfo, EpochID,
File1, FooOff1, Chunk1, NoCSum),
{ok, {[{File1, FooOff1, Chunk1, _}=_YY], []}} =
machi_flu1_client:read_chunk(Host, PortBase+0, NSInfo, EpochID,
File1, FooOff1, Size1, undefined),
{ok, {[{File1, FooOff1, Chunk1, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File1, FooOff1, Size1, undefined),
[?assertMatch({X,{ok, {[{_, FooOff1, Chunk1, _}], []}}},
{X,machi_flu1_client:read_chunk(
Host, PortBase+X, NSInfo, EpochID,
File1, FooOff1, Size1, undefined)})
|| X <- [0,1,2] ],
%% Test read repair: Manually write to middle, then same checking.
FooOff2 = Off1 + (2*1024*1024),
Chunk2 = <<"Middle repair chunk">>,
Size2 = size(Chunk2),
ok = machi_flu1_client:write_chunk(Host, PortBase+1, EpochID,
File1, FooOff2, Chunk2),
{ok, Chunk2} = machi_cr_client:read_chunk(C1, File1, FooOff2, Size2),
[{X,{ok, Chunk2}} = {X,machi_flu1_client:read_chunk(
Host, PortBase+X, EpochID,
File1, FooOff2, Size2)} || X <- [0,1,2] ],
ok = machi_flu1_client:write_chunk(Host, PortBase+1, NSInfo, EpochID,
File1, FooOff2, Chunk2, NoCSum),
{ok, {[{File1, FooOff2, Chunk2, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File1, FooOff2, Size2, undefined),
[{X,{ok, {[{File1, FooOff2, Chunk2, _}], []}}} =
{X,machi_flu1_client:read_chunk(
Host, PortBase+X, NSInfo, EpochID,
File1, FooOff2, Size2, undefined)} || X <- [0,1,2] ],
%% Misc API smoke & minor regression checks
{error, not_written} = machi_cr_client:read_chunk(C1, <<"no">>,
999999999, 1),
{error, partial_read} = machi_cr_client:read_chunk(C1, File1,
Off1, 88888888),
{error, bad_arg} = machi_cr_client:read_chunk(C1, NSInfo, <<"no">>,
999999999, 1, undefined),
{ok, {[{File1,Off1,Chunk1,_}, {File1,FooOff1,Chunk1,_}, {File1,FooOff2,Chunk2,_}],
[]}} =
machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, 88888888, undefined),
%% Checksum list return value is a primitive binary().
{ok, KludgeBin} = machi_cr_client:checksum_list(C1, File1),
true = is_binary(KludgeBin),
{error, no_such_file} = machi_cr_client:checksum_list(C1, <<"!!!!">>),
%% Exactly one file right now
{error, bad_arg} = machi_cr_client:checksum_list(C1, <<"!!!!">>),
io:format(user, "\nFiles = ~p\n", [machi_cr_client:list_files(C1)]),
%% Exactly one file right now, e.g.,
%% {ok,[{2098202,<<"pre^b144ef13-db4d-4c9f-96e7-caff02dc754f^1">>}]}
{ok, [_]} = machi_cr_client:list_files(C1),
%% Go back and test append_chunk_extra() and write_chunk()
%% Go back and test append_chunk() + extra and write_chunk()
Chunk10 = <<"It's a different chunk!">>,
Size10 = byte_size(Chunk10),
Extra10 = 5,
Opts1 = #append_opts{chunk_extra=Extra10*Size10},
{ok, {Off10,Size10,File10}} =
machi_cr_client:append_chunk_extra(C1, Prefix, Chunk10,
Extra10 * Size10),
{ok, Chunk10} = machi_cr_client:read_chunk(C1, File10, Off10, Size10),
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk10,
NoCSum, Opts1),
{ok, {[{_, Off10, Chunk10, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File10, Off10, Size10, undefined),
[begin
Offx = Off10 + (Seq * Size10),
%% TODO: uncomment written/not_written enforcement is available.
%% {error,not_written} = machi_cr_client:read_chunk(C1, File10,
%% {error,not_written} = machi_cr_client:read_chunk(C1, NSInfo, File10,
%% Offx, Size10),
{ok, {Offx,Size10,File10}} =
machi_cr_client:write_chunk(C1, File10, Offx, Chunk10),
{ok, Chunk10} = machi_cr_client:read_chunk(C1, File10, Offx,
Size10)
machi_cr_client:write_chunk(C1, NSInfo, File10, Offx, Chunk10, NoCSum),
{ok, {[{_, Offx, Chunk10, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File10, Offx, Size10, undefined)
end || Seq <- lists:seq(1, Extra10)],
{ok, {Off11,Size11,File11}} =
machi_cr_client:append_chunk(C1, Prefix, Chunk10),
%% Double-check that our reserved extra bytes were really honored!
true = (Off11 > (Off10 + (Extra10 * Size10))),
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk10, NoCSum),
%% %% Double-check that our reserved extra bytes were really honored!
%% true = (Off11 > (Off10 + (Extra10 * Size10))),
io:format(user, "\nFiles = ~p\n", [machi_cr_client:list_files(C1)]),
ok
after
exit(SupPid, normal),
machi_util:wait_for_death(SupPid, 100),
error_logger:tty(true),
catch application:stop(machi)
end.
witness_smoke_test_() -> {timeout, 1*60, fun() -> witness_smoke_test2() end}.
witness_smoke_test2() ->
SupPid = case machi_sup:start_link() of
{ok, P} -> P;
{error, {already_started, P1}} -> P1;
Other -> error(Other)
end,
%% TODO: I wonder why commenting this out makes this test pass
%% error_logger:tty(true),
try
Prefix = <<"pre">>,
Chunk1 = <<"yochunk">>,
NSInfo = undefined,
NoCSum = <<>>,
Host = "localhost",
PortBase = 64444,
Os = [{ignore_stability_time, true}, {active_mode, false},
{consistency_mode, cp_mode}],
OurWitness = a,
{D, EpochID} = setup_smoke_test(Host, PortBase, Os, [OurWitness]),
%% Whew ... ok, now start some damn tests.
{ok, C1} = machi_cr_client:start_link([P || {_,P}<-orddict:to_list(D)]),
{ok, _} = machi_cr_client:append_chunk(C1, NSInfo, Prefix,
Chunk1, NoCSum),
{ok, {Off1,Size1,File1}} =
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum),
BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, "foo")},
{error, bad_checksum} =
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, BadCSum),
{ok, {[{_, Off1, Chunk1, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, Size1, undefined),
%% Stop 'b' and let the chain reset.
ok = machi_flu_psup:stop_flu_package(b),
%% ok = machi_fitness:add_admin_down(a_fitness, admin_down_bogus_flu, [{why,because}]),
%% ok = machi_fitness:delete_admin_down(a_fitness, admin_down_bogus_flu),
%% Run ticks enough times to force auto-unwedge of both a & c.
[run_ticks([a_chmgr,c_chmgr]) || _ <- [1,2,3,4] ],
%% The chain should now be [a,c].
%% Let's wedge OurWitness and see what happens: timeout/partition.
#p_srvr{name=WitName, address=WitA, port=WitP} =
orddict:fetch(OurWitness, D),
{ok, {false, EpochID2,_,_}} = machi_flu1_client:wedge_status(WitA, WitP),
machi_flu1:wedge_myself(WitName, EpochID2),
case machi_flu1_client:wedge_status(WitA, WitP) of
{ok, {true, EpochID2,_,_}} ->
ok;
{ok, {false, EpochID2,_,_}} ->
%% This is racy. Work around it by sleeping a while.
timer:sleep(6*1000),
{ok, {true, EpochID2,_,_}} =
machi_flu1_client:wedge_status(WitA, WitP)
end,
%% Chunk1 is still readable: not affected by wedged witness head.
{ok, {[{_, Off1, Chunk1, _}], []}} =
machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, Size1, undefined),
%% But because the head is wedged, an append will fail.
{error, partition} =
machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum,
#append_opts{}, 1*1000),
%% The witness's wedge status should cause timeout/partition
%% for write_chunk also.
Chunk10 = <<"It's a different chunk!">>,
Size10 = byte_size(Chunk10),
File10 = File1,
Offx = Off1 + (1 * Size10),
{error, partition} =
machi_cr_client:write_chunk(C1, NSInfo, File10, Offx, Chunk10, NoCSum, 1*1000),
ok
after

View file

@ -0,0 +1,131 @@
-module(machi_csum_table_test).
-compile(export_all).
-include_lib("eunit/include/eunit.hrl").
-define(HDR, {0, 1024, none}).
cleanup(Dir) ->
os:cmd("rm -rf " ++ Dir).
smoke_test() ->
Filename = "./temp-checksum-dumb-file",
_ = cleanup(Filename),
{ok, MC} = machi_csum_table:open(Filename, []),
?assertEqual([{1024, infinity}],
machi_csum_table:calc_unwritten_bytes(MC)),
Entry = {Offset, Size, Checksum} = {1064, 34, <<"deadbeef">>},
[] = machi_csum_table:find(MC, Offset, Size),
ok = machi_csum_table:write(MC, Offset, Size, Checksum),
[{1024, 40}, {1098, infinity}] = machi_csum_table:calc_unwritten_bytes(MC),
?assertEqual([Entry], machi_csum_table:find(MC, Offset, Size)),
ok = machi_csum_table:trim(MC, Offset, Size, undefined, undefined),
?assertEqual([{Offset, Size, trimmed}],
machi_csum_table:find(MC, Offset, Size)),
ok = machi_csum_table:close(MC),
ok = machi_csum_table:delete(MC).
close_test() ->
Filename = "./temp-checksum-dumb-file-2",
_ = cleanup(Filename),
{ok, MC} = machi_csum_table:open(Filename, []),
Entry = {Offset, Size, Checksum} = {1064, 34, <<"deadbeef">>},
[] = machi_csum_table:find(MC, Offset, Size),
ok = machi_csum_table:write(MC, Offset, Size, Checksum),
[Entry] = machi_csum_table:find(MC, Offset, Size),
ok = machi_csum_table:close(MC),
{ok, MC2} = machi_csum_table:open(Filename, []),
[Entry] = machi_csum_table:find(MC2, Offset, Size),
ok = machi_csum_table:trim(MC2, Offset, Size, undefined, undefined),
[{Offset, Size, trimmed}] = machi_csum_table:find(MC2, Offset, Size),
ok = machi_csum_table:delete(MC2).
smoke2_test() ->
Filename = "./temp-checksum-dumb-file-3",
_ = cleanup(Filename),
{ok, MC} = machi_csum_table:open(Filename, []),
Entry = {Offset, Size, Checksum} = {1025, 10, <<"deadbeef">>},
ok = machi_csum_table:write(MC, Offset, Size, Checksum),
?assertEqual([], machi_csum_table:find(MC, 0, 0)),
?assertEqual([?HDR], machi_csum_table:find(MC, 0, 1)),
[Entry] = machi_csum_table:find(MC, Offset, Size),
[?HDR] = machi_csum_table:find(MC, 1, 1024),
?assertEqual([?HDR, Entry],
machi_csum_table:find(MC, 1023, 1024)),
[Entry] = machi_csum_table:find(MC, 1024, 1024),
[Entry] = machi_csum_table:find(MC, 1025, 1024),
ok = machi_csum_table:trim(MC, Offset, Size, undefined, undefined),
[{Offset, Size, trimmed}] = machi_csum_table:find(MC, Offset, Size),
ok = machi_csum_table:close(MC),
ok = machi_csum_table:delete(MC).
smoke3_test() ->
Filename = "./temp-checksum-dumb-file-4",
_ = cleanup(Filename),
{ok, MC} = machi_csum_table:open(Filename, []),
Scenario =
[%% Command, {Offset, Size, Csum}, LeftNeighbor, RightNeibor
{?LINE, write, {2000, 10, <<"heh">>}, undefined, undefined},
{?LINE, write, {3000, 10, <<"heh">>}, undefined, undefined},
{?LINE, write, {4000, 10, <<"heh2">>}, undefined, undefined},
{?LINE, write, {4000, 10, <<"heh2">>}, undefined, undefined},
{?LINE, write, {4005, 10, <<"heh3">>}, {4000, 5, <<"heh2">>}, undefined},
{?LINE, write, {4005, 10, <<"heh3">>}, undefined, undefined},
{?LINE, trim, {3005, 10, <<>>}, {3000, 5, <<"heh">>}, undefined},
{?LINE, trim, {2000, 10, <<>>}, undefined, undefined},
{?LINE, trim, {2005, 5, <<>>}, {2000, 5, trimmed}, undefined},
{?LINE, trim, {3000, 5, <<>>}, undefined, undefined},
{?LINE, trim, {4000, 10, <<>>}, undefined, {4010, 5, <<"heh3">>}},
{?LINE, trim, {4010, 5, <<>>}, undefined, undefined},
{?LINE, trim, {0, 1024, <<>>}, undefined, undefined}
],
[ begin
%% ?debugVal({_Line, Chunk}),
{Offset, Size, Csum} = Chunk,
?assertEqual(LeftN0,
machi_csum_table:find_leftneighbor(MC, Offset)),
?assertEqual(RightN0,
machi_csum_table:find_rightneighbor(MC, Offset+Size)),
LeftN = case LeftN0 of
{OffsL, SizeL, trimmed} -> {OffsL, SizeL, trimmed};
{OffsL, SizeL, _} -> {OffsL, SizeL, <<"boom">>};
OtherL -> OtherL
end,
RightN = case RightN0 of
{OffsR, SizeR, _} -> {OffsR, SizeR, <<"boot">>};
OtherR -> OtherR
end,
case Cmd of
write ->
ok = machi_csum_table:write(MC, Offset, Size, Csum,
LeftN, RightN);
trim ->
ok = machi_csum_table:trim(MC, Offset, Size,
LeftN, RightN)
end
end || {_Line, Cmd, Chunk, LeftN0, RightN0} <- Scenario ],
?assert(not machi_csum_table:all_trimmed(MC, 10000)),
machi_csum_table:trim(MC, 0, 10000, undefined, undefined),
?assert(machi_csum_table:all_trimmed(MC, 10000)),
ok = machi_csum_table:close(MC),
ok = machi_csum_table:delete(MC).
%% TODO: add quickcheck test here
%% Previous implementation
-spec all_trimmed2(machi_csum_table:table(),
non_neg_integer(), non_neg_integer()) -> boolean().
all_trimmed2(CsumT, Left, Right) ->
Chunks = machi_csum_table:find(CsumT, Left, Right),
runthru(Chunks, Left, Right).
%% @doc make sure all trimmed chunks are continously chained
%% TODO: test with EQC
runthru([], Pos, Pos) -> true;
runthru([], Pos0, Pos) when Pos0 < Pos -> false;
runthru([{Offset0, Size0, trimmed}|T], Offset, Pos) when Offset0 =< Offset ->
runthru(T, Offset0+Size0, Pos);
runthru(_L, _O, _P) ->
false.

View file

@ -0,0 +1,478 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(machi_file_proxy_eqc).
-ifdef(TEST).
-ifdef(EQC).
-compile(export_all).
-include("machi.hrl").
-include_lib("eqc/include/eqc.hrl").
-include_lib("eqc/include/eqc_statem.hrl").
-include_lib("eunit/include/eunit.hrl").
-define(QC_OUT(P),
eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)).
-define(TESTDIR, "./eqc").
%% EUNIT TEST DEFINITION
eqc_test_() ->
PropTimeout = case os:getenv("EQC_TIME") of
false -> 30;
V -> list_to_integer(V)
end,
{timeout, PropTimeout*2 + 30,
{spawn,
[
?_assertEqual(true, eqc:quickcheck(eqc:testing_time(PropTimeout, ?QC_OUT(prop_ok()))))
]
}}.
%% SHELL HELPERS
test() ->
test(100).
test(N) ->
quickcheck(numtests(N, prop_ok())).
check() ->
check(prop_ok(), current_counterexample()).
%% GENERATORS
csum_type() ->
elements([?CSUM_TAG_NONE, ?CSUM_TAG_CLIENT_SHA, ?CSUM_TAG_SERVER_SHA]).
csum(Type, Binary) ->
case Type of
?CSUM_TAG_NONE -> <<>>;
_ -> machi_util:checksum_chunk(Binary)
end.
position(P) ->
?LET(O, offset(), P + O).
offset() ->
?SUCHTHAT(X, int(), X >= 0).
offset_base() ->
elements([4096, 6144, 7168, 8192, 20480, 100000, 1000000]).
big_offset() ->
?LET(P, int(), ?LET(X, offset_base(), P+X)).
len() ->
?SUCHTHAT(X, int(), X >= 1).
data_with_csum() ->
?LET({B,T},{eqc_gen:largebinary(), csum_type()}, {B,T, csum(T, B)}).
%?LET({B,T},{eqc_gen:binary(), csum_type()}, {B,T, csum(T, B)}).
data_with_csum(Limit) ->
%?LET({B,T},{?LET(S, Limit, eqc_gen:largebinary(S)), csum_type()}, {B,T, csum(T, B)}).
?LET({B,T},{?LET(S, Limit, eqc_gen:binary(S)), csum_type()}, {B,T, csum(T, B)}).
intervals([]) ->
[];
intervals([N]) ->
[{N, choose(1,1)}];
intervals([A,B|T]) ->
[{A, oneof([choose(1, B-A), B-A])}|intervals([B|T])].
interval_list() ->
?LET(L,
oneof([list(choose(1025, 1033)), list(choose(1024, 4096))]),
intervals(lists:usort(L))).
shuffle_interval() ->
?LET(L, interval_list(), shuffle(L)).
get_written_interval(L) ->
?LET({O, Ln}, elements(L), {O+1, Ln-1}).
%% INITIALIZATION
-record(state, {pid, prev_extra = 0,
filename = undefined,
planned_writes=[],
planned_trims=[],
written=[],
trimmed=[]}).
initial_state() ->
{_, _, MS} = os:timestamp(),
Filename = test_server:temp_name("eqc_data") ++ "." ++ integer_to_list(MS),
#state{filename=Filename, written=[{0,1024}]}.
initial_state(I, T) ->
S=initial_state(),
S#state{written=[{0,1024}],
planned_writes=I,
planned_trims=T}.
weight(_S, rewrite) -> 1;
weight(_S, _) -> 2.
%% HELPERS
get_overlaps(_Offset, _Len, [], Acc) -> lists:reverse(Acc);
get_overlaps(Offset, Len, [{Pos, Sz} = Ck|T], Acc0)
%% Overlap judgement differnt from the one in machi_csum_table
%% [a=Offset, b), [x=Pos, y) ...
when
%% a =< x && x < b && b =< y
(Offset =< Pos andalso Pos < Offset + Len andalso Offset + Len =< Pos + Sz) orelse
%% a =< x && y < b
(Offset =< Pos andalso Pos + Sz < Offset + Len) orelse
%% x < a && a < y && y =< b
(Pos < Offset andalso Offset < Pos + Sz andalso Pos + Sz =< Offset + Len) orelse
%% x < a && b < y
(Pos < Offset + Len andalso Offset + Len < Pos + Sz) ->
get_overlaps(Offset, Len, T, [Ck|Acc0]);
get_overlaps(Offset, Len, [_Ck|T], Acc0) ->
get_overlaps(Offset, Len, T, Acc0).
%% Inefficient but simple easy code to verify by eyes - returns all
%% bytes that fits in (Offset, Len)
chop(Offset, Len, List) ->
ChopLeft = fun({Pos, Sz}) when Pos < Offset andalso Offset =< Pos + Sz ->
{Offset, Sz + Pos - Offset};
({Pos, Sz}) when Offset =< Pos andalso Pos + Sz < Offset + Len ->
{Pos, Sz};
({Pos, _Sz}) when Offset =< Pos ->
{Pos, Offset + Len - Pos}
end,
ChopRight = fun({Pos, Sz}) when Offset + Len < Pos + Sz ->
{Pos, Offset + Len - Pos};
({Pos, Sz}) ->
{Pos, Sz}
end,
Filter0 = fun({_, 0}) -> false;
(Other) -> {true, Other} end,
lists:filtermap(fun(E) -> Filter0(ChopRight(ChopLeft(E))) end,
List).
%% Returns all bytes that are at left side of the Offset
chopped_left(_Offset, []) -> undefined;
chopped_left(Offset, [{Pos,_Sz}|_]) when Pos < Offset ->
{Pos, Offset - Pos};
chopped_left(_, _) ->
undefined.
chopped_right(_Offset, []) -> undefined;
chopped_right(Offset, List) ->
{Pos, Sz} = lists:last(List),
if Offset < Pos + Sz ->
{Offset, Pos + Sz - Offset};
true ->
undefined
end.
cleanup_chunk(Offset, Length, ChunkList) ->
Overlaps = get_overlaps(Offset, Length, ChunkList, []),
NewCL0 = lists:foldl(fun lists:delete/2,
ChunkList, Overlaps),
NewCL1 = case chopped_left(Offset, Overlaps) of
undefined -> NewCL0;
LeftRemain -> [LeftRemain|NewCL0]
end,
NewCL2 = case chopped_right(Offset+Length, Overlaps) of
undefined -> NewCL1;
RightRemain -> [RightRemain|NewCL1]
end,
lists:sort(NewCL2).
is_error({error, _}) -> true;
is_error({error, _, _}) -> true;
is_error(Other) -> {expected_ERROR, Other}.
is_ok({ok, _, _}) -> true;
is_ok(ok) -> true;
is_ok(Other) -> {expected_OK, Other}.
get_offset({ok, _Filename, Offset}) -> Offset;
get_offset(_) -> error(badarg).
last_byte([]) -> 0;
last_byte(L0) ->
L1 = lists:map(fun({Pos, Sz}) -> Pos + Sz end, L0),
lists:last(lists:sort(L1)).
cleanup() ->
[begin
Fs = filelib:wildcard(?TESTDIR ++ Glob),
[file:delete(F) || F <- Fs],
[file:del_dir(F) || F <- Fs]
end || Glob <- ["*/*/*/*", "*/*/*", "*/*", "*"] ],
_ = file:del_dir(?TESTDIR),
ok.
%% start
start_pre(S) ->
S#state.pid =:= undefined.
start_command(S) ->
{call, ?MODULE, start, [S]}.
start(#state{filename=File}) ->
{ok, Pid} = machi_file_proxy:start_link(some_flu, File, ?TESTDIR),
unlink(Pid),
Pid.
start_next(S, Pid, _) ->
S#state{pid = Pid}.
%% read
read_pre(S) ->
S#state.pid /= undefined.
read_args(S) ->
[S#state.pid, oneof([offset(), big_offset()]), len()].
read_post(S, [_Pid, Off, L], Res) ->
Written = get_overlaps(Off, L, S#state.written, []),
Chopped = chop(Off, L, Written),
Trimmed = get_overlaps(Off, L, S#state.trimmed, []),
Eof = lists:max([Pos+Sz||{Pos,Sz}<-S#state.written]),
case Res of
{ok, {Written0, Trimmed0}} ->
Written1 = lists:map(fun({_, Pos, Chunk, _}) ->
{Pos, iolist_size(Chunk)}
end, Written0),
Trimmed1 = lists:map(fun({_, Pos, Sz}) -> {Pos, Sz} end, Trimmed0),
Chopped =:= Written1
andalso Trimmed =:= Trimmed1;
%% TODO: such response are ugly, rethink the SPEC
{error, not_written} when Eof < Off + L ->
true;
{error, not_written} when Chopped =:= [] andalso Trimmed =:= [] ->
true;
_Other ->
is_error(Res)
end.
read_next(S, _Res, _Args) -> S.
read(Pid, Offset, Length) ->
machi_file_proxy:read(Pid, Offset, Length, [{needs_trimmed, true}]).
%% write
write_pre(S) ->
S#state.pid /= undefined andalso S#state.planned_writes /= [].
%% do not allow writes with empty data
write_pre(_S, [_Pid, _Extra, {<<>>, _Tag, _Csum}]) ->
?assert(false),
false;
write_pre(_S, _Args) ->
true.
write_args(S) ->
{Off, Len} = hd(S#state.planned_writes),
[S#state.pid, Off, data_with_csum(Len)].
write_post(S, [_Pid, Off, {Bin, _Tag, _Csum}] = _Args, Res) ->
Size = iolist_size(Bin),
case {get_overlaps(Off, Size, S#state.written, []),
get_overlaps(Off, Size, S#state.trimmed, [])} of
{[], []} ->
%% No overlap neither with written ranges nor trimmed
%% ranges; OK to write things.
eq(Res, ok);
{_, _} ->
%% overlap found in either or both at written or at
%% trimmed ranges; can't write.
is_error(Res)
end.
write_next(S, Res, [_Pid, Offset, {Bin, _Tag, _Csum}]) ->
S0 = case is_ok(Res) of
true ->
S#state{written = lists:sort(S#state.written ++ [{Offset, iolist_size(Bin)}]) };
_ ->
S
end,
S0#state{prev_extra = 0, planned_writes=tl(S0#state.planned_writes)}.
write(Pid, Offset, {Bin, Tag, Csum}) ->
Meta = [{client_csum_tag, Tag},
{client_csum, Csum}],
machi_file_proxy:write(Pid, Offset, Meta, Bin).
%% append
append_pre(S) ->
S#state.pid /= undefined.
%% do not allow appends with empty binary data
append_pre(_S, [_Pid, _Extra, {<<>>, _Tag, _Csum}]) ->
false;
append_pre(_S, _Args) ->
true.
append_args(S) ->
[S#state.pid, default(0, len()), data_with_csum()].
append(Pid, Extra, {Bin, Tag, Csum}) ->
Meta = [{client_csum_tag, Tag},
{client_csum, Csum}],
machi_file_proxy:append(Pid, Meta, Extra, Bin).
append_next(S, Res, [_Pid, Extra, {Bin, _Tag, _Csum}]) ->
case is_ok(Res) of
true ->
Offset = get_offset(Res),
S#state{prev_extra = Extra,
written = lists:sort(S#state.written ++ [{Offset, iolist_size(Bin)}])};
_Other ->
S
end.
%% appends should always succeed unless the disk is full
%% or there's a hardware failure.
append_post(S, _Args, Res) ->
case is_ok(Res) of
true ->
Offset = get_offset(Res),
case erlang:max(last_byte(S#state.written),
last_byte(S#state.trimmed)) + S#state.prev_extra of
Offset ->
true;
UnexpectedByte ->
{wrong_offset_after_append,
{Offset, UnexpectedByte},
{S#state.written, S#state.prev_extra}}
end;
Error ->
Error
end.
%% rewrite
rewrite_pre(S) ->
S#state.pid /= undefined andalso
(S#state.written ++ S#state.trimmed) /= [] .
rewrite_args(S) ->
?LET({Off, Len},
get_written_interval(S#state.written ++ S#state.trimmed),
[S#state.pid, Off, data_with_csum(Len)]).
rewrite(Pid, Offset, {Bin, Tag, Csum}) ->
Meta = [{client_csum_tag, Tag},
{client_csum, Csum}],
machi_file_proxy:write(Pid, Offset, Meta, Bin).
rewrite_post(_S, _Args, Res) ->
is_error(Res).
rewrite_next(S, _Res, _Args) ->
S#state{prev_extra = 0}.
%% trim
trim_pre(S) ->
S#state.pid /= undefined andalso S#state.planned_trims /= [].
trim_args(S) ->
{Offset, Length} = hd(S#state.planned_trims),
[S#state.pid, Offset, Length].
trim(Pid, Offset, Length) ->
machi_file_proxy:trim(Pid, Offset, Length, false).
trim_post(_S, [_Pid, _Offset, _Length], ok) ->
true;
trim_post(_S, [_Pid, _Offset, _Length], _Res) ->
false.
trim_next(S, Res, [_Pid, Offset, Length]) ->
S1 = case is_ok(Res) of
true ->
NewWritten = cleanup_chunk(Offset, Length, S#state.written),
Trimmed1 = cleanup_chunk(Offset, Length, S#state.trimmed),
NewTrimmed = lists:sort([{Offset, Length}|Trimmed1]),
S#state{trimmed=NewTrimmed,
written=NewWritten};
_Other ->
S
end,
S1#state{prev_extra=0,
planned_trims=tl(S#state.planned_trims)}.
stop_pre(S) ->
S#state.pid /= undefined.
stop_args(S) ->
[S#state.pid].
stop(Pid) ->
catch machi_file_proxy:stop(Pid).
stop_post(_, _, _) -> true.
stop_next(S, _, _) ->
S#state{pid=undefined, prev_extra=0}.
%% Property
prop_ok() ->
cleanup(),
?FORALL({I, T},
{shuffle_interval(), shuffle_interval()},
?FORALL(Cmds, parallel_commands(?MODULE, initial_state(I, T)),
begin
{H, S, Res} = run_parallel_commands(?MODULE, Cmds),
cleanup(),
pretty_commands(?MODULE, Cmds, {H, S, Res},
aggregate(command_names(Cmds), Res == ok))
end)).
%% Test for tester functions
chopper_test_() ->
[?_assertEqual([{0, 1024}],
get_overlaps(1, 1, [{0, 1024}], [])),
?_assertEqual([],
get_overlaps(10, 5, [{9, 1}, {15, 1}], [])),
?_assertEqual([{9,2},{14,1}],
get_overlaps(10, 5, [{9, 2}, {14, 1}], [])),
?_assertEqual([], chop(0, 0, [{0,2}])),
?_assertEqual([{0, 1}], chop(0, 1, [{0,2}])),
?_assertEqual([], chop(1, 0, [{0,2}])),
?_assertEqual([{1, 1}], chop(1, 1, [{0,2}])),
?_assertEqual([{1, 1}], chop(1, 2, [{0,2}])),
?_assertEqual([], chop(2, 1, [{0,2}])),
?_assertEqual([], chop(2, 2, [{0,2}])),
?_assertEqual([{1, 1}], chop(1, 3, [{0,2}])),
?_assertError(_, chop(3, 1, [{0,2}])),
?_assertEqual([], chop(2, 3, [{0,2}])),
?_assertEqual({0, 1}, chopped_left(1, [{0, 1024}])),
?_assertEqual([{0, 1}, {2, 1022}], cleanup_chunk(1, 1, [{0, 1024}])),
?_assertEqual([{2, 1022}], cleanup_chunk(0, 2, [{0, 1}, {2, 1022}])),
?_assert(true)
].
-endif. % EQC
-endif. % TEST

View file

@ -0,0 +1,142 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(machi_file_proxy_test).
-ifdef(TEST).
-compile(export_all).
-include_lib("eunit/include/eunit.hrl").
-include("machi.hrl").
clean_up_data_dir(DataDir) ->
[begin
Fs = filelib:wildcard(DataDir ++ Glob),
[file:delete(F) || F <- Fs],
[file:del_dir(F) || F <- Fs]
end || Glob <- ["*/*/*/*", "*/*/*", "*/*", "*"] ],
_ = file:del_dir(DataDir),
ok.
-ifndef(PULSE).
-define(TESTDIR, "./t").
-define(HYOOGE, 75 * 1024 * 1024). % 75 MBytes
random_binary_single() ->
%% OK, I guess it's not that random...
<<"Four score and seven years ago our fathers brought forth on this
continent a new nation, conceived in liberty, and dedicated to the
proposition that all men are created equal.
Now we are engaged in a great civil war, testing whether that nation, or any
nation so conceived and so dedicated, can long endure. We are met on a great
battlefield of that war. We have come to dedicate a portion of that field, as a
final resting place for those who here gave their lives that that nation
might live. It is altogether fitting and proper that we should do this.
But, in a larger sense, we can not dedicate, we can not consecrate, we can not
hallow this ground. The brave men, living and dead, who struggled here, have
consecrated it, far above our poor power to add or detract. The world will
little note, nor long remember what we say here, but it can never forget what
they did here. It is for us the living, rather, to be dedicated here to the
unfinished work which they who fought here have thus far so nobly advanced. It
is rather for us to be here dedicated to the great task remaining before us
that from these honored dead we take increased devotion to that cause for which
they gave the last full measure of devotion that we here highly resolve that
these dead shall not have died in vain that this nation, under God, shall have
a new birth of freedom and that government of the people, by the people, for
the people, shall not perish from the earth.">>.
random_binary(Start, End) ->
Size = byte_size(random_binary_single()) - 1,
case End > Size of
true ->
Copies = ( End div Size ) + 1,
D0 = binary:copy(random_binary_single(), Copies),
binary:part(<<D0/binary>>, Start, End);
false ->
binary:part(random_binary_single(), Start, End)
end.
setup() ->
{ok, Pid} = machi_file_proxy:start_link(fluname, "test", ?TESTDIR),
Pid.
teardown(Pid) ->
catch machi_file_proxy:stop(Pid).
machi_file_proxy_test_() ->
clean_up_data_dir(?TESTDIR),
{setup,
fun setup/0,
fun teardown/1,
fun(Pid) ->
[
?_assertEqual({error, bad_arg}, machi_file_proxy:read(Pid, -1, -1)),
?_assertEqual({error, bad_arg}, machi_file_proxy:write(Pid, -1, <<"yo">>)),
?_assertEqual({error, bad_arg}, machi_file_proxy:append(Pid, [], -1, <<"krep">>)),
?_assertMatch({ok, {_, []}}, machi_file_proxy:read(Pid, 1, 1)),
?_assertEqual({error, not_written}, machi_file_proxy:read(Pid, 1024, 1)),
?_assertMatch({ok, {_, []}}, machi_file_proxy:read(Pid, 1, 1024)),
?_assertEqual({error, not_written}, machi_file_proxy:read(Pid, 1024, ?HYOOGE)),
?_assertEqual({error, not_written}, machi_file_proxy:read(Pid, ?HYOOGE, 1)),
{timeout, 10,
?_assertEqual({error, written}, machi_file_proxy:write(Pid, 1, random_binary(0, ?HYOOGE)))},
?_assertMatch({ok, "test", _}, machi_file_proxy:append(Pid, random_binary(0, 1024))),
?_assertEqual({error, written}, machi_file_proxy:write(Pid, 1024, <<"fail">>)),
?_assertEqual({error, written}, machi_file_proxy:write(Pid, 1, <<"fail">>)),
?_assertMatch({ok, {[{_, _, _, _}], []}}, machi_file_proxy:read(Pid, 1025, 1000)),
?_assertMatch({ok, "test", _}, machi_file_proxy:append(Pid, [], 1024, <<"mind the gap">>)),
?_assertEqual(ok, machi_file_proxy:write(Pid, 2060, [], random_binary(0, 1024)))
]
end}.
multiple_chunks_read_test_() ->
clean_up_data_dir(?TESTDIR),
{setup,
fun setup/0,
fun teardown/1,
fun(Pid) ->
[
?_assertEqual(ok, machi_file_proxy:trim(Pid, 0, 1, false)),
?_assertMatch({ok, {[], [{"test", 0, 1}]}},
machi_file_proxy:read(Pid, 0, 1,
#read_opts{needs_trimmed=true})),
?_assertMatch({ok, "test", _}, machi_file_proxy:append(Pid, random_binary(0, 1024))),
?_assertEqual(ok, machi_file_proxy:write(Pid, 10000, <<"fail">>)),
?_assertEqual(ok, machi_file_proxy:write(Pid, 20000, <<"fail">>)),
?_assertEqual(ok, machi_file_proxy:write(Pid, 30000, <<"fail">>)),
%% Freeza
?_assertEqual(ok, machi_file_proxy:write(Pid, 530000, <<"fail">>)),
?_assertMatch({ok, {[{"test", 1024, _, _},
{"test", 10000, <<"fail">>, _},
{"test", 20000, <<"fail">>, _},
{"test", 30000, <<"fail">>, _},
{"test", 530000, <<"fail">>, _}], []}},
machi_file_proxy:read(Pid, 1024, 530000)),
?_assertMatch({ok, {[{"test", 1, _, _}], [{"test", 0, 1}]}},
machi_file_proxy:read(Pid, 0, 1024,
#read_opts{needs_trimmed=true}))
]
end}.
-endif. % !PULSE
-endif. % TEST.

View file

@ -30,6 +30,22 @@
-define(FLU, machi_flu1).
-define(FLU_C, machi_flu1_client).
get_env_vars(App, Ks) ->
Raw = [application:get_env(App, K) || K <- Ks],
Old = lists:zip(Ks, Raw),
{App, Old}.
clean_up_env_vars({App, Old}) ->
[case Res of
undefined ->
application:unset_env(App, K);
{ok, V} ->
application:set_env(App, K, V)
end || {K, Res} <- Old].
filter_env_var({ok, V}) -> V;
filter_env_var(Else) -> Else.
clean_up_data_dir(DataDir) ->
[begin
Fs = filelib:wildcard(DataDir ++ Glob),
@ -39,79 +55,100 @@ clean_up_data_dir(DataDir) ->
_ = file:del_dir(DataDir),
ok.
setup_test_flu(RegName, TcpPort, DataDir) ->
setup_test_flu(RegName, TcpPort, DataDir, []).
start_flu_package(RegName, TcpPort, DataDir) ->
start_flu_package(RegName, TcpPort, DataDir, []).
setup_test_flu(RegName, TcpPort, DataDir, DbgProps) ->
case proplists:get_value(save_data_dir, DbgProps) of
start_flu_package(RegName, TcpPort, DataDir, Props) ->
case proplists:get_value(save_data_dir, Props) of
true ->
ok;
_ ->
clean_up_data_dir(DataDir)
end,
{ok, FLU1} = ?FLU:start_link([{RegName, TcpPort, DataDir},
{dbg, DbgProps}]),
%% TODO the process structuring/racy-ness of the various processes
%% of the FLU needs to be deterministic to remove this sleep race
%% "prevention".
timer:sleep(10),
FLU1.
maybe_start_sup(),
machi_flu_psup:start_flu_package(RegName, TcpPort, DataDir, Props).
stop_flu_package(FluName) ->
machi_flu_psup:stop_flu_package(FluName),
Pid = whereis(machi_sup),
exit(Pid, normal),
machi_util:wait_for_death(Pid, 100).
maybe_start_sup() ->
case whereis(machi_sup) of
undefined ->
machi_sup:start_link(),
%% evil but we have to let stuff start up
timer:sleep(10),
maybe_start_sup();
Pid -> Pid
end.
-ifndef(PULSE).
flu_smoke_test() ->
Host = "localhost",
TcpPort = 32957,
TcpPort = 12957,
DataDir = "./data",
NSInfo = undefined,
NoCSum = <<>>,
Prefix = <<"prefix!">>,
BadPrefix = BadFile = "no/good",
W_props = [{initial_wedged, false}],
FLU1 = setup_test_flu(smoke_flu, TcpPort, DataDir, W_props),
{_, _, _} = machi_test_util:start_flu_package(smoke_flu, TcpPort, DataDir, W_props),
try
Msg = "Hello, world!",
Msg = ?FLU_C:echo(Host, TcpPort, Msg),
{error, no_such_file} = ?FLU_C:checksum_list(Host, TcpPort,
?DUMMY_PV1_EPOCH,
"does-not-exist"),
{error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort,
?DUMMY_PV1_EPOCH, BadFile),
{error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort,"does-not-exist"),
{error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, BadFile),
{ok, []} = ?FLU_C:list_files(Host, TcpPort, ?DUMMY_PV1_EPOCH),
{ok, {false, _}} = ?FLU_C:wedge_status(Host, TcpPort),
{ok, {false, _,_,_}} = ?FLU_C:wedge_status(Host, TcpPort),
Chunk1 = <<"yo!">>,
{ok, {Off1,Len1,File1}} = ?FLU_C:append_chunk(Host, TcpPort,
{ok, {Off1,Len1,File1}} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo,
?DUMMY_PV1_EPOCH,
Prefix, Chunk1),
{ok, Chunk1} = ?FLU_C:read_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH,
File1, Off1, Len1),
{ok, KludgeBin} = ?FLU_C:checksum_list(Host, TcpPort,
?DUMMY_PV1_EPOCH, File1),
Prefix, Chunk1, NoCSum),
{ok, {[{_, Off1, Chunk1, _}], _}} = ?FLU_C:read_chunk(Host, TcpPort,
NSInfo, ?DUMMY_PV1_EPOCH,
File1, Off1, Len1,
noopt),
{ok, KludgeBin} = ?FLU_C:checksum_list(Host, TcpPort, File1),
true = is_binary(KludgeBin),
{error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort,
{error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo,
?DUMMY_PV1_EPOCH,
BadPrefix, Chunk1),
BadPrefix, Chunk1, NoCSum),
{ok, [{_,File1}]} = ?FLU_C:list_files(Host, TcpPort, ?DUMMY_PV1_EPOCH),
Len1 = size(Chunk1),
{error, not_written} = ?FLU_C:read_chunk(Host, TcpPort,
?DUMMY_PV1_EPOCH,
File1, Off1*983829323, Len1),
{error, partial_read} = ?FLU_C:read_chunk(Host, TcpPort,
?DUMMY_PV1_EPOCH,
File1, Off1, Len1*9999),
NSInfo, ?DUMMY_PV1_EPOCH,
File1, Off1*983829323, Len1,
noopt),
%% XXX FIXME
%%
%% This is failing because the read extends past the end of the file.
%% I guess the semantic here is that we should consider any read which
%% *starts* at a valid offset to be a partial read, even if the length
%% of the read will cause it to fail.
%%
%% {error, partial_read} = ?FLU_C:read_chunk(Host, TcpPort,
%% NSInfo, ?DUMMY_PV1_EPOCH,
%% File1, Off1, Len1*9999),
{ok, {Off1b,Len1b,File1b}} = ?FLU_C:append_chunk(Host, TcpPort,
{ok, {Off1b,Len1b,File1b}} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo,
?DUMMY_PV1_EPOCH,
Prefix, Chunk1),
Prefix, Chunk1,NoCSum),
Extra = 42,
{ok, {Off1c,Len1c,File1c}} = ?FLU_C:append_chunk_extra(Host, TcpPort,
Opts1 = #append_opts{chunk_extra=Extra},
{ok, {Off1c,Len1c,File1c}} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo,
?DUMMY_PV1_EPOCH,
Prefix, Chunk1, Extra),
Prefix, Chunk1, NoCSum,
Opts1, infinity),
{ok, {Off1d,Len1d,File1d}} = ?FLU_C:append_chunk(Host, TcpPort,
NSInfo,
?DUMMY_PV1_EPOCH,
Prefix, Chunk1),
Prefix, Chunk1,NoCSum),
if File1b == File1c, File1c == File1d ->
true = (Off1c == Off1b + Len1b),
true = (Off1d == Off1c + Len1c + Extra);
@ -119,27 +156,44 @@ flu_smoke_test() ->
exit(not_mandatory_but_test_expected_same_file_fixme)
end,
Chunk1_cs = {<<?CSUM_TAG_NONE:8, 0:(8*20)>>, Chunk1},
{ok, {Off1e,Len1e,File1e}} = ?FLU_C:append_chunk(Host, TcpPort,
?DUMMY_PV1_EPOCH,
Prefix, Chunk1_cs),
Chunk2 = <<"yo yo">>,
Len2 = byte_size(Chunk2),
Off2 = ?MINIMUM_OFFSET + 77,
File2 = "smoke-whole-file",
ok = ?FLU_C:write_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH,
File2, Off2, Chunk2),
{error, bad_arg} = ?FLU_C:write_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH,
BadFile, Off2, Chunk2),
{ok, Chunk2} = ?FLU_C:read_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH,
File2, Off2, Len2),
{error, not_written} = ?FLU_C:read_chunk(Host, TcpPort,
?DUMMY_PV1_EPOCH,
"no!!", Off2, Len2),
File2 = "smoke-whole-file^^0^1^1",
ok = ?FLU_C:write_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH,
File2, Off2, Chunk2, NoCSum),
{error, bad_arg} = ?FLU_C:write_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH,
BadFile, Off2, Chunk2, NoCSum),
{ok, {[{_, Off2, Chunk2, _}], _}} =
?FLU_C:read_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH, File2, Off2, Len2, noopt),
{error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort,
?DUMMY_PV1_EPOCH,
BadFile, Off2, Len2),
NSInfo, ?DUMMY_PV1_EPOCH,
"no!!", Off2, Len2, noopt),
{error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort,
NSInfo, ?DUMMY_PV1_EPOCH,
BadFile, Off2, Len2, noopt),
%% Make a connected socket.
Sock1 = ?FLU_C:connect(#p_srvr{address=Host, port=TcpPort}),
%% Let's test some cluster version enforcement.
Good_EpochNum = 0,
Good_NSVersion = 0,
Good_NS = <<>>,
{ok, {false, {Good_EpochNum,_}, Good_NSVersion, GoodNS}} =
?FLU_C:wedge_status(Sock1),
NS_good = #ns_info{version=Good_NSVersion, name=Good_NS},
{ok, {[{_, Off2, Chunk2, _}], _}} =
?FLU_C:read_chunk(Sock1, NS_good, ?DUMMY_PV1_EPOCH,
File2, Off2, Len2, noopt),
NS_bad_version = #ns_info{version=1, name=Good_NS},
NS_bad_name = #ns_info{version=Good_NSVersion, name= <<"foons">>},
{error, bad_epoch} =
?FLU_C:read_chunk(Sock1, NS_bad_version, ?DUMMY_PV1_EPOCH,
File2, Off2, Len2, noopt),
{error, bad_arg} =
?FLU_C:read_chunk(Sock1, NS_bad_name, ?DUMMY_PV1_EPOCH,
File2, Off2, Len2, noopt),
%% We know that File1 still exists. Pretend that we've done a
%% migration and exercise the delete_migration() API.
@ -156,62 +210,107 @@ flu_smoke_test() ->
{error, bad_arg} = ?FLU_C:trunc_hack(Host, TcpPort,
?DUMMY_PV1_EPOCH, BadFile),
ok = ?FLU_C:quit(?FLU_C:connect(#p_srvr{address=Host,
port=TcpPort}))
ok = ?FLU_C:quit(Sock1)
after
ok = ?FLU:stop(FLU1)
machi_test_util:stop_flu_package()
end.
flu_projection_smoke_test() ->
Host = "localhost",
TcpPort = 32959,
DataDir = "./data",
FLU1 = setup_test_flu(projection_test_flu, TcpPort, DataDir),
TcpPort = 12959,
DataDir = "./data.projst",
{_,_,_} = machi_test_util:start_flu_package(projection_test_flu, TcpPort, DataDir),
try
[begin
{ok, {0,_}} = ?FLU_C:get_latest_epochid(Host, TcpPort, T),
{error, not_written} =
?FLU_C:read_latest_projection(Host, TcpPort, T),
{ok, []} = ?FLU_C:list_all_projections(Host, TcpPort, T),
{ok, []} = ?FLU_C:get_all_projections(Host, TcpPort, T),
P_a = #p_srvr{name=a, address="localhost", port=4321},
P1 = machi_projection:new(1, a, [P_a], [], [a], [], []),
ok = ?FLU_C:write_projection(Host, TcpPort, T, P1),
{error, written} = ?FLU_C:write_projection(Host, TcpPort, T, P1),
{ok, P1} = ?FLU_C:read_projection(Host, TcpPort, T, 1),
{ok, {1,_}} = ?FLU_C:get_latest_epochid(Host, TcpPort, T),
{ok, P1} = ?FLU_C:read_latest_projection(Host, TcpPort, T),
{ok, [1]} = ?FLU_C:list_all_projections(Host, TcpPort, T),
{ok, [P1]} = ?FLU_C:get_all_projections(Host, TcpPort, T),
{error, not_written} = ?FLU_C:read_projection(Host, TcpPort, T, 2)
end || T <- [public, private] ]
[ok = flu_projection_common(Host, TcpPort, T) ||
T <- [public, private] ]
%% , {ok, {false, EpochID1,_,_}} = ?FLU_C:wedge_status(Host, TcpPort),
%% io:format(user, "EpochID1 ~p\n", [EpochID1])
after
ok = ?FLU:stop(FLU1)
machi_test_util:stop_flu_package()
end.
flu_projection_common(Host, TcpPort, T) ->
{ok, {0,_}} = ?FLU_C:get_latest_epochid(Host, TcpPort, T),
{ok, #projection_v1{epoch_number=0}} =
?FLU_C:read_latest_projection(Host, TcpPort, T),
{ok, [0]} = ?FLU_C:list_all_projections(Host, TcpPort, T),
{ok, [#projection_v1{epoch_number=0}]} =
?FLU_C:get_all_projections(Host, TcpPort, T),
P_a = #p_srvr{name=a, address="localhost", port=4321},
P1 = machi_projection:new(1, a, [P_a], [], [a], [], []),
ok = ?FLU_C:write_projection(Host, TcpPort, T, P1),
case ?FLU_C:write_projection(Host, TcpPort, T, P1) of
{error, written} when T == public -> ok;
ok when T == private -> ok
end,
{ok, P1} = ?FLU_C:read_projection(Host, TcpPort, T, 1),
{ok, {1,_}} = ?FLU_C:get_latest_epochid(Host, TcpPort, T),
{ok, P1} = ?FLU_C:read_latest_projection(Host, TcpPort, T),
{ok, [0,1]} = ?FLU_C:list_all_projections(Host, TcpPort, T),
{ok, [_,P1]} = ?FLU_C:get_all_projections(Host, TcpPort, T),
{error, not_written} = ?FLU_C:read_projection(Host, TcpPort, T, 2),
ok.
bad_checksum_test() ->
Host = "localhost",
TcpPort = 32960,
DataDir = "./data",
TcpPort = 12960,
DataDir = "./data.bct",
Opts = [{initial_wedged, false}],
FLU1 = setup_test_flu(projection_test_flu, TcpPort, DataDir, Opts),
{_,_,_} = machi_test_util:start_flu_package(projection_test_flu, TcpPort, DataDir, Opts),
NSInfo = undefined,
try
Prefix = <<"some prefix">>,
Chunk1 = <<"yo yo yo">>,
Chunk1_badcs = {<<?CSUM_TAG_CLIENT_SHA:8, 0:(8*20)>>, Chunk1},
{error, bad_checksum} = ?FLU_C:append_chunk(Host, TcpPort,
BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, ".................")},
{error, bad_checksum} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo,
?DUMMY_PV1_EPOCH,
Prefix, Chunk1_badcs),
{error, bad_checksum} = ?FLU_C:write_chunk(Host, TcpPort,
?DUMMY_PV1_EPOCH,
<<"foo-file">>, 99832,
Chunk1_badcs),
Prefix,
Chunk1, BadCSum),
ok
after
ok = ?FLU:stop(FLU1)
machi_test_util:stop_flu_package()
end.
witness_test() ->
Host = "localhost",
TcpPort = 12961,
DataDir = "./data.witness",
Opts = [{initial_wedged, false}, {witness_mode, true}],
{_,_,_} = machi_test_util:start_flu_package(projection_test_flu, TcpPort, DataDir, Opts),
NSInfo = undefined,
NoCSum = <<>>,
try
Prefix = <<"some prefix">>,
Chunk1 = <<"yo yo yo">>,
%% All of the projection commands are ok.
[ok = flu_projection_common(Host, TcpPort, T) ||
T <- [public, private] ],
%% Projection has moved beyond initial 0, so get the current EpochID
{ok, EpochID1} = ?FLU_C:get_latest_epochid(Host, TcpPort, private),
%% Witness-protected ops all fail
{error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo, EpochID1,
Prefix, Chunk1, NoCSum),
File = <<"foofile">>,
{error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort, NSInfo, EpochID1,
File, 9999, 9999, noopt),
{error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, File),
{error, bad_arg} = ?FLU_C:list_files(Host, TcpPort, EpochID1),
{ok, {false, EpochID1,_,_}} = ?FLU_C:wedge_status(Host, TcpPort),
{ok, _} = ?FLU_C:get_latest_epochid(Host, TcpPort, public),
{ok, _} = ?FLU_C:read_latest_projection(Host, TcpPort, public),
{error, not_written} = ?FLU_C:read_projection(Host, TcpPort,
public, 99999),
%% write_projection already tested by flu_projection_common
{ok, _} = ?FLU_C:get_all_projections(Host, TcpPort, public),
{ok, _} = ?FLU_C:list_all_projections(Host, TcpPort, public),
ok
after
machi_test_util:stop_flu_package()
end.
%% The purpose of timing_pb_encoding_test_ and timing_bif_encoding_test_ is
@ -249,7 +348,7 @@ timing_pb_encoding_test2() ->
RUN2 = timer:tc(fun() -> begin [_ = DoIt2() || _ <- XX], ok end end),
erlang:garbage_collect(),
Factor = (element(1, RUN1) / element(1, RUN2)),
io:format(user, " speed factor=~.2f ", [Factor]),
io:format(" speed factor=~.2f ", [Factor]),
ok.
-endif. % !PULSE

View file

@ -38,12 +38,12 @@ smoke_test_() ->
{timeout, 5*60, fun() -> smoke_test2() end}.
smoke_test2() ->
Ps = [{a,#p_srvr{name=a, address="localhost", port=5555, props="./data.a"}},
{b,#p_srvr{name=b, address="localhost", port=5556, props="./data.b"}},
{c,#p_srvr{name=c, address="localhost", port=5557, props="./data.c"}}
Ps = [{a,#p_srvr{name=a, address="localhost", port=5550, props="./data.a"}},
{b,#p_srvr{name=b, address="localhost", port=5551, props="./data.b"}},
{c,#p_srvr{name=c, address="localhost", port=5552, props="./data.c"}}
],
[os:cmd("rm -rf " ++ P#p_srvr.props) || {_,P} <- Ps],
{ok, SupPid} = machi_flu_sup:start_link(),
{ok, SupPid} = machi_sup:start_link(),
try
%% Only run a, don't run b & c so we have 100% failures talking to them
[begin
@ -51,9 +51,9 @@ smoke_test2() ->
{ok, _} = machi_flu_psup:start_flu_package(Name, Port, Dir, [])
end || {_,P} <- [hd(Ps)]],
[machi_chain_manager1:test_react_to_env(a_chmgr) || _ <-lists:seq(1,5)],
[machi_chain_manager1:trigger_react_to_env(a_chmgr) || _ <-lists:seq(1,5)],
machi_chain_manager1:set_chain_members(a_chmgr, orddict:from_list(Ps)),
[machi_chain_manager1:test_react_to_env(a_chmgr) || _ <-lists:seq(1,5)],
[machi_chain_manager1:trigger_react_to_env(a_chmgr) || _ <-lists:seq(1,5)],
ok
after
exit(SupPid, normal),
@ -66,16 +66,16 @@ partial_stop_restart_test_() ->
{timeout, 5*60, fun() -> partial_stop_restart2() end}.
partial_stop_restart2() ->
Ps = [{a,#p_srvr{name=a, address="localhost", port=5555, props="./data.a"}},
{b,#p_srvr{name=b, address="localhost", port=5556, props="./data.b"}},
{c,#p_srvr{name=c, address="localhost", port=5557, props="./data.c"}}
Ps = [{a,#p_srvr{name=a, address="localhost", port=5560, props="./data.a"}},
{b,#p_srvr{name=b, address="localhost", port=5561, props="./data.b"}},
{c,#p_srvr{name=c, address="localhost", port=5562, props="./data.c"}}
],
ChMgrs = [machi_flu_psup:make_mgr_supname(P#p_srvr.name) || {_,P} <-Ps],
PStores = [machi_flu_psup:make_proj_supname(P#p_srvr.name) || {_,P} <-Ps],
Dict = orddict:from_list(Ps),
[os:cmd("rm -rf " ++ P#p_srvr.props) || {_,P} <- Ps],
{ok, SupPid} = machi_flu_sup:start_link(),
DbgProps = [{dbg, [{initial_wedged, true}]}],
{ok, SupPid} = machi_sup:start_link(),
DbgProps = [{initial_wedged, true}],
Start = fun({_,P}) ->
#p_srvr{name=Name, port=Port, props=Dir} = P,
{ok, _} = machi_flu_psup:start_flu_package(
@ -84,32 +84,36 @@ partial_stop_restart2() ->
WedgeStatus = fun({_,#p_srvr{address=Addr, port=TcpPort}}) ->
machi_flu1_client:wedge_status(Addr, TcpPort)
end,
NSInfo = undefined,
Append = fun({_,#p_srvr{address=Addr, port=TcpPort}}, EpochID) ->
NoCSum = <<>>,
machi_flu1_client:append_chunk(Addr, TcpPort,
EpochID,
<<"prefix">>, <<"data">>)
NSInfo, EpochID,
<<"prefix">>,
<<"data">>, NoCSum)
end,
try
[Start(P) || P <- Ps],
[{ok, {true, _}} = WedgeStatus(P) || P <- Ps], % all are wedged
[{ok, {true, _,_,_}} = WedgeStatus(P) || P <- Ps], % all are wedged
[{error,wedged} = Append(P, ?DUMMY_PV1_EPOCH) || P <- Ps], % all are wedged
[machi_chain_manager1:set_chain_members(ChMgr, Dict) ||
ChMgr <- ChMgrs ],
{ok, {false, EpochID1}} = WedgeStatus(hd(Ps)),
[{ok, {false, EpochID1}} = WedgeStatus(P) || P <- Ps], % *not* wedged
{ok, {false, EpochID1,_,_}} = WedgeStatus(hd(Ps)),
[{ok, {false, EpochID1,_,_}} = WedgeStatus(P) || P <- Ps], % *not* wedged
[{ok,_} = Append(P, EpochID1) || P <- Ps], % *not* wedged
{ok, {_,_,File1}} = Append(hd(Ps), EpochID1),
{_,_,_} = machi_chain_manager1:test_react_to_env(hd(ChMgrs)),
{_,_,_} = machi_chain_manager1:trigger_react_to_env(hd(ChMgrs)),
[begin
_QQa = machi_chain_manager1:test_react_to_env(ChMgr)
_QQa = machi_chain_manager1:trigger_react_to_env(ChMgr)
end || _ <- lists:seq(1,125), ChMgr <- ChMgrs],
%% All chain managers & projection stores should be using the
%% same projection which is max projection in each store.
{no_change,_,Epoch_m} = machi_chain_manager1:test_react_to_env(
{no_change,_,Epoch_m} = machi_chain_manager1:trigger_react_to_env(
hd(ChMgrs)),
[{no_change,_,Epoch_m} = machi_chain_manager1:test_react_to_env(
[{no_change,_,Epoch_m} = machi_chain_manager1:trigger_react_to_env(
ChMgr )|| ChMgr <- ChMgrs],
{ok, Proj_m} = machi_projection_store:read_latest_projection(
hd(PStores), public),
@ -122,10 +126,10 @@ partial_stop_restart2() ->
Epoch_m = Proj_m#projection_v1.epoch_number,
%% Confirm that all FLUs are *not* wedged, with correct proj & epoch
Proj_mCSum = Proj_m#projection_v1.epoch_csum,
[{ok, {false, {Epoch_m, Proj_mCSum}}} = WedgeStatus(P) || % *not* wedged
[{ok, {false, {Epoch_m, Proj_mCSum},_,_}} = WedgeStatus(P) || % *not* wedged
P <- Ps],
{ok, {false, EpochID2}} = WedgeStatus(hd(Ps)),
[{ok,_} = Append(P, EpochID2) || P <- Ps], % *not* wedged
{ok, {false, EpochID1,_,_}} = WedgeStatus(hd(Ps)),
[{ok,_} = Append(P, EpochID1) || P <- Ps], % *not* wedged
%% Stop all but 'a'.
[ok = machi_flu_psup:stop_flu_package(Name) || {Name,_} <- tl(Ps)],
@ -134,28 +138,32 @@ partial_stop_restart2() ->
{FluName_a, _} = hd(Ps),
ok = machi_flu_psup:stop_flu_package(FluName_a),
{ok, _} = Start(hd(Ps)),
timer:sleep(123), % TODO fix server socket available race condition in better way
%% Remember: 'a' is not in active mode.
{ok, Proj_m3} = machi_projection_store:read_latest_projection(
hd(PStores), private),
true = (machi_projection:update_dbg2(Proj_m, []) ==
machi_projection:update_dbg2(Proj_m, [])),
machi_projection:update_dbg2(Proj_m3, [])),
%% Confirm that 'a' is wedged
{error, wedged} = Append(hd(Ps), EpochID2),
{error, wedged} = Append(hd(Ps), EpochID1),
{_, #p_srvr{address=Addr_a, port=TcpPort_a}} = hd(Ps),
{error, wedged} = machi_flu1_client:read_chunk(
Addr_a, TcpPort_a, ?DUMMY_PV1_EPOCH,
<<>>, 99999999, 1),
{error, wedged} = machi_flu1_client:checksum_list(
Addr_a, TcpPort_a, ?DUMMY_PV1_EPOCH, <<>>),
Addr_a, TcpPort_a, NSInfo, ?DUMMY_PV1_EPOCH,
<<>>, 99999999, 1, undefined),
{error, bad_arg} = machi_flu1_client:checksum_list(
Addr_a, TcpPort_a, <<>>),
%% list_files() is permitted despite wedged status
{ok, _} = machi_flu1_client:list_files(
Addr_a, TcpPort_a, ?DUMMY_PV1_EPOCH),
%% Iterate through humming consensus once
{now_using,_,Epoch_n} = machi_chain_manager1:test_react_to_env(
{now_using,_,Epoch_n} = machi_chain_manager1:trigger_react_to_env(
hd(ChMgrs)),
true = (Epoch_n > Epoch_m),
{ok, {false, EpochID3}} = WedgeStatus(hd(Ps)),
{ok, {false, EpochID3,_,_}} = WedgeStatus(hd(Ps)),
%% The file we're assigned should be different with the epoch change.
{ok, {_,_,File3}} = Append(hd(Ps), EpochID3),
true = (File1 /= File3),
%% Confirm that 'a' is *not* wedged
{ok, _} = Append(hd(Ps), EpochID3),
@ -168,6 +176,19 @@ partial_stop_restart2() ->
ok
end.
p_srvr_rec_test() ->
P = #p_srvr{name=a, address="localhost", port=1024, props=[yo]},
[P] = machi_flu_sup:sanitize_p_srvr_records([P]),
[P] = machi_flu_sup:sanitize_p_srvr_records([P,P]),
[] = machi_flu_sup:sanitize_p_srvr_records([nope]),
[] = machi_flu_sup:sanitize_p_srvr_records([#p_srvr{proto_mod=does_not_exist}]),
[] = machi_flu_sup:sanitize_p_srvr_records([#p_srvr{proto_mod="lists"}]),
[] = machi_flu_sup:sanitize_p_srvr_records([#p_srvr{address=7}]),
[] = machi_flu_sup:sanitize_p_srvr_records([#p_srvr{port=5}]),
[] = machi_flu_sup:sanitize_p_srvr_records([#p_srvr{port=foo}]),
[] = machi_flu_sup:sanitize_p_srvr_records([#p_srvr{props=foo}]),
ok.
-endif. % !PULSE
-endif. % TEST

View file

@ -0,0 +1,307 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2014 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(machi_lifecycle_mgr_test).
-compile(export_all).
-ifdef(TEST).
-ifndef(PULSE).
-include_lib("eunit/include/eunit.hrl").
-include("machi.hrl").
-include("machi_projection.hrl").
-define(MGR, machi_chain_manager1).
setup() ->
catch application:stop(machi),
{ok, SupPid} = machi_sup:start_link(),
error_logger:tty(false),
Dir = "./" ++ atom_to_list(?MODULE) ++ ".datadir",
machi_flu1_test:clean_up_data_dir(Dir ++ "/*/*"),
machi_flu1_test:clean_up_data_dir(Dir),
Envs = [{flu_data_dir, Dir ++ "/data/flu"},
{flu_config_dir, Dir ++ "/etc/flu-config"},
{chain_config_dir, Dir ++ "/etc/chain-config"},
{platform_data_dir, Dir ++ "/data"},
{platform_etc_dir, Dir ++ "/etc"},
{not_used_pending, Dir ++ "/etc/pending"}
],
EnvKeys = [K || {K,_V} <- Envs],
undefined = application:get_env(machi, yo),
Cleanup = machi_flu1_test:get_env_vars(machi, EnvKeys ++ [yo]),
[begin
filelib:ensure_dir(V ++ "/unused"),
application:set_env(machi, K, V)
end || {K, V} <- Envs],
{SupPid, Dir, Cleanup}.
cleanup({SupPid, Dir, Cleanup}) ->
exit(SupPid, normal),
machi_util:wait_for_death(SupPid, 100),
error_logger:tty(true),
catch application:stop(machi),
machi_flu1_test:clean_up_data_dir(Dir ++ "/*/*"),
machi_flu1_test:clean_up_data_dir(Dir),
machi_flu1_test:clean_up_env_vars(Cleanup),
undefined = application:get_env(machi, yo),
ok.
smoke_test_() ->
{timeout, 60, fun() -> smoke_test2() end}.
smoke_test2() ->
YoCleanup = setup(),
try
Prefix = <<"pre">>,
Chunk1 = <<"yochunk">>,
Host = "localhost",
PortBase = 60120,
Pa = #p_srvr{name=a,address="localhost",port=PortBase+0},
Pb = #p_srvr{name=b,address="localhost",port=PortBase+1},
Pc = #p_srvr{name=c,address="localhost",port=PortBase+2},
%% Pstore_a = machi_flu1:make_projection_server_regname(a),
%% Pstore_b = machi_flu1:make_projection_server_regname(b),
%% Pstore_c = machi_flu1:make_projection_server_regname(c),
Pstores = [Pstore_a, Pstore_b, Pstore_c] =
[machi_flu1:make_projection_server_regname(a),
machi_flu1:make_projection_server_regname(b),
machi_flu1:make_projection_server_regname(c)],
ChMgrs = [ChMgr_a, ChMgr_b, ChMgr_c] =
[machi_chain_manager1:make_chmgr_regname(a),
machi_chain_manager1:make_chmgr_regname(b),
machi_chain_manager1:make_chmgr_regname(c)],
Fits = [Fit_a, Fit_b, Fit_c] =
[machi_flu_psup:make_fitness_regname(a),
machi_flu_psup:make_fitness_regname(b),
machi_flu_psup:make_fitness_regname(c)],
Advance = machi_chain_manager1_test:make_advance_fun(
Fits, [a,b,c], ChMgrs, 3),
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("\nSTEP: Start 3 FLUs, no chain.\n", []),
[machi_lifecycle_mgr:make_pending_config(P) || P <- [Pa,Pb,Pc] ],
{[_,_,_],[]} = machi_lifecycle_mgr:process_pending(),
[{ok, #projection_v1{epoch_number=0}} =
machi_projection_store:read_latest_projection(PSTORE, private)
|| PSTORE <- Pstores],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("\nSTEP: Start chain = [a,b,c]\n", []),
C1 = #chain_def_v1{name=cx, mode=ap_mode, full=[Pa,Pb,Pc],
local_run=[a,b,c]},
machi_lifecycle_mgr:make_pending_config(C1),
{[],[_]} = machi_lifecycle_mgr:process_pending(),
Advance(),
[{ok, #projection_v1{all_members=[a,b,c]}} =
machi_projection_store:read_latest_projection(PSTORE, private)
|| PSTORE <- Pstores],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("\nSTEP: Reset chain = [b,c]\n", []),
C2 = #chain_def_v1{name=cx, mode=ap_mode, full=[Pb,Pc],
old_full=[a,b,c], old_witnesses=[],
local_stop=[a], local_run=[b,c]},
machi_lifecycle_mgr:make_pending_config(C2),
{[],[_]} = machi_lifecycle_mgr:process_pending(),
Advance(),
%% a should be down
{'EXIT', _} = (catch machi_projection_store:read_latest_projection(
hd(Pstores), private)),
[{ok, #projection_v1{all_members=[b,c]}} =
machi_projection_store:read_latest_projection(PSTORE, private)
|| PSTORE <- tl(Pstores)],
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
io:format("\nSTEP: Reset chain = []\n", []),
C3 = #chain_def_v1{name=cx, mode=ap_mode, full=[],
old_full=[b,c], old_witnesses=[],
local_stop=[b,c], local_run=[]},
machi_lifecycle_mgr:make_pending_config(C3),
{[],[_]} = machi_lifecycle_mgr:process_pending(),
Advance(),
%% a,b,c should be down
[{'EXIT', _} = (catch machi_projection_store:read_latest_projection(
PSTORE, private))
|| PSTORE <- Pstores],
ok
after
cleanup(YoCleanup)
end.
ast_tuple_syntax_test() ->
T = fun(L) -> machi_lifecycle_mgr:check_ast_tuple_syntax(L) end,
Canon1 = [ {host, "localhost", []},
{host, "localhost", [{client_interface, "1.2.3.4"},
{admin_interface, "5.6.7.8"}]},
{flu, 'fx', "foohost", 4000, []},
switch_old_and_new,
{chain, 'cy', ['fx', 'fy'], [{foo,"yay"},{bar,baz}]} ],
{_Good,[]=_Bad} = T(Canon1),
Canon1_norm = machi_lifecycle_mgr:normalize_ast_tuple_syntax(Canon1),
true = (length(Canon1) == length(Canon1_norm)),
{Canon1_norm_b, []} = T(Canon1_norm),
true = (length(Canon1_norm) == length(Canon1_norm_b)),
{[],[_,_,_,_]} =
T([ {host, 'localhost', []},
{host, 'localhost', yo},
{host, "localhost", [{client_interface, 77.88293829832}]},
{host, "localhost", [{client_interface, "1.2.3.4"},
{bummer, "5.6.7.8"}]} ]),
{[],[_,_,_,_,_,_]} =
T([ {flu, 'fx', 'foohost', 4000, []},
{flu, 'fx', <<"foohost">>, 4000, []},
{flu, 'fx', "foohost", -4000, []},
{flu, 'fx', "foohost", 40009999, []},
{flu, 'fx', "foohost", 4000, gack},
{flu, 'fx', "foohost", 4000, [22]} ]),
{[],[_,_,_]} =
T([ {chain, 'cy', ["fx", "fy"], [foo,{bar,baz}]},
yoloyolo,
{chain, "cy", ["fx", 27], oops,arity,way,way,way,too,big,x}
]).
ast_run_test() ->
PortBase = 20300,
R1 = [
{host, "localhost", "localhost", "localhost", []},
{flu, 'f0', "localhost", PortBase+0, []},
{flu, 'f1', "localhost", PortBase+1, []},
{chain, 'ca', ['f0'], []},
{chain, 'cb', ['f1'], []},
switch_old_and_new,
{flu, 'f2', "localhost", PortBase+2, []},
{flu, 'f3', "localhost", PortBase+3, []},
{flu, 'f4', "localhost", PortBase+4, []},
{chain, 'ca', ['f0', 'f2'], []},
{chain, 'cc', ['f3', 'f4'], []}
],
{ok, Env1} = machi_lifecycle_mgr:run_ast(R1),
%% Uncomment to examine the Env trees.
%% Y1 = {lists:sort(gb_trees:to_list(element(1, Env1))),
%% lists:sort(gb_trees:to_list(element(2, Env1))),
%% element(3, Env1)},
%% io:format(user, "\nY1 ~p\n", [Y1]),
Negative_after_R1 =
[
{host, "localhost", "foo", "foo", []}, % dupe host
{flu, 'f1', "other", PortBase+9999999, []}, % bogus port # (syntax)
{flu, 'f1', "other", PortBase+888, []}, % dupe flu name
{flu, 'f7', "localhost", PortBase+1, []}, % dupe host+port
{chain, 'ca', ['f7'], []}, % unknown flu
{chain, 'cc', ['f0'], []}, % flu previously assigned
{chain, 'ca', cp_mode, ['f0', 'f1', 'f2'], [], []} % mode change
],
[begin
%% io:format(user, "dbg: Neg ~p\n", [Neg]),
{error, _} = machi_lifecycle_mgr:run_ast(R1 ++ [Neg])
end || Neg <- Negative_after_R1],
%% The 'run' phase doesn't blow smoke. What about 'diff'?
{X1a, X1b} = machi_lifecycle_mgr:diff_env(Env1, "localhost"),
%% There's only one host, "localhost", so 'all' should be exactly equal.
{X1a, X1b} = machi_lifecycle_mgr:diff_env(Env1, all),
%% io:format(user, "X1b: ~p\n", [X1b]),
%% Append to the R1 scenario: for chain cc: add f5, remove f4
%% Expect: see pattern matching below on X2b.
R2 = (R1 -- [switch_old_and_new]) ++
[switch_old_and_new,
{flu, 'f5', "localhost", PortBase+5, []},
{chain, 'cc', ['f3','f5'], []}],
{ok, Env2} = machi_lifecycle_mgr:run_ast(R2),
{_X2a, X2b} = machi_lifecycle_mgr:diff_env(Env2, "localhost"),
%% io:format(user, "X2b: ~p\n", [X2b]),
F5_port = PortBase+5,
[#p_srvr{name='f5',address="localhost",port=F5_port},
#chain_def_v1{name='cc',
full=[#p_srvr{name='f3'},#p_srvr{name='f5'}], witnesses=[],
old_full=[f3,f4], old_witnesses=[],
local_run=[f5], local_stop=[f4]}] = X2b,
ok.
ast_then_apply_test_() ->
{timeout, 60, fun() -> ast_then_apply_test2() end}.
ast_then_apply_test2() ->
YoCleanup = setup(),
try
PortBase = 20400,
NumChains = 4,
ChainLen = 3,
FLU_num = NumChains * ChainLen,
FLU_defs = [{flu, list_to_atom("f"++integer_to_list(X)),
"localhost", PortBase+X, []} || X <- lists:seq(1,FLU_num)],
FLU_names = [FLU || {flu,FLU,_,_,_} <- FLU_defs],
Ch_defs = [{chain, list_to_atom("c"++integer_to_list(X)),
lists:sublist(FLU_names, X, 3),
[]} || X <- lists:seq(1, FLU_num, 3)],
R1 = [switch_old_and_new,
{host, "localhost", "localhost", "localhost", []}]
++ FLU_defs ++ Ch_defs,
{ok, Env1} = machi_lifecycle_mgr:run_ast(R1),
{_X1a, X1b} = machi_lifecycle_mgr:diff_env(Env1, "localhost"),
%% io:format(user, "X1b ~p\n", [X1b]),
[machi_lifecycle_mgr:make_pending_config(X) || X <- X1b],
{PassFLUs, PassChains} = machi_lifecycle_mgr:process_pending(),
true = (length(PassFLUs) == length(FLU_defs)),
true = (length(PassChains) == length(Ch_defs)),
%% Kick the chain managers into doing something useful right now.
Pstores = [list_to_atom(atom_to_list(X) ++ "_pstore") || X <- FLU_names],
Fits = [list_to_atom(atom_to_list(X) ++ "_fitness") || X <- FLU_names],
ChMgrs = [list_to_atom(atom_to_list(X) ++ "_chmgr") || X <- FLU_names],
Advance = machi_chain_manager1_test:make_advance_fun(
Fits, FLU_names, ChMgrs, 3),
Advance(),
%% Sanity check: everyone is configured properly.
[begin
{ok, #projection_v1{epoch_number=Epoch, all_members=All,
chain_name=ChainName, upi=UPI}} =
machi_projection_store:read_latest_projection(PStore, private),
%% io:format(user, "~p: epoch ~p all ~p\n", [PStore, Epoch, All]),
true = Epoch > 0,
ChainLen = length(All),
true = (length(UPI) > 0),
{chain, _, Full, []} = lists:keyfind(ChainName, 2, Ch_defs),
true = lists:sort(Full) == lists:sort(All)
end || PStore <- Pstores],
ok
after
cleanup(YoCleanup)
end.
-endif. % !PULSE
-endif. % TEST

View file

@ -0,0 +1,200 @@
%% -------------------------------------------------------------------
%%
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(machi_merkle_tree_test).
-compile([export_all]).
-include("machi_merkle_tree.hrl").
-include_lib("eunit/include/eunit.hrl").
-include_lib("kernel/include/file.hrl").
-define(GAP_CHANCE, 0.10).
%% unit tests
basic_test() ->
random:seed(os:timestamp()),
Fsz = choose_size() * 1024,
Filesize = max(Fsz, 10*1024*1024),
ChunkSize = max(1048576, Filesize div 100),
N = make_leaf_nodes(Filesize),
D0 = #naive{ leaves = N, chunk_size = ChunkSize, recalc = true },
T1 = machi_merkle_tree:build_tree(D0),
D1 = #naive{ leaves = tl(N), chunk_size = ChunkSize, recalc = true },
T2 = machi_merkle_tree:build_tree(D1),
?assertNotEqual(T1#naive.root, T2#naive.root),
?assertEqual(true, length(machi_merkle_tree:naive_diff(T1, T2)) == 1
orelse
Filesize > ChunkSize).
make_leaf_nodes(Filesize) ->
lists:reverse(
lists:foldl(fun(T, Acc) -> machi_merkle_tree:update_acc(T, Acc) end,
[],
generate_offsets(Filesize, 1024, []))
).
choose_int(Factor) ->
random:uniform(1024*Factor).
small_int() ->
choose_int(10).
medium_int() ->
choose_int(1024).
large_int() ->
choose_int(4096).
generate_offsets(Filesize, Current, Acc) when Current < Filesize ->
Length0 = choose_size(),
Length = case Length0 + Current > Filesize of
false -> Length0;
true -> Filesize - Current
end,
Data = term_to_binary(os:timestamp()),
Checksum = machi_util:make_tagged_csum(client_sha, machi_util:checksum_chunk(Data)),
Gap = maybe_gap(random:uniform()),
generate_offsets(Filesize, Current + Length + Gap, [ {Current, Length, Checksum} | Acc ]);
generate_offsets(_Filesize, _Current, Acc) ->
lists:reverse(Acc).
random_from_list(L) ->
N = random:uniform(length(L)),
lists:nth(N, L).
choose_size() ->
F = random_from_list([fun small_int/0, fun medium_int/0, fun large_int/0]),
F().
maybe_gap(Chance) when Chance < ?GAP_CHANCE ->
choose_size();
maybe_gap(_) -> 0.
%% Define or remove these ifdefs if benchmarking is desired.
-ifdef(BENCH).
generate_offsets(FH, Filesize, Current, Acc) when Current < Filesize ->
Length0 = choose_size(),
Length = case Length0 + Current > Filesize of
false -> Length0;
true -> Filesize - Current
end,
{ok, Data} = file:pread(FH, Current, Length),
Checksum = machi_util:make_tagged_csum(client_sha, machi_util:checksum_chunk(Data)),
Gap = maybe_gap(random:uniform()),
generate_offsets(FH, Filesize, Current + Length + Gap, [ {Current, Length, Checksum} | Acc ]);
generate_offsets(_FH, _Filesize, _Current, Acc) ->
lists:reverse(Acc).
make_offsets_from_file(Filename) ->
{ok, Info} = file:read_file_info(Filename),
Filesize = Info#file_info.size,
{ok, FH} = file:open(Filename, [read, raw, binary]),
Offsets = generate_offsets(FH, Filesize, 1024, []),
file:close(FH),
Offsets.
choose_filename() ->
random_from_list([
"def^c5ea7511-d649-47d6-a8c3-2b619379c237^1",
"jkl^b077eff7-b2be-4773-a73f-fea4acb8a732^1",
"stu^553fa47a-157c-4fac-b10f-2252c7d8c37a^1",
"vwx^ae015d68-7689-4c9f-9677-926c6664f513^1",
"yza^4c784dc2-19bf-4ac6-91f6-58bbe5aa88e0^1"
]).
make_csum_file(DataDir, Filename, Offsets) ->
Path = machi_util:make_checksum_filename(DataDir, Filename),
filelib:ensure_dir(Path),
{ok, MC} = machi_csum_table:open(Path, []),
lists:foreach(fun({Offset, Size, Checksum}) ->
machi_csum_table:write(MC, Offset, Size, Checksum) end,
Offsets),
machi_csum_table:close(MC).
test() ->
test(100).
test(N) ->
{ok, F} = file:open("results.txt", [raw, write]),
lists:foreach(fun(X) -> format_and_store(F, run_test(X)) end, lists:seq(1, N)).
format_and_store(F, {OffsetNum, {MTime, MSize}, {NTime, NSize}}) ->
S = io_lib:format("~w\t~w\t~w\t~w\t~w\n", [OffsetNum, MTime, MSize, NTime, NSize]),
ok = file:write(F, S).
run_test(C) ->
random:seed(os:timestamp()),
OffsetFn = "test/" ++ choose_filename(),
O = make_offsets_from_file(OffsetFn),
Fn = "csum_" ++ integer_to_list(C),
make_csum_file(".", Fn, O),
Osize = length(O),
{MTime, {ok, M}} = timer:tc(fun() -> machi_merkle_tree:open(Fn, ".", merklet) end),
{NTime, {ok, N}} = timer:tc(fun() -> machi_merkle_tree:open(Fn, ".", naive) end),
?assertEqual(Fn, machi_merkle_tree:filename(M)),
?assertEqual(Fn, machi_merkle_tree:filename(N)),
MTree = machi_merkle_tree:tree(M),
MSize = byte_size(term_to_binary(MTree)),
NTree = machi_merkle_tree:tree(N),
NSize = byte_size(term_to_binary(NTree)),
?assertEqual(same, machi_merkle_tree:diff(N, N)),
?assertEqual(same, machi_merkle_tree:diff(M, M)),
{Osize, {MTime, MSize}, {NTime, NSize}}.
torture_test(C) ->
Results = [ run_torture_test() || _ <- lists:seq(1, C) ],
{ok, F} = file:open("torture_results.txt", [raw, write]),
lists:foreach(fun({MSize, MTime, NSize, NTime}) ->
file:write(F, io_lib:format("~p\t~p\t~p\t~p\n",
[MSize, MTime, NSize, NTime]))
end, Results),
ok = file:close(F).
run_torture_test() ->
{NTime, N} = timer:tc(fun() -> naive_torture() end),
MSize = byte_size(term_to_binary(M)),
NSize = byte_size(term_to_binary(N)),
{MSize, MTime, NSize, NTime}.
naive_torture() ->
N = lists:foldl(fun(T, Acc) -> machi_merkle_tree:update_acc(T, Acc) end, [], torture_generator()),
T = #naive{ leaves = lists:reverse(N), chunk_size = 10010, recalc = true },
machi_merkle_tree:build_tree(T).
torture_generator() ->
[ {O, 1, crypto:hash(sha, term_to_binary(now()))} || O <- lists:seq(1024, 1000000) ].
-endif. % BENCH

Some files were not shown because too many files have changed in this diff Show more