diff --git a/.gitignore b/.gitignore index 063a61d..3af54ff 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,7 @@ prototype/chain-manager/patch.* .eqc-info .eunit deps -dev erl_crash.dump -eqc .concrete/DEV_MODE .rebar edoc @@ -22,7 +20,6 @@ include/machi_pb.hrl # Release packaging rel/machi -rel/vars/dev*vars.config # Misc Scott cruft *.patch diff --git a/FAQ.md b/FAQ.md index ee563c9..f2e37c1 100644 --- a/FAQ.md +++ b/FAQ.md @@ -11,14 +11,14 @@ + [1 Questions about Machi in general](#n1) + [1.1 What is Machi?](#n1.1) - + [1.2 What is a Machi chain?](#n1.2) - + [1.3 What is a Machi cluster?](#n1.3) - + [1.4 What is Machi like when operating in "eventually consistent" mode?](#n1.4) - + [1.5 What is Machi like when operating in "strongly consistent" mode?](#n1.5) - + [1.6 What does Machi's API look like?](#n1.6) - + [1.7 What licensing terms are used by Machi?](#n1.7) - + [1.8 Where can I find the Machi source code and documentation? Can I contribute?](#n1.8) - + [1.9 What is Machi's expected release schedule, packaging, and operating system/OS distribution support?](#n1.9) + + [1.2 What is a Machi "cluster of clusters"?](#n1.2) + + [1.2.1 This "cluster of clusters" idea needs a better name, don't you agree?](#n1.2.1) + + [1.3 What is Machi like when operating in "eventually consistent" mode?](#n1.3) + + [1.4 What is Machi like when operating in "strongly consistent" mode?](#n1.4) + + [1.5 What does Machi's API look like?](#n1.5) + + [1.6 What licensing terms are used by Machi?](#n1.6) + + [1.7 Where can I find the Machi source code and documentation? Can I contribute?](#n1.7) + + [1.8 What is Machi's expected release schedule, packaging, and operating system/OS distribution support?](#n1.8) + [2 Questions about Machi relative to {{something else}}](#n2) + [2.1 How is Machi better than Hadoop?](#n2.1) + [2.2 How does Machi differ from HadoopFS/HDFS?](#n2.2) @@ -28,15 +28,13 @@ + [3 Machi's specifics](#n3) + [3.1 What technique is used to replicate Machi's files? 
Can other techniques be used?](#n3.1) + [3.2 Does Machi have a reliance on a coordination service such as ZooKeeper or etcd?](#n3.2) - + [3.3 Are there any presentations available about Humming Consensus](#n3.3) - + [3.4 Is it true that there's an allegory written to describe Humming Consensus?](#n3.4) - + [3.5 How is Machi tested?](#n3.5) - + [3.6 Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks](#n3.6) - + [3.7 Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device?](#n3.7) - + [3.8 What language(s) is Machi written in?](#n3.8) - + [3.9 Can Machi run on Windows? Can Machi run on 32-bit platforms?](#n3.9) - + [3.10 Does Machi use the Erlang/OTP network distribution system (aka "disterl")?](#n3.10) - + [3.11 Can I use HTTP to write/read stuff into/from Machi?](#n3.11) + + [3.3 Is it true that there's an allegory written to describe humming consensus?](#n3.3) + + [3.4 How is Machi tested?](#n3.4) + + [3.5 Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks](#n3.5) + + [3.6 Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device?](#n3.6) + + [3.7 What language(s) is Machi written in?](#n3.7) + + [3.8 Does Machi use the Erlang/OTP network distribution system (aka "disterl")?](#n3.8) + + [3.9 Can I use HTTP to write/read stuff into/from Machi?](#n3.9) @@ -46,13 +44,13 @@ ### 1.1. What is Machi? -Very briefly, Machi is a very simple append-only blob/file store. +Very briefly, Machi is a very simple append-only file store. Machi is "dumber" than many other file stores (i.e., lacking many features -found in other file stores) such as HadoopFS or a simple NFS or CIFS file +found in other file stores) such as HadoopFS or simple NFS or CIFS file server. 
-However, Machi is a distributed blob/file store, which makes it different +However, Machi is a distributed file store, which makes it different (and, in some ways, more complicated) than a simple NFS or CIFS file server. @@ -84,39 +82,45 @@ For a much longer answer, please see the [Machi high level design doc](https://github.com/basho/machi/tree/master/doc/high-level-machi.pdf). -### 1.2. What is a Machi chain? +### 1.2. What is a Machi "cluster of clusters"? -A Machi chain is a small number of machines that maintain a common set -of replicated files. A typical chain is of length 2 or 3. For -critical data that must be available despite several simultaneous -server failures, a chain length of 6 or 7 might be used. +Machi's design is based on using small, well-understood and provable +(mathematically) techniques to maintain multiple file copies without +data loss or data corruption. At its lowest level, Machi contains no +support for distribution/partitioning/sharding of files across many +servers. A typical, fully-functional Machi cluster will likely be two +or three machines. - -### 1.3. What is a Machi cluster? +However, Machi is designed to be an excellent building block for +building larger systems. A deployment of Machi "cluster of clusters" +will use the "random slicing" technique for partitioning files across +multiple Machi clusters that, as individuals, are unaware of the +larger cluster-of-clusters scheme. -A Machi cluster is a collection of Machi chains that -partitions/shards/distributes files (based on file name) across the -collection of chains. Machi uses the "random slicing" algorithm (a -variation of consistent hashing) to define the mapping of file name to -chain name. - -The cluster management service will be fully decentralized +The cluster-of-clusters management service will be fully decentralized and run as a separate software service installed on each Machi cluster. 
This manager will appear to the local Machi server as simply -another Machi file client. The cluster managers will take +another Machi file client. The cluster-of-clusters managers will take care of file migration as the cluster grows and shrinks in capacity and in response to day-to-day changes in workload. -Though the cluster manager has not yet been implemented, +Though the cluster-of-clusters manager has not yet been implemented, its design is fully decentralized and capable of operating despite -multiple partial failure of its member chains. We expect this +multiple partial failure of its member clusters. We expect this design to scale easily to at least one thousand servers. Please see the [Machi source repository's 'doc' directory for more details](https://github.com/basho/machi/tree/master/doc/). - -### 1.4. What is Machi like when operating in "eventually consistent" mode? + +#### 1.2.1. This "cluster of clusters" idea needs a better name, don't you agree? + +Yes. Please help us: we are bad at naming things. +For proof that naming things is hard, see +[http://martinfowler.com/bliki/TwoHardThings.html](http://martinfowler.com/bliki/TwoHardThings.html) + + +### 1.3. What is Machi like when operating in "eventually consistent" mode? Machi's operating mode dictates how a Machi cluster will react to network partitions. A network partition may be caused by: @@ -139,14 +143,13 @@ consistency mode during and after network partitions are: together from "all sides" of the partition(s). * Unique files are copied in their entirety. * Byte ranges within the same file are merged. This is possible - due to Machi's restrictions on file naming and file offset - assignment. Both file names and file offsets are always chosen - by Machi servers according to rules which guarantee safe - mergeability. Server-assigned names are a characteristic of a - "blob store". 
+ due to Machi's restrictions on file naming (file names are + always assigned by Machi servers) and file offset assignments + (byte offsets are also always chosen by Machi servers according + to rules which guarantee safe mergeability). - -### 1.5. What is Machi like when operating in "strongly consistent" mode? + +### 1.4. What is Machi like when operating in "strongly consistent" mode? The consistency semantics of file operations while in strongly consistency mode during and after network partitions are: @@ -164,19 +167,19 @@ consistency mode during and after network partitions are: Machi's design can provide the illusion of quorum minority write availability if the cluster is configured to operate with "witness -servers". (This feaure partially implemented, as of December 2015.) +servers". (This feature is not implemented yet, as of June 2015.) See Section 11 of [Machi chain manager high level design doc](https://github.com/basho/machi/tree/master/doc/high-level-chain-mgr.pdf) for more details. - -### 1.6. What does Machi's API look like? + +### 1.5. What does Machi's API look like? The Machi API only contains a handful of API operations. The function -arguments shown below (in simplifed form) use Erlang-style type annotations. +arguments shown below use Erlang-style type annotations. - append_chunk(Prefix:binary(), Chunk:binary(), CheckSum:binary()). - append_chunk_extra(Prefix:binary(), Chunk:binary(), CheckSum:binary(), ExtraSpace:non_neg_integer()). + append_chunk(Prefix:binary(), Chunk:binary()). + append_chunk_extra(Prefix:binary(), Chunk:binary(), ExtraSpace:non_neg_integer()). read_chunk(File:binary(), Offset:non_neg_integer(), Size:non_neg_integer()). checksum_list(File:binary()). @@ -201,15 +204,15 @@ level" internal protocol are in a [Protocol Buffers](https://developers.google.com/protocol-buffers/docs/overview) definition at [./src/machi.proto](./src/machi.proto). - -### 1.7. What licensing terms are used by Machi? + +### 1.6. 
What licensing terms are used by Machi? All Machi source code and documentation is licensed by [Basho Technologies, Inc.](http://www.basho.com/) under the [Apache Public License version 2](https://github.com/basho/machi/tree/master/LICENSE). - -### 1.8. Where can I find the Machi source code and documentation? Can I contribute? + +### 1.7. Where can I find the Machi source code and documentation? Can I contribute? All Machi source code and documentation can be found at GitHub: [https://github.com/basho/machi](https://github.com/basho/machi). @@ -223,8 +226,8 @@ ideas for improvement, please see our contributing & collaboration guidelines at [https://github.com/basho/machi/blob/master/CONTRIBUTING.md](https://github.com/basho/machi/blob/master/CONTRIBUTING.md). - -### 1.9. What is Machi's expected release schedule, packaging, and operating system/OS distribution support? + +### 1.8. What is Machi's expected release schedule, packaging, and operating system/OS distribution support? Basho expects that Machi's first major product release will take place during the 2nd quarter of 2016. @@ -302,15 +305,15 @@ file's writable phase). Does not have any file distribution/partitioning/sharding across -Machi chains: in a single Machi chain, all files are replicated by -all servers in the chain. The "random slicing" technique is used +Machi clusters: in a single Machi cluster, all files are replicated by +all servers in the cluster. The "cluster of clusters" concept is used to distribute/partition/shard files across multiple Machi clusters. File distribution/partitioning/sharding is performed automatically by the HDFS "name node". - Machi requires no central "name node" for single chain use or -for multi-chain cluster use. + Machi requires no central "name node" for single cluster use. +Machi requires no central "name node" for "cluster of clusters" use Requires a single "namenode" server to maintain file system contents and file content mapping. 
(May be deployed with a "secondary namenode" to reduce unavailability when the primary namenode fails.) @@ -476,8 +479,8 @@ difficult to adapt to Machi's design goals: * Both protocols use quorum majority consensus, which requires a minimum of *2F + 1* working servers to tolerate *F* failures. For example, to tolerate 2 server failures, quorum majority protocols - require a minimum of 5 servers. To tolerate the same number of - failures, Chain Replication requires a minimum of only 3 servers. + require a minimum of 5 servers. To tolerate the same number of + failures, Chain Replication requires only 3 servers. * Machi's use of "humming consensus" to manage internal server metadata state would also (probably) require conversion to Paxos or Raft. (Or "outsourced" to a service such as ZooKeeper.) @@ -494,17 +497,7 @@ Humming consensus is described in the [Machi chain manager high level design doc](https://github.com/basho/machi/tree/master/doc/high-level-chain-mgr.pdf). -### 3.3. Are there any presentations available about Humming Consensus - -Scott recently (November 2015) gave a presentation at the -[RICON 2015 conference](http://ricon.io) about one of the techniques -used by Machi; "Managing Chain Replication Metadata with -Humming Consensus" is available online now. -* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf) -* [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q) - - -### 3.4. Is it true that there's an allegory written to describe Humming Consensus? +### 3.3. Is it true that there's an allegory written to describe humming consensus? Yes. In homage to Leslie Lamport's original paper about the Paxos protocol, "The Part-time Parliamant", there is an allegorical story @@ -515,8 +508,8 @@ The full story, full of wonder and mystery, is called There is also a [short followup blog posting](http://www.snookles.com/slf-blog/2015/03/20/on-humming-consensus-an-allegory-part-2/). - -### 3.5. How is Machi tested? + +### 3.4. 
How is Machi tested? While not formally proven yet, Machi's implementation of Chain Replication and of humming consensus have been extensively tested with @@ -545,16 +538,16 @@ All test code is available in the [./test](./test) subdirectory. Modules that use QuickCheck will use a file suffix of `_eqc`, for example, [./test/machi_ap_repair_eqc.erl](./test/machi_ap_repair_eqc.erl). - -### 3.6. Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks + +### 3.5. Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks No, Machi's design assumes that each Machi server is a fully independent hardware and assumes only standard local disks (Winchester and/or SSD style) with local-only interfaces (e.g. SATA, SCSI, PCI) in each machine. - -### 3.7. Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device? + +### 3.6. Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device? No. When used with servers with multiple disks, the intent is to deploy multiple Machi servers per machine: one Machi server per disk. @@ -572,10 +565,10 @@ deploy multiple Machi servers per machine: one Machi server per disk. placement relative to 12 servers is smaller than a placement problem of managing 264 seprate disks (if each of 12 servers has 22 disks). - -### 3.8. What language(s) is Machi written in? + +### 3.7. What language(s) is Machi written in? -So far, Machi is written in Erlang, mostly. Machi uses at least one +So far, Machi is written in 100% Erlang. Machi uses at least one library, [ELevelDB](https://github.com/basho/eleveldb), that is implemented both in C++ and in Erlang, using Erlang NIFs (Native Interface Functions) to allow Erlang code to call C++ functions. @@ -587,16 +580,8 @@ in C, Java, or other "gotta go fast fast FAST!!" 
programming language. We expect that the Chain Replication manager and other critical "control plane" software will remain in Erlang. - -### 3.9. Can Machi run on Windows? Can Machi run on 32-bit platforms? - -The ELevelDB NIF does not compile or run correctly on Erlang/OTP -Windows platforms, nor does it compile correctly on 32-bit platforms. -Machi should support all 64-bit UNIX-like platforms that are supported -by Erlang/OTP and ELevelDB. - - -### 3.10. Does Machi use the Erlang/OTP network distribution system (aka "disterl")? + +### 3.8. Does Machi use the Erlang/OTP network distribution system (aka "disterl")? No, Machi doesn't use Erlang/OTP's built-in distributed message passing system. The code would be *much* simpler if we did use @@ -611,8 +596,8 @@ All wire protocols used by Machi are defined & implemented using [Protocol Buffers](https://developers.google.com/protocol-buffers/docs/overview). The definition file can be found at [./src/machi.proto](./src/machi.proto). - -### 3.11. Can I use HTTP to write/read stuff into/from Machi? + +### 3.9. Can I use HTTP to write/read stuff into/from Machi? Short answer: No, not yet. 
diff --git a/Makefile b/Makefile index 01b1e99..7ff19ed 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ endif OVERLAY_VARS ?= EUNIT_OPTS = -v -.PHONY: rel stagedevrel deps package pkgclean edoc +.PHONY: rel deps package pkgclean edoc all: deps compile @@ -57,37 +57,6 @@ relclean: stage : rel $(foreach dep,$(wildcard deps/*), rm -rf rel/$(REPO)/lib/$(shell basename $(dep))* && ln -sf $(abspath $(dep)) rel/$(REPO)/lib;) -## -## Developer targets -## -## devN - Make a dev build for node N -## stagedevN - Make a stage dev build for node N (symlink libraries) -## devrel - Make a dev build for 1..$DEVNODES -## stagedevrel Make a stagedev build for 1..$DEVNODES -## -## Example, make a 68 node devrel cluster -## make stagedevrel DEVNODES=68 - -.PHONY : stagedevrel devrel -DEVNODES ?= 3 - -# 'seq' is not available on all *BSD, so using an alternate in awk -SEQ = $(shell awk 'BEGIN { for (i = 1; i < '$(DEVNODES)'; i++) printf("%i ", i); print i ;exit(0);}') - -$(eval stagedevrel : $(foreach n,$(SEQ),stagedev$(n))) -$(eval devrel : $(foreach n,$(SEQ),dev$(n))) - -dev% : all - mkdir -p dev - rel/gen_dev $@ rel/vars/dev_vars.config.src rel/vars/$@_vars.config - (cd rel && ../rebar generate target_dir=../dev/$@ overlay_vars=vars/$@_vars.config) - -stagedev% : dev% - $(foreach dep,$(wildcard deps/*), rm -rf dev/$^/lib/$(shell basename $(dep))* && ln -sf $(abspath $(dep)) dev/$^/lib;) - -devclean: clean - rm -rf dev - DIALYZER_APPS = kernel stdlib sasl erts ssl compiler eunit crypto public_key syntax_tools PLT = $(HOME)/.machi_dialyzer_plt diff --git a/README.md b/README.md index 25b9fff..28f77d2 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,19 @@ -# Machi: a distributed, decentralized blob/large file store +# Machi: a robust & reliable, distributed, highly available, large file store [Travis-CI](http://travis-ci.org/basho/machi) :: ![Travis-CI](https://secure.travis-ci.org/basho/machi.png) Outline -1. [Why another blob/file store?](#sec1) +1. 
[Why another file store?](#sec1) 2. [Where to learn more about Machi](#sec2) 3. [Development status summary](#sec3) 4. [Contributing to Machi's development](#sec4) -## 1. Why another blob/file store? +## 1. Why another file store? Our goal is a robust & reliable, distributed, highly available, large -file and blob store. Such stores already exist, both in the open source world +file store. Such stores already exist, both in the open source world and in the commercial world. Why reinvent the wheel? We believe there are three reasons, ordered by decreasing rarity. @@ -25,8 +25,9 @@ there are three reasons, ordered by decreasing rarity. 3. We want to manage file replicas in a way that's provably correct and also easy to test. -Criteria #3 is difficult to find in the open source world but perhaps -not impossible. +Of all the file stores in the open source & commercial worlds, only +criteria #3 is a viable option. Or so we hope. Or we just don't +care, and if data gets lost or corrupted, then ... so be it. If we have app use cases where availability is more important than consistency, then systems that meet criteria #2 are also rare. @@ -38,13 +39,12 @@ file data and attempts best-effort file reads? If we really do care about data loss and/or data corruption, then we really want both #3 and #1. Unfortunately, systems that meet -criteria #1 are _very rare_. (Nonexistant?) +criteria #1 are _very rare_. Why? This is 2015. We have decades of research that shows that computer hardware can (and indeed does) corrupt data at nearly every level of the modern client/server application stack. Systems with end-to-end data corruption detection should be ubiquitous today. Alas, they are not. - Machi is an effort to change the deplorable state of the world, one Erlang function at a time. @@ -64,68 +64,49 @@ Humming Consensus" is available online now. 
* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf) * [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q) -See later in this document for how to run the Humming Consensus demos, -including the network partition simulator. - ## 3. Development status summary -Mid-March 2016: The Machi development team has been downsized in -recent months, and the pace of development has slowed. Here is a -summary of the status of Machi's major components. +Mid-December 2015: work is underway. -* Humming Consensus and the chain manager - * No new safety bugs have been found by model-checking tests. - * A new document, - (Hand-on experiments with Machi and Humming Consensus)[doc/humming-consensus-demo.md] - is now available. It is a tutorial for setting up a 3 virtual - machine Machi cluster and how to demonstrate the chain manager's - reactions to server stops & starts, crashes & restarts, and pauses - (simulated by `SIGSTOP` and `SIGCONT`). - * The chain manager can still make suboptimal-but-safe choices for - chain transitions when a server hangs/pauses temporarily. - * Recent chain manager changes have made the instability window - much shorter when the slow/paused server resumes execution. - * Scott believes that a modest change to the chain manager's - calculation of a new projection can reduce flapping in this (and - many other cases) less likely. Currently, the new local - projection is calculated using only local state (i.e., the chain - manager's internal state + the fitness server's state). - However, if the "latest" projection read from the public - projection stores were also input to the new projection - calculation function, then many obviously bad projections can be - avoided without needing rounds of Humming Consensus to - demonstrate that a bad projection is bad. +* In progress: + * Code refactoring: metadata management using + [ELevelDB](https://github.com/basho/eleveldb) + * File repair using file-centric, Merkle-style hash tree. 
+ * Server-side socket handling is now performed by + [ranch](https://github.com/ninenines/ranch) + * QuickCheck tests for file repair correctness + * 2015-12-15: The EUnit test `machi_ap_repair_eqc` is + currently failing occasionally because it (correctly) detects + double-write errors. Double-write errors will be eliminated + when the ELevelDB integration work is complete. + * The `make stage` and `make release` commands can be used to + create a primitive "package". Use `./rel/machi/bin/machi console` + to start the Machi app in interactive mode. Substitute the word + `start` instead of console to start Machi in background/daemon + mode. The `./rel/machi/bin/machi` command without any arguments + will give a short usage summary. + * Chain Replication management using the Humming Consensus + algorithm to manage chain state is stable. + * ... with the caveat that it runs very well in a very harsh + and unforgiving network partition simulator but has not run + much yet in the real world. + * All Machi client/server protocols are based on + [Protocol Buffers](https://developers.google.com/protocol-buffers/docs/overview). + * The current specification for Machi's protocols can be found at + [https://github.com/basho/machi/blob/master/src/machi.proto](https://github.com/basho/machi/blob/master/src/machi.proto). + * The Machi PB protocol is not yet stable. Expect change! + * The Erlang language client implementation of the high-level + protocol flavor is brittle (e.g., little error handling yet). -* FLU/data server process - * All known correctness bugs have been fixed. - * Performance has not yet been measured. Performance measurement - and enhancements are scheduled to start in the middle of March 2016. - (This will include a much-needed update to the `basho_bench` driver.) 
+If you would like to run the network partition simulator +mentioned in the Ricon 2015 presentation about Humming Consensus, +please see the +[partition simulator convergence test doc.](./doc/machi_chain_manager1_converge_demo.md) -* Access protocols and client libraries - * The protocol used by both external clients and internally (instead - of using Erlang's native message passing mechanisms) is based on - Protocol Buffers. - * (Machi PB protocol specification: ./src/machi.proto)[./src/machi.proto] - * At the moment, the PB specification contains two protocols. - Sometime in the near future, the spec will be split to separate - the external client API (the "high" protocol) from the internal - communication API (the "low" protocol). - -* Recent conference talks about Machi - * Erlang Factory San Francisco 2016 - (the slides and video recording)[http://www.erlang-factory.com/sfbay2016/scott-lystig-fritchie] - will be available a few weeks after the conference ends on March - 11, 2016. - * Ricon 2015 - * (The slides)[http://ricon.io/archive/2015/slides/Scott_Fritchie_Ricon_2015.pdf] - * and the (video recording)[https://www.youtube.com/watch?v=yR5kHL1bu1Q&index=13&list=PL9Jh2HsAWHxIc7Tt2M6xez_TOP21GBH6M] - are now available. - * If you would like to run the Humming Consensus code (with or without - the network partition simulator) as described in the RICON 2015 - presentation, please see the - [Humming Consensus demo doc](./doc/humming_consensus_demo.md). +If you'd like to work on a protocol such as Thrift, UBF, +msgpack over UDP, or some other protocol, let us know by +[opening an issue to discuss it](./issues/new). ## 4. Contributing to Machi's development @@ -153,22 +134,13 @@ X. The only known limitations for using R16 are minor type specification difference between R16 and 17, but we strongly suggest continuing development using version 17. -We also assume that you have the standard UNIX/Linux developer -tool chain for C and C++ applications. 
Also, we assume -that Git and GNU Make are available. -The utility used to compile the Machi source code, +We also assume that you have the standard UNIX/Linux developers +tool chain for C and C++ applications. Specifically, we assume `make` +is available. The utility used to compile the Machi source code, `rebar`, is pre-compiled and included in the repo. -For more details, please see the -[Machi development environment prerequisites doc](./doc/dev-prerequisites.md). Machi has a dependency on the [ELevelDB](https://github.com/basho/eleveldb) library. ELevelDB only supports UNIX/Linux OSes and 64-bit versions of Erlang/OTP only; we apologize to Windows-based and 32-bit-based Erlang developers for this restriction. - -### 4.3 New protocols and features - -If you'd like to work on a protocol such as Thrift, UBF, -msgpack over UDP, or some other protocol, let us know by -[opening an issue to discuss it](./issues/new). diff --git a/doc/README.md b/doc/README.md index b8e1949..3ad424c 100644 --- a/doc/README.md +++ b/doc/README.md @@ -66,9 +66,9 @@ an introduction to the self-management algorithm proposed for Machi. Most material has been moved to the [high-level-chain-mgr.pdf](high-level-chain-mgr.pdf) document. -### cluster (directory) +### cluster-of-clusters (directory) -This directory contains the sketch of the cluster design +This directory contains the sketch of the "cluster of clusters" design strawman for partitioning/distributing/sharding files across a large -number of independent Machi chains. +number of independent Machi clusters. 
diff --git a/doc/cluster/migration-3to4.fig b/doc/cluster-of-clusters/migration-3to4.fig similarity index 85% rename from doc/cluster/migration-3to4.fig rename to doc/cluster-of-clusters/migration-3to4.fig index 0faad27..eadf105 100644 --- a/doc/cluster/migration-3to4.fig +++ b/doc/cluster-of-clusters/migration-3to4.fig @@ -88,16 +88,16 @@ Single 4 0 0 50 -1 2 14 0.0000 4 180 495 4425 3525 ~8%\001 4 0 0 50 -1 2 14 0.0000 4 240 1710 5025 3525 ~25% total keys\001 4 0 0 50 -1 2 14 0.0000 4 180 495 6825 3525 ~8%\001 +4 0 0 50 -1 2 24 0.0000 4 270 1485 600 600 Cluster1\001 +4 0 0 50 -1 2 24 0.0000 4 270 1485 3000 600 Cluster2\001 +4 0 0 50 -1 2 24 0.0000 4 270 1485 5400 600 Cluster3\001 +4 0 0 50 -1 2 24 0.0000 4 270 1485 300 2850 Cluster1\001 +4 0 0 50 -1 2 24 0.0000 4 270 1485 2700 2850 Cluster2\001 +4 0 0 50 -1 2 24 0.0000 4 270 1485 5175 2850 Cluster3\001 +4 0 0 50 -1 2 24 0.0000 4 270 405 2100 2625 Cl\001 +4 0 0 50 -1 2 24 0.0000 4 270 405 6900 2625 Cl\001 4 0 0 50 -1 2 24 0.0000 4 270 195 2175 3075 4\001 4 0 0 50 -1 2 24 0.0000 4 270 195 4575 3075 4\001 4 0 0 50 -1 2 24 0.0000 4 270 195 6975 3075 4\001 -4 0 0 50 -1 2 24 0.0000 4 270 1245 600 600 Chain1\001 -4 0 0 50 -1 2 24 0.0000 4 270 1245 3000 600 Chain2\001 -4 0 0 50 -1 2 24 0.0000 4 270 1245 5400 600 Chain3\001 -4 0 0 50 -1 2 24 0.0000 4 270 285 2100 2625 C\001 -4 0 0 50 -1 2 24 0.0000 4 270 285 4500 2625 C\001 -4 0 0 50 -1 2 24 0.0000 4 270 285 6900 2625 C\001 -4 0 0 50 -1 2 24 0.0000 4 270 1245 525 2850 Chain1\001 -4 0 0 50 -1 2 24 0.0000 4 270 1245 2925 2850 Chain2\001 -4 0 0 50 -1 2 24 0.0000 4 270 1245 5325 2850 Chain3\001 -4 0 0 50 -1 2 18 0.0000 4 240 4350 1350 4875 Cluster locator, on the unit interval\001 +4 0 0 50 -1 2 24 0.0000 4 270 405 4500 2625 Cl\001 +4 0 0 50 -1 2 18 0.0000 4 240 3990 1200 4875 CoC locator, on the unit interval\001 diff --git a/doc/cluster-of-clusters/migration-3to4.png b/doc/cluster-of-clusters/migration-3to4.png new file mode 100644 index 0000000..e7ec417 Binary files 
/dev/null and b/doc/cluster-of-clusters/migration-3to4.png differ diff --git a/doc/cluster-of-clusters/migration-4.png b/doc/cluster-of-clusters/migration-4.png new file mode 100644 index 0000000..3e1414d Binary files /dev/null and b/doc/cluster-of-clusters/migration-4.png differ diff --git a/doc/cluster-of-clusters/name-game-sketch.org b/doc/cluster-of-clusters/name-game-sketch.org new file mode 100644 index 0000000..44b5df0 --- /dev/null +++ b/doc/cluster-of-clusters/name-game-sketch.org @@ -0,0 +1,479 @@ +-*- mode: org; -*- +#+TITLE: Machi cluster-of-clusters "name game" sketch +#+AUTHOR: Scott +#+STARTUP: lognotedone hidestars indent showall inlineimages +#+SEQ_TODO: TODO WORKING WAITING DONE +#+COMMENT: M-x visual-line-mode +#+COMMENT: Also, disable auto-fill-mode + +* 1. "Name Games" with random-slicing style consistent hashing + +Our goal: to distribute lots of files very evenly across a cluster of +Machi clusters (hereafter called a "cluster of clusters" or "CoC"). + +* 2. Assumptions + +** Basic familiarity with Machi high level design and Machi's "projection" + +The [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] contains all of the basic +background assumed by the rest of this document. + +** Analogy: "neighborhood : city :: Machi : cluster-of-clusters" + +Analogy: The word "machi" in Japanese means small town or +neighborhood. Just as the Tokyo Metropolitan Area is built from many +machis and smaller cities, so a big, partitioned file store can +be built out of many small Machi clusters. + +** Familiarity with the Machi cluster-of-clusters/CoC concept + +It's clear (I hope!) from +the [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] that Machi alone does not support +any kind of file partitioning/distribution/sharding across multiple +small Machi clusters. 
There must be another layer above a Machi cluster to +provide such partitioning services. + +The name "cluster of clusters" originated within Basho to avoid +conflicting use of the word "cluster". A Machi cluster is usually +synonymous with a single Chain Replication chain and a single set of +machines (e.g. 2-5 machines). However, in the not-so-far future, we +expect much more complicated patterns of Chain Replication to be used +in real-world deployments. + +"Cluster of clusters" is clunky and long, but we haven't found a good +substitute yet. If you have a good suggestion, please contact us! +~^_^~ + +Using the [[https://github.com/basho/machi/tree/master/prototype/demo-day-hack][cluster-of-clusters quick-and-dirty prototype]] as an +architecture sketch, let's now assume that we have ~n~ independent Machi +clusters. We assume that each of these clusters has roughly the same +chain length in the nominal case, e.g. chain length of 3. +We wish to provide partitioned/distributed file storage +across all ~n~ clusters. We call the entire collection of ~n~ Machi +clusters a "cluster of clusters", or abbreviated "CoC". + +We may wish to have several types of Machi clusters, e.g. chain length +of 3 for normal data, longer for cannot-afford-data-loss files, and +shorter for don't-care-if-it-gets-lost files. Each of these types of +chains will have a name ~N~ in the CoC namespace. The role of the CoC +namespace will be demonstrated in Section 3 below. + +** Continue CoC prototype's assumption: a Machi cluster is unaware of CoC + +Let's continue with an assumption that an individual Machi cluster +inside of the cluster-of-clusters is completely unaware of the +cluster-of-clusters layer. + +TODO: We may need to break this assumption sometime in the future? + +** The reader is familiar with the random slicing technique + +I'd done something very-very-nearly-identical for the Hibari database +6 years ago. 
But the Hibari technique was based on stuff I did at +Sendmail, Inc, so it felt old news to me. {shrug} + +The Hibari documentation has a brief photo illustration of how random +slicing works, see [[http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration][Hibari Sysadmin Guide, chain migration]] + +For a comprehensive description, please see these two papers: + +#+BEGIN_QUOTE +Reliable and Randomized Data Distribution Strategies for Large Scale Storage Systems +Alberto Miranda et al. +http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.226.5609 + (short version, HIPC'11) + +Random Slicing: Efficient and Scalable Data Placement for Large-Scale + Storage Systems +Alberto Miranda et al. +DOI: http://dx.doi.org/10.1145/2632230 (long version, ACM Transactions + on Storage, Vol. 10, No. 3, Article 9, 2014) +#+END_QUOTE + +** CoC locator: We borrow from random slicing but do not hash any strings! + +We will use the general technique of random slicing, but we adapt the +technique to fit our use case. + +In general, random slicing says: + +- Hash a string onto the unit interval [0.0, 1.0) +- Calculate h(unit interval point, Map) -> bin, where ~Map~ partitions + the unit interval into bins. + +Our adaptation is in step 1: we do not hash any strings. Instead, we +store & use the unit interval point as-is, without using a hash +function in this step. This number is called the "CoC locator". + +As described later in this doc, Machi file names are structured into +several components. One component of the file name contains the "CoC +locator"; we use the number as-is for step 2 above. + +* 3. A simple illustration + +We use a variation of the Random Slicing hash that we will call +~rs_hash_with_float()~. The Erlang-style function type is shown +below. + +#+BEGIN_SRC erlang +%% type specs, Erlang-style +-spec rs_hash_with_float(float(), rs_hash:map()) -> rs_hash:cluster_id(). 
+#+END_SRC + +I'm borrowing an illustration from the HibariDB documentation here, +but it fits my purposes quite well. (I am the original creator of that +image, and also the use license is compatible.) + +#+CAPTION: Illustration of 'Map', using four Machi clusters + +[[./migration-4.png]] + +Assume that we have a random slicing map called ~Map~. This particular +~Map~ maps the unit interval onto 4 Machi clusters: + +| Hash range | Cluster ID | +|-------------+------------| +| 0.00 - 0.25 | Cluster1 | +| 0.25 - 0.33 | Cluster4 | +| 0.33 - 0.58 | Cluster2 | +| 0.58 - 0.66 | Cluster4 | +| 0.66 - 0.91 | Cluster3 | +| 0.91 - 1.00 | Cluster4 | + +Assume that the system chooses a CoC locator of 0.05. +According to ~Map~, the value of +~rs_hash_with_float(0.05,Map) = Cluster1~. +Similarly, ~rs_hash_with_float(0.26,Map) = Cluster4~. + +* 4. An additional assumption: clients will want some control over file location + +We will continue to use the 4-cluster diagram from the previous +section. + +** Our new assumption: client control over initial file location + +The CoC management scheme may decide that files need to migrate to +other clusters. The reason could be for storage load or I/O load +balancing reasons. It could be because a cluster is being +decommissioned by its owners. There are many legitimate reasons why a +file that is initially created on cluster ID X has been moved to +cluster ID Y. + +However, there are also legitimate reasons for why the client would want +control over the choice of Machi cluster when the data is first +written. The single biggest reason is load balancing. Assuming that +the client (or the CoC management layer acting on behalf of the CoC +client) knows the current utilization across the participating Machi +clusters, then it may be very helpful to send new append() requests to +under-utilized clusters. + +* 5. 
Use of the CoC namespace: name separation plus chain type + +Let us assume that the CoC framework provides several different types +of chains: + +| Chain length | CoC namespace | Mode | Comment | +|--------------+---------------+------+----------------------------------| +| 3 | normal | AP | Normal storage redundancy & cost | +| 2 | reduced | AP | Reduced cost storage | +| 1 | risky | AP | Really, really cheap storage | +| 9 | paranoid | AP | Safety-critical storage | +| 3 | sequential | CP | Strong consistency | +|--------------+---------------+------+----------------------------------| + +The client may want to choose the amount of redundancy that its +application requires: normal, reduced cost, or perhaps even a single +copy. The CoC namespace is used by the client to signal this +intention. + +Further, the CoC administrators may wish to use the namespace to +provide separate storage for different applications. Jane's +application may use the namespace "jane-normal" and Bob's app uses +"bob-reduced". The CoC administrators may define separate groups of +chains on separate servers to serve these two applications. + +* 6. Floating point is not required ... it is merely convenient for explanation + +NOTE: Use of floating point terms is not required. For example, +integer arithmetic could be used, if using a sufficiently large +interval to create an even & smooth distribution of hashes across the +expected maximum number of clusters. + +For example, if the maximum CoC cluster size would be 4,000 individual +Machi clusters, then a minimum of 12 bits of integer space is required +to assign one integer per Machi cluster. However, for load balancing +purposes, a finer grain of (for example) 100 integers per Machi +cluster would permit file migration to move increments of +approximately 1% of a single Machi cluster's storage capacity. A +minimum of 12+7=19 bits of hash space would be necessary to accommodate +these constraints. 
+ +It is likely that Machi's final implementation will choose a 24 bit +integer to represent the CoC locator. + +* 7. Proposal: Break the opacity of Machi file names + +Machi assigns file names based on: + +~ClientSuppliedPrefix ++ "^" ++ SomeOpaqueFileNameSuffix~ + +What if the CoC client could peek inside of the opaque file name +suffix in order to look at the CoC location information that we might +code in the filename suffix? + +** The notation we use + +- ~T~ = the target CoC member/Cluster ID chosen by the CoC client at the time of ~append()~ +- ~p~ = file prefix, chosen by the CoC client. +- ~L~ = the CoC locator +- ~N~ = the CoC namespace +- ~u~ = the Machi file server unique opaque file name suffix, e.g. a GUID string +- ~F~ = a Machi file name, i.e., ~p^L^N^u~ + +** The details: CoC file write + +1. CoC client chooses ~p~, ~T~, and ~N~ (i.e., the file prefix, target + cluster, and target cluster namespace) +2. CoC client knows the CoC ~Map~ for namespace ~N~. +3. CoC client chooses some CoC locator value ~L~ such that + ~rs_hash_with_float(L,Map) = T~ (see below). +4. CoC client sends its request to cluster + ~T~: ~append_chunk(p,L,N,...) -> {ok,p^L^N^u,ByteOffset}~ +5. CoC client stores/uses the file name ~F = p^L^N^u~. + +** The details: CoC file read + +1. CoC client knows the file name ~F~ and parses it to find + the values of ~L~ and ~N~ (recall, ~F = p^L^N^u~). +2. CoC client knows the CoC ~Map~ for namespace ~N~. +3. CoC client calculates ~rs_hash_with_float(L,Map) = T~ +4. CoC client sends request to cluster ~T~: ~read_chunk(F,...) ->~ ... success! + +** The details: calculating 'L' (the CoC locator) to match a desired target cluster + +1. We know ~Map~, the current CoC mapping for a CoC namespace ~N~. +2. We look inside of ~Map~, and we find all of the unit interval ranges + that map to our desired target cluster ~T~. Let's call this list + ~MapList = [Range1=(start,end],Range2=(start,end],...]~. +3. In our example, ~T=Cluster2~. 
The example ~Map~ contains a single + unit interval range for ~Cluster2~, ~[(0.33,0.58]]~. +4. Choose a uniformly random number ~r~ on the unit interval. +5. Calculate locator ~L~ by mapping ~r~ onto the concatenation + of the CoC hash space range intervals in ~MapList~. For example, + if ~r=0.5~, then ~L = 0.33 + 0.5*(0.58-0.33) = 0.455~, which is + exactly in the middle of the ~(0.33,0.58]~ interval. + +** A bit more about the CoC locator's meaning and use + +- If two files were written using exactly the same CoC locator and the + same CoC namespace, then the client is indicating that it wishes + that the two files be stored in the same chain. +- If two files have a different CoC locator, then the client has + absolutely no expectation of where the two files will be stored + relative to each other. + +Given the items above, some consequences are: + +- If the client doesn't care about CoC placement, then picking a + random number is fine. Always choosing a different locator ~L~ for + each append will scatter data across the CoC as widely as possible. +- If the client believes that some physical locality is good, then the + client should reuse the same locator ~L~ for a batch of appends to + the same prefix ~p~ and namespace ~N~. We have no recommendations + for the batch size, yet; perhaps 10-1,000 might be a good start for + experiments? + +When the client chooses CoC namespace ~N~ and CoC locator ~L~ (using +a random number or the target cluster technique), the client uses ~N~'s CoC +map to find the CoC target cluster, ~T~. The client has also chosen +the file prefix ~p~. 
The append op sent to cluster ~T~ would look +like: + +~append_chunk(N="reduced",L=0.25,p="myprefix",<<900-data-bytes>>,<<checksum>>,...)~ + +A successful result would yield a chunk position: + +~{offset=883293,size=900,file="myprefix^0.25^reduced^OpaqueSuffix"}~ + +** A bit more about the CoC namespace's meaning and use + +- The CoC framework will provide means of creating and managing + chains of different types, e.g., chain length, consistency mode. +- The CoC framework will manage the mapping of CoC namespace names to + the chains in the system. +- The CoC framework will provide a query service to map a CoC + namespace name to a CoC map, + e.g. ~coc_latest_map("reduced") -> Map{generation=7,...}~. + +For use by Riak CS, for example, we'd likely start with the following +namespaces ... working our way down the list as we add new features +and/or re-implement existing CS features. + +- "standard" = Chain length = 3, eventual consistency mode +- "reduced" = Chain length = 2, eventual consistency mode. +- "stanchion7" = Chain length = 7, strong consistency mode. Perhaps + use this namespace for the metadata required to re-implement the + operations that are performed by today's Stanchion application. + +* 8. File migration (a.k.a. rebalancing/repartitioning/resharding/redistribution) + +** What is "migration"? + +This section describes Machi's file migration. Other storage systems +call this process "rebalancing", "repartitioning", "resharding" or +"redistribution". +For Riak Core applications, it is called "handoff" and "ring resizing" +(depending on the context). +See also the [[http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsUserGuide.html#Balancer][Hadoop file balancer]] for another example of a data +migration process. + +As discussed in section 4, the client can have good reason for wanting +to have some control of the initial location of the file within the +cluster. 
However, the cluster manager has an ongoing interest in +balancing resources throughout the lifetime of the file. Disks will +get full, hardware will change, read workload will fluctuate, +etc etc. + +This document uses the word "migration" to describe moving data from +one Machi chain to another within a CoC system. + +A simple variation of the Random Slicing hash algorithm can easily +accommodate Machi's need to migrate files without interfering with +availability. Machi's migration task is much simpler due to the +immutable nature of Machi file data. + +** Change to Random Slicing + +The map used by the Random Slicing hash algorithm needs a few simple +changes to make file migration straightforward. + +- Add a "generation number", a strictly increasing number (similar to + a Machi cluster's "epoch number") that reflects the history of + changes made to the Random Slicing map +- Use a list of Random Slicing maps instead of a single map, one map + per chance that files may not have been migrated yet out of + that map. 
+ +As an example: + +#+CAPTION: Illustration of 'Map', using four Machi clusters + +[[./migration-3to4.png]] + +And the new Random Slicing map for some CoC namespace ~N~ might look +like this: + +| Generation number / Namespace | 7 / reduced | +|-------------------------------+-------------| +| SubMap | 1 | +|-------------------------------+-------------| +| Hash range | Cluster ID | +|-------------------------------+-------------| +| 0.00 - 0.33 | Cluster1 | +| 0.33 - 0.66 | Cluster2 | +| 0.66 - 1.00 | Cluster3 | +|-------------------------------+-------------| +| SubMap | 2 | +|-------------------------------+-------------| +| Hash range | Cluster ID | +|-------------------------------+-------------| +| 0.00 - 0.25 | Cluster1 | +| 0.25 - 0.33 | Cluster4 | +| 0.33 - 0.58 | Cluster2 | +| 0.58 - 0.66 | Cluster4 | +| 0.66 - 0.91 | Cluster3 | +| 0.91 - 1.00 | Cluster4 | + +When a new Random Slicing map contains a single submap, then its use +is identical to the original Random Slicing algorithm. If the map +contains multiple submaps, then the access rules change a bit: + +- Write operations always go to the newest/largest submap. +- Read operations attempt to read from all unique submaps. + - Skip searching submaps that refer to the same cluster ID. + - In this example, unit interval value 0.10 is mapped to Cluster1 + by both submaps. + - Read from newest/largest submap to oldest/smallest submap. + - If not found in any submap, search a second time (to handle races + with file copying between submaps). + - If the requested data is found, optionally copy it directly to the + newest submap. (This is a variation of read repair (RR). RR here + accelerates the migration process and can reduce the number of + operations required to query servers in multiple submaps). + +The cluster-of-clusters manager is responsible for: + +- Managing the various generations of the CoC Random Slicing maps for + all namespaces. +- Distributing namespace maps to CoC clients. 
+- Managing the processes that are responsible for copying "cold" data, + i.e., file data that is not regularly accessed, to its new submap + location. +- Deleting a file from its old cluster once migration of the file to + its new cluster is confirmed successful. + +In example map #7, the CoC manager will copy files with unit interval +assignments in ~(0.25,0.33]~, ~(0.58,0.66]~, and ~(0.91,1.00]~ from their +old locations in cluster IDs Cluster1/2/3 to their new cluster, +Cluster4. When the CoC manager is satisfied that all such files have +been copied to Cluster4, then the CoC manager can create and +distribute a new map, such as: + +| Generation number / Namespace | 8 / reduced | +|-------------------------------+-------------| +| SubMap | 1 | +|-------------------------------+-------------| +| Hash range | Cluster ID | +|-------------------------------+-------------| +| 0.00 - 0.25 | Cluster1 | +| 0.25 - 0.33 | Cluster4 | +| 0.33 - 0.58 | Cluster2 | +| 0.58 - 0.66 | Cluster4 | +| 0.66 - 0.91 | Cluster3 | +| 0.91 - 1.00 | Cluster4 | + +The HibariDB system performs data migrations in almost exactly this +manner. However, one important +limitation of HibariDB is not being able to +perform more than one migration at a time. HibariDB's data is +mutable, and mutation already causes many problems when migrating data +across two submaps; three or more submaps was too complex to implement +quickly. + +Fortunately for Machi, its file data is immutable and therefore Machi can +easily manage many migrations in parallel, i.e., its submap list may +be several maps long, each one for an in-progress file migration. + +* 9. Other considerations for FLU/sequencer implementations + +** Append to existing file when possible + +In the earliest Machi FLU implementation, it was impossible to append +to the same file after about 30 seconds. For example: + +- Client: ~append(prefix="foo",...) -> {ok,"foo^suffix1",Offset1}~ +- Client: ~append(prefix="foo",...) 
-> {ok,"foo^suffix1",Offset2}~ +- Client: ~append(prefix="foo",...) -> {ok,"foo^suffix1",Offset3}~ +- Client: sleep 40 seconds +- Server: after 30 seconds idle time, stop Erlang server process for + the ~"foo^suffix1"~ file +- Client: ...wakes up... +- Client: ~append(prefix="foo",...) -> {ok,"foo^suffix2",Offset4}~ + +Our ideal append behavior is to always append to the same file. Why? +It would be nice if Machi didn't create zillions of tiny files if the +client appends to some prefix very infrequently. In general, it is +better to create fewer & bigger files by re-using a Machi file name +when possible. + +The sequencer should always assign new offsets to the latest/newest +file for any prefix, as long as all prerequisites are also true, + +- The epoch has not changed. (In AP mode, epoch change -> mandatory file name suffix change.) +- The latest file for prefix ~p~ is smaller than maximum file size for a FLU's configuration. + +* 10. Acknowledgments + +The source for the "migration-4.png" and "migration-3to4.png" images +come from the [[http://hibari.github.io/hibari-doc/images/migration-3to4.png][HibariDB documentation]]. + diff --git a/doc/cluster/migration-3to4.png b/doc/cluster/migration-3to4.png deleted file mode 100644 index cbef7e9..0000000 Binary files a/doc/cluster/migration-3to4.png and /dev/null differ diff --git a/doc/cluster/migration-4.png b/doc/cluster/migration-4.png deleted file mode 100644 index b1e2b31..0000000 Binary files a/doc/cluster/migration-4.png and /dev/null differ diff --git a/doc/cluster/name-game-sketch.org b/doc/cluster/name-game-sketch.org deleted file mode 100644 index 21d2bd6..0000000 --- a/doc/cluster/name-game-sketch.org +++ /dev/null @@ -1,481 +0,0 @@ --*- mode: org; -*- -#+TITLE: Machi cluster "name game" sketch -#+AUTHOR: Scott -#+STARTUP: lognotedone hidestars indent showall inlineimages -#+SEQ_TODO: TODO WORKING WAITING DONE -#+COMMENT: M-x visual-line-mode -#+COMMENT: Also, disable auto-fill-mode - -* 1. 
"Name Games" with random-slicing style consistent hashing - -Our goal: to distribute lots of files very evenly across a large -collection of individual, small Machi chains. - -* 2. Assumptions - -** Basic familiarity with Machi high level design and Machi's "projection" - -The [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] contains all of the basic -background assumed by the rest of this document. - -** Analogy: "neighborhood : city :: Machi chain : Machi cluster" - -Analogy: The word "machi" in Japanese means small town or -neighborhood. As the Tokyo Metropolitan Area is built from many -machis and smaller cities, therefore a big, partitioned file store can -be built out of many small Machi chains. - -** Familiarity with the Machi chain concept - -It's clear (I hope!) from -the [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] that Machi alone does not support -any kind of file partitioning/distribution/sharding across multiple -small Machi chains. There must be another layer above a Machi chain to -provide such partitioning services. - -Using the [[https://github.com/basho/machi/tree/master/prototype/demo-day-hack][cluster quick-and-dirty prototype]] as an -architecture sketch, let's now assume that we have ~n~ independent Machi -chains. We assume that each of these chains has the same -chain length in the nominal case, e.g. chain length of 3. -We wish to provide partitioned/distributed file storage -across all ~n~ chains. We call the entire collection of ~n~ Machi -chains a "cluster". - -We may wish to have several types of Machi clusters. For example: - -+ Chain length of 1 for "don't care if it gets lost, - store stuff very very cheaply" data. -+ Chain length of 2 for normal data. - + Equivalent to quorum replication's reliability with 3 copies. -+ Chain length of 7 for critical, unreplaceable data. 
- + Equivalent to quorum replication's reliability with 15 copies. - -Each of these types of chains will have a name ~N~ in the -namespace. The role of the cluster namespace will be demonstrated in -Section 3 below. - -** Continue an early assumption: a Machi chain is unaware of clustering - -Let's continue with an assumption that an individual Machi chain -inside of a cluster is completely unaware of the cluster layer. - -** The reader is familiar with the random slicing technique - -I'd done something very-very-nearly-like-this for the Hibari database -6 years ago. But the Hibari technique was based on stuff I did at -Sendmail, Inc, in 2000, so this technique feels like old news to me. -{shrug} - -The following section provides an illustrated example. -Very quickly, the random slicing algorithm is: - -- Hash a string onto the unit interval [0.0, 1.0) -- Calculate h(unit interval point, Map) -> bin, where ~Map~ divides - the unit interval into bins (or partitions or shards). - -Machi's adaptation is in step 1: we do not hash any strings. Instead, we -simply choose a number on the unit interval. This number is called -the "cluster locator number". - -As described later in this doc, Machi file names are structured into -several components. One component of the file name contains the cluster -locator number; we use the number as-is for step 2 above. - -*** For more information about Random Slicing - -For a comprehensive description of random slicing, please see the -first two papers. For a quicker summary, please see the third -reference. - -#+BEGIN_QUOTE -Reliable and Randomized Data Distribution Strategies for Large Scale Storage Systems -Alberto Miranda et al. -http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.226.5609 - (short version, HIPC'11) - -Random Slicing: Efficient and Scalable Data Placement for Large-Scale - Storage Systems -Alberto Miranda et al. -DOI: http://dx.doi.org/10.1145/2632230 (long version, ACM Transactions - on Storage, Vol. 10, No. 
3, Article 9, 2014) - -[[http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration][Hibari Sysadmin Guide, chain migration section]]. -http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration -#+END_QUOTE - -* 3. A simple illustration - -We use a variation of the Random Slicing hash that we will call -~rs_hash_with_float()~. The Erlang-style function type is shown -below. - -#+BEGIN_SRC erlang -%% type specs, Erlang-style --spec rs_hash_with_float(float(), rs_hash:map()) -> rs_hash:chain_id(). -#+END_SRC - -I'm borrowing an illustration from the HibariDB documentation here, -but it fits my purposes quite well. (I am the original creator of that -image, and also the use license is compatible.) - -#+CAPTION: Illustration of 'Map', using four Machi chains - -[[./migration-4.png]] - -Assume that we have a random slicing map called ~Map~. This particular -~Map~ maps the unit interval onto 4 Machi chains: - -| Hash range | Chain ID | -|-------------+----------| -| 0.00 - 0.25 | Chain1 | -| 0.25 - 0.33 | Chain4 | -| 0.33 - 0.58 | Chain2 | -| 0.58 - 0.66 | Chain4 | -| 0.66 - 0.91 | Chain3 | -| 0.91 - 1.00 | Chain4 | - -Assume that the system chooses a cluster locator of 0.05. -According to ~Map~, the value of -~rs_hash_with_float(0.05,Map) = Chain1~. -Similarly, ~rs_hash_with_float(0.26,Map) = Chain4~. - -This example should look very similar to Hibari's technique. -The Hibari documentation has a brief photo illustration of how random -slicing works, see [[http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration][Hibari Sysadmin Guide, chain migration]]. - -* 4. 
Use of the cluster namespace: name separation plus chain type - -Let us assume that the cluster framework provides several different types -of chains: - -| Chain length | Namespace | Consistency Mode | Comment | -|--------------+--------------+------------------+----------------------------------| -| 3 | ~normal~ | eventual | Normal storage redundancy & cost | -| 2 | ~reduced~ | eventual | Reduced cost storage | -| 1 | ~risky~ | eventual | Really, really cheap storage | -| 7 | ~paranoid~ | eventual | Safety-critical storage | -| 3 | ~sequential~ | strong | Strong consistency | -|--------------+--------------+------------------+----------------------------------| - -The client may want to choose the amount of redundancy that its -application requires: normal, reduced cost, or perhaps even a single -copy. The cluster namespace is used by the client to signal this -intention. - -Further, the cluster administrators may wish to use the namespace to -provide separate storage for different applications. Jane's -application may use the namespace "jane-normal" and Bob's app uses -"bob-reduced". Administrators may definine separate groups of -chains on separate servers to serve these two applications. - -* 5. In its lifetime, a file may be moved to different chains - -The cluster management scheme may decide that files need to migrate to -other chains -- i.e., file that is initially created on chain ID ~X~ -has been moved to chain ID ~Y~. - -+ For storage load or I/O load balancing reasons. -+ Because a chain is being decommissioned by the sysadmin. - -* 6. Floating point is not required ... it is merely convenient for explanation - -NOTE: Use of floating point terms is not required. For example, -integer arithmetic could be used, if using a sufficiently large -interval to create an even & smooth distribution of hashes across the -expected maximum number of chains. 
- -For example, if the maximum cluster size would be 4,000 individual -Machi chains, then a minimum of 12 bits of integer space is required -to assign one integer per Machi chain. However, for load balancing -purposes, a finer grain of (for example) 100 integers per Machi -chain would permit file migration to move increments of -approximately 1% of single Machi chain's storage capacity. A -minimum of 12+7=19 bits of hash space would be necessary to accommodate -these constraints. - -It is likely that Machi's final implementation will choose a 24 bit -integer (or perhaps 32 bits) to represent the cluster locator. - -* 7. Proposal: Break the opacity of Machi file names, slightly. - -Machi assigns file names based on: - -~ClientSuppliedPrefix ++ "^" ++ SomeOpaqueFileNameSuffix~ - -What if some parts of the system could peek inside of the opaque file name -suffix in order to look at the cluster location information that we might -code in the filename suffix? - -We break the system into parts that speak two levels of protocols, -"high" and "low". - -+ The high level protocol is used outside of the Machi cluster -+ The low level protocol is used inside of the Machi cluster - -Both protocols are based on a Protocol Buffers specification and -implementation. Other protocols, such as HTTP, will be added later. - -#+BEGIN_SRC - +-----------------------+ - | Machi external client | - | e.g. Riak CS | - +-----------------------+ - ^ - | Machi "high" API - | ProtoBuffs protocol Machi cluster boundary: outside -......................................................................... 
- | Machi cluster boundary: inside - v - +--------------------------+ +------------------------+ - | Machi "high" API service | | Machi HTTP API service | - +--------------------------+ +------------------------+ - ^ | - | +------------------------+ - v v - +------------------------+ - | Cluster bridge service | - +------------------------+ - ^ - | Machi "low" API - | ProtoBuffs protocol - +----------------------------------------+----+----+ - | | | | - v v v v - +-------------------------+ ... other chains... - | Chain C1 (logical view) | - | +--------------+ | - | | FLU server 1 | | - | | +--------------+ | - | +--| FLU server 2 | | - | +--------------+ | In reality, API bridge talks directly - +-------------------------+ to each FLU server in a chain. -#+END_SRC - -** The notation we use - -- ~N~ = the cluster namespace, chosen by the client. -- ~p~ = file prefix, chosen by the client. -- ~L~ = the cluster locator (a number, type is implementation-dependent) -- ~Map~ = a mapping of cluster locators to chains -- ~T~ = the target chain ID/name -- ~u~ = a unique opaque file name suffix, e.g. a GUID string -- ~F~ = a Machi file name, i.e., a concatenation of ~p^L^N^u~ - -** The details: cluster file append - -0. Cluster client chooses ~N~ and ~p~ (i.e., cluster namespace and - file prefix) and sends the append request to a Machi cluster member - via the Protocol Buffers "high" API. -1. Cluster bridge chooses ~T~ (i.e., target chain), based on criteria - such as disk utilization percentage. -2. Cluster bridge knows the cluster ~Map~ for namespace ~N~. -3. Cluster bridge choose some cluster locator value ~L~ such that - ~rs_hash_with_float(L,Map) = T~ (see algorithm below). -4. Cluster bridge sends its request to chain - ~T~: ~append_chunk(p,L,N,...) -> {ok,p^L^N^u,ByteOffset}~ -5. Cluster bridge forwards the reply tuple to the client. -6. Client stores/uses the file name ~F = p^L^N^u~. - -** The details: Cluster file read - -0. 
Cluster client sends the read request to a Machi cluster member via - the Protocol Buffers "high" API. -1. Cluster bridge parses the file name ~F~ to find - the values of ~L~ and ~N~ (recall, ~F = p^L^N^u~). -2. Cluster bridge knows the Cluster ~Map~ for type ~N~. -3. Cluster bridge calculates ~rs_hash_with_float(L,Map) = T~ -4. Cluster bridge sends request to chain ~T~: - ~read_chunk(F,...) ->~ ... reply -5. Cluster bridge forwards the reply to the client. - -** The details: calculating 'L' (the cluster locator number) to match a desired target chain - -1. We know ~Map~, the current cluster mapping for a cluster namespace ~N~. -2. We look inside of ~Map~, and we find all of the unit interval ranges - that map to our desired target chain ~T~. Let's call this list - ~MapList = [Range1=(start,end],Range2=(start,end],...]~. -3. In our example, ~T=Chain2~. The example ~Map~ contains a single - unit interval range for ~Chain2~, ~[(0.33,0.58]]~. -4. Choose a uniformly random number ~r~ on the unit interval. -5. Calculate the cluster locator ~L~ by mapping ~r~ onto the concatenation - of the cluster hash space range intervals in ~MapList~. For example, - if ~r=0.5~, then ~L = 0.33 + 0.5*(0.58-0.33) = 0.455~, which is - exactly in the middle of the ~(0.33,0.58]~ interval. - -** A bit more about the cluster namespaces's meaning and use - -For use by Riak CS, for example, we'd likely start with the following -namespaces ... working our way down the list as we add new features -and/or re-implement existing CS features. - -- "standard" = Chain length = 3, eventually consistency mode -- "reduced" = Chain length = 2, eventually consistency mode. -- "stanchion7" = Chain length = 7, strong consistency mode. Perhaps - use this namespace for the metadata required to re-implement the - operations that are performed by today's Stanchion application. 
- -We want the cluster framework to: - -- provide means of creating and managing - chains of different types, e.g., chain length, consistency mode. -- manage the mapping of cluster namespace - names to the chains in the system. -- provide query functions to map a cluster - namespace name to a cluster map, - e.g. ~get_cluster_latest_map("reduced") -> Map{generation=7,...}~. - -* 8. File migration (a.k.a. rebalancing/reparitioning/resharding/redistribution) - -** What is "migration"? - -This section describes Machi's file migration. Other storage systems -call this process as "rebalancing", "repartitioning", "resharding" or -"redistribution". -For Riak Core applications, it is called "handoff" and "ring resizing" -(depending on the context). -See also the [[http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsUserGuide.html#Balancer][Hadoop file balancer]] for another example of a data -migration process. - -As discussed in section 5, the client can have good reason for wanting -to have some control of the initial location of the file within the -chain. However, the chain manager has an ongoing interest in -balancing resources throughout the lifetime of the file. Disks will -get full, hardware will change, read workload will fluctuate, -etc etc. - -This document uses the word "migration" to describe moving data from -one Machi chain to another chain within a cluster system. - -A simple variation of the Random Slicing hash algorithm can easily -accommodate Machi's need to migrate files without interfering with -availability. Machi's migration task is much simpler due to the -immutable nature of Machi file data. - -** Change to Random Slicing - -The map used by the Random Slicing hash algorithm needs a few simple -changes to make file migration straightforward. 
- -- Add a "generation number", a strictly increasing number (similar to - a Machi chain's "epoch number") that reflects the history of - changes made to the Random Slicing map -- Use a list of Random Slicing maps instead of a single map, one map - per chance that files may not have been migrated yet out of - that map. - -As an example: - -#+CAPTION: Illustration of 'Map', using four Machi chains - -[[./migration-3to4.png]] - -And the new Random Slicing map for some cluster namespace ~N~ might look -like this: - -| Generation number / Namespace | 7 / reduced | -|-------------------------------+-------------| -| SubMap | 1 | -|-------------------------------+-------------| -| Hash range | Chain ID | -|-------------------------------+-------------| -| 0.00 - 0.33 | Chain1 | -| 0.33 - 0.66 | Chain2 | -| 0.66 - 1.00 | Chain3 | -|-------------------------------+-------------| -| SubMap | 2 | -|-------------------------------+-------------| -| Hash range | Chain ID | -|-------------------------------+-------------| -| 0.00 - 0.25 | Chain1 | -| 0.25 - 0.33 | Chain4 | -| 0.33 - 0.58 | Chain2 | -| 0.58 - 0.66 | Chain4 | -| 0.66 - 0.91 | Chain3 | -| 0.91 - 1.00 | Chain4 | - -When a new Random Slicing map contains a single submap, then its use -is identical to the original Random Slicing algorithm. If the map -contains multiple submaps, then the access rules change a bit: - -- Write operations always go to the newest/largest submap. -- Read operations attempt to read from all unique submaps. - - Skip searching submaps that refer to the same chain ID. - - In this example, unit interval value 0.10 is mapped to Chain1 - by both submaps. - - Read from newest/largest submap to oldest/smallest submap. - - If not found in any submap, search a second time (to handle races - with file copying between submaps). - - If the requested data is found, optionally copy it directly to the - newest submap. (This is a variation of read repair (RR). 
RR here - accelerates the migration process and can reduce the number of - operations required to query servers in multiple submaps). - -The cluster manager is responsible for: - -- Managing the various generations of the cluster Random Slicing maps for - all namespaces. -- Distributing namespace maps to cluster bridges. -- Managing the processes that are responsible for copying "cold" data, - i.e., files data that is not regularly accessed, to its new submap - location. -- When migration of a file to its new chain is confirmed successful, - delete it from the old chain. - -In example map #7, the cluster manager will copy files with unit interval -assignments in ~(0.25,0.33]~, ~(0.58,0.66]~, and ~(0.91,1.00]~ from their -old locations in chain IDs Chain1/2/3 to their new chain, -Chain4. When the cluster manager is satisfied that all such files have -been copied to Chain4, then the cluster manager can create and -distribute a new map, such as: - -| Generation number / Namespace | 8 / reduced | -|-------------------------------+-------------| -| SubMap | 1 | -|-------------------------------+-------------| -| Hash range | Chain ID | -|-------------------------------+-------------| -| 0.00 - 0.25 | Chain1 | -| 0.25 - 0.33 | Chain4 | -| 0.33 - 0.58 | Chain2 | -| 0.58 - 0.66 | Chain4 | -| 0.66 - 0.91 | Chain3 | -| 0.91 - 1.00 | Chain4 | - -The HibariDB system performs data migrations in almost exactly this -manner. However, one important -limitation of HibariDB is not being able to -perform more than one migration at a time. HibariDB's data is -mutable. Mutation causes many problems when migrating data -across two submaps; three or more submaps was too complex to implement -quickly and correctly. - -Fortunately for Machi, its file data is immutable and therefore can -easily manage many migrations in parallel, i.e., its submap list may -be several maps long, each one for an in-progress file migration. - -* 9. 
Other considerations for FLU/sequencer implementations - -** Append to existing file when possible - -The sequencer should always assign new offsets to the latest/newest -file for any prefix, as long as all prerequisites are also true, - -- The epoch has not changed. (In AP mode, epoch change -> mandatory - file name suffix change.) -- The cluster locator number is stable. -- The latest file for prefix ~p~ is smaller than maximum file size for - a FLU's configuration. - -The stability of the cluster locator number is an implementation detail that -must be managed by the cluster bridge. - -Reuse of the same file is not possible if the bridge always chooses a -different cluster locator number ~L~ or if the client always uses a unique -file prefix ~p~. The latter is a sign of a misbehaved client; the -former is a poorly-implemented bridge. - -* 10. Acknowledgments - -The original source for the "migration-4.png" and "migration-3to4.png" images -come from the [[http://hibari.github.io/hibari-doc/images/migration-3to4.png][HibariDB documentation]]. - diff --git a/doc/dev-clone-compile.md b/doc/dev-clone-compile.md deleted file mode 100644 index 3ba78e1..0000000 --- a/doc/dev-clone-compile.md +++ /dev/null @@ -1,30 +0,0 @@ -# Clone and compile Machi - -Clone the Machi source repo and compile the source and test code. Run -the following commands at your login shell: - - cd /tmp - git clone https://github.com/basho/machi.git - cd machi - git checkout master - make # or 'gmake' if GNU make uses an alternate name - -Then run the unit test suite. This may take up to two minutes or so -to finish. - - make test - -At the end, the test suite should report that all tests passed. The -actual number of tests shown in the "All `X` tests passed" line may be -different than the example below. - - [... many lines omitted ...] - module 'event_logger' - module 'chain_mgr_legacy' - ======================================================= - All 90 tests passed. 
- -If you had a test failure, a likely cause may be a limit on the number -of file descriptors available to your user process. (Recent releases -of OS X have a limit of 1024 file descriptors, which may be too slow.) -The output of the `limit -n` will tell you your file descriptor limit. diff --git a/doc/dev-prerequisites.md b/doc/dev-prerequisites.md deleted file mode 100644 index 66afd41..0000000 --- a/doc/dev-prerequisites.md +++ /dev/null @@ -1,38 +0,0 @@ -## Machi developer environment prerequisites - -1. Machi requires an 64-bit variant of UNIX: OS X, FreeBSD, Linux, or - Solaris machine is a standard developer environment for C and C++ - applications (64-bit versions). -2. You'll need the `git` source management utility. -3. You'll need the 64-bit Erlang/OTP 17 runtime environment. Please - don't use earlier or later versions until we have a chance to fix - the compilation warnings that versions R16B and 18 will trigger. - Also, please verify that you are not using a 32-bit Erlang/OTP - runtime package. - -For `git` and the Erlang runtime, please use your OS-specific -package manager to install these. If your package manager doesn't -have 64-bit Erlang/OTP version 17 available, then we recommend using the -[precompiled packages available at Erlang Solutions](https://www.erlang-solutions.com/resources/download.html). - -Also, please verify that you have enough file descriptors available to -your user processes. The output of `ulimit -n` should report at least -4,000 file descriptors available. If your limit is lower (a frequent -problem for OS X users), please increase it to at least 4,000. - -# Using Vagrant to set up a developer environment for Machi - -The Machi source directory contains a `Vagrantfile` for creating an -Ubuntu Linux-based virtual machine for compiling and running Machi. -This file is in the -[$SRC_TOP/priv/humming-consensus-demo.vagrant](../priv/humming-consensus-demo.vagrant) -directory. 
- -If used as-is, the virtual machine specification is modest. - -* 1 virtual CPU -* 512MB virtual memory -* 768MB swap space -* 79GB sparse virtual disk image. After installing prerequisites and - compiling Machi, the root file system uses approximately 2.7 GBytes. - diff --git a/doc/flu-and-chain-lifecycle.org b/doc/flu-and-chain-lifecycle.org index d81b326..4672080 100644 --- a/doc/flu-and-chain-lifecycle.org +++ b/doc/flu-and-chain-lifecycle.org @@ -14,10 +14,10 @@ complete yet, so we are working one small step at a time. + FLU and Chain Life Cycle Management + Terminology review + Terminology: Machi run-time components/services/thingies - + Terminology: Machi chain data structures - + Terminology: Machi cluster data structures + + Terminology: Machi data structures + + Terminology: Cluster-of-cluster (CoC) data structures + Overview of administrative life cycles - + Cluster administrative life cycle + + Cluster-of-clusters (CoC) administrative life cycle + Chain administrative life cycle + FLU server administrative life cycle + Quick admin: declarative management of Machi FLU and chain life cycles @@ -57,8 +57,10 @@ complete yet, so we are working one small step at a time. quorum replication technique requires ~2F+1~ members in the general case.) -+ Cluster: A collection of Machi chains that are used to store files - in a horizontally partitioned/sharded/distributed manner. ++ Cluster: this word can be used interchangeably with "chain". + ++ Cluster-of-clusters: A collection of Machi clusters where files are + horizontally partitioned/sharded/distributed across ** Terminology: Machi data structures @@ -73,13 +75,13 @@ complete yet, so we are working one small step at a time. to another, e.g., when the chain is temporarily shortened by the failure of a member FLU server. 
-** Terminology: Machi cluster data structures +** Terminology: Cluster-of-cluster (CoC) data structures + Namespace: A collection of human-friendly names that are mapped to groups of Machi chains that provide the same type of storage service: consistency mode, replication policy, etc. + A single namespace name, e.g. ~normal-ec~, is paired with a single - cluster map (see below). + CoC chart (see below). + Example: ~normal-ec~ might be a collection of Machi chains in eventually-consistent mode that are of length=3. + Example: ~risky-ec~ might be a collection of Machi chains in @@ -87,31 +89,32 @@ complete yet, so we are working one small step at a time. + Example: ~mgmt-critical~ might be a collection of Machi chains in strongly-consistent mode that are of length=7. -+ Cluster map: Encodes the rules which partition/shard/distribute - the files stored in a particular namespace across a group of chains - that collectively store the namespace's files. ++ CoC chart: Encodes the rules which partition/shard/distribute a + particular namespace across a group of chains that collectively + store the namespace's files. + + "chart: noun, a geographical map or plan, especially one used for + navigation by sea or air." -+ Chain weight: A value assigned to each chain within a cluster map ++ Chain weight: A value assigned to each chain within a CoC chart structure that defines the relative storage capacity of a chain within the namespace. For example, a chain weight=150 has 50% more capacity than a chain weight=100. -+ Cluster map epoch: The version number assigned to a cluster map. ++ CoC chart epoch: The version number assigned to a CoC chart. * Overview of administrative life cycles -** Cluster administrative life cycle +** Cluster-of-clusters (CoC) administrative life cycle -+ Cluster is first created -+ Adds namespaces (e.g.
consistency policy + chain length policy) to - the cluster -+ Chains are added to/removed from a namespace to increase/decrease the ++ CoC is first created ++ CoC adds namespaces (e.g. consistency policy + chain length policy) ++ CoC adds/removes chains to a namespace to increase/decrease the namespace's storage capacity. -+ Adjust chain weights within a namespace, e.g., to shift files ++ CoC adjusts chain weights within a namespace, e.g., to shift files within the namespace to chains with greater storage capacity resources and/or runtime I/O resources. -A cluster "file migration" is the process of moving files from one +A CoC "file migration" is the process of moving files from one namespace member chain to another for purposes of shifting & re-balancing storage capacity and/or runtime I/O capacity. @@ -152,7 +155,7 @@ described in this section. As described at the top of http://basho.github.io/machi/edoc/machi_lifecycle_mgr.html, the "rc.d" config files do not manage "policy". "Policy" is doing the right -thing with a Machi cluster from a systems administrator's +thing with a Machi cluster-of-clusters from a systems administrator's point of view. The "rc.d" config files can only implement decisions made according to policy. diff --git a/doc/humming-consensus-demo.md b/doc/humming-consensus-demo.md deleted file mode 100644 index f92858f..0000000 --- a/doc/humming-consensus-demo.md +++ /dev/null @@ -1,372 +0,0 @@ - -# Table of contents - -* [Hand-on experiments with Machi and Humming Consensus](#hands-on) -* [Using the network partition simulator and convergence demo test code](#partition-simulator) - - -# Hand-on experiments with Machi and Humming Consensus - -## Prerequisites - -Please refer to the -[Machi development environment prerequisites doc](./dev-prerequisites.md) -for Machi developer environment prerequisites. 
- -If you do not have an Erlang/OTP runtime system available, but you do -have [the Vagrant virtual machine](https://www.vagrantup.com/) manager -available, then please refer to the instructions in the prerequisites -doc for using Vagrant. - - -## Clone and compile the code - -Please see the -[Machi 'clone and compile' doc](./dev-clone-compile.md) -for the short list of steps required to fetch the Machi source code -from GitHub and to compile & test Machi. - -## Running three Machi instances on a single machine - -All of the commands that should be run at your login shell (e.g. Bash, -c-shell) can be cut-and-pasted from this document directly to your -login shell prompt. - -Run the following command: - - make stagedevrel - -This will create a directory structure like this: - - |-dev1-|... stand-alone Machi app + subdirectories - |-dev-|-dev2-|... stand-alone Machi app + directories - |-dev3-|... stand-alone Machi app + directories - -Each of the `dev/dev1`, `dev/dev2`, and `dev/dev3` are stand-alone -application instances of Machi and can be run independently of each -other on the same machine. This demo will use all three. - -The lifecycle management utilities for Machi are a bit immature, -currently. They assume that each Machi server runs on a host with a -unique hostname -- there is no flexibility built-in yet to easily run -multiple Machi instances on the same machine. To continue with the -demo, we need to use `sudo` or `su` to obtain superuser privileges to -edit the `/etc/hosts` file. - -Please add the following line to `/etc/hosts`, using this command: - - sudo sh -c 'echo "127.0.0.1 machi1 machi2 machi3" >> /etc/hosts' - -Next, we will use a shell script to finish setting up our cluster. It -will do the following for us: - -* Verify that the new line that was added to `/etc/hosts` is correct. -* Modify the `etc/app.config` files to configure the Humming Consensus - chain manager's actions logged to the `log/console.log` file. 
-* Start the three application instances. -* Verify that the three instances are running correctly. -* Configure a single chain, with one FLU server per application - instance. - -Please run this script using this command: - - ./priv/humming-consensus-demo.setup.sh - -If the output looks like this (and exits with status zero), then the -script was successful. - - Step: Verify that the required entries in /etc/hosts are present - Step: add a verbose logging option to app.config - Step: start three three Machi application instances - pong - pong - pong - Step: configure one chain to start a Humming Consensus group with three members - Result: ok - Result: ok - Result: ok - -We have now created a single replica chain, called `c1`, that has -three file servers participating in the chain. Thanks to the -hostnames that we added to `/etc/hosts`, all are using the localhost -network interface. - - | App instance | Pseudo | FLU name | TCP port | - | directory | Hostname | | number | - |--------------+----------+----------+----------| - | dev1 | machi1 | flu1 | 20401 | - | dev2 | machi2 | flu2 | 20402 | - | dev3 | machi3 | flu3 | 20403 | - -The log files for each application instance can be found in the -`./dev/devN/log/console.log` file, where the `N` is the instance -number: 1, 2, or 3. - -## Understanding the chain manager's log file output - -After running the `./priv/humming-consensus-demo.setup.sh` script, -let's look at the last few lines of the `./dev/dev1/log/console.log` -log file for Erlang VM process #1. 
- - 2016-03-09 10:16:35.676 [info] <0.105.0>@machi_lifecycle_mgr:process_pending_flu:422 Started FLU f1 with supervisor pid <0.128.0> - 2016-03-09 10:16:35.676 [info] <0.105.0>@machi_lifecycle_mgr:move_to_flu_config:540 Creating FLU config file f1 - 2016-03-09 10:16:35.790 [info] <0.105.0>@machi_lifecycle_mgr:bootstrap_chain2:312 Configured chain c1 via FLU f1 to mode=ap_mode all=[f1,f2,f3] witnesses=[] - 2016-03-09 10:16:35.790 [info] <0.105.0>@machi_lifecycle_mgr:move_to_chain_config:546 Creating chain config file c1 - 2016-03-09 10:16:44.139 [info] <0.132.0> CONFIRM epoch 1141 <<155,42,7,221>> upi [] rep [] auth f1 by f1 - 2016-03-09 10:16:44.271 [info] <0.132.0> CONFIRM epoch 1148 <<57,213,154,16>> upi [f1] rep [] auth f1 by f1 - 2016-03-09 10:16:44.864 [info] <0.132.0> CONFIRM epoch 1151 <<239,29,39,70>> upi [f1] rep [f3] auth f1 by f1 - 2016-03-09 10:16:45.235 [info] <0.132.0> CONFIRM epoch 1152 <<173,17,66,225>> upi [f2] rep [f1,f3] auth f2 by f1 - 2016-03-09 10:16:47.343 [info] <0.132.0> CONFIRM epoch 1154 <<154,231,224,149>> upi [f2,f1,f3] rep [] auth f2 by f1 - -Let's pick apart some of these lines. We have started all three -servers at about the same time. We see some race conditions happen, -and some jostling and readjustment happens pretty quickly in the first -few seconds. - -* `Started FLU f1 with supervisor pid <0.128.0>` - * This VM, #1, - started a FLU (Machi data server) with the name `f1`. In the Erlang - process supervisor hierarchy, the process ID of the top supervisor - is `<0.128.0>`. -* `Configured chain c1 via FLU f1 to mode=ap_mode all=[f1,f2,f3] witnesses=[]` - * A bootstrap configuration for a chain named `c1` has been created. - * The FLUs/data servers that are eligible for participation in the - chain have names `f1`, `f2`, and `f3`. - * The chain will operate in eventual consistency mode (`ap_mode`) - * The witness server list is empty. Witness servers are never used - in eventual consistency mode. 
-* `CONFIRM epoch 1141 <<155,42,7,221>> upi [] rep [] auth f1 by f1` - * All participants in epoch 1141 are unanimous in adopting epoch - 1141's projection. All active membership lists are empty, so - there is no functional chain replication yet, at least as far as - server `f1` knows - * The epoch's abbreviated checksum is `<<155,42,7,221>>`. - * The UPI list, i.e. the replicas whose data is 100% in sync is - `[]`, the empty list. (UPI = Update Propagation Invariant) - * The list of servers that are under data repair (`rep`) is also - empty, `[]`. - * This projection was authored by server `f1`. - * The log message was generated by server `f1`. -* `CONFIRM epoch 1148 <<57,213,154,16>> upi [f1] rep [] auth f1 by f1` - * Now the server `f1` has created a chain of length 1, `[f1]`. - * Chain repair/file re-sync is not required when the UPI server list - changes from length 0 -> 1. -* `CONFIRM epoch 1151 <<239,29,39,70>> upi [f1] rep [f3] auth f1 by f1` - * Server `f1` has noticed that server `f3` is alive. Apparently it - has not yet noticed that server `f2` is also running. - * Server `f3` is in the repair list. -* `CONFIRM epoch 1152 <<173,17,66,225>> upi [f2] rep [f1,f3] auth f2 by f1` - * Server `f2` is apparently now aware that all three servers are running. - * The previous configuration used by `f2` was `upi [f2]`, i.e., `f2` - was running in a chain of one. `f2` noticed that `f1` and `f3` - were now available and has started adding them to the chain. - * All new servers are always added to the tail of the chain in the - repair list. - * In eventual consistency mode, a UPI change like this is OK. - * When performing a read, a client must read from both tail of the - UPI list and also from all repairing servers. - * When performing a write, the client writes to both the UPI - server list and also the repairing list, in that order. - * I.e., the client concatenates both lists, - `UPI ++ Repairing`, for its chain configuration for the write. 
- * Server `f2` will trigger file repair/re-sync shortly. - * The waiting time for starting repair has been configured to be - extremely short, 1 second. The default waiting time is 10 - seconds, in case Humming Consensus remains unstable. -* `CONFIRM epoch 1154 <<154,231,224,149>> upi [f2,f1,f3] rep [] auth f2 by f1` - * File repair/re-sync has finished. All file data on all servers - are now in sync. - * The UPI/in-sync part of the chain is now `[f2,f1,f3]`, and there - are no servers under repair. - -## Let's create some failures - -Here are some suggestions for creating failures. - -* Use the `./dev/devN/bin/machi stop` and `./dev/devN/bin/machi start` - commands to stop & start VM #`N`. -* Stop a VM abnormally by using `kill`. The OS process name to look - for is `beam.smp`. -* Suspend and resume a VM, using the `SIGSTOP` and `SIGCONT` signals. - * E.g. `kill -STOP 9823` and `kill -CONT 9823` - -The network partition simulator is not (yet) available when running -Machi in this mode. Please see the next section for instructions on -how to use partition simulator. - - - -# Using the network partition simulator and convergence demo test code - -This is the demo code mentioned in the presentation that Scott Lystig -Fritchie gave at the -[RICON 2015 conference](http://ricon.io). -* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf) -* [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q) - -## A complete example of all input and output - -If you don't have an Erlang/OTP 17 runtime environment available, -please see this file for full input and output of a strong consistency -length=3 chain test: -https://gist.github.com/slfritchie/8352efc88cc18e62c72c -This file contains all commands input and all simulator output from a -sample run of the simulator. - -To help interpret the output of the test, please skip ahead to the -"The test output is very verbose" section. 
- -## Prerequisites - -If you don't have `git` and/or the Erlang 17 runtime system available -on your OS X, FreeBSD, Linux, or Solaris machine, please take a look -at the [Prerequisites section](#prerequisites) first. When you have -installed the prerequisite software, please return back here. - -## Clone and compile the code - -Please briefly visit the [Clone and compile the code](#clone-compile) -section. When finished, please return back here. - -## Run an interactive Erlang CLI shell - -Run the following command at your login shell: - - erl -pz .eunit ebin deps/*/ebin - -If you are using Erlang/OTP version 17, you should see some CLI output -that looks like this: - - Erlang/OTP 17 [erts-6.4] [source] [64-bit] [smp:8:8] [async-threads:10] [hipe] [kernel-poll:false] [dtrace] - - Eshell V6.4 (abort with ^G) - 1> - -## The test output is very verbose ... what are the important parts? - -The output of the Erlang command -`machi_chain_manager1_converge_demo:help()` will display the following -guide to the output of the tests. - - A visualization of the convergence behavior of the chain self-management - algorithm for Machi. - - 1. Set up some server and chain manager pairs. - 2. Create a number of different network partition scenarios, where - (simulated) partitions may be symmetric or asymmetric. Then stop changing - the partitions and keep the simulated network stable (and perhaps broken). - 3. Run a number of iterations of the algorithm in parallel by poking each - of the manager processes on a random'ish basis. - 4. Afterward, fetch the chain transition changes made by each FLU and - verify that no transition was unsafe. - - During the iteration periods, the following is a cheatsheet for the output. - See the internal source for interpreting the rest of the output. - - 'SET partitions = ' - - A pair-wise list of actors which cannot send messages. The - list is uni-directional. 
If there are three servers (a,b,c), - and if the partitions list is '[{a,b},{b,c}]' then all - messages from a->b and b->c will be dropped, but any other - sender->recipient messages will be delivered successfully. - - 'x uses:' - - The FLU x has made an internal state transition and is using - this epoch's projection as operating chain configuration. The - rest of the line is a summary of the projection. - - 'CONFIRM epoch {N}' - - This message confirms that all of the servers listed in the - UPI and repairing lists of the projection at epoch {N} have - agreed to use this projection because they all have written - this projection to their respective private projection stores. - The chain is now usable by/available to all clients. - - 'Sweet, private projections are stable' - - This report announces that this iteration of the test cycle - has passed successfully. The report that follows briefly - summarizes the latest private projection used by each - participating server. For example, when in strong consistency - mode with 'a' as a witness and 'b' and 'c' as real servers: - - %% Legend: - %% server name, epoch ID, UPI list, repairing list, down list, ... - %% ... witness list, 'false' (a constant value) - - [{a,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}, - {b,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}] - - Both servers 'a' and 'b' agree on epoch 1116 with epoch ID - {1116,<<23,143,246,55>>} where UPI=[a,b], repairing=[], - down=[c], and witnesses=[a]. - - Server 'c' is not shown because 'c' has wedged itself OOS (out - of service) by configuring a chain length of zero. - - If no servers are listed in the report (i.e. only '[]' is - displayed), then all servers have wedged themselves OOS, and - the chain is unavailable. - - 'DoIt,' - - This marks a group of tick events which trigger the manager - processes to evaluate their environment and perhaps make a - state transition. 
- - A long chain of 'DoIt,DoIt,DoIt,' means that the chain state has - (probably) settled to a stable configuration, which is the goal of the - algorithm. - - Press control-c to interrupt the test....". - -## Run a test in eventual consistency mode - -Run the following command at the Erlang CLI prompt: - - machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}]). - -The first argument, `3`, is the number of servers to participate in -the chain. Please note: - -* Chain lengths as short as 1 or 2 are valid, but the results are a - bit boring. -* Chain lengths as long as 7 or 9 can be used, but they may - suffer from longer periods of churn/instability before all chain - managers reach agreement via humming consensus. (It is future work - to shorten the worst of the unstable churn latencies.) -* In eventual consistency mode, chain lengths may be even numbers, - e.g. 2, 4, or 6. -* The simulator will choose partition events from the permutations of - all 1, 2, and 3 node partition pairs. The total runtime will - increase *dramatically* with chain length. - * Chain length 2: about 3 partition cases - * Chain length 3: about 35 partition cases - * Chain length 4: about 230 partition cases - * Chain length 5: about 1100 partition cases - -## Run a test in strong consistency mode (with witnesses): - -*NOTE:* Due to a bug in the test code, please do not try to run the - convergence test in strong consistency mode and also without the - correct minority number of witness servers! If in doubt, please run - the commands shown below exactly. - -Run the following command at the Erlang CLI prompt: - - machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]). - -The first argument, `3`, is the number of servers to participate in -the chain. 
Chain lengths as long as 7 or 9 can be used, but they may -suffer from longer periods of churn/instability before all chain -managers reach agreement via humming consensus. - -Due to the bug mentioned above, please use the following -commands when running with chain lengths of 5 or 7, respectively. - - machi_chain_manager1_converge_demo:t(5, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b]}]). - machi_chain_manager1_converge_demo:t(7, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b,c]}]). - diff --git a/doc/machi_chain_manager1_converge_demo.md b/doc/machi_chain_manager1_converge_demo.md new file mode 100644 index 0000000..2844bfa --- /dev/null +++ b/doc/machi_chain_manager1_converge_demo.md @@ -0,0 +1,185 @@ + +# Using the network partition simulator and convergence demo test code + +## A complete example of all input and output + +If you don't have an Erlang/OTP 17 runtime environment available, +please see this file for full input and output of a strong consistency +length=3 chain test: +https://gist.github.com/slfritchie/8352efc88cc18e62c72c +This file contains all commands input and all simulator output from a +sample run of the simulator. + +To help interpret the output of the test, please skip ahead to the +"The test output is very verbose" section. + +## Prerequisites + +1. You'll need the `git` source management utility. +2. You'll need the Erlang/OTP 17 runtime environment. Please don't + use earlier or later versions until we have a chance to fix the + compilation warnings that versions R16B and 18 will trigger. + +All of the commands that should be run at your login shell (e.g. Bash, +c-shell) can be cut-and-pasted from this document directly to your +login shell prompt. + +## Clone and compile the code + +Clone the Machi source repo and compile the source and test code.
Run +the following commands at your login shell: + + cd /tmp + git clone https://github.com/basho/machi.git + cd machi + git checkout master + make + +Then run the unit test suite. This may take up to two minutes or so +to finish. Most of the tests will be silent; please be patient until +the tests finish. + + make test + +## Run an interactive Erlang CLI shell + +Run the following command at your login shell: + + erl -pz .eunit ebin deps/*/ebin + +If you are using Erlang/OTP version 17, you should see some CLI output +that looks like this: + + Erlang/OTP 17 [erts-6.4] [source] [64-bit] [smp:8:8] [async-threads:10] [hipe] [kernel-poll:false] [dtrace] + + Eshell V6.4 (abort with ^G) + 1> + +## The test output is very verbose ... what are the important parts? + +The output of the Erlang command +`machi_chain_manager1_converge_demo:help()` will display the following +guide to the output of the tests. + + A visualization of the convergence behavior of the chain self-management + algorithm for Machi. + + 1. Set up some server and chain manager pairs. + 2. Create a number of different network partition scenarios, where + (simulated) partitions may be symmetric or asymmetric. Then stop changing + the partitions and keep the simulated network stable (and perhaps broken). + 3. Run a number of iterations of the algorithm in parallel by poking each + of the manager processes on a random'ish basis. + 4. Afterward, fetch the chain transition changes made by each FLU and + verify that no transition was unsafe. + + During the iteration periods, the following is a cheatsheet for the output. + See the internal source for interpreting the rest of the output. + + 'SET partitions = ' + + A pair-wise list of actors which cannot send messages. The + list is uni-directional. If there are three servers (a,b,c), + and if the partitions list is '[{a,b},{b,c}]' then all + messages from a->b and b->c will be dropped, but any other + sender->recipient messages will be delivered successfully. 
+ + 'x uses:' + + The FLU x has made an internal state transition and is using + this epoch's projection as operating chain configuration. The + rest of the line is a summary of the projection. + + 'CONFIRM epoch {N}' + + This message confirms that all of the servers listed in the + UPI and repairing lists of the projection at epoch {N} have + agreed to use this projection because they all have written + this projection to their respective private projection stores. + The chain is now usable by/available to all clients. + + 'Sweet, private projections are stable' + + This report announces that this iteration of the test cycle + has passed successfully. The report that follows briefly + summarizes the latest private projection used by each + participating server. For example, when in strong consistency + mode with 'a' as a witness and 'b' and 'c' as real servers: + + %% Legend: + %% server name, epoch ID, UPI list, repairing list, down list, ... + %% ... witness list, 'false' (a constant value) + + [{a,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}, + {b,{{1116,<<23,143,246,55>>},[a,b],[],[c],[a],false}}] + + Both servers 'a' and 'b' agree on epoch 1116 with epoch ID + {1116,<<23,143,246,55>>} where UPI=[a,b], repairing=[], + down=[c], and witnesses=[a]. + + Server 'c' is not shown because 'c' has wedged itself OOS (out + of service) by configuring a chain length of zero. + + If no servers are listed in the report (i.e. only '[]' is + displayed), then all servers have wedged themselves OOS, and + the chain is unavailable. + + 'DoIt,' + + This marks a group of tick events which trigger the manager + processes to evaluate their environment and perhaps make a + state transition. + + A long chain of 'DoIt,DoIt,DoIt,' means that the chain state has + (probably) settled to a stable configuration, which is the goal of the + algorithm. + + Press control-c to interrupt the test....
+ +## Run a test in eventual consistency mode + +Run the following command at the Erlang CLI prompt: + + machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}]). + +The first argument, `3`, is the number of servers to participate in +the chain. Please note: + +* Chain lengths as short as 1 or 2 are valid, but the results are a + bit boring. +* Chain lengths as long as 7 or 9 can be used, but they may + suffer from longer periods of churn/instability before all chain + managers reach agreement via humming consensus. (It is future work + to shorten the worst of the unstable churn latencies.) +* In eventual consistency mode, chain lengths may be even numbers, + e.g. 2, 4, or 6. +* The simulator will choose partition events from the permutations of + all 1, 2, and 3 node partition pairs. The total runtime will + increase *dramatically* with chain length. + * Chain length 2: about 3 partition cases + * Chain length 3: about 35 partition cases + * Chain length 4: about 230 partition cases + * Chain length 5: about 1100 partition cases + +## Run a test in strong consistency mode (with witnesses): + +*NOTE:* Due to a bug in the test code, please do not try to run the + convergence test in strong consistency mode and also without the + correct minority number of witness servers! If in doubt, please run + the commands shown below exactly. + +Run the following command at the Erlang CLI prompt: + + machi_chain_manager1_converge_demo:t(3, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a]}]). + +The first argument, `3`, is the number of servers to participate in +the chain. Chain lengths as long as 7 or 9 can be used, but they may +suffer from longer periods of churn/instability before all chain +managers reach agreement via humming consensus. + +Due to the bug mentioned above, please use the following +commands when running with chain lengths of 5 or 7, respectively. 
+ + machi_chain_manager1_converge_demo:t(5, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b]}]). + machi_chain_manager1_converge_demo:t(7, [{private_write_verbose,true}, {consistency_mode, cp_mode}, {witnesses, [a,b,c]}]). + diff --git a/doc/process-protocol-module-overview.jpg b/doc/process-protocol-module-overview.jpg deleted file mode 100644 index eb7accf..0000000 Binary files a/doc/process-protocol-module-overview.jpg and /dev/null differ diff --git a/include/machi.hrl b/include/machi.hrl index 7974fd2..c15133a 100644 --- a/include/machi.hrl +++ b/include/machi.hrl @@ -21,7 +21,7 @@ %% @doc Now 4GiBytes, could be up to 64bit due to PB message limit of %% chunk size -define(DEFAULT_MAX_FILE_SIZE, ((1 bsl 32) - 1)). --define(MINIMUM_OFFSET, 1024). +-define(MINIMUM_OFFSET, 0). %% 0th draft of checksum typing with 1st byte. -define(CSUM_TAG_NONE, 0). % No csum provided by client @@ -43,21 +43,3 @@ -define(DEFAULT_COC_NAMESPACE, ""). -define(DEFAULT_COC_LOCATOR, 0). - --record(ns_info, { - version = 0 :: machi_dt:namespace_version(), - name = <<>> :: machi_dt:namespace(), - locator = 0 :: machi_dt:locator() - }). - --record(append_opts, { - chunk_extra = 0 :: machi_dt:chunk_size(), - preferred_file_name :: 'undefined' | machi_dt:file_name_s(), - flag_fail_preferred = false :: boolean() - }). - --record(read_opts, { - no_checksum = false :: boolean(), - no_chunk = false :: boolean(), - needs_trimmed = false :: boolean() - }). diff --git a/priv/humming-consensus-demo.setup.sh b/priv/humming-consensus-demo.setup.sh deleted file mode 100755 index dc57731..0000000 --- a/priv/humming-consensus-demo.setup.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/sh - -echo "Step: Verify that the required entries in /etc/hosts are present" -for i in 1 2 3; do - grep machi$i /etc/hosts | egrep -s '^127.0.0.1' > /dev/null 2>&1 - if [ $? -ne 0 ]; then - echo "" - echo "'grep -s machi$i' failed. Aborting, sorry." 
- exit 1 - fi - ping -c 1 machi$i > /dev/null 2>&1 - if [ $? -ne 0 ]; then - echo "" - echo "Ping attempt on host machi$i failed. Aborting." - echo "" - ping -c 1 machi$i - exit 1 - fi -done - -echo "Step: add a verbose logging option to app.config" -for i in 1 2 3; do - ed ./dev/dev$i/etc/app.config < /dev/null 2>&1 -/verbose_confirm -a -{chain_manager_opts, [{private_write_verbose_confirm,true}]}, -{stability_time, 1}, -. -w -q -EOF -done - -echo "Step: start three three Machi application instances" -for i in 1 2 3; do - ./dev/dev$i/bin/machi start - ./dev/dev$i/bin/machi ping - if [ $? -ne 0 ]; then - echo "Sorry, a 'ping' check for instance dev$i failed. Aborting." - exit 1 - fi -done - -echo "Step: configure one chain to start a Humming Consensus group with three members" - -# Note: $CWD of each Machi proc is two levels below the source code root dir. -LIFECYCLE000=../../priv/quick-admin-examples/demo-000 -for i in 3 2 1; do - ./dev/dev$i/bin/machi-admin quick-admin-apply $LIFECYCLE000 machi$i - if [ $? -ne 0 ]; then - echo "Sorry, 'machi-admin quick-admin-apply failed' on machi$i. Aborting." - exit 1 - fi -done - -exit 0 diff --git a/priv/humming-consensus-demo.vagrant/Vagrantfile b/priv/humming-consensus-demo.vagrant/Vagrantfile deleted file mode 100644 index ce0474d..0000000 --- a/priv/humming-consensus-demo.vagrant/Vagrantfile +++ /dev/null @@ -1,93 +0,0 @@ -# -*- mode: ruby -*- -# vi: set ft=ruby : - -# All Vagrant configuration is done below. The "2" in Vagrant.configure -# configures the configuration version (we support older styles for -# backwards compatibility). Please don't change it unless you know what -# you're doing. -Vagrant.configure(2) do |config| - # The most common configuration options are documented and commented below. - # For a complete reference, please see the online documentation at - # https://docs.vagrantup.com. - - # Every Vagrant development environment requires a box. 
You can search for - # boxes at https://atlas.hashicorp.com/search. - # If this Vagrant box has not been downloaded before (e.g. using "vagrant box add"), - # then Vagrant will automatically download the VM image from HashiCorp. - config.vm.box = "hashicorp/precise64" - # If using a FreeBSD box, Bash may not be installed. - # Use the config.ssh.shell setting to specify an alternate shell. - # Note, however, that any code in the 'config.vm.provision' section - # would then have to use this shell's syntax! - # config.ssh.shell = "/bin/csh -l" - - # Disable automatic box update checking. If you disable this, then - # boxes will only be checked for updates when the user runs - # `vagrant box outdated`. This is not recommended. - # config.vm.box_check_update = false - - # Create a forwarded port mapping which allows access to a specific port - # within the machine from a port on the host machine. In the example below, - # accessing "localhost:8080" will access port 80 on the guest machine. - # config.vm.network "forwarded_port", guest: 80, host: 8080 - - # Create a private network, which allows host-only access to the machine - # using a specific IP. - # config.vm.network "private_network", ip: "192.168.33.10" - - # Create a public network, which generally matched to bridged network. - # Bridged networks make the machine appear as another physical device on - # your network. - # config.vm.network "public_network" - - # Share an additional folder to the guest VM. The first argument is - # the path on the host to the actual folder. The second argument is - # the path on the guest to mount the folder. And the optional third - # argument is a set of non-required options. - # config.vm.synced_folder "../data", "/vagrant_data" - - # Provider-specific configuration so you can fine-tune various - # backing providers for Vagrant. These expose provider-specific options. 
- # Example for VirtualBox: - # - config.vm.provider "virtualbox" do |vb| - # Display the VirtualBox GUI when booting the machine - # vb.gui = true - - # Customize the amount of memory on the VM: - vb.memory = "512" - end - # - # View the documentation for the provider you are using for more - # information on available options. - - # Define a Vagrant Push strategy for pushing to Atlas. Other push strategies - # such as FTP and Heroku are also available. See the documentation at - # https://docs.vagrantup.com/v2/push/atlas.html for more information. - # config.push.define "atlas" do |push| - # push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME" - # end - - # Enable provisioning with a shell script. Additional provisioners such as - # Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the - # documentation for more information about their specific syntax and use. - config.vm.provision "shell", inline: <<-SHELL - # Install prerequsites - # Support here for FreeBSD is experimental - apt-get update ; sudo apt-get install -y git sudo rsync ; # Ubuntu Linux - env ASSUME_ALWAYS_YES=yes pkg install -f git sudo rsync ; # FreeBSD 10 - - # Install dependent packages, using slf-configurator - git clone https://github.com/slfritchie/slf-configurator.git - chown -R vagrant ./slf-configurator - (cd slf-configurator ; sudo sh -x ./ALL.sh) - echo 'export PATH=${PATH}:/usr/local/erlang/17.5/bin' >> ~vagrant/.bashrc - export PATH=${PATH}:/usr/local/erlang/17.5/bin - ## echo 'set path = ( $path /usr/local/erlang/17.5/bin )' >> ~vagrant/.cshrc - ## setenv PATH /usr/local/erlang/17.5/bin:$PATH - - git clone https://github.com/basho/machi.git - (cd machi ; git checkout master ; make && make test ) - chown -R vagrant ./machi - SHELL -end diff --git a/priv/make-faq.pl b/priv/make-faq.pl index b7a3089..7edee07 100755 --- a/priv/make-faq.pl +++ b/priv/make-faq.pl @@ -36,7 +36,7 @@ while () { $indent = " " x ($count * 4); s/^#*\s*[0-9. 
]*//; $anchor = "n$label"; - printf T1 "%s+ [%s. %s](#%s)\n", $indent, $label, $_, $anchor; + printf T1 "%s+ [%s %s](#%s)\n", $indent, $label, $_, $anchor; printf T2 "\n", $anchor; $line =~ s/(#+)\s*[0-9. ]*/$1 $label. /; print T2 $line; diff --git a/priv/quick-admin-examples/demo-000 b/priv/quick-admin-examples/demo-000 deleted file mode 100644 index 301f348..0000000 --- a/priv/quick-admin-examples/demo-000 +++ /dev/null @@ -1,7 +0,0 @@ -{host, "machi1", []}. -{host, "machi2", []}. -{host, "machi3", []}. -{flu,f1,"machi1",20401,[]}. -{flu,f2,"machi2",20402,[]}. -{flu,f3,"machi3",20403,[]}. -{chain,c1,[f1,f2,f3],[]}. diff --git a/rel/files/app.config b/rel/files/app.config index a2c55ee..eb330f3 100644 --- a/rel/files/app.config +++ b/rel/files/app.config @@ -16,10 +16,6 @@ %% Default = 10 %% {metadata_manager_count, 2}, - %% Default options for chain manager processes. - %% {chain_manager_opts, [{private_write_verbose,true}, - %% {private_write_verbose_confirm,true}]}, - %% Platform vars (mirror of reltool packaging) {platform_data_dir, "{{platform_data_dir}}"}, {platform_etc_dir, "{{platform_etc_dir}}"}, diff --git a/rel/gen_dev b/rel/gen_dev deleted file mode 100755 index 1b8ce1b..0000000 --- a/rel/gen_dev +++ /dev/null @@ -1,16 +0,0 @@ -#! 
/bin/sh -# -# Example usage: gen_dev dev4 vars.src vars -# -# Generate an overlay config for devNNN from vars.src and write to vars -# - -NAME=$1 -TEMPLATE=$2 -VARFILE=$3 - -NODE="$NAME@127.0.0.1" - -echo "Generating $NAME - node='$NODE'" -sed -e "s/@NODE@/$NODE/" \ - < $TEMPLATE > $VARFILE diff --git a/rel/reltool.config b/rel/reltool.config index eb015be..33df951 100644 --- a/rel/reltool.config +++ b/rel/reltool.config @@ -106,7 +106,6 @@ {copy, "../priv/quick-admin-examples/000", "priv/quick-admin-examples"}, {copy, "../priv/quick-admin-examples/001", "priv/quick-admin-examples"}, {copy, "../priv/quick-admin-examples/002", "priv/quick-admin-examples"}, - {copy, "../priv/quick-admin-examples/demo-000", "priv/quick-admin-examples/demo-000"}, {mkdir, "lib/basho-patches"} %% {copy, "../apps/machi/ebin/etop_txt.beam", "lib/basho-patches"} diff --git a/rel/vars.config b/rel/vars.config index b1bb405..06b3aa0 100644 --- a/rel/vars.config +++ b/rel/vars.config @@ -1,9 +1,6 @@ %% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*- %% ex: ft=erlang ts=4 sw=4 et -%% NOTE: When modifying this file, also keep its near cousin -%% config file rel/vars/dev_vars.config.src in sync! - %% Platform-specific installation paths {platform_bin_dir, "./bin"}. {platform_data_dir, "./data"}. diff --git a/rel/vars/dev_vars.config.src b/rel/vars/dev_vars.config.src deleted file mode 100644 index a5a3828..0000000 --- a/rel/vars/dev_vars.config.src +++ /dev/null @@ -1,48 +0,0 @@ -%% -*- mode: erlang;erlang-indent-level: 4;indent-tabs-mode: nil -*- -%% ex: ft=erlang ts=4 sw=4 et - -%% NOTE: When modifying this file, also keep its near cousin -%% config file rel/vars/dev_vars.config.src in sync! - -%% Platform-specific installation paths -{platform_bin_dir, "./bin"}. -{platform_data_dir, "./data"}. -{platform_etc_dir, "./etc"}. -{platform_lib_dir, "./lib"}. -{platform_log_dir, "./log"}. - -%% -%% etc/app.config -%% -{sasl_error_log, "{{platform_log_dir}}/sasl-error.log"}. 
-{sasl_log_dir, "{{platform_log_dir}}/sasl"}. - -%% lager -{console_log_default, file}. - -%% -%% etc/vm.args -%% -{node, "@NODE@"}. -{crash_dump, "{{platform_log_dir}}/erl_crash.dump"}. - -%% -%% bin/machi -%% -{runner_script_dir, "\`cd \\`dirname $0\\` 1>/dev/null && /bin/pwd\`"}. -{runner_base_dir, "{{runner_script_dir}}/.."}. -{runner_etc_dir, "$RUNNER_BASE_DIR/etc"}. -{runner_log_dir, "$RUNNER_BASE_DIR/log"}. -{runner_lib_dir, "$RUNNER_BASE_DIR/lib"}. -{runner_patch_dir, "$RUNNER_BASE_DIR/lib/basho-patches"}. -{pipe_dir, "/tmp/$RUNNER_BASE_DIR/"}. -{runner_user, ""}. -{runner_wait_process, "machi_flu_sup"}. -{runner_ulimit_warn, 65536}. - -%% -%% cuttlefish -%% -{cuttlefish, ""}. % blank = off -{cuttlefish_conf, "machi.conf"}. - diff --git a/src/machi.app.src b/src/machi.app.src index a9f96f0..c26154f 100644 --- a/src/machi.app.src +++ b/src/machi.app.src @@ -1,7 +1,7 @@ {application, machi, [ {description, "A village of write-once files."}, - {vsn, "0.0.1"}, - {applications, [kernel, stdlib, crypto, cluster_info, ranch]}, + {vsn, "0.0.0"}, + {applications, [kernel, stdlib, crypto, cluster_info]}, {mod,{machi_app,[]}}, {registered, []}, {env, [ diff --git a/src/machi.proto b/src/machi.proto index a9ac513..2645bde 100644 --- a/src/machi.proto +++ b/src/machi.proto @@ -170,18 +170,12 @@ message Mpb_AuthResp { // High level API: append_chunk() request & response message Mpb_AppendChunkReq { - // General namespace arguments - /* In single chain/non-clustered environment, use namespace="" */ - required string namespace = 1; - - required string prefix = 10; - required bytes chunk = 11; - required Mpb_ChunkCSum csum = 12; - - optional uint32 chunk_extra = 20; - optional string preferred_file_name = 21; - /* Fail the operation if our preferred file name is not available */ - optional bool flag_fail_preferred = 22 [default=false]; + required string coc_namespace = 1; + required uint32 coc_locator = 2; + required string prefix = 3; + required bytes chunk = 4; + required 
Mpb_ChunkCSum csum = 5; + optional uint32 chunk_extra = 6; } message Mpb_AppendChunkResp { @@ -193,7 +187,7 @@ message Mpb_AppendChunkResp { // High level API: write_chunk() request & response message Mpb_WriteChunkReq { - required Mpb_Chunk chunk = 10; + required Mpb_Chunk chunk = 1; } message Mpb_WriteChunkResp { @@ -203,22 +197,19 @@ message Mpb_WriteChunkResp { // High level API: read_chunk() request & response message Mpb_ReadChunkReq { - // No namespace arguments are required because NS is embedded - // inside of the file name. - - required Mpb_ChunkPos chunk_pos = 10; + required Mpb_ChunkPos chunk_pos = 1; // Use flag_no_checksum=non-zero to skip returning the chunk's checksum. // TODO: not implemented yet. - optional bool flag_no_checksum = 20 [default=false]; + optional uint32 flag_no_checksum = 2 [default=0]; // Use flag_no_chunk=non-zero to skip returning the chunk (which // only makes sense if flag_no_checksum is not set). // TODO: not implemented yet. - optional bool flag_no_chunk = 21 [default=false]; + optional uint32 flag_no_chunk = 3 [default=0]; // TODO: not implemented yet. - optional bool flag_needs_trimmed = 22 [default=false]; + optional uint32 flag_needs_trimmed = 4 [default=0]; } message Mpb_ReadChunkResp { @@ -254,8 +245,6 @@ message Mpb_ChecksumListResp { // High level API: list_files() request & response message Mpb_ListFilesReq { - // TODO: Add flag for file glob/regexp/other filter type - // TODO: What else could go wrong? 
} message Mpb_ListFilesResp { @@ -388,20 +377,14 @@ message Mpb_ProjectionV1 { // Low level API: append_chunk() message Mpb_LL_AppendChunkReq { - // General namespace arguments - required uint32 namespace_version = 1; - required string namespace = 2; - required uint32 locator = 3; - - required Mpb_EpochID epoch_id = 10; - required string prefix = 11; - required bytes chunk = 12; - required Mpb_ChunkCSum csum = 13; - - optional uint32 chunk_extra = 20; - optional string preferred_file_name = 21; - /* Fail the operation if our preferred file name is not available */ - optional bool flag_fail_preferred = 22 [default=false]; + required Mpb_EpochID epoch_id = 1; + /* To avoid CoC use, use coc_namespace="" and coc_locator=0 */ + required string coc_namespace = 2; + required uint32 coc_locator = 3; + required string prefix = 4; + required bytes chunk = 5; + required Mpb_ChunkCSum csum = 6; + optional uint32 chunk_extra = 7; } message Mpb_LL_AppendChunkResp { @@ -413,12 +396,8 @@ message Mpb_LL_AppendChunkResp { // Low level API: write_chunk() message Mpb_LL_WriteChunkReq { - // General namespace arguments - required uint32 namespace_version = 1; - required string namespace = 2; - - required Mpb_EpochID epoch_id = 10; - required Mpb_Chunk chunk = 11; + required Mpb_EpochID epoch_id = 1; + required Mpb_Chunk chunk = 2; } message Mpb_LL_WriteChunkResp { @@ -428,23 +407,19 @@ message Mpb_LL_WriteChunkResp { // Low level API: read_chunk() message Mpb_LL_ReadChunkReq { - // General namespace arguments - required uint32 namespace_version = 1; - required string namespace = 2; - - required Mpb_EpochID epoch_id = 10; - required Mpb_ChunkPos chunk_pos = 11; + required Mpb_EpochID epoch_id = 1; + required Mpb_ChunkPos chunk_pos = 2; // Use flag_no_checksum=non-zero to skip returning the chunk's checksum. // TODO: not implemented yet. 
- optional bool flag_no_checksum = 20 [default=false]; + optional uint32 flag_no_checksum = 3 [default=0]; // Use flag_no_chunk=non-zero to skip returning the chunk (which // only makes sense if flag_checksum is not set). // TODO: not implemented yet. - optional bool flag_no_chunk = 21 [default=false]; + optional uint32 flag_no_chunk = 4 [default=0]; - optional bool flag_needs_trimmed = 22 [default=false]; + optional uint32 flag_needs_trimmed = 5 [default=0]; } message Mpb_LL_ReadChunkResp { @@ -456,16 +431,11 @@ message Mpb_LL_ReadChunkResp { // Low level API: trim_chunk() message Mpb_LL_TrimChunkReq { - // General namespace arguments - required uint32 namespace_version = 1; - required string namespace = 2; - - required Mpb_EpochID epoch_id = 10; - required string file = 11; - required uint64 offset = 12; - required uint32 size = 13; - - optional bool trigger_gc = 20 [default=false]; + required Mpb_EpochID epoch_id = 1; + required string file = 2; + required uint64 offset = 3; + required uint32 size = 4; + optional uint32 trigger_gc = 5 [default=0]; } message Mpb_LL_TrimChunkResp { @@ -475,7 +445,8 @@ message Mpb_LL_TrimChunkResp { // Low level API: checksum_list() message Mpb_LL_ChecksumListReq { - required string file = 1; + required Mpb_EpochID epoch_id = 1; + required string file = 2; } message Mpb_LL_ChecksumListResp { @@ -506,9 +477,7 @@ message Mpb_LL_WedgeStatusReq { message Mpb_LL_WedgeStatusResp { required Mpb_GeneralStatusCode status = 1; optional Mpb_EpochID epoch_id = 2; - optional bool wedged_flag = 3; - optional uint32 namespace_version = 4; - optional string namespace = 5; + optional uint32 wedged_flag = 3; } // Low level API: delete_migration() diff --git a/src/machi_admin_util.erl b/src/machi_admin_util.erl index 41a4b5f..46f6c3d 100644 --- a/src/machi_admin_util.erl +++ b/src/machi_admin_util.erl @@ -90,16 +90,15 @@ verify_file_checksums_local2(Sock1, EpochID, Path0) -> end. 
verify_file_checksums_remote2(Sock1, EpochID, File) -> - NSInfo = undefined, ReadChunk = fun(File_name, Offset, Size) -> - ?FLU_C:read_chunk(Sock1, NSInfo, EpochID, - File_name, Offset, Size, undefined) + ?FLU_C:read_chunk(Sock1, EpochID, + File_name, Offset, Size, []) end, verify_file_checksums_common(Sock1, EpochID, File, ReadChunk). -verify_file_checksums_common(Sock1, _EpochID, File, ReadChunk) -> +verify_file_checksums_common(Sock1, EpochID, File, ReadChunk) -> try - case ?FLU_C:checksum_list(Sock1, File) of + case ?FLU_C:checksum_list(Sock1, EpochID, File) of {ok, InfoBin} -> Info = machi_csum_table:split_checksum_list_blob_decode(InfoBin), Res = lists:foldl(verify_chunk_checksum(File, ReadChunk), diff --git a/src/machi_basho_bench_driver.erl b/src/machi_basho_bench_driver.erl index 4adc052..4d36328 100644 --- a/src/machi_basho_bench_driver.erl +++ b/src/machi_basho_bench_driver.erl @@ -112,7 +112,7 @@ run(read, KeyGen, _ValueGen, #m{conn=Conn, max_key=MaxKey}=S) -> Idx = KeyGen() rem MaxKey, %% {File, Offset, Size, _CSum} = ets:lookup_element(?ETS_TAB, Idx, 2), {File, Offset, Size} = ets:lookup_element(?ETS_TAB, Idx, 2), - case machi_cr_client:read_chunk(Conn, File, Offset, Size, undefined, ?THE_TIMEOUT) of + case machi_cr_client:read_chunk(Conn, File, Offset, Size, [], ?THE_TIMEOUT) of {ok, _Chunk} -> {ok, S}; {error, _}=Err -> diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 66b0163..7f112d0 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -92,11 +92,8 @@ -define(REPAIR_START_STABILITY_TIME, 10). -endif. % TEST -%% Maximum length of the history of adopted projections (via C120). --define(MAX_HISTORY_LENGTH, 8). - -%% Magic constant for looping "too frequently" breaker. --define(TOO_FREQUENT_BREAKER, (?MAX_HISTORY_LENGTH+5)). +%% Magic constant for looping "too frequently" breaker. TODO revisit & revise. +-define(TOO_FREQUENT_BREAKER, 10). 
-define(RETURN2(X), begin (catch put(why2, [?LINE|get(why2)])), X end). @@ -106,6 +103,9 @@ %% Amount of epoch number skip-ahead for set_chain_members call -define(SET_CHAIN_MEMBERS_EPOCH_SKIP, 1111). +%% Maximum length of the history of adopted projections (via C120). +-define(MAX_HISTORY_LENGTH, 30). + %% API -export([start_link/2, start_link/3, stop/1, ping/1, set_chain_members/2, set_chain_members/6, set_active/2, @@ -234,13 +234,11 @@ test_read_latest_public_projection(Pid, ReadRepairP) -> %% manager's pid in MgrOpts and use direct gen_server calls to the %% local projection store. -init({MyName, InitMembersDict, MgrOpts0}) -> +init({MyName, InitMembersDict, MgrOpts}) -> put(ttt, [?LINE]), _ = random:seed(now()), init_remember_down_list(), - MgrOpts = MgrOpts0 ++ application:get_env(machi, chain_manager_opts, []), Opt = fun(Key, Default) -> proplists:get_value(Key, MgrOpts, Default) end, - InitWitness_list = Opt(witnesses, []), ZeroAll_list = [P#p_srvr.name || {_,P} <- orddict:to_list(InitMembersDict)], ZeroProj = make_none_projection(0, MyName, ZeroAll_list, @@ -390,7 +388,6 @@ handle_cast(_Cast, S) -> handle_info(tick_check_environment, #ch_mgr{ignore_timer=true}=S) -> {noreply, S}; handle_info(tick_check_environment, S) -> - gobble_ticks(), {{_Delta, Props, _Epoch}, S1} = do_react_to_env(S), S2 = sanitize_repair_state(S1), S3 = perhaps_start_repair(S2), @@ -463,7 +460,7 @@ get_my_proj_boot_info(MgrOpts, DefaultDict, DefaultProj, ProjType) -> {DefaultDict, DefaultProj}; Store -> {ok, P} = machi_projection_store:read_latest_projection(Store, - ProjType, 7789), + ProjType), {P#projection_v1.members_dict, P} end. 
@@ -840,10 +837,7 @@ calc_projection2(LastProj, RelativeToServer, AllHosed, Dbg, D_foo=[{repair_done, {repair_final_status, ok, (S#ch_mgr.proj)#projection_v1.epoch_number}}], {NewUPI_list ++ Repairing_list2, [], RunEnv2}; true -> - D_foo=[d_foo2, {sim_p,Simulator_p}, - {simr_p,SimRepair_p}, {same_epoch,SameEpoch_p}, - {rel_to,RelativeToServer}, - {repch,RepChk_LastInUPI}, {repair_fs,RepairFS}], + D_foo=[d_foo2], {NewUPI_list, OldRepairing_list, RunEnv2} end; {_ABC, _XYZ} -> @@ -1915,7 +1909,7 @@ react_to_env_C100_inner(Author_latest, NotSanesDict0, _MyName, S2 = S#ch_mgr{not_sanes=NotSanesDict, sane_transitions=0}, case orddict:fetch(Author_latest, NotSanesDict) of N when N > ?TOO_FREQUENT_BREAKER -> - ?V("\n\nYOYO ~w breaking the cycle insane-freq=~w by-author=~w of:\n current: ~w\n new : ~w\n", [_MyName, N, Author_latest, machi_projection:make_summary(S#ch_mgr.proj), machi_projection:make_summary(P_latest)]), + %% ?V("\n\nYOYO ~w breaking the cycle of:\n current: ~w\n new : ~w\n", [_MyName, machi_projection:make_summary(S#ch_mgr.proj), machi_projection:make_summary(P_latest)]), ?REACT({c100, ?LINE, [{not_sanes_author_count, N}]}), react_to_env_C103(P_newprop, P_latest, P_current_calc, S2); N -> @@ -1943,7 +1937,7 @@ react_to_env_C103(#projection_v1{epoch_number=_Epoch_newprop} = _P_newprop, ?REACT({c103, ?LINE, [{current_epoch, P_current#projection_v1.epoch_number}, {none_projection_epoch, P_none#projection_v1.epoch_number}]}), - io:format(user, "SET add_admin_down(~w) at ~w current_epoch ~w none_proj_epoch ~w =====================================\n", [MyName, time(), P_current#projection_v1.epoch_number, P_none#projection_v1.epoch_number]), + io:format(user, "SET add_admin_down(~w) at ~w =====================================\n", [MyName, time()]), machi_fitness:add_admin_down(S#ch_mgr.fitness_svr, MyName, []), timer:sleep(5*1000), io:format(user, "SET delete_admin_down(~w) at ~w =====================================\n", [MyName, time()]), @@ -1980,7 +1974,7 @@ 
react_to_env_C110(P_latest, #ch_mgr{name=MyName} = S) -> %% In contrast to the public projection store writes, Humming Consensus %% doesn't care about the status of writes to the public store: it's %% always relying only on successful reads of the public store. - case {?FLU_PC:write_projection(MyStorePid, private, P_latest2,?TO*30+66),Goo} of + case {?FLU_PC:write_projection(MyStorePid, private, P_latest2,?TO*30),Goo} of {ok, Goo} -> ?REACT({c110, [{write, ok}]}), react_to_env_C111(P_latest, P_latest2, Extra1, MyStorePid, S); @@ -2066,6 +2060,7 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H, ?REACT(c120), H2 = add_and_trunc_history(P_latest, H, ?MAX_HISTORY_LENGTH), + %% diversion_c120_verbose_goop(P_latest, S), ?REACT({c120, [{latest, machi_projection:make_summary(P_latest)}]}), S2 = set_proj(S#ch_mgr{proj_history=H2, sane_transitions=Xtns + 1}, P_latest), @@ -2073,21 +2068,20 @@ react_to_env_C120(P_latest, FinalProps, #ch_mgr{proj_history=H, false -> S2; {{_ConfEpoch, _ConfCSum}, ConfTime} -> - P_latestEpoch = P_latest#projection_v1.epoch_number, - io:format(user, "\nCONFIRM debug C120 ~w was annotated ~w\n", [S#ch_mgr.name, P_latestEpoch]), + io:format(user, "\nCONFIRM debug C120 ~w was annotated ~w\n", [S#ch_mgr.name, P_latest#projection_v1.epoch_number]), S2#ch_mgr{proj_unanimous=ConfTime} end, V = case file:read_file("/tmp/moomoo."++atom_to_list(S#ch_mgr.name)) of {ok,_} -> true; _ -> false end, if V -> io:format("C120: ~w: ~p\n", [S#ch_mgr.name, get(react)]); true -> ok end, {{now_using, FinalProps, P_latest#projection_v1.epoch_number}, S3}. 
-add_and_trunc_history(#projection_v1{epoch_number=0}, H, _MaxLength) -> - H; -add_and_trunc_history(#projection_v1{} = P_latest, H, MaxLength) -> +add_and_trunc_history(P_latest, H, MaxLength) -> Latest_U_R = {P_latest#projection_v1.upi, P_latest#projection_v1.repairing}, - add_and_trunc_history(Latest_U_R, H, MaxLength); -add_and_trunc_history(Item, H, MaxLength) -> - H2 = queue:in(Item, H), + H2 = if P_latest#projection_v1.epoch_number > 0 -> + queue:in(Latest_U_R, H); + true -> + H + end, case queue:len(H2) of X when X > MaxLength -> {_V, Hxx} = queue:out(H2), @@ -2100,10 +2094,11 @@ react_to_env_C200(Retries, P_latest, S) -> ?REACT(c200), try AuthorProxyPid = proxy_pid(P_latest#projection_v1.author_server, S), - %% This is just advisory, we don't need a sync reply. - ?FLU_PC:kick_projection_reaction(AuthorProxyPid, [], 100) + ?FLU_PC:kick_projection_reaction(AuthorProxyPid, []) catch _Type:_Err -> - ok + %% ?V("TODO: tell_author_yo is broken: ~p ~p\n", + %% [_Type, _Err]), + ok end, react_to_env_C210(Retries, S). @@ -2490,23 +2485,19 @@ poll_private_proj_is_upi_unanimous3(#ch_mgr{name=MyName, proj=P_current} = S) -> ProjStore = get_projection_store_pid_or_regname(S), #projection_v1{epoch_number=_EpochRep, epoch_csum= <<_CSumRep:4/binary,_/binary>>, - author_server=AuthRep, upi=_UPIRep, repairing=_RepairingRep} = NewProj, ok = machi_projection_store:write(ProjStore, private, NewProj), - case proplists:get_value(private_write_verbose_confirm, S#ch_mgr.opts) of + case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of true -> - error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w rep ~w auth ~w by ~w\n", [_EpochRep, _CSumRep, _UPIRep, _RepairingRep, AuthRep, MyName]); + io:format(user, "\n~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), _EpochRep, _CSumRep, _UPIRep, _RepairingRep, MyName]); _ -> ok end, %% Unwedge our FLU. 
{ok, NotifyPid} = machi_projection_store:get_wedge_notify_pid(ProjStore), _ = machi_flu1:update_wedge_state(NotifyPid, false, EpochID), - #ch_mgr{proj_history=H} = S2, - H2 = add_and_trunc_history({confirm, Epoch}, H, - ?MAX_HISTORY_LENGTH), - S2#ch_mgr{proj_unanimous=Now, proj_history=H2}; + S2#ch_mgr{proj_unanimous=Now}; _ -> S2 end; @@ -2546,14 +2537,6 @@ gobble_calls(StaticCall) -> ok end. -gobble_ticks() -> - receive - tick_check_environment -> - gobble_ticks() - after 0 -> - ok - end. - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% perhaps_start_repair(#ch_mgr{name=MyName, @@ -2569,13 +2552,12 @@ perhaps_start_repair(#ch_mgr{name=MyName, %% RepairOpts = [{repair_mode, check}, verbose], RepairFun = fun() -> do_repair(S, RepairOpts, CMode) end, LastUPI = lists:last(UPI), - StabilityTime = application:get_env(machi, stability_time, ?REPAIR_START_STABILITY_TIME), IgnoreStabilityTime_p = proplists:get_value(ignore_stability_time, S#ch_mgr.opts, false), case timer:now_diff(os:timestamp(), Start) div 1000000 of N when MyName == LastUPI andalso (IgnoreStabilityTime_p orelse - N >= StabilityTime) -> + N >= ?REPAIR_START_STABILITY_TIME) -> {WorkerPid, _Ref} = spawn_monitor(RepairFun), S#ch_mgr{repair_worker=WorkerPid, repair_start=os:timestamp(), @@ -2984,33 +2966,33 @@ zerf_find_last_annotated(FLU, MajoritySize, S) -> [] % lists:flatten() will destroy end. -perhaps_verbose_c111(P_latest2, #ch_mgr{name=MyName, opts=Opts}=S) -> - PrivWriteVerb = proplists:get_value(private_write_verbose, Opts, false), - PrivWriteVerbCONFIRM = proplists:get_value(private_write_verbose_confirm, Opts, false), - if PrivWriteVerb orelse PrivWriteVerbCONFIRM -> +perhaps_verbose_c111(P_latest2, S) -> + case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of + true -> Dbg2X = lists:keydelete(react, 1, P_latest2#projection_v1.dbg2) ++ [{is_annotated,is_annotated(P_latest2)}], P_latest2x = P_latest2#projection_v1{dbg2=Dbg2X}, % limit verbose len. 
Last2 = get(last_verbose), Summ2 = machi_projection:make_summary(P_latest2x), - if PrivWriteVerb, Summ2 /= Last2 -> - put(last_verbose, Summ2), - error_logger:info_msg("~p uses plain: ~w \n", - [MyName, Summ2]); - true -> - ok - end, - if PrivWriteVerbCONFIRM, - P_latest2#projection_v1.upi == [], + if P_latest2#projection_v1.upi == [], (S#ch_mgr.proj)#projection_v1.upi /= [] -> <> = P_latest2#projection_v1.epoch_csum, - error_logger:info_msg("CONFIRM epoch ~w ~w upi ~w rep ~w auth ~w by ~w\n", [(S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, P_latest2#projection_v1.author_server, S#ch_mgr.name]); + io:format(user, "\n~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), (S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, S#ch_mgr.name]); true -> ok + end, + case proplists:get_value(private_write_verbose, + S#ch_mgr.opts) of + true when Summ2 /= Last2 -> + put(last_verbose, Summ2), + ?V("\n~s ~p uses plain: ~w \n", + [machi_util:pretty_time(), S#ch_mgr.name, Summ2]); + _ -> + ok end; - true -> + _ -> ok end. diff --git a/src/machi_chain_repair.erl b/src/machi_chain_repair.erl index 052fb1c..ee12b20 100644 --- a/src/machi_chain_repair.erl +++ b/src/machi_chain_repair.erl @@ -207,7 +207,7 @@ make_repair_compare_fun(SrcFLU) -> T_a =< T_b end. 
-make_repair_directives(ConsistencyMode, RepairMode, File, Size, _EpochID, +make_repair_directives(ConsistencyMode, RepairMode, File, Size, EpochID, Verb, Src, FLUs0, ProxiesDict, ETS) -> true = (Size < ?MAX_OFFSET), FLUs = lists:usort(FLUs0), @@ -216,7 +216,7 @@ make_repair_directives(ConsistencyMode, RepairMode, File, Size, _EpochID, Proxy = orddict:fetch(FLU, ProxiesDict), OffSzCs = case machi_proxy_flu1_client:checksum_list( - Proxy, File, ?LONG_TIMEOUT) of + Proxy, EpochID, File, ?LONG_TIMEOUT) of {ok, InfoBin} -> machi_csum_table:split_checksum_list_blob_decode(InfoBin); {error, no_such_file} -> @@ -236,6 +236,7 @@ make_repair_directives(ConsistencyMode, RepairMode, File, Size, _EpochID, make_repair_directives2(C2, ConsistencyMode, RepairMode, File, Verb, Src, FLUs, ProxiesDict, ETS) -> + ?VERB("."), make_repair_directives3(C2, ConsistencyMode, RepairMode, File, Verb, Src, FLUs, ProxiesDict, ETS, []). @@ -265,18 +266,7 @@ make_repair_directives3([{Offset, Size, CSum, _FLU}=A|Rest0], %% byte range from all FLUs %% 3b. Log big warning about data loss. %% 4. Log any other checksum discrepencies as they are found. - QQ = [begin - Pxy = orddict:fetch(FLU, ProxiesDict), - {ok, EpochID} = machi_proxy_flu1_client:get_epoch_id( - Pxy, ?SHORT_TIMEOUT), - NSInfo = undefined, - XX = machi_proxy_flu1_client:read_chunk( - Pxy, NSInfo, EpochID, File, Offset, Size, undefined, - ?SHORT_TIMEOUT), - {FLU, XX} - end || {__Offset, __Size, __CSum, FLU} <- As], - - exit({todo_repair_sanity_check, ?LINE, File, Offset, {as,As}, {qq,QQ}}) + exit({todo_repair_sanity_check, ?LINE, File, Offset, As}) end, %% List construction guarantees us that there's at least one ?MAX_OFFSET %% item remains. Sort order + our "taking" of all exact Offset+Size @@ -329,25 +319,23 @@ execute_repair_directives(ap_mode=_ConsistencyMode, Ds, _Src, EpochID, Verb, {ProxiesDict, EpochID, Verb, ETS}, Ds), ok. 
-execute_repair_directive({File, Cmds}, {ProxiesDict, EpochID, _Verb, ETS}=Acc) -> +execute_repair_directive({File, Cmds}, {ProxiesDict, EpochID, Verb, ETS}=Acc) -> EtsKeys = [{in_files, t_in_files}, {in_chunks, t_in_chunks}, {in_bytes, t_in_bytes}, {out_files, t_out_files}, {out_chunks, t_out_chunks}, {out_bytes, t_out_bytes}], [ets:insert(ETS, {L_K, 0}) || {L_K, _T_K} <- EtsKeys], F = fun({copy, {Offset, Size, TaggedCSum, MySrc}, MyDsts}, Acc2) -> SrcP = orddict:fetch(MySrc, ProxiesDict), - %% case ets:lookup_element(ETS, in_chunks, 2) rem 100 of - %% 0 -> ?VERB(".2", []); - %% _ -> ok - %% end, + case ets:lookup_element(ETS, in_chunks, 2) rem 100 of + 0 -> ?VERB(".", []); + _ -> ok + end, _T1 = os:timestamp(), %% TODO: support case multiple written or trimmed chunks returned - NSInfo = undefined, - {ok, {[{_, Offset, Chunk, _ReadCSum}|OtherChunks], []=_TrimmedList}} = + {ok, {[{_, Offset, Chunk, _}], _}} = machi_proxy_flu1_client:read_chunk( - SrcP, NSInfo, EpochID, File, Offset, Size, undefined, + SrcP, EpochID, File, Offset, Size, [], ?SHORT_TIMEOUT), - [] = OtherChunks, _T2 = os:timestamp(), <<_Tag:1/binary, CSum/binary>> = TaggedCSum, case machi_util:checksum_chunk(Chunk) of @@ -356,7 +344,7 @@ execute_repair_directive({File, Cmds}, {ProxiesDict, EpochID, _Verb, ETS}=Acc) - DstP = orddict:fetch(DstFLU, ProxiesDict), _T3 = os:timestamp(), ok = machi_proxy_flu1_client:write_chunk( - DstP, NSInfo, EpochID, File, Offset, Chunk, TaggedCSum, + DstP, EpochID, File, Offset, Chunk, ?SHORT_TIMEOUT), _T4 = os:timestamp() end || DstFLU <- MyDsts], diff --git a/src/machi_cr_client.erl b/src/machi_cr_client.erl index a726744..cec7c6a 100644 --- a/src/machi_cr_client.erl +++ b/src/machi_cr_client.erl @@ -21,9 +21,8 @@ %% @doc Erlang API for the Machi client-implemented Chain Replication %% (CORFU-style) protocol. 
%% -%% Please see {@link machi_flu1_client} the "Client API implemntation notes" -%% section for how this module relates to the rest of the client API -%% implementation. +%% See also the docs for {@link machi_flu1_client} for additional +%% details on data types and operation descriptions. %% %% The API here is much simpler than the {@link machi_flu1_client} or %% {@link machi_proxy_flu1_client} APIs. This module's API is a @@ -44,6 +43,64 @@ %% %% Doc TODO: Once this API stabilizes, add all relevant data type details %% to the EDoc here. +%% +%% +%% === Missing API features === +%% +%% So far, there is one missing client API feature that ought to be +%% added to Machi in the near future: more flexible checksum +%% management. +%% +%% Add a `source' annotation to all checksums to indicate where the +%% checksum was calculated. For example, +%% +%%
    +%% +%%
  • Calculated by client that performed the original chunk append, +%%
  • +%% +%%
  • Calculated by the 1st Machi server to receive an +%% un-checksummed append request +%%
  • +%% +%%
  • Re-calculated by Machi to manage fewer checksums of blocks of +%% data larger than the original client-specified chunks. +%%
  • +%%
+%% +%% Client-side checksums would be the "strongest" type of +%% checksum, meaning that any data corruption (of the original +%% data and/or of the checksum itself) can be detected after the +%% client-side calculation. There are too many horror stories on +%% The Net about IP PDUs that are corrupted but unnoticed due to +%% weak TCP checksums, buggy hardware, buggy OS drivers, etc. +%% Checksum versioning is also desirable if/when the current checksum +%% implementation changes from SHA-1 to something else. +%% +%% +%% === Implementation notes === +%% +%% The major operation processing is implemented in a state machine-like +%% manner. Before attempting an operation `X', there's an initial +%% operation `pre-X' that takes care of updating the epoch id, +%% restarting client protocol proxies, and if there's any server +%% instability (e.g. some server is wedged), then insert some sleep +%% time. When the chain appears to have stabilized, then we try the `X' +%% operation again. +%% +%% Function name for the `pre-X' stuff is usually `X()', and the +%% function name for the `X' stuff is usually `X2()'. (I.e., the `X' +%% stuff follows after `pre-X' and therefore has a `2' suffix on the +%% function name.) +%% +%% In the case of read repair, there are two stages: find the value to +%% perform the repair, then perform the repair writes. In the case of +%% the repair writes, the `pre-X' function is named `read_repair3()', +%% and the `X' function is named `read_repair4()'. +%% +%% TODO: It would be nifty to lift the very-nearly-but-not-quite-boilerplate +%% of the `pre-X' functions into a single common function ... but I'm not +%% sure yet on how to do it without making the code uglier. -module(machi_cr_client). 
@@ -61,11 +118,13 @@ %% FLU1 API -export([ %% File API - append_chunk/5, - append_chunk/6, append_chunk/7, - write_chunk/6, write_chunk/7, - read_chunk/6, read_chunk/7, - trim_chunk/5, trim_chunk/6, + append_chunk/3, append_chunk/4, + append_chunk/5, append_chunk/6, + append_chunk_extra/4, append_chunk_extra/5, + append_chunk_extra/6, append_chunk_extra/7, + write_chunk/4, write_chunk/5, + read_chunk/5, read_chunk/6, + trim_chunk/4, trim_chunk/5, checksum_list/2, checksum_list/3, list_files/1, list_files/2, @@ -106,61 +165,101 @@ start_link(P_srvr_list, Opts) -> %% @doc Append a chunk (binary- or iolist-style) of data to a file %% with `Prefix'. -append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum) -> - append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum, #append_opts{}, ?DEFAULT_TIMEOUT). +append_chunk(PidSpec, Prefix, Chunk) -> + append_chunk_extra(PidSpec, ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, + Prefix, Chunk, 0, ?DEFAULT_TIMEOUT). %% @doc Append a chunk (binary- or iolist-style) of data to a file %% with `Prefix'. -append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum, #append_opts{}=Opts) -> - append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum, Opts, ?DEFAULT_TIMEOUT). +append_chunk(PidSpec, Prefix, Chunk, Timeout) -> + append_chunk_extra(PidSpec, ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, + Prefix, Chunk, 0, Timeout). %% @doc Append a chunk (binary- or iolist-style) of data to a file %% with `Prefix'. -append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum, #append_opts{}=Opts, Timeout0) -> - NSInfo2 = machi_util:ns_info_default(NSInfo), +append_chunk(PidSpec, CoC_Namespace, CoC_Locator, Prefix, Chunk) -> + append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator, + Prefix, Chunk, 0, ?DEFAULT_TIMEOUT). + +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix'. + +append_chunk(PidSpec, CoC_Namespace, CoC_Locator, Prefix, Chunk, Timeout) -> + append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator, + Prefix, Chunk, 0, Timeout). 
+ +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix'. + +append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra) + when is_integer(ChunkExtra), ChunkExtra >= 0 -> + append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra, ?DEFAULT_TIMEOUT). + +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix'. + +append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra, Timeout0) -> {TO, Timeout} = timeout(Timeout0), - gen_server:call(PidSpec, {req, {append_chunk, - NSInfo2, Prefix, Chunk, CSum, Opts, TO}}, + gen_server:call(PidSpec, {req, {append_chunk_extra, + ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, + Prefix, + Chunk, ChunkExtra, TO}}, + Timeout). + +append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator, Prefix, Chunk, ChunkExtra) + when is_integer(ChunkExtra), ChunkExtra >= 0 -> + append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra, ?DEFAULT_TIMEOUT). + +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix'. + +append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra, Timeout0) -> + {TO, Timeout} = timeout(Timeout0), + gen_server:call(PidSpec, {req, {append_chunk_extra, + CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, TO}}, Timeout). %% @doc Write a chunk of data (that has already been -%% allocated/sequenced by an earlier append_chunk() call) to +%% allocated/sequenced by an earlier append_chunk_extra() call) to %% `File' at `Offset'. -write_chunk(PidSpec, NSInfo, File, Offset, Chunk, CSum) -> - write_chunk(PidSpec, NSInfo, File, Offset, Chunk, CSum, ?DEFAULT_TIMEOUT). +write_chunk(PidSpec, File, Offset, Chunk) -> + write_chunk(PidSpec, File, Offset, Chunk, ?DEFAULT_TIMEOUT). %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. 
-write_chunk(PidSpec, NSInfo, File, Offset, Chunk, CSum, Timeout0) -> +write_chunk(PidSpec, File, Offset, Chunk, Timeout0) -> {TO, Timeout} = timeout(Timeout0), - gen_server:call(PidSpec, {req, {write_chunk, NSInfo, File, Offset, Chunk, CSum, TO}}, + gen_server:call(PidSpec, {req, {write_chunk, File, Offset, Chunk, TO}}, Timeout). %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. -read_chunk(PidSpec, NSInfo, File, Offset, Size, Opts) -> - read_chunk(PidSpec, NSInfo, File, Offset, Size, Opts, ?DEFAULT_TIMEOUT). +read_chunk(PidSpec, File, Offset, Size, Opts) -> + read_chunk(PidSpec, File, Offset, Size, Opts, ?DEFAULT_TIMEOUT). %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. -read_chunk(PidSpec, NSInfo, File, Offset, Size, Opts, Timeout0) -> +read_chunk(PidSpec, File, Offset, Size, Opts, Timeout0) -> {TO, Timeout} = timeout(Timeout0), - gen_server:call(PidSpec, {req, {read_chunk, NSInfo, File, Offset, Size, Opts, TO}}, + gen_server:call(PidSpec, {req, {read_chunk, File, Offset, Size, Opts, TO}}, Timeout). %% @doc Trim a chunk of data of size `Size' from `File' at `Offset'. -trim_chunk(PidSpec, NSInfo, File, Offset, Size) -> - trim_chunk(PidSpec, NSInfo, File, Offset, Size, ?DEFAULT_TIMEOUT). +trim_chunk(PidSpec, File, Offset, Size) -> + trim_chunk(PidSpec, File, Offset, Size, ?DEFAULT_TIMEOUT). %% @doc Trim a chunk of data of size `Size' from `File' at `Offset'. -trim_chunk(PidSpec, NSInfo, File, Offset, Size, Timeout0) -> +trim_chunk(PidSpec, File, Offset, Size, Timeout0) -> {TO, Timeout} = timeout(Timeout0), - gen_server:call(PidSpec, {req, {trim_chunk, NSInfo, File, Offset, Size, TO}}, + gen_server:call(PidSpec, {req, {trim_chunk, File, Offset, Size, TO}}, Timeout). %% @doc Fetch the list of chunk checksums for `File'. 
@@ -225,27 +324,28 @@ code_change(_OldVsn, S, _Extra) -> %%%%%%%%%%%%%%%%%%%%%%%%%%% -handle_call2({append_chunk, NSInfo, - Prefix, Chunk, CSum, Opts, TO}, _From, S) -> - do_append_head(NSInfo, Prefix, - Chunk, CSum, Opts, 0, os:timestamp(), TO, S); -handle_call2({write_chunk, NSInfo, File, Offset, Chunk, CSum, TO}, _From, S) -> - do_write_head(NSInfo, File, Offset, Chunk, CSum, 0, os:timestamp(), TO, S); -handle_call2({read_chunk, NSInfo, File, Offset, Size, Opts, TO}, _From, S) -> - do_read_chunk(NSInfo, File, Offset, Size, Opts, 0, os:timestamp(), TO, S); -handle_call2({trim_chunk, NSInfo, File, Offset, Size, TO}, _From, S) -> - do_trim_chunk(NSInfo, File, Offset, Size, 0, os:timestamp(), TO, S); +handle_call2({append_chunk_extra, CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra, TO}, _From, S) -> + do_append_head(CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, 0, os:timestamp(), TO, S); +handle_call2({write_chunk, File, Offset, Chunk, TO}, _From, S) -> + do_write_head(File, Offset, Chunk, 0, os:timestamp(), TO, S); +handle_call2({read_chunk, File, Offset, Size, Opts, TO}, _From, S) -> + do_read_chunk(File, Offset, Size, Opts, 0, os:timestamp(), TO, S); +handle_call2({trim_chunk, File, Offset, Size, TO}, _From, S) -> + do_trim_chunk(File, Offset, Size, 0, os:timestamp(), TO, S); handle_call2({checksum_list, File, TO}, _From, S) -> do_checksum_list(File, 0, os:timestamp(), TO, S); handle_call2({list_files, TO}, _From, S) -> do_list_files(0, os:timestamp(), TO, S). 
-do_append_head(NSInfo, Prefix, - Chunk, CSum, Opts, 0=Depth, STime, TO, S) -> - do_append_head2(NSInfo, Prefix, - Chunk, CSum, Opts, Depth + 1, STime, TO, S); -do_append_head(NSInfo, Prefix, - Chunk, CSum, Opts, Depth, STime, TO, #state{proj=P}=S) -> +do_append_head(CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, 0=Depth, STime, TO, S) -> + do_append_head2(CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, Depth + 1, STime, TO, S); +do_append_head(CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, Depth, STime, TO, #state{proj=P}=S) -> + %% io:format(user, "head sleep1,", []), sleep_a_while(Depth), DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, if DiffMs > TO -> @@ -259,61 +359,62 @@ do_append_head(NSInfo, Prefix, case S2#state.proj of P2 when P2 == undefined orelse P2#projection_v1.upi == [] -> - do_append_head(NSInfo, Prefix, - Chunk, CSum, Opts, Depth + 1, + do_append_head(CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, Depth + 1, STime, TO, S2); _ -> - do_append_head2(NSInfo, Prefix, - Chunk, CSum, Opts, Depth + 1, + do_append_head2(CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, Depth + 1, STime, TO, S2) end end. -do_append_head2(NSInfo, Prefix, - Chunk, CSum, Opts, Depth, STime, TO, +do_append_head2(CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, Depth, STime, TO, #state{proj=P}=S) -> [HeadFLU|_RestFLUs] = mutation_flus(P), case is_witness_flu(HeadFLU, P) of true -> case witnesses_use_our_epoch(S) of true -> - do_append_head3(NSInfo, Prefix, - Chunk, CSum, Opts, Depth, + do_append_head3(CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, Depth, STime, TO, S); false -> %% Bummer, go back to the beginning and retry. 
- do_append_head(NSInfo, Prefix, - Chunk, CSum, Opts, Depth, + do_append_head(CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, Depth, STime, TO, S) end; false -> - do_append_head3(NSInfo, Prefix, - Chunk, CSum, Opts, Depth, STime, TO, S) + do_append_head3(CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, Depth, STime, TO, S) end. -do_append_head3(NSInfo, Prefix, - Chunk, CSum, Opts, Depth, STime, TO, +do_append_head3(CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, Depth, STime, TO, #state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) -> [HeadFLU|RestFLUs] = non_witness_flus(mutation_flus(P), P), Proxy = orddict:fetch(HeadFLU, PD), - case ?FLU_PC:append_chunk(Proxy, NSInfo, EpochID, - Prefix, Chunk, CSum, Opts, ?TIMEOUT) of + case ?FLU_PC:append_chunk_extra(Proxy, EpochID, + CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, ?TIMEOUT) of {ok, {Offset, _Size, File}=_X} -> - do_wr_app_midtail(RestFLUs, NSInfo, Prefix, - File, Offset, Chunk, CSum, Opts, - [HeadFLU], 0, STime, TO, append, S); + do_append_midtail(RestFLUs, CoC_Namespace, CoC_Locator, Prefix, + File, Offset, Chunk, ChunkExtra, + [HeadFLU], 0, STime, TO, S); {error, bad_checksum}=BadCS -> {reply, BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_append_head(NSInfo, Prefix, - Chunk, CSum, Opts, Depth, STime, TO, S); + do_append_head(CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, Depth, STime, TO, S); {error, written} -> %% Implicit sequencing + this error = we don't know where this %% written block is. But we lost a race. Repeat, with a new %% sequencer assignment. - do_append_head(NSInfo, Prefix, - Chunk, CSum, Opts, Depth, STime, TO, S); + do_append_head(CoC_Namespace, CoC_Locator, Prefix, + Chunk, ChunkExtra, Depth, STime, TO, S); {error, trimmed} = Err -> %% TODO: behaviour {reply, Err, S}; @@ -322,276 +423,17 @@ do_append_head3(NSInfo, Prefix, Prefix,iolist_size(Chunk)}) end. 
-do_wr_app_midtail(RestFLUs, NSInfo, Prefix, - File, Offset, Chunk, CSum, Opts, - Ws, Depth, STime, TO, MyOp, S) +do_append_midtail(RestFLUs, CoC_Namespace, CoC_Locator, Prefix, + File, Offset, Chunk, ChunkExtra, + Ws, Depth, STime, TO, S) when RestFLUs == [] orelse Depth == 0 -> - do_wr_app_midtail2(RestFLUs, NSInfo, Prefix, - File, Offset, Chunk, CSum, Opts, - Ws, Depth + 1, STime, TO, MyOp, S); -do_wr_app_midtail(_RestFLUs, NSInfo, Prefix, File, - Offset, Chunk, CSum, Opts, - Ws, Depth, STime, TO, MyOp, #state{proj=P}=S) -> - sleep_a_while(Depth), - DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, - if DiffMs > TO -> - {reply, {error, partition}, S}; - true -> - S2 = update_proj(S#state{proj=undefined, bad_proj=P}), - case S2#state.proj of - undefined -> - {reply, {error, partition}, S}; - P2 -> - RestFLUs2 = mutation_flus(P2), - case RestFLUs2 -- Ws of - RestFLUs2 -> - if Prefix == undefined -> % atom! not binary()!! - {error, partition}; - MyOp == append -> - %% None of the writes that we have done so - %% far are to FLUs that are in the - %% RestFLUs2 list. We are pessimistic - %% here and assume that those FLUs are - %% permanently dead. Start over with a - %% new sequencer assignment, at the 2nd - %% have of the impl (we have already slept - %% & refreshed the projection). - do_append_head2(NSInfo, - Prefix, Chunk, CSum, Opts, - Depth, STime, TO, S2); - MyOp == write -> - do_wr_app_midtail2(RestFLUs2, - NSInfo, - Prefix, File, Offset, - Chunk, CSum, Opts, - Ws, Depth + 1, STime, TO, - MyOp, S2) - end; - RestFLUs3 -> - do_wr_app_midtail2(RestFLUs3, - NSInfo, - Prefix, File, Offset, - Chunk, CSum, Opts, - Ws, Depth + 1, STime, TO, - MyOp, S2) - end - end - end. 
- -do_wr_app_midtail2([], _NSInfo, - _Prefix, File, Offset, Chunk, - _CSum, _Opts, _Ws, _Depth, _STime, _TO, _MyOp, S) -> - {reply, {ok, {Offset, chunk_wrapper_size(Chunk), File}}, S}; -do_wr_app_midtail2([FLU|RestFLUs]=FLUs, NSInfo, - Prefix, File, Offset, Chunk, - CSum, Opts, Ws, Depth, STime, TO, MyOp, - #state{epoch_id=EpochID, proxies_dict=PD}=S) -> - Proxy = orddict:fetch(FLU, PD), - case ?FLU_PC:write_chunk(Proxy, NSInfo, EpochID, File, Offset, Chunk, CSum, ?TIMEOUT) of - ok -> - do_wr_app_midtail2(RestFLUs, NSInfo, Prefix, - File, Offset, Chunk, - CSum, Opts, [FLU|Ws], Depth, STime, TO, MyOp, S); - {error, bad_checksum}=BadCS -> - %% TODO: alternate strategy? - {reply, BadCS, S}; - {error, Retry} - when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_wr_app_midtail(FLUs, NSInfo, Prefix, - File, Offset, Chunk, - CSum, Opts, Ws, Depth, STime, TO, MyOp, S); - {error, written} -> - %% We know what the chunk ought to be, so jump to the - %% middle of read-repair. - Resume = {append, Offset, iolist_size(Chunk), File}, - do_repair_chunk(FLUs, Resume, Chunk, CSum, [], NSInfo, File, Offset, - iolist_size(Chunk), Depth, STime, S); - {error, trimmed} = Err -> - %% TODO: nothing can be done - {reply, Err, S}; - {error, not_written} -> - exit({todo_should_never_happen,?MODULE,?LINE,File,Offset}) - end. - -witnesses_use_our_epoch(#state{proj=P}=S) -> - Witnesses = witness_flus(P#projection_v1.upi, P), - witnesses_use_our_epoch(Witnesses, S). - -witnesses_use_our_epoch([], _S) -> - true; -witnesses_use_our_epoch([FLU|RestFLUs], - #state{epoch_id=EpochID, proxies_dict=PD}=S) -> - Proxy = orddict:fetch(FLU, PD), - %% Check both that the EpochID is the same *and* not wedged! - case ?FLU_PC:wedge_status(Proxy, ?TIMEOUT) of - {ok, {false, EID,_,_}} when EID == EpochID -> - witnesses_use_our_epoch(RestFLUs, S); - _Else -> - false - end. 
- -do_write_head(NSInfo, File, Offset, Chunk, CSum, 0=Depth, STime, TO, S) -> - do_write_head2(NSInfo, File, Offset, Chunk, CSum, Depth + 1, STime, TO, S); -do_write_head(NSInfo, File, Offset, Chunk, CSum, Depth, STime, TO, #state{proj=P}=S) -> - sleep_a_while(Depth), - DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, - if DiffMs > TO -> - {reply, {error, partition}, S}; - true -> - %% This is suboptimal for performance: there are some paths - %% through this point where our current projection is good - %% enough. But we're going to try to keep the code as simple - %% as we can for now. - S2 = update_proj(S#state{proj=undefined, bad_proj=P}), - case S2#state.proj of - P2 when P2 == undefined orelse - P2#projection_v1.upi == [] -> - do_write_head(NSInfo, File, Offset, Chunk, CSum, Depth + 1, - STime, TO, S2); - _ -> - do_write_head2(NSInfo, File, Offset, Chunk, CSum, Depth + 1, - STime, TO, S2) - end - end. - -do_write_head2(NSInfo, File, Offset, Chunk, CSum, Depth, STime, TO, - #state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) -> - [HeadFLU|RestFLUs] = mutation_flus(P), - Proxy = orddict:fetch(HeadFLU, PD), - case ?FLU_PC:write_chunk(Proxy, NSInfo, EpochID, File, Offset, Chunk, CSum, ?TIMEOUT) of - ok -> - %% From this point onward, we use the same code & logic path as - %% append does. - Prefix=unused_write_path, - Opts=unused_write_path, - do_wr_app_midtail(RestFLUs, NSInfo, Prefix, - File, Offset, Chunk, - CSum, Opts, [HeadFLU], 0, STime, TO, write, S); - {error, bad_checksum}=BadCS -> - {reply, BadCS, S}; - {error, Retry} - when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_write_head(NSInfo, File, Offset, Chunk, CSum, Depth, STime, TO, S); - {error, written}=Err -> - {reply, Err, S}; - {error, trimmed}=Err -> - {reply, Err, S}; - {error, not_written} -> - exit({todo_should_never_happen,?MODULE,?LINE, - iolist_size(Chunk)}) - end. 
- -do_read_chunk(NSInfo, File, Offset, Size, Opts, 0=Depth, STime, TO, - #state{proj=#projection_v1{upi=[_|_]}}=S) -> % UPI is non-empty - do_read_chunk2(NSInfo, File, Offset, Size, Opts, Depth + 1, STime, TO, S); -do_read_chunk(NSInfo, File, Offset, Size, Opts, Depth, STime, TO, #state{proj=P}=S) -> - sleep_a_while(Depth), - DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, - if DiffMs > TO -> - {reply, {error, partition}, S}; - true -> - S2 = update_proj(S#state{proj=undefined, bad_proj=P}), - case S2#state.proj of - P2 when P2 == undefined orelse - P2#projection_v1.upi == [] -> - do_read_chunk(NSInfo, File, Offset, Size, Opts, Depth + 1, STime, TO, S2); - _ -> - do_read_chunk2(NSInfo, File, Offset, Size, Opts, Depth + 1, STime, TO, S2) - end - end. - -do_read_chunk2(NSInfo, File, Offset, Size, Opts, Depth, STime, TO, - #state{proj=P, epoch_id=EpochID, proxies_dict=PD}=S) -> - UPI = readonly_flus(P), - Tail = lists:last(UPI), - ConsistencyMode = P#projection_v1.mode, - case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), NSInfo, EpochID, - File, Offset, Size, Opts, ?TIMEOUT) of - {ok, {Chunks, Trimmed}} when is_list(Chunks), is_list(Trimmed) -> - %% After partition heal, there could happen that heads may - %% have chunk trimmed but tails may have chunk written - - %% such repair couldn't be triggered in read time (because - %% there's data!). In this case, repair should happen by - %% partition heal event or some background - %% hashtree-n-repair service. TODO. FIXME. - {reply, {ok, {Chunks, Trimmed}}, S}; - %% {ok, BadChunk} -> - %% %% TODO cleaner handling of bad chunks - %% exit({todo, bad_chunk_size, ?MODULE, ?LINE, File, Offset, Size, - %% got, byte_size(BadChunk)}); - {error, bad_arg} = BadArg -> - {reply, BadArg, S}; - {error, partial_read}=Err -> - %% TODO: maybe this case we might need another repair? - {reply, Err, S}; - {error, bad_checksum}=BadCS -> - %% TODO: alternate strategy? - %% Maybe we need read repair here, too? 
- {reply, BadCS, S}; - {error, Retry} - when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_read_chunk(NSInfo, File, Offset, Size, Opts, Depth, STime, TO, S); - {error, not_written} -> - read_repair(ConsistencyMode, read, NSInfo, File, Offset, Size, Depth, STime, S); - %% {reply, {error, not_written}, S}; - {error, written} -> - exit({todo_should_never_happen,?MODULE,?LINE,File,Offset,Size}); - {error, trimmed}=Err -> - {reply, Err, S} - end. - -do_trim_chunk(NSInfo, File, Offset, Size, 0=Depth, STime, TO, S) -> - do_trim_chunk(NSInfo, File, Offset, Size, Depth+1, STime, TO, S); - -do_trim_chunk(NSInfo, File, Offset, Size, Depth, STime, TO, #state{proj=P}=S) -> - sleep_a_while(Depth), - DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, - if DiffMs > TO -> - {reply, {error, partition}, S}; - true -> - %% This is suboptimal for performance: there are some paths - %% through this point where our current projection is good - %% enough. But we're going to try to keep the code as simple - %% as we can for now. - S2 = update_proj(S#state{proj=undefined, bad_proj=P}), - case S2#state.proj of - P2 when P2 == undefined orelse - P2#projection_v1.upi == [] -> - do_trim_chunk(NSInfo, File, Offset, Size, Depth + 1, - STime, TO, S2); - _ -> - do_trim_chunk2(NSInfo, File, Offset, Size, Depth + 1, - STime, TO, S2) - end - end. - -do_trim_chunk2(NSInfo, File, Offset, Size, Depth, STime, TO, - #state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) -> - [HeadFLU|RestFLUs] = mutation_flus(P), - Proxy = orddict:fetch(HeadFLU, PD), - case ?FLU_PC:trim_chunk(Proxy, NSInfo, EpochID, File, Offset, Size, ?TIMEOUT) of - ok -> - do_trim_midtail(RestFLUs, undefined, NSInfo, File, Offset, Size, - [HeadFLU], 0, STime, TO, S); - {error, trimmed} -> - %% Maybe the trim had failed in the middle of the tail so re-run - %% trim accross the whole chain. 
- do_trim_midtail(RestFLUs, undefined, NSInfo, File, Offset, Size, - [HeadFLU], 0, STime, TO, S); - {error, bad_checksum}=BadCS -> - {reply, BadCS, S}; - {error, Retry} - when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_trim_chunk(NSInfo, File, Offset, Size, Depth, STime, TO, S) - end. - -do_trim_midtail(RestFLUs, Prefix, NSInfo, File, Offset, Size, - Ws, Depth, STime, TO, S) - when RestFLUs == [] orelse Depth == 0 -> - do_trim_midtail2(RestFLUs, Prefix, NSInfo, File, Offset, Size, - Ws, Depth + 1, STime, TO, S); -do_trim_midtail(_RestFLUs, Prefix, NSInfo, File, Offset, Size, - Ws, Depth, STime, TO, #state{proj=P}=S) -> + do_append_midtail2(RestFLUs, CoC_Namespace, CoC_Locator, Prefix, + File, Offset, Chunk, ChunkExtra, + Ws, Depth + 1, STime, TO, S); +do_append_midtail(_RestFLUs, CoC_Namespace, CoC_Locator, Prefix, File, + Offset, Chunk, ChunkExtra, + Ws, Depth, STime, TO, #state{proj=P}=S) -> + %% io:format(user, "midtail sleep2,", []), sleep_a_while(Depth), DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, if DiffMs > TO -> @@ -616,36 +458,292 @@ do_trim_midtail(_RestFLUs, Prefix, NSInfo, File, Offset, Size, if Prefix == undefined -> % atom! not binary()!! {error, partition}; true -> - do_trim_chunk(NSInfo, Prefix, Offset, Size, + do_append_head2(CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra, + Depth, STime, TO, S2) + end; + RestFLUs3 -> + do_append_midtail2(RestFLUs3, + CoC_Namespace, CoC_Locator, + Prefix, File, Offset, + Chunk, ChunkExtra, + Ws, Depth + 1, STime, TO, S2) + end + end + end. 
+ +do_append_midtail2([], _CoC_Namespace, _CoC_Locator, + _Prefix, File, Offset, Chunk, + _ChunkExtra, _Ws, _Depth, _STime, _TO, S) -> + %% io:format(user, "ok!\n", []), + {reply, {ok, {Offset, chunk_wrapper_size(Chunk), File}}, S}; +do_append_midtail2([FLU|RestFLUs]=FLUs, CoC_Namespace, CoC_Locator, + Prefix, File, Offset, Chunk, + ChunkExtra, Ws, Depth, STime, TO, + #state{epoch_id=EpochID, proxies_dict=PD}=S) -> + Proxy = orddict:fetch(FLU, PD), + case ?FLU_PC:write_chunk(Proxy, EpochID, File, Offset, Chunk, ?TIMEOUT) of + ok -> + %% io:format(user, "write ~w,", [FLU]), + do_append_midtail2(RestFLUs, CoC_Namespace, CoC_Locator, Prefix, + File, Offset, Chunk, + ChunkExtra, [FLU|Ws], Depth, STime, TO, S); + {error, bad_checksum}=BadCS -> + %% TODO: alternate strategy? + {reply, BadCS, S}; + {error, Retry} + when Retry == partition; Retry == bad_epoch; Retry == wedged -> + do_append_midtail(FLUs, CoC_Namespace, CoC_Locator, Prefix, + File, Offset, Chunk, + ChunkExtra, Ws, Depth, STime, TO, S); + {error, written} -> + %% We know what the chunk ought to be, so jump to the + %% middle of read-repair. + Resume = {append, Offset, iolist_size(Chunk), File}, + do_repair_chunk(FLUs, Resume, Chunk, [], File, Offset, + iolist_size(Chunk), Depth, STime, S); + {error, trimmed} = Err -> + %% TODO: nothing can be done + {reply, Err, S}; + {error, not_written} -> + exit({todo_should_never_happen,?MODULE,?LINE,File,Offset}) + end. + +witnesses_use_our_epoch(#state{proj=P}=S) -> + Witnesses = witness_flus(P#projection_v1.upi, P), + witnesses_use_our_epoch(Witnesses, S). + +witnesses_use_our_epoch([], _S) -> + true; +witnesses_use_our_epoch([FLU|RestFLUs], + #state{epoch_id=EpochID, proxies_dict=PD}=S) -> + Proxy = orddict:fetch(FLU, PD), + %% Check both that the EpochID is the same *and* not wedged! + case ?FLU_PC:wedge_status(Proxy, ?TIMEOUT) of + {ok, {false, EID}} when EID == EpochID -> + witnesses_use_our_epoch(RestFLUs, S); + _Else -> + false + end. 
+ +do_write_head(File, Offset, Chunk, 0=Depth, STime, TO, S) -> + do_write_head2(File, Offset, Chunk, Depth + 1, STime, TO, S); +do_write_head(File, Offset, Chunk, Depth, STime, TO, #state{proj=P}=S) -> + %% io:format(user, "head sleep1,", []), + sleep_a_while(Depth), + DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, + if DiffMs > TO -> + {reply, {error, partition}, S}; + true -> + %% This is suboptimal for performance: there are some paths + %% through this point where our current projection is good + %% enough. But we're going to try to keep the code as simple + %% as we can for now. + S2 = update_proj(S#state{proj=undefined, bad_proj=P}), + case S2#state.proj of + P2 when P2 == undefined orelse + P2#projection_v1.upi == [] -> + do_write_head(File, Offset, Chunk, Depth + 1, + STime, TO, S2); + _ -> + do_write_head2(File, Offset, Chunk, Depth + 1, + STime, TO, S2) + end + end. + +do_write_head2(File, Offset, Chunk, Depth, STime, TO, + #state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) -> + [HeadFLU|RestFLUs] = mutation_flus(P), + Proxy = orddict:fetch(HeadFLU, PD), + case ?FLU_PC:write_chunk(Proxy, EpochID, File, Offset, Chunk, ?TIMEOUT) of + ok -> + %% From this point onward, we use the same code & logic path as + %% append does. + do_append_midtail(RestFLUs, undefined, undefined, undefined, + File, Offset, Chunk, + undefined, [HeadFLU], 0, STime, TO, S); + {error, bad_checksum}=BadCS -> + {reply, BadCS, S}; + {error, Retry} + when Retry == partition; Retry == bad_epoch; Retry == wedged -> + do_write_head(File, Offset, Chunk, Depth, STime, TO, S); + {error, written}=Err -> + {reply, Err, S}; + {error, trimmed}=Err -> + {reply, Err, S}; + {error, not_written} -> + exit({todo_should_never_happen,?MODULE,?LINE, + iolist_size(Chunk)}) + end. 
+ +do_read_chunk(File, Offset, Size, Opts, 0=Depth, STime, TO, + #state{proj=#projection_v1{upi=[_|_]}}=S) -> % UPI is non-empty + do_read_chunk2(File, Offset, Size, Opts, Depth + 1, STime, TO, S); +do_read_chunk(File, Offset, Size, Opts, Depth, STime, TO, #state{proj=P}=S) -> + sleep_a_while(Depth), + DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, + if DiffMs > TO -> + {reply, {error, partition}, S}; + true -> + S2 = update_proj(S#state{proj=undefined, bad_proj=P}), + case S2#state.proj of + P2 when P2 == undefined orelse + P2#projection_v1.upi == [] -> + do_read_chunk(File, Offset, Size, Opts, Depth + 1, STime, TO, S2); + _ -> + do_read_chunk2(File, Offset, Size, Opts, Depth + 1, STime, TO, S2) + end + end. + +do_read_chunk2(File, Offset, Size, Opts, Depth, STime, TO, + #state{proj=P, epoch_id=EpochID, proxies_dict=PD}=S) -> + UPI = readonly_flus(P), + Tail = lists:last(UPI), + ConsistencyMode = P#projection_v1.mode, + case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), EpochID, + File, Offset, Size, Opts, ?TIMEOUT) of + {ok, {Chunks, Trimmed}} when is_list(Chunks), is_list(Trimmed) -> + %% After partition heal, there could happen that heads may + %% have chunk trimmed but tails may have chunk written - + %% such repair couldn't be triggered in read time (because + %% there's data!). In this case, repair should happen by + %% partition heal event or some background + %% hashtree-n-repair service. TODO. FIXME. + {reply, {ok, {Chunks, Trimmed}}, S}; + %% {ok, BadChunk} -> + %% %% TODO cleaner handling of bad chunks + %% exit({todo, bad_chunk_size, ?MODULE, ?LINE, File, Offset, Size, + %% got, byte_size(BadChunk)}); + {error, bad_arg} = BadArg -> + {reply, BadArg, S}; + {error, partial_read}=Err -> + %% TODO: maybe this case we might need another repair? + {reply, Err, S}; + {error, bad_checksum}=BadCS -> + %% TODO: alternate strategy? + %% Maybe we need read repair here, too? 
+ {reply, BadCS, S}; + {error, Retry} + when Retry == partition; Retry == bad_epoch; Retry == wedged -> + do_read_chunk(File, Offset, Size, Opts, Depth, STime, TO, S); + {error, not_written} -> + read_repair(ConsistencyMode, read, File, Offset, Size, Depth, STime, S); + %% {reply, {error, not_written}, S}; + {error, written} -> + exit({todo_should_never_happen,?MODULE,?LINE,File,Offset,Size}); + {error, trimmed}=Err -> + {reply, Err, S} + end. + +do_trim_chunk(File, Offset, Size, 0=Depth, STime, TO, S) -> + do_trim_chunk(File, Offset, Size, Depth+1, STime, TO, S); + +do_trim_chunk(File, Offset, Size, Depth, STime, TO, #state{proj=P}=S) -> + sleep_a_while(Depth), + DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, + if DiffMs > TO -> + {reply, {error, partition}, S}; + true -> + %% This is suboptimal for performance: there are some paths + %% through this point where our current projection is good + %% enough. But we're going to try to keep the code as simple + %% as we can for now. + S2 = update_proj(S#state{proj=undefined, bad_proj=P}), + case S2#state.proj of + P2 when P2 == undefined orelse + P2#projection_v1.upi == [] -> + do_trim_chunk(File, Offset, Size, Depth + 1, + STime, TO, S2); + _ -> + do_trim_chunk2(File, Offset, Size, Depth + 1, + STime, TO, S2) + end + end. + +do_trim_chunk2(File, Offset, Size, Depth, STime, TO, + #state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) -> + [HeadFLU|RestFLUs] = mutation_flus(P), + Proxy = orddict:fetch(HeadFLU, PD), + case ?FLU_PC:trim_chunk(Proxy, EpochID, File, Offset, Size, ?TIMEOUT) of + ok -> + do_trim_midtail(RestFLUs, undefined, File, Offset, Size, + [HeadFLU], 0, STime, TO, S); + {error, trimmed} -> + %% Maybe the trim had failed in the middle of the tail so re-run + %% trim accross the whole chain. 
+ do_trim_midtail(RestFLUs, undefined, File, Offset, Size, + [HeadFLU], 0, STime, TO, S); + {error, bad_checksum}=BadCS -> + {reply, BadCS, S}; + {error, Retry} + when Retry == partition; Retry == bad_epoch; Retry == wedged -> + do_trim_chunk(File, Offset, Size, Depth, STime, TO, S) + end. + +do_trim_midtail(RestFLUs, Prefix, File, Offset, Size, + Ws, Depth, STime, TO, S) + when RestFLUs == [] orelse Depth == 0 -> + do_trim_midtail2(RestFLUs, Prefix, File, Offset, Size, + Ws, Depth + 1, STime, TO, S); +do_trim_midtail(_RestFLUs, Prefix, File, Offset, Size, + Ws, Depth, STime, TO, #state{proj=P}=S) -> + %% io:format(user, "midtail sleep2,", []), + sleep_a_while(Depth), + DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, + if DiffMs > TO -> + {reply, {error, partition}, S}; + true -> + S2 = update_proj(S#state{proj=undefined, bad_proj=P}), + case S2#state.proj of + undefined -> + {reply, {error, partition}, S}; + P2 -> + RestFLUs2 = mutation_flus(P2), + case RestFLUs2 -- Ws of + RestFLUs2 -> + %% None of the writes that we have done so far + %% are to FLUs that are in the RestFLUs2 list. + %% We are pessimistic here and assume that + %% those FLUs are permanently dead. Start + %% over with a new sequencer assignment, at + %% the 2nd have of the impl (we have already + %% slept & refreshed the projection). + + if Prefix == undefined -> % atom! not binary()!! + {error, partition}; + true -> + do_trim_chunk(Prefix, Offset, Size, Depth, STime, TO, S2) end; RestFLUs3 -> - do_trim_midtail2(RestFLUs3, Prefix, NSInfo, File, Offset, Size, + do_trim_midtail2(RestFLUs3, Prefix, File, Offset, Size, Ws, Depth + 1, STime, TO, S2) end end end. 
-do_trim_midtail2([], _Prefix, _NSInfo, _File, _Offset, _Size, +do_trim_midtail2([], _Prefix, _File, _Offset, _Size, _Ws, _Depth, _STime, _TO, S) -> + %% io:format(user, "ok!\n", []), {reply, ok, S}; -do_trim_midtail2([FLU|RestFLUs]=FLUs, Prefix, NSInfo, File, Offset, Size, +do_trim_midtail2([FLU|RestFLUs]=FLUs, Prefix, File, Offset, Size, Ws, Depth, STime, TO, #state{epoch_id=EpochID, proxies_dict=PD}=S) -> Proxy = orddict:fetch(FLU, PD), - case ?FLU_PC:trim_chunk(Proxy, NSInfo, EpochID, File, Offset, Size, ?TIMEOUT) of + case ?FLU_PC:trim_chunk(Proxy, EpochID, File, Offset, Size, ?TIMEOUT) of ok -> - do_trim_midtail2(RestFLUs, Prefix, NSInfo, File, Offset, Size, + %% io:format(user, "write ~w,", [FLU]), + do_trim_midtail2(RestFLUs, Prefix, File, Offset, Size, [FLU|Ws], Depth, STime, TO, S); {error, trimmed} -> - do_trim_midtail2(RestFLUs, Prefix, NSInfo, File, Offset, Size, + do_trim_midtail2(RestFLUs, Prefix, File, Offset, Size, [FLU|Ws], Depth, STime, TO, S); {error, bad_checksum}=BadCS -> %% TODO: alternate strategy? {reply, BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_trim_midtail(FLUs, Prefix, NSInfo, File, Offset, Size, + do_trim_midtail(FLUs, Prefix, File, Offset, Size, Ws, Depth, STime, TO, S) end. @@ -661,11 +759,11 @@ do_trim_midtail2([FLU|RestFLUs]=FLUs, Prefix, NSInfo, File, Offset, Size, %% Never matches because Depth is always incremented beyond 0 prior to %% getting here. 
%% -%% read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, 0=Depth, +%% read_repair(ConsistencyMode, ReturnMode, File, Offset, Size, 0=Depth, %% STime, #state{proj=#projection_v1{upi=[_|_]}}=S) -> % UPI is non-empty -%% read_repair2(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, Depth + 1, +%% read_repair2(ConsistencyMode, ReturnMode, File, Offset, Size, Depth + 1, %% STime, S); -read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, Depth, +read_repair(ConsistencyMode, ReturnMode, File, Offset, Size, Depth, STime, #state{proj=P}=S) -> sleep_a_while(Depth), DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, @@ -676,26 +774,26 @@ read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, Depth, case S2#state.proj of P2 when P2 == undefined orelse P2#projection_v1.upi == [] -> - read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset, + read_repair(ConsistencyMode, ReturnMode, File, Offset, Size, Depth + 1, STime, S2); _ -> - read_repair2(ConsistencyMode, ReturnMode, NSInfo, File, Offset, + read_repair2(ConsistencyMode, ReturnMode, File, Offset, Size, Depth + 1, STime, S2) end end. read_repair2(cp_mode=ConsistencyMode, - ReturnMode, NSInfo, File, Offset, Size, Depth, STime, + ReturnMode, File, Offset, Size, Depth, STime, #state{proj=P, epoch_id=EpochID, proxies_dict=PD}=S) -> %% TODO WTF was I thinking here??.... 
Tail = lists:last(readonly_flus(P)), - case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), NSInfo, EpochID, - File, Offset, Size, undefined, ?TIMEOUT) of + case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), EpochID, + File, Offset, Size, [], ?TIMEOUT) of {ok, Chunks} when is_list(Chunks) -> %% TODO: change to {Chunks, Trimmed} and have them repaired ToRepair = mutation_flus(P) -- [Tail], {Reply, S1} = do_repair_chunks(Chunks, ToRepair, ReturnMode, - [Tail], NSInfo, File, Depth, STime, S, {ok, Chunks}), + [Tail], File, Depth, STime, S, {ok, Chunks}), {reply, Reply, S1}; %% {ok, BadChunk} -> %% exit({todo, bad_chunk_size, ?MODULE, ?LINE, File, Offset, @@ -705,7 +803,7 @@ read_repair2(cp_mode=ConsistencyMode, {reply, BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset, + read_repair(ConsistencyMode, ReturnMode, File, Offset, Size, Depth, STime, S); {error, not_written} -> {reply, {error, not_written}, S}; @@ -718,23 +816,24 @@ read_repair2(cp_mode=ConsistencyMode, exit({todo_should_repair_unlinked_files, ?MODULE, ?LINE, File}) end; read_repair2(ap_mode=ConsistencyMode, - ReturnMode, NSInfo, File, Offset, Size, Depth, STime, + ReturnMode, File, Offset, Size, Depth, STime, #state{proj=P}=S) -> Eligible = mutation_flus(P), - case try_to_find_chunk(Eligible, NSInfo, File, Offset, Size, S) of + case try_to_find_chunk(Eligible, File, Offset, Size, S) of {ok, {Chunks, _Trimmed}, GotItFrom} when is_list(Chunks) -> %% TODO: Repair trimmed chunks ToRepair = mutation_flus(P) -- [GotItFrom], - Reply = {ok, {Chunks, []}}, - {Reply, S1} = do_repair_chunks(Chunks, ToRepair, ReturnMode, [GotItFrom], - NSInfo, File, Depth, STime, S, Reply), + {Reply0, S1} = do_repair_chunks(Chunks, ToRepair, ReturnMode, [GotItFrom], + File, Depth, STime, S, {ok, Chunks}), + {ok, Chunks} = Reply0, + Reply = {ok, {Chunks, _Trimmed}}, {reply, Reply, S1}; {error, bad_checksum}=BadCS -> %% TODO: alternate 
strategy? {reply, BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - read_repair(ConsistencyMode, ReturnMode, NSInfo, File, + read_repair(ConsistencyMode, ReturnMode, File, Offset, Size, Depth, STime, S); {error, not_written} -> {reply, {error, not_written}, S}; @@ -746,22 +845,22 @@ read_repair2(ap_mode=ConsistencyMode, exit({todo_should_repair_unlinked_files, ?MODULE, ?LINE, File}) end. -do_repair_chunks([], _, _, _, _, _, _, _, S, Reply) -> +do_repair_chunks([], _, _, _, _, _, _, S, Reply) -> {Reply, S}; -do_repair_chunks([{_, Offset, Chunk, CSum}|T], - ToRepair, ReturnMode, [GotItFrom], NSInfo, File, Depth, STime, S, Reply) -> - true = not is_atom(CSum), +do_repair_chunks([{_, Offset, Chunk, _Csum}|T], + ToRepair, ReturnMode, [GotItFrom], File, Depth, STime, S, Reply) -> Size = iolist_size(Chunk), - case do_repair_chunk(ToRepair, ReturnMode, Chunk, CSum, [GotItFrom], NSInfo, File, Offset, + case do_repair_chunk(ToRepair, ReturnMode, Chunk, [GotItFrom], File, Offset, Size, Depth, STime, S) of - {reply, {ok, _}, S1} -> - do_repair_chunks(T, ToRepair, ReturnMode, [GotItFrom], NSInfo, File, Depth, STime, S1, Reply); + {ok, Chunk, S1} -> + do_repair_chunks(T, ToRepair, ReturnMode, [GotItFrom], File, Depth, STime, S1, Reply); Error -> Error end. 
-do_repair_chunk(ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, Offset, +do_repair_chunk(ToRepair, ReturnMode, Chunk, Repaired, File, Offset, Size, Depth, STime, #state{proj=P}=S) -> + %% io:format(user, "read_repair3 sleep1,", []), sleep_a_while(Depth), DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, if DiffMs > ?MAX_RUNTIME -> @@ -771,42 +870,42 @@ do_repair_chunk(ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, Offse case S2#state.proj of P2 when P2 == undefined orelse P2#projection_v1.upi == [] -> - do_repair_chunk(ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, + do_repair_chunk(ToRepair, ReturnMode, Chunk, Repaired, File, Offset, Size, Depth + 1, STime, S2); P2 -> ToRepair2 = mutation_flus(P2) -- Repaired, - do_repair_chunk2(ToRepair2, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, + do_repair_chunk2(ToRepair2, ReturnMode, Chunk, Repaired, File, Offset, Size, Depth + 1, STime, S2) end end. -do_repair_chunk2([], ReturnMode, Chunk, CSum, _Repaired, _NSInfo, File, Offset, +do_repair_chunk2([], ReturnMode, Chunk, _Repaired, File, Offset, _IgnoreSize, _Depth, _STime, S) -> %% TODO: add stats for # of repairs, length(_Repaired)-1, etc etc? 
case ReturnMode of read -> - {reply, {ok, {[{File, Offset, Chunk, CSum}], []}}, S}; + {ok, Chunk, S}; {append, Offset, Size, File} -> - {reply, {ok, {[{Offset, Size, File}], []}}, S} + {ok, {Offset, Size, File}, S} end; -do_repair_chunk2([First|Rest]=ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, Offset, +do_repair_chunk2([First|Rest]=ToRepair, ReturnMode, Chunk, Repaired, File, Offset, Size, Depth, STime, #state{epoch_id=EpochID, proxies_dict=PD}=S) -> Proxy = orddict:fetch(First, PD), - case ?FLU_PC:write_chunk(Proxy, NSInfo, EpochID, File, Offset, Chunk, CSum, ?TIMEOUT) of + case ?FLU_PC:write_chunk(Proxy, EpochID, File, Offset, Chunk, ?TIMEOUT) of ok -> - do_repair_chunk2(Rest, ReturnMode, Chunk, CSum, [First|Repaired], NSInfo, File, + do_repair_chunk2(Rest, ReturnMode, Chunk, [First|Repaired], File, Offset, Size, Depth, STime, S); {error, bad_checksum}=BadCS -> %% TODO: alternate strategy? {BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_repair_chunk(ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, + do_repair_chunk(ToRepair, ReturnMode, Chunk, Repaired, File, Offset, Size, Depth, STime, S); {error, written} -> %% TODO: To be very paranoid, read the chunk here to verify %% that it is exactly our Chunk. - do_repair_chunk2(Rest, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, + do_repair_chunk2(Rest, ReturnMode, Chunk, Repaired, File, Offset, Size, Depth, STime, S); {error, trimmed} = _Error -> %% TODO @@ -838,9 +937,9 @@ do_checksum_list(File, Depth, STime, TO, #state{proj=P}=S) -> end. 
do_checksum_list2(File, Depth, STime, TO, - #state{proj=P, proxies_dict=PD}=S) -> + #state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) -> Proxy = orddict:fetch(lists:last(readonly_flus(P)), PD), - case ?FLU_PC:checksum_list(Proxy, File, ?TIMEOUT) of + case ?FLU_PC:checksum_list(Proxy, EpochID, File, ?TIMEOUT) of {ok, _}=OK -> {reply, OK, S}; {error, Retry} @@ -926,13 +1025,11 @@ update_proj2(Count, #state{bad_proj=BadProj, proxies_dict=ProxiesDict, update_proj2(Count + 1, S); P when P >= BadProj -> #projection_v1{epoch_number=Epoch, epoch_csum=CSum, - members_dict=NewMembersDict, dbg2=Dbg2} = P, + members_dict=NewMembersDict} = P, EpochID = {Epoch, CSum}, ?FLU_PC:stop_proxies(ProxiesDict), NewProxiesDict = ?FLU_PC:start_proxies(NewMembersDict), - %% Make crash reports shorter by getting rid of 'react' history. - P2 = P#projection_v1{dbg2=lists:keydelete(react, 1, Dbg2)}, - S#state{bad_proj=undefined, proj=P2, epoch_id=EpochID, + S#state{bad_proj=undefined, proj=P, epoch_id=EpochID, members_dict=NewMembersDict, proxies_dict=NewProxiesDict}; _P -> sleep_a_while(Count), @@ -977,14 +1074,14 @@ choose_best_proj(Rs) -> BestProj end, ?WORST_PROJ, Rs). -try_to_find_chunk(Eligible, NSInfo, File, Offset, Size, +try_to_find_chunk(Eligible, File, Offset, Size, #state{epoch_id=EpochID, proxies_dict=PD}) -> Timeout = 2*1000, Work = fun(FLU) -> Proxy = orddict:fetch(FLU, PD), - case ?FLU_PC:read_chunk(Proxy, NSInfo, EpochID, + case ?FLU_PC:read_chunk(Proxy, EpochID, %% TODO Trimmed is required here - File, Offset, Size, undefined) of + File, Offset, Size, []) of {ok, {_Chunks, _} = ChunksAndTrimmed} -> {FLU, {ok, ChunksAndTrimmed}}; Else -> diff --git a/src/machi_csum_table.erl b/src/machi_csum_table.erl index cc4dd08..7ac79a2 100644 --- a/src/machi_csum_table.erl +++ b/src/machi_csum_table.erl @@ -1,20 +1,26 @@ -module(machi_csum_table). 
+%% @doc Object Database mapper that translates +%% (file, checksum, offset, size)|(trimmed-file) <-> LevelDB key and value +%% Keys and values are both encoded with sext. + -export([open/2, - find/3, - write/6, write/4, trim/5, - find_leftneighbor/2, find_rightneighbor/2, - all_trimmed/3, any_trimmed/3, - all_trimmed/2, - calc_unwritten_bytes/1, + find/4, + write/7, write/5, trim/6, + find_leftneighbor/3, find_rightneighbor/3, + is_file_trimmed/2, + all_trimmed/4, any_trimmed/4, + calc_unwritten_bytes/2, split_checksum_list_blob_decode/1, - all/1, - close/1, delete/1, - foldl_chunks/3]). + all/2, all_files/1, + close/1, maybe_trim_file/3, + foldl_file_chunks/4, foldl_chunks/3]). -include("machi.hrl"). -ifdef(TEST). +-export([all/2]). + -include_lib("eunit/include/eunit.hrl"). -endif. @@ -40,12 +46,9 @@ open(CSumFilename, _Opts) -> %% operating system's file cache, which is for %% Machi's main read efficiency {total_leveldb_mem_percent, 10}], + ok = filelib:ensure_dir(CSumFilename), + {ok, T} = eleveldb:open(CSumFilename, LevelDBOptions), - %% Dummy entry for reserved headers - ok = eleveldb:put(T, - sext:encode({0, ?MINIMUM_OFFSET}), - sext:encode(?CSUM_TAG_NONE_ATOM), - [{sync, true}]), C0 = #machi_csum_table{ file=CSumFilename, table=T}, @@ -55,61 +58,53 @@ open(CSumFilename, _Opts) -> split_checksum_list_blob_decode(Bin) -> erlang:binary_to_term(Bin). - -define(has_overlap(LeftOffset, LeftSize, RightOffset, RightSize), ((LeftOffset - (RightOffset+RightSize)) * (LeftOffset+LeftSize - RightOffset) < 0)). --spec find(table(), machi_dt:file_offset(), machi_dt:chunk_size()) +-spec find(table(), binary(), machi_dt:file_offset(), machi_dt:chunk_size()) -> [chunk()]. 
-find(#machi_csum_table{table=T}, Offset, Size) -> - {ok, I} = eleveldb:iterator(T, [], keys_only), - EndKey = sext:encode({Offset+Size, 0}), - StartKey = sext:encode({Offset, Size}), +find(#machi_csum_table{table=T}, Filename, Offset, Size) when is_binary(Filename) -> + EndKey = sext:encode({Filename, Offset+Size, 0}), - {ok, FirstKey} = case eleveldb:iterator_move(I, StartKey) of - {error, invalid_iterator} -> - eleveldb:iterator_move(I, first); - {ok, _} = R0 -> - case eleveldb:iterator_move(I, prev) of - {error, invalid_iterator} -> - R0; - {ok, _} = R1 -> - R1 - end - end, - _ = eleveldb:iterator_close(I), - FoldFun = fun({K, V}, Acc) -> - {TargetOffset, TargetSize} = sext:decode(K), - case ?has_overlap(TargetOffset, TargetSize, Offset, Size) of - true -> - [{TargetOffset, TargetSize, sext:decode(V)}|Acc]; - false -> + case search_for_start_key(T, Filename, Offset, Size) of + undefined -> []; + FirstKey -> + + FoldFun = fun({K, V}, Acc) -> + {Filename, TargetOffset, TargetSize} = sext:decode(K), + case ?has_overlap(TargetOffset, TargetSize, Offset, Size) of + true -> + [{TargetOffset, TargetSize, sext:decode(V)}|Acc]; + false -> + Acc + end; + (_K, Acc) -> + lager:error("~p wrong option", [_K]), Acc - end; - (_K, Acc) -> - lager:error("~p wrong option", [_K]), - Acc - end, - lists:reverse(eleveldb_fold(T, FirstKey, EndKey, FoldFun, [])). + end, + lists:reverse(eleveldb_fold(T, FirstKey, EndKey, FoldFun, [])) + end. %% @doc Updates all chunk info, by deleting existing entries if exists %% and putting new chunk info --spec write(table(), +-spec write(table(), binary(), machi_dt:file_offset(), machi_dt:chunk_size(), machi_dt:chunk_csum()|'none'|'trimmed', undefined|chunk(), undefined|chunk()) -> ok | {error, term()}. 
-write(#machi_csum_table{table=T} = CsumT, Offset, Size, CSum, - LeftUpdate, RightUpdate) -> +write(#machi_csum_table{table=T} = CsumT, Filename, + Offset, Size, CSum, LeftUpdate, RightUpdate) when is_binary(Filename) -> + FileEntry = {put, sext:encode({file, Filename}), sext:encode(existent)}, PutOps = [{put, - sext:encode({Offset, Size}), - sext:encode(CSum)}] + sext:encode({Filename, Offset, Size}), + sext:encode(CSum)}, + FileEntry] ++ case LeftUpdate of {LO, LS, LCsum} when LO + LS =:= Offset -> [{put, - sext:encode({LO, LS}), + sext:encode({Filename, LO, LS}), sext:encode(LCsum)}]; undefined -> [] @@ -117,58 +112,68 @@ write(#machi_csum_table{table=T} = CsumT, Offset, Size, CSum, ++ case RightUpdate of {RO, RS, RCsum} when RO =:= Offset + Size -> [{put, - sext:encode({RO, RS}), + sext:encode({Filename, RO, RS}), sext:encode(RCsum)}]; undefined -> [] end, - Chunks = find(CsumT, Offset, Size), + Chunks = find(CsumT, Filename, Offset, Size), DeleteOps = lists:map(fun({O, L, _}) -> - {delete, sext:encode({O, L})} + {delete, sext:encode({Filename, O, L})} end, Chunks), eleveldb:write(T, DeleteOps ++ PutOps, [{sync, true}]). --spec find_leftneighbor(table(), non_neg_integer()) -> +-spec find_leftneighbor(table(), binary(), non_neg_integer()) -> undefined | chunk(). -find_leftneighbor(CsumT, Offset) -> - case find(CsumT, Offset, 1) of +find_leftneighbor(CsumT, Filename, Offset) when is_binary(Filename) -> + case find(CsumT, Filename, Offset, 1) of [] -> undefined; [{Offset, _, _}] -> undefined; [{LOffset, _, CsumOrTrimmed}] -> {LOffset, Offset - LOffset, CsumOrTrimmed} end. --spec find_rightneighbor(table(), non_neg_integer()) -> +-spec find_rightneighbor(table(), binary(), non_neg_integer()) -> undefined | chunk(). 
-find_rightneighbor(CsumT, Offset) -> - case find(CsumT, Offset, 1) of +find_rightneighbor(CsumT, Filename, Offset) when is_binary(Filename) -> + case find(CsumT, Filename, Offset, 1) of [] -> undefined; [{Offset, _, _}] -> undefined; [{ROffset, RSize, CsumOrTrimmed}] -> {Offset, ROffset + RSize - Offset, CsumOrTrimmed} end. --spec write(table(), machi_dt:file_offset(), machi_dt:file_size(), +-spec write(table(), binary(), machi_dt:file_offset(), machi_dt:file_size(), machi_dt:chunk_csum()|none|trimmed) -> ok | {error, trimmed|file:posix()}. -write(CsumT, Offset, Size, CSum) -> - write(CsumT, Offset, Size, CSum, undefined, undefined). +write(CsumT, Filename, Offset, Size, CSum) -> + write(CsumT, Filename, Offset, Size, CSum, undefined, undefined). -trim(CsumT, Offset, Size, LeftUpdate, RightUpdate) -> - write(CsumT, Offset, Size, +trim(CsumT, Filename, Offset, Size, LeftUpdate, RightUpdate) -> + write(CsumT, Filename, Offset, Size, trimmed, %% Should this be much smaller like $t or just 't' LeftUpdate, RightUpdate). +-spec is_file_trimmed(table(), binary()) -> boolean(). +is_file_trimmed(#machi_csum_table{table=T}, Filename) when is_binary(Filename) -> + case eleveldb:get(T, sext:encode({file, Filename}), []) of + {ok, V} -> + (sext:decode(V) =:= ts); + _E -> + false + end. + %% @doc returns whether all bytes in a specific window is continously %% trimmed or not --spec all_trimmed(table(), non_neg_integer(), non_neg_integer()) -> boolean(). -all_trimmed(#machi_csum_table{table=T}, Left, Right) -> - FoldFun = fun({_, _}, false) -> +-spec all_trimmed(table(), binary(), non_neg_integer(), non_neg_integer()) -> boolean(). 
+all_trimmed(#machi_csum_table{table=T}, Filename, Left, Right) when is_binary(Filename) -> + FoldFun = fun({K, V}, false) -> false; ({K, V}, Pos) when is_integer(Pos) andalso Pos =< Right -> case {sext:decode(K), sext:decode(V)} of - {{Pos, Size}, trimmed} -> + {{file, _}, _} -> Pos; + {{Filename, Pos, Size}, trimmed} -> Pos + Size; - {{Offset, Size}, _} + {{Filename, Offset, Size}, _} when Offset + Size =< Left -> Left; _Eh -> @@ -176,65 +181,108 @@ all_trimmed(#machi_csum_table{table=T}, Left, Right) -> end end, case eleveldb:fold(T, FoldFun, Left, [{verify_checksums, true}]) of - false -> false; - Right -> true; - LastTrimmed when LastTrimmed < Right -> false; + false -> + false; + Right -> + true; + LastTrimmed when LastTrimmed < Right -> + false; _ -> %% LastTrimmed > Pos0, which is a irregular case but ok true end. -%% @doc returns whether all bytes 0-Pos0 is continously trimmed or -%% not, including header. --spec all_trimmed(table(), non_neg_integer()) -> boolean(). -all_trimmed(CsumT, Pos0) -> - all_trimmed(CsumT, 0, Pos0). - --spec any_trimmed(table(), +-spec any_trimmed(table(), binary(), pos_integer(), machi_dt:chunk_size()) -> boolean(). -any_trimmed(CsumT, Offset, Size) -> - Chunks = find(CsumT, Offset, Size), +any_trimmed(CsumT, Filename, Offset, Size) -> + Chunks = find(CsumT, Filename, Offset, Size), lists:any(fun({_, _, State}) -> State =:= trimmed end, Chunks). --spec calc_unwritten_bytes(table()) -> [byte_sequence()]. -calc_unwritten_bytes(#machi_csum_table{table=_} = CsumT) -> - case lists:sort(all(CsumT)) of +-spec calc_unwritten_bytes(table(), binary()) -> [byte_sequence()]. 
+calc_unwritten_bytes(#machi_csum_table{table=_} = CsumT, Filename) -> + case lists:sort(all(CsumT, Filename)) of [] -> - [{?MINIMUM_OFFSET, infinity}]; - Sorted -> - {LastOffset, _, _} = hd(Sorted), - build_unwritten_bytes_list(Sorted, LastOffset, []) + [{0, infinity}]; + [{0, _, _}|_] = Sorted -> + build_unwritten_bytes_list(Sorted, 0, []); + [{LastOffset, _, _}|_] = Sorted -> + build_unwritten_bytes_list(Sorted, LastOffset, [{0, LastOffset}]) end. -all(CsumT) -> +all(CsumT, Filename) -> FoldFun = fun(E, Acc) -> [E|Acc] end, - lists:reverse(foldl_chunks(FoldFun, [], CsumT)). + lists:reverse(foldl_file_chunks(FoldFun, [], CsumT, Filename)). + +all_files(#machi_csum_table{table=T}) -> + FoldFun = fun({K, V}, Acc) -> + case sext:decode(K) of + {file, Filename} -> + [{binary_to_list(Filename), sext:decode(V)}|Acc]; + _ -> + Acc + end; + (_, Acc) -> Acc + end, + eleveldb_fold(T, sext:encode({file, ""}), sext:encode({file, [255,255,255]}), + FoldFun, []). -spec close(table()) -> ok. close(#machi_csum_table{table=T}) -> ok = eleveldb:close(T). --spec delete(table()) -> ok. -delete(#machi_csum_table{table=T, file=F}) -> - catch eleveldb:close(T), - %% TODO change this to directory walk - case os:cmd("rm -rf " ++ F) of - "" -> ok; - E -> E + +-spec maybe_trim_file(table(), binary(), non_neg_integer()) -> + {ok, trimmed|not_trimmed} | {error, term()}. +maybe_trim_file(#machi_csum_table{table=T} = CsumT, Filename, EofP) when is_binary(Filename) -> + %% TODO: optimize; this code runs fold on eleveldb twice. + case all_trimmed(CsumT, Filename, 0, EofP) of + true -> + + Chunks = all(CsumT, Filename), + DeleteOps = lists:map(fun({O, L, _}) -> + {delete, sext:encode({Filename, O, L})} + end, Chunks), + FileTombstone = {put, sext:encode({file, Filename}), sext:encode(ts)}, + case eleveldb:write(T, [FileTombstone|DeleteOps], [{sync, true}]) of + ok -> {ok, trimmed}; + Other -> Other + end; + false -> + {ok, not_trimmed} end. 
--spec foldl_chunks(fun((chunk(), Acc0 :: term()) -> Acc :: term()), - Acc0 :: term(), table()) -> Acc :: term(). -foldl_chunks(Fun, Acc0, #machi_csum_table{table=T}) -> +%% @doc Folds over all chunks of a file +-spec foldl_file_chunks(fun((chunk(), Acc0 :: term()) -> Acc :: term()), + Acc0 :: term(), table(), binary()) -> Acc :: term(). +foldl_file_chunks(Fun, Acc0, #machi_csum_table{table=T}, Filename) when is_binary(Filename) -> FoldFun = fun({K, V}, Acc) -> - {Offset, Len} = sext:decode(K), + {Filename, Offset, Len} = sext:decode(K), Fun({Offset, Len, sext:decode(V)}, Acc); (_K, Acc) -> _ = lager:error("~p: wrong option?", [_K]), Acc end, + StartKey = {Filename, 0, 0}, + EndKey = { <>, 0, 0}, + eleveldb_fold(T, sext:encode(StartKey), sext:encode(EndKey), + FoldFun, Acc0). + + +%% @doc Folds over all chunks of all files +-spec foldl_chunks(fun((chunk(), Acc0 :: term()) -> Acc :: term()), + Acc0 :: term(), table()) -> Acc :: term(). +foldl_chunks(Fun, Acc0, #machi_csum_table{table=T}) -> + FoldFun = fun({K, V}, Acc) -> + {Filename, Offset, Len} = sext:decode(K), + Fun({Filename, Offset, Len, sext:decode(V)}, Acc); + (_K, Acc) -> + _ = lager:error("~p: wrong option?", [_K]), + Acc + end, eleveldb:fold(T, FoldFun, Acc0, [{verify_checksums, true}]). +%% == internal functions == + -spec build_unwritten_bytes_list( CsumData :: [{ Offset :: non_neg_integer(), Size :: pos_integer(), Checksum :: binary() }], @@ -298,3 +346,50 @@ eleveldb_do_fold({error, iterator_closed}, _, _, _, Acc) -> eleveldb_do_fold({error, invalid_iterator}, _, _, _, Acc) -> %% Probably reached to end Acc. 
+ + +%% Key1 < MaybeStartKey =< Key +%% FirstKey =< MaybeStartKey +search_for_start_key(T, Filename, Offset, Size) -> + MaybeStartKey = sext:encode({Filename, Offset, Size}), + FirstKey = sext:encode({Filename, 0, 0}), + {ok, I} = eleveldb:iterator(T, [], keys_only), + + try + case eleveldb:iterator_move(I, MaybeStartKey) of + {error, invalid_iterator} -> + %% No key in right - go for probably first key in the file + case eleveldb:iterator_move(I, FirstKey) of + {error, _} -> undefined; + {ok, Key0} -> goto_end(I, Key0, Offset) + end; + {ok, Key} when Key < FirstKey -> + FirstKey; + {ok, Key} -> + case eleveldb:iterator_move(I, prev) of + {error, invalid_iterator} -> + Key; + {ok, Key1} when Key1 < FirstKey -> + Key; + {ok, Key1} -> + Key1 + end + end + after + _ = eleveldb:iterator_close(I) + end. + +goto_end(I, Key, Offset) -> + case sext:decode(Key) of + {_Filename, O, L} when Offset =< O + L -> + Key; + {_Filename, O, L} when O + L < Offset -> + case eleveldb:iterator_move(I, next) of + {ok, NextKey} -> + goto_end(I, NextKey, Offset); + {error, _} -> + Key + end + end. + + diff --git a/src/machi_dt.erl b/src/machi_dt.erl index 0af3bb4..daf26dd 100644 --- a/src/machi_dt.erl +++ b/src/machi_dt.erl @@ -20,24 +20,18 @@ -module(machi_dt). --include("machi.hrl"). -include("machi_projection.hrl"). --type append_opts() :: #append_opts{}. --type chunk() :: chunk_bin() | iolist(). % client can choose either rep. --type chunk_bin() :: binary(). % server returns binary() only. --type chunk_csum() :: <<>> | chunk_csum_bin() | {csum_tag(), binary()}. --type chunk_csum_bin() :: binary(). % 1 byte tag, N-1 bytes checksum --type chunk_cstrm() :: 'trimmed' | chunk_csum(). --type chunk_summary() :: {file_offset(), chunk_size(), chunk_bin(), chunk_cstrm()}. +-type chunk() :: chunk_bin() | {chunk_csum(), chunk_bin()}. +-type chunk_bin() :: binary() | iolist(). % client can use either +-type chunk_csum() :: binary(). 
% 1 byte tag, N-1 bytes checksum +-type chunk_summary() :: {file_offset(), chunk_size(), binary()}. +-type chunk_s() :: 'trimmed' | binary(). -type chunk_pos() :: {file_offset(), chunk_size(), file_name_s()}. -type chunk_size() :: non_neg_integer(). - -%% Tags that stand for how that checksum was generated. See -%% machi_util:make_tagged_csum/{1,2} for further documentation and -%% implementation. --type csum_tag() :: none | client_sha | server_sha | server_regen_sha. - +-type coc_namespace() :: string(). +-type coc_nl() :: {coc, coc_namespace(), coc_locator()}. +-type coc_locator() :: non_neg_integer(). -type error_general() :: 'bad_arg' | 'wedged' | 'bad_checksum'. -type epoch_csum() :: binary(). -type epoch_num() :: -1 | non_neg_integer(). @@ -50,26 +44,26 @@ -type file_prefix() :: binary() | list(). -type inet_host() :: inet:ip_address() | inet:hostname(). -type inet_port() :: inet:port_number(). --type locator() :: number(). --type namespace() :: binary(). --type namespace_version() :: non_neg_integer(). --type ns_info() :: #ns_info{}. -type projection() :: #projection_v1{}. -type projection_type() :: 'public' | 'private'. --type read_opts() :: #read_opts{}. --type read_opts_x() :: 'undefined' | 'noopt' | 'none' | #read_opts{}. + +%% Tags that stand for how that checksum was generated. See +%% machi_util:make_tagged_csum/{1,2} for further documentation and +%% implementation. +-type csum_tag() :: none | client_sha | server_sha | server_regen_sha. -export_type([ - append_opts/0, chunk/0, chunk_bin/0, chunk_csum/0, - chunk_csum_bin/0, - chunk_cstrm/0, + csum_tag/0, chunk_summary/0, + chunk_s/0, chunk_pos/0, chunk_size/0, - csum_tag/0, + coc_namespace/0, + coc_nl/0, + coc_locator/0, error_general/0, epoch_csum/0, epoch_num/0, @@ -82,13 +76,7 @@ file_prefix/0, inet_host/0, inet_port/0, - locator/0, - namespace/0, - namespace_version/0, - ns_info/0, projection/0, - projection_type/0, - read_opts/0, - read_opts_x/0 + projection_type/0 ]). 
diff --git a/src/machi_file_proxy.erl b/src/machi_file_proxy.erl index bc9a539..e0a4f44 100644 --- a/src/machi_file_proxy.erl +++ b/src/machi_file_proxy.erl @@ -85,8 +85,6 @@ filename :: string() | undefined, data_path :: string() | undefined, wedged = false :: boolean(), - csum_file :: string()|undefined, - csum_path :: string()|undefined, data_filehandle :: file:io_device(), csum_table :: machi_csum_table:table(), eof_position = 0 :: non_neg_integer(), @@ -102,12 +100,14 @@ %% Public API -% @doc Start a new instance of the file proxy service. Takes the filename -% and data directory as arguments. This function is typically called by the -% `machi_file_proxy_sup:start_proxy/2' function. --spec start_link(FluName :: atom(), Filename :: string(), DataDir :: string()) -> any(). -start_link(FluName, Filename, DataDir) -> - gen_server:start_link(?MODULE, {FluName, Filename, DataDir}, []). +% @doc Start a new instance of the file proxy service. Takes the +% filename and data directory as arguments. This function is typically +% called by the `machi_file_proxy_sup:start_proxy/2' +% function. Checksum table is also passed at startup. +-spec start_link(Filename :: string(), + DataDir :: string(), CsumTable :: machi_csum_table:table()) -> any(). +start_link(Filename, DataDir, CsumTable) -> + gen_server:start_link(?MODULE, {Filename, DataDir, CsumTable}, []). % @doc Request to stop an instance of the file proxy service. -spec stop(Pid :: pid()) -> ok. @@ -141,18 +141,18 @@ sync(_Pid, Type) -> Data :: binary(), Checksum :: binary()}]} | {error, Reason :: term()}. read(Pid, Offset, Length) -> - read(Pid, Offset, Length, #read_opts{}). + read(Pid, Offset, Length, []). -spec read(Pid :: pid(), Offset :: non_neg_integer(), Length :: non_neg_integer(), - machi_dt:read_opts_x()) -> + [{no_checksum|no_chunk|needs_trimmed, boolean()}]) -> {ok, [{Filename::string(), Offset :: non_neg_integer(), Data :: binary(), Checksum :: binary()}]} | {error, Reason :: term()}. 
-read(Pid, Offset, Length, #read_opts{}=Opts) - when is_pid(Pid) andalso is_integer(Offset) andalso Offset >= 0 - andalso is_integer(Length) andalso Length > 0 -> +read(Pid, Offset, Length, Opts) when is_pid(Pid) andalso is_integer(Offset) andalso Offset >= 0 + andalso is_integer(Length) andalso Length > 0 + andalso is_list(Opts) -> gen_server:call(Pid, {read, Offset, Length, Opts}, ?TIMEOUT); read(_Pid, Offset, Length, Opts) -> lager:warning("Bad args to read: Offset ~p, Length ~p, Options ~p", [Offset, Length, Opts]), @@ -218,26 +218,24 @@ checksum_list(Pid) -> %% gen_server callbacks % @private -init({FluName, Filename, DataDir}) -> - CsumFile = machi_util:make_checksum_filename(DataDir, Filename), +init({Filename, DataDir, CsumTable}) -> {_, DPath} = machi_util:make_data_filename(DataDir, Filename), - ok = filelib:ensure_dir(CsumFile), ok = filelib:ensure_dir(DPath), - {ok, CsumTable} = machi_csum_table:open(CsumFile, []), - UnwrittenBytes = machi_csum_table:calc_unwritten_bytes(CsumTable), + CsumTable1 = case machi_csum_table:is_file_trimmed(CsumTable, list_to_binary(Filename)) of + false -> CsumTable; + true -> trimmed + end, + + UnwrittenBytes = machi_csum_table:calc_unwritten_bytes(CsumTable, iolist_to_binary(Filename)), {Eof, infinity} = lists:last(UnwrittenBytes), {ok, FHd} = file:open(DPath, [read, write, binary, raw]), - %% Reserve for EC and stuff, to prevent eof when read - ok = file:pwrite(FHd, 0, binary:copy(<<"so what?">>, ?MINIMUM_OFFSET div 8)), Tref = schedule_tick(), St = #state{ - fluname = FluName, filename = Filename, data_dir = DataDir, data_path = DPath, - csum_file = CsumFile, data_filehandle = FHd, - csum_table = CsumTable, + csum_table = CsumTable1, tref = Tref, eof_position = Eof, max_file_size = machi_config:max_file_size()}, @@ -281,6 +279,13 @@ handle_call({read, _Offset, _Length, _}, _From, }) -> {reply, {error, wedged}, State#state{writes = {T + 1, Err + 1}}}; +handle_call({read, _Offset, _Length, _Opts}, _From, + State = #state{ 
+ csum_table = trimmed, + reads = {T, Err} + }) -> + {reply, {error, trimmed}, State#state{reads = {T+1, Err+1}}}; + handle_call({read, Offset, Length, _Opts}, _From, State = #state{eof_position = Eof, reads = {T, Err} @@ -298,15 +303,15 @@ handle_call({read, Offset, Length, Opts}, _From, }) -> %% TODO: use these options - NoChunk prevents reading from disks %% NoChecksum doesn't check checksums - #read_opts{no_checksum=NoChecksum, no_chunk=NoChunk, - needs_trimmed=NeedsTrimmed} = Opts, + NoChecksum = proplists:get_value(no_checksum, Opts, false), + NoChunk = proplists:get_value(no_chunk, Opts, false), {Resp, NewErr} = case do_read(FH, F, CsumTable, Offset, Length, NoChunk, NoChecksum) of {ok, {[], []}} -> {{error, not_written}, Err + 1}; {ok, {Chunks0, Trimmed0}} -> Chunks = slice_both_side(Chunks0, Offset, Offset+Length), - Trimmed = case NeedsTrimmed of + Trimmed = case proplists:get_value(needs_trimmed, Opts, false) of true -> Trimmed0; false -> [] end, @@ -325,6 +330,11 @@ handle_call({write, _Offset, _ClientMeta, _Data}, _From, }) -> {reply, {error, wedged}, State#state{writes = {T + 1, Err + 1}}}; +handle_call({write, _, _, _}, _From, + State = #state{writes = {T, Err}, + csum_table = trimmed}) -> + {reply, {error, trimmed}, State#state{writes = {T + 1, Err + 1}}}; + handle_call({write, Offset, ClientMeta, Data}, _From, State = #state{filename = F, writes = {T, Err}, @@ -348,7 +358,7 @@ handle_call({write, Offset, ClientMeta, Data}, _From, {Error, Err + 1} end end, - {NewEof, infinity} = lists:last(machi_csum_table:calc_unwritten_bytes(CsumTable)), + {NewEof, infinity} = lists:last(machi_csum_table:calc_unwritten_bytes(CsumTable, iolist_to_binary(F))), lager:debug("Wrote ~p bytes at ~p of file ~p, NewEOF = ~p~n", [iolist_size(Data), Offset, F, NewEof]), {reply, Resp, State#state{writes = {T+1, NewErr}, @@ -365,35 +375,33 @@ handle_call({trim, _Offset, _ClientMeta, _Data}, _From, handle_call({trim, Offset, Size, _TriggerGC}, _From, State = 
#state{data_filehandle=FHd, + filename=Filename, ops = Ops, trims = {T, Err}, csum_table = CsumTable}) -> - case machi_csum_table:all_trimmed(CsumTable, Offset, Offset+Size) of - true -> - NewState = State#state{ops=Ops+1, trims={T, Err+1}}, - %% All bytes of that range was already trimmed returns ok - %% here, not {error, trimmed}, which means the whole file - %% was trimmed + F = iolist_to_binary(Filename), + LUpdate = maybe_regenerate_checksum( + FHd, + machi_csum_table:find_leftneighbor(CsumTable, + F, + Offset)), + RUpdate = maybe_regenerate_checksum( + FHd, + machi_csum_table:find_rightneighbor(CsumTable, + F, + Offset+Size)), + + case machi_csum_table:trim(CsumTable, F, Offset, + Size, LUpdate, RUpdate) of + ok -> + {NewEof, infinity} = lists:last(machi_csum_table:calc_unwritten_bytes(CsumTable, F)), + NewState = State#state{ops=Ops+1, + trims={T+1, Err}, + eof_position=NewEof}, maybe_gc(ok, NewState); - false -> - LUpdate = maybe_regenerate_checksum( - FHd, - machi_csum_table:find_leftneighbor(CsumTable, Offset)), - RUpdate = maybe_regenerate_checksum( - FHd, - machi_csum_table:find_rightneighbor(CsumTable, Offset+Size)), - - case machi_csum_table:trim(CsumTable, Offset, Size, LUpdate, RUpdate) of - ok -> - {NewEof, infinity} = lists:last(machi_csum_table:calc_unwritten_bytes(CsumTable)), - NewState = State#state{ops=Ops+1, - trims={T+1, Err}, - eof_position=NewEof}, - maybe_gc(ok, NewState); - Error -> - {reply, Error, State#state{ops=Ops+1, trims={T, Err+1}}} - end + Error -> + {reply, Error, State#state{ops=Ops+1, trims={T, Err+1}}} end; %% APPENDS @@ -435,8 +443,9 @@ handle_call({append, ClientMeta, Extra, Data}, _From, {reply, Resp, State#state{appends = {T+1, NewErr}, eof_position = NewEof}}; -handle_call({checksum_list}, _FRom, State = #state{csum_table=T}) -> - All = machi_csum_table:all(T), +handle_call({checksum_list}, _FRom, State = #state{filename=Filename, + csum_table=T}) -> + All = machi_csum_table:all(T,iolist_to_binary(Filename)), {reply, 
{ok, All}, State}; handle_call(Req, _From, State) -> @@ -528,7 +537,6 @@ handle_info(Req, State) -> % @private terminate(Reason, #state{filename = F, data_filehandle = FHd, - csum_table = T, reads = {RT, RE}, writes = {WT, WE}, appends = {AT, AE} @@ -544,14 +552,7 @@ terminate(Reason, #state{filename = F, _ -> ok = file:sync(FHd), ok = file:close(FHd) - end, - case T of - undefined -> - noop; %% file deleted - _ -> - ok = machi_csum_table:close(T) - end, - ok. + end. % @private code_change(_OldVsn, State, _Extra) -> @@ -622,7 +623,8 @@ check_or_make_tagged_csum(OtherTag, _ClientCsum, _Data) -> do_read(FHd, Filename, CsumTable, Offset, Size, _, _) -> %% Note that find/3 only returns overlapping chunks, both borders %% are not aligned to original Offset and Size. - ChunkCsums = machi_csum_table:find(CsumTable, Offset, Size), + ChunkCsums = machi_csum_table:find(CsumTable, iolist_to_binary(Filename), + Offset, Size), read_all_ranges(FHd, Filename, ChunkCsums, [], []). -spec read_all_ranges(file:io_device(), string(), @@ -700,7 +702,7 @@ read_all_ranges(FHd, Filename, [{Offset, Size, TaggedCsum}|T], ReadChunks, Trimm handle_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Data) -> Size = iolist_size(Data), - case machi_csum_table:find(CsumTable, Offset, Size) of + case machi_csum_table:find(CsumTable, iolist_to_binary(Filename), Offset, Size) of [] -> %% Nothing should be there try do_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Size, Data) @@ -723,6 +725,7 @@ handle_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Data) -> ok; {ok, _Other} -> %% TODO: leave some debug/warning message here? 
+ io:format(user, "baposdifa;lsdfkj<<<<<<<~n", []), {error, written} end; [{Offset, Size, OtherCsum}] -> @@ -731,11 +734,13 @@ handle_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Data) -> " a check for unwritten bytes gave us checksum ~p" " but the data we were trying to write has checksum ~p", [Offset, Filename, OtherCsum, TaggedCsum]), + io:format(user, "baposdifa;lsdfkj*************8~n", []), {error, written}; _Chunks -> %% TODO: Do we try to read all continuous chunks to see %% wether its total checksum matches client-provided checksum? - case machi_csum_table:any_trimmed(CsumTable, Offset, Size) of + case machi_csum_table:any_trimmed(CsumTable, iolist_to_binary(Filename), + Offset, Size) of true -> %% More than a byte is trimmed, besides, do we %% have to return exact written bytes? No. Clients @@ -744,6 +749,7 @@ handle_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Data) -> {error, trimmed}; false -> %% No byte is trimmed, but at least one byte is written + io:format(user, "baposdifa;lsdfkj*************8 ~p~n", [_Chunks]), {error, written} end end. 
@@ -761,6 +767,7 @@ handle_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Data) -> do_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Size, Data) -> case file:pwrite(FHd, Offset, Data) of ok -> + F = iolist_to_binary(Filename), lager:debug("Successful write in file ~p at offset ~p, length ~p", [Filename, Offset, Size]), @@ -769,11 +776,15 @@ do_write(FHd, CsumTable, Filename, TaggedCsum, Offset, Size, Data) -> %% as server_sha LUpdate = maybe_regenerate_checksum( FHd, - machi_csum_table:find_leftneighbor(CsumTable, Offset)), + machi_csum_table:find_leftneighbor(CsumTable, + F, + Offset)), RUpdate = maybe_regenerate_checksum( FHd, - machi_csum_table:find_rightneighbor(CsumTable, Offset+Size)), - ok = machi_csum_table:write(CsumTable, Offset, Size, + machi_csum_table:find_rightneighbor(CsumTable, + F, + Offset+Size)), + ok = machi_csum_table:write(CsumTable, F, Offset, Size, TaggedCsum, LUpdate, RUpdate), lager:debug("Successful write to checksum file for ~p", [Filename]), @@ -838,32 +849,27 @@ maybe_gc(Reply, S = #state{eof_position = Eof, lager:debug("The file is still small; not trying GC (Eof, MaxFileSize) = (~p, ~p)~n", [Eof, MaxFileSize]), {reply, Reply, S}; -maybe_gc(Reply, S = #state{fluname=FluName, - data_filehandle = FHd, +maybe_gc(Reply, S = #state{data_filehandle = FHd, data_dir = DataDir, filename = Filename, eof_position = Eof, csum_table=CsumTable}) -> - case machi_csum_table:all_trimmed(CsumTable, ?MINIMUM_OFFSET, Eof) of - true -> - lager:debug("GC? Let's do it: ~p.~n", [Filename]), - %% Before unlinking a file, it should inform - %% machi_flu_filename_mgr that this file is - %% deleted and mark it as "trimmed" to avoid - %% filename reuse and resurrection. Maybe garbage - %% will remain if a process crashed but it also - %% should be recovered at filename_mgr startup. + lager:debug("GC? Let's try it: ~p.~n", [Filename]), - %% Also, this should be informed *before* file proxy - %% deletes files. 
- ok = machi_flu_metadata_mgr:trim_file(FluName, {file, Filename}), + case machi_csum_table:maybe_trim_file(CsumTable, iolist_to_binary(Filename), Eof) of + {ok, trimmed} -> + %% Checksum table entries are all trimmed now, unlinking + %% file from operating system ok = file:close(FHd), {_, DPath} = machi_util:make_data_filename(DataDir, Filename), ok = file:delete(DPath), - machi_csum_table:delete(CsumTable), - {stop, normal, Reply, - S#state{data_filehandle=undefined, - csum_table=undefined}}; - false -> + lager:info("File ~s has been unlinked as all chunks" + " were trimmed.", [Filename]), + {stop, normal, Reply, S#state{data_filehandle=undefined}}; + {ok, not_trimmed} -> + {reply, Reply, S}; + {error, _} = Error -> + lager:error("machi_csum_table:maybe_trim_file/4 has been " + "unexpectedly failed: ~p", [Error]), {reply, Reply, S} end. diff --git a/src/machi_file_proxy_sup.erl b/src/machi_file_proxy_sup.erl index a165a68..7301b54 100644 --- a/src/machi_file_proxy_sup.erl +++ b/src/machi_file_proxy_sup.erl @@ -44,8 +44,9 @@ start_link(FluName) -> supervisor:start_link({local, make_proxy_name(FluName)}, ?MODULE, []). start_proxy(FluName, DataDir, Filename) -> + {ok, CsumTable} = machi_flu_filename_mgr:get_csum_table(FluName), supervisor:start_child(make_proxy_name(FluName), - [FluName, Filename, DataDir]). + [Filename, DataDir, CsumTable]). 
init([]) -> SupFlags = {simple_one_for_one, 1000, 10}, diff --git a/src/machi_fitness.erl b/src/machi_fitness.erl index 70af62a..2b54244 100644 --- a/src/machi_fitness.erl +++ b/src/machi_fitness.erl @@ -108,7 +108,6 @@ handle_call({update_local_down_list, Down, MembersDict}, _From, #state{my_flu_name=MyFluName, pending_map=OldMap, local_down=OldDown, members_dict=OldMembersDict, admin_down=AdminDown}=S) -> - verbose("FITNESS: ~w has down suspect ~w\n", [MyFluName, Down]), NewMap = store_in_map(OldMap, MyFluName, erlang:now(), Down, AdminDown, [props_yo]), S2 = if Down == OldDown, MembersDict == OldMembersDict -> @@ -120,17 +119,13 @@ handle_call({update_local_down_list, Down, MembersDict}, _From, end, {reply, ok, S2#state{local_down=Down}}; handle_call({add_admin_down, DownFLU, DownProps}, _From, - #state{my_flu_name=MyFluName, - local_down=OldDown, admin_down=AdminDown}=S) -> - verbose("FITNESS: ~w add admin down ~w\n", [MyFluName, DownFLU]), + #state{local_down=OldDown, admin_down=AdminDown}=S) -> NewAdminDown = [{DownFLU,DownProps}|lists:keydelete(DownFLU, 1, AdminDown)], S3 = finish_admin_down(erlang:now(), OldDown, NewAdminDown, [props_yo], S), {reply, ok, S3}; handle_call({delete_admin_down, DownFLU}, _From, - #state{my_flu_name=MyFluName, - local_down=OldDown, admin_down=AdminDown}=S) -> - verbose("FITNESS: ~w delete admin down ~w\n", [MyFluName, DownFLU]), + #state{local_down=OldDown, admin_down=AdminDown}=S) -> NewAdminDown = lists:keydelete(DownFLU, 1, AdminDown), S3 = finish_admin_down(erlang:now(), OldDown, NewAdminDown, [props_yo], S), @@ -148,8 +143,7 @@ handle_call(_Request, _From, S) -> handle_cast(_Msg, S) -> {noreply, S}. 
-handle_info({adjust_down_list, FLU}, #state{my_flu_name=MyFluName, - active_unfit=ActiveUnfit}=S) -> +handle_info({adjust_down_list, FLU}, #state{active_unfit=ActiveUnfit}=S) -> NewUnfit = make_unfit_list(S), Added_to_new = NewUnfit -- ActiveUnfit, Dropped_from_new = ActiveUnfit -- NewUnfit, @@ -190,11 +184,9 @@ handle_info({adjust_down_list, FLU}, #state{my_flu_name=MyFluName, {true, true} -> error({bad, ?MODULE, ?LINE, FLU, ActiveUnfit, NewUnfit}); {true, false} -> - NewActive = wrap_active(MyFluName,lists:usort(ActiveUnfit++[FLU])), - {noreply, S#state{active_unfit=NewActive}}; + {noreply, S#state{active_unfit=lists:usort(ActiveUnfit ++ [FLU])}}; {false, true} -> - NewActive = wrap_active(MyFluName,ActiveUnfit--[FLU]), - {noreply, S#state{active_unfit=NewActive}}; + {noreply, S#state{active_unfit=ActiveUnfit -- [FLU]}}; {false, false} -> {noreply, S} end; @@ -432,18 +424,6 @@ map_value(Map) -> map_merge(Map1, Map2) -> ?MAP:merge(Map1, Map2). -wrap_active(MyFluName, L) -> - verbose("FITNESS: ~w has new down list ~w\n", [MyFluName, L]), - L. - -verbose(Fmt, Args) -> - case application:get_env(machi, fitness_verbose) of - {ok, true} -> - error_logger:info_msg(Fmt, Args); - _ -> - ok - end. - -ifdef(TEST). dt_understanding_test() -> diff --git a/src/machi_flu1.erl b/src/machi_flu1.erl index 8a33a04..b75d955 100644 --- a/src/machi_flu1.erl +++ b/src/machi_flu1.erl @@ -129,8 +129,7 @@ main2(FluName, TcpPort, DataDir, Props) -> ok end, {ok, ListenerPid} = start_listen_server(FluName, TcpPort, Witness_p, DataDir, - ets_table_name(FluName), ProjectionPid, - Props), + ets_table_name(FluName), ProjectionPid), %% io:format(user, "Listener started: ~w~n", [{FluName, ListenerPid}]), Config_e = machi_util:make_config_filename(DataDir, "unused"), @@ -155,10 +154,9 @@ main2(FluName, TcpPort, DataDir, Props) -> start_append_server(FluName, Witness_p, Wedged_p, EpochId) -> machi_flu1_subsup:start_append_server(FluName, Witness_p, Wedged_p, EpochId). 
-start_listen_server(FluName, TcpPort, Witness_p, DataDir, EtsTab, ProjectionPid, - Props) -> +start_listen_server(FluName, TcpPort, Witness_p, DataDir, EtsTab, ProjectionPid) -> machi_flu1_subsup:start_listener(FluName, TcpPort, Witness_p, DataDir, - EtsTab, ProjectionPid, Props). + EtsTab, ProjectionPid). %% This is the name of the projection store that is spawned by the %% *flu*, for use primarily in testing scenarios. In normal use, we diff --git a/src/machi_flu1_append_server.erl b/src/machi_flu1_append_server.erl index a484410..a7b029c 100644 --- a/src/machi_flu1_append_server.erl +++ b/src/machi_flu1_append_server.erl @@ -82,25 +82,25 @@ init([Fluname, Witness_p, Wedged_p, EpochId]) -> {ok, #state{flu_name=Fluname, witness=Witness_p, wedged=Wedged_p, etstab=TID, epoch_id=EpochId}}. -handle_call({seq_append, _From2, _NSInfo, _EpochID, _Prefix, _Chunk, _TCSum, _Opts}, +handle_call({seq_append, _From2, _N, _L, _Prefix, _Chunk, _CSum, _Extra, _EpochID}, _From, #state{witness=true}=S) -> %% The FLU's machi_flu1_net_server process ought to filter all %% witness states, but we'll keep this clause for extra %% paranoia. {reply, witness, S}; -handle_call({seq_append, _From2, _NSInfo, _EpochID, _Prefix, _Chunk, _TCSum, _Opts}, +handle_call({seq_append, _From2, _N, _L, _Prefix, _Chunk, _CSum, _Extra, _EpochID}, _From, #state{wedged=true}=S) -> {reply, wedged, S}; -handle_call({seq_append, _From2, NSInfo, EpochID, - Prefix, Chunk, TCSum, Opts}, +handle_call({seq_append, _From2, CoC_Namespace, CoC_Locator, + Prefix, Chunk, CSum, Extra, EpochID}, From, #state{flu_name=FluName, epoch_id=OldEpochId}=S) -> %% Old is the one from our state, plain old 'EpochID' comes %% from the client. 
_ = case OldEpochId of EpochID -> spawn(fun() -> - append_server_dispatch(From, NSInfo, - Prefix, Chunk, TCSum, Opts, + append_server_dispatch(From, CoC_Namespace, CoC_Locator, + Prefix, Chunk, CSum, Extra, FluName, EpochID) end), {noreply, S}; @@ -161,10 +161,10 @@ terminate(Reason, _S) -> code_change(_OldVsn, S, _Extra) -> {ok, S}. -append_server_dispatch(From, NSInfo, - Prefix, Chunk, TCSum, Opts, FluName, EpochId) -> - Result = case handle_append(NSInfo, - Prefix, Chunk, TCSum, Opts, FluName, EpochId) of +append_server_dispatch(From, CoC_Namespace, CoC_Locator, + Prefix, Chunk, CSum, Extra, FluName, EpochId) -> + Result = case handle_append(CoC_Namespace, CoC_Locator, + Prefix, Chunk, CSum, Extra, FluName, EpochId) of {ok, File, Offset} -> {assignment, Offset, File}; Other -> @@ -173,17 +173,19 @@ append_server_dispatch(From, NSInfo, _ = gen_server:reply(From, Result), ok. -handle_append(NSInfo, - Prefix, Chunk, TCSum, Opts, FluName, EpochId) -> +handle_append(_N, _L, _Prefix, <<>>, _Csum, _Extra, _FluName, _EpochId) -> + {error, bad_arg}; +handle_append(CoC_Namespace, CoC_Locator, + Prefix, Chunk, Csum, Extra, FluName, EpochId) -> + CoC = {coc, CoC_Namespace, CoC_Locator}, Res = machi_flu_filename_mgr:find_or_make_filename_from_prefix( - FluName, EpochId, {prefix, Prefix}, NSInfo), + FluName, EpochId, {prefix, Prefix}, CoC), case Res of {file, F} -> case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, F}) of {ok, Pid} -> - {Tag, CS} = machi_util:unmake_tagged_csum(TCSum), + {Tag, CS} = machi_util:unmake_tagged_csum(Csum), Meta = [{client_csum_tag, Tag}, {client_csum, CS}], - Extra = Opts#append_opts.chunk_extra, machi_file_proxy:append(Pid, Meta, Extra, Chunk); {error, trimmed} = E -> E diff --git a/src/machi_flu1_client.erl b/src/machi_flu1_client.erl index 37e6d5a..e5b65fc 100644 --- a/src/machi_flu1_client.erl +++ b/src/machi_flu1_client.erl @@ -38,71 +38,6 @@ %% TODO This EDoc was written first, and the EDoc and also `-type' and %% `-spec' 
definitions for {@link machi_proxy_flu1_client} and {@link %% machi_cr_client} must be improved. -%% -%% == Client API implementation notes == -%% -%% At the moment, there are several modules that implement various -%% subsets of the Machi API. The table below attempts to show how and -%% why they differ. -%% -%% ``` -%% |--------------------------+-------+-----+------+------+-------+----------------| -%% | | PB | | # | | Conn | Epoch & NS | -%% | Module name | Level | CR? | FLUS | Impl | Life? | version aware? | -%% |--------------------------+-------+-----+------+------+-------+----------------| -%% | machi_pb_high_api_client | high | yes | many | proc | long | no | -%% | machi_cr_client | low | yes | many | proc | long | no | -%% | machi_proxy_flu1_client | low | no | 1 | proc | long | yes | -%% | machi_flu1_client | low | no | 1 | lib | short | yes | -%% |--------------------------+-------+-----+------+------+-------+----------------| -%% ''' -%% -%% In terms of use and API layering, the table rows are in highest`->'lowest -%% order: each level calls the layer immediately below it. -%% -%%
-%%
PB Level
-%%
The Protocol Buffers API is divided logically into two levels, -%% "low" and "high". The low-level protocol is used for intra-chain -%% communication. The high-level protocol is used for clients outside -%% of a Machi chain or Machi cluster of chains. -%%
-%%
CR?
-%%
Does this API support (directly or indirectly) Chain -%% Replication? If `no', then the API has no awareness of multiple -%% replicas of any file or file chunk; unaware clients can only -%% perform operations at a single Machi FLU's file service or -%% projection store service. -%%
-%%
# FLUs
-%%
Now many FLUs does this API layer communicate with -%% simultaneously? Note that there is a one-to-one correspondence -%% between this value and the "CR?" column's value. -%%
-%%
Impl
-%%
Implementation: library-only or an Erlang process, -%% e.g., `gen_server'. -%%
-%%
Conn Life?
-%%
Expected TCP session connection life: short or long. At the -%% lowest level, the {@link machi_flu1_client} API implementation takes -%% no effort to reconnect to a remote FLU when its single TCP session -%% is broken. For long-lived connection life APIs, the server side will -%% automatically attempt to reconnect to remote FLUs when a TCP session -%% is broken. -%%
-%%
Epoch & NS version aware?
-%%
Are clients of this API responsible for knowing a chain's EpochID -%% and namespace version numbers? If `no', then the server side of the -%% API will automatically attempt to discover/re-discover the EpochID and -%% namespace version numbers whenever they change. -%%
-%%
-%% -%% The only protocol that we expect to be used by entities outside of -%% a single Machi chain or a multi-chain cluster is the "high" -%% Protocol Buffers API. The {@link riak_pb_high_api_client} module -%% is an Erlang reference implementation of this PB API. -module(machi_flu1_client). @@ -115,15 +50,16 @@ -include_lib("pulse_otp/include/pulse_otp.hrl"). -endif. --define(SHORT_TIMEOUT, 2500). --define(LONG_TIMEOUT, (60*1000)). +-define(HARD_TIMEOUT, 2500). -export([ %% File API + append_chunk/4, append_chunk/5, append_chunk/6, append_chunk/7, - append_chunk/8, append_chunk/9, - read_chunk/7, read_chunk/8, - checksum_list/2, checksum_list/3, + append_chunk_extra/5, append_chunk_extra/6, + append_chunk_extra/7, append_chunk_extra/8, + read_chunk/6, read_chunk/7, + checksum_list/3, checksum_list/4, list_files/2, list_files/3, wedge_status/1, wedge_status/2, @@ -145,113 +81,190 @@ ]). %% For "internal" replication only. -export([ - write_chunk/7, write_chunk/8, - trim_chunk/6, + write_chunk/5, write_chunk/6, + trim_chunk/5, delete_migration/3, delete_migration/4, trunc_hack/3, trunc_hack/4 ]). -type port_wrap() :: {w,atom(),term()}. --spec append_chunk(port_wrap(), - 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), - machi_dt:file_prefix(), machi_dt:chunk(), - machi_dt:chunk_csum()) -> +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix'. + +-spec append_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_prefix(), machi_dt:chunk()) -> {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. -append_chunk(Sock, NSInfo, EpochID, Prefix, Chunk, CSum) -> - append_chunk(Sock, NSInfo, EpochID, Prefix, Chunk, CSum, - #append_opts{}, ?LONG_TIMEOUT). +append_chunk(Sock, EpochID, Prefix, Chunk) -> + append_chunk2(Sock, EpochID, + ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, + Prefix, Chunk, 0). 
%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix' and also request an additional `Extra' bytes. -%% -%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then -%% the file offsets that follow `Chunk''s position for the following 4K will -%% be reserved by the file sequencer for later write(s) by the -%% `write_chunk()' API. +%% with `Prefix'. -spec append_chunk(machi_dt:inet_host(), machi_dt:inet_port(), - 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), - machi_dt:file_prefix(), machi_dt:chunk(), - machi_dt:chunk_csum()) -> + machi_dt:epoch_id(), machi_dt:file_prefix(), machi_dt:chunk()) -> {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. -append_chunk(Host, TcpPort, NSInfo, EpochID, Prefix, Chunk, CSum) -> - append_chunk(Host, TcpPort, NSInfo, EpochID, Prefix, Chunk, CSum, - #append_opts{}, ?LONG_TIMEOUT). - --spec append_chunk(port_wrap(), - 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), - machi_dt:file_prefix(), machi_dt:chunk(), - machi_dt:chunk_csum(), machi_dt:append_opts(), timeout()) -> - {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. -append_chunk(Sock, NSInfo0, EpochID, Prefix, Chunk, CSum, Opts, Timeout) -> - NSInfo = machi_util:ns_info_default(NSInfo0), - append_chunk2(Sock, NSInfo, EpochID, Prefix, Chunk, CSum, Opts, Timeout). - -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix' and also request an additional `Extra' bytes. -%% -%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then -%% the file offsets that follow `Chunk''s position for the following 4K will -%% be reserved by the file sequencer for later write(s) by the -%% `write_chunk()' API. 
- --spec append_chunk(machi_dt:inet_host(), machi_dt:inet_port(), - 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), - machi_dt:file_prefix(), machi_dt:chunk(), - machi_dt:chunk_csum(), machi_dt:append_opts(), timeout()) -> - {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. -append_chunk(Host, TcpPort, NSInfo0, EpochID, - Prefix, Chunk, CSum, Opts, Timeout) -> +append_chunk(Host, TcpPort, EpochID, Prefix, Chunk) -> Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), try - NSInfo = machi_util:ns_info_default(NSInfo0), - append_chunk2(Sock, NSInfo, EpochID, - Prefix, Chunk, CSum, Opts, Timeout) + append_chunk2(Sock, EpochID, + ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, + Prefix, Chunk, 0) + after + disconnect(Sock) + end. + +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix'. + +-spec append_chunk(port_wrap(), machi_dt:epoch_id(), + machi_dt:coc_namespace(), machi_dt:coc_locator(), + machi_dt:file_prefix(), machi_dt:chunk()) -> + {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. +append_chunk(Sock, EpochID, CoC_Namespace, CoC_Locator, Prefix, Chunk) -> + append_chunk2(Sock, EpochID, + CoC_Namespace, CoC_Locator, + Prefix, Chunk, 0). + +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix'. + +-spec append_chunk(machi_dt:inet_host(), machi_dt:inet_port(), + machi_dt:epoch_id(), + machi_dt:coc_namespace(), machi_dt:coc_locator(), + machi_dt:file_prefix(), machi_dt:chunk()) -> + {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. +append_chunk(Host, TcpPort, EpochID, CoC_Namespace, CoC_Locator, Prefix, Chunk) -> + Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), + try + append_chunk2(Sock, EpochID, + CoC_Namespace, CoC_Locator, + Prefix, Chunk, 0) + after + disconnect(Sock) + end. 
+ +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix' and also request an additional `Extra' bytes. +%% +%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then +%% the file offsets that follow `Chunk''s position for the following 4K will +%% be reserved by the file sequencer for later write(s) by the +%% `write_chunk()' API. + +-spec append_chunk_extra(port_wrap(), machi_dt:epoch_id(), + machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) -> + {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. +append_chunk_extra(Sock, EpochID, Prefix, Chunk, ChunkExtra) + when is_integer(ChunkExtra), ChunkExtra >= 0 -> + append_chunk2(Sock, EpochID, + ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, + Prefix, Chunk, ChunkExtra). + +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix' and also request an additional `Extra' bytes. +%% +%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then +%% the file offsets that follow `Chunk''s position for the following 4K will +%% be reserved by the file sequencer for later write(s) by the +%% `write_chunk()' API. + +-spec append_chunk_extra(machi_dt:inet_host(), machi_dt:inet_port(), + machi_dt:epoch_id(), machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) -> + {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. +append_chunk_extra(Host, TcpPort, EpochID, Prefix, Chunk, ChunkExtra) + when is_integer(ChunkExtra), ChunkExtra >= 0 -> + Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), + try + append_chunk2(Sock, EpochID, + ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, + Prefix, Chunk, ChunkExtra) + after + disconnect(Sock) + end. + +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix' and also request an additional `Extra' bytes. 
+%% +%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then +%% the file offsets that follow `Chunk''s position for the following 4K will +%% be reserved by the file sequencer for later write(s) by the +%% `write_chunk()' API. + +-spec append_chunk_extra(port_wrap(), machi_dt:epoch_id(), + machi_dt:coc_namespace(), machi_dt:coc_locator(), + machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) -> + {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. +append_chunk_extra(Sock, EpochID, CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra) + when is_integer(ChunkExtra), ChunkExtra >= 0 -> + append_chunk2(Sock, EpochID, + CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra). + +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix' and also request an additional `Extra' bytes. +%% +%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then +%% the file offsets that follow `Chunk''s position for the following 4K will +%% be reserved by the file sequencer for later write(s) by the +%% `write_chunk()' API. + +-spec append_chunk_extra(machi_dt:inet_host(), machi_dt:inet_port(), + machi_dt:epoch_id(), + machi_dt:coc_namespace(), machi_dt:coc_locator(), + machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) -> + {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. +append_chunk_extra(Host, TcpPort, EpochID, + CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra) + when is_integer(ChunkExtra), ChunkExtra >= 0 -> + Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), + try + append_chunk2(Sock, EpochID, + CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra) after disconnect(Sock) end. %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. 
--spec read_chunk(port_wrap(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size(), - machi_dt:read_opts_x()) -> - {ok, {[machi_dt:chunk_summary()], [machi_dt:chunk_pos()]}} | +-spec read_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size(), + proplists:proplist()) -> + {ok, machi_dt:chunk_s()} | {error, machi_dt:error_general() | 'not_written' | 'partial_read'} | {error, term()}. -read_chunk(Sock, NSInfo0, EpochID, File, Offset, Size, Opts0) +read_chunk(Sock, EpochID, File, Offset, Size, Opts) when Offset >= ?MINIMUM_OFFSET, Size >= 0 -> - NSInfo = machi_util:ns_info_default(NSInfo0), - Opts = machi_util:read_opts_default(Opts0), - read_chunk2(Sock, NSInfo, EpochID, File, Offset, Size, Opts). + read_chunk2(Sock, EpochID, File, Offset, Size, Opts). %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. --spec read_chunk(machi_dt:inet_host(), machi_dt:inet_port(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), +-spec read_chunk(machi_dt:inet_host(), machi_dt:inet_port(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size(), - machi_dt:read_opts_x()) -> - {ok, [machi_dt:chunk_summary()]} | + proplists:proplist()) -> + {ok, machi_dt:chunk_s()} | {error, machi_dt:error_general() | 'not_written' | 'partial_read'} | {error, term()}. -read_chunk(Host, TcpPort, NSInfo0, EpochID, File, Offset, Size, Opts0) +read_chunk(Host, TcpPort, EpochID, File, Offset, Size, Opts) when Offset >= ?MINIMUM_OFFSET, Size >= 0 -> Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), - NSInfo = machi_util:ns_info_default(NSInfo0), - Opts = machi_util:read_opts_default(Opts0), try - read_chunk2(Sock, NSInfo, EpochID, File, Offset, Size, Opts) + read_chunk2(Sock, EpochID, File, Offset, Size, Opts) after disconnect(Sock) end. %% @doc Fetch the list of chunk checksums for `File'. 
--spec checksum_list(port_wrap(), machi_dt:file_name()) -> +-spec checksum_list(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name()) -> {ok, binary()} | {error, machi_dt:error_general() | 'no_such_file' | 'partial_read'} | {error, term()}. -checksum_list(Sock, File) -> - checksum_list2(Sock, File). +checksum_list(Sock, EpochID, File) -> + checksum_list2(Sock, EpochID, File). %% @doc Fetch the list of chunk checksums for `File'. %% @@ -275,13 +288,13 @@ checksum_list(Sock, File) -> %% Details of the encoding used inside the `binary()' blog can be found %% in the EDoc comments for {@link machi_flu1:decode_csum_file_entry/1}. --spec checksum_list(machi_dt:inet_host(), machi_dt:inet_port(), machi_dt:file_name()) -> +-spec checksum_list(machi_dt:inet_host(), machi_dt:inet_port(), machi_dt:epoch_id(), machi_dt:file_name()) -> {ok, binary()} | {error, machi_dt:error_general() | 'no_such_file'} | {error, term()}. -checksum_list(Host, TcpPort, File) when is_integer(TcpPort) -> +checksum_list(Host, TcpPort, EpochID, File) when is_integer(TcpPort) -> Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), try - checksum_list2(Sock, File) + checksum_list2(Sock, EpochID, File) after disconnect(Sock) end. @@ -308,7 +321,7 @@ list_files(Host, TcpPort, EpochID) when is_integer(TcpPort) -> %% @doc Fetch the wedge status from the remote FLU. -spec wedge_status(port_wrap()) -> - {ok, {boolean(), machi_dt:epoch_id(), machi_dt:namespace_version(),machi_dt:namespace()}} | {error, term()}. + {ok, {boolean(), machi_dt:epoch_id()}} | {error, term()}. wedge_status(Sock) -> wedge_status2(Sock). @@ -316,7 +329,7 @@ wedge_status(Sock) -> %% @doc Fetch the wedge status from the remote FLU. -spec wedge_status(machi_dt:inet_host(), machi_dt:inet_port()) -> - {ok, {boolean(), machi_dt:epoch_id(), machi_dt:namespace_version(),machi_dt:namespace()}} | {error, term()}. + {ok, {boolean(), machi_dt:epoch_id()}} | {error, term()}. 
wedge_status(Host, TcpPort) when is_integer(TcpPort) -> Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), try @@ -527,25 +540,23 @@ disconnect(_) -> %% @doc Restricted API: Write a chunk of already-sequenced data to %% `File' at `Offset'. --spec write_chunk(port_wrap(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk(), machi_dt:chunk_csum()) -> +-spec write_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk()) -> ok | {error, machi_dt:error_general()} | {error, term()}. -write_chunk(Sock, NSInfo0, EpochID, File, Offset, Chunk, CSum) +write_chunk(Sock, EpochID, File, Offset, Chunk) when Offset >= ?MINIMUM_OFFSET -> - NSInfo = machi_util:ns_info_default(NSInfo0), - write_chunk2(Sock, NSInfo, EpochID, File, Offset, Chunk, CSum). + write_chunk2(Sock, EpochID, File, Offset, Chunk). %% @doc Restricted API: Write a chunk of already-sequenced data to %% `File' at `Offset'. -spec write_chunk(machi_dt:inet_host(), machi_dt:inet_port(), - 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk(), machi_dt:chunk_csum()) -> + machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk()) -> ok | {error, machi_dt:error_general()} | {error, term()}. -write_chunk(Host, TcpPort, NSInfo0, EpochID, File, Offset, Chunk, CSum) +write_chunk(Host, TcpPort, EpochID, File, Offset, Chunk) when Offset >= ?MINIMUM_OFFSET -> Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), try - NSInfo = machi_util:ns_info_default(NSInfo0), - write_chunk2(Sock, NSInfo, EpochID, File, Offset, Chunk, CSum) + write_chunk2(Sock, EpochID, File, Offset, Chunk) after disconnect(Sock) end. @@ -553,18 +564,16 @@ write_chunk(Host, TcpPort, NSInfo0, EpochID, File, Offset, Chunk, CSum) %% @doc Restricted API: Write a chunk of already-sequenced data to %% `File' at `Offset'. 
--spec trim_chunk(port_wrap(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size()) -> +-spec trim_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size()) -> ok | {error, machi_dt:error_general()} | {error, term()}. -trim_chunk(Sock, NSInfo0, EpochID, File0, Offset, Size) +trim_chunk(Sock, EpochID, File0, Offset, Size) when Offset >= ?MINIMUM_OFFSET -> ReqID = <<"id">>, - NSInfo = machi_util:ns_info_default(NSInfo0), - #ns_info{version=NSVersion, name=NS} = NSInfo, File = machi_util:make_binary(File0), true = (Offset >= ?MINIMUM_OFFSET), Req = machi_pb_translate:to_pb_request( ReqID, - {low_trim_chunk, NSVersion, NS, EpochID, File, Offset, Size, 0}), + {low_trim_chunk, EpochID, File, Offset, Size, 0}), do_pb_request_common(Sock, ReqID, Req). %% @doc Restricted API: Delete a file after it has been successfully @@ -611,88 +620,83 @@ trunc_hack(Host, TcpPort, EpochID, File) when is_integer(TcpPort) -> %%%%%%%%%%%%%%%%%%%%%%%%%%% -read_chunk2(Sock, NSInfo, EpochID, File0, Offset, Size, Opts) -> +read_chunk2(Sock, EpochID, File0, Offset, Size, Opts) -> ReqID = <<"id">>, - #ns_info{version=NSVersion, name=NS} = NSInfo, File = machi_util:make_binary(File0), Req = machi_pb_translate:to_pb_request( ReqID, - {low_read_chunk, NSVersion, NS, EpochID, File, Offset, Size, Opts}), + {low_read_chunk, EpochID, File, Offset, Size, Opts}), do_pb_request_common(Sock, ReqID, Req). 
-append_chunk2(Sock, NSInfo, EpochID, - Prefix0, Chunk, CSum0, Opts, Timeout) -> +append_chunk2(Sock, EpochID, CoC_Namespace, CoC_Locator, + Prefix0, Chunk0, ChunkExtra) -> ReqID = <<"id">>, + {Chunk, CSum_tag, CSum} = + case Chunk0 of + X when is_binary(X) -> + {Chunk0, ?CSUM_TAG_NONE, <<>>}; + {ChunkCSum, Chk} -> + {Tag, CS} = machi_util:unmake_tagged_csum(ChunkCSum), + {Chk, Tag, CS} + end, Prefix = machi_util:make_binary(Prefix0), - {CSum_tag, CSum} = case CSum0 of - <<>> -> - {?CSUM_TAG_NONE, <<>>}; - {_Tag, _CS} -> - CSum0; - B when is_binary(B) -> - machi_util:unmake_tagged_csum(CSum0) - end, - #ns_info{version=NSVersion, name=NS, locator=NSLocator} = NSInfo, - %% NOTE: The tuple position of NSLocator is a bit odd, because EpochID - %% _must_ be in the 4th position (as NSV & NS must be in 2nd & 3rd). Req = machi_pb_translate:to_pb_request( ReqID, - {low_append_chunk, NSVersion, NS, EpochID, NSLocator, - Prefix, Chunk, CSum_tag, CSum, Opts}), - do_pb_request_common(Sock, ReqID, Req, true, Timeout). + {low_append_chunk, EpochID, CoC_Namespace, CoC_Locator, + Prefix, Chunk, CSum_tag, CSum, ChunkExtra}), + do_pb_request_common(Sock, ReqID, Req). 
-write_chunk2(Sock, NSInfo, EpochID, File0, Offset, Chunk, CSum0) -> +write_chunk2(Sock, EpochID, File0, Offset, Chunk0) -> ReqID = <<"id">>, - #ns_info{version=NSVersion, name=NS} = NSInfo, File = machi_util:make_binary(File0), true = (Offset >= ?MINIMUM_OFFSET), - {CSum_tag, CSum} = case CSum0 of - <<>> -> - {?CSUM_TAG_NONE, <<>>}; - {_Tag, _CS} -> - CSum0; - B when is_binary(B) -> - machi_util:unmake_tagged_csum(CSum0) - end, + {Chunk, CSum_tag, CSum} = + case Chunk0 of + X when is_binary(X) -> + {Chunk0, ?CSUM_TAG_NONE, <<>>}; + {ChunkCSum, Chk} -> + {Tag, CS} = machi_util:unmake_tagged_csum(ChunkCSum), + {Chk, Tag, CS} + end, Req = machi_pb_translate:to_pb_request( ReqID, - {low_write_chunk, NSVersion, NS, EpochID, File, Offset, Chunk, CSum_tag, CSum}), + {low_write_chunk, EpochID, File, Offset, Chunk, CSum_tag, CSum}), do_pb_request_common(Sock, ReqID, Req). list2(Sock, EpochID) -> ReqID = <<"id">>, Req = machi_pb_translate:to_pb_request( - ReqID, {low_skip_wedge, {low_list_files, EpochID}}), + ReqID, {low_list_files, EpochID}), do_pb_request_common(Sock, ReqID, Req). wedge_status2(Sock) -> ReqID = <<"id">>, Req = machi_pb_translate:to_pb_request( - ReqID, {low_skip_wedge, {low_wedge_status}}), + ReqID, {low_wedge_status, undefined}), do_pb_request_common(Sock, ReqID, Req). echo2(Sock, Message) -> ReqID = <<"id">>, Req = machi_pb_translate:to_pb_request( - ReqID, {low_skip_wedge, {low_echo, Message}}), + ReqID, {low_echo, undefined, Message}), do_pb_request_common(Sock, ReqID, Req). -checksum_list2(Sock, File) -> +checksum_list2(Sock, EpochID, File) -> ReqID = <<"id">>, Req = machi_pb_translate:to_pb_request( - ReqID, {low_skip_wedge, {low_checksum_list, File}}), + ReqID, {low_checksum_list, EpochID, File}), do_pb_request_common(Sock, ReqID, Req). 
delete_migration2(Sock, EpochID, File) -> ReqID = <<"id">>, Req = machi_pb_translate:to_pb_request( - ReqID, {low_skip_wedge, {low_delete_migration, EpochID, File}}), + ReqID, {low_delete_migration, EpochID, File}), do_pb_request_common(Sock, ReqID, Req). trunc_hack2(Sock, EpochID, File) -> ReqID = <<"id-trunc">>, Req = machi_pb_translate:to_pb_request( - ReqID, {low_skip_wedge, {low_trunc_hack, EpochID, File}}), + ReqID, {low_trunc_hack, EpochID, File}), do_pb_request_common(Sock, ReqID, Req). get_latest_epochid2(Sock, ProjType) -> @@ -735,18 +739,18 @@ kick_projection_reaction2(Sock, _Options) -> ReqID = <<42>>, Req = machi_pb_translate:to_pb_request( ReqID, {low_proj, {kick_projection_reaction}}), - do_pb_request_common(Sock, ReqID, Req, false, ?LONG_TIMEOUT). + do_pb_request_common(Sock, ReqID, Req, false). do_pb_request_common(Sock, ReqID, Req) -> - do_pb_request_common(Sock, ReqID, Req, true, ?LONG_TIMEOUT). + do_pb_request_common(Sock, ReqID, Req, true). -do_pb_request_common(Sock, ReqID, Req, GetReply_p, Timeout) -> +do_pb_request_common(Sock, ReqID, Req, GetReply_p) -> erase(bad_sock), try ReqBin = list_to_binary(machi_pb:encode_mpb_ll_request(Req)), ok = w_send(Sock, ReqBin), if GetReply_p -> - case w_recv(Sock, 0, Timeout) of + case w_recv(Sock, 0) of {ok, RespBin} -> Resp = machi_pb:decode_mpb_ll_response(RespBin), {ReqID2, Reply} = machi_pb_translate:from_pb_response(Resp), @@ -792,7 +796,7 @@ w_connect(#p_srvr{proto_mod=?MODULE, address=Host, port=Port, props=Props}=_P)-> case proplists:get_value(session_proto, Props, tcp) of tcp -> put(xxx, goofus), - Sock = machi_util:connect(Host, Port, ?SHORT_TIMEOUT), + Sock = machi_util:connect(Host, Port, ?HARD_TIMEOUT), put(xxx, Sock), ok = inet:setopts(Sock, ?PB_PACKET_OPTS), {w,tcp,Sock}; @@ -816,8 +820,8 @@ w_close({w,tcp,Sock}) -> catch gen_tcp:close(Sock), ok. -w_recv({w,tcp,Sock}, Amt, Timeout) -> - gen_tcp:recv(Sock, Amt, Timeout). +w_recv({w,tcp,Sock}, Amt) -> + gen_tcp:recv(Sock, Amt, ?HARD_TIMEOUT). 
w_send({w,tcp,Sock}, IoData) -> gen_tcp:send(Sock, IoData). diff --git a/src/machi_flu1_net_server.erl b/src/machi_flu1_net_server.erl index ed3d980..029a5dd 100644 --- a/src/machi_flu1_net_server.erl +++ b/src/machi_flu1_net_server.erl @@ -66,25 +66,19 @@ flu_name :: pv1_server(), %% Used in server_wedge_status to lookup the table epoch_tab :: ets:tab(), - %% Clustering: cluster map version number - namespace_version = 0 :: machi_dt:namespace_version(), - %% Clustering: my (and my chain's) assignment to a specific namespace - namespace = <<>> :: machi_dt:namespace(), %% High mode only high_clnt :: pid(), %% anything you want - props = [] :: proplists:proplist() + props = [] :: list() % proplist }). -type socket() :: any(). -type state() :: #state{}. -spec start_link(ranch:ref(), socket(), module(), [term()]) -> {ok, pid()}. -start_link(Ref, Socket, Transport, [FluName, Witness, DataDir, EpochTab, ProjStore, Props]) -> - NS = proplists:get_value(namespace, Props, <<>>), - true = is_binary(NS), +start_link(Ref, Socket, Transport, [FluName, Witness, DataDir, EpochTab, ProjStore]) -> proc_lib:start_link(?MODULE, init, [#state{ref=Ref, socket=Socket, transport=Transport, @@ -92,9 +86,7 @@ start_link(Ref, Socket, Transport, [FluName, Witness, DataDir, EpochTab, ProjSto witness=Witness, data_dir=DataDir, epoch_tab=EpochTab, - proj_store=ProjStore, - namespace=NS, - props=Props}]). + proj_store=ProjStore}]). -spec init(state()) -> no_return(). 
init(#state{ref=Ref, socket=Socket, transport=Transport}=State) -> @@ -217,51 +209,44 @@ do_pb_ll_request(#mpb_ll_request{req_id=ReqID}, #state{pb_mode=high}=S) -> {machi_pb_translate:to_pb_response(ReqID, unused, Result), S}; do_pb_ll_request(PB_request, S) -> Req = machi_pb_translate:from_pb_request(PB_request), + %% io:format(user, "[~w] do_pb_ll_request Req: ~w~n", [S#state.flu_name, Req]), {ReqID, Cmd, Result, S2} = case Req of - {RqID, {low_skip_wedge, LowSubCmd}=Cmd0} -> + {RqID, {LowCmd, _}=Cmd0} + when LowCmd =:= low_proj; + LowCmd =:= low_wedge_status; + LowCmd =:= low_list_files -> %% Skip wedge check for these unprivileged commands - {Rs, NewS} = do_pb_ll_request3(LowSubCmd, S), - {RqID, Cmd0, Rs, NewS}; - {RqID, {low_proj, _LowSubCmd}=Cmd0} -> {Rs, NewS} = do_pb_ll_request3(Cmd0, S), {RqID, Cmd0, Rs, NewS}; {RqID, Cmd0} -> - %% All remaining must have NSVersion, NS, & EpochID at next pos - NSVersion = element(2, Cmd0), - NS = element(3, Cmd0), - EpochID = element(4, Cmd0), - {Rs, NewS} = do_pb_ll_request2(NSVersion, NS, EpochID, Cmd0, S), + EpochID = element(2, Cmd0), % by common convention + {Rs, NewS} = do_pb_ll_request2(EpochID, Cmd0, S), {RqID, Cmd0, Rs, NewS} end, {machi_pb_translate:to_pb_response(ReqID, Cmd, Result), S2}. -%% do_pb_ll_request2(): Verification of epoch details & namespace details. 
- -do_pb_ll_request2(NSVersion, NS, EpochID, CMD, S) -> +do_pb_ll_request2(EpochID, CMD, S) -> {Wedged_p, CurrentEpochID} = lookup_epoch(S), - if not is_tuple(EpochID) orelse tuple_size(EpochID) /= 2 -> - exit({bad_epoch_id, EpochID, for, CMD}); - Wedged_p == true -> + %% io:format(user, "{Wedged_p, CurrentEpochID}: ~w~n", [{Wedged_p, CurrentEpochID}]), + if Wedged_p == true -> {{error, wedged}, S#state{epoch_id=CurrentEpochID}}; + is_tuple(EpochID) + andalso EpochID /= CurrentEpochID -> {Epoch, _} = EpochID, {CurrentEpoch, _} = CurrentEpochID, if Epoch < CurrentEpoch -> - {{error, bad_epoch}, S}; + ok; true -> + %% We're at same epoch # but different checksum, or + %% we're at a newer/bigger epoch #. _ = machi_flu1:wedge_myself(S#state.flu_name, CurrentEpochID), - {{error, wedged}, S#state{epoch_id=CurrentEpochID}} - end; + ok + end, + {{error, bad_epoch}, S#state{epoch_id=CurrentEpochID}}; true -> - #state{namespace_version=MyNSVersion, namespace=MyNS} = S, - if NSVersion /= MyNSVersion -> - {{error, bad_epoch}, S}; - NS /= MyNS -> - {{error, bad_arg}, S}; - true -> - do_pb_ll_request3(CMD, S) - end + do_pb_ll_request3(CMD, S#state{epoch_id=CurrentEpochID}) end. lookup_epoch(#state{epoch_tab=T}) -> @@ -269,35 +254,34 @@ lookup_epoch(#state{epoch_tab=T}) -> ets:lookup_element(T, epoch, 2). %% Witness status does not matter below. 
-do_pb_ll_request3({low_echo, Msg}, S) -> +do_pb_ll_request3({low_echo, _BogusEpochID, Msg}, S) -> {Msg, S}; -do_pb_ll_request3({low_auth, _User, _Pass}, S) -> +do_pb_ll_request3({low_auth, _BogusEpochID, _User, _Pass}, S) -> {-6, S}; -do_pb_ll_request3({low_wedge_status}, S) -> +do_pb_ll_request3({low_wedge_status, _EpochID}, S) -> {do_server_wedge_status(S), S}; do_pb_ll_request3({low_proj, PCMD}, S) -> {do_server_proj_request(PCMD, S), S}; %% Witness status *matters* below -do_pb_ll_request3({low_append_chunk, NSVersion, NS, EpochID, NSLocator, +do_pb_ll_request3({low_append_chunk, _EpochID, CoC_Namespace, CoC_Locator, Prefix, Chunk, CSum_tag, - CSum, Opts}, + CSum, ChunkExtra}, #state{witness=false}=S) -> - NSInfo = #ns_info{version=NSVersion, name=NS, locator=NSLocator}, - {do_server_append_chunk(NSInfo, EpochID, + {do_server_append_chunk(CoC_Namespace, CoC_Locator, Prefix, Chunk, CSum_tag, CSum, - Opts, S), S}; -do_pb_ll_request3({low_write_chunk, _NSVersion, _NS, _EpochID, File, Offset, Chunk, CSum_tag, + ChunkExtra, S), S}; +do_pb_ll_request3({low_write_chunk, _EpochID, File, Offset, Chunk, CSum_tag, CSum}, #state{witness=false}=S) -> {do_server_write_chunk(File, Offset, Chunk, CSum_tag, CSum, S), S}; -do_pb_ll_request3({low_read_chunk, _NSVersion, _NS, _EpochID, File, Offset, Size, Opts}, +do_pb_ll_request3({low_read_chunk, _EpochID, File, Offset, Size, Opts}, #state{witness=false} = S) -> {do_server_read_chunk(File, Offset, Size, Opts, S), S}; -do_pb_ll_request3({low_trim_chunk, _NSVersion, _NS, _EpochID, File, Offset, Size, TriggerGC}, +do_pb_ll_request3({low_trim_chunk, _EpochID, File, Offset, Size, TriggerGC}, #state{witness=false}=S) -> {do_server_trim_chunk(File, Offset, Size, TriggerGC, S), S}; -do_pb_ll_request3({low_checksum_list, File}, +do_pb_ll_request3({low_checksum_list, _EpochID, File}, #state{witness=false}=S) -> {do_server_checksum_listing(File, S), S}; do_pb_ll_request3({low_list_files, _EpochID}, @@ -350,27 +334,27 @@ 
do_server_proj_request({kick_projection_reaction}, end), async_no_response. -do_server_append_chunk(NSInfo, EpochID, +do_server_append_chunk(CoC_Namespace, CoC_Locator, Prefix, Chunk, CSum_tag, CSum, - Opts, S) -> + ChunkExtra, S) -> case sanitize_prefix(Prefix) of ok -> - do_server_append_chunk2(NSInfo, EpochID, + do_server_append_chunk2(CoC_Namespace, CoC_Locator, Prefix, Chunk, CSum_tag, CSum, - Opts, S); + ChunkExtra, S); _ -> {error, bad_arg} end. -do_server_append_chunk2(NSInfo, EpochID, +do_server_append_chunk2(CoC_Namespace, CoC_Locator, Prefix, Chunk, CSum_tag, Client_CSum, - Opts, #state{flu_name=FluName, - epoch_id=EpochID}=_S) -> + ChunkExtra, #state{flu_name=FluName, + epoch_id=EpochID}=_S) -> %% TODO: Do anything with PKey? try TaggedCSum = check_or_make_tagged_checksum(CSum_tag, Client_CSum,Chunk), - R = {seq_append, self(), NSInfo, EpochID, - Prefix, Chunk, TaggedCSum, Opts}, + R = {seq_append, self(), CoC_Namespace, CoC_Locator, + Prefix, Chunk, TaggedCSum, ChunkExtra, EpochID}, case gen_server:call(FluName, R, 10*1000) of {assignment, Offset, File} -> Size = iolist_size(Chunk), @@ -408,17 +392,14 @@ do_server_write_chunk(File, Offset, Chunk, CSum_tag, CSum, #state{flu_name=FluNa do_server_read_chunk(File, Offset, Size, Opts, #state{flu_name=FluName})-> case sanitize_file_string(File) of ok -> - case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, File}) of - {ok, Pid} -> - case machi_file_proxy:read(Pid, Offset, Size, Opts) of - %% XXX FIXME - %% For now we are omiting the checksum data because it blows up - %% protobufs. - {ok, ChunksAndTrimmed} -> {ok, ChunksAndTrimmed}; - Other -> Other - end; - {error, trimmed} = Error -> - Error + {ok, Pid} = machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, File}), + case machi_file_proxy:read(Pid, Offset, Size, Opts) of + %% XXX FIXME + %% For now we are omiting the checksum data because it blows up + %% protobufs. 
+ {ok, ChunksAndTrimmed} -> {ok, ChunksAndTrimmed}; + Other -> + Other end; _ -> {error, bad_arg} @@ -473,14 +454,14 @@ do_server_list_files(#state{data_dir=DataDir}=_S) -> {Size, File} end || File <- Files]}. -do_server_wedge_status(#state{namespace_version=NSVersion, namespace=NS}=S) -> +do_server_wedge_status(S) -> {Wedged_p, CurrentEpochID0} = lookup_epoch(S), CurrentEpochID = if CurrentEpochID0 == undefined -> ?DUMMY_PV1_EPOCH; true -> CurrentEpochID0 end, - {Wedged_p, CurrentEpochID, NSVersion, NS}. + {Wedged_p, CurrentEpochID}. do_server_delete_migration(File, #state{data_dir=DataDir}=_S) -> case sanitize_file_string(File) of @@ -579,30 +560,26 @@ do_pb_hl_request2({high_echo, Msg}, S) -> {Msg, S}; do_pb_hl_request2({high_auth, _User, _Pass}, S) -> {-77, S}; -do_pb_hl_request2({high_append_chunk=Op, NS, Prefix, Chunk, TaggedCSum, Opts}, - #state{high_clnt=Clnt}=S) -> - NSInfo = #ns_info{name=NS}, % TODO populate other fields - todo_perhaps_remind_ns_locator_not_chosen(Op), - Res = machi_cr_client:append_chunk(Clnt, NSInfo, - Prefix, Chunk, TaggedCSum, Opts), +do_pb_hl_request2({high_append_chunk, CoC_Namespace, CoC_Locator, + Prefix, ChunkBin, TaggedCSum, + ChunkExtra}, #state{high_clnt=Clnt}=S) -> + Chunk = {TaggedCSum, ChunkBin}, + Res = machi_cr_client:append_chunk_extra(Clnt, CoC_Namespace, CoC_Locator, + Prefix, Chunk, + ChunkExtra), {Res, S}; -do_pb_hl_request2({high_write_chunk=Op, File, Offset, Chunk, CSum}, +do_pb_hl_request2({high_write_chunk, File, Offset, ChunkBin, TaggedCSum}, #state{high_clnt=Clnt}=S) -> - NSInfo = undefined, - todo_perhaps_remind_ns_locator_not_chosen(Op), - Res = machi_cr_client:write_chunk(Clnt, NSInfo, File, Offset, Chunk, CSum), + Chunk = {TaggedCSum, ChunkBin}, + Res = machi_cr_client:write_chunk(Clnt, File, Offset, Chunk), {Res, S}; -do_pb_hl_request2({high_read_chunk=Op, File, Offset, Size, Opts}, +do_pb_hl_request2({high_read_chunk, File, Offset, Size, Opts}, #state{high_clnt=Clnt}=S) -> - NSInfo = undefined, - 
todo_perhaps_remind_ns_locator_not_chosen(Op), - Res = machi_cr_client:read_chunk(Clnt, NSInfo, File, Offset, Size, Opts), + Res = machi_cr_client:read_chunk(Clnt, File, Offset, Size, Opts), {Res, S}; -do_pb_hl_request2({high_trim_chunk=Op, File, Offset, Size}, +do_pb_hl_request2({high_trim_chunk, File, Offset, Size}, #state{high_clnt=Clnt}=S) -> - NSInfo = undefined, - todo_perhaps_remind_ns_locator_not_chosen(Op), - Res = machi_cr_client:trim_chunk(Clnt, NSInfo, File, Offset, Size), + Res = machi_cr_client:trim_chunk(Clnt, File, Offset, Size), {Res, S}; do_pb_hl_request2({high_checksum_list, File}, #state{high_clnt=Clnt}=S) -> Res = machi_cr_client:checksum_list(Clnt, File), @@ -620,15 +597,3 @@ make_high_clnt(#state{high_clnt=undefined}=S) -> S#state{high_clnt=Clnt}; make_high_clnt(S) -> S. - -todo_perhaps_remind_ns_locator_not_chosen(Op) -> - Key = {?MODULE, Op}, - case get(Key) of - undefined -> - io:format(user, "TODO op ~w is using default locator value\n", - [Op]), - put(Key, true); - _ -> - ok - end. - diff --git a/src/machi_flu1_subsup.erl b/src/machi_flu1_subsup.erl index 566c118..21fd6f5 100644 --- a/src/machi_flu1_subsup.erl +++ b/src/machi_flu1_subsup.erl @@ -36,7 +36,7 @@ -export([start_link/1, start_append_server/4, stop_append_server/1, - start_listener/7, + start_listener/6, stop_listener/1, subsup_name/1, listener_name/1]). @@ -67,13 +67,11 @@ stop_append_server(FluName) -> ok = supervisor:delete_child(SubSup, FluName). -spec start_listener(pv1_server(), inet:port_number(), boolean(), - string(), ets:tab(), atom() | pid(), - proplists:proplist()) -> {ok, pid()}. -start_listener(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore, - Props) -> + string(), ets:tab(), atom() | pid()) -> {ok, pid()}. +start_listener(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore) -> supervisor:start_child(subsup_name(FluName), listener_spec(FluName, TcpPort, Witness, DataDir, - EpochTab, ProjStore, Props)). + EpochTab, ProjStore)). 
-spec stop_listener(pv1_server()) -> ok. stop_listener(FluName) -> @@ -99,13 +97,12 @@ init([]) -> %% private -spec listener_spec(pv1_server(), inet:port_number(), boolean(), - string(), ets:tab(), atom() | pid(), - proplists:proplist()) -> supervisor:child_spec(). -listener_spec(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore, Props) -> + string(), ets:tab(), atom() | pid()) -> supervisor:child_spec(). +listener_spec(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore) -> ListenerName = listener_name(FluName), NbAcceptors = 10, TcpOpts = [{port, TcpPort}, {backlog, ?BACKLOG}], - NetServerOpts = [FluName, Witness, DataDir, EpochTab, ProjStore, Props], + NetServerOpts = [FluName, Witness, DataDir, EpochTab, ProjStore], ranch:child_spec(ListenerName, NbAcceptors, ranch_tcp, TcpOpts, machi_flu1_net_server, NetServerOpts). diff --git a/src/machi_flu_filename_mgr.erl b/src/machi_flu_filename_mgr.erl index b25d146..2b08071 100644 --- a/src/machi_flu_filename_mgr.erl +++ b/src/machi_flu_filename_mgr.erl @@ -48,13 +48,12 @@ -compile(export_all). -endif. --export([ - child_spec/2, - start_link/2, - find_or_make_filename_from_prefix/4, - increment_prefix_sequence/3, - list_files_by_prefix/2 - ]). +-export([child_spec/2, + start_link/2, + find_or_make_filename_from_prefix/4, + increment_prefix_sequence/3, + list_files_by_prefix/2, + get_csum_table/1]). %% gen_server callbacks -export([ @@ -67,13 +66,13 @@ ]). -define(TIMEOUT, 10 * 1000). --include("machi.hrl"). %% included for #ns_info record -include("machi_projection.hrl"). %% included for pv1_epoch type -record(state, {fluname :: atom(), tid :: ets:tid(), datadir :: string(), - epoch :: pv1_epoch() + epoch :: pv1_epoch(), + csum_table :: machi_csum_table:table() }). 
%% public API @@ -91,28 +90,28 @@ start_link(FluName, DataDir) when is_atom(FluName) andalso is_list(DataDir) -> -spec find_or_make_filename_from_prefix( FluName :: atom(), EpochId :: pv1_epoch(), Prefix :: {prefix, string()}, - machi_dt:ns_info()) -> + machi_dt:coc_nl()) -> {file, Filename :: string()} | {error, Reason :: term() } | timeout. % @doc Find the latest available or make a filename from a prefix. A prefix % should be in the form of a tagged tuple `{prefix, P}'. Returns a tagged % tuple in the form of `{file, F}' or an `{error, Reason}' find_or_make_filename_from_prefix(FluName, EpochId, {prefix, Prefix}, - #ns_info{}=NSInfo) + {coc, _CoC_Ns, _CoC_Loc}=CoC_NL) when is_atom(FluName) -> N = make_filename_mgr_name(FluName), - gen_server:call(N, {find_filename, FluName, EpochId, NSInfo, Prefix}, ?TIMEOUT); + gen_server:call(N, {find_filename, EpochId, CoC_NL, Prefix}, ?TIMEOUT); find_or_make_filename_from_prefix(_FluName, _EpochId, Other, Other2) -> - lager:error("~p is not a valid prefix/locator ~p", [Other, Other2]), + lager:error("~p is not a valid prefix/CoC ~p", [Other, Other2]), error(badarg). --spec increment_prefix_sequence( FluName :: atom(), NSInfo :: machi_dt:ns_info(), Prefix :: {prefix, string()} ) -> +-spec increment_prefix_sequence( FluName :: atom(), CoC_NL :: machi_dt:coc_nl(), Prefix :: {prefix, string()} ) -> ok | {error, Reason :: term() } | timeout. % @doc Increment the sequence counter for a given prefix. Prefix should % be in the form of `{prefix, P}'. 
-increment_prefix_sequence(FluName, #ns_info{}=NSInfo, {prefix, Prefix}) when is_atom(FluName) -> - gen_server:call(make_filename_mgr_name(FluName), {increment_sequence, NSInfo, Prefix}, ?TIMEOUT); -increment_prefix_sequence(_FluName, _NSInfo, Other) -> +increment_prefix_sequence(FluName, {coc,_CoC_Namespace,_CoC_Locator}=CoC_NL, {prefix, Prefix}) when is_atom(FluName) -> + gen_server:call(make_filename_mgr_name(FluName), {increment_sequence, CoC_NL, Prefix}, ?TIMEOUT); +increment_prefix_sequence(_FluName, _CoC_NL, Other) -> lager:error("~p is not a valid prefix.", [Other]), error(badarg). @@ -127,13 +126,25 @@ list_files_by_prefix(_FluName, Other) -> lager:error("~p is not a valid prefix.", [Other]), error(badarg). +get_csum_table(FluName) when is_atom(FluName) -> + gen_server:call(make_filename_mgr_name(FluName), get_csum_table, ?TIMEOUT). + %% gen_server API init([FluName, DataDir]) -> Tid = ets:new(make_filename_mgr_name(FluName), [named_table, {read_concurrency, true}]), + + %% metadata includes checksums, offsets and filenames + CsumTableDir = filename:join(DataDir, "metadata"), + {ok, CsumTable} = machi_csum_table:open(CsumTableDir, []), + %% TODO make sure all files non-existent, if any remaining files + %% here, just delete it. They're in the list *because* they're all + %% trimmed. + {ok, #state{fluname = FluName, epoch = ?DUMMY_PV1_EPOCH, datadir = DataDir, - tid = Tid}}. + tid = Tid, + csum_table = CsumTable}}. handle_cast(Req, State) -> lager:warning("Got unknown cast ~p", [Req]), @@ -143,22 +154,23 @@ handle_cast(Req, State) -> %% the FLU has already validated that the caller's epoch id and the FLU's epoch id %% are the same. So we *assume* that remains the case here - that is to say, we %% are not wedged. 
-handle_call({find_filename, FluName, EpochId, NSInfo, Prefix}, _From, - S = #state{ datadir = DataDir, epoch = EpochId, tid = Tid }) -> +handle_call({find_filename, EpochId, CoC_NL, Prefix}, _From, S = #state{ datadir = DataDir, + epoch = EpochId, + tid = Tid }) -> %% Our state and the caller's epoch ids are the same. Business as usual. - File = handle_find_file(FluName, Tid, NSInfo, Prefix, DataDir), + File = handle_find_file(Tid, CoC_NL, Prefix, DataDir), {reply, {file, File}, S}; -handle_call({find_filename, _FluName, EpochId, NSInfo, Prefix}, _From, S = #state{ datadir = DataDir, tid = Tid }) -> +handle_call({find_filename, EpochId, CoC_NL, Prefix}, _From, S = #state{ datadir = DataDir, tid = Tid }) -> %% If the epoch id in our state and the caller's epoch id were the same, it would've %% matched the above clause. Since we're here, we know that they are different. %% If epoch ids between our state and the caller's are different, we must increment the %% sequence number, generate a filename and then cache it. 
- File = increment_and_cache_filename(Tid, DataDir, NSInfo, Prefix), + File = increment_and_cache_filename(Tid, DataDir, CoC_NL, Prefix), {reply, {file, File}, S#state{epoch = EpochId}}; -handle_call({increment_sequence, #ns_info{name=NS, locator=NSLocator}, Prefix}, _From, S = #state{ datadir = DataDir }) -> - ok = machi_util:increment_max_filenum(DataDir, NS, NSLocator, Prefix), +handle_call({increment_sequence, {coc,CoC_Namespace,CoC_Locator}=_CoC_NL, Prefix}, _From, S = #state{ datadir = DataDir }) -> + ok = machi_util:increment_max_filenum(DataDir, CoC_Namespace,CoC_Locator, Prefix), {reply, ok, S}; handle_call({list_files, Prefix}, From, S = #state{ datadir = DataDir }) -> spawn(fun() -> @@ -167,6 +179,9 @@ handle_call({list_files, Prefix}, From, S = #state{ datadir = DataDir }) -> end), {noreply, S}; +handle_call(get_csum_table, _From, S = #state{ csum_table = CsumTable }) -> + {reply, {ok, CsumTable}, S}; + handle_call(Req, From, State) -> lager:warning("Got unknown call ~p from ~p", [Req, From]), {reply, hoge, State}. @@ -175,8 +190,9 @@ handle_info(Info, State) -> lager:warning("Got unknown info ~p", [Info]), {noreply, State}. -terminate(Reason, _State) -> +terminate(Reason, _State = #state{ csum_table = CsumTable} ) -> lager:info("Shutting down because ~p", [Reason]), + ok = machi_csum_table:close(CsumTable), ok. code_change(_OldVsn, State, _Extra) -> @@ -191,6 +207,12 @@ generate_uuid_v4_str() -> io_lib:format("~8.16.0b-~4.16.0b-4~3.16.0b-~4.16.0b-~12.16.0b", [A, B, C band 16#0fff, D band 16#3fff bor 16#8000, E]). +find_file(DataDir, {coc,CoC_Namespace,CoC_Locator}=_CoC_NL, Prefix, N) -> + {_Filename, Path} = machi_util:make_data_filename(DataDir, + CoC_Namespace,CoC_Locator, + Prefix, "*", N), + filelib:wildcard(Path). + list_files(DataDir, Prefix) -> {F_bin, Path} = machi_util:make_data_filename(DataDir, "*^" ++ Prefix ++ "^*"), filelib:wildcard(binary_to_list(F_bin), filename:dirname(Path)). 
@@ -198,31 +220,50 @@ list_files(DataDir, Prefix) -> make_filename_mgr_name(FluName) when is_atom(FluName) -> list_to_atom(atom_to_list(FluName) ++ "_filename_mgr"). -handle_find_file(_FluName, Tid, #ns_info{name=NS, locator=NSLocator}, Prefix, DataDir) -> - case ets:lookup(Tid, {NS, NSLocator, Prefix}) of +handle_find_file(Tid, {coc,CoC_Namespace,CoC_Locator}=CoC_NL, Prefix, DataDir) -> + N = machi_util:read_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix), + {File, Cleanup} = case find_file(DataDir, CoC_NL, Prefix, N) of + [] -> + {find_or_make_filename(Tid, DataDir, CoC_Namespace, CoC_Locator, Prefix, N), false}; + [H] -> {H, true}; + [Fn | _ ] = L -> + lager:debug( + "Searching for a matching file to prefix ~p and sequence number ~p gave multiples: ~p", + [Prefix, N, L]), + {Fn, true} + end, + maybe_cleanup(Tid, {CoC_Namespace, CoC_Locator, Prefix, N}, Cleanup), + filename:basename(File). + +find_or_make_filename(Tid, DataDir, CoC_Namespace, CoC_Locator, Prefix, N) -> + case ets:lookup(Tid, {CoC_Namespace, CoC_Locator, Prefix, N}) of [] -> - N = machi_util:read_max_filenum(DataDir, NS, NSLocator, Prefix), - F = generate_filename(DataDir, NS, NSLocator, Prefix, N), - true = ets:insert(Tid, {{NS, NSLocator, Prefix}, F}), + F = generate_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix, N), + true = ets:insert_new(Tid, {{CoC_Namespace, CoC_Locator, Prefix, N}, F}), F; [{_Key, File}] -> File end. -generate_filename(DataDir, NS, NSLocator, Prefix, N) -> - {F, _Q} = machi_util:make_data_filename( +generate_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix, N) -> + {F, _} = machi_util:make_data_filename( DataDir, - NS, NSLocator, Prefix, + CoC_Namespace, CoC_Locator, Prefix, generate_uuid_v4_str(), N), binary_to_list(F). 
-increment_and_cache_filename(Tid, DataDir, #ns_info{name=NS,locator=NSLocator}, Prefix) -> - ok = machi_util:increment_max_filenum(DataDir, NS, NSLocator, Prefix), - N = machi_util:read_max_filenum(DataDir, NS, NSLocator, Prefix), - F = generate_filename(DataDir, NS, NSLocator, Prefix, N), - true = ets:insert(Tid, {{NS, NSLocator, Prefix}, F}), - F. +maybe_cleanup(_Tid, _Key, false) -> + ok; +maybe_cleanup(Tid, Key, true) -> + true = ets:delete(Tid, Key). + +increment_and_cache_filename(Tid, DataDir, {coc,CoC_Namespace,CoC_Locator}, Prefix) -> + ok = machi_util:increment_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix), + N = machi_util:read_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix), + F = generate_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix, N), + true = ets:insert_new(Tid, {{CoC_Namespace, CoC_Locator, Prefix, N}, F}), + filename:basename(F). diff --git a/src/machi_flu_metadata_mgr.erl b/src/machi_flu_metadata_mgr.erl index b9c26c9..3b7bc4a 100644 --- a/src/machi_flu_metadata_mgr.erl +++ b/src/machi_flu_metadata_mgr.erl @@ -34,19 +34,15 @@ -module(machi_flu_metadata_mgr). -behaviour(gen_server). --include("machi.hrl"). -define(MAX_MGRS, 10). %% number of managers to start by default. -define(HASH(X), erlang:phash2(X)). %% hash algorithm to use -define(TIMEOUT, 10 * 1000). %% 10 second timeout --define(KNOWN_FILES_LIST_PREFIX, "known_files_"). - -record(state, {fluname :: atom(), datadir :: string(), tid :: ets:tid(), - cnt :: non_neg_integer(), - trimmed_files :: machi_plist:plist() + cnt :: non_neg_integer() }). %% This record goes in the ets table where filename is the key @@ -63,8 +59,7 @@ lookup_proxy_pid/2, start_proxy_pid/2, stop_proxy_pid/2, - build_metadata_mgr_name/2, - trim_file/2 + build_metadata_mgr_name/2 ]). 
%% gen_server callbacks @@ -102,24 +97,15 @@ start_proxy_pid(FluName, {file, Filename}) -> stop_proxy_pid(FluName, {file, Filename}) -> gen_server:call(get_manager_atom(FluName, Filename), {stop_proxy_pid, Filename}, ?TIMEOUT). -trim_file(FluName, {file, Filename}) -> - gen_server:call(get_manager_atom(FluName, Filename), {trim_file, Filename}, ?TIMEOUT). - %% gen_server callbacks init([FluName, Name, DataDir, Num]) -> %% important: we'll need another persistent storage to %% remember deleted (trimmed) file, to prevent resurrection after %% flu restart and append. - FileListFileName = - filename:join([DataDir, ?KNOWN_FILES_LIST_PREFIX ++ atom_to_list(FluName)]), - {ok, PList} = machi_plist:open(FileListFileName, []), - %% TODO make sure all files non-existent, if any remaining files - %% here, just delete it. They're in the list *because* they're all - %% trimmed. Tid = ets:new(Name, [{keypos, 2}, {read_concurrency, true}, {write_concurrency, true}]), - {ok, #state{fluname = FluName, datadir = DataDir, tid = Tid, cnt = Num, - trimmed_files=PList}}. + + {ok, #state{fluname = FluName, datadir = DataDir, tid = Tid, cnt = Num}}. 
handle_cast(Req, State) -> lager:warning("Got unknown cast ~p", [Req]), @@ -133,23 +119,17 @@ handle_call({proxy_pid, Filename}, _From, State = #state{ tid = Tid }) -> {reply, Reply, State}; handle_call({start_proxy_pid, Filename}, _From, - State = #state{ fluname = N, tid = Tid, datadir = D, - trimmed_files=TrimmedFiles}) -> - case machi_plist:find(TrimmedFiles, Filename) of - false -> - NewR = case lookup_md(Tid, Filename) of - not_found -> - start_file_proxy(N, D, Filename); - #md{ proxy_pid = undefined } = R0 -> - start_file_proxy(N, D, R0); - #md{ proxy_pid = _Pid } = R1 -> - R1 - end, - update_ets(Tid, NewR), - {reply, {ok, NewR#md.proxy_pid}, State}; - true -> - {reply, {error, trimmed}, State} - end; + State = #state{ fluname = N, tid = Tid, datadir = D}) -> + NewR = case lookup_md(Tid, Filename) of + not_found -> + start_file_proxy(N, D, Filename); + #md{ proxy_pid = undefined } = R0 -> + start_file_proxy(N, D, R0); + #md{ proxy_pid = _Pid } = R1 -> + R1 + end, + update_ets(Tid, NewR), + {reply, {ok, NewR#md.proxy_pid}, State}; handle_call({stop_proxy_pid, Filename}, _From, State = #state{ tid = Tid }) -> case lookup_md(Tid, Filename) of @@ -164,15 +144,6 @@ handle_call({stop_proxy_pid, Filename}, _From, State = #state{ tid = Tid }) -> end, {reply, ok, State}; -handle_call({trim_file, Filename}, _, - S = #state{trimmed_files = TrimmedFiles }) -> - case machi_plist:add(TrimmedFiles, Filename) of - {ok, TrimmedFiles2} -> - {reply, ok, S#state{trimmed_files=TrimmedFiles2}}; - Error -> - {reply, Error, S} - end; - handle_call(Req, From, State) -> lager:warning("Got unknown call ~p from ~p", [Req, From]), {reply, hoge, State}. 
@@ -186,16 +157,17 @@ handle_info({'DOWN', Mref, process, Pid, file_rollover}, State = #state{ fluname tid = Tid }) -> lager:info("file proxy ~p shutdown because of file rollover", [Pid]), R = get_md_record_by_mref(Tid, Mref), - {Prefix, NS, NSLocator, _, _} = + {Prefix, CoC_Namespace, CoC_Locator, _, _} = machi_util:parse_filename(R#md.filename), + %% CoC_Namespace = list_to_binary(CoC_Namespace_str), + %% CoC_Locator = list_to_integer(CoC_Locator_str), %% We only increment the counter here. The filename will be generated on the %% next append request to that prefix and since the filename will have a new %% sequence number it probably will be associated with a different metadata %% manager. That's why we don't want to generate a new file name immediately %% and use it to start a new file proxy. - NSInfo = #ns_info{name=NS, locator=NSLocator}, - ok = machi_flu_filename_mgr:increment_prefix_sequence(FluName, NSInfo, {prefix, Prefix}), + ok = machi_flu_filename_mgr:increment_prefix_sequence(FluName, {coc, CoC_Namespace, CoC_Locator}, {prefix, Prefix}), %% purge our ets table of this entry completely since it is likely the %% new filename (whenever it comes) will be in a different manager than @@ -219,9 +191,8 @@ handle_info(Info, State) -> lager:warning("Got unknown info ~p", [Info]), {noreply, State}. -terminate(Reason, _State = #state{trimmed_files=TrimmedFiles}) -> +terminate(Reason, _State) -> lager:info("Shutting down because ~p", [Reason]), - machi_plist:close(TrimmedFiles), ok. code_change(_OldVsn, State, _Extra) -> @@ -253,7 +224,7 @@ lookup_md(Tid, Data) -> [R] -> R end. 
-start_file_proxy(FluName, D, R = #md{filename = F} ) -> +start_file_proxy(FluName, D, R = #md{filename = F}) -> {ok, Pid} = machi_file_proxy_sup:start_proxy(FluName, D, F), Mref = monitor(process, Pid), R#md{ proxy_pid = Pid, mref = Mref }; diff --git a/src/machi_lifecycle_mgr.erl b/src/machi_lifecycle_mgr.erl index 80ea8b4..385c607 100644 --- a/src/machi_lifecycle_mgr.erl +++ b/src/machi_lifecycle_mgr.erl @@ -950,7 +950,7 @@ make_pending_config(Term) -> %% The largest numbered file is assumed to be all of the AST changes that we %% want to apply in a single batch. The AST tuples of all files with smaller %% numbers will be concatenated together to create the prior history of -%% the cluster. We assume that all transitions inside these earlier +%% cluster-of-clusters. We assume that all transitions inside these earlier %% files were actually safe & sane, therefore any sanity problem can only %% be caused by the contents of the largest numbered file. diff --git a/src/machi_pb_high_client.erl b/src/machi_pb_high_client.erl index f67479e..5b2ab22 100644 --- a/src/machi_pb_high_client.erl +++ b/src/machi_pb_high_client.erl @@ -25,10 +25,6 @@ %% to a single socket connection, and there is no code to deal with %% multiple connections/load balancing/error handling to several/all %% Machi cluster servers. -%% -%% Please see {@link machi_flu1_client} the "Client API implemntation notes" -%% section for how this module relates to the rest of the client API -%% implementation. -module(machi_pb_high_client). @@ -42,7 +38,7 @@ connected_p/1, echo/2, echo/3, auth/3, auth/4, - append_chunk/6, append_chunk/7, + append_chunk/7, append_chunk/8, write_chunk/5, write_chunk/6, read_chunk/5, read_chunk/6, trim_chunk/4, trim_chunk/5, @@ -100,33 +96,30 @@ auth(PidSpec, User, Pass) -> auth(PidSpec, User, Pass, Timeout) -> send_sync(PidSpec, {auth, User, Pass}, Timeout). 
--spec append_chunk(pid(), - NS::machi_dt:namespace(), Prefix::machi_dt:file_prefix(), - Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum(), - Opts::machi_dt:append_opts()) -> +-spec append_chunk(pid(), CoC_namespace::binary(), CoC_locator::integer(), Prefix::binary(), Chunk::binary(), + CSum::binary(), ChunkExtra::non_neg_integer()) -> {ok, Filename::string(), Offset::machi_dt:file_offset()} | {error, machi_client_error_reason()}. -append_chunk(PidSpec, NS, Prefix, Chunk, CSum, Opts) -> - append_chunk(PidSpec, NS, Prefix, Chunk, CSum, Opts, ?DEFAULT_TIMEOUT). +append_chunk(PidSpec, CoC_namespace, CoC_locator, Prefix, Chunk, CSum, ChunkExtra) -> + append_chunk(PidSpec, CoC_namespace, CoC_locator, Prefix, Chunk, CSum, ChunkExtra, ?DEFAULT_TIMEOUT). --spec append_chunk(pid(), - NS::machi_dt:namespace(), Prefix::machi_dt:file_prefix(), - Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum(), - Opts::machi_dt:append_opts(), +-spec append_chunk(pid(), CoC_namespace::binary(), CoC_locator::integer(), Prefix::binary(), + Chunk::binary(), CSum::binary(), + ChunkExtra::non_neg_integer(), Timeout::non_neg_integer()) -> {ok, Filename::string(), Offset::machi_dt:file_offset()} | {error, machi_client_error_reason()}. -append_chunk(PidSpec, NS, Prefix, Chunk, CSum, Opts, Timeout) -> - send_sync(PidSpec, {append_chunk, NS, Prefix, Chunk, CSum, Opts}, Timeout). +append_chunk(PidSpec, CoC_namespace, CoC_locator, Prefix, Chunk, CSum, ChunkExtra, Timeout) -> + send_sync(PidSpec, {append_chunk, CoC_namespace, CoC_locator, Prefix, Chunk, CSum, ChunkExtra}, Timeout). -spec write_chunk(pid(), File::string(), machi_dt:file_offset(), - Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum()) -> + Chunk::binary(), CSum::binary()) -> ok | {error, machi_client_error_reason()}. write_chunk(PidSpec, File, Offset, Chunk, CSum) -> write_chunk(PidSpec, File, Offset, Chunk, CSum, ?DEFAULT_TIMEOUT). 
-spec write_chunk(pid(), File::string(), machi_dt:file_offset(), - Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum(), Timeout::non_neg_integer()) -> + Chunk::binary(), CSum::binary(), Timeout::non_neg_integer()) -> ok | {error, machi_client_error_reason()}. write_chunk(PidSpec, File, Offset, Chunk, CSum, Timeout) -> send_sync(PidSpec, {write_chunk, File, Offset, Chunk, CSum}, Timeout). @@ -135,22 +128,21 @@ write_chunk(PidSpec, File, Offset, Chunk, CSum, Timeout) -> %% {Chunks, TrimmedChunks}}' for live file while it returns `{error, %% trimmed}' if all bytes of the file was trimmed. -spec read_chunk(pid(), File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), - machi_dt:read_opts_x()) -> + [{flag_no_checksum | flag_no_chunk | needs_trimmed, boolean()}]) -> {ok, {Chunks::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), binary()}], Trimmed::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size()}]}} | {error, machi_client_error_reason()}. -read_chunk(PidSpec, File, Offset, Size, Opts) -> - read_chunk(PidSpec, File, Offset, Size, Opts, ?DEFAULT_TIMEOUT). +read_chunk(PidSpec, File, Offset, Size, Options) -> + read_chunk(PidSpec, File, Offset, Size, Options, ?DEFAULT_TIMEOUT). -spec read_chunk(pid(), File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), - machi_dt:read_opts_x(), + [{flag_no_checksum | flag_no_chunk | needs_trimmed, boolean()}], Timeout::non_neg_integer()) -> {ok, {Chunks::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), binary()}], Trimmed::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size()}]}} | {error, machi_client_error_reason()}. -read_chunk(PidSpec, File, Offset, Size, Opts0, Timeout) -> - Opts = machi_util:read_opts_default(Opts0), - send_sync(PidSpec, {read_chunk, File, Offset, Size, Opts}, Timeout). +read_chunk(PidSpec, File, Offset, Size, Options, Timeout) -> + send_sync(PidSpec, {read_chunk, File, Offset, Size, Options}, Timeout). 
%% @doc Trims arbitrary binary range of any file. If a specified range %% has any byte trimmed, it fails and returns `{error, trimmed}'. @@ -289,19 +281,18 @@ do_send_sync2({auth, User, Pass}, #state{sock=Sock}=S) -> Res = {bummer, {X, Y, erlang:get_stacktrace()}}, {Res, S} end; -do_send_sync2({append_chunk, NS, Prefix, Chunk, CSum, Opts}, +do_send_sync2({append_chunk, CoC_Namespace, CoC_Locator, + Prefix, Chunk, CSum, ChunkExtra}, #state{sock=Sock, sock_id=Index, count=Count}=S) -> try ReqID = <>, CSumT = convert_csum_req(CSum, Chunk), - {ChunkExtra, Pref, FailPref} = machi_pb_translate:conv_from_append_opts(Opts), - Req = #mpb_appendchunkreq{namespace=NS, + Req = #mpb_appendchunkreq{coc_namespace=CoC_Namespace, + coc_locator=CoC_Locator, prefix=Prefix, chunk=Chunk, csum=CSumT, - chunk_extra=ChunkExtra, - preferred_file_name=Pref, - flag_fail_preferred=FailPref}, + chunk_extra=ChunkExtra}, R1a = #mpb_request{req_id=ReqID, do_not_alter=1, append_chunk=Req}, Bin1a = machi_pb:encode_mpb_request(R1a), @@ -346,13 +337,13 @@ do_send_sync2({write_chunk, File, Offset, Chunk, CSum}, Res = {bummer, {X, Y, erlang:get_stacktrace()}}, {Res, S#state{count=Count+1}} end; -do_send_sync2({read_chunk, File, Offset, Size, Opts}, +do_send_sync2({read_chunk, File, Offset, Size, Options}, #state{sock=Sock, sock_id=Index, count=Count}=S) -> try ReqID = <>, - #read_opts{no_checksum=FlagNoChecksum, - no_chunk=FlagNoChunk, - needs_trimmed=NeedsTrimmed} = Opts, + FlagNoChecksum = proplists:get_value(no_checksum, Options, false), + FlagNoChunk = proplists:get_value(no_chunk, Options, false), + NeedsTrimmed = proplists:get_value(needs_trimmed, Options, false), Req = #mpb_readchunkreq{chunk_pos=#mpb_chunkpos{file_name=File, offset=Offset, chunk_size=Size}, @@ -445,15 +436,9 @@ do_send_sync2({list_files}, {Res, S#state{count=Count+1}} end. -%% We only convert the checksum types that make sense here: -%% none or client_sha. 
None of the other types should be sent -%% to us via the PB high protocol. - convert_csum_req(none, Chunk) -> #mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA', csum=machi_util:checksum_chunk(Chunk)}; -convert_csum_req(<<>>, Chunk) -> - convert_csum_req(none, Chunk); convert_csum_req({client_sha, CSumBin}, _Chunk) -> #mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA', csum=CSumBin}. @@ -501,12 +486,12 @@ convert_read_chunk_resp(#mpb_readchunkresp{status='OK', chunks=PB_Chunks, trimme csum=#mpb_chunkcsum{type=T, csum=Ck}}) -> %% TODO: cleanup export Csum = <<(machi_pb_translate:conv_to_csum_tag(T)):8, Ck/binary>>, - {list_to_binary(File), Offset, Chunk, Csum} + {File, Offset, Chunk, Csum} end, PB_Chunks), Trimmed = lists:map(fun(#mpb_chunkpos{file_name=File, offset=Offset, chunk_size=Size}) -> - {list_to_binary(File), Offset, Size} + {File, Offset, Size} end, PB_Trimmed), {ok, {Chunks, Trimmed}}; convert_read_chunk_resp(#mpb_readchunkresp{status=Status}) -> diff --git a/src/machi_pb_translate.erl b/src/machi_pb_translate.erl index 1fd5f8b..cc8f728 100644 --- a/src/machi_pb_translate.erl +++ b/src/machi_pb_translate.erl @@ -34,9 +34,7 @@ -export([from_pb_request/1, from_pb_response/1, to_pb_request/2, - to_pb_response/3, - conv_from_append_opts/1, - conv_to_append_opts/1 + to_pb_response/3 ]). 
%% TODO: fixme cleanup @@ -45,104 +43,95 @@ from_pb_request(#mpb_ll_request{ req_id=ReqID, echo=#mpb_echoreq{message=Msg}}) -> - {ReqID, {low_skip_wedge, {low_echo, Msg}}}; + {ReqID, {low_echo, undefined, Msg}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, auth=#mpb_authreq{user=User, password=Pass}}) -> - {ReqID, {low_skip_wedge, {low_auth, User, Pass}}}; + {ReqID, {low_auth, undefined, User, Pass}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, - append_chunk=IR=#mpb_ll_appendchunkreq{ - namespace_version=NSVersion, - namespace=NS_str, - locator=NSLocator, + append_chunk=#mpb_ll_appendchunkreq{ epoch_id=PB_EpochID, + coc_namespace=CoC_Namespace, + coc_locator=CoC_Locator, prefix=Prefix, chunk=Chunk, - csum=#mpb_chunkcsum{type=CSum_type, csum=CSum}}}) -> - NS = list_to_binary(NS_str), + csum=#mpb_chunkcsum{type=CSum_type, csum=CSum}, + chunk_extra=ChunkExtra}}) -> EpochID = conv_to_epoch_id(PB_EpochID), CSum_tag = conv_to_csum_tag(CSum_type), - Opts = conv_to_append_opts(IR), - %% NOTE: The tuple position of NSLocator is a bit odd, because EpochID - %% _must_ be in the 4th position (as NSV & NS must be in 2nd & 3rd). 
- {ReqID, {low_append_chunk, NSVersion, NS, EpochID, NSLocator, - Prefix, Chunk, CSum_tag, CSum, Opts}}; + {ReqID, {low_append_chunk, EpochID, CoC_Namespace, CoC_Locator, + Prefix, Chunk, CSum_tag, CSum, + ChunkExtra}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, write_chunk=#mpb_ll_writechunkreq{ - namespace_version=NSVersion, - namespace=NS_str, epoch_id=PB_EpochID, chunk=#mpb_chunk{file_name=File, offset=Offset, chunk=Chunk, csum=#mpb_chunkcsum{type=CSum_type, csum=CSum}}}}) -> - NS = list_to_binary(NS_str), EpochID = conv_to_epoch_id(PB_EpochID), CSum_tag = conv_to_csum_tag(CSum_type), - {ReqID, {low_write_chunk, NSVersion, NS, EpochID, File, Offset, Chunk, CSum_tag, CSum}}; + {ReqID, {low_write_chunk, EpochID, File, Offset, Chunk, CSum_tag, CSum}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, read_chunk=#mpb_ll_readchunkreq{ - namespace_version=NSVersion, - namespace=NS_str, epoch_id=PB_EpochID, chunk_pos=ChunkPos, flag_no_checksum=PB_GetNoChecksum, flag_no_chunk=PB_GetNoChunk, flag_needs_trimmed=PB_NeedsTrimmed}}) -> - NS = list_to_binary(NS_str), EpochID = conv_to_epoch_id(PB_EpochID), - Opts = #read_opts{no_checksum=PB_GetNoChecksum, - no_chunk=PB_GetNoChunk, - needs_trimmed=PB_NeedsTrimmed}, + Opts = [{no_checksum, conv_to_boolean(PB_GetNoChecksum)}, + {no_chunk, conv_to_boolean(PB_GetNoChunk)}, + {needs_trimmed, conv_to_boolean(PB_NeedsTrimmed)}], #mpb_chunkpos{file_name=File, offset=Offset, chunk_size=Size} = ChunkPos, - {ReqID, {low_read_chunk, NSVersion, NS, EpochID, File, Offset, Size, Opts}}; + {ReqID, {low_read_chunk, EpochID, File, Offset, Size, Opts}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, trim_chunk=#mpb_ll_trimchunkreq{ - namespace_version=NSVersion, - namespace=NS_str, epoch_id=PB_EpochID, file=File, offset=Offset, size=Size, - trigger_gc=TriggerGC}}) -> - NS = list_to_binary(NS_str), + trigger_gc=PB_TriggerGC}}) -> EpochID = conv_to_epoch_id(PB_EpochID), - {ReqID, {low_trim_chunk, NSVersion, NS, EpochID, File, Offset, Size, 
TriggerGC}}; + TriggerGC = conv_to_boolean(PB_TriggerGC), + {ReqID, {low_trim_chunk, EpochID, File, Offset, Size, TriggerGC}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, checksum_list=#mpb_ll_checksumlistreq{ + epoch_id=PB_EpochID, file=File}}) -> - {ReqID, {low_skip_wedge, {low_checksum_list, File}}}; + EpochID = conv_to_epoch_id(PB_EpochID), + {ReqID, {low_checksum_list, EpochID, File}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, list_files=#mpb_ll_listfilesreq{ epoch_id=PB_EpochID}}) -> EpochID = conv_to_epoch_id(PB_EpochID), - {ReqID, {low_skip_wedge, {low_list_files, EpochID}}}; + {ReqID, {low_list_files, EpochID}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, wedge_status=#mpb_ll_wedgestatusreq{}}) -> - {ReqID, {low_skip_wedge, {low_wedge_status}}}; + {ReqID, {low_wedge_status, undefined}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, delete_migration=#mpb_ll_deletemigrationreq{ epoch_id=PB_EpochID, file=File}}) -> EpochID = conv_to_epoch_id(PB_EpochID), - {ReqID, {low_skip_wedge, {low_delete_migration, EpochID, File}}}; + {ReqID, {low_delete_migration, EpochID, File}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, trunc_hack=#mpb_ll_trunchackreq{ epoch_id=PB_EpochID, file=File}}) -> EpochID = conv_to_epoch_id(PB_EpochID), - {ReqID, {low_skip_wedge, {low_trunc_hack, EpochID, File}}}; + {ReqID, {low_trunc_hack, EpochID, File}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, proj_gl=#mpb_ll_getlatestepochidreq{type=ProjType}}) -> @@ -183,22 +172,23 @@ from_pb_request(#mpb_request{req_id=ReqID, {ReqID, {high_auth, User, Pass}}; from_pb_request(#mpb_request{req_id=ReqID, append_chunk=IR=#mpb_appendchunkreq{}}) -> - #mpb_appendchunkreq{namespace=NS_str, + #mpb_appendchunkreq{coc_namespace=CoC_namespace, + coc_locator=CoC_locator, prefix=Prefix, chunk=Chunk, - csum=CSum} = IR, - NS = list_to_binary(NS_str), + csum=CSum, + chunk_extra=ChunkExtra} = IR, TaggedCSum = make_tagged_csum(CSum, Chunk), - Opts = conv_to_append_opts(IR), - {ReqID, 
{high_append_chunk, NS, Prefix, Chunk, TaggedCSum, Opts}}; + {ReqID, {high_append_chunk, CoC_namespace, CoC_locator, Prefix, Chunk, + TaggedCSum, ChunkExtra}}; from_pb_request(#mpb_request{req_id=ReqID, write_chunk=IR=#mpb_writechunkreq{}}) -> #mpb_writechunkreq{chunk=#mpb_chunk{file_name=File, offset=Offset, chunk=Chunk, - csum=CSumRec}} = IR, - CSum = make_tagged_csum(CSumRec, Chunk), - {ReqID, {high_write_chunk, File, Offset, Chunk, CSum}}; + csum=CSum}} = IR, + TaggedCSum = make_tagged_csum(CSum, Chunk), + {ReqID, {high_write_chunk, File, Offset, Chunk, TaggedCSum}}; from_pb_request(#mpb_request{req_id=ReqID, read_chunk=IR=#mpb_readchunkreq{}}) -> #mpb_readchunkreq{chunk_pos=#mpb_chunkpos{file_name=File, @@ -207,10 +197,11 @@ from_pb_request(#mpb_request{req_id=ReqID, flag_no_checksum=FlagNoChecksum, flag_no_chunk=FlagNoChunk, flag_needs_trimmed=NeedsTrimmed} = IR, - Opts = #read_opts{no_checksum=FlagNoChecksum, - no_chunk=FlagNoChunk, - needs_trimmed=NeedsTrimmed}, - {ReqID, {high_read_chunk, File, Offset, Size, Opts}}; + %% I want MAPS + Options = [{no_checksum, machi_util:int2bool(FlagNoChecksum)}, + {no_chunk, machi_util:int2bool(FlagNoChunk)}, + {needs_trimmed, machi_util:int2bool(NeedsTrimmed)}], + {ReqID, {high_read_chunk, File, Offset, Size, Options}}; from_pb_request(#mpb_request{req_id=ReqID, trim_chunk=IR=#mpb_trimchunkreq{}}) -> #mpb_trimchunkreq{chunk_pos=#mpb_chunkpos{file_name=File, @@ -274,12 +265,12 @@ from_pb_response(#mpb_ll_response{ chunk=Bytes, csum=#mpb_chunkcsum{type=T,csum=Ck}}) -> Csum = <<(conv_to_csum_tag(T)):8, Ck/binary>>, - {list_to_binary(File), Offset, Bytes, Csum} + {File, Offset, Bytes, Csum} end, PB_Chunks), Trimmed = lists:map(fun(#mpb_chunkpos{file_name=File, offset=Offset, chunk_size=Size}) -> - {list_to_binary(File), Offset, Size} + {File, Offset, Size} end, PB_Trimmed), {ReqID, {ok, {Chunks, Trimmed}}}; _ -> @@ -315,16 +306,12 @@ from_pb_response(#mpb_ll_response{ from_pb_response(#mpb_ll_response{ req_id=ReqID, 
wedge_status=#mpb_ll_wedgestatusresp{ - status=Status, - epoch_id=PB_EpochID, wedged_flag=Wedged_p, - namespace_version=NSVersion, namespace=NS_str}}) -> - GeneralStatus = case machi_pb_high_client:convert_general_status_code(Status) of - ok -> ok; - _Else -> {yukky, _Else} - end, + epoch_id=PB_EpochID, wedged_flag=PB_Wedged}}) -> EpochID = conv_to_epoch_id(PB_EpochID), - NS = list_to_binary(NS_str), - {ReqID, {GeneralStatus, {Wedged_p, EpochID, NSVersion, NS}}}; + Wedged_p = if PB_Wedged == 1 -> true; + PB_Wedged == 0 -> false + end, + {ReqID, {ok, {Wedged_p, EpochID}}}; from_pb_response(#mpb_ll_response{ req_id=ReqID, delete_migration=#mpb_ll_deletemigrationresp{ @@ -390,100 +377,90 @@ from_pb_response(#mpb_ll_response{ 'OK' -> {ReqID, {ok, Epochs}}; _ -> - {ReqID, machi_pb_high_client:convert_general_status_code(Status)} + {ReqID, machi_pb_high_client:convert_general_status_code(Status)} end. %% No response for proj_kp/kick_projection_reaction %% TODO: move the #mbp_* record making code from %% machi_pb_high_client:do_send_sync() clauses into to_pb_request(). -to_pb_request(ReqID, {low_skip_wedge, {low_echo, Msg}}) -> +to_pb_request(ReqID, {low_echo, _BogusEpochID, Msg}) -> #mpb_ll_request{ req_id=ReqID, do_not_alter=2, echo=#mpb_echoreq{message=Msg}}; -to_pb_request(ReqID, {low_skip_wedge, {low_auth, User, Pass}}) -> +to_pb_request(ReqID, {low_auth, _BogusEpochID, User, Pass}) -> #mpb_ll_request{req_id=ReqID, do_not_alter=2, auth=#mpb_authreq{user=User, password=Pass}}; -%% NOTE: The tuple position of NSLocator is a bit odd, because EpochID -%% _must_ be in the 4th position (as NSV & NS must be in 2nd & 3rd).
-to_pb_request(ReqID, {low_append_chunk, NSVersion, NS, EpochID, NSLocator, - Prefix, Chunk, CSum_tag, CSum, Opts}) -> +to_pb_request(ReqID, {low_append_chunk, EpochID, CoC_Namespace, CoC_Locator, + Prefix, Chunk, CSum_tag, CSum, ChunkExtra}) -> PB_EpochID = conv_from_epoch_id(EpochID), CSum_type = conv_from_csum_tag(CSum_tag), PB_CSum = #mpb_chunkcsum{type=CSum_type, csum=CSum}, - {ChunkExtra, Pref, FailPref} = conv_from_append_opts(Opts), #mpb_ll_request{req_id=ReqID, do_not_alter=2, append_chunk=#mpb_ll_appendchunkreq{ - namespace_version=NSVersion, - namespace=NS, - locator=NSLocator, epoch_id=PB_EpochID, + coc_namespace=CoC_Namespace, + coc_locator=CoC_Locator, prefix=Prefix, chunk=Chunk, csum=PB_CSum, - chunk_extra=ChunkExtra, - preferred_file_name=Pref, - flag_fail_preferred=FailPref}}; -to_pb_request(ReqID, {low_write_chunk, NSVersion, NS, EpochID, File, Offset, Chunk, CSum_tag, CSum}) -> + chunk_extra=ChunkExtra}}; +to_pb_request(ReqID, {low_write_chunk, EpochID, File, Offset, Chunk, CSum_tag, CSum}) -> PB_EpochID = conv_from_epoch_id(EpochID), CSum_type = conv_from_csum_tag(CSum_tag), PB_CSum = #mpb_chunkcsum{type=CSum_type, csum=CSum}, #mpb_ll_request{req_id=ReqID, do_not_alter=2, write_chunk=#mpb_ll_writechunkreq{ - namespace_version=NSVersion, - namespace=NS, epoch_id=PB_EpochID, chunk=#mpb_chunk{file_name=File, offset=Offset, chunk=Chunk, csum=PB_CSum}}}; -to_pb_request(ReqID, {low_read_chunk, NSVersion, NS, EpochID, File, Offset, Size, Opts}) -> +to_pb_request(ReqID, {low_read_chunk, EpochID, File, Offset, Size, Opts}) -> PB_EpochID = conv_from_epoch_id(EpochID), - #read_opts{no_checksum=FNChecksum, - no_chunk=FNChunk, - needs_trimmed=NeedsTrimmed} = Opts, + FNChecksum = proplists:get_value(no_checksum, Opts, false), + FNChunk = proplists:get_value(no_chunk, Opts, false), + NeedsTrimmed = proplists:get_value(needs_trimmed, Opts, false), #mpb_ll_request{ req_id=ReqID, do_not_alter=2, read_chunk=#mpb_ll_readchunkreq{ - namespace_version=NSVersion, - 
namespace=NS, - epoch_id=PB_EpochID, - chunk_pos=#mpb_chunkpos{ + epoch_id=PB_EpochID, + chunk_pos=#mpb_chunkpos{ file_name=File, offset=Offset, chunk_size=Size}, - flag_no_checksum=FNChecksum, - flag_no_chunk=FNChunk, - flag_needs_trimmed=NeedsTrimmed}}; -to_pb_request(ReqID, {low_trim_chunk, NSVersion, NS, EpochID, File, Offset, Size, TriggerGC}) -> + flag_no_checksum=machi_util:bool2int(FNChecksum), + flag_no_chunk=machi_util:bool2int(FNChunk), + flag_needs_trimmed=machi_util:bool2int(NeedsTrimmed)}}; +to_pb_request(ReqID, {low_trim_chunk, EpochID, File, Offset, Size, TriggerGC}) -> PB_EpochID = conv_from_epoch_id(EpochID), #mpb_ll_request{req_id=ReqID, do_not_alter=2, trim_chunk=#mpb_ll_trimchunkreq{ - namespace_version=NSVersion, - namespace=NS, epoch_id=PB_EpochID, file=File, offset=Offset, size=Size, trigger_gc=TriggerGC}}; -to_pb_request(ReqID, {low_skip_wedge, {low_checksum_list, File}}) -> +to_pb_request(ReqID, {low_checksum_list, EpochID, File}) -> + PB_EpochID = conv_from_epoch_id(EpochID), #mpb_ll_request{req_id=ReqID, do_not_alter=2, checksum_list=#mpb_ll_checksumlistreq{ + epoch_id=PB_EpochID, file=File}}; -to_pb_request(ReqID, {low_skip_wedge, {low_list_files, EpochID}}) -> +to_pb_request(ReqID, {low_list_files, EpochID}) -> PB_EpochID = conv_from_epoch_id(EpochID), #mpb_ll_request{req_id=ReqID, do_not_alter=2, list_files=#mpb_ll_listfilesreq{epoch_id=PB_EpochID}}; -to_pb_request(ReqID, {low_skip_wedge, {low_wedge_status}}) -> +to_pb_request(ReqID, {low_wedge_status, _BogusEpochID}) -> #mpb_ll_request{req_id=ReqID, do_not_alter=2, wedge_status=#mpb_ll_wedgestatusreq{}}; -to_pb_request(ReqID, {low_skip_wedge, {low_delete_migration, EpochID, File}}) -> +to_pb_request(ReqID, {low_delete_migration, EpochID, File}) -> PB_EpochID = conv_from_epoch_id(EpochID), #mpb_ll_request{req_id=ReqID, do_not_alter=2, delete_migration=#mpb_ll_deletemigrationreq{ epoch_id=PB_EpochID, file=File}}; -to_pb_request(ReqID, {low_skip_wedge, {low_trunc_hack, EpochID, File}}) 
-> +to_pb_request(ReqID, {low_trunc_hack, EpochID, File}) -> PB_EpochID = conv_from_epoch_id(EpochID), #mpb_ll_request{req_id=ReqID, do_not_alter=2, trunc_hack=#mpb_ll_trunchackreq{ @@ -519,15 +496,15 @@ to_pb_response(_ReqID, _, async_no_response=X) -> X; to_pb_response(ReqID, _, {low_error, ErrCode, ErrMsg}) -> make_ll_error_resp(ReqID, ErrCode, ErrMsg); -to_pb_response(ReqID, {low_skip_wedge, {low_echo, _Msg}}, Resp) -> +to_pb_response(ReqID, {low_echo, _BogusEpochID, _Msg}, Resp) -> #mpb_ll_response{ req_id=ReqID, echo=#mpb_echoresp{message=Resp}}; -to_pb_response(ReqID, {low_skip_wedge, {low_auth, _, _}}, __TODO_Resp) -> +to_pb_response(ReqID, {low_auth, _, _, _}, __TODO_Resp) -> #mpb_ll_response{req_id=ReqID, generic=#mpb_errorresp{code=1, msg="AUTH not implemented"}}; -to_pb_response(ReqID, {low_append_chunk, _NSV, _NS, _EID, _NSL, _Pfx, _Ch, _CST, _CS, _O}, Resp)-> +to_pb_response(ReqID, {low_append_chunk, _EID, _N, _L, _Pfx, _Ch, _CST, _CS, _CE}, Resp)-> case Resp of {ok, {Offset, Size, File}} -> Where = #mpb_chunkpos{offset=Offset, @@ -543,11 +520,11 @@ to_pb_response(ReqID, {low_append_chunk, _NSV, _NS, _EID, _NSL, _Pfx, _Ch, _CST, _Else -> make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) end; -to_pb_response(ReqID, {low_write_chunk, _NSV, _NS, _EID, _Fl, _Off, _Ch, _CST, _CS},Resp)-> +to_pb_response(ReqID, {low_write_chunk, _EID, _Fl, _Off, _Ch, _CST, _CS},Resp)-> Status = conv_from_status(Resp), #mpb_ll_response{req_id=ReqID, write_chunk=#mpb_ll_writechunkresp{status=Status}}; -to_pb_response(ReqID, {low_read_chunk, _NSV, _NS, _EID, _Fl, _Off, _Sz, _Opts}, Resp)-> +to_pb_response(ReqID, {low_read_chunk, _EID, _Fl, _Off, _Sz, _Opts}, Resp)-> case Resp of {ok, {Chunks, Trimmed}} -> PB_Chunks = lists:map(fun({File, Offset, Bytes, Csum}) -> @@ -574,7 +551,7 @@ to_pb_response(ReqID, {low_read_chunk, _NSV, _NS, _EID, _Fl, _Off, _Sz, _Opts}, _Else -> make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) end; -to_pb_response(ReqID, 
{low_trim_chunk, _, _, _, _, _, _, _}, Resp) -> +to_pb_response(ReqID, {low_trim_chunk, _, _, _, _, _}, Resp) -> case Resp of ok -> #mpb_ll_response{req_id=ReqID, @@ -582,11 +559,11 @@ to_pb_response(ReqID, {low_trim_chunk, _, _, _, _, _, _, _}, Resp) -> {error, _}=Error -> Status = conv_from_status(Error), #mpb_ll_response{req_id=ReqID, - trim_chunk=#mpb_ll_trimchunkresp{status=Status}}; + read_chunk=#mpb_ll_trimchunkresp{status=Status}}; _Else -> make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) end; -to_pb_response(ReqID, {low_skip_wedge, {low_checksum_list, _File}}, Resp) -> +to_pb_response(ReqID, {low_checksum_list, _EpochID, _File}, Resp) -> case Resp of {ok, Chunk} -> #mpb_ll_response{req_id=ReqID, @@ -599,7 +576,7 @@ to_pb_response(ReqID, {low_skip_wedge, {low_checksum_list, _File}}, Resp) -> _Else -> make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) end; -to_pb_response(ReqID, {low_skip_wedge, {low_list_files, _EpochID}}, Resp) -> +to_pb_response(ReqID, {low_list_files, _EpochID}, Resp) -> case Resp of {ok, FileInfo} -> PB_Files = [#mpb_fileinfo{file_size=Size, file_name=Name} || @@ -614,28 +591,26 @@ to_pb_response(ReqID, {low_skip_wedge, {low_list_files, _EpochID}}, Resp) -> _Else -> make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) end; -to_pb_response(ReqID, {low_skip_wedge, {low_wedge_status}}, Resp) -> +to_pb_response(ReqID, {low_wedge_status, _BogusEpochID}, Resp) -> case Resp of {error, _}=Error -> Status = conv_from_status(Error), #mpb_ll_response{req_id=ReqID, wedge_status=#mpb_ll_wedgestatusresp{status=Status}}; - {Wedged_p, EpochID, NSVersion, NS} -> + {Wedged_p, EpochID} -> + PB_Wedged = conv_from_boolean(Wedged_p), PB_EpochID = conv_from_epoch_id(EpochID), #mpb_ll_response{req_id=ReqID, wedge_status=#mpb_ll_wedgestatusresp{ status='OK', epoch_id=PB_EpochID, - wedged_flag=Wedged_p, - namespace_version=NSVersion, - namespace=NS - }} + wedged_flag=PB_Wedged}} end; -to_pb_response(ReqID, {low_skip_wedge, 
{low_delete_migration, _EID, _Fl}}, Resp)-> +to_pb_response(ReqID, {low_delete_migration, _EID, _Fl}, Resp)-> Status = conv_from_status(Resp), #mpb_ll_response{req_id=ReqID, delete_migration=#mpb_ll_deletemigrationresp{status=Status}}; -to_pb_response(ReqID, {low_skip_wedge, {low_trunc_hack, _EID, _Fl}}, Resp)-> +to_pb_response(ReqID, {low_trunc_hack, _EID, _Fl}, Resp)-> Status = conv_from_status(Resp), #mpb_ll_response{req_id=ReqID, trunc_hack=#mpb_ll_trunchackresp{status=Status}}; @@ -716,7 +691,7 @@ to_pb_response(ReqID, {high_auth, _User, _Pass}, _Resp) -> #mpb_response{req_id=ReqID, generic=#mpb_errorresp{code=1, msg="AUTH not implemented"}}; -to_pb_response(ReqID, {high_append_chunk, _NS, _Prefix, _Chunk, _TSum, _O}, Resp)-> +to_pb_response(ReqID, {high_append_chunk, _CoC_n, _CoC_l, _Prefix, _Chunk, _TSum, _CE}, Resp)-> case Resp of {ok, {Offset, Size, File}} -> Where = #mpb_chunkpos{offset=Offset, @@ -732,7 +707,7 @@ to_pb_response(ReqID, {high_append_chunk, _NS, _Prefix, _Chunk, _TSum, _O}, Resp _Else -> make_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) end; -to_pb_response(ReqID, {high_write_chunk, _File, _Offset, _Chunk, _CSum}, Resp) -> +to_pb_response(ReqID, {high_write_chunk, _File, _Offset, _Chunk, _TaggedCSum}, Resp) -> case Resp of {ok, {_,_,_}} -> %% machi_cr_client returns ok 2-tuple, convert to simple ok. @@ -822,12 +797,12 @@ make_tagged_csum(#mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA', csum=CSum}, _CB) -> make_ll_error_resp(ReqID, Code, Msg) -> #mpb_ll_response{req_id=ReqID, generic=#mpb_errorresp{code=Code, - msg=Msg}}. + msg=Msg}}. make_error_resp(ReqID, Code, Msg) -> #mpb_response{req_id=ReqID, generic=#mpb_errorresp{code=Code, - msg=Msg}}. + msg=Msg}}. conv_from_epoch_id({Epoch, EpochCSum}) -> #mpb_epochid{epoch_number=Epoch, @@ -987,26 +962,17 @@ conv_from_status(_OOPS) -> io:format(user, "HEY, ~s:~w got ~p\n", [?MODULE, ?LINE, _OOPS]), 'BAD_JOSS'. 
-conv_from_append_opts(#append_opts{chunk_extra=ChunkExtra, - preferred_file_name=Pref, - flag_fail_preferred=FailPref}) -> - {ChunkExtra, Pref, FailPref}. +conv_to_boolean(undefined) -> + false; +conv_to_boolean(0) -> + false; +conv_to_boolean(N) when is_integer(N) -> + true. - -conv_to_append_opts(#mpb_appendchunkreq{ - chunk_extra=ChunkExtra, - preferred_file_name=Pref, - flag_fail_preferred=FailPref}) -> - #append_opts{chunk_extra=ChunkExtra, - preferred_file_name=Pref, - flag_fail_preferred=FailPref}; -conv_to_append_opts(#mpb_ll_appendchunkreq{ - chunk_extra=ChunkExtra, - preferred_file_name=Pref, - flag_fail_preferred=FailPref}) -> - #append_opts{chunk_extra=ChunkExtra, - preferred_file_name=Pref, - flag_fail_preferred=FailPref}. +conv_from_boolean(false) -> + 0; +conv_from_boolean(true) -> + 1. conv_from_projection_v1(#projection_v1{epoch_number=Epoch, epoch_csum=CSum, diff --git a/src/machi_plist.erl b/src/machi_plist.erl deleted file mode 100644 index 7750b0a..0000000 --- a/src/machi_plist.erl +++ /dev/null @@ -1,69 +0,0 @@ --module(machi_plist). - -%%% @doc persistent list of binaries - --export([open/2, close/1, find/2, add/2]). - --ifdef(TEST). --export([all/1]). --endif. - --record(machi_plist, - {filename :: file:filename_all(), - fd :: file:io_device(), - list = [] :: list(string)}). - --type plist() :: #machi_plist{}. --export_type([plist/0]). - --spec open(file:filename_all(), proplists:proplist()) -> - {ok, plist()} | {error, file:posix()}. -open(Filename, _Opt) -> - %% TODO: This decode could fail if the file didn't finish writing - %% whole contents, which should be fixed by some persistent - %% solution. - List = case file:read_file(Filename) of - {ok, <<>>} -> []; - {ok, Bin} -> binary_to_term(Bin); - {error, enoent} -> [] - end, - case file:open(Filename, [read, write, raw, binary, sync]) of - {ok, Fd} -> - {ok, #machi_plist{filename=Filename, - fd=Fd, - list=List}}; - Error -> - Error - end. - --spec close(plist()) -> ok. 
-close(#machi_plist{fd=Fd}) -> - _ = file:close(Fd). - --spec find(plist(), string()) -> boolean(). -find(#machi_plist{list=List}, Name) -> - lists:member(Name, List). - --spec add(plist(), string()) -> {ok, plist()} | {error, file:posix()}. -add(Plist = #machi_plist{list=List0, fd=Fd}, Name) -> - case find(Plist, Name) of - true -> - {ok, Plist}; - false -> - List = lists:append(List0, [Name]), - %% TODO: partial write could break the file with other - %% persistent info (even lose data of trimmed states); - %% needs a solution. - case file:pwrite(Fd, 0, term_to_binary(List)) of - ok -> - {ok, Plist#machi_plist{list=List}}; - Error -> - Error - end - end. - --ifdef(TEST). --spec all(plist()) -> [file:filename()]. -all(#machi_plist{list=List}) -> - List. --endif. diff --git a/src/machi_proxy_flu1_client.erl b/src/machi_proxy_flu1_client.erl index 8f9dcf6..e4bc0d2 100644 --- a/src/machi_proxy_flu1_client.erl +++ b/src/machi_proxy_flu1_client.erl @@ -22,10 +22,6 @@ %% proxy-process style API for hiding messy details such as TCP %% connection/disconnection with the remote Machi server. %% -%% Please see {@link machi_flu1_client} the "Client API implemntation notes" -%% section for how this module relates to the rest of the client API -%% implementation. -%% %% Machi is intentionally avoiding using distributed Erlang for %% Machi's communication. 
This design decision makes Erlang-side code %% more difficult & complex, but it's the price to pay for some @@ -61,9 +57,12 @@ %% FLU1 API -export([ %% File API - append_chunk/6, append_chunk/8, - read_chunk/7, read_chunk/8, - checksum_list/2, checksum_list/3, + append_chunk/4, append_chunk/5, + append_chunk/6, append_chunk/7, + append_chunk_extra/5, append_chunk_extra/6, + append_chunk_extra/7, append_chunk_extra/8, + read_chunk/6, read_chunk/7, + checksum_list/3, checksum_list/4, list_files/2, list_files/3, wedge_status/1, wedge_status/2, @@ -81,8 +80,8 @@ quit/1, %% Internal API - write_chunk/7, write_chunk/8, - trim_chunk/6, trim_chunk/7, + write_chunk/5, write_chunk/6, + trim_chunk/5, trim_chunk/6, %% Helpers stop_proxies/1, start_proxies/1 @@ -107,39 +106,80 @@ start_link(#p_srvr{}=I) -> %% @doc Append a chunk (binary- or iolist-style) of data to a file %% with `Prefix'. -append_chunk(PidSpec, NSInfo, EpochID, Prefix, Chunk, CSum) -> - append_chunk(PidSpec, NSInfo, EpochID, Prefix, Chunk, CSum, - #append_opts{}, infinity). +append_chunk(PidSpec, EpochID, Prefix, Chunk) -> + append_chunk(PidSpec, EpochID, Prefix, Chunk, infinity). %% @doc Append a chunk (binary- or iolist-style) of data to a file %% with `Prefix'. -append_chunk(PidSpec, NSInfo, EpochID, Prefix, Chunk, CSum, Opts, - Timeout) -> - gen_server:call(PidSpec, {req, {append_chunk, NSInfo, EpochID, - Prefix, Chunk, CSum, Opts, Timeout}}, +append_chunk(PidSpec, EpochID, Prefix, Chunk, Timeout) -> + append_chunk_extra(PidSpec, EpochID, + ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, + Prefix, Chunk, 0, Timeout). + +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix'. + +append_chunk(PidSpec, EpochID, CoC_Namespace, CoC_Locator, Prefix, Chunk) -> + append_chunk(PidSpec, EpochID, CoC_Namespace, CoC_Locator, Prefix, Chunk, infinity). + +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix'. 
+ +append_chunk(PidSpec, EpochID, CoC_Namespace, CoC_Locator, Prefix, Chunk, Timeout) -> + append_chunk_extra(PidSpec, EpochID, + CoC_Namespace, CoC_Locator, + Prefix, Chunk, 0, Timeout). + +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix'. + +append_chunk_extra(PidSpec, EpochID, Prefix, Chunk, ChunkExtra) + when is_integer(ChunkExtra), ChunkExtra >= 0 -> + append_chunk_extra(PidSpec, EpochID, + ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, + Prefix, Chunk, ChunkExtra, infinity). + +%% @doc Append a chunk (binary- or iolist-style) of data to a file +%% with `Prefix'. + +append_chunk_extra(PidSpec, EpochID, Prefix, Chunk, ChunkExtra, Timeout) -> + append_chunk_extra(PidSpec, EpochID, + ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, + Prefix, Chunk, ChunkExtra, Timeout). + +append_chunk_extra(PidSpec, EpochID, CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra) -> + append_chunk_extra(PidSpec, EpochID, CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra, infinity). + +append_chunk_extra(PidSpec, EpochID, CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra, Timeout) -> + gen_server:call(PidSpec, {req, {append_chunk_extra, EpochID, + CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra}}, Timeout). %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. -read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, Opts) -> - read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, Opts, infinity). +read_chunk(PidSpec, EpochID, File, Offset, Size, Opts) -> + read_chunk(PidSpec, EpochID, File, Offset, Size, Opts, infinity). %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. 
-read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, Opts, Timeout) -> - gen_server:call(PidSpec, {req, {read_chunk, NSInfo, EpochID, File, Offset, Size, Opts}}, +read_chunk(PidSpec, EpochID, File, Offset, Size, Opts, Timeout) -> + gen_server:call(PidSpec, {req, {read_chunk, EpochID, File, Offset, Size, Opts}}, Timeout). %% @doc Fetch the list of chunk checksums for `File'. -checksum_list(PidSpec, File) -> - checksum_list(PidSpec, File, infinity). +checksum_list(PidSpec, EpochID, File) -> + checksum_list(PidSpec, EpochID, File, infinity). %% @doc Fetch the list of chunk checksums for `File'. -checksum_list(PidSpec, File, Timeout) -> - gen_server:call(PidSpec, {req, {checksum_list, File}}, +checksum_list(PidSpec, EpochID, File, Timeout) -> + gen_server:call(PidSpec, {req, {checksum_list, EpochID, File}}, Timeout). %% @doc Fetch the list of all files on the remote FLU. @@ -280,18 +320,18 @@ quit(PidSpec) -> %% @doc Write a chunk (binary- or iolist-style) of data to a file %% with `Prefix' at `Offset'. -write_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, CSum) -> - write_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, CSum, infinity). +write_chunk(PidSpec, EpochID, File, Offset, Chunk) -> + write_chunk(PidSpec, EpochID, File, Offset, Chunk, infinity). %% @doc Write a chunk (binary- or iolist-style) of data to a file %% with `Prefix' at `Offset'. 
-write_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, CSum, Timeout) -> - case gen_server:call(PidSpec, {req, {write_chunk, NSInfo, EpochID, File, Offset, Chunk, CSum}}, +write_chunk(PidSpec, EpochID, File, Offset, Chunk, Timeout) -> + case gen_server:call(PidSpec, {req, {write_chunk, EpochID, File, Offset, Chunk}}, Timeout) of {error, written}=Err -> Size = byte_size(Chunk), - case read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, undefined, Timeout) of + case read_chunk(PidSpec, EpochID, File, Offset, Size, [], Timeout) of {ok, {[{File, Offset, Chunk2, _}], []}} when Chunk2 == Chunk -> %% See equivalent comment inside write_projection(). ok; @@ -303,15 +343,15 @@ write_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, CSum, Timeout) -> end. -trim_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size) -> - trim_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, infinity). +trim_chunk(PidSpec, EpochID, File, Offset, Size) -> + trim_chunk(PidSpec, EpochID, File, Offset, Size, infinity). %% @doc Write a chunk (binary- or iolist-style) of data to a file %% with `Prefix' at `Offset'. -trim_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, Timeout) -> +trim_chunk(PidSpec, EpochID, File, Offset, Chunk, Timeout) -> gen_server:call(PidSpec, - {req, {trim_chunk, NSInfo, EpochID, File, Offset, Chunk}}, + {req, {trim_chunk, EpochID, File, Offset, Chunk}}, Timeout). %%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -375,24 +415,24 @@ do_req_retry(_Req, 2, Err, S) -> do_req_retry(Req, Depth, _Err, S) -> do_req(Req, Depth + 1, try_connect(disconnect(S))). 
-make_req_fun({append_chunk, NSInfo, EpochID, - Prefix, Chunk, CSum, Opts, Timeout}, +make_req_fun({append_chunk_extra, EpochID, CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> - fun() -> Mod:append_chunk(Sock, NSInfo, EpochID, - Prefix, Chunk, CSum, Opts, Timeout) + fun() -> Mod:append_chunk_extra(Sock, EpochID, CoC_Namespace, CoC_Locator, + Prefix, Chunk, ChunkExtra) end; -make_req_fun({read_chunk, NSInfo, EpochID, File, Offset, Size, Opts}, +make_req_fun({read_chunk, EpochID, File, Offset, Size, Opts}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> - fun() -> Mod:read_chunk(Sock, NSInfo, EpochID, File, Offset, Size, Opts) end; -make_req_fun({write_chunk, NSInfo, EpochID, File, Offset, Chunk, CSum}, + fun() -> Mod:read_chunk(Sock, EpochID, File, Offset, Size, Opts) end; +make_req_fun({write_chunk, EpochID, File, Offset, Chunk}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> - fun() -> Mod:write_chunk(Sock, NSInfo, EpochID, File, Offset, Chunk, CSum) end; -make_req_fun({trim_chunk, NSInfo, EpochID, File, Offset, Size}, + fun() -> Mod:write_chunk(Sock, EpochID, File, Offset, Chunk) end; +make_req_fun({trim_chunk, EpochID, File, Offset, Size}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> - fun() -> Mod:trim_chunk(Sock, NSInfo, EpochID, File, Offset, Size) end; -make_req_fun({checksum_list, File}, + fun() -> Mod:trim_chunk(Sock, EpochID, File, Offset, Size) end; +make_req_fun({checksum_list, EpochID, File}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> - fun() -> Mod:checksum_list(Sock, File) end; + fun() -> Mod:checksum_list(Sock, EpochID, File) end; make_req_fun({list_files, EpochID}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> fun() -> Mod:list_files(Sock, EpochID) end; diff --git a/src/machi_sup.erl b/src/machi_sup.erl index f7ddd10..6cf7695 100644 --- a/src/machi_sup.erl +++ b/src/machi_sup.erl @@ -65,11 +65,5 @@ init([]) -> LifecycleMgr = {machi_lifecycle_mgr, {machi_lifecycle_mgr, start_link, []}, 
Restart, Shutdown, worker, []}, - RunningApps = [A || {A,_D,_V} <- application:which_applications()], - Specs = case lists:member(ranch, RunningApps) of - true -> - [ServerSup, LifecycleMgr]; - false -> - [ServerSup, RanchSup, LifecycleMgr] - end, - {ok, {SupFlags, Specs}}. + + {ok, {SupFlags, [ServerSup, RanchSup, LifecycleMgr]}}. diff --git a/src/machi_util.erl b/src/machi_util.erl index 95a42a5..aa5f070 100644 --- a/src/machi_util.erl +++ b/src/machi_util.erl @@ -49,9 +49,7 @@ %% Other wait_for_death/2, wait_for_life/2, bool2int/1, - int2bool/1, - read_opts_default/1, - ns_info_default/1 + int2bool/1 ]). -include("machi.hrl"). @@ -70,12 +68,12 @@ make_regname(Prefix) when is_list(Prefix) -> %% @doc Calculate a config file path, by common convention. --spec make_config_filename(string(), machi_dt:namespace(), machi_dt:locator(), string()) -> +-spec make_config_filename(string(), machi_dt:coc_namespace(), machi_dt:coc_locator(), string()) -> string(). -make_config_filename(DataDir, NS, NSLocator, Prefix) -> - NSLocator_str = int_to_hexstr(NSLocator, 32), +make_config_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix) -> + Locator_str = int_to_hexstr(CoC_Locator, 32), lists:flatten(io_lib:format("~s/config/~s^~s^~s", - [DataDir, Prefix, NS, NSLocator_str])). + [DataDir, Prefix, CoC_Namespace, Locator_str])). %% @doc Calculate a config file path, by common convention. @@ -104,19 +102,19 @@ make_checksum_filename(DataDir, FileName) -> %% @doc Calculate a file data file path, by common convention. --spec make_data_filename(string(), machi_dt:namespace(), machi_dt:locator(), string(), atom()|string()|binary(), integer()|string()) -> +-spec make_data_filename(string(), machi_dt:coc_namespace(), machi_dt:coc_locator(), string(), atom()|string()|binary(), integer()|string()) -> {binary(), string()}. 
-make_data_filename(DataDir, NS, NSLocator, Prefix, SequencerName, FileNum) +make_data_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix, SequencerName, FileNum) when is_integer(FileNum) -> - NSLocator_str = int_to_hexstr(NSLocator, 32), + Locator_str = int_to_hexstr(CoC_Locator, 32), File = erlang:iolist_to_binary(io_lib:format("~s^~s^~s^~s^~w", - [Prefix, NS, NSLocator_str, SequencerName, FileNum])), + [Prefix, CoC_Namespace, Locator_str, SequencerName, FileNum])), make_data_filename2(DataDir, File); -make_data_filename(DataDir, NS, NSLocator, Prefix, SequencerName, String) +make_data_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix, SequencerName, String) when is_list(String) -> - NSLocator_str = int_to_hexstr(NSLocator, 32), + Locator_str = int_to_hexstr(CoC_Locator, 32), File = erlang:iolist_to_binary(io_lib:format("~s^~s^~s^~s^~s", - [Prefix, NS, NSLocator_str, SequencerName, string])), + [Prefix, CoC_Namespace, Locator_str, SequencerName, string])), make_data_filename2(DataDir, File). make_data_filename2(DataDir, File) -> @@ -156,36 +154,37 @@ is_valid_filename(Filename) -> %% The components will be: %%
    %%
  • Prefix
  • -%%
  • Cluster namespace
  • -%%
  • Cluster locator
  • +%%
  • CoC Namespace
  • +%%
  • CoC locator
  • %%
  • UUID
  • %%
  • Sequence number
  • %%
%% %% Invalid filenames will return an empty list. --spec parse_filename( Filename :: string() ) -> {} | {string(), machi_dt:namespace(), machi_dt:locator(), string(), string() }. +-spec parse_filename( Filename :: string() ) -> {} | {string(), machi_dt:coc_namespace(), machi_dt:coc_locator(), string(), string() }. parse_filename(Filename) -> case string:tokens(Filename, "^") of - [Prefix, NS, NSLocator, UUID, SeqNo] -> - {Prefix, NS, list_to_integer(NSLocator), UUID, SeqNo}; - [Prefix, NSLocator, UUID, SeqNo] -> + [Prefix, CoC_NS, CoC_Loc, UUID, SeqNo] -> + {Prefix, CoC_NS, list_to_integer(CoC_Loc), UUID, SeqNo}; + [Prefix, CoC_Loc, UUID, SeqNo] -> %% string:tokens() doesn't consider "foo^^bar" as 3 tokens {sigh} case re:replace(Filename, "[^^]+", "x", [global,{return,binary}]) of <<"x^^x^x^x">> -> - {Prefix, <<"">>, list_to_integer(NSLocator), UUID, SeqNo}; + {Prefix, <<"">>, list_to_integer(CoC_Loc), UUID, SeqNo}; _ -> {} end; _ -> {} end. + %% @doc Read the file size of a config file, which is used as the %% basis for a minimum sequence number. --spec read_max_filenum(string(), machi_dt:namespace(), machi_dt:locator(), string()) -> +-spec read_max_filenum(string(), machi_dt:coc_namespace(), machi_dt:coc_locator(), string()) -> non_neg_integer(). -read_max_filenum(DataDir, NS, NSLocator, Prefix) -> - case file:read_file_info(make_config_filename(DataDir, NS, NSLocator, Prefix)) of +read_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix) -> + case file:read_file_info(make_config_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix)) of {error, enoent} -> 0; {ok, FI} -> @@ -195,11 +194,11 @@ read_max_filenum(DataDir, NS, NSLocator, Prefix) -> %% @doc Increase the file size of a config file, which is used as the %% basis for a minimum sequence number. 
--spec increment_max_filenum(string(), machi_dt:namespace(), machi_dt:locator(), string()) -> +-spec increment_max_filenum(string(), machi_dt:coc_namespace(), machi_dt:coc_locator(), string()) -> ok | {error, term()}. -increment_max_filenum(DataDir, NS, NSLocator, Prefix) -> +increment_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix) -> try - {ok, FH} = file:open(make_config_filename(DataDir, NS, NSLocator, Prefix), [append]), + {ok, FH} = file:open(make_config_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix), [append]), ok = file:write(FH, "x"), ok = file:sync(FH), ok = file:close(FH) @@ -288,25 +287,12 @@ int_to_hexbin(I, I_size) -> checksum_chunk(Chunk) when is_binary(Chunk); is_list(Chunk) -> crypto:hash(sha, Chunk). -convert_csum_tag(A) when is_atom(A)-> - A; -convert_csum_tag(?CSUM_TAG_NONE) -> - ?CSUM_TAG_NONE_ATOM; -convert_csum_tag(?CSUM_TAG_CLIENT_SHA) -> - ?CSUM_TAG_CLIENT_SHA_ATOM; -convert_csum_tag(?CSUM_TAG_SERVER_SHA) -> - ?CSUM_TAG_SERVER_SHA_ATOM; -convert_csum_tag(?CSUM_TAG_SERVER_REGEN_SHA) -> - ?CSUM_TAG_SERVER_REGEN_SHA_ATOM. - %% @doc Create a tagged checksum make_tagged_csum(none) -> <>; -make_tagged_csum(<<>>) -> - <>; make_tagged_csum({Tag, CSum}) -> - make_tagged_csum(convert_csum_tag(Tag), CSum). + make_tagged_csum(Tag, CSum). %% @doc Makes tagged csum. Each meanings are: %% none / ?CSUM_TAG_NONE @@ -374,7 +360,7 @@ wait_for_death(Pid, Iters) when is_pid(Pid) -> false -> ok; true -> - timer:sleep(10), + timer:sleep(1), wait_for_death(Pid, Iters-1) end. @@ -445,17 +431,3 @@ bool2int(true) -> 1; bool2int(false) -> 0. int2bool(0) -> false; int2bool(I) when is_integer(I) -> true. - -read_opts_default(#read_opts{}=NSInfo) -> - NSInfo; -read_opts_default(A) when A == 'undefined'; A == 'noopt'; A == 'none' -> - #read_opts{}; -read_opts_default(A) when is_atom(A) -> - #read_opts{}. - -ns_info_default(#ns_info{}=NSInfo) -> - NSInfo; -ns_info_default(A) when is_atom(A) -> - #ns_info{}. 
- - diff --git a/src/machi_yessir_client.erl b/src/machi_yessir_client.erl index 8721824..1bdef2a 100644 --- a/src/machi_yessir_client.erl +++ b/src/machi_yessir_client.erl @@ -22,8 +22,6 @@ -module(machi_yessir_client). --ifdef(TODO_refactoring_deferred). - -include("machi.hrl"). -include("machi_projection.hrl"). @@ -32,7 +30,7 @@ append_chunk/4, append_chunk/5, append_chunk_extra/5, append_chunk_extra/6, read_chunk/5, read_chunk/6, - checksum_list/2, checksum_list/3, + checksum_list/3, checksum_list/4, list_files/2, list_files/3, wedge_status/1, wedge_status/2, @@ -175,7 +173,7 @@ read_chunk(_Host, _TcpPort, EpochID, File, Offset, Size) %% @doc Fetch the list of chunk checksums for `File'. -checksum_list(#yessir{name=Name,chunk_size=ChunkSize}, File) -> +checksum_list(#yessir{name=Name,chunk_size=ChunkSize}, _EpochID, File) -> case get({Name,offset,File}) of undefined -> {error, no_such_file}; @@ -189,10 +187,10 @@ checksum_list(#yessir{name=Name,chunk_size=ChunkSize}, File) -> %% @doc Fetch the list of chunk checksums for `File'. -checksum_list(_Host, _TcpPort, File) -> +checksum_list(_Host, _TcpPort, EpochID, File) -> Sock = connect(#p_srvr{proto_mod=?MODULE}), try - checksum_list(Sock, File) + checksum_list(Sock, EpochID, File) after disconnect(Sock) end. @@ -511,5 +509,3 @@ disconnect(#yessir{name=Name}) -> %% =INFO REPORT==== 17-May-2015::18:57:52 === %% Repair success: tail a of [a] finished ap_mode repair ID {a,{1431,856671,140404}}: ok %% Stats [{t_in_files,0},{t_in_chunks,10413},{t_in_bytes,682426368},{t_out_files,0},{t_out_chunks,10413},{t_out_bytes,682426368},{t_bad_chunks,0},{t_elapsed_seconds,1.591}] - --endif. 
% TODO_refactoring_deferred diff --git a/test/machi_admin_util_test.erl b/test/machi_admin_util_test.erl index cd4d813..1ebbbf3 100644 --- a/test/machi_admin_util_test.erl +++ b/test/machi_admin_util_test.erl @@ -44,8 +44,6 @@ verify_file_checksums_test2() -> TcpPort = 32958, DataDir = "./data", W_props = [{initial_wedged, false}], - NSInfo = undefined, - NoCSum = <<>>, try machi_test_util:start_flu_package(verify1_flu, TcpPort, DataDir, W_props), @@ -53,8 +51,8 @@ verify_file_checksums_test2() -> try Prefix = <<"verify_prefix">>, NumChunks = 10, - [{ok, _} = ?FLU_C:append_chunk(Sock1, NSInfo, ?DUMMY_PV1_EPOCH, - Prefix, <>, NoCSum) || + [{ok, _} = ?FLU_C:append_chunk(Sock1, ?DUMMY_PV1_EPOCH, + Prefix, <>) || X <- lists:seq(1, NumChunks)], {ok, [{_FileSize,File}]} = ?FLU_C:list_files(Sock1, ?DUMMY_PV1_EPOCH), ?assertEqual({ok, []}, diff --git a/test/machi_ap_repair_eqc.erl b/test/machi_ap_repair_eqc.erl index 55bc082..7d87d35 100644 --- a/test/machi_ap_repair_eqc.erl +++ b/test/machi_ap_repair_eqc.erl @@ -118,10 +118,7 @@ append(CRIndex, Bin, #state{verbose=V}=S) -> {_SimSelfName, C} = lists:nth(CRIndex, CRList), Prefix = <<"pre">>, Len = byte_size(Bin), - NSInfo = #ns_info{}, - NoCSum = <<>>, - Opts1 = #append_opts{}, - Res = (catch machi_cr_client:append_chunk(C, NSInfo, Prefix, Bin, NoCSum, Opts1, sec(1))), + Res = (catch machi_cr_client:append_chunk(C, Prefix, Bin, {sec(1), sec(1)})), case Res of {ok, {_Off, Len, _FileName}=Key} -> case ets:insert_new(?WRITTEN_TAB, {Key, Bin}) of @@ -193,7 +190,6 @@ change_partition(Partition, %% Don't wait for stable chain, tick will be executed on demand %% in append oprations _ = tick(S), - ok. 
%% Generators @@ -431,7 +427,7 @@ confirm_result(_T) -> 0 -> ok; _ -> DumpFailed = filename:join(DirBase, "dump-failed-" ++ Suffix), - ?V("Dump failed ETS tab to: ~s~n", [DumpFailed]), + ?V("Dump failed ETS tab to: ~w~n", [DumpFailed]), ets:tab2file(?FAILED_TAB, DumpFailed) end, case Critical of @@ -454,14 +450,14 @@ confirm_written(C) -> assert_chunk(C, {Off, Len, FileName}=Key, Bin) -> %% TODO: This probably a bug, read_chunk respnds with filename of `string()' type + FileNameStr = binary_to_list(FileName), %% TODO : Use CSum instead of binary (after disuccsion about CSum is calmed down?) - NSInfo = undefined, - case (catch machi_cr_client:read_chunk(C, NSInfo, FileName, Off, Len, undefined, sec(3))) of - {ok, {[{FileName, Off, Bin, _}], []}} -> + case (catch machi_cr_client:read_chunk(C, FileName, Off, Len, [], sec(3))) of + {ok, {[{FileNameStr, Off, Bin, _}], []}} -> ok; {ok, Got} -> ?V("read_chunk got different binary for Key=~p~n", [Key]), - ?V(" Expected: ~p~n", [{[{FileName, Off, Bin, <<"CSum-NYI">>}], []}]), + ?V(" Expected: ~p~n", [{[{FileNameStr, Off, Bin, <<"CSum-NYI">>}], []}]), ?V(" Got: ~p~n", [Got]), {error, different_binary}; {error, Reason} -> @@ -483,7 +479,7 @@ eqc_verbose() -> os:getenv("EQC_VERBOSE") =:= "true". eqc_timeout(Default) -> - PropTimeout = case os:getenv("EQC_TIME") of + PropTimeout = case os:getenv("EQC_TIMEOUT") of false -> Default; V -> list_to_integer(V) end, @@ -558,10 +554,8 @@ wait_until_stable(ExpectedChainState, FLUNames, MgrNames, Retries, Verbose) -> FCList = fc_list(), wait_until_stable1(ExpectedChainState, TickFun, FCList, Retries, Verbose). -wait_until_stable1(ExpectedChainState, _TickFun, FCList, 0, _Verbose) -> - ?V(" [ERROR] _ExpectedChainState ~p\n", [ExpectedChainState]), +wait_until_stable1(_ExpectedChainState, _TickFun, FCList, 0, _Verbose) -> ?V(" [ERROR] wait_until_stable failed.... : ~p~n", [chain_state(FCList)]), - ?V(" [ERROR] norm.... 
: ~p~n", [normalize_chain_state(chain_state(FCList))]), false; wait_until_stable1(ExpectedChainState, TickFun, FCList, Reties, Verbose) -> [TickFun(3, 0, 100) || _ <- lists:seq(1, 3)], diff --git a/test/machi_chain_manager1_converge_demo.erl b/test/machi_chain_manager1_converge_demo.erl index c1299cd..cee7a78 100644 --- a/test/machi_chain_manager1_converge_demo.erl +++ b/test/machi_chain_manager1_converge_demo.erl @@ -134,7 +134,6 @@ Press control-c to interrupt the test....". %% convergence_demo_testfun(3). -define(DEFAULT_MGR_OPTS, [{private_write_verbose, false}, - {private_write_verbose_confirm, true}, {active_mode,false}, {use_partition_simulator, true}]). @@ -151,8 +150,7 @@ convergence_demo_testfun(NumFLUs, MgrOpts0) -> %% Faster test startup, commented: io:format(user, short_doc(), []), %% Faster test startup, commented: timer:sleep(3000), - Apps = [sasl, ranch], - [application:start(App) || App <- Apps], + application:start(sasl), MgrOpts = MgrOpts0 ++ ?DEFAULT_MGR_OPTS, TcpPort = proplists:get_value(port_base, MgrOpts, 62877), @@ -395,8 +393,7 @@ timer:sleep(1234), exit(SupPid, normal), ok = machi_partition_simulator:stop(), [ok = ?FLU_PC:quit(PPid) || {_, PPid} <- Namez], - machi_util:wait_for_death(SupPid, 100), - [application:start(App) || App <- lists:reverse(Apps)] + machi_util:wait_for_death(SupPid, 100) end. 
%% Many of the static partition lists below have been problematic at one diff --git a/test/machi_chain_manager1_test.erl b/test/machi_chain_manager1_test.erl index 80296d2..02010ff 100644 --- a/test/machi_chain_manager1_test.erl +++ b/test/machi_chain_manager1_test.erl @@ -401,7 +401,7 @@ nonunanimous_setup_and_fix_test2() -> Mb, ChainName, TheEpoch_3, ap_mode, MembersDict4, []), Advance(), - {ok, {true, _,_,_}} = ?FLU_PC:wedge_status(Proxy_a), + {ok, {true, _}} = ?FLU_PC:wedge_status(Proxy_a), {_, _, TheEpoch_4} = ?MGR:trigger_react_to_env(Mb), {_, _, TheEpoch_4} = ?MGR:trigger_react_to_env(Mc), [{ok, #projection_v1{upi=[b,c], repairing=[]}} = @@ -451,9 +451,9 @@ nonunanimous_setup_and_fix_test2() -> #p_srvr{name=NameA} = hd(Ps), {ok,_}=machi_flu_psup:start_flu_package(NameA, TcpPort+1, hd(Dirs), Opts), Advance(), - {ok, {true, _,_,_}} = ?FLU_PC:wedge_status(Proxy_a), - {ok, {false, EpochID_8,_,_}} = ?FLU_PC:wedge_status(Proxy_b), - {ok, {false, EpochID_8,_,_}} = ?FLU_PC:wedge_status(Proxy_c), + {ok, {true, _}} = ?FLU_PC:wedge_status(Proxy_a), + {ok, {false, EpochID_8}} = ?FLU_PC:wedge_status(Proxy_b), + {ok, {false, EpochID_8}} = ?FLU_PC:wedge_status(Proxy_c), [{ok, #projection_v1{upi=[b,c], repairing=[]}} = ?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- tl(Proxies)], @@ -463,8 +463,8 @@ nonunanimous_setup_and_fix_test2() -> ok = machi_flu_psup:stop_flu_package(a), Advance(), machi_flu1_test:clean_up_data_dir(hd(Dirs)), - {ok, {false, _,_,_}} = ?FLU_PC:wedge_status(Proxy_b), - {ok, {false, _,_,_}} = ?FLU_PC:wedge_status(Proxy_c), + {ok, {false, _}} = ?FLU_PC:wedge_status(Proxy_b), + {ok, {false, _}} = ?FLU_PC:wedge_status(Proxy_c), %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% io:format("STEP: Add a to the chain again (a is stopped).\n", []), @@ -482,9 +482,9 @@ nonunanimous_setup_and_fix_test2() -> {ok,_}=machi_flu_psup:start_flu_package(NameA, TcpPort+1, hd(Dirs), Opts), Advance(), - {ok, {false, {TheEpoch10,_},_,_}} = ?FLU_PC:wedge_status(Proxy_a), - 
{ok, {false, {TheEpoch10,_},_,_}} = ?FLU_PC:wedge_status(Proxy_b), - {ok, {false, {TheEpoch10,_},_,_}} = ?FLU_PC:wedge_status(Proxy_c), + {ok, {false, {TheEpoch10,_}}} = ?FLU_PC:wedge_status(Proxy_a), + {ok, {false, {TheEpoch10,_}}} = ?FLU_PC:wedge_status(Proxy_b), + {ok, {false, {TheEpoch10,_}}} = ?FLU_PC:wedge_status(Proxy_c), [{ok, #projection_v1{upi=[b,c], repairing=[a]}} = ?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- Proxies], ok diff --git a/test/machi_cr_client_test.erl b/test/machi_cr_client_test.erl index 29e1d13..5179fc8 100644 --- a/test/machi_cr_client_test.erl +++ b/test/machi_cr_client_test.erl @@ -107,8 +107,6 @@ smoke_test2() -> try Prefix = <<"pre">>, Chunk1 = <<"yochunk">>, - NSInfo = undefined, - NoCSum = <<>>, Host = "localhost", PortBase = 64454, Os = [{ignore_stability_time, true}, {active_mode, false}], @@ -116,92 +114,91 @@ smoke_test2() -> %% Whew ... ok, now start some damn tests. {ok, C1} = machi_cr_client:start_link([P || {_,P}<-orddict:to_list(D)]), - machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum), + machi_cr_client:append_chunk(C1, Prefix, Chunk1), {ok, {Off1,Size1,File1}} = - machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum), - BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, "foo")}, + machi_cr_client:append_chunk(C1, Prefix, Chunk1), + Chunk1_badcs = {<>, Chunk1}, {error, bad_checksum} = - machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, BadCSum), + machi_cr_client:append_chunk(C1, Prefix, Chunk1_badcs), {ok, {[{_, Off1, Chunk1, _}], []}} = - machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, Size1, undefined), + machi_cr_client:read_chunk(C1, File1, Off1, Size1, []), {ok, PPP} = machi_flu1_client:read_latest_projection(Host, PortBase+0, private), %% Verify that the client's CR wrote to all of them. 
[{ok, {[{_, Off1, Chunk1, _}], []}} = machi_flu1_client:read_chunk( - Host, PortBase+X, NSInfo, EpochID, File1, Off1, Size1, undefined) || + Host, PortBase+X, EpochID, File1, Off1, Size1, []) || X <- [0,1,2] ], %% Test read repair: Manually write to head, then verify that %% read-repair fixes all. FooOff1 = Off1 + (1024*1024), [{error, not_written} = machi_flu1_client:read_chunk( - Host, PortBase+X, NSInfo, EpochID, - File1, FooOff1, Size1, undefined) || X <- [0,1,2] ], - ok = machi_flu1_client:write_chunk(Host, PortBase+0, NSInfo, EpochID, - File1, FooOff1, Chunk1, NoCSum), - {ok, {[{File1, FooOff1, Chunk1, _}=_YY], []}} = - machi_flu1_client:read_chunk(Host, PortBase+0, NSInfo, EpochID, - File1, FooOff1, Size1, undefined), - {ok, {[{File1, FooOff1, Chunk1, _}], []}} = - machi_cr_client:read_chunk(C1, NSInfo, File1, FooOff1, Size1, undefined), + Host, PortBase+X, EpochID, + File1, FooOff1, Size1, []) || X <- [0,1,2] ], + ok = machi_flu1_client:write_chunk(Host, PortBase+0, EpochID, + File1, FooOff1, Chunk1), + {ok, {[{_, FooOff1, Chunk1, _}], []}} = + machi_flu1_client:read_chunk(Host, PortBase+0, EpochID, + File1, FooOff1, Size1, []), + {ok, {[{_, FooOff1, Chunk1, _}], []}} = + machi_cr_client:read_chunk(C1, File1, FooOff1, Size1, []), [?assertMatch({X,{ok, {[{_, FooOff1, Chunk1, _}], []}}}, {X,machi_flu1_client:read_chunk( - Host, PortBase+X, NSInfo, EpochID, - File1, FooOff1, Size1, undefined)}) + Host, PortBase+X, EpochID, + File1, FooOff1, Size1, [])}) || X <- [0,1,2] ], %% Test read repair: Manually write to middle, then same checking. 
FooOff2 = Off1 + (2*1024*1024), Chunk2 = <<"Middle repair chunk">>, Size2 = size(Chunk2), - ok = machi_flu1_client:write_chunk(Host, PortBase+1, NSInfo, EpochID, - File1, FooOff2, Chunk2, NoCSum), - {ok, {[{File1, FooOff2, Chunk2, _}], []}} = - machi_cr_client:read_chunk(C1, NSInfo, File1, FooOff2, Size2, undefined), - [{X,{ok, {[{File1, FooOff2, Chunk2, _}], []}}} = + ok = machi_flu1_client:write_chunk(Host, PortBase+1, EpochID, + File1, FooOff2, Chunk2), + {ok, {[{_, FooOff2, Chunk2, _}], []}} = + machi_cr_client:read_chunk(C1, File1, FooOff2, Size2, []), + [{X,{ok, {[{_, FooOff2, Chunk2, _}], []}}} = {X,machi_flu1_client:read_chunk( - Host, PortBase+X, NSInfo, EpochID, - File1, FooOff2, Size2, undefined)} || X <- [0,1,2] ], + Host, PortBase+X, EpochID, + File1, FooOff2, Size2, [])} || X <- [0,1,2] ], %% Misc API smoke & minor regression checks - {error, bad_arg} = machi_cr_client:read_chunk(C1, NSInfo, <<"no">>, - 999999999, 1, undefined), - {ok, {[{File1,Off1,Chunk1,_}, {File1,FooOff1,Chunk1,_}, {File1,FooOff2,Chunk2,_}], + {error, bad_arg} = machi_cr_client:read_chunk(C1, <<"no">>, + 999999999, 1, []), + {ok, {[{_,Off1,Chunk1,_}, {_,FooOff1,Chunk1,_}, {_,FooOff2,Chunk2,_}], []}} = - machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, 88888888, undefined), + machi_cr_client:read_chunk(C1, File1, Off1, 88888888, []), %% Checksum list return value is a primitive binary(). 
{ok, KludgeBin} = machi_cr_client:checksum_list(C1, File1), true = is_binary(KludgeBin), {error, bad_arg} = machi_cr_client:checksum_list(C1, <<"!!!!">>), - io:format(user, "\nFiles = ~p\n", [machi_cr_client:list_files(C1)]), +io:format(user, "\nFiles = ~p\n", [machi_cr_client:list_files(C1)]), %% Exactly one file right now, e.g., %% {ok,[{2098202,<<"pre^b144ef13-db4d-4c9f-96e7-caff02dc754f^1">>}]} {ok, [_]} = machi_cr_client:list_files(C1), - %% Go back and test append_chunk() + extra and write_chunk() + %% Go back and test append_chunk_extra() and write_chunk() Chunk10 = <<"It's a different chunk!">>, Size10 = byte_size(Chunk10), Extra10 = 5, - Opts1 = #append_opts{chunk_extra=Extra10*Size10}, {ok, {Off10,Size10,File10}} = - machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk10, - NoCSum, Opts1), + machi_cr_client:append_chunk_extra(C1, Prefix, Chunk10, + Extra10 * Size10), {ok, {[{_, Off10, Chunk10, _}], []}} = - machi_cr_client:read_chunk(C1, NSInfo, File10, Off10, Size10, undefined), + machi_cr_client:read_chunk(C1, File10, Off10, Size10, []), [begin Offx = Off10 + (Seq * Size10), %% TODO: uncomment written/not_written enforcement is available. - %% {error,not_written} = machi_cr_client:read_chunk(C1, NSInfo, File10, + %% {error,not_written} = machi_cr_client:read_chunk(C1, File10, %% Offx, Size10), {ok, {Offx,Size10,File10}} = - machi_cr_client:write_chunk(C1, NSInfo, File10, Offx, Chunk10, NoCSum), + machi_cr_client:write_chunk(C1, File10, Offx, Chunk10), {ok, {[{_, Offx, Chunk10, _}], []}} = - machi_cr_client:read_chunk(C1, NSInfo, File10, Offx, Size10, undefined) + machi_cr_client:read_chunk(C1, File10, Offx, Size10, []) end || Seq <- lists:seq(1, Extra10)], {ok, {Off11,Size11,File11}} = - machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk10, NoCSum), + machi_cr_client:append_chunk(C1, Prefix, Chunk10), %% %% Double-check that our reserved extra bytes were really honored! 
%% true = (Off11 > (Off10 + (Extra10 * Size10))), io:format(user, "\nFiles = ~p\n", [machi_cr_client:list_files(C1)]), @@ -227,8 +224,6 @@ witness_smoke_test2() -> try Prefix = <<"pre">>, Chunk1 = <<"yochunk">>, - NSInfo = undefined, - NoCSum = <<>>, Host = "localhost", PortBase = 64444, Os = [{ignore_stability_time, true}, {active_mode, false}, @@ -238,15 +233,14 @@ witness_smoke_test2() -> %% Whew ... ok, now start some damn tests. {ok, C1} = machi_cr_client:start_link([P || {_,P}<-orddict:to_list(D)]), - {ok, _} = machi_cr_client:append_chunk(C1, NSInfo, Prefix, - Chunk1, NoCSum), + {ok, _} = machi_cr_client:append_chunk(C1, Prefix, Chunk1), {ok, {Off1,Size1,File1}} = - machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum), - BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, "foo")}, + machi_cr_client:append_chunk(C1, Prefix, Chunk1), + Chunk1_badcs = {<>, Chunk1}, {error, bad_checksum} = - machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, BadCSum), + machi_cr_client:append_chunk(C1, Prefix, Chunk1_badcs), {ok, {[{_, Off1, Chunk1, _}], []}} = - machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, Size1, undefined), + machi_cr_client:read_chunk(C1, File1, Off1, Size1, []), %% Stop 'b' and let the chain reset. ok = machi_flu_psup:stop_flu_package(b), @@ -259,25 +253,24 @@ witness_smoke_test2() -> %% Let's wedge OurWitness and see what happens: timeout/partition. #p_srvr{name=WitName, address=WitA, port=WitP} = orddict:fetch(OurWitness, D), - {ok, {false, EpochID2,_,_}} = machi_flu1_client:wedge_status(WitA, WitP), + {ok, {false, EpochID2}} = machi_flu1_client:wedge_status(WitA, WitP), machi_flu1:wedge_myself(WitName, EpochID2), case machi_flu1_client:wedge_status(WitA, WitP) of - {ok, {true, EpochID2,_,_}} -> + {ok, {true, EpochID2}} -> ok; - {ok, {false, EpochID2,_,_}} -> + {ok, {false, EpochID2}} -> %% This is racy. Work around it by sleeping a while. 
timer:sleep(6*1000), - {ok, {true, EpochID2,_,_}} = + {ok, {true, EpochID2}} = machi_flu1_client:wedge_status(WitA, WitP) end, %% Chunk1 is still readable: not affected by wedged witness head. {ok, {[{_, Off1, Chunk1, _}], []}} = - machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, Size1, undefined), + machi_cr_client:read_chunk(C1, File1, Off1, Size1, []), %% But because the head is wedged, an append will fail. {error, partition} = - machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum, - #append_opts{}, 1*1000), + machi_cr_client:append_chunk(C1, Prefix, Chunk1, 1*1000), %% The witness's wedge status should cause timeout/partition %% for write_chunk also. @@ -286,7 +279,7 @@ witness_smoke_test2() -> File10 = File1, Offx = Off1 + (1 * Size10), {error, partition} = - machi_cr_client:write_chunk(C1, NSInfo, File10, Offx, Chunk10, NoCSum, 1*1000), + machi_cr_client:write_chunk(C1, File10, Offx, Chunk10, 1*1000), ok after diff --git a/test/machi_csum_table_test.erl b/test/machi_csum_table_test.erl index f2b7a4f..82c499d 100644 --- a/test/machi_csum_table_test.erl +++ b/test/machi_csum_table_test.erl @@ -2,68 +2,69 @@ -compile(export_all). -include_lib("eunit/include/eunit.hrl"). --define(HDR, {0, 1024, none}). cleanup(Dir) -> os:cmd("rm -rf " ++ Dir). 
smoke_test() -> - Filename = "./temp-checksum-dumb-file", - _ = cleanup(Filename), - {ok, MC} = machi_csum_table:open(Filename, []), - ?assertEqual([{1024, infinity}], - machi_csum_table:calc_unwritten_bytes(MC)), + DBFile = "./temp-checksum-dumb-file", + Filename = <<"/some/puppy/and/cats^^^42">>, + _ = cleanup(DBFile), + {ok, MC} = machi_csum_table:open(DBFile, []), + ?assertEqual([{0, infinity}], + machi_csum_table:calc_unwritten_bytes(MC, Filename)), Entry = {Offset, Size, Checksum} = {1064, 34, <<"deadbeef">>}, - [] = machi_csum_table:find(MC, Offset, Size), - ok = machi_csum_table:write(MC, Offset, Size, Checksum), - [{1024, 40}, {1098, infinity}] = machi_csum_table:calc_unwritten_bytes(MC), - ?assertEqual([Entry], machi_csum_table:find(MC, Offset, Size)), - ok = machi_csum_table:trim(MC, Offset, Size, undefined, undefined), + [] = machi_csum_table:find(MC, Filename, Offset, Size), + ok = machi_csum_table:write(MC, Filename, Offset, Size, Checksum), + [{0, 1064}, {1098, infinity}] = machi_csum_table:calc_unwritten_bytes(MC, Filename), + ?assertEqual([Entry], machi_csum_table:find(MC, Filename, Offset, Size)), + ok = machi_csum_table:trim(MC, Filename, Offset, Size, undefined, undefined), ?assertEqual([{Offset, Size, trimmed}], - machi_csum_table:find(MC, Offset, Size)), - ok = machi_csum_table:close(MC), - ok = machi_csum_table:delete(MC). + machi_csum_table:find(MC, Filename, Offset, Size)), + ok = machi_csum_table:close(MC). 
close_test() -> - Filename = "./temp-checksum-dumb-file-2", - _ = cleanup(Filename), - {ok, MC} = machi_csum_table:open(Filename, []), + DBFile = "./temp-checksum-dumb-file-2", + Filename = <<"/some/puppy/and/cats^^^43">>, + _ = cleanup(DBFile), + {ok, MC} = machi_csum_table:open(DBFile, []), Entry = {Offset, Size, Checksum} = {1064, 34, <<"deadbeef">>}, - [] = machi_csum_table:find(MC, Offset, Size), - ok = machi_csum_table:write(MC, Offset, Size, Checksum), - [Entry] = machi_csum_table:find(MC, Offset, Size), + [] = machi_csum_table:find(MC, Filename, Offset, Size), + ok = machi_csum_table:write(MC, Filename, Offset, Size, Checksum), + [Entry] = machi_csum_table:find(MC, Filename, Offset, Size), ok = machi_csum_table:close(MC), - {ok, MC2} = machi_csum_table:open(Filename, []), - [Entry] = machi_csum_table:find(MC2, Offset, Size), - ok = machi_csum_table:trim(MC2, Offset, Size, undefined, undefined), - [{Offset, Size, trimmed}] = machi_csum_table:find(MC2, Offset, Size), - ok = machi_csum_table:delete(MC2). + {ok, MC2} = machi_csum_table:open(DBFile, []), + [Entry] = machi_csum_table:find(MC2, Filename, Offset, Size), + ok = machi_csum_table:trim(MC2, Filename, Offset, Size, undefined, undefined), + [{Offset, Size, trimmed}] = machi_csum_table:find(MC2, Filename, Offset, Size), + ok = machi_csum_table:close(MC2). 
smoke2_test() -> - Filename = "./temp-checksum-dumb-file-3", - _ = cleanup(Filename), - {ok, MC} = machi_csum_table:open(Filename, []), + DBFile = "./temp-checksum-dumb-file-3", + Filename = <<"/some/puppy/and/cats^^^43">>, + _ = cleanup(DBFile), + {ok, MC} = machi_csum_table:open(DBFile, []), Entry = {Offset, Size, Checksum} = {1025, 10, <<"deadbeef">>}, - ok = machi_csum_table:write(MC, Offset, Size, Checksum), - ?assertEqual([], machi_csum_table:find(MC, 0, 0)), - ?assertEqual([?HDR], machi_csum_table:find(MC, 0, 1)), - [Entry] = machi_csum_table:find(MC, Offset, Size), - [?HDR] = machi_csum_table:find(MC, 1, 1024), - ?assertEqual([?HDR, Entry], - machi_csum_table:find(MC, 1023, 1024)), - [Entry] = machi_csum_table:find(MC, 1024, 1024), - [Entry] = machi_csum_table:find(MC, 1025, 1024), + ok = machi_csum_table:write(MC, Filename, Offset, Size, Checksum), + ?assertEqual([], machi_csum_table:find(MC, Filename, 0, 0)), + ?assertEqual([], machi_csum_table:find(MC, Filename, 0, 1)), + [Entry] = machi_csum_table:find(MC, Filename, Offset, Size), + [] = machi_csum_table:find(MC, Filename, 1, 1024), + ?assertEqual([Entry], + machi_csum_table:find(MC, Filename, 1023, 1024)), + [Entry] = machi_csum_table:find(MC, Filename, 1024, 1024), + [Entry] = machi_csum_table:find(MC, Filename, 1025, 1024), - ok = machi_csum_table:trim(MC, Offset, Size, undefined, undefined), - [{Offset, Size, trimmed}] = machi_csum_table:find(MC, Offset, Size), - ok = machi_csum_table:close(MC), - ok = machi_csum_table:delete(MC). + ok = machi_csum_table:trim(MC, Filename, Offset, Size, undefined, undefined), + [{Offset, Size, trimmed}] = machi_csum_table:find(MC, Filename, Offset, Size), + ok = machi_csum_table:close(MC). 
smoke3_test() -> - Filename = "./temp-checksum-dumb-file-4", - _ = cleanup(Filename), - {ok, MC} = machi_csum_table:open(Filename, []), + DBFile = "./temp-checksum-dumb-file-4", + Filename = <<"/some/puppy/and/cats^^^44">>, + _ = cleanup(DBFile), + {ok, MC} = machi_csum_table:open(DBFile, []), Scenario = [%% Command, {Offset, Size, Csum}, LeftNeighbor, RightNeibor {?LINE, write, {2000, 10, <<"heh">>}, undefined, undefined}, @@ -84,9 +85,9 @@ smoke3_test() -> %% ?debugVal({_Line, Chunk}), {Offset, Size, Csum} = Chunk, ?assertEqual(LeftN0, - machi_csum_table:find_leftneighbor(MC, Offset)), + machi_csum_table:find_leftneighbor(MC, Filename, Offset)), ?assertEqual(RightN0, - machi_csum_table:find_rightneighbor(MC, Offset+Size)), + machi_csum_table:find_rightneighbor(MC, Filename, Offset+Size)), LeftN = case LeftN0 of {OffsL, SizeL, trimmed} -> {OffsL, SizeL, trimmed}; {OffsL, SizeL, _} -> {OffsL, SizeL, <<"boom">>}; @@ -98,19 +99,18 @@ smoke3_test() -> end, case Cmd of write -> - ok = machi_csum_table:write(MC, Offset, Size, Csum, + ok = machi_csum_table:write(MC, Filename, Offset, Size, Csum, LeftN, RightN); trim -> - ok = machi_csum_table:trim(MC, Offset, Size, + ok = machi_csum_table:trim(MC, Filename, Offset, Size, LeftN, RightN) end end || {_Line, Cmd, Chunk, LeftN0, RightN0} <- Scenario ], - ?assert(not machi_csum_table:all_trimmed(MC, 10000)), - machi_csum_table:trim(MC, 0, 10000, undefined, undefined), - ?assert(machi_csum_table:all_trimmed(MC, 10000)), + ?assert(not machi_csum_table:all_trimmed(MC, Filename, 0, 10000)), + machi_csum_table:trim(MC, Filename, 0, 10000, undefined, undefined), + ?assert(machi_csum_table:all_trimmed(MC, Filename, 0, 10000)), - ok = machi_csum_table:close(MC), - ok = machi_csum_table:delete(MC). + ok = machi_csum_table:close(MC). 
%% TODO: add quickcheck test here diff --git a/test/machi_file_proxy_eqc.erl b/test/machi_file_proxy_eqc.erl index c7a50e2..53337a2 100644 --- a/test/machi_file_proxy_eqc.erl +++ b/test/machi_file_proxy_eqc.erl @@ -35,14 +35,10 @@ %% EUNIT TEST DEFINITION eqc_test_() -> - PropTimeout = case os:getenv("EQC_TIME") of - false -> 30; - V -> list_to_integer(V) - end, - {timeout, PropTimeout*2 + 30, + {timeout, 60, {spawn, [ - ?_assertEqual(true, eqc:quickcheck(eqc:testing_time(PropTimeout, ?QC_OUT(prop_ok())))) + ?_assertEqual(true, eqc:quickcheck(eqc:testing_time(30, ?QC_OUT(prop_ok())))) ] }}. @@ -120,11 +116,11 @@ get_written_interval(L) -> initial_state() -> {_, _, MS} = os:timestamp(), Filename = test_server:temp_name("eqc_data") ++ "." ++ integer_to_list(MS), - #state{filename=Filename, written=[{0,1024}]}. + #state{filename=Filename, written=[]}. initial_state(I, T) -> S=initial_state(), - S#state{written=[{0,1024}], + S#state{written=[], planned_writes=I, planned_trims=T}. @@ -234,7 +230,8 @@ start_command(S) -> {call, ?MODULE, start, [S]}. start(#state{filename=File}) -> - {ok, Pid} = machi_file_proxy:start_link(some_flu, File, ?TESTDIR), + CsumT = get_csum_table(), + {ok, Pid} = machi_file_proxy:start_link(File, ?TESTDIR, CsumT), unlink(Pid), Pid. @@ -436,6 +433,40 @@ stop_post(_, _, _) -> true. stop_next(S, _, _) -> S#state{pid=undefined, prev_extra=0}. +csum_table_holder() -> + Parent = self(), + spawn_link(fun() -> + CsumFile = test_server:temp_name("eqc_data-csum"), + filelib:ensure_dir(CsumFile), + {ok, CsumT} = machi_csum_table:open(CsumFile, []), + erlang:register(csum_table_holder, self()), + Parent ! ok, + csum_table_holder_loop(CsumT), + machi_csum_table:close(CsumT), + erlang:unregister(csum_table_holder) + end), + receive + Other -> Other + after 1000 -> + timeout + end. + +csum_table_holder_loop(CsumT) -> + receive + {get, From} -> + From ! CsumT; + stop -> + ok + end. + +get_csum_table() -> + csum_table_holder ! 
{get, self()}, + receive CsumT -> CsumT + end. + +stop_csum_table_holder() -> + catch csum_table_holder ! stop. + %% Property prop_ok() -> @@ -444,7 +475,9 @@ prop_ok() -> {shuffle_interval(), shuffle_interval()}, ?FORALL(Cmds, parallel_commands(?MODULE, initial_state(I, T)), begin + ok = csum_table_holder(), {H, S, Res} = run_parallel_commands(?MODULE, Cmds), + stop_csum_table_holder(), cleanup(), pretty_commands(?MODULE, Cmds, {H, S, Res}, aggregate(command_names(Cmds), Res == ok)) diff --git a/test/machi_file_proxy_test.erl b/test/machi_file_proxy_test.erl index 10e16bf..727ca35 100644 --- a/test/machi_file_proxy_test.erl +++ b/test/machi_file_proxy_test.erl @@ -38,7 +38,7 @@ clean_up_data_dir(DataDir) -> -ifndef(PULSE). -define(TESTDIR, "./t"). --define(HYOOGE, 75 * 1024 * 1024). % 75 MBytes +-define(HYOOGE, 1 * 1024 * 1024 * 1024). % 1 long GB random_binary_single() -> %% OK, I guess it's not that random... @@ -77,25 +77,28 @@ random_binary(Start, End) -> end. setup() -> - {ok, Pid} = machi_file_proxy:start_link(fluname, "test", ?TESTDIR), - Pid. + {ok, CsumT} = machi_csum_table:open(filename:join([?TESTDIR, "csumfile"]), []), + {ok, Pid} = machi_file_proxy:start_link("test", ?TESTDIR, CsumT), + {Pid, CsumT}. -teardown(Pid) -> - catch machi_file_proxy:stop(Pid). +teardown({Pid, CsumT}) -> + catch machi_file_proxy:stop(Pid), + catch machi_csum_table:close(CsumT). 
machi_file_proxy_test_() -> clean_up_data_dir(?TESTDIR), {setup, fun setup/0, fun teardown/1, - fun(Pid) -> + fun({Pid, _}) -> [ ?_assertEqual({error, bad_arg}, machi_file_proxy:read(Pid, -1, -1)), ?_assertEqual({error, bad_arg}, machi_file_proxy:write(Pid, -1, <<"yo">>)), ?_assertEqual({error, bad_arg}, machi_file_proxy:append(Pid, [], -1, <<"krep">>)), - ?_assertMatch({ok, {_, []}}, machi_file_proxy:read(Pid, 1, 1)), + ?_assertMatch({error, not_written}, machi_file_proxy:read(Pid, 1, 1)), ?_assertEqual({error, not_written}, machi_file_proxy:read(Pid, 1024, 1)), - ?_assertMatch({ok, {_, []}}, machi_file_proxy:read(Pid, 1, 1024)), + ?_assertMatch({ok, "test", _}, machi_file_proxy:append(Pid, random_binary(0, 1024))), + ?_assertMatch({ok, _}, machi_file_proxy:read(Pid, 1, 1024)), ?_assertEqual({error, not_written}, machi_file_proxy:read(Pid, 1024, ?HYOOGE)), ?_assertEqual({error, not_written}, machi_file_proxy:read(Pid, ?HYOOGE, 1)), {timeout, 10, @@ -114,12 +117,12 @@ multiple_chunks_read_test_() -> {setup, fun setup/0, fun teardown/1, - fun(Pid) -> + fun({Pid, _}) -> [ ?_assertEqual(ok, machi_file_proxy:trim(Pid, 0, 1, false)), ?_assertMatch({ok, {[], [{"test", 0, 1}]}}, machi_file_proxy:read(Pid, 0, 1, - #read_opts{needs_trimmed=true})), + [{needs_trimmed, true}])), ?_assertMatch({ok, "test", _}, machi_file_proxy:append(Pid, random_binary(0, 1024))), ?_assertEqual(ok, machi_file_proxy:write(Pid, 10000, <<"fail">>)), ?_assertEqual(ok, machi_file_proxy:write(Pid, 20000, <<"fail">>)), @@ -134,7 +137,7 @@ multiple_chunks_read_test_() -> machi_file_proxy:read(Pid, 1024, 530000)), ?_assertMatch({ok, {[{"test", 1, _, _}], [{"test", 0, 1}]}}, machi_file_proxy:read(Pid, 0, 1024, - #read_opts{needs_trimmed=true})) + [{needs_trimmed, true}])) ] end}. 
diff --git a/test/machi_flu1_test.erl b/test/machi_flu1_test.erl index 74490d2..99d7887 100644 --- a/test/machi_flu1_test.erl +++ b/test/machi_flu1_test.erl @@ -91,8 +91,6 @@ flu_smoke_test() -> Host = "localhost", TcpPort = 12957, DataDir = "./data", - NSInfo = undefined, - NoCSum = <<>>, Prefix = <<"prefix!">>, BadPrefix = BadFile = "no/good", W_props = [{initial_wedged, false}], @@ -100,31 +98,32 @@ flu_smoke_test() -> try Msg = "Hello, world!", Msg = ?FLU_C:echo(Host, TcpPort, Msg), - {error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort,"does-not-exist"), - {error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, BadFile), + {error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, + ?DUMMY_PV1_EPOCH, + "does-not-exist"), + {error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, + ?DUMMY_PV1_EPOCH, BadFile), {ok, []} = ?FLU_C:list_files(Host, TcpPort, ?DUMMY_PV1_EPOCH), - {ok, {false, _,_,_}} = ?FLU_C:wedge_status(Host, TcpPort), + {ok, {false, _}} = ?FLU_C:wedge_status(Host, TcpPort), Chunk1 = <<"yo!">>, - {ok, {Off1,Len1,File1}} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo, + {ok, {Off1,Len1,File1}} = ?FLU_C:append_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH, - Prefix, Chunk1, NoCSum), - {ok, {[{_, Off1, Chunk1, _}], _}} = ?FLU_C:read_chunk(Host, TcpPort, - NSInfo, ?DUMMY_PV1_EPOCH, - File1, Off1, Len1, - noopt), - {ok, KludgeBin} = ?FLU_C:checksum_list(Host, TcpPort, File1), + Prefix, Chunk1), + {ok, {[{_, Off1, Chunk1, _}], _}} = ?FLU_C:read_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH, + File1, Off1, Len1, []), + {ok, KludgeBin} = ?FLU_C:checksum_list(Host, TcpPort, + ?DUMMY_PV1_EPOCH, File1), true = is_binary(KludgeBin), - {error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo, + {error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH, - BadPrefix, Chunk1, NoCSum), + BadPrefix, Chunk1), {ok, [{_,File1}]} = ?FLU_C:list_files(Host, TcpPort, ?DUMMY_PV1_EPOCH), Len1 = size(Chunk1), {error, not_written} = ?FLU_C:read_chunk(Host, TcpPort, - NSInfo, 
?DUMMY_PV1_EPOCH, - File1, Off1*983829323, Len1, - noopt), + ?DUMMY_PV1_EPOCH, + File1, Off1*983829323, Len1, []), %% XXX FIXME %% %% This is failing because the read extends past the end of the file. @@ -133,22 +132,19 @@ flu_smoke_test() -> %% of the read will cause it to fail. %% %% {error, partial_read} = ?FLU_C:read_chunk(Host, TcpPort, - %% NSInfo, ?DUMMY_PV1_EPOCH, + %% ?DUMMY_PV1_EPOCH, %% File1, Off1, Len1*9999), - {ok, {Off1b,Len1b,File1b}} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo, + {ok, {Off1b,Len1b,File1b}} = ?FLU_C:append_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH, - Prefix, Chunk1,NoCSum), + Prefix, Chunk1), Extra = 42, - Opts1 = #append_opts{chunk_extra=Extra}, - {ok, {Off1c,Len1c,File1c}} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo, + {ok, {Off1c,Len1c,File1c}} = ?FLU_C:append_chunk_extra(Host, TcpPort, ?DUMMY_PV1_EPOCH, - Prefix, Chunk1, NoCSum, - Opts1, infinity), + Prefix, Chunk1, Extra), {ok, {Off1d,Len1d,File1d}} = ?FLU_C:append_chunk(Host, TcpPort, - NSInfo, ?DUMMY_PV1_EPOCH, - Prefix, Chunk1,NoCSum), + Prefix, Chunk1), if File1b == File1c, File1c == File1d -> true = (Off1c == Off1b + Len1b), true = (Off1d == Off1c + Len1c + Extra); @@ -156,44 +152,27 @@ flu_smoke_test() -> exit(not_mandatory_but_test_expected_same_file_fixme) end, + Chunk1_cs = {<>, Chunk1}, + {ok, {Off1e,Len1e,File1e}} = ?FLU_C:append_chunk(Host, TcpPort, + ?DUMMY_PV1_EPOCH, + Prefix, Chunk1_cs), + Chunk2 = <<"yo yo">>, Len2 = byte_size(Chunk2), Off2 = ?MINIMUM_OFFSET + 77, File2 = "smoke-whole-file^^0^1^1", - ok = ?FLU_C:write_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH, - File2, Off2, Chunk2, NoCSum), - {error, bad_arg} = ?FLU_C:write_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH, - BadFile, Off2, Chunk2, NoCSum), + ok = ?FLU_C:write_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH, + File2, Off2, Chunk2), + {error, bad_arg} = ?FLU_C:write_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH, + BadFile, Off2, Chunk2), {ok, {[{_, Off2, Chunk2, _}], _}} = - ?FLU_C:read_chunk(Host, TcpPort, NSInfo, 
?DUMMY_PV1_EPOCH, File2, Off2, Len2, noopt), + ?FLU_C:read_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH, File2, Off2, Len2, []), {error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort, - NSInfo, ?DUMMY_PV1_EPOCH, - "no!!", Off2, Len2, noopt), + ?DUMMY_PV1_EPOCH, + "no!!", Off2, Len2, []), {error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort, - NSInfo, ?DUMMY_PV1_EPOCH, - BadFile, Off2, Len2, noopt), - - %% Make a connected socket. - Sock1 = ?FLU_C:connect(#p_srvr{address=Host, port=TcpPort}), - - %% Let's test some cluster version enforcement. - Good_EpochNum = 0, - Good_NSVersion = 0, - Good_NS = <<>>, - {ok, {false, {Good_EpochNum,_}, Good_NSVersion, GoodNS}} = - ?FLU_C:wedge_status(Sock1), - NS_good = #ns_info{version=Good_NSVersion, name=Good_NS}, - {ok, {[{_, Off2, Chunk2, _}], _}} = - ?FLU_C:read_chunk(Sock1, NS_good, ?DUMMY_PV1_EPOCH, - File2, Off2, Len2, noopt), - NS_bad_version = #ns_info{version=1, name=Good_NS}, - NS_bad_name = #ns_info{version=Good_NSVersion, name= <<"foons">>}, - {error, bad_epoch} = - ?FLU_C:read_chunk(Sock1, NS_bad_version, ?DUMMY_PV1_EPOCH, - File2, Off2, Len2, noopt), - {error, bad_arg} = - ?FLU_C:read_chunk(Sock1, NS_bad_name, ?DUMMY_PV1_EPOCH, - File2, Off2, Len2, noopt), + ?DUMMY_PV1_EPOCH, + BadFile, Off2, Len2, []), %% We know that File1 still exists. Pretend that we've done a %% migration and exercise the delete_migration() API. @@ -210,7 +189,8 @@ flu_smoke_test() -> {error, bad_arg} = ?FLU_C:trunc_hack(Host, TcpPort, ?DUMMY_PV1_EPOCH, BadFile), - ok = ?FLU_C:quit(Sock1) + ok = ?FLU_C:quit(?FLU_C:connect(#p_srvr{address=Host, + port=TcpPort})) after machi_test_util:stop_flu_package() end. 
@@ -223,7 +203,7 @@ flu_projection_smoke_test() -> try [ok = flu_projection_common(Host, TcpPort, T) || T <- [public, private] ] -%% , {ok, {false, EpochID1,_,_}} = ?FLU_C:wedge_status(Host, TcpPort), +%% , {ok, {false, EpochID1}} = ?FLU_C:wedge_status(Host, TcpPort), %% io:format(user, "EpochID1 ~p\n", [EpochID1]) after machi_test_util:stop_flu_package() @@ -258,15 +238,13 @@ bad_checksum_test() -> DataDir = "./data.bct", Opts = [{initial_wedged, false}], {_,_,_} = machi_test_util:start_flu_package(projection_test_flu, TcpPort, DataDir, Opts), - NSInfo = undefined, try Prefix = <<"some prefix">>, Chunk1 = <<"yo yo yo">>, - BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, ".................")}, - {error, bad_checksum} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo, + Chunk1_badcs = {<>, Chunk1}, + {error, bad_checksum} = ?FLU_C:append_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH, - Prefix, - Chunk1, BadCSum), + Prefix, Chunk1_badcs), ok after machi_test_util:stop_flu_package() @@ -278,8 +256,6 @@ witness_test() -> DataDir = "./data.witness", Opts = [{initial_wedged, false}, {witness_mode, true}], {_,_,_} = machi_test_util:start_flu_package(projection_test_flu, TcpPort, DataDir, Opts), - NSInfo = undefined, - NoCSum = <<>>, try Prefix = <<"some prefix">>, Chunk1 = <<"yo yo yo">>, @@ -292,14 +268,15 @@ witness_test() -> {ok, EpochID1} = ?FLU_C:get_latest_epochid(Host, TcpPort, private), %% Witness-protected ops all fail - {error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo, EpochID1, - Prefix, Chunk1, NoCSum), + {error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort, EpochID1, + Prefix, Chunk1), File = <<"foofile">>, - {error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort, NSInfo, EpochID1, - File, 9999, 9999, noopt), - {error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, File), + {error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort, EpochID1, + File, 9999, 9999, []), + {error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, EpochID1, + File), {error, bad_arg} = 
?FLU_C:list_files(Host, TcpPort, EpochID1), - {ok, {false, EpochID1,_,_}} = ?FLU_C:wedge_status(Host, TcpPort), + {ok, {false, EpochID1}} = ?FLU_C:wedge_status(Host, TcpPort), {ok, _} = ?FLU_C:get_latest_epochid(Host, TcpPort, public), {ok, _} = ?FLU_C:read_latest_projection(Host, TcpPort, public), {error, not_written} = ?FLU_C:read_projection(Host, TcpPort, diff --git a/test/machi_flu_psup_test.erl b/test/machi_flu_psup_test.erl index bc43437..378ff74 100644 --- a/test/machi_flu_psup_test.erl +++ b/test/machi_flu_psup_test.erl @@ -84,23 +84,20 @@ partial_stop_restart2() -> WedgeStatus = fun({_,#p_srvr{address=Addr, port=TcpPort}}) -> machi_flu1_client:wedge_status(Addr, TcpPort) end, - NSInfo = undefined, Append = fun({_,#p_srvr{address=Addr, port=TcpPort}}, EpochID) -> - NoCSum = <<>>, machi_flu1_client:append_chunk(Addr, TcpPort, - NSInfo, EpochID, - <<"prefix">>, - <<"data">>, NoCSum) + EpochID, + <<"prefix">>, <<"data">>) end, try [Start(P) || P <- Ps], - [{ok, {true, _,_,_}} = WedgeStatus(P) || P <- Ps], % all are wedged + [{ok, {true, _}} = WedgeStatus(P) || P <- Ps], % all are wedged [{error,wedged} = Append(P, ?DUMMY_PV1_EPOCH) || P <- Ps], % all are wedged [machi_chain_manager1:set_chain_members(ChMgr, Dict) || ChMgr <- ChMgrs ], - {ok, {false, EpochID1,_,_}} = WedgeStatus(hd(Ps)), - [{ok, {false, EpochID1,_,_}} = WedgeStatus(P) || P <- Ps], % *not* wedged + {ok, {false, EpochID1}} = WedgeStatus(hd(Ps)), + [{ok, {false, EpochID1}} = WedgeStatus(P) || P <- Ps], % *not* wedged [{ok,_} = Append(P, EpochID1) || P <- Ps], % *not* wedged {ok, {_,_,File1}} = Append(hd(Ps), EpochID1), @@ -126,9 +123,9 @@ partial_stop_restart2() -> Epoch_m = Proj_m#projection_v1.epoch_number, %% Confirm that all FLUs are *not* wedged, with correct proj & epoch Proj_mCSum = Proj_m#projection_v1.epoch_csum, - [{ok, {false, {Epoch_m, Proj_mCSum},_,_}} = WedgeStatus(P) || % *not* wedged + [{ok, {false, {Epoch_m, Proj_mCSum}}} = WedgeStatus(P) || % *not* wedged P <- Ps], - {ok, {false, 
EpochID1,_,_}} = WedgeStatus(hd(Ps)), + {ok, {false, EpochID1}} = WedgeStatus(hd(Ps)), [{ok,_} = Append(P, EpochID1) || P <- Ps], % *not* wedged %% Stop all but 'a'. @@ -148,10 +145,10 @@ partial_stop_restart2() -> {error, wedged} = Append(hd(Ps), EpochID1), {_, #p_srvr{address=Addr_a, port=TcpPort_a}} = hd(Ps), {error, wedged} = machi_flu1_client:read_chunk( - Addr_a, TcpPort_a, NSInfo, ?DUMMY_PV1_EPOCH, - <<>>, 99999999, 1, undefined), - {error, bad_arg} = machi_flu1_client:checksum_list( - Addr_a, TcpPort_a, <<>>), + Addr_a, TcpPort_a, ?DUMMY_PV1_EPOCH, + <<>>, 99999999, 1, []), + {error, wedged} = machi_flu1_client:checksum_list( + Addr_a, TcpPort_a, ?DUMMY_PV1_EPOCH, <<>>), %% list_files() is permitted despite wedged status {ok, _} = machi_flu1_client:list_files( Addr_a, TcpPort_a, ?DUMMY_PV1_EPOCH), @@ -160,7 +157,7 @@ partial_stop_restart2() -> {now_using,_,Epoch_n} = machi_chain_manager1:trigger_react_to_env( hd(ChMgrs)), true = (Epoch_n > Epoch_m), - {ok, {false, EpochID3,_,_}} = WedgeStatus(hd(Ps)), + {ok, {false, EpochID3}} = WedgeStatus(hd(Ps)), %% The file we're assigned should be different with the epoch change. {ok, {_,_,File3}} = Append(hd(Ps), EpochID3), true = (File1 /= File3), diff --git a/test/machi_pb_high_client_test.erl b/test/machi_pb_high_client_test.erl index 68df0c9..85ed92b 100644 --- a/test/machi_pb_high_client_test.erl +++ b/test/machi_pb_high_client_test.erl @@ -24,7 +24,6 @@ -ifdef(TEST). -ifndef(PULSE). --include("machi.hrl"). -include("machi_pb.hrl"). -include("machi_projection.hrl"). -include_lib("eunit/include/eunit.hrl"). @@ -56,18 +55,17 @@ smoke_test2() -> %% a separate test module? Or separate test func? 
{error, _} = ?C:auth(Clnt, "foo", "bar"), + CoC_n = "", % CoC_namespace (not implemented) + CoC_l = 0, % CoC_locator (not implemented) Prefix = <<"prefix">>, Chunk1 = <<"Hello, chunk!">>, - NS = "", - NoCSum = <<>>, - Opts1 = #append_opts{}, {ok, {Off1, Size1, File1}} = - ?C:append_chunk(Clnt, NS, Prefix, Chunk1, NoCSum, Opts1), + ?C:append_chunk(Clnt, CoC_n, CoC_l, Prefix, Chunk1, none, 0), true = is_binary(File1), Chunk2 = "It's another chunk", CSum2 = {client_sha, machi_util:checksum_chunk(Chunk2)}, {ok, {Off2, Size2, File2}} = - ?C:append_chunk(Clnt, NS, Prefix, Chunk2, CSum2, Opts1), + ?C:append_chunk(Clnt, CoC_n, CoC_l, Prefix, Chunk2, CSum2, 1024), Chunk3 = ["This is a ", <<"test,">>, 32, [["Hello, world!"]]], File3 = File2, Off3 = Off2 + iolist_size(Chunk2), @@ -78,9 +76,9 @@ smoke_test2() -> {iolist_to_binary(Chunk2), File2, Off2, Size2}, {iolist_to_binary(Chunk3), File3, Off3, Size3}], [begin - File = Fl, + File = binary_to_list(Fl), ?assertMatch({ok, {[{File, Off, Ch, _}], []}}, - ?C:read_chunk(Clnt, Fl, Off, Sz, undefined)) + ?C:read_chunk(Clnt, Fl, Off, Sz, [])) end || {Ch, Fl, Off, Sz} <- Reads], {ok, KludgeBin} = ?C:checksum_list(Clnt, File1), @@ -93,10 +91,7 @@ smoke_test2() -> #p_srvr{name=Name, props=Props} = P, Dir = proplists:get_value(data_dir, Props), ?assertEqual({ok, [File1Bin]}, - file:list_dir(filename:join([Dir, "data"]))), - FileListFileName = filename:join([Dir, "known_files_" ++ atom_to_list(Name)]), - {ok, Plist} = machi_plist:open(FileListFileName, []), - ?assertEqual([], machi_plist:all(Plist)) + file:list_dir(filename:join([Dir, "data"]))) end || P <- Ps], [begin @@ -104,33 +99,32 @@ smoke_test2() -> end || {_Ch, Fl, Off, Sz} <- Reads], [begin {ok, {[], Trimmed}} = - ?C:read_chunk(Clnt, Fl, Off, Sz, #read_opts{needs_trimmed=true}), - Filename = Fl, + ?C:read_chunk(Clnt, Fl, Off, Sz, [{needs_trimmed, true}]), + Filename = binary_to_list(Fl), ?assertEqual([{Filename, Off, Sz}], Trimmed) end || {_Ch, Fl, Off, Sz} <- Reads], LargeBytes 
= binary:copy(<<"x">>, 1024*1024), LBCsum = {client_sha, machi_util:checksum_chunk(LargeBytes)}, {ok, {Offx, Sizex, Filex}} = - ?C:append_chunk(Clnt, NS, - Prefix, LargeBytes, LBCsum, Opts1), + ?C:append_chunk(Clnt, CoC_n, CoC_l, + Prefix, LargeBytes, LBCsum, 0), ok = ?C:trim_chunk(Clnt, Filex, Offx, Sizex), %% Make sure everything was trimmed File = binary_to_list(Filex), [begin #p_srvr{name=Name, props=Props} = P, - Dir = proplists:get_value(data_dir, Props), - ?assertEqual({ok, []}, - file:list_dir(filename:join([Dir, "data"]))), - FileListFileName = filename:join([Dir, "known_files_" ++ atom_to_list(Name)]), - {ok, Plist} = machi_plist:open(FileListFileName, []), - ?assertEqual([File], machi_plist:all(Plist)) + DataDir = filename:join([proplists:get_value(data_dir, Props), "data"]), + ?assertEqual({ok, []}, file:list_dir(DataDir)), + {ok, CsumT} = machi_flu_filename_mgr:get_csum_table(Name), + Plist = machi_csum_table:all_files(CsumT), + ?assertEqual([{File, ts}], Plist) end || P <- Ps], [begin {error, trimmed} = - ?C:read_chunk(Clnt, Fl, Off, Sz, undefined) + ?C:read_chunk(Clnt, Fl, Off, Sz, []) end || {_Ch, Fl, Off, Sz} <- Reads], ok after diff --git a/test/machi_plist_test.erl b/test/machi_plist_test.erl deleted file mode 100644 index a796c1b..0000000 --- a/test/machi_plist_test.erl +++ /dev/null @@ -1,17 +0,0 @@ --module(machi_plist_test). - --include_lib("eunit/include/eunit.hrl"). - -open_close_test() -> - FileName = "bark-bark-one", - file:delete(FileName), - {ok, PList0} = machi_plist:open(FileName, []), - {ok, PList1} = machi_plist:add(PList0, "boomar"), - ?assertEqual(["boomar"], machi_plist:all(PList1)), - ok = machi_plist:close(PList1), - - {ok, PList2} = machi_plist:open(FileName, []), - ?assertEqual(["boomar"], machi_plist:all(PList2)), - ok = machi_plist:close(PList2), - file:delete(FileName), - ok. 
diff --git a/test/machi_proxy_flu1_client_test.erl b/test/machi_proxy_flu1_client_test.erl index 7f8dcce..b8556b7 100644 --- a/test/machi_proxy_flu1_client_test.erl +++ b/test/machi_proxy_flu1_client_test.erl @@ -36,8 +36,6 @@ api_smoke_test() -> DataDir = "./data.api_smoke_flu", W_props = [{active_mode, false},{initial_wedged, false}], Prefix = <<"prefix">>, - NSInfo = undefined, - NoCSum = <<>>, try {[I], _, _} = machi_test_util:start_flu_package( @@ -45,42 +43,35 @@ api_smoke_test() -> {ok, Prox1} = ?MUT:start_link(I), try FakeEpoch = ?DUMMY_PV1_EPOCH, - [{ok, {_,_,_}} = ?MUT:append_chunk( - Prox1, NSInfo, FakeEpoch, - Prefix, <<"data">>, NoCSum) || - _ <- lists:seq(1,5)], + [{ok, {_,_,_}} = ?MUT:append_chunk(Prox1, + FakeEpoch, Prefix, <<"data">>, + infinity) || _ <- lists:seq(1,5)], %% Stop the FLU, what happens? machi_test_util:stop_flu_package(), - [{error,partition} = ?MUT:append_chunk(Prox1, NSInfo, + [{error,partition} = ?MUT:append_chunk(Prox1, FakeEpoch, Prefix, <<"data-stopped1">>, - NoCSum) || _ <- lists:seq(1,3)], + infinity) || _ <- lists:seq(1,3)], %% Start the FLU again, we should be able to do stuff immediately machi_test_util:start_flu_package(RegName, TcpPort, DataDir, [no_cleanup|W_props]), MyChunk = <<"my chunk data">>, {ok, {MyOff,MySize,MyFile}} = - ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch, Prefix, MyChunk, - NoCSum), - {ok, {[{_, MyOff, MyChunk, _MyChunkCSUM}], []}} = - ?MUT:read_chunk(Prox1, NSInfo, FakeEpoch, MyFile, MyOff, MySize, undefined), - MyChunk2_parts = [<<"my chunk ">>, "data", <<", yeah, again">>], - MyChunk2 = iolist_to_binary(MyChunk2_parts), - Opts1 = #append_opts{chunk_extra=4242}, + ?MUT:append_chunk(Prox1, FakeEpoch, Prefix, MyChunk, + infinity), + {ok, {[{_, MyOff, MyChunk, _}], []}} = + ?MUT:read_chunk(Prox1, FakeEpoch, MyFile, MyOff, MySize, []), + MyChunk2 = <<"my chunk data, yeah, again">>, {ok, {MyOff2,MySize2,MyFile2}} = - ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch, Prefix, - MyChunk2_parts, NoCSum, Opts1, 
infinity), - [{ok, {[{_, MyOff2, MyChunk2, _}], []}} = - ?MUT:read_chunk(Prox1, NSInfo, FakeEpoch, MyFile2, MyOff2, MySize2, DefaultOptions) || - DefaultOptions <- [undefined, noopt, none, any_atom_at_all] ], - - BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, "...................")}, - {error, bad_checksum} = ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch, - Prefix, MyChunk, BadCSum), - {error, bad_checksum} = ?MUT:write_chunk(Prox1, NSInfo, FakeEpoch, - MyFile2, - MyOff2 + size(MyChunk2), - MyChunk, BadCSum, - infinity), + ?MUT:append_chunk_extra(Prox1, FakeEpoch, Prefix, + MyChunk2, 4242, infinity), + {ok, {[{_, MyOff2, MyChunk2, _}], []}} = + ?MUT:read_chunk(Prox1, FakeEpoch, MyFile2, MyOff2, MySize2, []), + MyChunk_badcs = {<>, MyChunk}, + {error, bad_checksum} = ?MUT:append_chunk(Prox1, FakeEpoch, + Prefix, MyChunk_badcs), + {error, bad_checksum} = ?MUT:write_chunk(Prox1, FakeEpoch, + <<"foo-file^^0^1^1">>, 99832, + MyChunk_badcs), %% Put kick_projection_reaction() in the middle of the test so %% that any problems with its async nature will (hopefully) @@ -89,9 +80,9 @@ api_smoke_test() -> %% Alright, now for the rest of the API, whee BadFile = <<"no-such-file">>, - {error, bad_arg} = ?MUT:checksum_list(Prox1, BadFile), + {error, bad_arg} = ?MUT:checksum_list(Prox1, FakeEpoch, BadFile), {ok, [_|_]} = ?MUT:list_files(Prox1, FakeEpoch), - {ok, {false, _,_,_}} = ?MUT:wedge_status(Prox1), + {ok, {false, _}} = ?MUT:wedge_status(Prox1), {ok, {0, _SomeCSum}} = ?MUT:get_latest_epochid(Prox1, public), {ok, #projection_v1{epoch_number=0}} = ?MUT:read_latest_projection(Prox1, public), @@ -120,8 +111,6 @@ flu_restart_test2() -> TcpPort = 17125, DataDir = "./data.api_smoke_flu2", W_props = [{initial_wedged, false}, {active_mode, false}], - NSInfo = undefined, - NoCSum = <<>>, try {[I], _, _} = machi_test_util:start_flu_package( @@ -131,8 +120,9 @@ flu_restart_test2() -> FakeEpoch = ?DUMMY_PV1_EPOCH, Data = <<"data!">>, Dataxx = <<"Fake!">>, - {ok, {Off1,Size1,File1}} = 
?MUT:append_chunk(Prox1, NSInfo, - FakeEpoch, <<"prefix">>, Data, NoCSum), + {ok, {Off1,Size1,File1}} = ?MUT:append_chunk(Prox1, + FakeEpoch, <<"prefix">>, Data, + infinity), P_a = #p_srvr{name=a, address="localhost", port=6622}, P1 = machi_projection:new(1, RegName, [P_a], [], [RegName], [], []), P1xx = P1#projection_v1{dbg2=["dbg2 changes are ok"]}, @@ -156,7 +146,6 @@ flu_restart_test2() -> %% makes the code a bit convoluted. (No LFE or %% Elixir macros here, alas, they'd be useful.) - AppendOpts1 = #append_opts{chunk_extra=42}, ExpectedOps = [ fun(run) -> ?assertEqual({ok, EpochID}, ?MUT:get_epoch_id(Prox1)), @@ -238,37 +227,35 @@ flu_restart_test2() -> (stop) -> ?MUT:get_all_projections(Prox1, private) end, fun(run) -> {ok, {_,_,_}} = - ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch, - <<"prefix">>, Data, NoCSum), + ?MUT:append_chunk(Prox1, FakeEpoch, + <<"prefix">>, Data, infinity), ok; (line) -> io:format("line ~p, ", [?LINE]); - (stop) -> ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch, - <<"prefix">>, Data, NoCSum) + (stop) -> ?MUT:append_chunk(Prox1, FakeEpoch, + <<"prefix">>, Data, infinity) end, fun(run) -> {ok, {_,_,_}} = - ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch, - <<"prefix">>, Data, NoCSum, - AppendOpts1, infinity), + ?MUT:append_chunk_extra(Prox1, FakeEpoch, + <<"prefix">>, Data, 42, infinity), ok; (line) -> io:format("line ~p, ", [?LINE]); - (stop) -> ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch, - <<"prefix">>, Data, NoCSum, - AppendOpts1, infinity) + (stop) -> ?MUT:append_chunk_extra(Prox1, FakeEpoch, + <<"prefix">>, Data, 42, infinity) end, fun(run) -> {ok, {[{_, Off1, Data, _}], []}} = - ?MUT:read_chunk(Prox1, NSInfo, FakeEpoch, - File1, Off1, Size1, undefined), + ?MUT:read_chunk(Prox1, FakeEpoch, + File1, Off1, Size1, []), ok; (line) -> io:format("line ~p, ", [?LINE]); - (stop) -> ?MUT:read_chunk(Prox1, NSInfo, FakeEpoch, - File1, Off1, Size1, undefined) + (stop) -> ?MUT:read_chunk(Prox1, FakeEpoch, + File1, Off1, Size1, []) end, fun(run) -> {ok, 
KludgeBin} = - ?MUT:checksum_list(Prox1, File1), + ?MUT:checksum_list(Prox1, FakeEpoch, File1), true = is_binary(KludgeBin), ok; (line) -> io:format("line ~p, ", [?LINE]); - (stop) -> ?MUT:checksum_list(Prox1, File1) + (stop) -> ?MUT:checksum_list(Prox1, FakeEpoch, File1) end, fun(run) -> {ok, _} = ?MUT:list_files(Prox1, FakeEpoch), @@ -284,21 +271,21 @@ flu_restart_test2() -> end, fun(run) -> ok = - ?MUT:write_chunk(Prox1, NSInfo, FakeEpoch, File1, Off1, - Data, NoCSum, infinity), + ?MUT:write_chunk(Prox1, FakeEpoch, File1, Off1, + Data, infinity), ok; (line) -> io:format("line ~p, ", [?LINE]); - (stop) -> ?MUT:write_chunk(Prox1, NSInfo, FakeEpoch, File1, Off1, - Data, NoCSum, infinity) + (stop) -> ?MUT:write_chunk(Prox1, FakeEpoch, File1, Off1, + Data, infinity) end, fun(run) -> {error, written} = - ?MUT:write_chunk(Prox1, NSInfo, FakeEpoch, File1, Off1, - Dataxx, NoCSum, infinity), + ?MUT:write_chunk(Prox1, FakeEpoch, File1, Off1, + Dataxx, infinity), ok; (line) -> io:format("line ~p, ", [?LINE]); - (stop) -> ?MUT:write_chunk(Prox1, NSInfo, FakeEpoch, File1, Off1, - Dataxx, NoCSum, infinity) + (stop) -> ?MUT:write_chunk(Prox1, FakeEpoch, File1, Off1, + Dataxx, infinity) end ], diff --git a/test/machi_test_util.erl b/test/machi_test_util.erl index 70b02af..ff908b7 100644 --- a/test/machi_test_util.erl +++ b/test/machi_test_util.erl @@ -83,7 +83,7 @@ stop_machi_sup() -> undefined -> ok; Pid -> catch exit(whereis(machi_sup), normal), - machi_util:wait_for_death(Pid, 100) + machi_util:wait_for_death(Pid, 30) end. clean_up(FluInfo) ->