diff --git a/FAQ.md b/FAQ.md index f2e37c1..6d43e8f 100644 --- a/FAQ.md +++ b/FAQ.md @@ -11,14 +11,14 @@ + [1 Questions about Machi in general](#n1) + [1.1 What is Machi?](#n1.1) - + [1.2 What is a Machi "cluster of clusters"?](#n1.2) - + [1.2.1 This "cluster of clusters" idea needs a better name, don't you agree?](#n1.2.1) - + [1.3 What is Machi like when operating in "eventually consistent" mode?](#n1.3) - + [1.4 What is Machi like when operating in "strongly consistent" mode?](#n1.4) - + [1.5 What does Machi's API look like?](#n1.5) - + [1.6 What licensing terms are used by Machi?](#n1.6) - + [1.7 Where can I find the Machi source code and documentation? Can I contribute?](#n1.7) - + [1.8 What is Machi's expected release schedule, packaging, and operating system/OS distribution support?](#n1.8) + + [1.2 What is a Machi chain?](#n1.2) + + [1.3 What is a Machi cluster?](#n1.3) + + [1.4 What is Machi like when operating in "eventually consistent" mode?](#n1.4) + + [1.5 What is Machi like when operating in "strongly consistent" mode?](#n1.5) + + [1.6 What does Machi's API look like?](#n1.6) + + [1.7 What licensing terms are used by Machi?](#n1.7) + + [1.8 Where can I find the Machi source code and documentation? Can I contribute?](#n1.8) + + [1.9 What is Machi's expected release schedule, packaging, and operating system/OS distribution support?](#n1.9) + [2 Questions about Machi relative to {{something else}}](#n2) + [2.1 How is Machi better than Hadoop?](#n2.1) + [2.2 How does Machi differ from HadoopFS/HDFS?](#n2.2) @@ -28,13 +28,15 @@ + [3 Machi's specifics](#n3) + [3.1 What technique is used to replicate Machi's files? Can other techniques be used?](#n3.1) + [3.2 Does Machi have a reliance on a coordination service such as ZooKeeper or etcd?](#n3.2) - + [3.3 Is it true that there's an allegory written to describe humming consensus?](#n3.3) - + [3.4 How is Machi tested?](#n3.4) - + [3.5 Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks](#n3.5) - + [3.6 Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device?](#n3.6) - + [3.7 What language(s) is Machi written in?](#n3.7) - + [3.8 Does Machi use the Erlang/OTP network distribution system (aka "disterl")?](#n3.8) - + [3.9 Can I use HTTP to write/read stuff into/from Machi?](#n3.9) + + [3.3 Are there any presentations available about Humming Consensus](#n3.3) + + [3.4 Is it true that there's an allegory written to describe Humming Consensus?](#n3.4) + + [3.5 How is Machi tested?](#n3.5) + + [3.6 Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks](#n3.6) + + [3.7 Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device?](#n3.7) + + [3.8 What language(s) is Machi written in?](#n3.8) + + [3.9 Can Machi run on Windows? Can Machi run on 32-bit platforms?](#n3.9) + + [3.10 Does Machi use the Erlang/OTP network distribution system (aka "disterl")?](#n3.10) + + [3.11 Can I use HTTP to write/read stuff into/from Machi?](#n3.11) @@ -48,7 +50,7 @@ Very briefly, Machi is a very simple append-only file store. Machi is "dumber" than many other file stores (i.e., lacking many features -found in other file stores) such as HadoopFS or simple NFS or CIFS file +found in other file stores) such as HadoopFS or a simple NFS or CIFS file server. 
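To make "append-only" slightly more concrete: a client appends bytes under a
file name *prefix*, Machi chooses the actual file name and the byte offset
where those bytes land, and the client later reads by file name, offset, and
size. The sketch below is illustrative Erlang-style pseudocode only; the
function names and argument lists are simplified stand-ins rather than
Machi's real client API (see section 1.6 for the real API summary).

```erlang
%% Illustrative sketch only; not the actual Machi client API.
Chunk = <<"some bytes">>,
{ok, File, Offset} = append_chunk(<<"myprefix">>, Chunk),
%% Machi chose both File (e.g. <<"myprefix^...opaque suffix...">>) and Offset.
{ok, Chunk} = read_chunk(File, Offset, byte_size(Chunk)).
%% Files are never modified in place; new data is only ever appended.
```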
However, Machi is a distributed file store, which makes it different (and, in some ways, more complicated) than a simple NFS or CIFS file @@ -82,45 +84,39 @@ For a much longer answer, please see the [Machi high level design doc](https://github.com/basho/machi/tree/master/doc/high-level-machi.pdf). -### 1.2. What is a Machi "cluster of clusters"? +### 1.2. What is a Machi chain? -Machi's design is based on using small, well-understood and provable -(mathematically) techniques to maintain multiple file copies without -data loss or data corruption. At its lowest level, Machi contains no -support for distribution/partitioning/sharding of files across many -servers. A typical, fully-functional Machi cluster will likely be two -or three machines. +A Machi chain is a small number of machines that maintain a common set +of replicated files. A typical chain is of length 2 or 3. For +critical data that must be available despite several simultaneous +server failures, a chain length of 6 or 7 might be used. -However, Machi is designed to be an excellent building block for -building larger systems. A deployment of Machi "cluster of clusters" -will use the "random slicing" technique for partitioning files across -multiple Machi clusters that, as individuals, are unaware of the -larger cluster-of-clusters scheme. + +### 1.3. What is a Machi cluster? -The cluster-of-clusters management service will be fully decentralized +A Machi cluster is a collection of Machi chains that +partitions/shards/distributes files (based on file name) across the +collection of chains. Machi uses the "random slicing" algorithm (a +variation of consistent hashing) to define the mapping of file name to +chain name. + +The cluster management service will be fully decentralized and run as a separate software service installed on each Machi cluster. This manager will appear to the local Machi server as simply -another Machi file client. The cluster-of-clusters managers will take +another Machi file client. The cluster managers will take care of file migration as the cluster grows and shrinks in capacity and in response to day-to-day changes in workload. -Though the cluster-of-clusters manager has not yet been implemented, +Though the cluster manager has not yet been implemented, its design is fully decentralized and capable of operating despite -multiple partial failure of its member clusters. We expect this +multiple partial failure of its member chains. We expect this design to scale easily to at least one thousand servers. Please see the [Machi source repository's 'doc' directory for more details](https://github.com/basho/machi/tree/master/doc/). - -#### 1.2.1. This "cluster of clusters" idea needs a better name, don't you agree? - -Yes. Please help us: we are bad at naming things. -For proof that naming things is hard, see -[http://martinfowler.com/bliki/TwoHardThings.html](http://martinfowler.com/bliki/TwoHardThings.html) - - -### 1.3. What is Machi like when operating in "eventually consistent" mode? + +### 1.4. What is Machi like when operating in "eventually consistent" mode? Machi's operating mode dictates how a Machi cluster will react to network partitions. A network partition may be caused by: @@ -143,13 +139,13 @@ consistency mode during and after network partitions are: together from "all sides" of the partition(s). * Unique files are copied in their entirety. * Byte ranges within the same file are merged. 
This is possible - due to Machi's restrictions on file naming (files names are - alwoys assigned by Machi servers) and file offset assignments - (byte offsets are also always chosen by Machi servers according - to rules which guarantee safe mergeability.). + due to Machi's restrictions on file naming and file offset + assignment. Both file names and file offsets are always chosen + by Machi servers according to rules which guarantee safe + mergeability. - -### 1.4. What is Machi like when operating in "strongly consistent" mode? + +### 1.5. What is Machi like when operating in "strongly consistent" mode? The consistency semantics of file operations while in strongly consistency mode during and after network partitions are: @@ -167,13 +163,13 @@ consistency mode during and after network partitions are: Machi's design can provide the illusion of quorum minority write availability if the cluster is configured to operate with "witness -servers". (This feaure is not implemented yet, as of June 2015.) +servers". (This feaure partially implemented, as of December 2015.) See Section 11 of [Machi chain manager high level design doc](https://github.com/basho/machi/tree/master/doc/high-level-chain-mgr.pdf) for more details. - -### 1.5. What does Machi's API look like? + +### 1.6. What does Machi's API look like? The Machi API only contains a handful of API operations. The function arguments shown below use Erlang-style type annotations. @@ -204,15 +200,15 @@ level" internal protocol are in a [Protocol Buffers](https://developers.google.com/protocol-buffers/docs/overview) definition at [./src/machi.proto](./src/machi.proto). - -### 1.6. What licensing terms are used by Machi? + +### 1.7. What licensing terms are used by Machi? All Machi source code and documentation is licensed by [Basho Technologies, Inc.](http://www.basho.com/) under the [Apache Public License version 2](https://github.com/basho/machi/tree/master/LICENSE). - -### 1.7. Where can I find the Machi source code and documentation? Can I contribute? + +### 1.8. Where can I find the Machi source code and documentation? Can I contribute? All Machi source code and documentation can be found at GitHub: [https://github.com/basho/machi](https://github.com/basho/machi). @@ -226,8 +222,8 @@ ideas for improvement, please see our contributing & collaboration guidelines at [https://github.com/basho/machi/blob/master/CONTRIBUTING.md](https://github.com/basho/machi/blob/master/CONTRIBUTING.md). - -### 1.8. What is Machi's expected release schedule, packaging, and operating system/OS distribution support? + +### 1.9. What is Machi's expected release schedule, packaging, and operating system/OS distribution support? Basho expects that Machi's first major product release will take place during the 2nd quarter of 2016. @@ -305,15 +301,15 @@ file's writable phase). Does not have any file distribution/partitioning/sharding across -Machi clusters: in a single Machi cluster, all files are replicated by -all servers in the cluster. The "cluster of clusters" concept is used +Machi chains: in a single Machi chain, all files are replicated by +all servers in the chain. The "random slicing" technique is used to distribute/partition/shard files across multiple Machi clusters. File distribution/partitioning/sharding is performed automatically by the HDFS "name node". - Machi requires no central "name node" for single cluster use. 
-Machi requires no central "name node" for "cluster of clusters" use + Machi requires no central "name node" for single chain use or +for multi-chain cluster use. Requires a single "namenode" server to maintain file system contents and file content mapping. (May be deployed with a "secondary namenode" to reduce unavailability when the primary namenode fails.) @@ -479,8 +475,8 @@ difficult to adapt to Machi's design goals: * Both protocols use quorum majority consensus, which requires a minimum of *2F + 1* working servers to tolerate *F* failures. For example, to tolerate 2 server failures, quorum majority protocols - require a minium of 5 servers. To tolerate the same number of - failures, Chain replication requires only 3 servers. + require a minimum of 5 servers. To tolerate the same number of + failures, Chain Replication requires a minimum of only 3 servers. * Machi's use of "humming consensus" to manage internal server metadata state would also (probably) require conversion to Paxos or Raft. (Or "outsourced" to a service such as ZooKeeper.) @@ -497,7 +493,17 @@ Humming consensus is described in the [Machi chain manager high level design doc](https://github.com/basho/machi/tree/master/doc/high-level-chain-mgr.pdf). -### 3.3. Is it true that there's an allegory written to describe humming consensus? +### 3.3. Are there any presentations available about Humming Consensus + +Scott recently (November 2015) gave a presentation at the +[RICON 2015 conference](http://ricon.io) about one of the techniques +used by Machi; "Managing Chain Replication Metadata with +Humming Consensus" is available online now. +* [slides (PDF format)](http://ricon.io/speakers/slides/Scott_Fritchie_Ricon_2015.pdf) +* [video](https://www.youtube.com/watch?v=yR5kHL1bu1Q) + + +### 3.4. Is it true that there's an allegory written to describe Humming Consensus? Yes. In homage to Leslie Lamport's original paper about the Paxos protocol, "The Part-time Parliamant", there is an allegorical story @@ -508,8 +514,8 @@ The full story, full of wonder and mystery, is called There is also a [short followup blog posting](http://www.snookles.com/slf-blog/2015/03/20/on-humming-consensus-an-allegory-part-2/). - -### 3.4. How is Machi tested? + +### 3.5. How is Machi tested? While not formally proven yet, Machi's implementation of Chain Replication and of humming consensus have been extensively tested with @@ -538,16 +544,16 @@ All test code is available in the [./test](./test) subdirectory. Modules that use QuickCheck will use a file suffix of `_eqc`, for example, [./test/machi_ap_repair_eqc.erl](./test/machi_ap_repair_eqc.erl). - -### 3.5. Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks + +### 3.6. Does Machi require shared disk storage? e.g. iSCSI, NBD (Network Block Device), Fibre Channel disks No, Machi's design assumes that each Machi server is a fully independent hardware and assumes only standard local disks (Winchester and/or SSD style) with local-only interfaces (e.g. SATA, SCSI, PCI) in each machine. - -### 3.6. Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device? + +### 3.7. Does Machi require or assume that servers with large numbers of disks must use RAID-0/1/5/6/10/50/60 to create a single block device? No. When used with servers with multiple disks, the intent is to deploy multiple Machi servers per machine: one Machi server per disk. 
@@ -565,10 +571,10 @@ deploy multiple Machi servers per machine: one Machi server per disk. placement relative to 12 servers is smaller than a placement problem of managing 264 seprate disks (if each of 12 servers has 22 disks). - -### 3.7. What language(s) is Machi written in? + +### 3.8. What language(s) is Machi written in? -So far, Machi is written in 100% Erlang. Machi uses at least one +So far, Machi is written in Erlang, mostly. Machi uses at least one library, [ELevelDB](https://github.com/basho/eleveldb), that is implemented both in C++ and in Erlang, using Erlang NIFs (Native Interface Functions) to allow Erlang code to call C++ functions. @@ -580,8 +586,16 @@ in C, Java, or other "gotta go fast fast FAST!!" programming language. We expect that the Chain Replication manager and other critical "control plane" software will remain in Erlang. - -### 3.8. Does Machi use the Erlang/OTP network distribution system (aka "disterl")? + +### 3.9. Can Machi run on Windows? Can Machi run on 32-bit platforms? + +The ELevelDB NIF does not compile or run correctly on Erlang/OTP +Windows platforms, nor does it compile correctly on 32-bit platforms. +Machi should support all 64-bit UNIX-like platforms that are supported +by Erlang/OTP and ELevelDB. + + +### 3.10. Does Machi use the Erlang/OTP network distribution system (aka "disterl")? No, Machi doesn't use Erlang/OTP's built-in distributed message passing system. The code would be *much* simpler if we did use @@ -596,8 +610,8 @@ All wire protocols used by Machi are defined & implemented using [Protocol Buffers](https://developers.google.com/protocol-buffers/docs/overview). The definition file can be found at [./src/machi.proto](./src/machi.proto). - -### 3.9. Can I use HTTP to write/read stuff into/from Machi? + +### 3.11. Can I use HTTP to write/read stuff into/from Machi? Short answer: No, not yet. diff --git a/doc/README.md b/doc/README.md index 3ad424c..b8e1949 100644 --- a/doc/README.md +++ b/doc/README.md @@ -66,9 +66,9 @@ an introduction to the self-management algorithm proposed for Machi. Most material has been moved to the [high-level-chain-mgr.pdf](high-level-chain-mgr.pdf) document. -### cluster-of-clusters (directory) +### cluster (directory) -This directory contains the sketch of the "cluster of clusters" design +This directory contains the sketch of the cluster design strawman for partitioning/distributing/sharding files across a large -number of independent Machi clusters. +number of independent Machi chains. diff --git a/doc/cluster-of-clusters/migration-3to4.png b/doc/cluster-of-clusters/migration-3to4.png deleted file mode 100644 index e7ec417..0000000 Binary files a/doc/cluster-of-clusters/migration-3to4.png and /dev/null differ diff --git a/doc/cluster-of-clusters/migration-4.png b/doc/cluster-of-clusters/migration-4.png deleted file mode 100644 index 3e1414d..0000000 Binary files a/doc/cluster-of-clusters/migration-4.png and /dev/null differ diff --git a/doc/cluster-of-clusters/name-game-sketch.org b/doc/cluster-of-clusters/name-game-sketch.org deleted file mode 100644 index 44b5df0..0000000 --- a/doc/cluster-of-clusters/name-game-sketch.org +++ /dev/null @@ -1,479 +0,0 @@ --*- mode: org; -*- -#+TITLE: Machi cluster-of-clusters "name game" sketch -#+AUTHOR: Scott -#+STARTUP: lognotedone hidestars indent showall inlineimages -#+SEQ_TODO: TODO WORKING WAITING DONE -#+COMMENT: M-x visual-line-mode -#+COMMENT: Also, disable auto-fill-mode - -* 1. 
"Name Games" with random-slicing style consistent hashing - -Our goal: to distribute lots of files very evenly across a cluster of -Machi clusters (hereafter called a "cluster of clusters" or "CoC"). - -* 2. Assumptions - -** Basic familiarity with Machi high level design and Machi's "projection" - -The [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] contains all of the basic -background assumed by the rest of this document. - -** Analogy: "neighborhood : city :: Machi : cluster-of-clusters" - -Analogy: The word "machi" in Japanese means small town or -neighborhood. As the Tokyo Metropolitan Area is built from many -machis and smaller cities, therefore a big, partitioned file store can -be built out of many small Machi clusters. - -** Familiarity with the Machi cluster-of-clusters/CoC concept - -It's clear (I hope!) from -the [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] that Machi alone does not support -any kind of file partitioning/distribution/sharding across multiple -small Machi clusters. There must be another layer above a Machi cluster to -provide such partitioning services. - -The name "cluster of clusters" originated within Basho to avoid -conflicting use of the word "cluster". A Machi cluster is usually -synonymous with a single Chain Replication chain and a single set of -machines (e.g. 2-5 machines). However, in the not-so-far future, we -expect much more complicated patterns of Chain Replication to be used -in real-world deployments. - -"Cluster of clusters" is clunky and long, but we haven't found a good -substitute yet. If you have a good suggestion, please contact us! -~^_^~ - -Using the [[https://github.com/basho/machi/tree/master/prototype/demo-day-hack][cluster-of-clusters quick-and-dirty prototype]] as an -architecture sketch, let's now assume that we have ~n~ independent Machi -clusters. We assume that each of these clusters has roughly the same -chain length in the nominal case, e.g. chain length of 3. -We wish to provide partitioned/distributed file storage -across all ~n~ clusters. We call the entire collection of ~n~ Machi -clusters a "cluster of clusters", or abbreviated "CoC". - -We may wish to have several types of Machi clusters, e.g. chain length -of 3 for normal data, longer for cannot-afford-data-loss files, and -shorter for don't-care-if-it-gets-lost files. Each of these types of -chains will have a name ~N~ in the CoC namespace. The role of the CoC -namespace will be demonstrated in Section 3 below. - -** Continue CoC prototype's assumption: a Machi cluster is unaware of CoC - -Let's continue with an assumption that an individual Machi cluster -inside of the cluster-of-clusters is completely unaware of the -cluster-of-clusters layer. - -TODO: We may need to break this assumption sometime in the future? - -** The reader is familiar with the random slicing technique - -I'd done something very-very-nearly-identical for the Hibari database -6 years ago. But the Hibari technique was based on stuff I did at -Sendmail, Inc, so it felt old news to me. 
{shrug} - -The Hibari documentation has a brief photo illustration of how random -slicing works, see [[http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration][Hibari Sysadmin Guide, chain migration]] - -For a comprehensive description, please see these two papers: - -#+BEGIN_QUOTE -Reliable and Randomized Data Distribution Strategies for Large Scale Storage Systems -Alberto Miranda et al. -http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.226.5609 - (short version, HIPC'11) - -Random Slicing: Efficient and Scalable Data Placement for Large-Scale - Storage Systems -Alberto Miranda et al. -DOI: http://dx.doi.org/10.1145/2632230 (long version, ACM Transactions - on Storage, Vol. 10, No. 3, Article 9, 2014) -#+END_QUOTE - -** CoC locator: We borrow from random slicing but do not hash any strings! - -We will use the general technique of random slicing, but we adapt the -technique to fit our use case. - -In general, random slicing says: - -- Hash a string onto the unit interval [0.0, 1.0) -- Calculate h(unit interval point, Map) -> bin, where ~Map~ partitions - the unit interval into bins. - -Our adaptation is in step 1: we do not hash any strings. Instead, we -store & use the unit interval point as-is, without using a hash -function in this step. This number is called the "CoC locator". - -As described later in this doc, Machi file names are structured into -several components. One component of the file name contains the "CoC -locator"; we use the number as-is for step 2 above. - -* 3. A simple illustration - -We use a variation of the Random Slicing hash that we will call -~rs_hash_with_float()~. The Erlang-style function type is shown -below. - -#+BEGIN_SRC erlang -%% type specs, Erlang-style --spec rs_hash_with_float(float(), rs_hash:map()) -> rs_hash:cluster_id(). -#+END_SRC - -I'm borrowing an illustration from the HibariDB documentation here, -but it fits my purposes quite well. (I am the original creator of that -image, and also the use license is compatible.) - -#+CAPTION: Illustration of 'Map', using four Machi clusters - -[[./migration-4.png]] - -Assume that we have a random slicing map called ~Map~. This particular -~Map~ maps the unit interval onto 4 Machi clusters: - -| Hash range | Cluster ID | -|-------------+------------| -| 0.00 - 0.25 | Cluster1 | -| 0.25 - 0.33 | Cluster4 | -| 0.33 - 0.58 | Cluster2 | -| 0.58 - 0.66 | Cluster4 | -| 0.66 - 0.91 | Cluster3 | -| 0.91 - 1.00 | Cluster4 | - -Assume that the system chooses a CoC locator of 0.05. -According to ~Map~, the value of -~rs_hash_with_float(0.05,Map) = Cluster1~. -Similarly, ~rs_hash_with_float(0.26,Map) = Cluster4~. - -* 4. An additional assumption: clients will want some control over file location - -We will continue to use the 4-cluster diagram from the previous -section. - -** Our new assumption: client control over initial file location - -The CoC management scheme may decide that files need to migrate to -other clusters. The reason could be for storage load or I/O load -balancing reasons. It could be because a cluster is being -decommissioned by its owners. There are many legitimate reasons why a -file that is initially created on cluster ID X has been moved to -cluster ID Y. - -However, there are also legitimate reasons for why the client would want -control over the choice of Machi cluster when the data is first -written. The single biggest reason is load balancing. 
Assuming that -the client (or the CoC management layer acting on behalf of the CoC -client) knows the current utilization across the participating Machi -clusters, then it may be very helpful to send new append() requests to -under-utilized clusters. - -* 5. Use of the CoC namespace: name separation plus chain type - -Let us assume that the CoC framework provides several different types -of chains: - -| Chain length | CoC namespace | Mode | Comment | -|--------------+---------------+------+----------------------------------| -| 3 | normal | AP | Normal storage redundancy & cost | -| 2 | reduced | AP | Reduced cost storage | -| 1 | risky | AP | Really, really cheap storage | -| 9 | paranoid | AP | Safety-critical storage | -| 3 | sequential | CP | Strong consistency | -|--------------+---------------+------+----------------------------------| - -The client may want to choose the amount of redundancy that its -application requires: normal, reduced cost, or perhaps even a single -copy. The CoC namespace is used by the client to signal this -intention. - -Further, the CoC administrators may wish to use the namespace to -provide separate storage for different applications. Jane's -application may use the namespace "jane-normal" and Bob's app uses -"bob-reduced". The CoC administrators may definite separate groups of -chains on separate servers to serve these two applications. - -* 6. Floating point is not required ... it is merely convenient for explanation - -NOTE: Use of floating point terms is not required. For example, -integer arithmetic could be used, if using a sufficiently large -interval to create an even & smooth distribution of hashes across the -expected maximum number of clusters. - -For example, if the maximum CoC cluster size would be 4,000 individual -Machi clusters, then a minimum of 12 bits of integer space is required -to assign one integer per Machi cluster. However, for load balancing -purposes, a finer grain of (for example) 100 integers per Machi -cluster would permit file migration to move increments of -approximately 1% of single Machi cluster's storage capacity. A -minimum of 12+7=19 bits of hash space would be necessary to accommodate -these constraints. - -It is likely that Machi's final implementation will choose a 24 bit -integer to represent the CoC locator. - -* 7. Proposal: Break the opacity of Machi file names - -Machi assigns file names based on: - -~ClientSuppliedPrefix ++ "^" ++ SomeOpaqueFileNameSuffix~ - -What if the CoC client could peek inside of the opaque file name -suffix in order to look at the CoC location information that we might -code in the filename suffix? - -** The notation we use - -- ~T~ = the target CoC member/Cluster ID chosen by the CoC client at the time of ~append()~ -- ~p~ = file prefix, chosen by the CoC client. -- ~L~ = the CoC locator -- ~N~ = the CoC namespace -- ~u~ = the Machi file server unique opaque file name suffix, e.g. a GUID string -- ~F~ = a Machi file name, i.e., ~p^L^N^u~ - -** The details: CoC file write - -1. CoC client chooses ~p~, ~T~, and ~N~ (i.e., the file prefix, target - cluster, and target cluster namespace) -2. CoC client knows the CoC ~Map~ for namespace ~N~. -3. CoC client choose some CoC locator value ~L~ such that - ~rs_hash_with_float(L,Map) = T~ (see below). -4. CoC client sends its request to cluster - ~T~: ~append_chunk(p,L,N,...) -> {ok,p^L^N^u,ByteOffset}~ -5. CoC stores/uses the file name ~F = p^L^N^u~. - -** The details: CoC file read - -1. 
CoC client knows the file name ~F~ and parses it to find - the values of ~L~ and ~N~ (recall, ~F = p^L^N^u~). -2. CoC client knows the CoC ~Map~ for type ~N~. -3. CoC calculates ~rs_hash_with_float(L,Map) = T~ -4. CoC client sends request to cluster ~T~: ~read_chunk(F,...) ->~ ... success! - -** The details: calculating 'L' (the CoC locator) to match a desired target cluster - -1. We know ~Map~, the current CoC mapping for a CoC namespace ~N~. -2. We look inside of ~Map~, and we find all of the unit interval ranges - that map to our desired target cluster ~T~. Let's call this list - ~MapList = [Range1=(start,end],Range2=(start,end],...]~. -3. In our example, ~T=Cluster2~. The example ~Map~ contains a single - unit interval range for ~Cluster2~, ~[(0.33,0.58]]~. -4. Choose a uniformly random number ~r~ on the unit interval. -5. Calculate locator ~L~ by mapping ~r~ onto the concatenation - of the CoC hash space range intervals in ~MapList~. For example, - if ~r=0.5~, then ~L = 0.33 + 0.5*(0.58-0.33) = 0.455~, which is - exactly in the middle of the ~(0.33,0.58]~ interval. - -** A bit more about the CoC locator's meaning and use - -- If two files were written using exactly the same CoC locator and the - same CoC namespace, then the client is indicating that it wishes - that the two files be stored in the same chain. -- If two files have a different CoC locator, then the client has - absolutely no expectation of where the two files will be stored - relative to each other. - -Given the items above, then some consequences are: - -- If the client doesn't care about CoC placement, then picking a - random number is fine. Always choosing a different locator ~L~ for - each append will scatter data across the CoC as widely as possible. -- If the client believes that some physical locality is good, then the - client should reuse the same locator ~L~ for a batch of appends to - the same prefix ~p~ and namespace ~N~. We have no recommendations - for the batch size, yet; perhaps 10-1,000 might be a good start for - experiments? - -When the client choose CoC namespace ~N~ and CoC locator ~L~ (using -random number or target cluster technique), the client uses ~N~'s CoC -map to find the CoC target cluster, ~T~. The client has also chosen -the file prefix ~p~. The append op sent to cluster ~T~ would look -like: - -~append_chunk(N="reduced",L=0.25,p="myprefix",<<900-data-bytes>>,<>,...)~ - -A successful result would yield a chunk position: - -~{offset=883293,size=900,file="myprefix^reduced^0.25^OpaqueSuffix"}~ - -** A bit more about the CoC namespaces's meaning and use - -- The CoC framework will provide means of creating and managing - chains of different types, e.g., chain length, consistency mode. -- The CoC framework will manage the mapping of CoC namespace names to - the chains in the system. -- The CoC framework will provide a query service to map a CoC - namespace name to a Coc map, - e.g. ~coc_latest_map("reduced") -> Map{generation=7,...}~. - -For use by Riak CS, for example, we'd likely start with the following -namespaces ... working our way down the list as we add new features -and/or re-implement existing CS features. - -- "standard" = Chain length = 3, eventually consistency mode -- "reduced" = Chain length = 2, eventually consistency mode. -- "stanchion7" = Chain length = 7, strong consistency mode. Perhaps - use this namespace for the metadata required to re-implement the - operations that are performed by today's Stanchion application. - -* 8. File migration (a.k.a. 
rebalancing/reparitioning/resharding/redistribution) - -** What is "migration"? - -This section describes Machi's file migration. Other storage systems -call this process as "rebalancing", "repartitioning", "resharding" or -"redistribution". -For Riak Core applications, it is called "handoff" and "ring resizing" -(depending on the context). -See also the [[http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsUserGuide.html#Balancer][Hadoop file balancer]] for another example of a data -migration process. - -As discussed in section 5, the client can have good reason for wanting -to have some control of the initial location of the file within the -cluster. However, the cluster manager has an ongoing interest in -balancing resources throughout the lifetime of the file. Disks will -get full, hardware will change, read workload will fluctuate, -etc etc. - -This document uses the word "migration" to describe moving data from -one Machi chain to another within a CoC system. - -A simple variation of the Random Slicing hash algorithm can easily -accommodate Machi's need to migrate files without interfering with -availability. Machi's migration task is much simpler due to the -immutable nature of Machi file data. - -** Change to Random Slicing - -The map used by the Random Slicing hash algorithm needs a few simple -changes to make file migration straightforward. - -- Add a "generation number", a strictly increasing number (similar to - a Machi cluster's "epoch number") that reflects the history of - changes made to the Random Slicing map -- Use a list of Random Slicing maps instead of a single map, one map - per chance that files may not have been migrated yet out of - that map. - -As an example: - -#+CAPTION: Illustration of 'Map', using four Machi clusters - -[[./migration-3to4.png]] - -And the new Random Slicing map for some CoC namespace ~N~ might look -like this: - -| Generation number / Namespace | 7 / reduced | -|-------------------------------+-------------| -| SubMap | 1 | -|-------------------------------+-------------| -| Hash range | Cluster ID | -|-------------------------------+-------------| -| 0.00 - 0.33 | Cluster1 | -| 0.33 - 0.66 | Cluster2 | -| 0.66 - 1.00 | Cluster3 | -|-------------------------------+-------------| -| SubMap | 2 | -|-------------------------------+-------------| -| Hash range | Cluster ID | -|-------------------------------+-------------| -| 0.00 - 0.25 | Cluster1 | -| 0.25 - 0.33 | Cluster4 | -| 0.33 - 0.58 | Cluster2 | -| 0.58 - 0.66 | Cluster4 | -| 0.66 - 0.91 | Cluster3 | -| 0.91 - 1.00 | Cluster4 | - -When a new Random Slicing map contains a single submap, then its use -is identical to the original Random Slicing algorithm. If the map -contains multiple submaps, then the access rules change a bit: - -- Write operations always go to the newest/largest submap. -- Read operations attempt to read from all unique submaps. - - Skip searching submaps that refer to the same cluster ID. - - In this example, unit interval value 0.10 is mapped to Cluster1 - by both submaps. - - Read from newest/largest submap to oldest/smallest submap. - - If not found in any submap, search a second time (to handle races - with file copying between submaps). - - If the requested data is found, optionally copy it directly to the - newest submap. (This is a variation of read repair (RR). RR here - accelerates the migration process and can reduce the number of - operations required to query servers in multiple submaps). 
- -The cluster-of-clusters manager is responsible for: - -- Managing the various generations of the CoC Random Slicing maps for - all namespaces. -- Distributing namespace maps to CoC clients. -- Managing the processes that are responsible for copying "cold" data, - i.e., files data that is not regularly accessed, to its new submap - location. -- When migration of a file to its new cluster is confirmed successful, - delete it from the old cluster. - -In example map #7, the CoC manager will copy files with unit interval -assignments in ~(0.25,0.33]~, ~(0.58,0.66]~, and ~(0.91,1.00]~ from their -old locations in cluster IDs Cluster1/2/3 to their new cluster, -Cluster4. When the CoC manager is satisfied that all such files have -been copied to Cluster4, then the CoC manager can create and -distribute a new map, such as: - -| Generation number / Namespace | 8 / reduced | -|-------------------------------+-------------| -| SubMap | 1 | -|-------------------------------+-------------| -| Hash range | Cluster ID | -|-------------------------------+-------------| -| 0.00 - 0.25 | Cluster1 | -| 0.25 - 0.33 | Cluster4 | -| 0.33 - 0.58 | Cluster2 | -| 0.58 - 0.66 | Cluster4 | -| 0.66 - 0.91 | Cluster3 | -| 0.91 - 1.00 | Cluster4 | - -The HibariDB system performs data migrations in almost exactly this -manner. However, one important -limitation of HibariDB is not being able to -perform more than one migration at a time. HibariDB's data is -mutable, and mutation causes many problems already when migrating data -across two submaps; three or more submaps was too complex to implement -quickly. - -Fortunately for Machi, its file data is immutable and therefore can -easily manage many migrations in parallel, i.e., its submap list may -be several maps long, each one for an in-progress file migration. - -* 9. Other considerations for FLU/sequencer implementations - -** Append to existing file when possible - -In the earliest Machi FLU implementation, it was impossible to append -to the same file after ~30 seconds. For example: - -- Client: ~append(prefix="foo",...) -> {ok,"foo^suffix1",Offset1}~ -- Client: ~append(prefix="foo",...) -> {ok,"foo^suffix1",Offset2}~ -- Client: ~append(prefix="foo",...) -> {ok,"foo^suffix1",Offset3}~ -- Client: sleep 40 seconds -- Server: after 30 seconds idle time, stop Erlang server process for - the ~"foo^suffix1"~ file -- Client: ...wakes up... -- Client: ~append(prefix="foo",...) -> {ok,"foo^suffix2",Offset4}~ - -Our ideal append behavior is to always append to the same file. Why? -It would be nice if Machi didn't create zillions of tiny files if the -client appends to some prefix very infrequently. In general, it is -better to create fewer & bigger files by re-using a Machi file name -when possible. - -The sequencer should always assign new offsets to the latest/newest -file for any prefix, as long as all prerequisites are also true, - -- The epoch has not changed. (In AP mode, epoch change -> mandatory file name suffix change.) -- The latest file for prefix ~p~ is smaller than maximum file size for a FLU's configuration. - -* 10. Acknowledgments - -The source for the "migration-4.png" and "migration-3to4.png" images -come from the [[http://hibari.github.io/hibari-doc/images/migration-3to4.png][HibariDB documentation]]. 
- diff --git a/doc/cluster-of-clusters/migration-3to4.fig b/doc/cluster/migration-3to4.fig similarity index 85% rename from doc/cluster-of-clusters/migration-3to4.fig rename to doc/cluster/migration-3to4.fig index eadf105..0faad27 100644 --- a/doc/cluster-of-clusters/migration-3to4.fig +++ b/doc/cluster/migration-3to4.fig @@ -88,16 +88,16 @@ Single 4 0 0 50 -1 2 14 0.0000 4 180 495 4425 3525 ~8%\001 4 0 0 50 -1 2 14 0.0000 4 240 1710 5025 3525 ~25% total keys\001 4 0 0 50 -1 2 14 0.0000 4 180 495 6825 3525 ~8%\001 -4 0 0 50 -1 2 24 0.0000 4 270 1485 600 600 Cluster1\001 -4 0 0 50 -1 2 24 0.0000 4 270 1485 3000 600 Cluster2\001 -4 0 0 50 -1 2 24 0.0000 4 270 1485 5400 600 Cluster3\001 -4 0 0 50 -1 2 24 0.0000 4 270 1485 300 2850 Cluster1\001 -4 0 0 50 -1 2 24 0.0000 4 270 1485 2700 2850 Cluster2\001 -4 0 0 50 -1 2 24 0.0000 4 270 1485 5175 2850 Cluster3\001 -4 0 0 50 -1 2 24 0.0000 4 270 405 2100 2625 Cl\001 -4 0 0 50 -1 2 24 0.0000 4 270 405 6900 2625 Cl\001 4 0 0 50 -1 2 24 0.0000 4 270 195 2175 3075 4\001 4 0 0 50 -1 2 24 0.0000 4 270 195 4575 3075 4\001 4 0 0 50 -1 2 24 0.0000 4 270 195 6975 3075 4\001 -4 0 0 50 -1 2 24 0.0000 4 270 405 4500 2625 Cl\001 -4 0 0 50 -1 2 18 0.0000 4 240 3990 1200 4875 CoC locator, on the unit interval\001 +4 0 0 50 -1 2 24 0.0000 4 270 1245 600 600 Chain1\001 +4 0 0 50 -1 2 24 0.0000 4 270 1245 3000 600 Chain2\001 +4 0 0 50 -1 2 24 0.0000 4 270 1245 5400 600 Chain3\001 +4 0 0 50 -1 2 24 0.0000 4 270 285 2100 2625 C\001 +4 0 0 50 -1 2 24 0.0000 4 270 285 4500 2625 C\001 +4 0 0 50 -1 2 24 0.0000 4 270 285 6900 2625 C\001 +4 0 0 50 -1 2 24 0.0000 4 270 1245 525 2850 Chain1\001 +4 0 0 50 -1 2 24 0.0000 4 270 1245 2925 2850 Chain2\001 +4 0 0 50 -1 2 24 0.0000 4 270 1245 5325 2850 Chain3\001 +4 0 0 50 -1 2 18 0.0000 4 240 4350 1350 4875 Cluster locator, on the unit interval\001 diff --git a/doc/cluster/migration-3to4.png b/doc/cluster/migration-3to4.png new file mode 100644 index 0000000..cbef7e9 Binary files /dev/null and b/doc/cluster/migration-3to4.png differ diff --git a/doc/cluster/migration-4.png b/doc/cluster/migration-4.png new file mode 100644 index 0000000..b1e2b31 Binary files /dev/null and b/doc/cluster/migration-4.png differ diff --git a/doc/cluster/name-game-sketch.org b/doc/cluster/name-game-sketch.org new file mode 100644 index 0000000..21d2bd6 --- /dev/null +++ b/doc/cluster/name-game-sketch.org @@ -0,0 +1,481 @@ +-*- mode: org; -*- +#+TITLE: Machi cluster "name game" sketch +#+AUTHOR: Scott +#+STARTUP: lognotedone hidestars indent showall inlineimages +#+SEQ_TODO: TODO WORKING WAITING DONE +#+COMMENT: M-x visual-line-mode +#+COMMENT: Also, disable auto-fill-mode + +* 1. "Name Games" with random-slicing style consistent hashing + +Our goal: to distribute lots of files very evenly across a large +collection of individual, small Machi chains. + +* 2. Assumptions + +** Basic familiarity with Machi high level design and Machi's "projection" + +The [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] contains all of the basic +background assumed by the rest of this document. + +** Analogy: "neighborhood : city :: Machi chain : Machi cluster" + +Analogy: The word "machi" in Japanese means small town or +neighborhood. As the Tokyo Metropolitan Area is built from many +machis and smaller cities, therefore a big, partitioned file store can +be built out of many small Machi chains. + +** Familiarity with the Machi chain concept + +It's clear (I hope!) 
from +the [[https://github.com/basho/machi/blob/master/doc/high-level-machi.pdf][Machi high level design document]] that Machi alone does not support +any kind of file partitioning/distribution/sharding across multiple +small Machi chains. There must be another layer above a Machi chain to +provide such partitioning services. + +Using the [[https://github.com/basho/machi/tree/master/prototype/demo-day-hack][cluster quick-and-dirty prototype]] as an +architecture sketch, let's now assume that we have ~n~ independent Machi +chains. We assume that each of these chains has the same +chain length in the nominal case, e.g. chain length of 3. +We wish to provide partitioned/distributed file storage +across all ~n~ chains. We call the entire collection of ~n~ Machi +chains a "cluster". + +We may wish to have several types of Machi clusters. For example: + ++ Chain length of 1 for "don't care if it gets lost, + store stuff very very cheaply" data. ++ Chain length of 2 for normal data. + + Equivalent to quorum replication's reliability with 3 copies. ++ Chain length of 7 for critical, unreplaceable data. + + Equivalent to quorum replication's reliability with 15 copies. + +Each of these types of chains will have a name ~N~ in the +namespace. The role of the cluster namespace will be demonstrated in +Section 3 below. + +** Continue an early assumption: a Machi chain is unaware of clustering + +Let's continue with an assumption that an individual Machi chain +inside of a cluster is completely unaware of the cluster layer. + +** The reader is familiar with the random slicing technique + +I'd done something very-very-nearly-like-this for the Hibari database +6 years ago. But the Hibari technique was based on stuff I did at +Sendmail, Inc, in 2000, so this technique feels like old news to me. +{shrug} + +The following section provides an illustrated example. +Very quickly, the random slicing algorithm is: + +- Hash a string onto the unit interval [0.0, 1.0) +- Calculate h(unit interval point, Map) -> bin, where ~Map~ divides + the unit interval into bins (or partitions or shards). + +Machi's adaptation is in step 1: we do not hash any strings. Instead, we +simply choose a number on the unit interval. This number is called +the "cluster locator number". + +As described later in this doc, Machi file names are structured into +several components. One component of the file name contains the cluster +locator number; we use the number as-is for step 2 above. + +*** For more information about Random Slicing + +For a comprehensive description of random slicing, please see the +first two papers. For a quicker summary, please see the third +reference. + +#+BEGIN_QUOTE +Reliable and Randomized Data Distribution Strategies for Large Scale Storage Systems +Alberto Miranda et al. +http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.226.5609 + (short version, HIPC'11) + +Random Slicing: Efficient and Scalable Data Placement for Large-Scale + Storage Systems +Alberto Miranda et al. +DOI: http://dx.doi.org/10.1145/2632230 (long version, ACM Transactions + on Storage, Vol. 10, No. 3, Article 9, 2014) + +[[http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration][Hibari Sysadmin Guide, chain migration section]]. +http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration +#+END_QUOTE + +* 3. A simple illustration + +We use a variation of the Random Slicing hash that we will call +~rs_hash_with_float()~. The Erlang-style function type is shown +below. 
+ +#+BEGIN_SRC erlang +%% type specs, Erlang-style +-spec rs_hash_with_float(float(), rs_hash:map()) -> rs_hash:chain_id(). +#+END_SRC + +I'm borrowing an illustration from the HibariDB documentation here, +but it fits my purposes quite well. (I am the original creator of that +image, and also the use license is compatible.) + +#+CAPTION: Illustration of 'Map', using four Machi chains + +[[./migration-4.png]] + +Assume that we have a random slicing map called ~Map~. This particular +~Map~ maps the unit interval onto 4 Machi chains: + +| Hash range | Chain ID | +|-------------+----------| +| 0.00 - 0.25 | Chain1 | +| 0.25 - 0.33 | Chain4 | +| 0.33 - 0.58 | Chain2 | +| 0.58 - 0.66 | Chain4 | +| 0.66 - 0.91 | Chain3 | +| 0.91 - 1.00 | Chain4 | + +Assume that the system chooses a cluster locator of 0.05. +According to ~Map~, the value of +~rs_hash_with_float(0.05,Map) = Chain1~. +Similarly, ~rs_hash_with_float(0.26,Map) = Chain4~. + +This example should look very similar to Hibari's technique. +The Hibari documentation has a brief photo illustration of how random +slicing works, see [[http://hibari.github.io/hibari-doc/hibari-sysadmin-guide.en.html#chain-migration][Hibari Sysadmin Guide, chain migration]]. + +* 4. Use of the cluster namespace: name separation plus chain type + +Let us assume that the cluster framework provides several different types +of chains: + +| Chain length | Namespace | Consistency Mode | Comment | +|--------------+--------------+------------------+----------------------------------| +| 3 | ~normal~ | eventual | Normal storage redundancy & cost | +| 2 | ~reduced~ | eventual | Reduced cost storage | +| 1 | ~risky~ | eventual | Really, really cheap storage | +| 7 | ~paranoid~ | eventual | Safety-critical storage | +| 3 | ~sequential~ | strong | Strong consistency | +|--------------+--------------+------------------+----------------------------------| + +The client may want to choose the amount of redundancy that its +application requires: normal, reduced cost, or perhaps even a single +copy. The cluster namespace is used by the client to signal this +intention. + +Further, the cluster administrators may wish to use the namespace to +provide separate storage for different applications. Jane's +application may use the namespace "jane-normal" and Bob's app uses +"bob-reduced". Administrators may definine separate groups of +chains on separate servers to serve these two applications. + +* 5. In its lifetime, a file may be moved to different chains + +The cluster management scheme may decide that files need to migrate to +other chains -- i.e., file that is initially created on chain ID ~X~ +has been moved to chain ID ~Y~. + ++ For storage load or I/O load balancing reasons. ++ Because a chain is being decommissioned by the sysadmin. + +* 6. Floating point is not required ... it is merely convenient for explanation + +NOTE: Use of floating point terms is not required. For example, +integer arithmetic could be used, if using a sufficiently large +interval to create an even & smooth distribution of hashes across the +expected maximum number of chains. + +For example, if the maximum cluster size would be 4,000 individual +Machi chains, then a minimum of 12 bits of integer space is required +to assign one integer per Machi chain. However, for load balancing +purposes, a finer grain of (for example) 100 integers per Machi +chain would permit file migration to move increments of +approximately 1% of single Machi chain's storage capacity. 
A +minimum of 12+7=19 bits of hash space would be necessary to accommodate +these constraints. + +It is likely that Machi's final implementation will choose a 24 bit +integer (or perhaps 32 bits) to represent the cluster locator. + +* 7. Proposal: Break the opacity of Machi file names, slightly. + +Machi assigns file names based on: + +~ClientSuppliedPrefix ++ "^" ++ SomeOpaqueFileNameSuffix~ + +What if some parts of the system could peek inside of the opaque file name +suffix in order to look at the cluster location information that we might +code in the filename suffix? + +We break the system into parts that speak two levels of protocols, +"high" and "low". + ++ The high level protocol is used outside of the Machi cluster ++ The low level protocol is used inside of the Machi cluster + +Both protocols are based on a Protocol Buffers specification and +implementation. Other protocols, such as HTTP, will be added later. + +#+BEGIN_SRC + +-----------------------+ + | Machi external client | + | e.g. Riak CS | + +-----------------------+ + ^ + | Machi "high" API + | ProtoBuffs protocol Machi cluster boundary: outside +......................................................................... + | Machi cluster boundary: inside + v + +--------------------------+ +------------------------+ + | Machi "high" API service | | Machi HTTP API service | + +--------------------------+ +------------------------+ + ^ | + | +------------------------+ + v v + +------------------------+ + | Cluster bridge service | + +------------------------+ + ^ + | Machi "low" API + | ProtoBuffs protocol + +----------------------------------------+----+----+ + | | | | + v v v v + +-------------------------+ ... other chains... + | Chain C1 (logical view) | + | +--------------+ | + | | FLU server 1 | | + | | +--------------+ | + | +--| FLU server 2 | | + | +--------------+ | In reality, API bridge talks directly + +-------------------------+ to each FLU server in a chain. +#+END_SRC + +** The notation we use + +- ~N~ = the cluster namespace, chosen by the client. +- ~p~ = file prefix, chosen by the client. +- ~L~ = the cluster locator (a number, type is implementation-dependent) +- ~Map~ = a mapping of cluster locators to chains +- ~T~ = the target chain ID/name +- ~u~ = a unique opaque file name suffix, e.g. a GUID string +- ~F~ = a Machi file name, i.e., a concatenation of ~p^L^N^u~ + +** The details: cluster file append + +0. Cluster client chooses ~N~ and ~p~ (i.e., cluster namespace and + file prefix) and sends the append request to a Machi cluster member + via the Protocol Buffers "high" API. +1. Cluster bridge chooses ~T~ (i.e., target chain), based on criteria + such as disk utilization percentage. +2. Cluster bridge knows the cluster ~Map~ for namespace ~N~. +3. Cluster bridge choose some cluster locator value ~L~ such that + ~rs_hash_with_float(L,Map) = T~ (see algorithm below). +4. Cluster bridge sends its request to chain + ~T~: ~append_chunk(p,L,N,...) -> {ok,p^L^N^u,ByteOffset}~ +5. Cluster bridge forwards the reply tuple to the client. +6. Client stores/uses the file name ~F = p^L^N^u~. + +** The details: Cluster file read + +0. Cluster client sends the read request to a Machi cluster member via + the Protocol Buffers "high" API. +1. Cluster bridge parses the file name ~F~ to find + the values of ~L~ and ~N~ (recall, ~F = p^L^N^u~). +2. Cluster bridge knows the Cluster ~Map~ for type ~N~. +3. Cluster bridge calculates ~rs_hash_with_float(L,Map) = T~ +4. 
Cluster bridge sends request to chain ~T~: + ~read_chunk(F,...) ->~ ... reply +5. Cluster bridge forwards the reply to the client. + +** The details: calculating 'L' (the cluster locator number) to match a desired target chain + +1. We know ~Map~, the current cluster mapping for a cluster namespace ~N~. +2. We look inside of ~Map~, and we find all of the unit interval ranges + that map to our desired target chain ~T~. Let's call this list + ~MapList = [Range1=(start,end],Range2=(start,end],...]~. +3. In our example, ~T=Chain2~. The example ~Map~ contains a single + unit interval range for ~Chain2~, ~[(0.33,0.58]]~. +4. Choose a uniformly random number ~r~ on the unit interval. +5. Calculate the cluster locator ~L~ by mapping ~r~ onto the concatenation + of the cluster hash space range intervals in ~MapList~. For example, + if ~r=0.5~, then ~L = 0.33 + 0.5*(0.58-0.33) = 0.455~, which is + exactly in the middle of the ~(0.33,0.58]~ interval. + +** A bit more about the cluster namespaces's meaning and use + +For use by Riak CS, for example, we'd likely start with the following +namespaces ... working our way down the list as we add new features +and/or re-implement existing CS features. + +- "standard" = Chain length = 3, eventually consistency mode +- "reduced" = Chain length = 2, eventually consistency mode. +- "stanchion7" = Chain length = 7, strong consistency mode. Perhaps + use this namespace for the metadata required to re-implement the + operations that are performed by today's Stanchion application. + +We want the cluster framework to: + +- provide means of creating and managing + chains of different types, e.g., chain length, consistency mode. +- manage the mapping of cluster namespace + names to the chains in the system. +- provide query functions to map a cluster + namespace name to a cluster map, + e.g. ~get_cluster_latest_map("reduced") -> Map{generation=7,...}~. + +* 8. File migration (a.k.a. rebalancing/reparitioning/resharding/redistribution) + +** What is "migration"? + +This section describes Machi's file migration. Other storage systems +call this process as "rebalancing", "repartitioning", "resharding" or +"redistribution". +For Riak Core applications, it is called "handoff" and "ring resizing" +(depending on the context). +See also the [[http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsUserGuide.html#Balancer][Hadoop file balancer]] for another example of a data +migration process. + +As discussed in section 5, the client can have good reason for wanting +to have some control of the initial location of the file within the +chain. However, the chain manager has an ongoing interest in +balancing resources throughout the lifetime of the file. Disks will +get full, hardware will change, read workload will fluctuate, +etc etc. + +This document uses the word "migration" to describe moving data from +one Machi chain to another chain within a cluster system. + +A simple variation of the Random Slicing hash algorithm can easily +accommodate Machi's need to migrate files without interfering with +availability. Machi's migration task is much simpler due to the +immutable nature of Machi file data. + +** Change to Random Slicing + +The map used by the Random Slicing hash algorithm needs a few simple +changes to make file migration straightforward. 
+ +- Add a "generation number", a strictly increasing number (similar to + a Machi chain's "epoch number") that reflects the history of + changes made to the Random Slicing map +- Use a list of Random Slicing maps instead of a single map, one map + per chance that files may not have been migrated yet out of + that map. + +As an example: + +#+CAPTION: Illustration of 'Map', using four Machi chains + +[[./migration-3to4.png]] + +And the new Random Slicing map for some cluster namespace ~N~ might look +like this: + +| Generation number / Namespace | 7 / reduced | +|-------------------------------+-------------| +| SubMap | 1 | +|-------------------------------+-------------| +| Hash range | Chain ID | +|-------------------------------+-------------| +| 0.00 - 0.33 | Chain1 | +| 0.33 - 0.66 | Chain2 | +| 0.66 - 1.00 | Chain3 | +|-------------------------------+-------------| +| SubMap | 2 | +|-------------------------------+-------------| +| Hash range | Chain ID | +|-------------------------------+-------------| +| 0.00 - 0.25 | Chain1 | +| 0.25 - 0.33 | Chain4 | +| 0.33 - 0.58 | Chain2 | +| 0.58 - 0.66 | Chain4 | +| 0.66 - 0.91 | Chain3 | +| 0.91 - 1.00 | Chain4 | + +When a new Random Slicing map contains a single submap, then its use +is identical to the original Random Slicing algorithm. If the map +contains multiple submaps, then the access rules change a bit: + +- Write operations always go to the newest/largest submap. +- Read operations attempt to read from all unique submaps. + - Skip searching submaps that refer to the same chain ID. + - In this example, unit interval value 0.10 is mapped to Chain1 + by both submaps. + - Read from newest/largest submap to oldest/smallest submap. + - If not found in any submap, search a second time (to handle races + with file copying between submaps). + - If the requested data is found, optionally copy it directly to the + newest submap. (This is a variation of read repair (RR). RR here + accelerates the migration process and can reduce the number of + operations required to query servers in multiple submaps). + +The cluster manager is responsible for: + +- Managing the various generations of the cluster Random Slicing maps for + all namespaces. +- Distributing namespace maps to cluster bridges. +- Managing the processes that are responsible for copying "cold" data, + i.e., files data that is not regularly accessed, to its new submap + location. +- When migration of a file to its new chain is confirmed successful, + delete it from the old chain. + +In example map #7, the cluster manager will copy files with unit interval +assignments in ~(0.25,0.33]~, ~(0.58,0.66]~, and ~(0.91,1.00]~ from their +old locations in chain IDs Chain1/2/3 to their new chain, +Chain4. When the cluster manager is satisfied that all such files have +been copied to Chain4, then the cluster manager can create and +distribute a new map, such as: + +| Generation number / Namespace | 8 / reduced | +|-------------------------------+-------------| +| SubMap | 1 | +|-------------------------------+-------------| +| Hash range | Chain ID | +|-------------------------------+-------------| +| 0.00 - 0.25 | Chain1 | +| 0.25 - 0.33 | Chain4 | +| 0.33 - 0.58 | Chain2 | +| 0.58 - 0.66 | Chain4 | +| 0.66 - 0.91 | Chain3 | +| 0.91 - 1.00 | Chain4 | + +The HibariDB system performs data migrations in almost exactly this +manner. However, one important +limitation of HibariDB is not being able to +perform more than one migration at a time. HibariDB's data is +mutable. 
+
+The HibariDB system performs data migrations in almost exactly this
+manner.  However, HibariDB has one important limitation: it cannot
+perform more than one migration at a time.  HibariDB's data is
+mutable.  Mutation causes many problems when migrating data
+across two submaps; supporting three or more submaps was too complex
+to implement quickly and correctly.
+
+Fortunately for Machi, its file data is immutable, so Machi can
+easily manage many migrations in parallel, i.e., its submap list may
+be several maps long, each one for an in-progress file migration.
+
+* 9. Other considerations for FLU/sequencer implementations
+
+** Append to existing file when possible
+
+The sequencer should always assign new offsets to the latest/newest
+file for any prefix, as long as all of the following prerequisites are
+true:
+
+- The epoch has not changed.  (In AP mode, epoch change -> mandatory
+  file name suffix change.)
+- The cluster locator number is stable.
+- The latest file for prefix ~p~ is smaller than the maximum file size
+  in the FLU's configuration.
+
+The stability of the cluster locator number is an implementation detail that
+must be managed by the cluster bridge.
+
+Reuse of the same file is not possible if the bridge always chooses a
+different cluster locator number ~L~ or if the client always uses a unique
+file prefix ~p~.  The latter is a sign of a misbehaved client; the
+former is a sign of a poorly-implemented bridge.
+
+* 10. Acknowledgments
+
+The original "migration-4.png" and "migration-3to4.png" images
+come from the [[http://hibari.github.io/hibari-doc/images/migration-3to4.png][HibariDB documentation]].
+
diff --git a/doc/flu-and-chain-lifecycle.org b/doc/flu-and-chain-lifecycle.org
index 4672080..d81b326 100644
--- a/doc/flu-and-chain-lifecycle.org
+++ b/doc/flu-and-chain-lifecycle.org
@@ -14,10 +14,10 @@ complete yet, so we are working one small step at a time.
 + FLU and Chain Life Cycle Management
   + Terminology review
     + Terminology: Machi run-time components/services/thingies
-    + Terminology: Machi data structures
-    + Terminology: Cluster-of-cluster (CoC) data structures
+    + Terminology: Machi chain data structures
+    + Terminology: Machi cluster data structures
   + Overview of administrative life cycles
-    + Cluster-of-clusters (CoC) administrative life cycle
+    + Cluster administrative life cycle
     + Chain administrative life cycle
     + FLU server administrative life cycle
 + Quick admin: declarative management of Machi FLU and chain life cycles
@@ -57,10 +57,8 @@ complete yet, so we are working one small step at a time.
   quorum replication technique requires ~2F+1~ members in the general
   case.)
 
-+ Cluster: this word can be used interchangeably with "chain".
-
-+ Cluster-of-clusters: A collection of Machi clusters where files are
-  horizontally partitioned/sharded/distributed across
++ Cluster: A collection of Machi chains that are used to store files
+  in a horizontally partitioned/sharded/distributed manner.
 
 ** Terminology: Machi data structures
 
@@ -75,13 +73,13 @@ complete yet, so we are working one small step at a time.
   to another, e.g., when the chain is temporarily shortened by the
   failure of a member FLU server.
 
-** Terminology: Cluster-of-cluster (CoC) data structures
+** Terminology: Machi cluster data structures
 
 + Namespace: A collection of human-friendly names that are mapped to
   groups of Machi chains that provide the same type of storage
   service: consistency mode, replication policy, etc.
   + A single namespace name, e.g. ~normal-ec~, is paired with a single
-    CoC chart (see below).
+    cluster map (see below).
   + Example: ~normal-ec~ might be a collection of Machi chains in
     eventually-consistent mode that are of length=3.
+ Example: ~risky-ec~ might be a collection of Machi chains in @@ -89,32 +87,31 @@ complete yet, so we are working one small step at a time. + Example: ~mgmt-critical~ might be a collection of Machi chains in strongly-consistent mode that are of length=7. -+ CoC chart: Encodes the rules which partition/shard/distribute a - particular namespace across a group of chains that collectively - store the namespace's files. - + "chart: noun, a geographical map or plan, especially on used for - navigation by sea or air." ++ Cluster map: Encodes the rules which partition/shard/distribute + the files stored in a particular namespace across a group of chains + that collectively store the namespace's files. -+ Chain weight: A value assigned to each chain within a CoC chart ++ Chain weight: A value assigned to each chain within a cluster map structure that defines the relative storage capacity of a chain within the namespace. For example, a chain weight=150 has 50% more capacity than a chain weight=100. -+ CoC chart epoch: The version number assigned to a CoC chart. ++ Cluster map epoch: The version number assigned to a cluster map. * Overview of administrative life cycles -** Cluster-of-clusters (CoC) administrative life cycle +** Cluster administrative life cycle -+ CoC is first created -+ CoC adds namespaces (e.g. consistency policy + chain length policy) -+ CoC adds/removes chains to a namespace to increase/decrease the ++ Cluster is first created ++ Adds namespaces (e.g. consistency policy + chain length policy) to + the cluster ++ Chains are added to/removed from a namespace to increase/decrease the namespace's storage capacity. -+ CoC adjusts chain weights within a namespace, e.g., to shift files ++ Adjust chain weights within a namespace, e.g., to shift files within the namespace to chains with greater storage capacity resources and/or runtime I/O resources. -A CoC "file migration" is the process of moving files from one +A cluster "file migration" is the process of moving files from one namespace member chain to another for purposes of shifting & re-balancing storage capacity and/or runtime I/O capacity. @@ -155,7 +152,7 @@ described in this section. As described at the top of http://basho.github.io/machi/edoc/machi_lifecycle_mgr.html, the "rc.d" config files do not manage "policy". "Policy" is doing the right -thing with a Machi cluster-of-clusters from a systems administrator's +thing with a Machi cluster from a systems administrator's point of view. The "rc.d" config files can only implement decisions made according to policy. diff --git a/doc/process-protocol-module-overview.jpg b/doc/process-protocol-module-overview.jpg new file mode 100644 index 0000000..eb7accf Binary files /dev/null and b/doc/process-protocol-module-overview.jpg differ diff --git a/include/machi.hrl b/include/machi.hrl index f825556..7974fd2 100644 --- a/include/machi.hrl +++ b/include/machi.hrl @@ -43,3 +43,21 @@ -define(DEFAULT_COC_NAMESPACE, ""). -define(DEFAULT_COC_LOCATOR, 0). + +-record(ns_info, { + version = 0 :: machi_dt:namespace_version(), + name = <<>> :: machi_dt:namespace(), + locator = 0 :: machi_dt:locator() + }). + +-record(append_opts, { + chunk_extra = 0 :: machi_dt:chunk_size(), + preferred_file_name :: 'undefined' | machi_dt:file_name_s(), + flag_fail_preferred = false :: boolean() + }). + +-record(read_opts, { + no_checksum = false :: boolean(), + no_chunk = false :: boolean(), + needs_trimmed = false :: boolean() + }). 
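
The three records above work together with the revised client APIs that
appear later in this diff (e.g. machi_cr_client:append_chunk/6 and
read_chunk/6).  As a rough usage sketch only: the client pid, prefix,
and payload below are invented, and the exact return shapes may differ.

    -include("machi.hrl").

    append_then_read(Clnt) ->
        NSInfo = undefined,                  %% use the default namespace
        Chunk  = <<"Hello, world!">>,
        CSum   = <<>>,                       %% no client-supplied checksum
        Opts   = #append_opts{chunk_extra=0},
        {ok, {Offset, Size, File}} =
            machi_cr_client:append_chunk(Clnt, NSInfo, <<"pre">>, Chunk, CSum, Opts),
        %% Read the same bytes back.  'undefined' selects default read options;
        %% a #read_opts{} record could ask to skip checksums or chunk bodies.
        {ok, {ReadChunks, _Trimmed}} =
            machi_cr_client:read_chunk(Clnt, NSInfo, File, Offset, Size, undefined),
        %% ReadChunks is expected to contain one {File, Offset, Binary, CSum} tuple.
        {ok, ReadChunks}.
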
diff --git a/priv/make-faq.pl b/priv/make-faq.pl index 7edee07..b7a3089 100755 --- a/priv/make-faq.pl +++ b/priv/make-faq.pl @@ -36,7 +36,7 @@ while () { $indent = " " x ($count * 4); s/^#*\s*[0-9. ]*//; $anchor = "n$label"; - printf T1 "%s+ [%s %s](#%s)\n", $indent, $label, $_, $anchor; + printf T1 "%s+ [%s. %s](#%s)\n", $indent, $label, $_, $anchor; printf T2 "\n", $anchor; $line =~ s/(#+)\s*[0-9. ]*/$1 $label. /; print T2 $line; diff --git a/src/machi.proto b/src/machi.proto index 2645bde..a9ac513 100644 --- a/src/machi.proto +++ b/src/machi.proto @@ -170,12 +170,18 @@ message Mpb_AuthResp { // High level API: append_chunk() request & response message Mpb_AppendChunkReq { - required string coc_namespace = 1; - required uint32 coc_locator = 2; - required string prefix = 3; - required bytes chunk = 4; - required Mpb_ChunkCSum csum = 5; - optional uint32 chunk_extra = 6; + // General namespace arguments + /* In single chain/non-clustered environment, use namespace="" */ + required string namespace = 1; + + required string prefix = 10; + required bytes chunk = 11; + required Mpb_ChunkCSum csum = 12; + + optional uint32 chunk_extra = 20; + optional string preferred_file_name = 21; + /* Fail the operation if our preferred file name is not available */ + optional bool flag_fail_preferred = 22 [default=false]; } message Mpb_AppendChunkResp { @@ -187,7 +193,7 @@ message Mpb_AppendChunkResp { // High level API: write_chunk() request & response message Mpb_WriteChunkReq { - required Mpb_Chunk chunk = 1; + required Mpb_Chunk chunk = 10; } message Mpb_WriteChunkResp { @@ -197,19 +203,22 @@ message Mpb_WriteChunkResp { // High level API: read_chunk() request & response message Mpb_ReadChunkReq { - required Mpb_ChunkPos chunk_pos = 1; + // No namespace arguments are required because NS is embedded + // inside of the file name. + + required Mpb_ChunkPos chunk_pos = 10; // Use flag_no_checksum=non-zero to skip returning the chunk's checksum. // TODO: not implemented yet. - optional uint32 flag_no_checksum = 2 [default=0]; + optional bool flag_no_checksum = 20 [default=false]; // Use flag_no_chunk=non-zero to skip returning the chunk (which // only makes sense if flag_no_checksum is not set). // TODO: not implemented yet. - optional uint32 flag_no_chunk = 3 [default=0]; + optional bool flag_no_chunk = 21 [default=false]; // TODO: not implemented yet. - optional uint32 flag_needs_trimmed = 4 [default=0]; + optional bool flag_needs_trimmed = 22 [default=false]; } message Mpb_ReadChunkResp { @@ -245,6 +254,8 @@ message Mpb_ChecksumListResp { // High level API: list_files() request & response message Mpb_ListFilesReq { + // TODO: Add flag for file glob/regexp/other filter type + // TODO: What else could go wrong? 
} message Mpb_ListFilesResp { @@ -377,14 +388,20 @@ message Mpb_ProjectionV1 { // Low level API: append_chunk() message Mpb_LL_AppendChunkReq { - required Mpb_EpochID epoch_id = 1; - /* To avoid CoC use, use coc_namespace="" and coc_locator=0 */ - required string coc_namespace = 2; - required uint32 coc_locator = 3; - required string prefix = 4; - required bytes chunk = 5; - required Mpb_ChunkCSum csum = 6; - optional uint32 chunk_extra = 7; + // General namespace arguments + required uint32 namespace_version = 1; + required string namespace = 2; + required uint32 locator = 3; + + required Mpb_EpochID epoch_id = 10; + required string prefix = 11; + required bytes chunk = 12; + required Mpb_ChunkCSum csum = 13; + + optional uint32 chunk_extra = 20; + optional string preferred_file_name = 21; + /* Fail the operation if our preferred file name is not available */ + optional bool flag_fail_preferred = 22 [default=false]; } message Mpb_LL_AppendChunkResp { @@ -396,8 +413,12 @@ message Mpb_LL_AppendChunkResp { // Low level API: write_chunk() message Mpb_LL_WriteChunkReq { - required Mpb_EpochID epoch_id = 1; - required Mpb_Chunk chunk = 2; + // General namespace arguments + required uint32 namespace_version = 1; + required string namespace = 2; + + required Mpb_EpochID epoch_id = 10; + required Mpb_Chunk chunk = 11; } message Mpb_LL_WriteChunkResp { @@ -407,19 +428,23 @@ message Mpb_LL_WriteChunkResp { // Low level API: read_chunk() message Mpb_LL_ReadChunkReq { - required Mpb_EpochID epoch_id = 1; - required Mpb_ChunkPos chunk_pos = 2; + // General namespace arguments + required uint32 namespace_version = 1; + required string namespace = 2; + + required Mpb_EpochID epoch_id = 10; + required Mpb_ChunkPos chunk_pos = 11; // Use flag_no_checksum=non-zero to skip returning the chunk's checksum. // TODO: not implemented yet. - optional uint32 flag_no_checksum = 3 [default=0]; + optional bool flag_no_checksum = 20 [default=false]; // Use flag_no_chunk=non-zero to skip returning the chunk (which // only makes sense if flag_checksum is not set). // TODO: not implemented yet. 
- optional uint32 flag_no_chunk = 4 [default=0]; + optional bool flag_no_chunk = 21 [default=false]; - optional uint32 flag_needs_trimmed = 5 [default=0]; + optional bool flag_needs_trimmed = 22 [default=false]; } message Mpb_LL_ReadChunkResp { @@ -431,11 +456,16 @@ message Mpb_LL_ReadChunkResp { // Low level API: trim_chunk() message Mpb_LL_TrimChunkReq { - required Mpb_EpochID epoch_id = 1; - required string file = 2; - required uint64 offset = 3; - required uint32 size = 4; - optional uint32 trigger_gc = 5 [default=0]; + // General namespace arguments + required uint32 namespace_version = 1; + required string namespace = 2; + + required Mpb_EpochID epoch_id = 10; + required string file = 11; + required uint64 offset = 12; + required uint32 size = 13; + + optional bool trigger_gc = 20 [default=false]; } message Mpb_LL_TrimChunkResp { @@ -445,8 +475,7 @@ message Mpb_LL_TrimChunkResp { // Low level API: checksum_list() message Mpb_LL_ChecksumListReq { - required Mpb_EpochID epoch_id = 1; - required string file = 2; + required string file = 1; } message Mpb_LL_ChecksumListResp { @@ -477,7 +506,9 @@ message Mpb_LL_WedgeStatusReq { message Mpb_LL_WedgeStatusResp { required Mpb_GeneralStatusCode status = 1; optional Mpb_EpochID epoch_id = 2; - optional uint32 wedged_flag = 3; + optional bool wedged_flag = 3; + optional uint32 namespace_version = 4; + optional string namespace = 5; } // Low level API: delete_migration() diff --git a/src/machi_admin_util.erl b/src/machi_admin_util.erl index 46f6c3d..41a4b5f 100644 --- a/src/machi_admin_util.erl +++ b/src/machi_admin_util.erl @@ -90,15 +90,16 @@ verify_file_checksums_local2(Sock1, EpochID, Path0) -> end. verify_file_checksums_remote2(Sock1, EpochID, File) -> + NSInfo = undefined, ReadChunk = fun(File_name, Offset, Size) -> - ?FLU_C:read_chunk(Sock1, EpochID, - File_name, Offset, Size, []) + ?FLU_C:read_chunk(Sock1, NSInfo, EpochID, + File_name, Offset, Size, undefined) end, verify_file_checksums_common(Sock1, EpochID, File, ReadChunk). 
-verify_file_checksums_common(Sock1, EpochID, File, ReadChunk) -> +verify_file_checksums_common(Sock1, _EpochID, File, ReadChunk) -> try - case ?FLU_C:checksum_list(Sock1, EpochID, File) of + case ?FLU_C:checksum_list(Sock1, File) of {ok, InfoBin} -> Info = machi_csum_table:split_checksum_list_blob_decode(InfoBin), Res = lists:foldl(verify_chunk_checksum(File, ReadChunk), diff --git a/src/machi_basho_bench_driver.erl b/src/machi_basho_bench_driver.erl index 4d36328..4adc052 100644 --- a/src/machi_basho_bench_driver.erl +++ b/src/machi_basho_bench_driver.erl @@ -112,7 +112,7 @@ run(read, KeyGen, _ValueGen, #m{conn=Conn, max_key=MaxKey}=S) -> Idx = KeyGen() rem MaxKey, %% {File, Offset, Size, _CSum} = ets:lookup_element(?ETS_TAB, Idx, 2), {File, Offset, Size} = ets:lookup_element(?ETS_TAB, Idx, 2), - case machi_cr_client:read_chunk(Conn, File, Offset, Size, [], ?THE_TIMEOUT) of + case machi_cr_client:read_chunk(Conn, File, Offset, Size, undefined, ?THE_TIMEOUT) of {ok, _Chunk} -> {ok, S}; {error, _}=Err -> diff --git a/src/machi_chain_manager1.erl b/src/machi_chain_manager1.erl index 7f112d0..bdc142d 100644 --- a/src/machi_chain_manager1.erl +++ b/src/machi_chain_manager1.erl @@ -1909,7 +1909,7 @@ react_to_env_C100_inner(Author_latest, NotSanesDict0, _MyName, S2 = S#ch_mgr{not_sanes=NotSanesDict, sane_transitions=0}, case orddict:fetch(Author_latest, NotSanesDict) of N when N > ?TOO_FREQUENT_BREAKER -> - %% ?V("\n\nYOYO ~w breaking the cycle of:\n current: ~w\n new : ~w\n", [_MyName, machi_projection:make_summary(S#ch_mgr.proj), machi_projection:make_summary(P_latest)]), + ?V("\n\nYOYO ~w breaking the cycle insane-freq=~w by-author=~w of:\n current: ~w\n new : ~w\n", [_MyName, N, Author_latest, machi_projection:make_summary(S#ch_mgr.proj), machi_projection:make_summary(P_latest)]), ?REACT({c100, ?LINE, [{not_sanes_author_count, N}]}), react_to_env_C103(P_newprop, P_latest, P_current_calc, S2); N -> @@ -1937,7 +1937,7 @@ react_to_env_C103(#projection_v1{epoch_number=_Epoch_newprop} = _P_newprop, ?REACT({c103, ?LINE, [{current_epoch, P_current#projection_v1.epoch_number}, {none_projection_epoch, P_none#projection_v1.epoch_number}]}), - io:format(user, "SET add_admin_down(~w) at ~w =====================================\n", [MyName, time()]), + io:format(user, "SET add_admin_down(~w) at ~w current_epoch ~w none_proj_epoch ~w =====================================\n", [MyName, time(), P_current#projection_v1.epoch_number, P_none#projection_v1.epoch_number]), machi_fitness:add_admin_down(S#ch_mgr.fitness_svr, MyName, []), timer:sleep(5*1000), io:format(user, "SET delete_admin_down(~w) at ~w =====================================\n", [MyName, time()]), @@ -2979,12 +2979,13 @@ perhaps_verbose_c111(P_latest2, S) -> (S#ch_mgr.proj)#projection_v1.upi /= [] -> <> = P_latest2#projection_v1.epoch_csum, - io:format(user, "\n~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), (S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, S#ch_mgr.name]); + io:format(user, "~s CONFIRM epoch ~w ~w upi ~w rep ~w by ~w\n", [machi_util:pretty_time(), (S#ch_mgr.proj)#projection_v1.epoch_number, CSumRep, P_latest2#projection_v1.upi, P_latest2#projection_v1.repairing, S#ch_mgr.name]); true -> ok end, case proplists:get_value(private_write_verbose, S#ch_mgr.opts) of + %% case true of true when Summ2 /= Last2 -> put(last_verbose, Summ2), ?V("\n~s ~p uses plain: ~w \n", diff --git a/src/machi_chain_repair.erl b/src/machi_chain_repair.erl index 
ee12b20..052fb1c 100644 --- a/src/machi_chain_repair.erl +++ b/src/machi_chain_repair.erl @@ -207,7 +207,7 @@ make_repair_compare_fun(SrcFLU) -> T_a =< T_b end. -make_repair_directives(ConsistencyMode, RepairMode, File, Size, EpochID, +make_repair_directives(ConsistencyMode, RepairMode, File, Size, _EpochID, Verb, Src, FLUs0, ProxiesDict, ETS) -> true = (Size < ?MAX_OFFSET), FLUs = lists:usort(FLUs0), @@ -216,7 +216,7 @@ make_repair_directives(ConsistencyMode, RepairMode, File, Size, EpochID, Proxy = orddict:fetch(FLU, ProxiesDict), OffSzCs = case machi_proxy_flu1_client:checksum_list( - Proxy, EpochID, File, ?LONG_TIMEOUT) of + Proxy, File, ?LONG_TIMEOUT) of {ok, InfoBin} -> machi_csum_table:split_checksum_list_blob_decode(InfoBin); {error, no_such_file} -> @@ -236,7 +236,6 @@ make_repair_directives(ConsistencyMode, RepairMode, File, Size, EpochID, make_repair_directives2(C2, ConsistencyMode, RepairMode, File, Verb, Src, FLUs, ProxiesDict, ETS) -> - ?VERB("."), make_repair_directives3(C2, ConsistencyMode, RepairMode, File, Verb, Src, FLUs, ProxiesDict, ETS, []). @@ -266,7 +265,18 @@ make_repair_directives3([{Offset, Size, CSum, _FLU}=A|Rest0], %% byte range from all FLUs %% 3b. Log big warning about data loss. %% 4. Log any other checksum discrepencies as they are found. - exit({todo_repair_sanity_check, ?LINE, File, Offset, As}) + QQ = [begin + Pxy = orddict:fetch(FLU, ProxiesDict), + {ok, EpochID} = machi_proxy_flu1_client:get_epoch_id( + Pxy, ?SHORT_TIMEOUT), + NSInfo = undefined, + XX = machi_proxy_flu1_client:read_chunk( + Pxy, NSInfo, EpochID, File, Offset, Size, undefined, + ?SHORT_TIMEOUT), + {FLU, XX} + end || {__Offset, __Size, __CSum, FLU} <- As], + + exit({todo_repair_sanity_check, ?LINE, File, Offset, {as,As}, {qq,QQ}}) end, %% List construction guarantees us that there's at least one ?MAX_OFFSET %% item remains. Sort order + our "taking" of all exact Offset+Size @@ -319,23 +329,25 @@ execute_repair_directives(ap_mode=_ConsistencyMode, Ds, _Src, EpochID, Verb, {ProxiesDict, EpochID, Verb, ETS}, Ds), ok. 
-execute_repair_directive({File, Cmds}, {ProxiesDict, EpochID, Verb, ETS}=Acc) -> +execute_repair_directive({File, Cmds}, {ProxiesDict, EpochID, _Verb, ETS}=Acc) -> EtsKeys = [{in_files, t_in_files}, {in_chunks, t_in_chunks}, {in_bytes, t_in_bytes}, {out_files, t_out_files}, {out_chunks, t_out_chunks}, {out_bytes, t_out_bytes}], [ets:insert(ETS, {L_K, 0}) || {L_K, _T_K} <- EtsKeys], F = fun({copy, {Offset, Size, TaggedCSum, MySrc}, MyDsts}, Acc2) -> SrcP = orddict:fetch(MySrc, ProxiesDict), - case ets:lookup_element(ETS, in_chunks, 2) rem 100 of - 0 -> ?VERB(".", []); - _ -> ok - end, + %% case ets:lookup_element(ETS, in_chunks, 2) rem 100 of + %% 0 -> ?VERB(".2", []); + %% _ -> ok + %% end, _T1 = os:timestamp(), %% TODO: support case multiple written or trimmed chunks returned - {ok, {[{_, Offset, Chunk, _}], _}} = + NSInfo = undefined, + {ok, {[{_, Offset, Chunk, _ReadCSum}|OtherChunks], []=_TrimmedList}} = machi_proxy_flu1_client:read_chunk( - SrcP, EpochID, File, Offset, Size, [], + SrcP, NSInfo, EpochID, File, Offset, Size, undefined, ?SHORT_TIMEOUT), + [] = OtherChunks, _T2 = os:timestamp(), <<_Tag:1/binary, CSum/binary>> = TaggedCSum, case machi_util:checksum_chunk(Chunk) of @@ -344,7 +356,7 @@ execute_repair_directive({File, Cmds}, {ProxiesDict, EpochID, Verb, ETS}=Acc) -> DstP = orddict:fetch(DstFLU, ProxiesDict), _T3 = os:timestamp(), ok = machi_proxy_flu1_client:write_chunk( - DstP, EpochID, File, Offset, Chunk, + DstP, NSInfo, EpochID, File, Offset, Chunk, TaggedCSum, ?SHORT_TIMEOUT), _T4 = os:timestamp() end || DstFLU <- MyDsts], diff --git a/src/machi_cr_client.erl b/src/machi_cr_client.erl index cec7c6a..a726744 100644 --- a/src/machi_cr_client.erl +++ b/src/machi_cr_client.erl @@ -21,8 +21,9 @@ %% @doc Erlang API for the Machi client-implemented Chain Replication %% (CORFU-style) protocol. %% -%% See also the docs for {@link machi_flu1_client} for additional -%% details on data types and operation descriptions. +%% Please see {@link machi_flu1_client} the "Client API implemntation notes" +%% section for how this module relates to the rest of the client API +%% implementation. %% %% The API here is much simpler than the {@link machi_flu1_client} or %% {@link machi_proxy_flu1_client} APIs. This module's API is a @@ -43,64 +44,6 @@ %% %% Doc TODO: Once this API stabilizes, add all relevant data type details %% to the EDoc here. -%% -%% -%% === Missing API features === -%% -%% So far, there is one missing client API feature that ought to be -%% added to Machi in the near future: more flexible checksum -%% management. -%% -%% Add a `source' annotation to all checksums to indicate where the -%% checksum was calculated. For example, -%% -%%
    -%% -%%
  • Calculated by client that performed the original chunk append, -%%
  • -%% -%%
  • Calculated by the 1st Machi server to receive an -%% un-checksummed append request -%%
  • -%% -%%
  • Re-calculated by Machi to manage fewer checksums of blocks of -%% data larger than the original client-specified chunks. -%%
  • -%%
-%% -%% Client-side checksums would be the "strongest" type of -%% checksum, meaning that any data corruption (of the original -%% data and/or of the checksum itself) can be detected after the -%% client-side calculation. There are too many horror stories on -%% The Net about IP PDUs that are corrupted but unnoticed due to -%% weak TCP checksums, buggy hardware, buggy OS drivers, etc. -%% Checksum versioning is also desirable if/when the current checksum -%% implementation changes from SHA-1 to something else. -%% -%% -%% === Implementation notes === -%% -%% The major operation processing is implemented in a state machine-like -%% manner. Before attempting an operation `X', there's an initial -%% operation `pre-X' that takes care of updating the epoch id, -%% restarting client protocol proxies, and if there's any server -%% instability (e.g. some server is wedged), then insert some sleep -%% time. When the chain appears to have stabilized, then we try the `X' -%% operation again. -%% -%% Function name for the `pre-X' stuff is usually `X()', and the -%% function name for the `X' stuff is usually `X2()'. (I.e., the `X' -%% stuff follows after `pre-X' and therefore has a `2' suffix on the -%% function name.) -%% -%% In the case of read repair, there are two stages: find the value to -%% perform the repair, then perform the repair writes. In the case of -%% the repair writes, the `pre-X' function is named `read_repair3()', -%% and the `X' function is named `read_repair4()'. -%% -%% TODO: It would be nifty to lift the very-nearly-but-not-quite-boilerplate -%% of the `pre-X' functions into a single common function ... but I'm not -%% sure yet on how to do it without making the code uglier. -module(machi_cr_client). @@ -118,13 +61,11 @@ %% FLU1 API -export([ %% File API - append_chunk/3, append_chunk/4, - append_chunk/5, append_chunk/6, - append_chunk_extra/4, append_chunk_extra/5, - append_chunk_extra/6, append_chunk_extra/7, - write_chunk/4, write_chunk/5, - read_chunk/5, read_chunk/6, - trim_chunk/4, trim_chunk/5, + append_chunk/5, + append_chunk/6, append_chunk/7, + write_chunk/6, write_chunk/7, + read_chunk/6, read_chunk/7, + trim_chunk/5, trim_chunk/6, checksum_list/2, checksum_list/3, list_files/1, list_files/2, @@ -165,101 +106,61 @@ start_link(P_srvr_list, Opts) -> %% @doc Append a chunk (binary- or iolist-style) of data to a file %% with `Prefix'. -append_chunk(PidSpec, Prefix, Chunk) -> - append_chunk_extra(PidSpec, ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, - Prefix, Chunk, 0, ?DEFAULT_TIMEOUT). +append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum) -> + append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum, #append_opts{}, ?DEFAULT_TIMEOUT). %% @doc Append a chunk (binary- or iolist-style) of data to a file %% with `Prefix'. -append_chunk(PidSpec, Prefix, Chunk, Timeout) -> - append_chunk_extra(PidSpec, ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, - Prefix, Chunk, 0, Timeout). +append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum, #append_opts{}=Opts) -> + append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum, Opts, ?DEFAULT_TIMEOUT). %% @doc Append a chunk (binary- or iolist-style) of data to a file %% with `Prefix'. -append_chunk(PidSpec, CoC_Namespace, CoC_Locator, Prefix, Chunk) -> - append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator, - Prefix, Chunk, 0, ?DEFAULT_TIMEOUT). - -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix'. 
- -append_chunk(PidSpec, CoC_Namespace, CoC_Locator, Prefix, Chunk, Timeout) -> - append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator, - Prefix, Chunk, 0, Timeout). - -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix'. - -append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra) - when is_integer(ChunkExtra), ChunkExtra >= 0 -> - append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra, ?DEFAULT_TIMEOUT). - -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix'. - -append_chunk_extra(PidSpec, Prefix, Chunk, ChunkExtra, Timeout0) -> +append_chunk(PidSpec, NSInfo, Prefix, Chunk, CSum, #append_opts{}=Opts, Timeout0) -> + NSInfo2 = machi_util:ns_info_default(NSInfo), {TO, Timeout} = timeout(Timeout0), - gen_server:call(PidSpec, {req, {append_chunk_extra, - ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, - Prefix, - Chunk, ChunkExtra, TO}}, - Timeout). - -append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator, Prefix, Chunk, ChunkExtra) - when is_integer(ChunkExtra), ChunkExtra >= 0 -> - append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra, ?DEFAULT_TIMEOUT). - -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix'. - -append_chunk_extra(PidSpec, CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra, Timeout0) -> - {TO, Timeout} = timeout(Timeout0), - gen_server:call(PidSpec, {req, {append_chunk_extra, - CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, TO}}, + gen_server:call(PidSpec, {req, {append_chunk, + NSInfo2, Prefix, Chunk, CSum, Opts, TO}}, Timeout). %% @doc Write a chunk of data (that has already been -%% allocated/sequenced by an earlier append_chunk_extra() call) to +%% allocated/sequenced by an earlier append_chunk() call) to %% `File' at `Offset'. -write_chunk(PidSpec, File, Offset, Chunk) -> - write_chunk(PidSpec, File, Offset, Chunk, ?DEFAULT_TIMEOUT). +write_chunk(PidSpec, NSInfo, File, Offset, Chunk, CSum) -> + write_chunk(PidSpec, NSInfo, File, Offset, Chunk, CSum, ?DEFAULT_TIMEOUT). %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. -write_chunk(PidSpec, File, Offset, Chunk, Timeout0) -> +write_chunk(PidSpec, NSInfo, File, Offset, Chunk, CSum, Timeout0) -> {TO, Timeout} = timeout(Timeout0), - gen_server:call(PidSpec, {req, {write_chunk, File, Offset, Chunk, TO}}, + gen_server:call(PidSpec, {req, {write_chunk, NSInfo, File, Offset, Chunk, CSum, TO}}, Timeout). %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. -read_chunk(PidSpec, File, Offset, Size, Opts) -> - read_chunk(PidSpec, File, Offset, Size, Opts, ?DEFAULT_TIMEOUT). +read_chunk(PidSpec, NSInfo, File, Offset, Size, Opts) -> + read_chunk(PidSpec, NSInfo, File, Offset, Size, Opts, ?DEFAULT_TIMEOUT). %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. -read_chunk(PidSpec, File, Offset, Size, Opts, Timeout0) -> +read_chunk(PidSpec, NSInfo, File, Offset, Size, Opts, Timeout0) -> {TO, Timeout} = timeout(Timeout0), - gen_server:call(PidSpec, {req, {read_chunk, File, Offset, Size, Opts, TO}}, + gen_server:call(PidSpec, {req, {read_chunk, NSInfo, File, Offset, Size, Opts, TO}}, Timeout). %% @doc Trim a chunk of data of size `Size' from `File' at `Offset'. -trim_chunk(PidSpec, File, Offset, Size) -> - trim_chunk(PidSpec, File, Offset, Size, ?DEFAULT_TIMEOUT). +trim_chunk(PidSpec, NSInfo, File, Offset, Size) -> + trim_chunk(PidSpec, NSInfo, File, Offset, Size, ?DEFAULT_TIMEOUT). %% @doc Trim a chunk of data of size `Size' from `File' at `Offset'. 
-trim_chunk(PidSpec, File, Offset, Size, Timeout0) -> +trim_chunk(PidSpec, NSInfo, File, Offset, Size, Timeout0) -> {TO, Timeout} = timeout(Timeout0), - gen_server:call(PidSpec, {req, {trim_chunk, File, Offset, Size, TO}}, + gen_server:call(PidSpec, {req, {trim_chunk, NSInfo, File, Offset, Size, TO}}, Timeout). %% @doc Fetch the list of chunk checksums for `File'. @@ -324,28 +225,27 @@ code_change(_OldVsn, S, _Extra) -> %%%%%%%%%%%%%%%%%%%%%%%%%%% -handle_call2({append_chunk_extra, CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra, TO}, _From, S) -> - do_append_head(CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, 0, os:timestamp(), TO, S); -handle_call2({write_chunk, File, Offset, Chunk, TO}, _From, S) -> - do_write_head(File, Offset, Chunk, 0, os:timestamp(), TO, S); -handle_call2({read_chunk, File, Offset, Size, Opts, TO}, _From, S) -> - do_read_chunk(File, Offset, Size, Opts, 0, os:timestamp(), TO, S); -handle_call2({trim_chunk, File, Offset, Size, TO}, _From, S) -> - do_trim_chunk(File, Offset, Size, 0, os:timestamp(), TO, S); +handle_call2({append_chunk, NSInfo, + Prefix, Chunk, CSum, Opts, TO}, _From, S) -> + do_append_head(NSInfo, Prefix, + Chunk, CSum, Opts, 0, os:timestamp(), TO, S); +handle_call2({write_chunk, NSInfo, File, Offset, Chunk, CSum, TO}, _From, S) -> + do_write_head(NSInfo, File, Offset, Chunk, CSum, 0, os:timestamp(), TO, S); +handle_call2({read_chunk, NSInfo, File, Offset, Size, Opts, TO}, _From, S) -> + do_read_chunk(NSInfo, File, Offset, Size, Opts, 0, os:timestamp(), TO, S); +handle_call2({trim_chunk, NSInfo, File, Offset, Size, TO}, _From, S) -> + do_trim_chunk(NSInfo, File, Offset, Size, 0, os:timestamp(), TO, S); handle_call2({checksum_list, File, TO}, _From, S) -> do_checksum_list(File, 0, os:timestamp(), TO, S); handle_call2({list_files, TO}, _From, S) -> do_list_files(0, os:timestamp(), TO, S). -do_append_head(CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, 0=Depth, STime, TO, S) -> - do_append_head2(CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, Depth + 1, STime, TO, S); -do_append_head(CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, Depth, STime, TO, #state{proj=P}=S) -> - %% io:format(user, "head sleep1,", []), +do_append_head(NSInfo, Prefix, + Chunk, CSum, Opts, 0=Depth, STime, TO, S) -> + do_append_head2(NSInfo, Prefix, + Chunk, CSum, Opts, Depth + 1, STime, TO, S); +do_append_head(NSInfo, Prefix, + Chunk, CSum, Opts, Depth, STime, TO, #state{proj=P}=S) -> sleep_a_while(Depth), DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, if DiffMs > TO -> @@ -359,62 +259,61 @@ do_append_head(CoC_Namespace, CoC_Locator, Prefix, case S2#state.proj of P2 when P2 == undefined orelse P2#projection_v1.upi == [] -> - do_append_head(CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, Depth + 1, + do_append_head(NSInfo, Prefix, + Chunk, CSum, Opts, Depth + 1, STime, TO, S2); _ -> - do_append_head2(CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, Depth + 1, + do_append_head2(NSInfo, Prefix, + Chunk, CSum, Opts, Depth + 1, STime, TO, S2) end end. 
-do_append_head2(CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, Depth, STime, TO, +do_append_head2(NSInfo, Prefix, + Chunk, CSum, Opts, Depth, STime, TO, #state{proj=P}=S) -> [HeadFLU|_RestFLUs] = mutation_flus(P), case is_witness_flu(HeadFLU, P) of true -> case witnesses_use_our_epoch(S) of true -> - do_append_head3(CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, Depth, + do_append_head3(NSInfo, Prefix, + Chunk, CSum, Opts, Depth, STime, TO, S); false -> %% Bummer, go back to the beginning and retry. - do_append_head(CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, Depth, + do_append_head(NSInfo, Prefix, + Chunk, CSum, Opts, Depth, STime, TO, S) end; false -> - do_append_head3(CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, Depth, STime, TO, S) + do_append_head3(NSInfo, Prefix, + Chunk, CSum, Opts, Depth, STime, TO, S) end. -do_append_head3(CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, Depth, STime, TO, +do_append_head3(NSInfo, Prefix, + Chunk, CSum, Opts, Depth, STime, TO, #state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) -> [HeadFLU|RestFLUs] = non_witness_flus(mutation_flus(P), P), Proxy = orddict:fetch(HeadFLU, PD), - case ?FLU_PC:append_chunk_extra(Proxy, EpochID, - CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, ?TIMEOUT) of + case ?FLU_PC:append_chunk(Proxy, NSInfo, EpochID, + Prefix, Chunk, CSum, Opts, ?TIMEOUT) of {ok, {Offset, _Size, File}=_X} -> - do_append_midtail(RestFLUs, CoC_Namespace, CoC_Locator, Prefix, - File, Offset, Chunk, ChunkExtra, - [HeadFLU], 0, STime, TO, S); + do_wr_app_midtail(RestFLUs, NSInfo, Prefix, + File, Offset, Chunk, CSum, Opts, + [HeadFLU], 0, STime, TO, append, S); {error, bad_checksum}=BadCS -> {reply, BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_append_head(CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, Depth, STime, TO, S); + do_append_head(NSInfo, Prefix, + Chunk, CSum, Opts, Depth, STime, TO, S); {error, written} -> %% Implicit sequencing + this error = we don't know where this %% written block is. But we lost a race. Repeat, with a new %% sequencer assignment. - do_append_head(CoC_Namespace, CoC_Locator, Prefix, - Chunk, ChunkExtra, Depth, STime, TO, S); + do_append_head(NSInfo, Prefix, + Chunk, CSum, Opts, Depth, STime, TO, S); {error, trimmed} = Err -> %% TODO: behaviour {reply, Err, S}; @@ -423,17 +322,16 @@ do_append_head3(CoC_Namespace, CoC_Locator, Prefix, Prefix,iolist_size(Chunk)}) end. 
-do_append_midtail(RestFLUs, CoC_Namespace, CoC_Locator, Prefix, - File, Offset, Chunk, ChunkExtra, - Ws, Depth, STime, TO, S) +do_wr_app_midtail(RestFLUs, NSInfo, Prefix, + File, Offset, Chunk, CSum, Opts, + Ws, Depth, STime, TO, MyOp, S) when RestFLUs == [] orelse Depth == 0 -> - do_append_midtail2(RestFLUs, CoC_Namespace, CoC_Locator, Prefix, - File, Offset, Chunk, ChunkExtra, - Ws, Depth + 1, STime, TO, S); -do_append_midtail(_RestFLUs, CoC_Namespace, CoC_Locator, Prefix, File, - Offset, Chunk, ChunkExtra, - Ws, Depth, STime, TO, #state{proj=P}=S) -> - %% io:format(user, "midtail sleep2,", []), + do_wr_app_midtail2(RestFLUs, NSInfo, Prefix, + File, Offset, Chunk, CSum, Opts, + Ws, Depth + 1, STime, TO, MyOp, S); +do_wr_app_midtail(_RestFLUs, NSInfo, Prefix, File, + Offset, Chunk, CSum, Opts, + Ws, Depth, STime, TO, MyOp, #state{proj=P}=S) -> sleep_a_while(Depth), DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, if DiffMs > TO -> @@ -447,60 +345,66 @@ do_append_midtail(_RestFLUs, CoC_Namespace, CoC_Locator, Prefix, File, RestFLUs2 = mutation_flus(P2), case RestFLUs2 -- Ws of RestFLUs2 -> - %% None of the writes that we have done so far - %% are to FLUs that are in the RestFLUs2 list. - %% We are pessimistic here and assume that - %% those FLUs are permanently dead. Start - %% over with a new sequencer assignment, at - %% the 2nd have of the impl (we have already - %% slept & refreshed the projection). - if Prefix == undefined -> % atom! not binary()!! {error, partition}; - true -> - do_append_head2(CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra, - Depth, STime, TO, S2) + MyOp == append -> + %% None of the writes that we have done so + %% far are to FLUs that are in the + %% RestFLUs2 list. We are pessimistic + %% here and assume that those FLUs are + %% permanently dead. Start over with a + %% new sequencer assignment, at the 2nd + %% have of the impl (we have already slept + %% & refreshed the projection). + do_append_head2(NSInfo, + Prefix, Chunk, CSum, Opts, + Depth, STime, TO, S2); + MyOp == write -> + do_wr_app_midtail2(RestFLUs2, + NSInfo, + Prefix, File, Offset, + Chunk, CSum, Opts, + Ws, Depth + 1, STime, TO, + MyOp, S2) end; RestFLUs3 -> - do_append_midtail2(RestFLUs3, - CoC_Namespace, CoC_Locator, + do_wr_app_midtail2(RestFLUs3, + NSInfo, Prefix, File, Offset, - Chunk, ChunkExtra, - Ws, Depth + 1, STime, TO, S2) + Chunk, CSum, Opts, + Ws, Depth + 1, STime, TO, + MyOp, S2) end end end. 
-do_append_midtail2([], _CoC_Namespace, _CoC_Locator, +do_wr_app_midtail2([], _NSInfo, _Prefix, File, Offset, Chunk, - _ChunkExtra, _Ws, _Depth, _STime, _TO, S) -> - %% io:format(user, "ok!\n", []), + _CSum, _Opts, _Ws, _Depth, _STime, _TO, _MyOp, S) -> {reply, {ok, {Offset, chunk_wrapper_size(Chunk), File}}, S}; -do_append_midtail2([FLU|RestFLUs]=FLUs, CoC_Namespace, CoC_Locator, +do_wr_app_midtail2([FLU|RestFLUs]=FLUs, NSInfo, Prefix, File, Offset, Chunk, - ChunkExtra, Ws, Depth, STime, TO, + CSum, Opts, Ws, Depth, STime, TO, MyOp, #state{epoch_id=EpochID, proxies_dict=PD}=S) -> Proxy = orddict:fetch(FLU, PD), - case ?FLU_PC:write_chunk(Proxy, EpochID, File, Offset, Chunk, ?TIMEOUT) of + case ?FLU_PC:write_chunk(Proxy, NSInfo, EpochID, File, Offset, Chunk, CSum, ?TIMEOUT) of ok -> - %% io:format(user, "write ~w,", [FLU]), - do_append_midtail2(RestFLUs, CoC_Namespace, CoC_Locator, Prefix, + do_wr_app_midtail2(RestFLUs, NSInfo, Prefix, File, Offset, Chunk, - ChunkExtra, [FLU|Ws], Depth, STime, TO, S); + CSum, Opts, [FLU|Ws], Depth, STime, TO, MyOp, S); {error, bad_checksum}=BadCS -> %% TODO: alternate strategy? {reply, BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_append_midtail(FLUs, CoC_Namespace, CoC_Locator, Prefix, + do_wr_app_midtail(FLUs, NSInfo, Prefix, File, Offset, Chunk, - ChunkExtra, Ws, Depth, STime, TO, S); + CSum, Opts, Ws, Depth, STime, TO, MyOp, S); {error, written} -> %% We know what the chunk ought to be, so jump to the %% middle of read-repair. Resume = {append, Offset, iolist_size(Chunk), File}, - do_repair_chunk(FLUs, Resume, Chunk, [], File, Offset, + do_repair_chunk(FLUs, Resume, Chunk, CSum, [], NSInfo, File, Offset, iolist_size(Chunk), Depth, STime, S); {error, trimmed} = Err -> %% TODO: nothing can be done @@ -520,16 +424,15 @@ witnesses_use_our_epoch([FLU|RestFLUs], Proxy = orddict:fetch(FLU, PD), %% Check both that the EpochID is the same *and* not wedged! case ?FLU_PC:wedge_status(Proxy, ?TIMEOUT) of - {ok, {false, EID}} when EID == EpochID -> + {ok, {false, EID,_,_}} when EID == EpochID -> witnesses_use_our_epoch(RestFLUs, S); _Else -> false end. -do_write_head(File, Offset, Chunk, 0=Depth, STime, TO, S) -> - do_write_head2(File, Offset, Chunk, Depth + 1, STime, TO, S); -do_write_head(File, Offset, Chunk, Depth, STime, TO, #state{proj=P}=S) -> - %% io:format(user, "head sleep1,", []), +do_write_head(NSInfo, File, Offset, Chunk, CSum, 0=Depth, STime, TO, S) -> + do_write_head2(NSInfo, File, Offset, Chunk, CSum, Depth + 1, STime, TO, S); +do_write_head(NSInfo, File, Offset, Chunk, CSum, Depth, STime, TO, #state{proj=P}=S) -> sleep_a_while(Depth), DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, if DiffMs > TO -> @@ -543,30 +446,32 @@ do_write_head(File, Offset, Chunk, Depth, STime, TO, #state{proj=P}=S) -> case S2#state.proj of P2 when P2 == undefined orelse P2#projection_v1.upi == [] -> - do_write_head(File, Offset, Chunk, Depth + 1, + do_write_head(NSInfo, File, Offset, Chunk, CSum, Depth + 1, STime, TO, S2); _ -> - do_write_head2(File, Offset, Chunk, Depth + 1, + do_write_head2(NSInfo, File, Offset, Chunk, CSum, Depth + 1, STime, TO, S2) end end. 
-do_write_head2(File, Offset, Chunk, Depth, STime, TO, +do_write_head2(NSInfo, File, Offset, Chunk, CSum, Depth, STime, TO, #state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) -> [HeadFLU|RestFLUs] = mutation_flus(P), Proxy = orddict:fetch(HeadFLU, PD), - case ?FLU_PC:write_chunk(Proxy, EpochID, File, Offset, Chunk, ?TIMEOUT) of + case ?FLU_PC:write_chunk(Proxy, NSInfo, EpochID, File, Offset, Chunk, CSum, ?TIMEOUT) of ok -> %% From this point onward, we use the same code & logic path as %% append does. - do_append_midtail(RestFLUs, undefined, undefined, undefined, + Prefix=unused_write_path, + Opts=unused_write_path, + do_wr_app_midtail(RestFLUs, NSInfo, Prefix, File, Offset, Chunk, - undefined, [HeadFLU], 0, STime, TO, S); + CSum, Opts, [HeadFLU], 0, STime, TO, write, S); {error, bad_checksum}=BadCS -> {reply, BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_write_head(File, Offset, Chunk, Depth, STime, TO, S); + do_write_head(NSInfo, File, Offset, Chunk, CSum, Depth, STime, TO, S); {error, written}=Err -> {reply, Err, S}; {error, trimmed}=Err -> @@ -576,10 +481,10 @@ do_write_head2(File, Offset, Chunk, Depth, STime, TO, iolist_size(Chunk)}) end. -do_read_chunk(File, Offset, Size, Opts, 0=Depth, STime, TO, +do_read_chunk(NSInfo, File, Offset, Size, Opts, 0=Depth, STime, TO, #state{proj=#projection_v1{upi=[_|_]}}=S) -> % UPI is non-empty - do_read_chunk2(File, Offset, Size, Opts, Depth + 1, STime, TO, S); -do_read_chunk(File, Offset, Size, Opts, Depth, STime, TO, #state{proj=P}=S) -> + do_read_chunk2(NSInfo, File, Offset, Size, Opts, Depth + 1, STime, TO, S); +do_read_chunk(NSInfo, File, Offset, Size, Opts, Depth, STime, TO, #state{proj=P}=S) -> sleep_a_while(Depth), DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, if DiffMs > TO -> @@ -589,18 +494,18 @@ do_read_chunk(File, Offset, Size, Opts, Depth, STime, TO, #state{proj=P}=S) -> case S2#state.proj of P2 when P2 == undefined orelse P2#projection_v1.upi == [] -> - do_read_chunk(File, Offset, Size, Opts, Depth + 1, STime, TO, S2); + do_read_chunk(NSInfo, File, Offset, Size, Opts, Depth + 1, STime, TO, S2); _ -> - do_read_chunk2(File, Offset, Size, Opts, Depth + 1, STime, TO, S2) + do_read_chunk2(NSInfo, File, Offset, Size, Opts, Depth + 1, STime, TO, S2) end end. 
-do_read_chunk2(File, Offset, Size, Opts, Depth, STime, TO, +do_read_chunk2(NSInfo, File, Offset, Size, Opts, Depth, STime, TO, #state{proj=P, epoch_id=EpochID, proxies_dict=PD}=S) -> UPI = readonly_flus(P), Tail = lists:last(UPI), ConsistencyMode = P#projection_v1.mode, - case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), EpochID, + case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), NSInfo, EpochID, File, Offset, Size, Opts, ?TIMEOUT) of {ok, {Chunks, Trimmed}} when is_list(Chunks), is_list(Trimmed) -> %% After partition heal, there could happen that heads may @@ -625,9 +530,9 @@ do_read_chunk2(File, Offset, Size, Opts, Depth, STime, TO, {reply, BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_read_chunk(File, Offset, Size, Opts, Depth, STime, TO, S); + do_read_chunk(NSInfo, File, Offset, Size, Opts, Depth, STime, TO, S); {error, not_written} -> - read_repair(ConsistencyMode, read, File, Offset, Size, Depth, STime, S); + read_repair(ConsistencyMode, read, NSInfo, File, Offset, Size, Depth, STime, S); %% {reply, {error, not_written}, S}; {error, written} -> exit({todo_should_never_happen,?MODULE,?LINE,File,Offset,Size}); @@ -635,10 +540,10 @@ do_read_chunk2(File, Offset, Size, Opts, Depth, STime, TO, {reply, Err, S} end. -do_trim_chunk(File, Offset, Size, 0=Depth, STime, TO, S) -> - do_trim_chunk(File, Offset, Size, Depth+1, STime, TO, S); +do_trim_chunk(NSInfo, File, Offset, Size, 0=Depth, STime, TO, S) -> + do_trim_chunk(NSInfo, File, Offset, Size, Depth+1, STime, TO, S); -do_trim_chunk(File, Offset, Size, Depth, STime, TO, #state{proj=P}=S) -> +do_trim_chunk(NSInfo, File, Offset, Size, Depth, STime, TO, #state{proj=P}=S) -> sleep_a_while(Depth), DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, if DiffMs > TO -> @@ -652,42 +557,41 @@ do_trim_chunk(File, Offset, Size, Depth, STime, TO, #state{proj=P}=S) -> case S2#state.proj of P2 when P2 == undefined orelse P2#projection_v1.upi == [] -> - do_trim_chunk(File, Offset, Size, Depth + 1, + do_trim_chunk(NSInfo, File, Offset, Size, Depth + 1, STime, TO, S2); _ -> - do_trim_chunk2(File, Offset, Size, Depth + 1, + do_trim_chunk2(NSInfo, File, Offset, Size, Depth + 1, STime, TO, S2) end end. -do_trim_chunk2(File, Offset, Size, Depth, STime, TO, +do_trim_chunk2(NSInfo, File, Offset, Size, Depth, STime, TO, #state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) -> [HeadFLU|RestFLUs] = mutation_flus(P), Proxy = orddict:fetch(HeadFLU, PD), - case ?FLU_PC:trim_chunk(Proxy, EpochID, File, Offset, Size, ?TIMEOUT) of + case ?FLU_PC:trim_chunk(Proxy, NSInfo, EpochID, File, Offset, Size, ?TIMEOUT) of ok -> - do_trim_midtail(RestFLUs, undefined, File, Offset, Size, + do_trim_midtail(RestFLUs, undefined, NSInfo, File, Offset, Size, [HeadFLU], 0, STime, TO, S); {error, trimmed} -> %% Maybe the trim had failed in the middle of the tail so re-run %% trim accross the whole chain. - do_trim_midtail(RestFLUs, undefined, File, Offset, Size, + do_trim_midtail(RestFLUs, undefined, NSInfo, File, Offset, Size, [HeadFLU], 0, STime, TO, S); {error, bad_checksum}=BadCS -> {reply, BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_trim_chunk(File, Offset, Size, Depth, STime, TO, S) + do_trim_chunk(NSInfo, File, Offset, Size, Depth, STime, TO, S) end. 
-do_trim_midtail(RestFLUs, Prefix, File, Offset, Size, +do_trim_midtail(RestFLUs, Prefix, NSInfo, File, Offset, Size, Ws, Depth, STime, TO, S) when RestFLUs == [] orelse Depth == 0 -> - do_trim_midtail2(RestFLUs, Prefix, File, Offset, Size, + do_trim_midtail2(RestFLUs, Prefix, NSInfo, File, Offset, Size, Ws, Depth + 1, STime, TO, S); -do_trim_midtail(_RestFLUs, Prefix, File, Offset, Size, +do_trim_midtail(_RestFLUs, Prefix, NSInfo, File, Offset, Size, Ws, Depth, STime, TO, #state{proj=P}=S) -> - %% io:format(user, "midtail sleep2,", []), sleep_a_while(Depth), DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, if DiffMs > TO -> @@ -712,38 +616,36 @@ do_trim_midtail(_RestFLUs, Prefix, File, Offset, Size, if Prefix == undefined -> % atom! not binary()!! {error, partition}; true -> - do_trim_chunk(Prefix, Offset, Size, + do_trim_chunk(NSInfo, Prefix, Offset, Size, Depth, STime, TO, S2) end; RestFLUs3 -> - do_trim_midtail2(RestFLUs3, Prefix, File, Offset, Size, + do_trim_midtail2(RestFLUs3, Prefix, NSInfo, File, Offset, Size, Ws, Depth + 1, STime, TO, S2) end end end. -do_trim_midtail2([], _Prefix, _File, _Offset, _Size, +do_trim_midtail2([], _Prefix, _NSInfo, _File, _Offset, _Size, _Ws, _Depth, _STime, _TO, S) -> - %% io:format(user, "ok!\n", []), {reply, ok, S}; -do_trim_midtail2([FLU|RestFLUs]=FLUs, Prefix, File, Offset, Size, +do_trim_midtail2([FLU|RestFLUs]=FLUs, Prefix, NSInfo, File, Offset, Size, Ws, Depth, STime, TO, #state{epoch_id=EpochID, proxies_dict=PD}=S) -> Proxy = orddict:fetch(FLU, PD), - case ?FLU_PC:trim_chunk(Proxy, EpochID, File, Offset, Size, ?TIMEOUT) of + case ?FLU_PC:trim_chunk(Proxy, NSInfo, EpochID, File, Offset, Size, ?TIMEOUT) of ok -> - %% io:format(user, "write ~w,", [FLU]), - do_trim_midtail2(RestFLUs, Prefix, File, Offset, Size, + do_trim_midtail2(RestFLUs, Prefix, NSInfo, File, Offset, Size, [FLU|Ws], Depth, STime, TO, S); {error, trimmed} -> - do_trim_midtail2(RestFLUs, Prefix, File, Offset, Size, + do_trim_midtail2(RestFLUs, Prefix, NSInfo, File, Offset, Size, [FLU|Ws], Depth, STime, TO, S); {error, bad_checksum}=BadCS -> %% TODO: alternate strategy? {reply, BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_trim_midtail(FLUs, Prefix, File, Offset, Size, + do_trim_midtail(FLUs, Prefix, NSInfo, File, Offset, Size, Ws, Depth, STime, TO, S) end. @@ -759,11 +661,11 @@ do_trim_midtail2([FLU|RestFLUs]=FLUs, Prefix, File, Offset, Size, %% Never matches because Depth is always incremented beyond 0 prior to %% getting here. 
%% -%% read_repair(ConsistencyMode, ReturnMode, File, Offset, Size, 0=Depth, +%% read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, 0=Depth, %% STime, #state{proj=#projection_v1{upi=[_|_]}}=S) -> % UPI is non-empty -%% read_repair2(ConsistencyMode, ReturnMode, File, Offset, Size, Depth + 1, +%% read_repair2(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, Depth + 1, %% STime, S); -read_repair(ConsistencyMode, ReturnMode, File, Offset, Size, Depth, +read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, Depth, STime, #state{proj=P}=S) -> sleep_a_while(Depth), DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, @@ -774,26 +676,26 @@ read_repair(ConsistencyMode, ReturnMode, File, Offset, Size, Depth, case S2#state.proj of P2 when P2 == undefined orelse P2#projection_v1.upi == [] -> - read_repair(ConsistencyMode, ReturnMode, File, Offset, + read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, Depth + 1, STime, S2); _ -> - read_repair2(ConsistencyMode, ReturnMode, File, Offset, + read_repair2(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, Depth + 1, STime, S2) end end. read_repair2(cp_mode=ConsistencyMode, - ReturnMode, File, Offset, Size, Depth, STime, + ReturnMode, NSInfo, File, Offset, Size, Depth, STime, #state{proj=P, epoch_id=EpochID, proxies_dict=PD}=S) -> %% TODO WTF was I thinking here??.... Tail = lists:last(readonly_flus(P)), - case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), EpochID, - File, Offset, Size, [], ?TIMEOUT) of + case ?FLU_PC:read_chunk(orddict:fetch(Tail, PD), NSInfo, EpochID, + File, Offset, Size, undefined, ?TIMEOUT) of {ok, Chunks} when is_list(Chunks) -> %% TODO: change to {Chunks, Trimmed} and have them repaired ToRepair = mutation_flus(P) -- [Tail], {Reply, S1} = do_repair_chunks(Chunks, ToRepair, ReturnMode, - [Tail], File, Depth, STime, S, {ok, Chunks}), + [Tail], NSInfo, File, Depth, STime, S, {ok, Chunks}), {reply, Reply, S1}; %% {ok, BadChunk} -> %% exit({todo, bad_chunk_size, ?MODULE, ?LINE, File, Offset, @@ -803,7 +705,7 @@ read_repair2(cp_mode=ConsistencyMode, {reply, BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - read_repair(ConsistencyMode, ReturnMode, File, Offset, + read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, Depth, STime, S); {error, not_written} -> {reply, {error, not_written}, S}; @@ -816,24 +718,23 @@ read_repair2(cp_mode=ConsistencyMode, exit({todo_should_repair_unlinked_files, ?MODULE, ?LINE, File}) end; read_repair2(ap_mode=ConsistencyMode, - ReturnMode, File, Offset, Size, Depth, STime, + ReturnMode, NSInfo, File, Offset, Size, Depth, STime, #state{proj=P}=S) -> Eligible = mutation_flus(P), - case try_to_find_chunk(Eligible, File, Offset, Size, S) of + case try_to_find_chunk(Eligible, NSInfo, File, Offset, Size, S) of {ok, {Chunks, _Trimmed}, GotItFrom} when is_list(Chunks) -> %% TODO: Repair trimmed chunks ToRepair = mutation_flus(P) -- [GotItFrom], - {Reply0, S1} = do_repair_chunks(Chunks, ToRepair, ReturnMode, [GotItFrom], - File, Depth, STime, S, {ok, Chunks}), - {ok, Chunks} = Reply0, - Reply = {ok, {Chunks, _Trimmed}}, + Reply = {ok, {Chunks, []}}, + {Reply, S1} = do_repair_chunks(Chunks, ToRepair, ReturnMode, [GotItFrom], + NSInfo, File, Depth, STime, S, Reply), {reply, Reply, S1}; {error, bad_checksum}=BadCS -> %% TODO: alternate strategy? 
{reply, BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - read_repair(ConsistencyMode, ReturnMode, File, + read_repair(ConsistencyMode, ReturnMode, NSInfo, File, Offset, Size, Depth, STime, S); {error, not_written} -> {reply, {error, not_written}, S}; @@ -845,22 +746,22 @@ read_repair2(ap_mode=ConsistencyMode, exit({todo_should_repair_unlinked_files, ?MODULE, ?LINE, File}) end. -do_repair_chunks([], _, _, _, _, _, _, S, Reply) -> +do_repair_chunks([], _, _, _, _, _, _, _, S, Reply) -> {Reply, S}; -do_repair_chunks([{_, Offset, Chunk, _Csum}|T], - ToRepair, ReturnMode, [GotItFrom], File, Depth, STime, S, Reply) -> +do_repair_chunks([{_, Offset, Chunk, CSum}|T], + ToRepair, ReturnMode, [GotItFrom], NSInfo, File, Depth, STime, S, Reply) -> + true = not is_atom(CSum), Size = iolist_size(Chunk), - case do_repair_chunk(ToRepair, ReturnMode, Chunk, [GotItFrom], File, Offset, + case do_repair_chunk(ToRepair, ReturnMode, Chunk, CSum, [GotItFrom], NSInfo, File, Offset, Size, Depth, STime, S) of - {ok, Chunk, S1} -> - do_repair_chunks(T, ToRepair, ReturnMode, [GotItFrom], File, Depth, STime, S1, Reply); + {reply, {ok, _}, S1} -> + do_repair_chunks(T, ToRepair, ReturnMode, [GotItFrom], NSInfo, File, Depth, STime, S1, Reply); Error -> Error end. -do_repair_chunk(ToRepair, ReturnMode, Chunk, Repaired, File, Offset, +do_repair_chunk(ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, Offset, Size, Depth, STime, #state{proj=P}=S) -> - %% io:format(user, "read_repair3 sleep1,", []), sleep_a_while(Depth), DiffMs = timer:now_diff(os:timestamp(), STime) div 1000, if DiffMs > ?MAX_RUNTIME -> @@ -870,42 +771,42 @@ do_repair_chunk(ToRepair, ReturnMode, Chunk, Repaired, File, Offset, case S2#state.proj of P2 when P2 == undefined orelse P2#projection_v1.upi == [] -> - do_repair_chunk(ToRepair, ReturnMode, Chunk, Repaired, File, + do_repair_chunk(ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, Offset, Size, Depth + 1, STime, S2); P2 -> ToRepair2 = mutation_flus(P2) -- Repaired, - do_repair_chunk2(ToRepair2, ReturnMode, Chunk, Repaired, File, + do_repair_chunk2(ToRepair2, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, Offset, Size, Depth + 1, STime, S2) end end. -do_repair_chunk2([], ReturnMode, Chunk, _Repaired, File, Offset, +do_repair_chunk2([], ReturnMode, Chunk, CSum, _Repaired, _NSInfo, File, Offset, _IgnoreSize, _Depth, _STime, S) -> %% TODO: add stats for # of repairs, length(_Repaired)-1, etc etc? case ReturnMode of read -> - {ok, Chunk, S}; + {reply, {ok, {[{File, Offset, Chunk, CSum}], []}}, S}; {append, Offset, Size, File} -> - {ok, {Offset, Size, File}, S} + {reply, {ok, {[{Offset, Size, File}], []}}, S} end; -do_repair_chunk2([First|Rest]=ToRepair, ReturnMode, Chunk, Repaired, File, Offset, +do_repair_chunk2([First|Rest]=ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, Offset, Size, Depth, STime, #state{epoch_id=EpochID, proxies_dict=PD}=S) -> Proxy = orddict:fetch(First, PD), - case ?FLU_PC:write_chunk(Proxy, EpochID, File, Offset, Chunk, ?TIMEOUT) of + case ?FLU_PC:write_chunk(Proxy, NSInfo, EpochID, File, Offset, Chunk, CSum, ?TIMEOUT) of ok -> - do_repair_chunk2(Rest, ReturnMode, Chunk, [First|Repaired], File, + do_repair_chunk2(Rest, ReturnMode, Chunk, CSum, [First|Repaired], NSInfo, File, Offset, Size, Depth, STime, S); {error, bad_checksum}=BadCS -> %% TODO: alternate strategy? 
{BadCS, S}; {error, Retry} when Retry == partition; Retry == bad_epoch; Retry == wedged -> - do_repair_chunk(ToRepair, ReturnMode, Chunk, Repaired, File, + do_repair_chunk(ToRepair, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, Offset, Size, Depth, STime, S); {error, written} -> %% TODO: To be very paranoid, read the chunk here to verify %% that it is exactly our Chunk. - do_repair_chunk2(Rest, ReturnMode, Chunk, Repaired, File, + do_repair_chunk2(Rest, ReturnMode, Chunk, CSum, Repaired, NSInfo, File, Offset, Size, Depth, STime, S); {error, trimmed} = _Error -> %% TODO @@ -937,9 +838,9 @@ do_checksum_list(File, Depth, STime, TO, #state{proj=P}=S) -> end. do_checksum_list2(File, Depth, STime, TO, - #state{epoch_id=EpochID, proj=P, proxies_dict=PD}=S) -> + #state{proj=P, proxies_dict=PD}=S) -> Proxy = orddict:fetch(lists:last(readonly_flus(P)), PD), - case ?FLU_PC:checksum_list(Proxy, EpochID, File, ?TIMEOUT) of + case ?FLU_PC:checksum_list(Proxy, File, ?TIMEOUT) of {ok, _}=OK -> {reply, OK, S}; {error, Retry} @@ -1025,11 +926,13 @@ update_proj2(Count, #state{bad_proj=BadProj, proxies_dict=ProxiesDict, update_proj2(Count + 1, S); P when P >= BadProj -> #projection_v1{epoch_number=Epoch, epoch_csum=CSum, - members_dict=NewMembersDict} = P, + members_dict=NewMembersDict, dbg2=Dbg2} = P, EpochID = {Epoch, CSum}, ?FLU_PC:stop_proxies(ProxiesDict), NewProxiesDict = ?FLU_PC:start_proxies(NewMembersDict), - S#state{bad_proj=undefined, proj=P, epoch_id=EpochID, + %% Make crash reports shorter by getting rid of 'react' history. + P2 = P#projection_v1{dbg2=lists:keydelete(react, 1, Dbg2)}, + S#state{bad_proj=undefined, proj=P2, epoch_id=EpochID, members_dict=NewMembersDict, proxies_dict=NewProxiesDict}; _P -> sleep_a_while(Count), @@ -1074,14 +977,14 @@ choose_best_proj(Rs) -> BestProj end, ?WORST_PROJ, Rs). -try_to_find_chunk(Eligible, File, Offset, Size, +try_to_find_chunk(Eligible, NSInfo, File, Offset, Size, #state{epoch_id=EpochID, proxies_dict=PD}) -> Timeout = 2*1000, Work = fun(FLU) -> Proxy = orddict:fetch(FLU, PD), - case ?FLU_PC:read_chunk(Proxy, EpochID, + case ?FLU_PC:read_chunk(Proxy, NSInfo, EpochID, %% TODO Trimmed is required here - File, Offset, Size, []) of + File, Offset, Size, undefined) of {ok, {_Chunks, _} = ChunksAndTrimmed} -> {FLU, {ok, ChunksAndTrimmed}}; Else -> diff --git a/src/machi_dt.erl b/src/machi_dt.erl index daf26dd..6a57e86 100644 --- a/src/machi_dt.erl +++ b/src/machi_dt.erl @@ -20,18 +20,18 @@ -module(machi_dt). +-include("machi.hrl"). -include("machi_projection.hrl"). --type chunk() :: chunk_bin() | {chunk_csum(), chunk_bin()}. --type chunk_bin() :: binary() | iolist(). % client can use either --type chunk_csum() :: binary(). % 1 byte tag, N-1 bytes checksum --type chunk_summary() :: {file_offset(), chunk_size(), binary()}. --type chunk_s() :: 'trimmed' | binary(). +-type append_opts() :: #append_opts{}. +-type chunk() :: chunk_bin() | iolist(). % client can choose either rep. +-type chunk_bin() :: binary(). % server returns binary() only. +-type chunk_csum() :: <<>> | chunk_csum_bin() | {csum_tag(), binary()}. +-type chunk_csum_bin() :: binary(). % 1 byte tag, N-1 bytes checksum +-type chunk_cstrm() :: 'trimmed' | chunk_csum(). +-type chunk_summary() :: {file_offset(), chunk_size(), chunk_bin(), chunk_cstrm()}. -type chunk_pos() :: {file_offset(), chunk_size(), file_name_s()}. -type chunk_size() :: non_neg_integer(). --type coc_namespace() :: string(). --type coc_nl() :: {coc, coc_namespace(), coc_locator()}. --type coc_locator() :: non_neg_integer(). 
-type error_general() :: 'bad_arg' | 'wedged' | 'bad_checksum'. -type epoch_csum() :: binary(). -type epoch_num() :: -1 | non_neg_integer(). @@ -44,8 +44,14 @@ -type file_prefix() :: binary() | list(). -type inet_host() :: inet:ip_address() | inet:hostname(). -type inet_port() :: inet:port_number(). +-type locator() :: number(). +-type namespace() :: binary(). +-type namespace_version() :: non_neg_integer(). +-type ns_info() :: #ns_info{}. -type projection() :: #projection_v1{}. -type projection_type() :: 'public' | 'private'. +-type read_opts() :: #read_opts{}. +-type read_opts_x() :: 'undefined' | 'noopt' | 'none' | #read_opts{}. %% Tags that stand for how that checksum was generated. See %% machi_util:make_tagged_csum/{1,2} for further documentation and @@ -53,17 +59,15 @@ -type csum_tag() :: none | client_sha | server_sha | server_regen_sha. -export_type([ + append_opts/0, chunk/0, chunk_bin/0, chunk_csum/0, - csum_tag/0, + chunk_csum_bin/0, + chunk_cstrm/0, chunk_summary/0, - chunk_s/0, chunk_pos/0, chunk_size/0, - coc_namespace/0, - coc_nl/0, - coc_locator/0, error_general/0, epoch_csum/0, epoch_num/0, @@ -76,7 +80,13 @@ file_prefix/0, inet_host/0, inet_port/0, + locator/0, + namespace/0, + namespace_version/0, + ns_info/0, projection/0, - projection_type/0 + projection_type/0, + read_opts/0, + read_opts_x/0 ]). diff --git a/src/machi_file_proxy.erl b/src/machi_file_proxy.erl index cae292c..bc9a539 100644 --- a/src/machi_file_proxy.erl +++ b/src/machi_file_proxy.erl @@ -141,18 +141,18 @@ sync(_Pid, Type) -> Data :: binary(), Checksum :: binary()}]} | {error, Reason :: term()}. read(Pid, Offset, Length) -> - read(Pid, Offset, Length, []). + read(Pid, Offset, Length, #read_opts{}). -spec read(Pid :: pid(), Offset :: non_neg_integer(), Length :: non_neg_integer(), - [{no_checksum|no_chunk|needs_trimmed, boolean()}]) -> + machi_dt:read_opts_x()) -> {ok, [{Filename::string(), Offset :: non_neg_integer(), Data :: binary(), Checksum :: binary()}]} | {error, Reason :: term()}. 
-read(Pid, Offset, Length, Opts) when is_pid(Pid) andalso is_integer(Offset) andalso Offset >= 0 - andalso is_integer(Length) andalso Length > 0 - andalso is_list(Opts) -> +read(Pid, Offset, Length, #read_opts{}=Opts) + when is_pid(Pid) andalso is_integer(Offset) andalso Offset >= 0 + andalso is_integer(Length) andalso Length > 0 -> gen_server:call(Pid, {read, Offset, Length, Opts}, ?TIMEOUT); read(_Pid, Offset, Length, Opts) -> lager:warning("Bad args to read: Offset ~p, Length ~p, Options ~p", [Offset, Length, Opts]), @@ -298,15 +298,15 @@ handle_call({read, Offset, Length, Opts}, _From, }) -> %% TODO: use these options - NoChunk prevents reading from disks %% NoChecksum doesn't check checksums - NoChecksum = proplists:get_value(no_checksum, Opts, false), - NoChunk = proplists:get_value(no_chunk, Opts, false), + #read_opts{no_checksum=NoChecksum, no_chunk=NoChunk, + needs_trimmed=NeedsTrimmed} = Opts, {Resp, NewErr} = case do_read(FH, F, CsumTable, Offset, Length, NoChunk, NoChecksum) of {ok, {[], []}} -> {{error, not_written}, Err + 1}; {ok, {Chunks0, Trimmed0}} -> Chunks = slice_both_side(Chunks0, Offset, Offset+Length), - Trimmed = case proplists:get_value(needs_trimmed, Opts, false) of + Trimmed = case NeedsTrimmed of true -> Trimmed0; false -> [] end, diff --git a/src/machi_flu1.erl b/src/machi_flu1.erl index b75d955..8a33a04 100644 --- a/src/machi_flu1.erl +++ b/src/machi_flu1.erl @@ -129,7 +129,8 @@ main2(FluName, TcpPort, DataDir, Props) -> ok end, {ok, ListenerPid} = start_listen_server(FluName, TcpPort, Witness_p, DataDir, - ets_table_name(FluName), ProjectionPid), + ets_table_name(FluName), ProjectionPid, + Props), %% io:format(user, "Listener started: ~w~n", [{FluName, ListenerPid}]), Config_e = machi_util:make_config_filename(DataDir, "unused"), @@ -154,9 +155,10 @@ main2(FluName, TcpPort, DataDir, Props) -> start_append_server(FluName, Witness_p, Wedged_p, EpochId) -> machi_flu1_subsup:start_append_server(FluName, Witness_p, Wedged_p, EpochId). -start_listen_server(FluName, TcpPort, Witness_p, DataDir, EtsTab, ProjectionPid) -> +start_listen_server(FluName, TcpPort, Witness_p, DataDir, EtsTab, ProjectionPid, + Props) -> machi_flu1_subsup:start_listener(FluName, TcpPort, Witness_p, DataDir, - EtsTab, ProjectionPid). + EtsTab, ProjectionPid, Props). %% This is the name of the projection store that is spawned by the %% *flu*, for use primarily in testing scenarios. In normal use, we diff --git a/src/machi_flu1_append_server.erl b/src/machi_flu1_append_server.erl index a7b029c..a484410 100644 --- a/src/machi_flu1_append_server.erl +++ b/src/machi_flu1_append_server.erl @@ -82,25 +82,25 @@ init([Fluname, Witness_p, Wedged_p, EpochId]) -> {ok, #state{flu_name=Fluname, witness=Witness_p, wedged=Wedged_p, etstab=TID, epoch_id=EpochId}}. -handle_call({seq_append, _From2, _N, _L, _Prefix, _Chunk, _CSum, _Extra, _EpochID}, +handle_call({seq_append, _From2, _NSInfo, _EpochID, _Prefix, _Chunk, _TCSum, _Opts}, _From, #state{witness=true}=S) -> %% The FLU's machi_flu1_net_server process ought to filter all %% witness states, but we'll keep this clause for extra %% paranoia. 
{reply, witness, S}; -handle_call({seq_append, _From2, _N, _L, _Prefix, _Chunk, _CSum, _Extra, _EpochID}, +handle_call({seq_append, _From2, _NSInfo, _EpochID, _Prefix, _Chunk, _TCSum, _Opts}, _From, #state{wedged=true}=S) -> {reply, wedged, S}; -handle_call({seq_append, _From2, CoC_Namespace, CoC_Locator, - Prefix, Chunk, CSum, Extra, EpochID}, +handle_call({seq_append, _From2, NSInfo, EpochID, + Prefix, Chunk, TCSum, Opts}, From, #state{flu_name=FluName, epoch_id=OldEpochId}=S) -> %% Old is the one from our state, plain old 'EpochID' comes %% from the client. _ = case OldEpochId of EpochID -> spawn(fun() -> - append_server_dispatch(From, CoC_Namespace, CoC_Locator, - Prefix, Chunk, CSum, Extra, + append_server_dispatch(From, NSInfo, + Prefix, Chunk, TCSum, Opts, FluName, EpochID) end), {noreply, S}; @@ -161,10 +161,10 @@ terminate(Reason, _S) -> code_change(_OldVsn, S, _Extra) -> {ok, S}. -append_server_dispatch(From, CoC_Namespace, CoC_Locator, - Prefix, Chunk, CSum, Extra, FluName, EpochId) -> - Result = case handle_append(CoC_Namespace, CoC_Locator, - Prefix, Chunk, CSum, Extra, FluName, EpochId) of +append_server_dispatch(From, NSInfo, + Prefix, Chunk, TCSum, Opts, FluName, EpochId) -> + Result = case handle_append(NSInfo, + Prefix, Chunk, TCSum, Opts, FluName, EpochId) of {ok, File, Offset} -> {assignment, Offset, File}; Other -> @@ -173,19 +173,17 @@ append_server_dispatch(From, CoC_Namespace, CoC_Locator, _ = gen_server:reply(From, Result), ok. -handle_append(_N, _L, _Prefix, <<>>, _Csum, _Extra, _FluName, _EpochId) -> - {error, bad_arg}; -handle_append(CoC_Namespace, CoC_Locator, - Prefix, Chunk, Csum, Extra, FluName, EpochId) -> - CoC = {coc, CoC_Namespace, CoC_Locator}, +handle_append(NSInfo, + Prefix, Chunk, TCSum, Opts, FluName, EpochId) -> Res = machi_flu_filename_mgr:find_or_make_filename_from_prefix( - FluName, EpochId, {prefix, Prefix}, CoC), + FluName, EpochId, {prefix, Prefix}, NSInfo), case Res of {file, F} -> case machi_flu_metadata_mgr:start_proxy_pid(FluName, {file, F}) of {ok, Pid} -> - {Tag, CS} = machi_util:unmake_tagged_csum(Csum), + {Tag, CS} = machi_util:unmake_tagged_csum(TCSum), Meta = [{client_csum_tag, Tag}, {client_csum, CS}], + Extra = Opts#append_opts.chunk_extra, machi_file_proxy:append(Pid, Meta, Extra, Chunk); {error, trimmed} = E -> E diff --git a/src/machi_flu1_client.erl b/src/machi_flu1_client.erl index e5b65fc..37e6d5a 100644 --- a/src/machi_flu1_client.erl +++ b/src/machi_flu1_client.erl @@ -38,6 +38,71 @@ %% TODO This EDoc was written first, and the EDoc and also `-type' and %% `-spec' definitions for {@link machi_proxy_flu1_client} and {@link %% machi_cr_client} must be improved. +%% +%% == Client API implementation notes == +%% +%% At the moment, there are several modules that implement various +%% subsets of the Machi API. The table below attempts to show how and +%% why they differ. +%% +%% ``` +%% |--------------------------+-------+-----+------+------+-------+----------------| +%% | | PB | | # | | Conn | Epoch & NS | +%% | Module name | Level | CR? | FLUS | Impl | Life? | version aware? 
| +%% |--------------------------+-------+-----+------+------+-------+----------------| +%% | machi_pb_high_api_client | high | yes | many | proc | long | no | +%% | machi_cr_client | low | yes | many | proc | long | no | +%% | machi_proxy_flu1_client | low | no | 1 | proc | long | yes | +%% | machi_flu1_client | low | no | 1 | lib | short | yes | +%% |--------------------------+-------+-----+------+------+-------+----------------| +%% ''' +%% +%% In terms of use and API layering, the table rows are in highest`->'lowest +%% order: each level calls the layer immediately below it. +%% +%%
+%% <dl>
+%% <dt> PB Level </dt>
+%% <dd> The Protocol Buffers API is divided logically into two levels,
+%% "low" and "high". The low-level protocol is used for intra-chain
+%% communication. The high-level protocol is used for clients outside
+%% of a Machi chain or Machi cluster of chains.
+%% </dd>
+%% <dt> CR? </dt>
+%% <dd> Does this API support (directly or indirectly) Chain
+%% Replication? If `no', then the API has no awareness of multiple
+%% replicas of any file or file chunk; unaware clients can only
+%% perform operations at a single Machi FLU's file service or
+%% projection store service.
+%% </dd>
+%% <dt> # FLUs </dt>
+%% <dd> How many FLUs does this API layer communicate with
+%% simultaneously? Note that there is a one-to-one correspondence
+%% between this value and the "CR?" column's value.
+%% </dd>
+%% <dt> Impl </dt>
+%% <dd> Implementation: library-only or an Erlang process,
+%% e.g., `gen_server'.
+%% </dd>
+%% <dt> Conn Life? </dt>
+%% <dd> Expected TCP session connection life: short or long. At the
+%% lowest level, the {@link machi_flu1_client} API implementation takes
+%% no effort to reconnect to a remote FLU when its single TCP session
+%% is broken. For long-lived connection life APIs, the server side will
+%% automatically attempt to reconnect to remote FLUs when a TCP session
+%% is broken.
+%% </dd>
+%% <dt> Epoch & NS version aware? </dt>
+%% <dd> Are clients of this API responsible for knowing a chain's EpochID
+%% and namespace version numbers? If `no', then the server side of the
+%% API will automatically attempt to discover/re-discover the EpochID and
+%% namespace version numbers whenever they change.
+%% </dd>
+%% </dl>
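+%%
+%% As a purely illustrative sketch of this lowest-level, single-FLU API
+%% (the host, port, binary literals, and the already-known `EpochID' are
+%% assumptions for illustration, not part of this module's contract):
+%%
+%% ```
+%% Host = "localhost", Port = 32900,   % hypothetical FLU address
+%% NSInfo = undefined,                 % i.e., use the default namespace
+%% {ok, {Off, Size, File}} =
+%%     machi_flu1_client:append_chunk(Host, Port, NSInfo, EpochID,
+%%                                    <<"pre">>, <<"Hello, world!">>, <<>>,
+%%                                    #append_opts{}, 15*1000),
+%% {ok, {Chunks, _Trimmed}} =
+%%     machi_flu1_client:read_chunk(Host, Port, NSInfo, EpochID,
+%%                                  File, Off, Size, undefined).
+%% '''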
+%% +%% The only protocol that we expect to be used by entities outside of +%% a single Machi chain or a multi-chain cluster is the "high" +%% Protocol Buffers API. The {@link riak_pb_high_api_client} module +%% is an Erlang reference implementation of this PB API. -module(machi_flu1_client). @@ -50,16 +115,15 @@ -include_lib("pulse_otp/include/pulse_otp.hrl"). -endif. --define(HARD_TIMEOUT, 2500). +-define(SHORT_TIMEOUT, 2500). +-define(LONG_TIMEOUT, (60*1000)). -export([ %% File API - append_chunk/4, append_chunk/5, append_chunk/6, append_chunk/7, - append_chunk_extra/5, append_chunk_extra/6, - append_chunk_extra/7, append_chunk_extra/8, - read_chunk/6, read_chunk/7, - checksum_list/3, checksum_list/4, + append_chunk/8, append_chunk/9, + read_chunk/7, read_chunk/8, + checksum_list/2, checksum_list/3, list_files/2, list_files/3, wedge_status/1, wedge_status/2, @@ -81,190 +145,113 @@ ]). %% For "internal" replication only. -export([ - write_chunk/5, write_chunk/6, - trim_chunk/5, + write_chunk/7, write_chunk/8, + trim_chunk/6, delete_migration/3, delete_migration/4, trunc_hack/3, trunc_hack/4 ]). -type port_wrap() :: {w,atom(),term()}. -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix'. - --spec append_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_prefix(), machi_dt:chunk()) -> +-spec append_chunk(port_wrap(), + 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), + machi_dt:file_prefix(), machi_dt:chunk(), + machi_dt:chunk_csum()) -> {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. -append_chunk(Sock, EpochID, Prefix, Chunk) -> - append_chunk2(Sock, EpochID, - ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, - Prefix, Chunk, 0). +append_chunk(Sock, NSInfo, EpochID, Prefix, Chunk, CSum) -> + append_chunk(Sock, NSInfo, EpochID, Prefix, Chunk, CSum, + #append_opts{}, ?LONG_TIMEOUT). %% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix'. +%% with `Prefix' and also request an additional `Extra' bytes. +%% +%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then +%% the file offsets that follow `Chunk''s position for the following 4K will +%% be reserved by the file sequencer for later write(s) by the +%% `write_chunk()' API. -spec append_chunk(machi_dt:inet_host(), machi_dt:inet_port(), - machi_dt:epoch_id(), machi_dt:file_prefix(), machi_dt:chunk()) -> + 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), + machi_dt:file_prefix(), machi_dt:chunk(), + machi_dt:chunk_csum()) -> {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. -append_chunk(Host, TcpPort, EpochID, Prefix, Chunk) -> - Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), - try - append_chunk2(Sock, EpochID, - ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, - Prefix, Chunk, 0) - after - disconnect(Sock) - end. +append_chunk(Host, TcpPort, NSInfo, EpochID, Prefix, Chunk, CSum) -> + append_chunk(Host, TcpPort, NSInfo, EpochID, Prefix, Chunk, CSum, + #append_opts{}, ?LONG_TIMEOUT). + +-spec append_chunk(port_wrap(), + 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), + machi_dt:file_prefix(), machi_dt:chunk(), + machi_dt:chunk_csum(), machi_dt:append_opts(), timeout()) -> + {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. +append_chunk(Sock, NSInfo0, EpochID, Prefix, Chunk, CSum, Opts, Timeout) -> + NSInfo = machi_util:ns_info_default(NSInfo0), + append_chunk2(Sock, NSInfo, EpochID, Prefix, Chunk, CSum, Opts, Timeout). 
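+%% A hedged, illustrative sketch of the `Extra' reservation described in
+%% the @doc above: `Sock' is assumed to be an already-established
+%% `port_wrap()' handle, `EpochID' a currently valid epoch ID, and
+%% `Chunk1K'/`Chunk2' already-bound binaries; the 1 KByte / 4 KByte sizes
+%% simply mirror the example in the documentation.
+%%
+%% ```
+%% Opts = #append_opts{chunk_extra = 4096},
+%% {ok, {Off, _Sz, File}} =
+%%     machi_flu1_client:append_chunk(Sock, undefined, EpochID,
+%%                                    <<"pre">>, Chunk1K, <<>>, Opts,
+%%                                    15*1000),
+%% %% Later, part of the reserved 4 KByte region can be filled in with
+%% %% the restricted write API:
+%% ok = machi_flu1_client:write_chunk(Sock, undefined, EpochID,
+%%                                    File, Off + 1024, Chunk2, <<>>).
+%% '''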
%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix'. - --spec append_chunk(port_wrap(), machi_dt:epoch_id(), - machi_dt:coc_namespace(), machi_dt:coc_locator(), - machi_dt:file_prefix(), machi_dt:chunk()) -> - {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. -append_chunk(Sock, EpochID, CoC_Namespace, CoC_Locator, Prefix, Chunk) -> - append_chunk2(Sock, EpochID, - CoC_Namespace, CoC_Locator, - Prefix, Chunk, 0). - -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix'. +%% with `Prefix' and also request an additional `Extra' bytes. +%% +%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then +%% the file offsets that follow `Chunk''s position for the following 4K will +%% be reserved by the file sequencer for later write(s) by the +%% `write_chunk()' API. -spec append_chunk(machi_dt:inet_host(), machi_dt:inet_port(), - machi_dt:epoch_id(), - machi_dt:coc_namespace(), machi_dt:coc_locator(), - machi_dt:file_prefix(), machi_dt:chunk()) -> + 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), + machi_dt:file_prefix(), machi_dt:chunk(), + machi_dt:chunk_csum(), machi_dt:append_opts(), timeout()) -> {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. -append_chunk(Host, TcpPort, EpochID, CoC_Namespace, CoC_Locator, Prefix, Chunk) -> +append_chunk(Host, TcpPort, NSInfo0, EpochID, + Prefix, Chunk, CSum, Opts, Timeout) -> Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), try - append_chunk2(Sock, EpochID, - CoC_Namespace, CoC_Locator, - Prefix, Chunk, 0) - after - disconnect(Sock) - end. - -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix' and also request an additional `Extra' bytes. -%% -%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then -%% the file offsets that follow `Chunk''s position for the following 4K will -%% be reserved by the file sequencer for later write(s) by the -%% `write_chunk()' API. - --spec append_chunk_extra(port_wrap(), machi_dt:epoch_id(), - machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) -> - {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. -append_chunk_extra(Sock, EpochID, Prefix, Chunk, ChunkExtra) - when is_integer(ChunkExtra), ChunkExtra >= 0 -> - append_chunk2(Sock, EpochID, - ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, - Prefix, Chunk, ChunkExtra). - -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix' and also request an additional `Extra' bytes. -%% -%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then -%% the file offsets that follow `Chunk''s position for the following 4K will -%% be reserved by the file sequencer for later write(s) by the -%% `write_chunk()' API. - --spec append_chunk_extra(machi_dt:inet_host(), machi_dt:inet_port(), - machi_dt:epoch_id(), machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) -> - {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. -append_chunk_extra(Host, TcpPort, EpochID, Prefix, Chunk, ChunkExtra) - when is_integer(ChunkExtra), ChunkExtra >= 0 -> - Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), - try - append_chunk2(Sock, EpochID, - ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, - Prefix, Chunk, ChunkExtra) - after - disconnect(Sock) - end. 
- -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix' and also request an additional `Extra' bytes. -%% -%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then -%% the file offsets that follow `Chunk''s position for the following 4K will -%% be reserved by the file sequencer for later write(s) by the -%% `write_chunk()' API. - --spec append_chunk_extra(port_wrap(), machi_dt:epoch_id(), - machi_dt:coc_namespace(), machi_dt:coc_locator(), - machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) -> - {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. -append_chunk_extra(Sock, EpochID, CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra) - when is_integer(ChunkExtra), ChunkExtra >= 0 -> - append_chunk2(Sock, EpochID, - CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra). - -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix' and also request an additional `Extra' bytes. -%% -%% For example, if the `Chunk' size is 1 KByte and `Extra' is 4K Bytes, then -%% the file offsets that follow `Chunk''s position for the following 4K will -%% be reserved by the file sequencer for later write(s) by the -%% `write_chunk()' API. - --spec append_chunk_extra(machi_dt:inet_host(), machi_dt:inet_port(), - machi_dt:epoch_id(), - machi_dt:coc_namespace(), machi_dt:coc_locator(), - machi_dt:file_prefix(), machi_dt:chunk(), machi_dt:chunk_size()) -> - {ok, machi_dt:chunk_pos()} | {error, machi_dt:error_general()} | {error, term()}. -append_chunk_extra(Host, TcpPort, EpochID, - CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra) - when is_integer(ChunkExtra), ChunkExtra >= 0 -> - Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), - try - append_chunk2(Sock, EpochID, - CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra) + NSInfo = machi_util:ns_info_default(NSInfo0), + append_chunk2(Sock, NSInfo, EpochID, + Prefix, Chunk, CSum, Opts, Timeout) after disconnect(Sock) end. %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. --spec read_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size(), - proplists:proplist()) -> - {ok, machi_dt:chunk_s()} | +-spec read_chunk(port_wrap(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size(), + machi_dt:read_opts_x()) -> + {ok, {[machi_dt:chunk_summary()], [machi_dt:chunk_pos()]}} | {error, machi_dt:error_general() | 'not_written' | 'partial_read'} | {error, term()}. -read_chunk(Sock, EpochID, File, Offset, Size, Opts) +read_chunk(Sock, NSInfo0, EpochID, File, Offset, Size, Opts0) when Offset >= ?MINIMUM_OFFSET, Size >= 0 -> - read_chunk2(Sock, EpochID, File, Offset, Size, Opts). + NSInfo = machi_util:ns_info_default(NSInfo0), + Opts = machi_util:read_opts_default(Opts0), + read_chunk2(Sock, NSInfo, EpochID, File, Offset, Size, Opts). %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. --spec read_chunk(machi_dt:inet_host(), machi_dt:inet_port(), machi_dt:epoch_id(), +-spec read_chunk(machi_dt:inet_host(), machi_dt:inet_port(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size(), - proplists:proplist()) -> - {ok, machi_dt:chunk_s()} | + machi_dt:read_opts_x()) -> + {ok, [machi_dt:chunk_summary()]} | {error, machi_dt:error_general() | 'not_written' | 'partial_read'} | {error, term()}. 
-read_chunk(Host, TcpPort, EpochID, File, Offset, Size, Opts) +read_chunk(Host, TcpPort, NSInfo0, EpochID, File, Offset, Size, Opts0) when Offset >= ?MINIMUM_OFFSET, Size >= 0 -> Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), + NSInfo = machi_util:ns_info_default(NSInfo0), + Opts = machi_util:read_opts_default(Opts0), try - read_chunk2(Sock, EpochID, File, Offset, Size, Opts) + read_chunk2(Sock, NSInfo, EpochID, File, Offset, Size, Opts) after disconnect(Sock) end. %% @doc Fetch the list of chunk checksums for `File'. --spec checksum_list(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name()) -> +-spec checksum_list(port_wrap(), machi_dt:file_name()) -> {ok, binary()} | {error, machi_dt:error_general() | 'no_such_file' | 'partial_read'} | {error, term()}. -checksum_list(Sock, EpochID, File) -> - checksum_list2(Sock, EpochID, File). +checksum_list(Sock, File) -> + checksum_list2(Sock, File). %% @doc Fetch the list of chunk checksums for `File'. %% @@ -288,13 +275,13 @@ checksum_list(Sock, EpochID, File) -> %% Details of the encoding used inside the `binary()' blog can be found %% in the EDoc comments for {@link machi_flu1:decode_csum_file_entry/1}. --spec checksum_list(machi_dt:inet_host(), machi_dt:inet_port(), machi_dt:epoch_id(), machi_dt:file_name()) -> +-spec checksum_list(machi_dt:inet_host(), machi_dt:inet_port(), machi_dt:file_name()) -> {ok, binary()} | {error, machi_dt:error_general() | 'no_such_file'} | {error, term()}. -checksum_list(Host, TcpPort, EpochID, File) when is_integer(TcpPort) -> +checksum_list(Host, TcpPort, File) when is_integer(TcpPort) -> Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), try - checksum_list2(Sock, EpochID, File) + checksum_list2(Sock, File) after disconnect(Sock) end. @@ -321,7 +308,7 @@ list_files(Host, TcpPort, EpochID) when is_integer(TcpPort) -> %% @doc Fetch the wedge status from the remote FLU. -spec wedge_status(port_wrap()) -> - {ok, {boolean(), machi_dt:epoch_id()}} | {error, term()}. + {ok, {boolean(), machi_dt:epoch_id(), machi_dt:namespace_version(),machi_dt:namespace()}} | {error, term()}. wedge_status(Sock) -> wedge_status2(Sock). @@ -329,7 +316,7 @@ wedge_status(Sock) -> %% @doc Fetch the wedge status from the remote FLU. -spec wedge_status(machi_dt:inet_host(), machi_dt:inet_port()) -> - {ok, {boolean(), machi_dt:epoch_id()}} | {error, term()}. + {ok, {boolean(), machi_dt:epoch_id(), machi_dt:namespace_version(),machi_dt:namespace()}} | {error, term()}. wedge_status(Host, TcpPort) when is_integer(TcpPort) -> Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), try @@ -540,23 +527,25 @@ disconnect(_) -> %% @doc Restricted API: Write a chunk of already-sequenced data to %% `File' at `Offset'. --spec write_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk()) -> +-spec write_chunk(port_wrap(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk(), machi_dt:chunk_csum()) -> ok | {error, machi_dt:error_general()} | {error, term()}. -write_chunk(Sock, EpochID, File, Offset, Chunk) +write_chunk(Sock, NSInfo0, EpochID, File, Offset, Chunk, CSum) when Offset >= ?MINIMUM_OFFSET -> - write_chunk2(Sock, EpochID, File, Offset, Chunk). + NSInfo = machi_util:ns_info_default(NSInfo0), + write_chunk2(Sock, NSInfo, EpochID, File, Offset, Chunk, CSum). %% @doc Restricted API: Write a chunk of already-sequenced data to %% `File' at `Offset'. 
-spec write_chunk(machi_dt:inet_host(), machi_dt:inet_port(), - machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk()) -> + 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk(), machi_dt:chunk_csum()) -> ok | {error, machi_dt:error_general()} | {error, term()}. -write_chunk(Host, TcpPort, EpochID, File, Offset, Chunk) +write_chunk(Host, TcpPort, NSInfo0, EpochID, File, Offset, Chunk, CSum) when Offset >= ?MINIMUM_OFFSET -> Sock = connect(#p_srvr{proto_mod=?MODULE, address=Host, port=TcpPort}), try - write_chunk2(Sock, EpochID, File, Offset, Chunk) + NSInfo = machi_util:ns_info_default(NSInfo0), + write_chunk2(Sock, NSInfo, EpochID, File, Offset, Chunk, CSum) after disconnect(Sock) end. @@ -564,16 +553,18 @@ write_chunk(Host, TcpPort, EpochID, File, Offset, Chunk) %% @doc Restricted API: Write a chunk of already-sequenced data to %% `File' at `Offset'. --spec trim_chunk(port_wrap(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size()) -> +-spec trim_chunk(port_wrap(), 'undefined' | machi_dt:ns_info(), machi_dt:epoch_id(), machi_dt:file_name(), machi_dt:file_offset(), machi_dt:chunk_size()) -> ok | {error, machi_dt:error_general()} | {error, term()}. -trim_chunk(Sock, EpochID, File0, Offset, Size) +trim_chunk(Sock, NSInfo0, EpochID, File0, Offset, Size) when Offset >= ?MINIMUM_OFFSET -> ReqID = <<"id">>, + NSInfo = machi_util:ns_info_default(NSInfo0), + #ns_info{version=NSVersion, name=NS} = NSInfo, File = machi_util:make_binary(File0), true = (Offset >= ?MINIMUM_OFFSET), Req = machi_pb_translate:to_pb_request( ReqID, - {low_trim_chunk, EpochID, File, Offset, Size, 0}), + {low_trim_chunk, NSVersion, NS, EpochID, File, Offset, Size, 0}), do_pb_request_common(Sock, ReqID, Req). %% @doc Restricted API: Delete a file after it has been successfully @@ -620,83 +611,88 @@ trunc_hack(Host, TcpPort, EpochID, File) when is_integer(TcpPort) -> %%%%%%%%%%%%%%%%%%%%%%%%%%% -read_chunk2(Sock, EpochID, File0, Offset, Size, Opts) -> +read_chunk2(Sock, NSInfo, EpochID, File0, Offset, Size, Opts) -> ReqID = <<"id">>, + #ns_info{version=NSVersion, name=NS} = NSInfo, File = machi_util:make_binary(File0), Req = machi_pb_translate:to_pb_request( ReqID, - {low_read_chunk, EpochID, File, Offset, Size, Opts}), + {low_read_chunk, NSVersion, NS, EpochID, File, Offset, Size, Opts}), do_pb_request_common(Sock, ReqID, Req). -append_chunk2(Sock, EpochID, CoC_Namespace, CoC_Locator, - Prefix0, Chunk0, ChunkExtra) -> +append_chunk2(Sock, NSInfo, EpochID, + Prefix0, Chunk, CSum0, Opts, Timeout) -> ReqID = <<"id">>, - {Chunk, CSum_tag, CSum} = - case Chunk0 of - X when is_binary(X) -> - {Chunk0, ?CSUM_TAG_NONE, <<>>}; - {ChunkCSum, Chk} -> - {Tag, CS} = machi_util:unmake_tagged_csum(ChunkCSum), - {Chk, Tag, CS} - end, Prefix = machi_util:make_binary(Prefix0), + {CSum_tag, CSum} = case CSum0 of + <<>> -> + {?CSUM_TAG_NONE, <<>>}; + {_Tag, _CS} -> + CSum0; + B when is_binary(B) -> + machi_util:unmake_tagged_csum(CSum0) + end, + #ns_info{version=NSVersion, name=NS, locator=NSLocator} = NSInfo, + %% NOTE: The tuple position of NSLocator is a bit odd, because EpochID + %% _must_ be in the 4th position (as NSV & NS must be in 2nd & 3rd). Req = machi_pb_translate:to_pb_request( ReqID, - {low_append_chunk, EpochID, CoC_Namespace, CoC_Locator, - Prefix, Chunk, CSum_tag, CSum, ChunkExtra}), - do_pb_request_common(Sock, ReqID, Req). 
+ {low_append_chunk, NSVersion, NS, EpochID, NSLocator, + Prefix, Chunk, CSum_tag, CSum, Opts}), + do_pb_request_common(Sock, ReqID, Req, true, Timeout). -write_chunk2(Sock, EpochID, File0, Offset, Chunk0) -> +write_chunk2(Sock, NSInfo, EpochID, File0, Offset, Chunk, CSum0) -> ReqID = <<"id">>, + #ns_info{version=NSVersion, name=NS} = NSInfo, File = machi_util:make_binary(File0), true = (Offset >= ?MINIMUM_OFFSET), - {Chunk, CSum_tag, CSum} = - case Chunk0 of - X when is_binary(X) -> - {Chunk0, ?CSUM_TAG_NONE, <<>>}; - {ChunkCSum, Chk} -> - {Tag, CS} = machi_util:unmake_tagged_csum(ChunkCSum), - {Chk, Tag, CS} - end, + {CSum_tag, CSum} = case CSum0 of + <<>> -> + {?CSUM_TAG_NONE, <<>>}; + {_Tag, _CS} -> + CSum0; + B when is_binary(B) -> + machi_util:unmake_tagged_csum(CSum0) + end, Req = machi_pb_translate:to_pb_request( ReqID, - {low_write_chunk, EpochID, File, Offset, Chunk, CSum_tag, CSum}), + {low_write_chunk, NSVersion, NS, EpochID, File, Offset, Chunk, CSum_tag, CSum}), do_pb_request_common(Sock, ReqID, Req). list2(Sock, EpochID) -> ReqID = <<"id">>, Req = machi_pb_translate:to_pb_request( - ReqID, {low_list_files, EpochID}), + ReqID, {low_skip_wedge, {low_list_files, EpochID}}), do_pb_request_common(Sock, ReqID, Req). wedge_status2(Sock) -> ReqID = <<"id">>, Req = machi_pb_translate:to_pb_request( - ReqID, {low_wedge_status, undefined}), + ReqID, {low_skip_wedge, {low_wedge_status}}), do_pb_request_common(Sock, ReqID, Req). echo2(Sock, Message) -> ReqID = <<"id">>, Req = machi_pb_translate:to_pb_request( - ReqID, {low_echo, undefined, Message}), + ReqID, {low_skip_wedge, {low_echo, Message}}), do_pb_request_common(Sock, ReqID, Req). -checksum_list2(Sock, EpochID, File) -> +checksum_list2(Sock, File) -> ReqID = <<"id">>, Req = machi_pb_translate:to_pb_request( - ReqID, {low_checksum_list, EpochID, File}), + ReqID, {low_skip_wedge, {low_checksum_list, File}}), do_pb_request_common(Sock, ReqID, Req). delete_migration2(Sock, EpochID, File) -> ReqID = <<"id">>, Req = machi_pb_translate:to_pb_request( - ReqID, {low_delete_migration, EpochID, File}), + ReqID, {low_skip_wedge, {low_delete_migration, EpochID, File}}), do_pb_request_common(Sock, ReqID, Req). trunc_hack2(Sock, EpochID, File) -> ReqID = <<"id-trunc">>, Req = machi_pb_translate:to_pb_request( - ReqID, {low_trunc_hack, EpochID, File}), + ReqID, {low_skip_wedge, {low_trunc_hack, EpochID, File}}), do_pb_request_common(Sock, ReqID, Req). get_latest_epochid2(Sock, ProjType) -> @@ -739,18 +735,18 @@ kick_projection_reaction2(Sock, _Options) -> ReqID = <<42>>, Req = machi_pb_translate:to_pb_request( ReqID, {low_proj, {kick_projection_reaction}}), - do_pb_request_common(Sock, ReqID, Req, false). + do_pb_request_common(Sock, ReqID, Req, false, ?LONG_TIMEOUT). do_pb_request_common(Sock, ReqID, Req) -> - do_pb_request_common(Sock, ReqID, Req, true). + do_pb_request_common(Sock, ReqID, Req, true, ?LONG_TIMEOUT). 
-do_pb_request_common(Sock, ReqID, Req, GetReply_p) -> +do_pb_request_common(Sock, ReqID, Req, GetReply_p, Timeout) -> erase(bad_sock), try ReqBin = list_to_binary(machi_pb:encode_mpb_ll_request(Req)), ok = w_send(Sock, ReqBin), if GetReply_p -> - case w_recv(Sock, 0) of + case w_recv(Sock, 0, Timeout) of {ok, RespBin} -> Resp = machi_pb:decode_mpb_ll_response(RespBin), {ReqID2, Reply} = machi_pb_translate:from_pb_response(Resp), @@ -796,7 +792,7 @@ w_connect(#p_srvr{proto_mod=?MODULE, address=Host, port=Port, props=Props}=_P)-> case proplists:get_value(session_proto, Props, tcp) of tcp -> put(xxx, goofus), - Sock = machi_util:connect(Host, Port, ?HARD_TIMEOUT), + Sock = machi_util:connect(Host, Port, ?SHORT_TIMEOUT), put(xxx, Sock), ok = inet:setopts(Sock, ?PB_PACKET_OPTS), {w,tcp,Sock}; @@ -820,8 +816,8 @@ w_close({w,tcp,Sock}) -> catch gen_tcp:close(Sock), ok. -w_recv({w,tcp,Sock}, Amt) -> - gen_tcp:recv(Sock, Amt, ?HARD_TIMEOUT). +w_recv({w,tcp,Sock}, Amt, Timeout) -> + gen_tcp:recv(Sock, Amt, Timeout). w_send({w,tcp,Sock}, IoData) -> gen_tcp:send(Sock, IoData). diff --git a/src/machi_flu1_net_server.erl b/src/machi_flu1_net_server.erl index 6610230..ed3d980 100644 --- a/src/machi_flu1_net_server.erl +++ b/src/machi_flu1_net_server.erl @@ -66,19 +66,25 @@ flu_name :: pv1_server(), %% Used in server_wedge_status to lookup the table epoch_tab :: ets:tab(), + %% Clustering: cluster map version number + namespace_version = 0 :: machi_dt:namespace_version(), + %% Clustering: my (and my chain's) assignment to a specific namespace + namespace = <<>> :: machi_dt:namespace(), %% High mode only high_clnt :: pid(), %% anything you want - props = [] :: list() % proplist + props = [] :: proplists:proplist() }). -type socket() :: any(). -type state() :: #state{}. -spec start_link(ranch:ref(), socket(), module(), [term()]) -> {ok, pid()}. -start_link(Ref, Socket, Transport, [FluName, Witness, DataDir, EpochTab, ProjStore]) -> +start_link(Ref, Socket, Transport, [FluName, Witness, DataDir, EpochTab, ProjStore, Props]) -> + NS = proplists:get_value(namespace, Props, <<>>), + true = is_binary(NS), proc_lib:start_link(?MODULE, init, [#state{ref=Ref, socket=Socket, transport=Transport, @@ -86,7 +92,9 @@ start_link(Ref, Socket, Transport, [FluName, Witness, DataDir, EpochTab, ProjSto witness=Witness, data_dir=DataDir, epoch_tab=EpochTab, - proj_store=ProjStore}]). + proj_store=ProjStore, + namespace=NS, + props=Props}]). -spec init(state()) -> no_return(). 
init(#state{ref=Ref, socket=Socket, transport=Transport}=State) -> @@ -209,44 +217,51 @@ do_pb_ll_request(#mpb_ll_request{req_id=ReqID}, #state{pb_mode=high}=S) -> {machi_pb_translate:to_pb_response(ReqID, unused, Result), S}; do_pb_ll_request(PB_request, S) -> Req = machi_pb_translate:from_pb_request(PB_request), - %% io:format(user, "[~w] do_pb_ll_request Req: ~w~n", [S#state.flu_name, Req]), {ReqID, Cmd, Result, S2} = case Req of - {RqID, {LowCmd, _}=Cmd0} - when LowCmd =:= low_proj; - LowCmd =:= low_wedge_status; - LowCmd =:= low_list_files -> + {RqID, {low_skip_wedge, LowSubCmd}=Cmd0} -> %% Skip wedge check for these unprivileged commands + {Rs, NewS} = do_pb_ll_request3(LowSubCmd, S), + {RqID, Cmd0, Rs, NewS}; + {RqID, {low_proj, _LowSubCmd}=Cmd0} -> {Rs, NewS} = do_pb_ll_request3(Cmd0, S), {RqID, Cmd0, Rs, NewS}; {RqID, Cmd0} -> - EpochID = element(2, Cmd0), % by common convention - {Rs, NewS} = do_pb_ll_request2(EpochID, Cmd0, S), + %% All remaining must have NSVersion, NS, & EpochID at next pos + NSVersion = element(2, Cmd0), + NS = element(3, Cmd0), + EpochID = element(4, Cmd0), + {Rs, NewS} = do_pb_ll_request2(NSVersion, NS, EpochID, Cmd0, S), {RqID, Cmd0, Rs, NewS} end, {machi_pb_translate:to_pb_response(ReqID, Cmd, Result), S2}. -do_pb_ll_request2(EpochID, CMD, S) -> +%% do_pb_ll_request2(): Verification of epoch details & namespace details. + +do_pb_ll_request2(NSVersion, NS, EpochID, CMD, S) -> {Wedged_p, CurrentEpochID} = lookup_epoch(S), - %% io:format(user, "{Wedged_p, CurrentEpochID}: ~w~n", [{Wedged_p, CurrentEpochID}]), - if Wedged_p == true -> + if not is_tuple(EpochID) orelse tuple_size(EpochID) /= 2 -> + exit({bad_epoch_id, EpochID, for, CMD}); + Wedged_p == true -> {{error, wedged}, S#state{epoch_id=CurrentEpochID}}; - is_tuple(EpochID) - andalso EpochID /= CurrentEpochID -> {Epoch, _} = EpochID, {CurrentEpoch, _} = CurrentEpochID, if Epoch < CurrentEpoch -> - ok; + {{error, bad_epoch}, S}; true -> - %% We're at same epoch # but different checksum, or - %% we're at a newer/bigger epoch #. _ = machi_flu1:wedge_myself(S#state.flu_name, CurrentEpochID), - ok - end, - {{error, bad_epoch}, S#state{epoch_id=CurrentEpochID}}; + {{error, wedged}, S#state{epoch_id=CurrentEpochID}} + end; true -> - do_pb_ll_request3(CMD, S#state{epoch_id=CurrentEpochID}) + #state{namespace_version=MyNSVersion, namespace=MyNS} = S, + if NSVersion /= MyNSVersion -> + {{error, bad_epoch}, S}; + NS /= MyNS -> + {{error, bad_arg}, S}; + true -> + do_pb_ll_request3(CMD, S) + end end. lookup_epoch(#state{epoch_tab=T}) -> @@ -254,34 +269,35 @@ lookup_epoch(#state{epoch_tab=T}) -> ets:lookup_element(T, epoch, 2). %% Witness status does not matter below. 
-do_pb_ll_request3({low_echo, _BogusEpochID, Msg}, S) -> +do_pb_ll_request3({low_echo, Msg}, S) -> {Msg, S}; -do_pb_ll_request3({low_auth, _BogusEpochID, _User, _Pass}, S) -> +do_pb_ll_request3({low_auth, _User, _Pass}, S) -> {-6, S}; -do_pb_ll_request3({low_wedge_status, _EpochID}, S) -> +do_pb_ll_request3({low_wedge_status}, S) -> {do_server_wedge_status(S), S}; do_pb_ll_request3({low_proj, PCMD}, S) -> {do_server_proj_request(PCMD, S), S}; %% Witness status *matters* below -do_pb_ll_request3({low_append_chunk, _EpochID, CoC_Namespace, CoC_Locator, +do_pb_ll_request3({low_append_chunk, NSVersion, NS, EpochID, NSLocator, Prefix, Chunk, CSum_tag, - CSum, ChunkExtra}, + CSum, Opts}, #state{witness=false}=S) -> - {do_server_append_chunk(CoC_Namespace, CoC_Locator, + NSInfo = #ns_info{version=NSVersion, name=NS, locator=NSLocator}, + {do_server_append_chunk(NSInfo, EpochID, Prefix, Chunk, CSum_tag, CSum, - ChunkExtra, S), S}; -do_pb_ll_request3({low_write_chunk, _EpochID, File, Offset, Chunk, CSum_tag, + Opts, S), S}; +do_pb_ll_request3({low_write_chunk, _NSVersion, _NS, _EpochID, File, Offset, Chunk, CSum_tag, CSum}, #state{witness=false}=S) -> {do_server_write_chunk(File, Offset, Chunk, CSum_tag, CSum, S), S}; -do_pb_ll_request3({low_read_chunk, _EpochID, File, Offset, Size, Opts}, +do_pb_ll_request3({low_read_chunk, _NSVersion, _NS, _EpochID, File, Offset, Size, Opts}, #state{witness=false} = S) -> {do_server_read_chunk(File, Offset, Size, Opts, S), S}; -do_pb_ll_request3({low_trim_chunk, _EpochID, File, Offset, Size, TriggerGC}, +do_pb_ll_request3({low_trim_chunk, _NSVersion, _NS, _EpochID, File, Offset, Size, TriggerGC}, #state{witness=false}=S) -> {do_server_trim_chunk(File, Offset, Size, TriggerGC, S), S}; -do_pb_ll_request3({low_checksum_list, _EpochID, File}, +do_pb_ll_request3({low_checksum_list, File}, #state{witness=false}=S) -> {do_server_checksum_listing(File, S), S}; do_pb_ll_request3({low_list_files, _EpochID}, @@ -334,27 +350,27 @@ do_server_proj_request({kick_projection_reaction}, end), async_no_response. -do_server_append_chunk(CoC_Namespace, CoC_Locator, +do_server_append_chunk(NSInfo, EpochID, Prefix, Chunk, CSum_tag, CSum, - ChunkExtra, S) -> + Opts, S) -> case sanitize_prefix(Prefix) of ok -> - do_server_append_chunk2(CoC_Namespace, CoC_Locator, + do_server_append_chunk2(NSInfo, EpochID, Prefix, Chunk, CSum_tag, CSum, - ChunkExtra, S); + Opts, S); _ -> {error, bad_arg} end. -do_server_append_chunk2(CoC_Namespace, CoC_Locator, +do_server_append_chunk2(NSInfo, EpochID, Prefix, Chunk, CSum_tag, Client_CSum, - ChunkExtra, #state{flu_name=FluName, - epoch_id=EpochID}=_S) -> + Opts, #state{flu_name=FluName, + epoch_id=EpochID}=_S) -> %% TODO: Do anything with PKey? try TaggedCSum = check_or_make_tagged_checksum(CSum_tag, Client_CSum,Chunk), - R = {seq_append, self(), CoC_Namespace, CoC_Locator, - Prefix, Chunk, TaggedCSum, ChunkExtra, EpochID}, + R = {seq_append, self(), NSInfo, EpochID, + Prefix, Chunk, TaggedCSum, Opts}, case gen_server:call(FluName, R, 10*1000) of {assignment, Offset, File} -> Size = iolist_size(Chunk), @@ -457,14 +473,14 @@ do_server_list_files(#state{data_dir=DataDir}=_S) -> {Size, File} end || File <- Files]}. -do_server_wedge_status(S) -> +do_server_wedge_status(#state{namespace_version=NSVersion, namespace=NS}=S) -> {Wedged_p, CurrentEpochID0} = lookup_epoch(S), CurrentEpochID = if CurrentEpochID0 == undefined -> ?DUMMY_PV1_EPOCH; true -> CurrentEpochID0 end, - {Wedged_p, CurrentEpochID}. + {Wedged_p, CurrentEpochID, NSVersion, NS}. 
do_server_delete_migration(File, #state{data_dir=DataDir}=_S) -> case sanitize_file_string(File) of @@ -563,26 +579,30 @@ do_pb_hl_request2({high_echo, Msg}, S) -> {Msg, S}; do_pb_hl_request2({high_auth, _User, _Pass}, S) -> {-77, S}; -do_pb_hl_request2({high_append_chunk, CoC_Namespace, CoC_Locator, - Prefix, ChunkBin, TaggedCSum, - ChunkExtra}, #state{high_clnt=Clnt}=S) -> - Chunk = {TaggedCSum, ChunkBin}, - Res = machi_cr_client:append_chunk_extra(Clnt, CoC_Namespace, CoC_Locator, - Prefix, Chunk, - ChunkExtra), - {Res, S}; -do_pb_hl_request2({high_write_chunk, File, Offset, ChunkBin, TaggedCSum}, +do_pb_hl_request2({high_append_chunk=Op, NS, Prefix, Chunk, TaggedCSum, Opts}, #state{high_clnt=Clnt}=S) -> - Chunk = {TaggedCSum, ChunkBin}, - Res = machi_cr_client:write_chunk(Clnt, File, Offset, Chunk), + NSInfo = #ns_info{name=NS}, % TODO populate other fields + todo_perhaps_remind_ns_locator_not_chosen(Op), + Res = machi_cr_client:append_chunk(Clnt, NSInfo, + Prefix, Chunk, TaggedCSum, Opts), {Res, S}; -do_pb_hl_request2({high_read_chunk, File, Offset, Size, Opts}, +do_pb_hl_request2({high_write_chunk=Op, File, Offset, Chunk, CSum}, #state{high_clnt=Clnt}=S) -> - Res = machi_cr_client:read_chunk(Clnt, File, Offset, Size, Opts), + NSInfo = undefined, + todo_perhaps_remind_ns_locator_not_chosen(Op), + Res = machi_cr_client:write_chunk(Clnt, NSInfo, File, Offset, Chunk, CSum), {Res, S}; -do_pb_hl_request2({high_trim_chunk, File, Offset, Size}, +do_pb_hl_request2({high_read_chunk=Op, File, Offset, Size, Opts}, #state{high_clnt=Clnt}=S) -> - Res = machi_cr_client:trim_chunk(Clnt, File, Offset, Size), + NSInfo = undefined, + todo_perhaps_remind_ns_locator_not_chosen(Op), + Res = machi_cr_client:read_chunk(Clnt, NSInfo, File, Offset, Size, Opts), + {Res, S}; +do_pb_hl_request2({high_trim_chunk=Op, File, Offset, Size}, + #state{high_clnt=Clnt}=S) -> + NSInfo = undefined, + todo_perhaps_remind_ns_locator_not_chosen(Op), + Res = machi_cr_client:trim_chunk(Clnt, NSInfo, File, Offset, Size), {Res, S}; do_pb_hl_request2({high_checksum_list, File}, #state{high_clnt=Clnt}=S) -> Res = machi_cr_client:checksum_list(Clnt, File), @@ -600,3 +620,15 @@ make_high_clnt(#state{high_clnt=undefined}=S) -> S#state{high_clnt=Clnt}; make_high_clnt(S) -> S. + +todo_perhaps_remind_ns_locator_not_chosen(Op) -> + Key = {?MODULE, Op}, + case get(Key) of + undefined -> + io:format(user, "TODO op ~w is using default locator value\n", + [Op]), + put(Key, true); + _ -> + ok + end. + diff --git a/src/machi_flu1_subsup.erl b/src/machi_flu1_subsup.erl index 21fd6f5..566c118 100644 --- a/src/machi_flu1_subsup.erl +++ b/src/machi_flu1_subsup.erl @@ -36,7 +36,7 @@ -export([start_link/1, start_append_server/4, stop_append_server/1, - start_listener/6, + start_listener/7, stop_listener/1, subsup_name/1, listener_name/1]). @@ -67,11 +67,13 @@ stop_append_server(FluName) -> ok = supervisor:delete_child(SubSup, FluName). -spec start_listener(pv1_server(), inet:port_number(), boolean(), - string(), ets:tab(), atom() | pid()) -> {ok, pid()}. -start_listener(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore) -> + string(), ets:tab(), atom() | pid(), + proplists:proplist()) -> {ok, pid()}. +start_listener(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore, + Props) -> supervisor:start_child(subsup_name(FluName), listener_spec(FluName, TcpPort, Witness, DataDir, - EpochTab, ProjStore)). + EpochTab, ProjStore, Props)). -spec stop_listener(pv1_server()) -> ok. 
stop_listener(FluName) -> @@ -97,12 +99,13 @@ init([]) -> %% private -spec listener_spec(pv1_server(), inet:port_number(), boolean(), - string(), ets:tab(), atom() | pid()) -> supervisor:child_spec(). -listener_spec(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore) -> + string(), ets:tab(), atom() | pid(), + proplists:proplist()) -> supervisor:child_spec(). +listener_spec(FluName, TcpPort, Witness, DataDir, EpochTab, ProjStore, Props) -> ListenerName = listener_name(FluName), NbAcceptors = 10, TcpOpts = [{port, TcpPort}, {backlog, ?BACKLOG}], - NetServerOpts = [FluName, Witness, DataDir, EpochTab, ProjStore], + NetServerOpts = [FluName, Witness, DataDir, EpochTab, ProjStore, Props], ranch:child_spec(ListenerName, NbAcceptors, ranch_tcp, TcpOpts, machi_flu1_net_server, NetServerOpts). diff --git a/src/machi_flu_filename_mgr.erl b/src/machi_flu_filename_mgr.erl index 36d830b..b25d146 100644 --- a/src/machi_flu_filename_mgr.erl +++ b/src/machi_flu_filename_mgr.erl @@ -67,6 +67,7 @@ ]). -define(TIMEOUT, 10 * 1000). +-include("machi.hrl"). %% included for #ns_info record -include("machi_projection.hrl"). %% included for pv1_epoch type -record(state, {fluname :: atom(), @@ -90,28 +91,28 @@ start_link(FluName, DataDir) when is_atom(FluName) andalso is_list(DataDir) -> -spec find_or_make_filename_from_prefix( FluName :: atom(), EpochId :: pv1_epoch(), Prefix :: {prefix, string()}, - machi_dt:coc_nl()) -> + machi_dt:ns_info()) -> {file, Filename :: string()} | {error, Reason :: term() } | timeout. % @doc Find the latest available or make a filename from a prefix. A prefix % should be in the form of a tagged tuple `{prefix, P}'. Returns a tagged % tuple in the form of `{file, F}' or an `{error, Reason}' find_or_make_filename_from_prefix(FluName, EpochId, {prefix, Prefix}, - {coc, _CoC_Ns, _CoC_Loc}=CoC_NL) + #ns_info{}=NSInfo) when is_atom(FluName) -> N = make_filename_mgr_name(FluName), - gen_server:call(N, {find_filename, FluName, EpochId, CoC_NL, Prefix}, ?TIMEOUT); + gen_server:call(N, {find_filename, FluName, EpochId, NSInfo, Prefix}, ?TIMEOUT); find_or_make_filename_from_prefix(_FluName, _EpochId, Other, Other2) -> - lager:error("~p is not a valid prefix/CoC ~p", [Other, Other2]), + lager:error("~p is not a valid prefix/locator ~p", [Other, Other2]), error(badarg). --spec increment_prefix_sequence( FluName :: atom(), CoC_NL :: machi_dt:coc_nl(), Prefix :: {prefix, string()} ) -> +-spec increment_prefix_sequence( FluName :: atom(), NSInfo :: machi_dt:ns_info(), Prefix :: {prefix, string()} ) -> ok | {error, Reason :: term() } | timeout. % @doc Increment the sequence counter for a given prefix. Prefix should % be in the form of `{prefix, P}'. -increment_prefix_sequence(FluName, {coc,_CoC_Namespace,_CoC_Locator}=CoC_NL, {prefix, Prefix}) when is_atom(FluName) -> - gen_server:call(make_filename_mgr_name(FluName), {increment_sequence, CoC_NL, Prefix}, ?TIMEOUT); -increment_prefix_sequence(_FluName, _CoC_NL, Other) -> +increment_prefix_sequence(FluName, #ns_info{}=NSInfo, {prefix, Prefix}) when is_atom(FluName) -> + gen_server:call(make_filename_mgr_name(FluName), {increment_sequence, NSInfo, Prefix}, ?TIMEOUT); +increment_prefix_sequence(_FluName, _NSInfo, Other) -> lager:error("~p is not a valid prefix.", [Other]), error(badarg). @@ -142,23 +143,22 @@ handle_cast(Req, State) -> %% the FLU has already validated that the caller's epoch id and the FLU's epoch id %% are the same. So we *assume* that remains the case here - that is to say, we %% are not wedged. 
-handle_call({find_filename, FluName, EpochId, CoC_NL, Prefix}, _From, S = #state{ datadir = DataDir, - epoch = EpochId, - tid = Tid }) -> +handle_call({find_filename, FluName, EpochId, NSInfo, Prefix}, _From, + S = #state{ datadir = DataDir, epoch = EpochId, tid = Tid }) -> %% Our state and the caller's epoch ids are the same. Business as usual. - File = handle_find_file(FluName, Tid, CoC_NL, Prefix, DataDir), + File = handle_find_file(FluName, Tid, NSInfo, Prefix, DataDir), {reply, {file, File}, S}; -handle_call({find_filename, _FluName, EpochId, CoC_NL, Prefix}, _From, S = #state{ datadir = DataDir, tid = Tid }) -> +handle_call({find_filename, _FluName, EpochId, NSInfo, Prefix}, _From, S = #state{ datadir = DataDir, tid = Tid }) -> %% If the epoch id in our state and the caller's epoch id were the same, it would've %% matched the above clause. Since we're here, we know that they are different. %% If epoch ids between our state and the caller's are different, we must increment the %% sequence number, generate a filename and then cache it. - File = increment_and_cache_filename(Tid, DataDir, CoC_NL, Prefix), + File = increment_and_cache_filename(Tid, DataDir, NSInfo, Prefix), {reply, {file, File}, S#state{epoch = EpochId}}; -handle_call({increment_sequence, {coc,CoC_Namespace,CoC_Locator}=_CoC_NL, Prefix}, _From, S = #state{ datadir = DataDir }) -> - ok = machi_util:increment_max_filenum(DataDir, CoC_Namespace,CoC_Locator, Prefix), +handle_call({increment_sequence, #ns_info{name=NS, locator=NSLocator}, Prefix}, _From, S = #state{ datadir = DataDir }) -> + ok = machi_util:increment_max_filenum(DataDir, NS, NSLocator, Prefix), {reply, ok, S}; handle_call({list_files, Prefix}, From, S = #state{ datadir = DataDir }) -> spawn(fun() -> @@ -198,30 +198,30 @@ list_files(DataDir, Prefix) -> make_filename_mgr_name(FluName) when is_atom(FluName) -> list_to_atom(atom_to_list(FluName) ++ "_filename_mgr"). -handle_find_file(_FluName, Tid, {coc,CoC_Namespace,CoC_Locator}, Prefix, DataDir) -> - case ets:lookup(Tid, {CoC_Namespace, CoC_Locator, Prefix}) of +handle_find_file(_FluName, Tid, #ns_info{name=NS, locator=NSLocator}, Prefix, DataDir) -> + case ets:lookup(Tid, {NS, NSLocator, Prefix}) of [] -> - N = machi_util:read_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix), - F = generate_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix, N), - true = ets:insert(Tid, {{CoC_Namespace, CoC_Locator, Prefix}, F}), + N = machi_util:read_max_filenum(DataDir, NS, NSLocator, Prefix), + F = generate_filename(DataDir, NS, NSLocator, Prefix, N), + true = ets:insert(Tid, {{NS, NSLocator, Prefix}, F}), F; [{_Key, File}] -> File end. -generate_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix, N) -> - {F, _} = machi_util:make_data_filename( +generate_filename(DataDir, NS, NSLocator, Prefix, N) -> + {F, _Q} = machi_util:make_data_filename( DataDir, - CoC_Namespace, CoC_Locator, Prefix, + NS, NSLocator, Prefix, generate_uuid_v4_str(), N), binary_to_list(F). 
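+%% A rough usage sketch for the public lookup above (the FLU name
+%% `flu_a', the epoch variable, and the prefix are hypothetical;
+%% `machi_util:ns_info_default/1' supplies the default #ns_info{} record,
+%% as it does elsewhere in this change):
+%%
+%%     NSInfo = machi_util:ns_info_default(undefined),
+%%     {file, File} =
+%%         machi_flu_filename_mgr:find_or_make_filename_from_prefix(
+%%             flu_a, EpochId, {prefix, "foo"}, NSInfo).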
-increment_and_cache_filename(Tid, DataDir, {coc,CoC_Namespace,CoC_Locator}, Prefix) -> - ok = machi_util:increment_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix), - N = machi_util:read_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix), - F = generate_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix, N), - true = ets:insert(Tid, {{CoC_Namespace, CoC_Locator, Prefix}, F}), +increment_and_cache_filename(Tid, DataDir, #ns_info{name=NS,locator=NSLocator}, Prefix) -> + ok = machi_util:increment_max_filenum(DataDir, NS, NSLocator, Prefix), + N = machi_util:read_max_filenum(DataDir, NS, NSLocator, Prefix), + F = generate_filename(DataDir, NS, NSLocator, Prefix, N), + true = ets:insert(Tid, {{NS, NSLocator, Prefix}, F}), F. diff --git a/src/machi_flu_metadata_mgr.erl b/src/machi_flu_metadata_mgr.erl index 66274b3..b9c26c9 100644 --- a/src/machi_flu_metadata_mgr.erl +++ b/src/machi_flu_metadata_mgr.erl @@ -34,6 +34,7 @@ -module(machi_flu_metadata_mgr). -behaviour(gen_server). +-include("machi.hrl"). -define(MAX_MGRS, 10). %% number of managers to start by default. -define(HASH(X), erlang:phash2(X)). %% hash algorithm to use @@ -185,17 +186,16 @@ handle_info({'DOWN', Mref, process, Pid, file_rollover}, State = #state{ fluname tid = Tid }) -> lager:info("file proxy ~p shutdown because of file rollover", [Pid]), R = get_md_record_by_mref(Tid, Mref), - {Prefix, CoC_Namespace, CoC_Locator, _, _} = + {Prefix, NS, NSLocator, _, _} = machi_util:parse_filename(R#md.filename), - %% CoC_Namespace = list_to_binary(CoC_Namespace_str), - %% CoC_Locator = list_to_integer(CoC_Locator_str), %% We only increment the counter here. The filename will be generated on the %% next append request to that prefix and since the filename will have a new %% sequence number it probably will be associated with a different metadata %% manager. That's why we don't want to generate a new file name immediately %% and use it to start a new file proxy. - ok = machi_flu_filename_mgr:increment_prefix_sequence(FluName, {coc, CoC_Namespace, CoC_Locator}, {prefix, Prefix}), + NSInfo = #ns_info{name=NS, locator=NSLocator}, + ok = machi_flu_filename_mgr:increment_prefix_sequence(FluName, NSInfo, {prefix, Prefix}), %% purge our ets table of this entry completely since it is likely the %% new filename (whenever it comes) will be in a different manager than diff --git a/src/machi_lifecycle_mgr.erl b/src/machi_lifecycle_mgr.erl index 385c607..80ea8b4 100644 --- a/src/machi_lifecycle_mgr.erl +++ b/src/machi_lifecycle_mgr.erl @@ -950,7 +950,7 @@ make_pending_config(Term) -> %% The largest numbered file is assumed to be all of the AST changes that we %% want to apply in a single batch. The AST tuples of all files with smaller %% numbers will be concatenated together to create the prior history of -%% cluster-of-clusters. We assume that all transitions inside these earlier +%% the cluster. We assume that all transitions inside these earlier %% files were actually safe & sane, therefore any sanity problem can only %% be caused by the contents of the largest numbered file. diff --git a/src/machi_pb_high_client.erl b/src/machi_pb_high_client.erl index 5b2ab22..f67479e 100644 --- a/src/machi_pb_high_client.erl +++ b/src/machi_pb_high_client.erl @@ -25,6 +25,10 @@ %% to a single socket connection, and there is no code to deal with %% multiple connections/load balancing/error handling to several/all %% Machi cluster servers. 
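+%%
+%% A minimal, hedged usage sketch: the process startup call and its
+%% `P_srvr_list' argument are assumptions for illustration; only the
+%% append/read signatures below are defined by this module.
+%%
+%% ```
+%% {ok, Clnt} = machi_pb_high_client:start_link(P_srvr_list),
+%% {ok, File, Off} =
+%%     machi_pb_high_client:append_chunk(Clnt, <<>>, <<"pre">>, <<"data">>,
+%%                                       none, #append_opts{}),
+%% {ok, {Chunks, _Trimmed}} =
+%%     machi_pb_high_client:read_chunk(Clnt, File, Off, 4, undefined).
+%% '''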
+%% +%% Please see {@link machi_flu1_client} the "Client API implemntation notes" +%% section for how this module relates to the rest of the client API +%% implementation. -module(machi_pb_high_client). @@ -38,7 +42,7 @@ connected_p/1, echo/2, echo/3, auth/3, auth/4, - append_chunk/7, append_chunk/8, + append_chunk/6, append_chunk/7, write_chunk/5, write_chunk/6, read_chunk/5, read_chunk/6, trim_chunk/4, trim_chunk/5, @@ -96,30 +100,33 @@ auth(PidSpec, User, Pass) -> auth(PidSpec, User, Pass, Timeout) -> send_sync(PidSpec, {auth, User, Pass}, Timeout). --spec append_chunk(pid(), CoC_namespace::binary(), CoC_locator::integer(), Prefix::binary(), Chunk::binary(), - CSum::binary(), ChunkExtra::non_neg_integer()) -> +-spec append_chunk(pid(), + NS::machi_dt:namespace(), Prefix::machi_dt:file_prefix(), + Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum(), + Opts::machi_dt:append_opts()) -> {ok, Filename::string(), Offset::machi_dt:file_offset()} | {error, machi_client_error_reason()}. -append_chunk(PidSpec, CoC_namespace, CoC_locator, Prefix, Chunk, CSum, ChunkExtra) -> - append_chunk(PidSpec, CoC_namespace, CoC_locator, Prefix, Chunk, CSum, ChunkExtra, ?DEFAULT_TIMEOUT). +append_chunk(PidSpec, NS, Prefix, Chunk, CSum, Opts) -> + append_chunk(PidSpec, NS, Prefix, Chunk, CSum, Opts, ?DEFAULT_TIMEOUT). --spec append_chunk(pid(), CoC_namespace::binary(), CoC_locator::integer(), Prefix::binary(), - Chunk::binary(), CSum::binary(), - ChunkExtra::non_neg_integer(), +-spec append_chunk(pid(), + NS::machi_dt:namespace(), Prefix::machi_dt:file_prefix(), + Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum(), + Opts::machi_dt:append_opts(), Timeout::non_neg_integer()) -> {ok, Filename::string(), Offset::machi_dt:file_offset()} | {error, machi_client_error_reason()}. -append_chunk(PidSpec, CoC_namespace, CoC_locator, Prefix, Chunk, CSum, ChunkExtra, Timeout) -> - send_sync(PidSpec, {append_chunk, CoC_namespace, CoC_locator, Prefix, Chunk, CSum, ChunkExtra}, Timeout). +append_chunk(PidSpec, NS, Prefix, Chunk, CSum, Opts, Timeout) -> + send_sync(PidSpec, {append_chunk, NS, Prefix, Chunk, CSum, Opts}, Timeout). -spec write_chunk(pid(), File::string(), machi_dt:file_offset(), - Chunk::binary(), CSum::binary()) -> + Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum()) -> ok | {error, machi_client_error_reason()}. write_chunk(PidSpec, File, Offset, Chunk, CSum) -> write_chunk(PidSpec, File, Offset, Chunk, CSum, ?DEFAULT_TIMEOUT). -spec write_chunk(pid(), File::string(), machi_dt:file_offset(), - Chunk::binary(), CSum::binary(), Timeout::non_neg_integer()) -> + Chunk::machi_dt:chunk(), CSum::machi_dt:chunk_csum(), Timeout::non_neg_integer()) -> ok | {error, machi_client_error_reason()}. write_chunk(PidSpec, File, Offset, Chunk, CSum, Timeout) -> send_sync(PidSpec, {write_chunk, File, Offset, Chunk, CSum}, Timeout). @@ -128,21 +135,22 @@ write_chunk(PidSpec, File, Offset, Chunk, CSum, Timeout) -> %% {Chunks, TrimmedChunks}}' for live file while it returns `{error, %% trimmed}' if all bytes of the file was trimmed. -spec read_chunk(pid(), File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), - [{flag_no_checksum | flag_no_chunk | needs_trimmed, boolean()}]) -> + machi_dt:read_opts_x()) -> {ok, {Chunks::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), binary()}], Trimmed::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size()}]}} | {error, machi_client_error_reason()}. 
-read_chunk(PidSpec, File, Offset, Size, Options) -> - read_chunk(PidSpec, File, Offset, Size, Options, ?DEFAULT_TIMEOUT). +read_chunk(PidSpec, File, Offset, Size, Opts) -> + read_chunk(PidSpec, File, Offset, Size, Opts, ?DEFAULT_TIMEOUT). -spec read_chunk(pid(), File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), - [{flag_no_checksum | flag_no_chunk | needs_trimmed, boolean()}], + machi_dt:read_opts_x(), Timeout::non_neg_integer()) -> {ok, {Chunks::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size(), binary()}], Trimmed::[{File::string(), machi_dt:file_offset(), machi_dt:chunk_size()}]}} | {error, machi_client_error_reason()}. -read_chunk(PidSpec, File, Offset, Size, Options, Timeout) -> - send_sync(PidSpec, {read_chunk, File, Offset, Size, Options}, Timeout). +read_chunk(PidSpec, File, Offset, Size, Opts0, Timeout) -> + Opts = machi_util:read_opts_default(Opts0), + send_sync(PidSpec, {read_chunk, File, Offset, Size, Opts}, Timeout). %% @doc Trims arbitrary binary range of any file. If a specified range %% has any byte trimmed, it fails and returns `{error, trimmed}'. @@ -281,18 +289,19 @@ do_send_sync2({auth, User, Pass}, #state{sock=Sock}=S) -> Res = {bummer, {X, Y, erlang:get_stacktrace()}}, {Res, S} end; -do_send_sync2({append_chunk, CoC_Namespace, CoC_Locator, - Prefix, Chunk, CSum, ChunkExtra}, +do_send_sync2({append_chunk, NS, Prefix, Chunk, CSum, Opts}, #state{sock=Sock, sock_id=Index, count=Count}=S) -> try ReqID = <>, CSumT = convert_csum_req(CSum, Chunk), - Req = #mpb_appendchunkreq{coc_namespace=CoC_Namespace, - coc_locator=CoC_Locator, + {ChunkExtra, Pref, FailPref} = machi_pb_translate:conv_from_append_opts(Opts), + Req = #mpb_appendchunkreq{namespace=NS, prefix=Prefix, chunk=Chunk, csum=CSumT, - chunk_extra=ChunkExtra}, + chunk_extra=ChunkExtra, + preferred_file_name=Pref, + flag_fail_preferred=FailPref}, R1a = #mpb_request{req_id=ReqID, do_not_alter=1, append_chunk=Req}, Bin1a = machi_pb:encode_mpb_request(R1a), @@ -337,13 +346,13 @@ do_send_sync2({write_chunk, File, Offset, Chunk, CSum}, Res = {bummer, {X, Y, erlang:get_stacktrace()}}, {Res, S#state{count=Count+1}} end; -do_send_sync2({read_chunk, File, Offset, Size, Options}, +do_send_sync2({read_chunk, File, Offset, Size, Opts}, #state{sock=Sock, sock_id=Index, count=Count}=S) -> try ReqID = <>, - FlagNoChecksum = proplists:get_value(no_checksum, Options, false), - FlagNoChunk = proplists:get_value(no_chunk, Options, false), - NeedsTrimmed = proplists:get_value(needs_trimmed, Options, false), + #read_opts{no_checksum=FlagNoChecksum, + no_chunk=FlagNoChunk, + needs_trimmed=NeedsTrimmed} = Opts, Req = #mpb_readchunkreq{chunk_pos=#mpb_chunkpos{file_name=File, offset=Offset, chunk_size=Size}, @@ -436,9 +445,15 @@ do_send_sync2({list_files}, {Res, S#state{count=Count+1}} end. +%% We only convert the checksum types that make sense here: +%% none or client_sha. None of the other types should be sent +%% to us via the PB high protocol. + convert_csum_req(none, Chunk) -> #mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA', csum=machi_util:checksum_chunk(Chunk)}; +convert_csum_req(<<>>, Chunk) -> + convert_csum_req(none, Chunk); convert_csum_req({client_sha, CSumBin}, _Chunk) -> #mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA', csum=CSumBin}. 
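+%% For example, a client that ships its own precomputed checksum (rather
+%% than letting this module compute one) can pass a tagged tuple; this is
+%% a sketch only, reusing the same checksum helper used above:
+%%
+%%     CSum = {client_sha, machi_util:checksum_chunk(Chunk)},
+%%     machi_pb_high_client:append_chunk(Clnt, NS, Prefix, Chunk, CSum,
+%%                                       #append_opts{}).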
@@ -486,12 +501,12 @@ convert_read_chunk_resp(#mpb_readchunkresp{status='OK', chunks=PB_Chunks, trimme csum=#mpb_chunkcsum{type=T, csum=Ck}}) -> %% TODO: cleanup export Csum = <<(machi_pb_translate:conv_to_csum_tag(T)):8, Ck/binary>>, - {File, Offset, Chunk, Csum} + {list_to_binary(File), Offset, Chunk, Csum} end, PB_Chunks), Trimmed = lists:map(fun(#mpb_chunkpos{file_name=File, offset=Offset, chunk_size=Size}) -> - {File, Offset, Size} + {list_to_binary(File), Offset, Size} end, PB_Trimmed), {ok, {Chunks, Trimmed}}; convert_read_chunk_resp(#mpb_readchunkresp{status=Status}) -> diff --git a/src/machi_pb_translate.erl b/src/machi_pb_translate.erl index cc8f728..1fd5f8b 100644 --- a/src/machi_pb_translate.erl +++ b/src/machi_pb_translate.erl @@ -34,7 +34,9 @@ -export([from_pb_request/1, from_pb_response/1, to_pb_request/2, - to_pb_response/3 + to_pb_response/3, + conv_from_append_opts/1, + conv_to_append_opts/1 ]). %% TODO: fixme cleanup @@ -43,95 +45,104 @@ from_pb_request(#mpb_ll_request{ req_id=ReqID, echo=#mpb_echoreq{message=Msg}}) -> - {ReqID, {low_echo, undefined, Msg}}; + {ReqID, {low_skip_wedge, {low_echo, Msg}}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, auth=#mpb_authreq{user=User, password=Pass}}) -> - {ReqID, {low_auth, undefined, User, Pass}}; + {ReqID, {low_skip_wedge, {low_auth, User, Pass}}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, - append_chunk=#mpb_ll_appendchunkreq{ + append_chunk=IR=#mpb_ll_appendchunkreq{ + namespace_version=NSVersion, + namespace=NS_str, + locator=NSLocator, epoch_id=PB_EpochID, - coc_namespace=CoC_Namespace, - coc_locator=CoC_Locator, prefix=Prefix, chunk=Chunk, - csum=#mpb_chunkcsum{type=CSum_type, csum=CSum}, - chunk_extra=ChunkExtra}}) -> + csum=#mpb_chunkcsum{type=CSum_type, csum=CSum}}}) -> + NS = list_to_binary(NS_str), EpochID = conv_to_epoch_id(PB_EpochID), CSum_tag = conv_to_csum_tag(CSum_type), - {ReqID, {low_append_chunk, EpochID, CoC_Namespace, CoC_Locator, - Prefix, Chunk, CSum_tag, CSum, - ChunkExtra}}; + Opts = conv_to_append_opts(IR), + %% NOTE: The tuple position of NSLocator is a bit odd, because EpochID + %% _must_ be in the 4th position (as NSV & NS must be in 2nd & 3rd). 
+ {ReqID, {low_append_chunk, NSVersion, NS, EpochID, NSLocator, + Prefix, Chunk, CSum_tag, CSum, Opts}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, write_chunk=#mpb_ll_writechunkreq{ + namespace_version=NSVersion, + namespace=NS_str, epoch_id=PB_EpochID, chunk=#mpb_chunk{file_name=File, offset=Offset, chunk=Chunk, csum=#mpb_chunkcsum{type=CSum_type, csum=CSum}}}}) -> + NS = list_to_binary(NS_str), EpochID = conv_to_epoch_id(PB_EpochID), CSum_tag = conv_to_csum_tag(CSum_type), - {ReqID, {low_write_chunk, EpochID, File, Offset, Chunk, CSum_tag, CSum}}; + {ReqID, {low_write_chunk, NSVersion, NS, EpochID, File, Offset, Chunk, CSum_tag, CSum}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, read_chunk=#mpb_ll_readchunkreq{ + namespace_version=NSVersion, + namespace=NS_str, epoch_id=PB_EpochID, chunk_pos=ChunkPos, flag_no_checksum=PB_GetNoChecksum, flag_no_chunk=PB_GetNoChunk, flag_needs_trimmed=PB_NeedsTrimmed}}) -> + NS = list_to_binary(NS_str), EpochID = conv_to_epoch_id(PB_EpochID), - Opts = [{no_checksum, conv_to_boolean(PB_GetNoChecksum)}, - {no_chunk, conv_to_boolean(PB_GetNoChunk)}, - {needs_trimmed, conv_to_boolean(PB_NeedsTrimmed)}], + Opts = #read_opts{no_checksum=PB_GetNoChecksum, + no_chunk=PB_GetNoChunk, + needs_trimmed=PB_NeedsTrimmed}, #mpb_chunkpos{file_name=File, offset=Offset, chunk_size=Size} = ChunkPos, - {ReqID, {low_read_chunk, EpochID, File, Offset, Size, Opts}}; + {ReqID, {low_read_chunk, NSVersion, NS, EpochID, File, Offset, Size, Opts}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, trim_chunk=#mpb_ll_trimchunkreq{ + namespace_version=NSVersion, + namespace=NS_str, epoch_id=PB_EpochID, file=File, offset=Offset, size=Size, - trigger_gc=PB_TriggerGC}}) -> + trigger_gc=TriggerGC}}) -> + NS = list_to_binary(NS_str), EpochID = conv_to_epoch_id(PB_EpochID), - TriggerGC = conv_to_boolean(PB_TriggerGC), - {ReqID, {low_trim_chunk, EpochID, File, Offset, Size, TriggerGC}}; + {ReqID, {low_trim_chunk, NSVersion, NS, EpochID, File, Offset, Size, TriggerGC}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, checksum_list=#mpb_ll_checksumlistreq{ - epoch_id=PB_EpochID, file=File}}) -> - EpochID = conv_to_epoch_id(PB_EpochID), - {ReqID, {low_checksum_list, EpochID, File}}; + {ReqID, {low_skip_wedge, {low_checksum_list, File}}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, list_files=#mpb_ll_listfilesreq{ epoch_id=PB_EpochID}}) -> EpochID = conv_to_epoch_id(PB_EpochID), - {ReqID, {low_list_files, EpochID}}; + {ReqID, {low_skip_wedge, {low_list_files, EpochID}}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, wedge_status=#mpb_ll_wedgestatusreq{}}) -> - {ReqID, {low_wedge_status, undefined}}; + {ReqID, {low_skip_wedge, {low_wedge_status}}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, delete_migration=#mpb_ll_deletemigrationreq{ epoch_id=PB_EpochID, file=File}}) -> EpochID = conv_to_epoch_id(PB_EpochID), - {ReqID, {low_delete_migration, EpochID, File}}; + {ReqID, {low_skip_wedge, {low_delete_migration, EpochID, File}}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, trunc_hack=#mpb_ll_trunchackreq{ epoch_id=PB_EpochID, file=File}}) -> EpochID = conv_to_epoch_id(PB_EpochID), - {ReqID, {low_trunc_hack, EpochID, File}}; + {ReqID, {low_skip_wedge, {low_trunc_hack, EpochID, File}}}; from_pb_request(#mpb_ll_request{ req_id=ReqID, proj_gl=#mpb_ll_getlatestepochidreq{type=ProjType}}) -> @@ -172,23 +183,22 @@ from_pb_request(#mpb_request{req_id=ReqID, {ReqID, {high_auth, User, Pass}}; from_pb_request(#mpb_request{req_id=ReqID, append_chunk=IR=#mpb_appendchunkreq{}}) -> - 
#mpb_appendchunkreq{coc_namespace=CoC_namespace, - coc_locator=CoC_locator, + #mpb_appendchunkreq{namespace=NS_str, prefix=Prefix, chunk=Chunk, - csum=CSum, - chunk_extra=ChunkExtra} = IR, + csum=CSum} = IR, + NS = list_to_binary(NS_str), TaggedCSum = make_tagged_csum(CSum, Chunk), - {ReqID, {high_append_chunk, CoC_namespace, CoC_locator, Prefix, Chunk, - TaggedCSum, ChunkExtra}}; + Opts = conv_to_append_opts(IR), + {ReqID, {high_append_chunk, NS, Prefix, Chunk, TaggedCSum, Opts}}; from_pb_request(#mpb_request{req_id=ReqID, write_chunk=IR=#mpb_writechunkreq{}}) -> #mpb_writechunkreq{chunk=#mpb_chunk{file_name=File, offset=Offset, chunk=Chunk, - csum=CSum}} = IR, - TaggedCSum = make_tagged_csum(CSum, Chunk), - {ReqID, {high_write_chunk, File, Offset, Chunk, TaggedCSum}}; + csum=CSumRec}} = IR, + CSum = make_tagged_csum(CSumRec, Chunk), + {ReqID, {high_write_chunk, File, Offset, Chunk, CSum}}; from_pb_request(#mpb_request{req_id=ReqID, read_chunk=IR=#mpb_readchunkreq{}}) -> #mpb_readchunkreq{chunk_pos=#mpb_chunkpos{file_name=File, @@ -197,11 +207,10 @@ from_pb_request(#mpb_request{req_id=ReqID, flag_no_checksum=FlagNoChecksum, flag_no_chunk=FlagNoChunk, flag_needs_trimmed=NeedsTrimmed} = IR, - %% I want MAPS - Options = [{no_checksum, machi_util:int2bool(FlagNoChecksum)}, - {no_chunk, machi_util:int2bool(FlagNoChunk)}, - {needs_trimmed, machi_util:int2bool(NeedsTrimmed)}], - {ReqID, {high_read_chunk, File, Offset, Size, Options}}; + Opts = #read_opts{no_checksum=FlagNoChecksum, + no_chunk=FlagNoChunk, + needs_trimmed=NeedsTrimmed}, + {ReqID, {high_read_chunk, File, Offset, Size, Opts}}; from_pb_request(#mpb_request{req_id=ReqID, trim_chunk=IR=#mpb_trimchunkreq{}}) -> #mpb_trimchunkreq{chunk_pos=#mpb_chunkpos{file_name=File, @@ -265,12 +274,12 @@ from_pb_response(#mpb_ll_response{ chunk=Bytes, csum=#mpb_chunkcsum{type=T,csum=Ck}}) -> Csum = <<(conv_to_csum_tag(T)):8, Ck/binary>>, - {File, Offset, Bytes, Csum} + {list_to_binary(File), Offset, Bytes, Csum} end, PB_Chunks), Trimmed = lists:map(fun(#mpb_chunkpos{file_name=File, offset=Offset, chunk_size=Size}) -> - {File, Offset, Size} + {list_to_binary(File), Offset, Size} end, PB_Trimmed), {ReqID, {ok, {Chunks, Trimmed}}}; _ -> @@ -306,12 +315,16 @@ from_pb_response(#mpb_ll_response{ from_pb_response(#mpb_ll_response{ req_id=ReqID, wedge_status=#mpb_ll_wedgestatusresp{ - epoch_id=PB_EpochID, wedged_flag=PB_Wedged}}) -> + status=Status, + epoch_id=PB_EpochID, wedged_flag=Wedged_p, + namespace_version=NSVersion, namespace=NS_str}}) -> + GeneralStatus = case machi_pb_high_client:convert_general_status_code(Status) of + ok -> ok; + _Else -> {yukky, _Else} + end, EpochID = conv_to_epoch_id(PB_EpochID), - Wedged_p = if PB_Wedged == 1 -> true; - PB_Wedged == 0 -> false - end, - {ReqID, {ok, {Wedged_p, EpochID}}}; + NS = list_to_binary(NS_str), + {ReqID, {GeneralStatus, {Wedged_p, EpochID, NSVersion, NS}}}; from_pb_response(#mpb_ll_response{ req_id=ReqID, delete_migration=#mpb_ll_deletemigrationresp{ @@ -377,90 +390,100 @@ from_pb_response(#mpb_ll_response{ 'OK' -> {ReqID, {ok, Epochs}}; _ -> - {ReqID< machi_pb_high_client:convert_general_status_code(Status)} + {ReqID, machi_pb_high_client:convert_general_status_code(Status)} end. %% No response for proj_kp/kick_projection_reaction %% TODO: move the #mbp_* record making code from %% machi_pb_high_client:do_send_sync() clauses into to_pb_request(). 
-to_pb_request(ReqID, {low_echo, _BogusEpochID, Msg}) -> +to_pb_request(ReqID, {low_skip_wedge, {low_echo, Msg}}) -> #mpb_ll_request{ req_id=ReqID, do_not_alter=2, echo=#mpb_echoreq{message=Msg}}; -to_pb_request(ReqID, {low_auth, _BogusEpochID, User, Pass}) -> +to_pb_request(ReqID, {low_skip_wedge, {low_auth, User, Pass}}) -> #mpb_ll_request{req_id=ReqID, do_not_alter=2, auth=#mpb_authreq{user=User, password=Pass}}; -to_pb_request(ReqID, {low_append_chunk, EpochID, CoC_Namespace, CoC_Locator, - Prefix, Chunk, CSum_tag, CSum, ChunkExtra}) -> +%% NOTE: The tuple position of NSLocator is a bit odd, because EpochID +%% _must_ be in the 4th position (as NSV & NS must be in 2nd & 3rd). +to_pb_request(ReqID, {low_append_chunk, NSVersion, NS, EpochID, NSLocator, + Prefix, Chunk, CSum_tag, CSum, Opts}) -> PB_EpochID = conv_from_epoch_id(EpochID), CSum_type = conv_from_csum_tag(CSum_tag), PB_CSum = #mpb_chunkcsum{type=CSum_type, csum=CSum}, + {ChunkExtra, Pref, FailPref} = conv_from_append_opts(Opts), #mpb_ll_request{req_id=ReqID, do_not_alter=2, append_chunk=#mpb_ll_appendchunkreq{ + namespace_version=NSVersion, + namespace=NS, + locator=NSLocator, epoch_id=PB_EpochID, - coc_namespace=CoC_Namespace, - coc_locator=CoC_Locator, prefix=Prefix, chunk=Chunk, csum=PB_CSum, - chunk_extra=ChunkExtra}}; -to_pb_request(ReqID, {low_write_chunk, EpochID, File, Offset, Chunk, CSum_tag, CSum}) -> + chunk_extra=ChunkExtra, + preferred_file_name=Pref, + flag_fail_preferred=FailPref}}; +to_pb_request(ReqID, {low_write_chunk, NSVersion, NS, EpochID, File, Offset, Chunk, CSum_tag, CSum}) -> PB_EpochID = conv_from_epoch_id(EpochID), CSum_type = conv_from_csum_tag(CSum_tag), PB_CSum = #mpb_chunkcsum{type=CSum_type, csum=CSum}, #mpb_ll_request{req_id=ReqID, do_not_alter=2, write_chunk=#mpb_ll_writechunkreq{ + namespace_version=NSVersion, + namespace=NS, epoch_id=PB_EpochID, chunk=#mpb_chunk{file_name=File, offset=Offset, chunk=Chunk, csum=PB_CSum}}}; -to_pb_request(ReqID, {low_read_chunk, EpochID, File, Offset, Size, Opts}) -> +to_pb_request(ReqID, {low_read_chunk, NSVersion, NS, EpochID, File, Offset, Size, Opts}) -> PB_EpochID = conv_from_epoch_id(EpochID), - FNChecksum = proplists:get_value(no_checksum, Opts, false), - FNChunk = proplists:get_value(no_chunk, Opts, false), - NeedsTrimmed = proplists:get_value(needs_trimmed, Opts, false), + #read_opts{no_checksum=FNChecksum, + no_chunk=FNChunk, + needs_trimmed=NeedsTrimmed} = Opts, #mpb_ll_request{ req_id=ReqID, do_not_alter=2, read_chunk=#mpb_ll_readchunkreq{ - epoch_id=PB_EpochID, - chunk_pos=#mpb_chunkpos{ + namespace_version=NSVersion, + namespace=NS, + epoch_id=PB_EpochID, + chunk_pos=#mpb_chunkpos{ file_name=File, offset=Offset, chunk_size=Size}, - flag_no_checksum=machi_util:bool2int(FNChecksum), - flag_no_chunk=machi_util:bool2int(FNChunk), - flag_needs_trimmed=machi_util:bool2int(NeedsTrimmed)}}; -to_pb_request(ReqID, {low_trim_chunk, EpochID, File, Offset, Size, TriggerGC}) -> + flag_no_checksum=FNChecksum, + flag_no_chunk=FNChunk, + flag_needs_trimmed=NeedsTrimmed}}; +to_pb_request(ReqID, {low_trim_chunk, NSVersion, NS, EpochID, File, Offset, Size, TriggerGC}) -> PB_EpochID = conv_from_epoch_id(EpochID), #mpb_ll_request{req_id=ReqID, do_not_alter=2, trim_chunk=#mpb_ll_trimchunkreq{ + namespace_version=NSVersion, + namespace=NS, epoch_id=PB_EpochID, file=File, offset=Offset, size=Size, trigger_gc=TriggerGC}}; -to_pb_request(ReqID, {low_checksum_list, EpochID, File}) -> - PB_EpochID = conv_from_epoch_id(EpochID), +to_pb_request(ReqID, {low_skip_wedge, 
{low_checksum_list, File}}) -> #mpb_ll_request{req_id=ReqID, do_not_alter=2, checksum_list=#mpb_ll_checksumlistreq{ - epoch_id=PB_EpochID, file=File}}; -to_pb_request(ReqID, {low_list_files, EpochID}) -> +to_pb_request(ReqID, {low_skip_wedge, {low_list_files, EpochID}}) -> PB_EpochID = conv_from_epoch_id(EpochID), #mpb_ll_request{req_id=ReqID, do_not_alter=2, list_files=#mpb_ll_listfilesreq{epoch_id=PB_EpochID}}; -to_pb_request(ReqID, {low_wedge_status, _BogusEpochID}) -> +to_pb_request(ReqID, {low_skip_wedge, {low_wedge_status}}) -> #mpb_ll_request{req_id=ReqID, do_not_alter=2, wedge_status=#mpb_ll_wedgestatusreq{}}; -to_pb_request(ReqID, {low_delete_migration, EpochID, File}) -> +to_pb_request(ReqID, {low_skip_wedge, {low_delete_migration, EpochID, File}}) -> PB_EpochID = conv_from_epoch_id(EpochID), #mpb_ll_request{req_id=ReqID, do_not_alter=2, delete_migration=#mpb_ll_deletemigrationreq{ epoch_id=PB_EpochID, file=File}}; -to_pb_request(ReqID, {low_trunc_hack, EpochID, File}) -> +to_pb_request(ReqID, {low_skip_wedge, {low_trunc_hack, EpochID, File}}) -> PB_EpochID = conv_from_epoch_id(EpochID), #mpb_ll_request{req_id=ReqID, do_not_alter=2, trunc_hack=#mpb_ll_trunchackreq{ @@ -496,15 +519,15 @@ to_pb_response(_ReqID, _, async_no_response=X) -> X; to_pb_response(ReqID, _, {low_error, ErrCode, ErrMsg}) -> make_ll_error_resp(ReqID, ErrCode, ErrMsg); -to_pb_response(ReqID, {low_echo, _BogusEpochID, _Msg}, Resp) -> +to_pb_response(ReqID, {low_skip_wedge, {low_echo, _Msg}}, Resp) -> #mpb_ll_response{ req_id=ReqID, echo=#mpb_echoresp{message=Resp}}; -to_pb_response(ReqID, {low_auth, _, _, _}, __TODO_Resp) -> +to_pb_response(ReqID, {low_skip_wedge, {low_auth, _, _}}, __TODO_Resp) -> #mpb_ll_response{req_id=ReqID, generic=#mpb_errorresp{code=1, msg="AUTH not implemented"}}; -to_pb_response(ReqID, {low_append_chunk, _EID, _N, _L, _Pfx, _Ch, _CST, _CS, _CE}, Resp)-> +to_pb_response(ReqID, {low_append_chunk, _NSV, _NS, _EID, _NSL, _Pfx, _Ch, _CST, _CS, _O}, Resp)-> case Resp of {ok, {Offset, Size, File}} -> Where = #mpb_chunkpos{offset=Offset, @@ -520,11 +543,11 @@ to_pb_response(ReqID, {low_append_chunk, _EID, _N, _L, _Pfx, _Ch, _CST, _CS, _CE _Else -> make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) end; -to_pb_response(ReqID, {low_write_chunk, _EID, _Fl, _Off, _Ch, _CST, _CS},Resp)-> +to_pb_response(ReqID, {low_write_chunk, _NSV, _NS, _EID, _Fl, _Off, _Ch, _CST, _CS},Resp)-> Status = conv_from_status(Resp), #mpb_ll_response{req_id=ReqID, write_chunk=#mpb_ll_writechunkresp{status=Status}}; -to_pb_response(ReqID, {low_read_chunk, _EID, _Fl, _Off, _Sz, _Opts}, Resp)-> +to_pb_response(ReqID, {low_read_chunk, _NSV, _NS, _EID, _Fl, _Off, _Sz, _Opts}, Resp)-> case Resp of {ok, {Chunks, Trimmed}} -> PB_Chunks = lists:map(fun({File, Offset, Bytes, Csum}) -> @@ -551,7 +574,7 @@ to_pb_response(ReqID, {low_read_chunk, _EID, _Fl, _Off, _Sz, _Opts}, Resp)-> _Else -> make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) end; -to_pb_response(ReqID, {low_trim_chunk, _, _, _, _, _}, Resp) -> +to_pb_response(ReqID, {low_trim_chunk, _, _, _, _, _, _, _}, Resp) -> case Resp of ok -> #mpb_ll_response{req_id=ReqID, @@ -559,11 +582,11 @@ to_pb_response(ReqID, {low_trim_chunk, _, _, _, _, _}, Resp) -> {error, _}=Error -> Status = conv_from_status(Error), #mpb_ll_response{req_id=ReqID, - read_chunk=#mpb_ll_trimchunkresp{status=Status}}; + trim_chunk=#mpb_ll_trimchunkresp{status=Status}}; _Else -> make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) end; -to_pb_response(ReqID, 
{low_checksum_list, _EpochID, _File}, Resp) -> +to_pb_response(ReqID, {low_skip_wedge, {low_checksum_list, _File}}, Resp) -> case Resp of {ok, Chunk} -> #mpb_ll_response{req_id=ReqID, @@ -576,7 +599,7 @@ to_pb_response(ReqID, {low_checksum_list, _EpochID, _File}, Resp) -> _Else -> make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) end; -to_pb_response(ReqID, {low_list_files, _EpochID}, Resp) -> +to_pb_response(ReqID, {low_skip_wedge, {low_list_files, _EpochID}}, Resp) -> case Resp of {ok, FileInfo} -> PB_Files = [#mpb_fileinfo{file_size=Size, file_name=Name} || @@ -591,26 +614,28 @@ to_pb_response(ReqID, {low_list_files, _EpochID}, Resp) -> _Else -> make_ll_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) end; -to_pb_response(ReqID, {low_wedge_status, _BogusEpochID}, Resp) -> +to_pb_response(ReqID, {low_skip_wedge, {low_wedge_status}}, Resp) -> case Resp of {error, _}=Error -> Status = conv_from_status(Error), #mpb_ll_response{req_id=ReqID, wedge_status=#mpb_ll_wedgestatusresp{status=Status}}; - {Wedged_p, EpochID} -> - PB_Wedged = conv_from_boolean(Wedged_p), + {Wedged_p, EpochID, NSVersion, NS} -> PB_EpochID = conv_from_epoch_id(EpochID), #mpb_ll_response{req_id=ReqID, wedge_status=#mpb_ll_wedgestatusresp{ status='OK', epoch_id=PB_EpochID, - wedged_flag=PB_Wedged}} + wedged_flag=Wedged_p, + namespace_version=NSVersion, + namespace=NS + }} end; -to_pb_response(ReqID, {low_delete_migration, _EID, _Fl}, Resp)-> +to_pb_response(ReqID, {low_skip_wedge, {low_delete_migration, _EID, _Fl}}, Resp)-> Status = conv_from_status(Resp), #mpb_ll_response{req_id=ReqID, delete_migration=#mpb_ll_deletemigrationresp{status=Status}}; -to_pb_response(ReqID, {low_trunc_hack, _EID, _Fl}, Resp)-> +to_pb_response(ReqID, {low_skip_wedge, {low_trunc_hack, _EID, _Fl}}, Resp)-> Status = conv_from_status(Resp), #mpb_ll_response{req_id=ReqID, trunc_hack=#mpb_ll_trunchackresp{status=Status}}; @@ -691,7 +716,7 @@ to_pb_response(ReqID, {high_auth, _User, _Pass}, _Resp) -> #mpb_response{req_id=ReqID, generic=#mpb_errorresp{code=1, msg="AUTH not implemented"}}; -to_pb_response(ReqID, {high_append_chunk, _CoC_n, _CoC_l, _Prefix, _Chunk, _TSum, _CE}, Resp)-> +to_pb_response(ReqID, {high_append_chunk, _NS, _Prefix, _Chunk, _TSum, _O}, Resp)-> case Resp of {ok, {Offset, Size, File}} -> Where = #mpb_chunkpos{offset=Offset, @@ -707,7 +732,7 @@ to_pb_response(ReqID, {high_append_chunk, _CoC_n, _CoC_l, _Prefix, _Chunk, _TSum _Else -> make_error_resp(ReqID, 66, io_lib:format("err ~p", [_Else])) end; -to_pb_response(ReqID, {high_write_chunk, _File, _Offset, _Chunk, _TaggedCSum}, Resp) -> +to_pb_response(ReqID, {high_write_chunk, _File, _Offset, _Chunk, _CSum}, Resp) -> case Resp of {ok, {_,_,_}} -> %% machi_cr_client returns ok 2-tuple, convert to simple ok. @@ -797,12 +822,12 @@ make_tagged_csum(#mpb_chunkcsum{type='CSUM_TAG_CLIENT_SHA', csum=CSum}, _CB) -> make_ll_error_resp(ReqID, Code, Msg) -> #mpb_ll_response{req_id=ReqID, generic=#mpb_errorresp{code=Code, - msg=Msg}}. + msg=Msg}}. make_error_resp(ReqID, Code, Msg) -> #mpb_response{req_id=ReqID, generic=#mpb_errorresp{code=Code, - msg=Msg}}. + msg=Msg}}. conv_from_epoch_id({Epoch, EpochCSum}) -> #mpb_epochid{epoch_number=Epoch, @@ -962,17 +987,26 @@ conv_from_status(_OOPS) -> io:format(user, "HEY, ~s:~w got ~p\n", [?MODULE, ?LINE, _OOPS]), 'BAD_JOSS'. -conv_to_boolean(undefined) -> - false; -conv_to_boolean(0) -> - false; -conv_to_boolean(N) when is_integer(N) -> - true. 
+conv_from_append_opts(#append_opts{chunk_extra=ChunkExtra, + preferred_file_name=Pref, + flag_fail_preferred=FailPref}) -> + {ChunkExtra, Pref, FailPref}. -conv_from_boolean(false) -> - 0; -conv_from_boolean(true) -> - 1. + +conv_to_append_opts(#mpb_appendchunkreq{ + chunk_extra=ChunkExtra, + preferred_file_name=Pref, + flag_fail_preferred=FailPref}) -> + #append_opts{chunk_extra=ChunkExtra, + preferred_file_name=Pref, + flag_fail_preferred=FailPref}; +conv_to_append_opts(#mpb_ll_appendchunkreq{ + chunk_extra=ChunkExtra, + preferred_file_name=Pref, + flag_fail_preferred=FailPref}) -> + #append_opts{chunk_extra=ChunkExtra, + preferred_file_name=Pref, + flag_fail_preferred=FailPref}. conv_from_projection_v1(#projection_v1{epoch_number=Epoch, epoch_csum=CSum, diff --git a/src/machi_proxy_flu1_client.erl b/src/machi_proxy_flu1_client.erl index e4bc0d2..8f9dcf6 100644 --- a/src/machi_proxy_flu1_client.erl +++ b/src/machi_proxy_flu1_client.erl @@ -22,6 +22,10 @@ %% proxy-process style API for hiding messy details such as TCP %% connection/disconnection with the remote Machi server. %% +%% Please see the "Client API implementation notes" section of {@link machi_flu1_client} +%% for how this module relates to the rest of the client API +%% implementation. +%% %% Machi is intentionally avoiding using distributed Erlang for %% Machi's communication. This design decision makes Erlang-side code %% more difficult & complex, but it's the price to pay for some @@ -57,12 +61,9 @@ %% FLU1 API -export([ %% File API - append_chunk/4, append_chunk/5, - append_chunk/6, append_chunk/7, - append_chunk_extra/5, append_chunk_extra/6, - append_chunk_extra/7, append_chunk_extra/8, - read_chunk/6, read_chunk/7, - checksum_list/3, checksum_list/4, + append_chunk/6, append_chunk/8, + read_chunk/7, read_chunk/8, + checksum_list/2, checksum_list/3, list_files/2, list_files/3, wedge_status/1, wedge_status/2, @@ -80,8 +81,8 @@ quit/1, %% Internal API - write_chunk/5, write_chunk/6, - trim_chunk/5, trim_chunk/6, + write_chunk/7, write_chunk/8, + trim_chunk/6, trim_chunk/7, %% Helpers stop_proxies/1, start_proxies/1 @@ -106,80 +107,39 @@ start_link(#p_srvr{}=I) -> %% @doc Append a chunk (binary- or iolist-style) of data to a file %% with `Prefix'. -append_chunk(PidSpec, EpochID, Prefix, Chunk) -> - append_chunk(PidSpec, EpochID, Prefix, Chunk, infinity). +append_chunk(PidSpec, NSInfo, EpochID, Prefix, Chunk, CSum) -> + append_chunk(PidSpec, NSInfo, EpochID, Prefix, Chunk, CSum, + #append_opts{}, infinity). %% @doc Append a chunk (binary- or iolist-style) of data to a file %% with `Prefix'. -append_chunk(PidSpec, EpochID, Prefix, Chunk, Timeout) -> - append_chunk_extra(PidSpec, EpochID, - ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, - Prefix, Chunk, 0, Timeout). - -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix'. - -append_chunk(PidSpec, EpochID, CoC_Namespace, CoC_Locator, Prefix, Chunk) -> - append_chunk(PidSpec, EpochID, CoC_Namespace, CoC_Locator, Prefix, Chunk, infinity). - -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix'. - -append_chunk(PidSpec, EpochID, CoC_Namespace, CoC_Locator, Prefix, Chunk, Timeout) -> - append_chunk_extra(PidSpec, EpochID, - CoC_Namespace, CoC_Locator, - Prefix, Chunk, 0, Timeout). - -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix'. 
- -append_chunk_extra(PidSpec, EpochID, Prefix, Chunk, ChunkExtra) - when is_integer(ChunkExtra), ChunkExtra >= 0 -> - append_chunk_extra(PidSpec, EpochID, - ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, - Prefix, Chunk, ChunkExtra, infinity). - -%% @doc Append a chunk (binary- or iolist-style) of data to a file -%% with `Prefix'. - -append_chunk_extra(PidSpec, EpochID, Prefix, Chunk, ChunkExtra, Timeout) -> - append_chunk_extra(PidSpec, EpochID, - ?DEFAULT_COC_NAMESPACE, ?DEFAULT_COC_LOCATOR, - Prefix, Chunk, ChunkExtra, Timeout). - -append_chunk_extra(PidSpec, EpochID, CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra) -> - append_chunk_extra(PidSpec, EpochID, CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra, infinity). - -append_chunk_extra(PidSpec, EpochID, CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra, Timeout) -> - gen_server:call(PidSpec, {req, {append_chunk_extra, EpochID, - CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra}}, +append_chunk(PidSpec, NSInfo, EpochID, Prefix, Chunk, CSum, Opts, + Timeout) -> + gen_server:call(PidSpec, {req, {append_chunk, NSInfo, EpochID, + Prefix, Chunk, CSum, Opts, Timeout}}, Timeout). %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. -read_chunk(PidSpec, EpochID, File, Offset, Size, Opts) -> - read_chunk(PidSpec, EpochID, File, Offset, Size, Opts, infinity). +read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, Opts) -> + read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, Opts, infinity). %% @doc Read a chunk of data of size `Size' from `File' at `Offset'. -read_chunk(PidSpec, EpochID, File, Offset, Size, Opts, Timeout) -> - gen_server:call(PidSpec, {req, {read_chunk, EpochID, File, Offset, Size, Opts}}, +read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, Opts, Timeout) -> + gen_server:call(PidSpec, {req, {read_chunk, NSInfo, EpochID, File, Offset, Size, Opts}}, Timeout). %% @doc Fetch the list of chunk checksums for `File'. -checksum_list(PidSpec, EpochID, File) -> - checksum_list(PidSpec, EpochID, File, infinity). +checksum_list(PidSpec, File) -> + checksum_list(PidSpec, File, infinity). %% @doc Fetch the list of chunk checksums for `File'. -checksum_list(PidSpec, EpochID, File, Timeout) -> - gen_server:call(PidSpec, {req, {checksum_list, EpochID, File}}, +checksum_list(PidSpec, File, Timeout) -> + gen_server:call(PidSpec, {req, {checksum_list, File}}, Timeout). %% @doc Fetch the list of all files on the remote FLU. @@ -320,18 +280,18 @@ quit(PidSpec) -> %% @doc Write a chunk (binary- or iolist-style) of data to a file %% with `Prefix' at `Offset'. -write_chunk(PidSpec, EpochID, File, Offset, Chunk) -> - write_chunk(PidSpec, EpochID, File, Offset, Chunk, infinity). +write_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, CSum) -> + write_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, CSum, infinity). %% @doc Write a chunk (binary- or iolist-style) of data to a file %% with `Prefix' at `Offset'. 
-write_chunk(PidSpec, EpochID, File, Offset, Chunk, Timeout) -> - case gen_server:call(PidSpec, {req, {write_chunk, EpochID, File, Offset, Chunk}}, +write_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, CSum, Timeout) -> + case gen_server:call(PidSpec, {req, {write_chunk, NSInfo, EpochID, File, Offset, Chunk, CSum}}, Timeout) of {error, written}=Err -> Size = byte_size(Chunk), - case read_chunk(PidSpec, EpochID, File, Offset, Size, [], Timeout) of + case read_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, undefined, Timeout) of {ok, {[{File, Offset, Chunk2, _}], []}} when Chunk2 == Chunk -> %% See equivalent comment inside write_projection(). ok; @@ -343,15 +303,15 @@ write_chunk(PidSpec, EpochID, File, Offset, Chunk, Timeout) -> end. -trim_chunk(PidSpec, EpochID, File, Offset, Size) -> - trim_chunk(PidSpec, EpochID, File, Offset, Size, infinity). +trim_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size) -> + trim_chunk(PidSpec, NSInfo, EpochID, File, Offset, Size, infinity). %% @doc Write a chunk (binary- or iolist-style) of data to a file %% with `Prefix' at `Offset'. -trim_chunk(PidSpec, EpochID, File, Offset, Chunk, Timeout) -> +trim_chunk(PidSpec, NSInfo, EpochID, File, Offset, Chunk, Timeout) -> gen_server:call(PidSpec, - {req, {trim_chunk, EpochID, File, Offset, Chunk}}, + {req, {trim_chunk, NSInfo, EpochID, File, Offset, Chunk}}, Timeout). %%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -415,24 +375,24 @@ do_req_retry(_Req, 2, Err, S) -> do_req_retry(Req, Depth, _Err, S) -> do_req(Req, Depth + 1, try_connect(disconnect(S))). -make_req_fun({append_chunk_extra, EpochID, CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra}, +make_req_fun({append_chunk, NSInfo, EpochID, + Prefix, Chunk, CSum, Opts, Timeout}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> - fun() -> Mod:append_chunk_extra(Sock, EpochID, CoC_Namespace, CoC_Locator, - Prefix, Chunk, ChunkExtra) + fun() -> Mod:append_chunk(Sock, NSInfo, EpochID, + Prefix, Chunk, CSum, Opts, Timeout) end; -make_req_fun({read_chunk, EpochID, File, Offset, Size, Opts}, +make_req_fun({read_chunk, NSInfo, EpochID, File, Offset, Size, Opts}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> - fun() -> Mod:read_chunk(Sock, EpochID, File, Offset, Size, Opts) end; -make_req_fun({write_chunk, EpochID, File, Offset, Chunk}, + fun() -> Mod:read_chunk(Sock, NSInfo, EpochID, File, Offset, Size, Opts) end; +make_req_fun({write_chunk, NSInfo, EpochID, File, Offset, Chunk, CSum}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> - fun() -> Mod:write_chunk(Sock, EpochID, File, Offset, Chunk) end; -make_req_fun({trim_chunk, EpochID, File, Offset, Size}, + fun() -> Mod:write_chunk(Sock, NSInfo, EpochID, File, Offset, Chunk, CSum) end; +make_req_fun({trim_chunk, NSInfo, EpochID, File, Offset, Size}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> - fun() -> Mod:trim_chunk(Sock, EpochID, File, Offset, Size) end; -make_req_fun({checksum_list, EpochID, File}, + fun() -> Mod:trim_chunk(Sock, NSInfo, EpochID, File, Offset, Size) end; +make_req_fun({checksum_list, File}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> - fun() -> Mod:checksum_list(Sock, EpochID, File) end; + fun() -> Mod:checksum_list(Sock, File) end; make_req_fun({list_files, EpochID}, #state{sock=Sock,i=#p_srvr{proto_mod=Mod}}) -> fun() -> Mod:list_files(Sock, EpochID) end; diff --git a/src/machi_util.erl b/src/machi_util.erl index aa5f070..95a42a5 100644 --- a/src/machi_util.erl +++ b/src/machi_util.erl @@ -49,7 +49,9 @@ %% Other wait_for_death/2, wait_for_life/2, bool2int/1, - int2bool/1 + int2bool/1, + 
read_opts_default/1, + ns_info_default/1 ]). -include("machi.hrl"). @@ -68,12 +70,12 @@ make_regname(Prefix) when is_list(Prefix) -> %% @doc Calculate a config file path, by common convention. --spec make_config_filename(string(), machi_dt:coc_namespace(), machi_dt:coc_locator(), string()) -> +-spec make_config_filename(string(), machi_dt:namespace(), machi_dt:locator(), string()) -> string(). -make_config_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix) -> - Locator_str = int_to_hexstr(CoC_Locator, 32), +make_config_filename(DataDir, NS, NSLocator, Prefix) -> + NSLocator_str = int_to_hexstr(NSLocator, 32), lists:flatten(io_lib:format("~s/config/~s^~s^~s", - [DataDir, Prefix, CoC_Namespace, Locator_str])). + [DataDir, Prefix, NS, NSLocator_str])). %% @doc Calculate a config file path, by common convention. @@ -102,19 +104,19 @@ make_checksum_filename(DataDir, FileName) -> %% @doc Calculate a file data file path, by common convention. --spec make_data_filename(string(), machi_dt:coc_namespace(), machi_dt:coc_locator(), string(), atom()|string()|binary(), integer()|string()) -> +-spec make_data_filename(string(), machi_dt:namespace(), machi_dt:locator(), string(), atom()|string()|binary(), integer()|string()) -> {binary(), string()}. -make_data_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix, SequencerName, FileNum) +make_data_filename(DataDir, NS, NSLocator, Prefix, SequencerName, FileNum) when is_integer(FileNum) -> - Locator_str = int_to_hexstr(CoC_Locator, 32), + NSLocator_str = int_to_hexstr(NSLocator, 32), File = erlang:iolist_to_binary(io_lib:format("~s^~s^~s^~s^~w", - [Prefix, CoC_Namespace, Locator_str, SequencerName, FileNum])), + [Prefix, NS, NSLocator_str, SequencerName, FileNum])), make_data_filename2(DataDir, File); -make_data_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix, SequencerName, String) +make_data_filename(DataDir, NS, NSLocator, Prefix, SequencerName, String) when is_list(String) -> - Locator_str = int_to_hexstr(CoC_Locator, 32), + NSLocator_str = int_to_hexstr(NSLocator, 32), File = erlang:iolist_to_binary(io_lib:format("~s^~s^~s^~s^~s", - [Prefix, CoC_Namespace, Locator_str, SequencerName, string])), + [Prefix, NS, NSLocator_str, SequencerName, string])), make_data_filename2(DataDir, File). make_data_filename2(DataDir, File) -> @@ -154,37 +156,36 @@ is_valid_filename(Filename) -> %% The components will be: %%
%% <ul>
%% <li> Prefix </li>
-%% <li> CoC Namespace </li>
-%% <li> CoC locator </li>
+%% <li> Cluster namespace </li>
+%% <li> Cluster locator </li>
%% <li> UUID </li>
%% <li> Sequence number </li>
%% </ul>
%% %% Invalid filenames will return an empty list. --spec parse_filename( Filename :: string() ) -> {} | {string(), machi_dt:coc_namespace(), machi_dt:coc_locator(), string(), string() }. +-spec parse_filename( Filename :: string() ) -> {} | {string(), machi_dt:namespace(), machi_dt:locator(), string(), string() }. parse_filename(Filename) -> case string:tokens(Filename, "^") of - [Prefix, CoC_NS, CoC_Loc, UUID, SeqNo] -> - {Prefix, CoC_NS, list_to_integer(CoC_Loc), UUID, SeqNo}; - [Prefix, CoC_Loc, UUID, SeqNo] -> + [Prefix, NS, NSLocator, UUID, SeqNo] -> + {Prefix, NS, list_to_integer(NSLocator), UUID, SeqNo}; + [Prefix, NSLocator, UUID, SeqNo] -> %% string:tokens() doesn't consider "foo^^bar" as 3 tokens {sigh} case re:replace(Filename, "[^^]+", "x", [global,{return,binary}]) of <<"x^^x^x^x">> -> - {Prefix, <<"">>, list_to_integer(CoC_Loc), UUID, SeqNo}; + {Prefix, <<"">>, list_to_integer(NSLocator), UUID, SeqNo}; _ -> {} end; _ -> {} end. - %% @doc Read the file size of a config file, which is used as the %% basis for a minimum sequence number. --spec read_max_filenum(string(), machi_dt:coc_namespace(), machi_dt:coc_locator(), string()) -> +-spec read_max_filenum(string(), machi_dt:namespace(), machi_dt:locator(), string()) -> non_neg_integer(). -read_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix) -> - case file:read_file_info(make_config_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix)) of +read_max_filenum(DataDir, NS, NSLocator, Prefix) -> + case file:read_file_info(make_config_filename(DataDir, NS, NSLocator, Prefix)) of {error, enoent} -> 0; {ok, FI} -> @@ -194,11 +195,11 @@ read_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix) -> %% @doc Increase the file size of a config file, which is used as the %% basis for a minimum sequence number. --spec increment_max_filenum(string(), machi_dt:coc_namespace(), machi_dt:coc_locator(), string()) -> +-spec increment_max_filenum(string(), machi_dt:namespace(), machi_dt:locator(), string()) -> ok | {error, term()}. -increment_max_filenum(DataDir, CoC_Namespace, CoC_Locator, Prefix) -> +increment_max_filenum(DataDir, NS, NSLocator, Prefix) -> try - {ok, FH} = file:open(make_config_filename(DataDir, CoC_Namespace, CoC_Locator, Prefix), [append]), + {ok, FH} = file:open(make_config_filename(DataDir, NS, NSLocator, Prefix), [append]), ok = file:write(FH, "x"), ok = file:sync(FH), ok = file:close(FH) @@ -287,12 +288,25 @@ int_to_hexbin(I, I_size) -> checksum_chunk(Chunk) when is_binary(Chunk); is_list(Chunk) -> crypto:hash(sha, Chunk). +convert_csum_tag(A) when is_atom(A)-> + A; +convert_csum_tag(?CSUM_TAG_NONE) -> + ?CSUM_TAG_NONE_ATOM; +convert_csum_tag(?CSUM_TAG_CLIENT_SHA) -> + ?CSUM_TAG_CLIENT_SHA_ATOM; +convert_csum_tag(?CSUM_TAG_SERVER_SHA) -> + ?CSUM_TAG_SERVER_SHA_ATOM; +convert_csum_tag(?CSUM_TAG_SERVER_REGEN_SHA) -> + ?CSUM_TAG_SERVER_REGEN_SHA_ATOM. + %% @doc Create a tagged checksum make_tagged_csum(none) -> <>; +make_tagged_csum(<<>>) -> + <>; make_tagged_csum({Tag, CSum}) -> - make_tagged_csum(Tag, CSum). + make_tagged_csum(convert_csum_tag(Tag), CSum). %% @doc Makes tagged csum. Each meanings are: %% none / ?CSUM_TAG_NONE @@ -360,7 +374,7 @@ wait_for_death(Pid, Iters) when is_pid(Pid) -> false -> ok; true -> - timer:sleep(1), + timer:sleep(10), wait_for_death(Pid, Iters-1) end. @@ -431,3 +445,17 @@ bool2int(true) -> 1; bool2int(false) -> 0. int2bool(0) -> false; int2bool(I) when is_integer(I) -> true. 
+ +read_opts_default(#read_opts{}=NSInfo) -> + NSInfo; +read_opts_default(A) when A == 'undefined'; A == 'noopt'; A == 'none' -> + #read_opts{}; +read_opts_default(A) when is_atom(A) -> + #read_opts{}. + +ns_info_default(#ns_info{}=NSInfo) -> + NSInfo; +ns_info_default(A) when is_atom(A) -> + #ns_info{}. + + diff --git a/src/machi_yessir_client.erl b/src/machi_yessir_client.erl index 1bdef2a..8721824 100644 --- a/src/machi_yessir_client.erl +++ b/src/machi_yessir_client.erl @@ -22,6 +22,8 @@ -module(machi_yessir_client). +-ifdef(TODO_refactoring_deferred). + -include("machi.hrl"). -include("machi_projection.hrl"). @@ -30,7 +32,7 @@ append_chunk/4, append_chunk/5, append_chunk_extra/5, append_chunk_extra/6, read_chunk/5, read_chunk/6, - checksum_list/3, checksum_list/4, + checksum_list/2, checksum_list/3, list_files/2, list_files/3, wedge_status/1, wedge_status/2, @@ -173,7 +175,7 @@ read_chunk(_Host, _TcpPort, EpochID, File, Offset, Size) %% @doc Fetch the list of chunk checksums for `File'. -checksum_list(#yessir{name=Name,chunk_size=ChunkSize}, _EpochID, File) -> +checksum_list(#yessir{name=Name,chunk_size=ChunkSize}, File) -> case get({Name,offset,File}) of undefined -> {error, no_such_file}; @@ -187,10 +189,10 @@ checksum_list(#yessir{name=Name,chunk_size=ChunkSize}, _EpochID, File) -> %% @doc Fetch the list of chunk checksums for `File'. -checksum_list(_Host, _TcpPort, EpochID, File) -> +checksum_list(_Host, _TcpPort, File) -> Sock = connect(#p_srvr{proto_mod=?MODULE}), try - checksum_list(Sock, EpochID, File) + checksum_list(Sock, File) after disconnect(Sock) end. @@ -509,3 +511,5 @@ disconnect(#yessir{name=Name}) -> %% =INFO REPORT==== 17-May-2015::18:57:52 === %% Repair success: tail a of [a] finished ap_mode repair ID {a,{1431,856671,140404}}: ok %% Stats [{t_in_files,0},{t_in_chunks,10413},{t_in_bytes,682426368},{t_out_files,0},{t_out_chunks,10413},{t_out_bytes,682426368},{t_bad_chunks,0},{t_elapsed_seconds,1.591}] + +-endif. 
% TODO_refactoring_deferred diff --git a/test/machi_admin_util_test.erl b/test/machi_admin_util_test.erl index 1ebbbf3..cd4d813 100644 --- a/test/machi_admin_util_test.erl +++ b/test/machi_admin_util_test.erl @@ -44,6 +44,8 @@ verify_file_checksums_test2() -> TcpPort = 32958, DataDir = "./data", W_props = [{initial_wedged, false}], + NSInfo = undefined, + NoCSum = <<>>, try machi_test_util:start_flu_package(verify1_flu, TcpPort, DataDir, W_props), @@ -51,8 +53,8 @@ verify_file_checksums_test2() -> try Prefix = <<"verify_prefix">>, NumChunks = 10, - [{ok, _} = ?FLU_C:append_chunk(Sock1, ?DUMMY_PV1_EPOCH, - Prefix, <>) || + [{ok, _} = ?FLU_C:append_chunk(Sock1, NSInfo, ?DUMMY_PV1_EPOCH, + Prefix, <>, NoCSum) || X <- lists:seq(1, NumChunks)], {ok, [{_FileSize,File}]} = ?FLU_C:list_files(Sock1, ?DUMMY_PV1_EPOCH), ?assertEqual({ok, []}, diff --git a/test/machi_ap_repair_eqc.erl b/test/machi_ap_repair_eqc.erl index 7d87d35..55bc082 100644 --- a/test/machi_ap_repair_eqc.erl +++ b/test/machi_ap_repair_eqc.erl @@ -118,7 +118,10 @@ append(CRIndex, Bin, #state{verbose=V}=S) -> {_SimSelfName, C} = lists:nth(CRIndex, CRList), Prefix = <<"pre">>, Len = byte_size(Bin), - Res = (catch machi_cr_client:append_chunk(C, Prefix, Bin, {sec(1), sec(1)})), + NSInfo = #ns_info{}, + NoCSum = <<>>, + Opts1 = #append_opts{}, + Res = (catch machi_cr_client:append_chunk(C, NSInfo, Prefix, Bin, NoCSum, Opts1, sec(1))), case Res of {ok, {_Off, Len, _FileName}=Key} -> case ets:insert_new(?WRITTEN_TAB, {Key, Bin}) of @@ -190,6 +193,7 @@ change_partition(Partition, %% Don't wait for stable chain, tick will be executed on demand %% in append oprations _ = tick(S), + ok. %% Generators @@ -427,7 +431,7 @@ confirm_result(_T) -> 0 -> ok; _ -> DumpFailed = filename:join(DirBase, "dump-failed-" ++ Suffix), - ?V("Dump failed ETS tab to: ~w~n", [DumpFailed]), + ?V("Dump failed ETS tab to: ~s~n", [DumpFailed]), ets:tab2file(?FAILED_TAB, DumpFailed) end, case Critical of @@ -450,14 +454,14 @@ confirm_written(C) -> assert_chunk(C, {Off, Len, FileName}=Key, Bin) -> %% TODO: This probably a bug, read_chunk respnds with filename of `string()' type - FileNameStr = binary_to_list(FileName), %% TODO : Use CSum instead of binary (after disuccsion about CSum is calmed down?) - case (catch machi_cr_client:read_chunk(C, FileName, Off, Len, [], sec(3))) of - {ok, {[{FileNameStr, Off, Bin, _}], []}} -> + NSInfo = undefined, + case (catch machi_cr_client:read_chunk(C, NSInfo, FileName, Off, Len, undefined, sec(3))) of + {ok, {[{FileName, Off, Bin, _}], []}} -> ok; {ok, Got} -> ?V("read_chunk got different binary for Key=~p~n", [Key]), - ?V(" Expected: ~p~n", [{[{FileNameStr, Off, Bin, <<"CSum-NYI">>}], []}]), + ?V(" Expected: ~p~n", [{[{FileName, Off, Bin, <<"CSum-NYI">>}], []}]), ?V(" Got: ~p~n", [Got]), {error, different_binary}; {error, Reason} -> @@ -479,7 +483,7 @@ eqc_verbose() -> os:getenv("EQC_VERBOSE") =:= "true". eqc_timeout(Default) -> - PropTimeout = case os:getenv("EQC_TIMEOUT") of + PropTimeout = case os:getenv("EQC_TIME") of false -> Default; V -> list_to_integer(V) end, @@ -554,8 +558,10 @@ wait_until_stable(ExpectedChainState, FLUNames, MgrNames, Retries, Verbose) -> FCList = fc_list(), wait_until_stable1(ExpectedChainState, TickFun, FCList, Retries, Verbose). -wait_until_stable1(_ExpectedChainState, _TickFun, FCList, 0, _Verbose) -> +wait_until_stable1(ExpectedChainState, _TickFun, FCList, 0, _Verbose) -> + ?V(" [ERROR] _ExpectedChainState ~p\n", [ExpectedChainState]), ?V(" [ERROR] wait_until_stable failed.... 
: ~p~n", [chain_state(FCList)]), + ?V(" [ERROR] norm.... : ~p~n", [normalize_chain_state(chain_state(FCList))]), false; wait_until_stable1(ExpectedChainState, TickFun, FCList, Reties, Verbose) -> [TickFun(3, 0, 100) || _ <- lists:seq(1, 3)], diff --git a/test/machi_chain_manager1_test.erl b/test/machi_chain_manager1_test.erl index 02010ff..80296d2 100644 --- a/test/machi_chain_manager1_test.erl +++ b/test/machi_chain_manager1_test.erl @@ -401,7 +401,7 @@ nonunanimous_setup_and_fix_test2() -> Mb, ChainName, TheEpoch_3, ap_mode, MembersDict4, []), Advance(), - {ok, {true, _}} = ?FLU_PC:wedge_status(Proxy_a), + {ok, {true, _,_,_}} = ?FLU_PC:wedge_status(Proxy_a), {_, _, TheEpoch_4} = ?MGR:trigger_react_to_env(Mb), {_, _, TheEpoch_4} = ?MGR:trigger_react_to_env(Mc), [{ok, #projection_v1{upi=[b,c], repairing=[]}} = @@ -451,9 +451,9 @@ nonunanimous_setup_and_fix_test2() -> #p_srvr{name=NameA} = hd(Ps), {ok,_}=machi_flu_psup:start_flu_package(NameA, TcpPort+1, hd(Dirs), Opts), Advance(), - {ok, {true, _}} = ?FLU_PC:wedge_status(Proxy_a), - {ok, {false, EpochID_8}} = ?FLU_PC:wedge_status(Proxy_b), - {ok, {false, EpochID_8}} = ?FLU_PC:wedge_status(Proxy_c), + {ok, {true, _,_,_}} = ?FLU_PC:wedge_status(Proxy_a), + {ok, {false, EpochID_8,_,_}} = ?FLU_PC:wedge_status(Proxy_b), + {ok, {false, EpochID_8,_,_}} = ?FLU_PC:wedge_status(Proxy_c), [{ok, #projection_v1{upi=[b,c], repairing=[]}} = ?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- tl(Proxies)], @@ -463,8 +463,8 @@ nonunanimous_setup_and_fix_test2() -> ok = machi_flu_psup:stop_flu_package(a), Advance(), machi_flu1_test:clean_up_data_dir(hd(Dirs)), - {ok, {false, _}} = ?FLU_PC:wedge_status(Proxy_b), - {ok, {false, _}} = ?FLU_PC:wedge_status(Proxy_c), + {ok, {false, _,_,_}} = ?FLU_PC:wedge_status(Proxy_b), + {ok, {false, _,_,_}} = ?FLU_PC:wedge_status(Proxy_c), %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% io:format("STEP: Add a to the chain again (a is stopped).\n", []), @@ -482,9 +482,9 @@ nonunanimous_setup_and_fix_test2() -> {ok,_}=machi_flu_psup:start_flu_package(NameA, TcpPort+1, hd(Dirs), Opts), Advance(), - {ok, {false, {TheEpoch10,_}}} = ?FLU_PC:wedge_status(Proxy_a), - {ok, {false, {TheEpoch10,_}}} = ?FLU_PC:wedge_status(Proxy_b), - {ok, {false, {TheEpoch10,_}}} = ?FLU_PC:wedge_status(Proxy_c), + {ok, {false, {TheEpoch10,_},_,_}} = ?FLU_PC:wedge_status(Proxy_a), + {ok, {false, {TheEpoch10,_},_,_}} = ?FLU_PC:wedge_status(Proxy_b), + {ok, {false, {TheEpoch10,_},_,_}} = ?FLU_PC:wedge_status(Proxy_c), [{ok, #projection_v1{upi=[b,c], repairing=[a]}} = ?FLU_PC:read_latest_projection(Pxy, private) || Pxy <- Proxies], ok diff --git a/test/machi_cr_client_test.erl b/test/machi_cr_client_test.erl index 5179fc8..29e1d13 100644 --- a/test/machi_cr_client_test.erl +++ b/test/machi_cr_client_test.erl @@ -107,6 +107,8 @@ smoke_test2() -> try Prefix = <<"pre">>, Chunk1 = <<"yochunk">>, + NSInfo = undefined, + NoCSum = <<>>, Host = "localhost", PortBase = 64454, Os = [{ignore_stability_time, true}, {active_mode, false}], @@ -114,91 +116,92 @@ smoke_test2() -> %% Whew ... ok, now start some damn tests. 
{ok, C1} = machi_cr_client:start_link([P || {_,P}<-orddict:to_list(D)]), - machi_cr_client:append_chunk(C1, Prefix, Chunk1), + machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum), {ok, {Off1,Size1,File1}} = - machi_cr_client:append_chunk(C1, Prefix, Chunk1), - Chunk1_badcs = {<>, Chunk1}, + machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum), + BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, "foo")}, {error, bad_checksum} = - machi_cr_client:append_chunk(C1, Prefix, Chunk1_badcs), + machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, BadCSum), {ok, {[{_, Off1, Chunk1, _}], []}} = - machi_cr_client:read_chunk(C1, File1, Off1, Size1, []), + machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, Size1, undefined), {ok, PPP} = machi_flu1_client:read_latest_projection(Host, PortBase+0, private), %% Verify that the client's CR wrote to all of them. [{ok, {[{_, Off1, Chunk1, _}], []}} = machi_flu1_client:read_chunk( - Host, PortBase+X, EpochID, File1, Off1, Size1, []) || + Host, PortBase+X, NSInfo, EpochID, File1, Off1, Size1, undefined) || X <- [0,1,2] ], %% Test read repair: Manually write to head, then verify that %% read-repair fixes all. FooOff1 = Off1 + (1024*1024), [{error, not_written} = machi_flu1_client:read_chunk( - Host, PortBase+X, EpochID, - File1, FooOff1, Size1, []) || X <- [0,1,2] ], - ok = machi_flu1_client:write_chunk(Host, PortBase+0, EpochID, - File1, FooOff1, Chunk1), - {ok, {[{_, FooOff1, Chunk1, _}], []}} = - machi_flu1_client:read_chunk(Host, PortBase+0, EpochID, - File1, FooOff1, Size1, []), - {ok, {[{_, FooOff1, Chunk1, _}], []}} = - machi_cr_client:read_chunk(C1, File1, FooOff1, Size1, []), + Host, PortBase+X, NSInfo, EpochID, + File1, FooOff1, Size1, undefined) || X <- [0,1,2] ], + ok = machi_flu1_client:write_chunk(Host, PortBase+0, NSInfo, EpochID, + File1, FooOff1, Chunk1, NoCSum), + {ok, {[{File1, FooOff1, Chunk1, _}=_YY], []}} = + machi_flu1_client:read_chunk(Host, PortBase+0, NSInfo, EpochID, + File1, FooOff1, Size1, undefined), + {ok, {[{File1, FooOff1, Chunk1, _}], []}} = + machi_cr_client:read_chunk(C1, NSInfo, File1, FooOff1, Size1, undefined), [?assertMatch({X,{ok, {[{_, FooOff1, Chunk1, _}], []}}}, {X,machi_flu1_client:read_chunk( - Host, PortBase+X, EpochID, - File1, FooOff1, Size1, [])}) + Host, PortBase+X, NSInfo, EpochID, + File1, FooOff1, Size1, undefined)}) || X <- [0,1,2] ], %% Test read repair: Manually write to middle, then same checking. 
FooOff2 = Off1 + (2*1024*1024), Chunk2 = <<"Middle repair chunk">>, Size2 = size(Chunk2), - ok = machi_flu1_client:write_chunk(Host, PortBase+1, EpochID, - File1, FooOff2, Chunk2), - {ok, {[{_, FooOff2, Chunk2, _}], []}} = - machi_cr_client:read_chunk(C1, File1, FooOff2, Size2, []), - [{X,{ok, {[{_, FooOff2, Chunk2, _}], []}}} = + ok = machi_flu1_client:write_chunk(Host, PortBase+1, NSInfo, EpochID, + File1, FooOff2, Chunk2, NoCSum), + {ok, {[{File1, FooOff2, Chunk2, _}], []}} = + machi_cr_client:read_chunk(C1, NSInfo, File1, FooOff2, Size2, undefined), + [{X,{ok, {[{File1, FooOff2, Chunk2, _}], []}}} = {X,machi_flu1_client:read_chunk( - Host, PortBase+X, EpochID, - File1, FooOff2, Size2, [])} || X <- [0,1,2] ], + Host, PortBase+X, NSInfo, EpochID, + File1, FooOff2, Size2, undefined)} || X <- [0,1,2] ], %% Misc API smoke & minor regression checks - {error, bad_arg} = machi_cr_client:read_chunk(C1, <<"no">>, - 999999999, 1, []), - {ok, {[{_,Off1,Chunk1,_}, {_,FooOff1,Chunk1,_}, {_,FooOff2,Chunk2,_}], + {error, bad_arg} = machi_cr_client:read_chunk(C1, NSInfo, <<"no">>, + 999999999, 1, undefined), + {ok, {[{File1,Off1,Chunk1,_}, {File1,FooOff1,Chunk1,_}, {File1,FooOff2,Chunk2,_}], []}} = - machi_cr_client:read_chunk(C1, File1, Off1, 88888888, []), + machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, 88888888, undefined), %% Checksum list return value is a primitive binary(). {ok, KludgeBin} = machi_cr_client:checksum_list(C1, File1), true = is_binary(KludgeBin), {error, bad_arg} = machi_cr_client:checksum_list(C1, <<"!!!!">>), -io:format(user, "\nFiles = ~p\n", [machi_cr_client:list_files(C1)]), + io:format(user, "\nFiles = ~p\n", [machi_cr_client:list_files(C1)]), %% Exactly one file right now, e.g., %% {ok,[{2098202,<<"pre^b144ef13-db4d-4c9f-96e7-caff02dc754f^1">>}]} {ok, [_]} = machi_cr_client:list_files(C1), - %% Go back and test append_chunk_extra() and write_chunk() + %% Go back and test append_chunk() + extra and write_chunk() Chunk10 = <<"It's a different chunk!">>, Size10 = byte_size(Chunk10), Extra10 = 5, + Opts1 = #append_opts{chunk_extra=Extra10*Size10}, {ok, {Off10,Size10,File10}} = - machi_cr_client:append_chunk_extra(C1, Prefix, Chunk10, - Extra10 * Size10), + machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk10, + NoCSum, Opts1), {ok, {[{_, Off10, Chunk10, _}], []}} = - machi_cr_client:read_chunk(C1, File10, Off10, Size10, []), + machi_cr_client:read_chunk(C1, NSInfo, File10, Off10, Size10, undefined), [begin Offx = Off10 + (Seq * Size10), %% TODO: uncomment written/not_written enforcement is available. - %% {error,not_written} = machi_cr_client:read_chunk(C1, File10, + %% {error,not_written} = machi_cr_client:read_chunk(C1, NSInfo, File10, %% Offx, Size10), {ok, {Offx,Size10,File10}} = - machi_cr_client:write_chunk(C1, File10, Offx, Chunk10), + machi_cr_client:write_chunk(C1, NSInfo, File10, Offx, Chunk10, NoCSum), {ok, {[{_, Offx, Chunk10, _}], []}} = - machi_cr_client:read_chunk(C1, File10, Offx, Size10, []) + machi_cr_client:read_chunk(C1, NSInfo, File10, Offx, Size10, undefined) end || Seq <- lists:seq(1, Extra10)], {ok, {Off11,Size11,File11}} = - machi_cr_client:append_chunk(C1, Prefix, Chunk10), + machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk10, NoCSum), %% %% Double-check that our reserved extra bytes were really honored! 
%% true = (Off11 > (Off10 + (Extra10 * Size10))), io:format(user, "\nFiles = ~p\n", [machi_cr_client:list_files(C1)]), @@ -224,6 +227,8 @@ witness_smoke_test2() -> try Prefix = <<"pre">>, Chunk1 = <<"yochunk">>, + NSInfo = undefined, + NoCSum = <<>>, Host = "localhost", PortBase = 64444, Os = [{ignore_stability_time, true}, {active_mode, false}, @@ -233,14 +238,15 @@ witness_smoke_test2() -> %% Whew ... ok, now start some damn tests. {ok, C1} = machi_cr_client:start_link([P || {_,P}<-orddict:to_list(D)]), - {ok, _} = machi_cr_client:append_chunk(C1, Prefix, Chunk1), + {ok, _} = machi_cr_client:append_chunk(C1, NSInfo, Prefix, + Chunk1, NoCSum), {ok, {Off1,Size1,File1}} = - machi_cr_client:append_chunk(C1, Prefix, Chunk1), - Chunk1_badcs = {<>, Chunk1}, + machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum), + BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, "foo")}, {error, bad_checksum} = - machi_cr_client:append_chunk(C1, Prefix, Chunk1_badcs), + machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, BadCSum), {ok, {[{_, Off1, Chunk1, _}], []}} = - machi_cr_client:read_chunk(C1, File1, Off1, Size1, []), + machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, Size1, undefined), %% Stop 'b' and let the chain reset. ok = machi_flu_psup:stop_flu_package(b), @@ -253,24 +259,25 @@ witness_smoke_test2() -> %% Let's wedge OurWitness and see what happens: timeout/partition. #p_srvr{name=WitName, address=WitA, port=WitP} = orddict:fetch(OurWitness, D), - {ok, {false, EpochID2}} = machi_flu1_client:wedge_status(WitA, WitP), + {ok, {false, EpochID2,_,_}} = machi_flu1_client:wedge_status(WitA, WitP), machi_flu1:wedge_myself(WitName, EpochID2), case machi_flu1_client:wedge_status(WitA, WitP) of - {ok, {true, EpochID2}} -> + {ok, {true, EpochID2,_,_}} -> ok; - {ok, {false, EpochID2}} -> + {ok, {false, EpochID2,_,_}} -> %% This is racy. Work around it by sleeping a while. timer:sleep(6*1000), - {ok, {true, EpochID2}} = + {ok, {true, EpochID2,_,_}} = machi_flu1_client:wedge_status(WitA, WitP) end, %% Chunk1 is still readable: not affected by wedged witness head. {ok, {[{_, Off1, Chunk1, _}], []}} = - machi_cr_client:read_chunk(C1, File1, Off1, Size1, []), + machi_cr_client:read_chunk(C1, NSInfo, File1, Off1, Size1, undefined), %% But because the head is wedged, an append will fail. {error, partition} = - machi_cr_client:append_chunk(C1, Prefix, Chunk1, 1*1000), + machi_cr_client:append_chunk(C1, NSInfo, Prefix, Chunk1, NoCSum, + #append_opts{}, 1*1000), %% The witness's wedge status should cause timeout/partition %% for write_chunk also. @@ -279,7 +286,7 @@ witness_smoke_test2() -> File10 = File1, Offx = Off1 + (1 * Size10), {error, partition} = - machi_cr_client:write_chunk(C1, File10, Offx, Chunk10, 1*1000), + machi_cr_client:write_chunk(C1, NSInfo, File10, Offx, Chunk10, NoCSum, 1*1000), ok after diff --git a/test/machi_file_proxy_eqc.erl b/test/machi_file_proxy_eqc.erl index dd36787..c7a50e2 100644 --- a/test/machi_file_proxy_eqc.erl +++ b/test/machi_file_proxy_eqc.erl @@ -35,10 +35,14 @@ %% EUNIT TEST DEFINITION eqc_test_() -> - {timeout, 60, + PropTimeout = case os:getenv("EQC_TIME") of + false -> 30; + V -> list_to_integer(V) + end, + {timeout, PropTimeout*2 + 30, {spawn, [ - ?_assertEqual(true, eqc:quickcheck(eqc:testing_time(30, ?QC_OUT(prop_ok())))) + ?_assertEqual(true, eqc:quickcheck(eqc:testing_time(PropTimeout, ?QC_OUT(prop_ok())))) ] }}. 
diff --git a/test/machi_file_proxy_test.erl b/test/machi_file_proxy_test.erl index a04d880..605abe7 100644 --- a/test/machi_file_proxy_test.erl +++ b/test/machi_file_proxy_test.erl @@ -119,7 +119,7 @@ multiple_chunks_read_test_() -> ?_assertEqual(ok, machi_file_proxy:trim(Pid, 0, 1, false)), ?_assertMatch({ok, {[], [{"test", 0, 1}]}}, machi_file_proxy:read(Pid, 0, 1, - [{needs_trimmed, true}])), + #read_opts{needs_trimmed=true})), ?_assertMatch({ok, "test", _}, machi_file_proxy:append(Pid, random_binary(0, 1024))), ?_assertEqual(ok, machi_file_proxy:write(Pid, 10000, <<"fail">>)), ?_assertEqual(ok, machi_file_proxy:write(Pid, 20000, <<"fail">>)), @@ -134,7 +134,7 @@ multiple_chunks_read_test_() -> machi_file_proxy:read(Pid, 1024, 530000)), ?_assertMatch({ok, {[{"test", 1, _, _}], [{"test", 0, 1}]}}, machi_file_proxy:read(Pid, 0, 1024, - [{needs_trimmed, true}])) + #read_opts{needs_trimmed=true})) ] end}. diff --git a/test/machi_flu1_test.erl b/test/machi_flu1_test.erl index a1d098a..74490d2 100644 --- a/test/machi_flu1_test.erl +++ b/test/machi_flu1_test.erl @@ -91,6 +91,8 @@ flu_smoke_test() -> Host = "localhost", TcpPort = 12957, DataDir = "./data", + NSInfo = undefined, + NoCSum = <<>>, Prefix = <<"prefix!">>, BadPrefix = BadFile = "no/good", W_props = [{initial_wedged, false}], @@ -98,32 +100,31 @@ flu_smoke_test() -> try Msg = "Hello, world!", Msg = ?FLU_C:echo(Host, TcpPort, Msg), - {error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, - ?DUMMY_PV1_EPOCH, - "does-not-exist"), - {error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, - ?DUMMY_PV1_EPOCH, BadFile), + {error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort,"does-not-exist"), + {error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, BadFile), {ok, []} = ?FLU_C:list_files(Host, TcpPort, ?DUMMY_PV1_EPOCH), - {ok, {false, _}} = ?FLU_C:wedge_status(Host, TcpPort), + {ok, {false, _,_,_}} = ?FLU_C:wedge_status(Host, TcpPort), Chunk1 = <<"yo!">>, - {ok, {Off1,Len1,File1}} = ?FLU_C:append_chunk(Host, TcpPort, + {ok, {Off1,Len1,File1}} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH, - Prefix, Chunk1), - {ok, {[{_, Off1, Chunk1, _}], _}} = ?FLU_C:read_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH, - File1, Off1, Len1, []), - {ok, KludgeBin} = ?FLU_C:checksum_list(Host, TcpPort, - ?DUMMY_PV1_EPOCH, File1), + Prefix, Chunk1, NoCSum), + {ok, {[{_, Off1, Chunk1, _}], _}} = ?FLU_C:read_chunk(Host, TcpPort, + NSInfo, ?DUMMY_PV1_EPOCH, + File1, Off1, Len1, + noopt), + {ok, KludgeBin} = ?FLU_C:checksum_list(Host, TcpPort, File1), true = is_binary(KludgeBin), - {error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort, + {error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH, - BadPrefix, Chunk1), + BadPrefix, Chunk1, NoCSum), {ok, [{_,File1}]} = ?FLU_C:list_files(Host, TcpPort, ?DUMMY_PV1_EPOCH), Len1 = size(Chunk1), {error, not_written} = ?FLU_C:read_chunk(Host, TcpPort, - ?DUMMY_PV1_EPOCH, - File1, Off1*983829323, Len1, []), + NSInfo, ?DUMMY_PV1_EPOCH, + File1, Off1*983829323, Len1, + noopt), %% XXX FIXME %% %% This is failing because the read extends past the end of the file. @@ -132,19 +133,22 @@ flu_smoke_test() -> %% of the read will cause it to fail. 
%% %% {error, partial_read} = ?FLU_C:read_chunk(Host, TcpPort, - %% ?DUMMY_PV1_EPOCH, + %% NSInfo, ?DUMMY_PV1_EPOCH, %% File1, Off1, Len1*9999), - {ok, {Off1b,Len1b,File1b}} = ?FLU_C:append_chunk(Host, TcpPort, + {ok, {Off1b,Len1b,File1b}} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH, - Prefix, Chunk1), + Prefix, Chunk1,NoCSum), Extra = 42, - {ok, {Off1c,Len1c,File1c}} = ?FLU_C:append_chunk_extra(Host, TcpPort, + Opts1 = #append_opts{chunk_extra=Extra}, + {ok, {Off1c,Len1c,File1c}} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH, - Prefix, Chunk1, Extra), + Prefix, Chunk1, NoCSum, + Opts1, infinity), {ok, {Off1d,Len1d,File1d}} = ?FLU_C:append_chunk(Host, TcpPort, + NSInfo, ?DUMMY_PV1_EPOCH, - Prefix, Chunk1), + Prefix, Chunk1,NoCSum), if File1b == File1c, File1c == File1d -> true = (Off1c == Off1b + Len1b), true = (Off1d == Off1c + Len1c + Extra); @@ -152,27 +156,44 @@ flu_smoke_test() -> exit(not_mandatory_but_test_expected_same_file_fixme) end, - Chunk1_cs = {<>, Chunk1}, - {ok, {Off1e,Len1e,File1e}} = ?FLU_C:append_chunk(Host, TcpPort, - ?DUMMY_PV1_EPOCH, - Prefix, Chunk1_cs), - Chunk2 = <<"yo yo">>, Len2 = byte_size(Chunk2), Off2 = ?MINIMUM_OFFSET + 77, File2 = "smoke-whole-file^^0^1^1", - ok = ?FLU_C:write_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH, - File2, Off2, Chunk2), - {error, bad_arg} = ?FLU_C:write_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH, - BadFile, Off2, Chunk2), + ok = ?FLU_C:write_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH, + File2, Off2, Chunk2, NoCSum), + {error, bad_arg} = ?FLU_C:write_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH, + BadFile, Off2, Chunk2, NoCSum), {ok, {[{_, Off2, Chunk2, _}], _}} = - ?FLU_C:read_chunk(Host, TcpPort, ?DUMMY_PV1_EPOCH, File2, Off2, Len2, []), + ?FLU_C:read_chunk(Host, TcpPort, NSInfo, ?DUMMY_PV1_EPOCH, File2, Off2, Len2, noopt), {error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort, - ?DUMMY_PV1_EPOCH, - "no!!", Off2, Len2, []), + NSInfo, ?DUMMY_PV1_EPOCH, + "no!!", Off2, Len2, noopt), {error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort, - ?DUMMY_PV1_EPOCH, - BadFile, Off2, Len2, []), + NSInfo, ?DUMMY_PV1_EPOCH, + BadFile, Off2, Len2, noopt), + + %% Make a connected socket. + Sock1 = ?FLU_C:connect(#p_srvr{address=Host, port=TcpPort}), + + %% Let's test some cluster version enforcement. + Good_EpochNum = 0, + Good_NSVersion = 0, + Good_NS = <<>>, + {ok, {false, {Good_EpochNum,_}, Good_NSVersion, GoodNS}} = + ?FLU_C:wedge_status(Sock1), + NS_good = #ns_info{version=Good_NSVersion, name=Good_NS}, + {ok, {[{_, Off2, Chunk2, _}], _}} = + ?FLU_C:read_chunk(Sock1, NS_good, ?DUMMY_PV1_EPOCH, + File2, Off2, Len2, noopt), + NS_bad_version = #ns_info{version=1, name=Good_NS}, + NS_bad_name = #ns_info{version=Good_NSVersion, name= <<"foons">>}, + {error, bad_epoch} = + ?FLU_C:read_chunk(Sock1, NS_bad_version, ?DUMMY_PV1_EPOCH, + File2, Off2, Len2, noopt), + {error, bad_arg} = + ?FLU_C:read_chunk(Sock1, NS_bad_name, ?DUMMY_PV1_EPOCH, + File2, Off2, Len2, noopt), %% We know that File1 still exists. Pretend that we've done a %% migration and exercise the delete_migration() API. @@ -189,8 +210,7 @@ flu_smoke_test() -> {error, bad_arg} = ?FLU_C:trunc_hack(Host, TcpPort, ?DUMMY_PV1_EPOCH, BadFile), - ok = ?FLU_C:quit(?FLU_C:connect(#p_srvr{address=Host, - port=TcpPort})) + ok = ?FLU_C:quit(Sock1) after machi_test_util:stop_flu_package() end. 
@@ -203,7 +223,7 @@ flu_projection_smoke_test() ->
     try
         [ok = flu_projection_common(Host, TcpPort, T) ||
             T <- [public, private] ]
-%%      , {ok, {false, EpochID1}} = ?FLU_C:wedge_status(Host, TcpPort),
+%%      , {ok, {false, EpochID1,_,_}} = ?FLU_C:wedge_status(Host, TcpPort),
 %%      io:format(user, "EpochID1 ~p\n", [EpochID1])
     after
         machi_test_util:stop_flu_package()
@@ -238,13 +258,15 @@ bad_checksum_test() ->
     DataDir = "./data.bct",
     Opts = [{initial_wedged, false}],
     {_,_,_} = machi_test_util:start_flu_package(projection_test_flu, TcpPort, DataDir, Opts),
+    NSInfo = undefined,
     try
         Prefix = <<"some prefix">>,
         Chunk1 = <<"yo yo yo">>,
-        Chunk1_badcs = {<>, Chunk1},
-        {error, bad_checksum} = ?FLU_C:append_chunk(Host, TcpPort,
+        BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, ".................")},
+        {error, bad_checksum} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo,
                                                     ?DUMMY_PV1_EPOCH,
-                                                    Prefix, Chunk1_badcs),
+                                                    Prefix,
+                                                    Chunk1, BadCSum),
         ok
     after
         machi_test_util:stop_flu_package()
@@ -256,6 +278,8 @@ witness_test() ->
     DataDir = "./data.witness",
     Opts = [{initial_wedged, false}, {witness_mode, true}],
     {_,_,_} = machi_test_util:start_flu_package(projection_test_flu, TcpPort, DataDir, Opts),
+    NSInfo = undefined,
+    NoCSum = <<>>,
     try
         Prefix = <<"some prefix">>,
         Chunk1 = <<"yo yo yo">>,
@@ -268,15 +292,14 @@ witness_test() ->
         {ok, EpochID1} = ?FLU_C:get_latest_epochid(Host, TcpPort, private),
         %% Witness-protected ops all fail
-        {error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort, EpochID1,
-                                               Prefix, Chunk1),
+        {error, bad_arg} = ?FLU_C:append_chunk(Host, TcpPort, NSInfo, EpochID1,
+                                               Prefix, Chunk1, NoCSum),
         File = <<"foofile">>,
-        {error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort, EpochID1,
-                                             File, 9999, 9999, []),
-        {error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, EpochID1,
-                                                File),
+        {error, bad_arg} = ?FLU_C:read_chunk(Host, TcpPort, NSInfo, EpochID1,
+                                             File, 9999, 9999, noopt),
+        {error, bad_arg} = ?FLU_C:checksum_list(Host, TcpPort, File),
         {error, bad_arg} = ?FLU_C:list_files(Host, TcpPort, EpochID1),
-        {ok, {false, EpochID1}} = ?FLU_C:wedge_status(Host, TcpPort),
+        {ok, {false, EpochID1,_,_}} = ?FLU_C:wedge_status(Host, TcpPort),
         {ok, _} = ?FLU_C:get_latest_epochid(Host, TcpPort, public),
         {ok, _} = ?FLU_C:read_latest_projection(Host, TcpPort, public),
         {error, not_written} = ?FLU_C:read_projection(Host, TcpPort,
diff --git a/test/machi_flu_psup_test.erl b/test/machi_flu_psup_test.erl
index 378ff74..bc43437 100644
--- a/test/machi_flu_psup_test.erl
+++ b/test/machi_flu_psup_test.erl
@@ -84,20 +84,23 @@ partial_stop_restart2() ->
     WedgeStatus = fun({_,#p_srvr{address=Addr, port=TcpPort}}) ->
                           machi_flu1_client:wedge_status(Addr, TcpPort)
                   end,
+    NSInfo = undefined,
     Append = fun({_,#p_srvr{address=Addr, port=TcpPort}}, EpochID) ->
+                     NoCSum = <<>>,
                      machi_flu1_client:append_chunk(Addr, TcpPort,
-                                                    EpochID,
-                                                    <<"prefix">>, <<"data">>)
+                                                    NSInfo, EpochID,
+                                                    <<"prefix">>,
+                                                    <<"data">>, NoCSum)
              end,
     try
         [Start(P) || P <- Ps],
-        [{ok, {true, _}} = WedgeStatus(P) || P <- Ps], % all are wedged
+        [{ok, {true, _,_,_}} = WedgeStatus(P) || P <- Ps], % all are wedged
         [{error,wedged} = Append(P, ?DUMMY_PV1_EPOCH) || P <- Ps], % all are wedged
         [machi_chain_manager1:set_chain_members(ChMgr, Dict) || ChMgr <- ChMgrs ],
-        {ok, {false, EpochID1}} = WedgeStatus(hd(Ps)),
-        [{ok, {false, EpochID1}} = WedgeStatus(P) || P <- Ps], % *not* wedged
+        {ok, {false, EpochID1,_,_}} = WedgeStatus(hd(Ps)),
+        [{ok, {false, EpochID1,_,_}} = WedgeStatus(P) || P <- Ps], % *not* wedged
         [{ok,_} = Append(P, EpochID1) || P <- Ps], % *not* wedged
         {ok, {_,_,File1}} = Append(hd(Ps), EpochID1),
@@ -123,9 +126,9 @@ partial_stop_restart2() ->
         Epoch_m = Proj_m#projection_v1.epoch_number,
         %% Confirm that all FLUs are *not* wedged, with correct proj & epoch
         Proj_mCSum = Proj_m#projection_v1.epoch_csum,
-        [{ok, {false, {Epoch_m, Proj_mCSum}}} = WedgeStatus(P) || % *not* wedged
+        [{ok, {false, {Epoch_m, Proj_mCSum},_,_}} = WedgeStatus(P) || % *not* wedged
            P <- Ps],
-        {ok, {false, EpochID1}} = WedgeStatus(hd(Ps)),
+        {ok, {false, EpochID1,_,_}} = WedgeStatus(hd(Ps)),
         [{ok,_} = Append(P, EpochID1) || P <- Ps], % *not* wedged
         %% Stop all but 'a'.
@@ -145,10 +148,10 @@ partial_stop_restart2() ->
         {error, wedged} = Append(hd(Ps), EpochID1),
         {_, #p_srvr{address=Addr_a, port=TcpPort_a}} = hd(Ps),
         {error, wedged} = machi_flu1_client:read_chunk(
-                            Addr_a, TcpPort_a, ?DUMMY_PV1_EPOCH,
-                            <<>>, 99999999, 1, []),
-        {error, wedged} = machi_flu1_client:checksum_list(
-                            Addr_a, TcpPort_a, ?DUMMY_PV1_EPOCH, <<>>),
+                            Addr_a, TcpPort_a, NSInfo, ?DUMMY_PV1_EPOCH,
+                            <<>>, 99999999, 1, undefined),
+        {error, bad_arg} = machi_flu1_client:checksum_list(
+                            Addr_a, TcpPort_a, <<>>),
         %% list_files() is permitted despite wedged status
         {ok, _} = machi_flu1_client:list_files(
                     Addr_a, TcpPort_a, ?DUMMY_PV1_EPOCH),
@@ -157,7 +160,7 @@ partial_stop_restart2() ->
         {now_using,_,Epoch_n} = machi_chain_manager1:trigger_react_to_env(
                                   hd(ChMgrs)),
         true = (Epoch_n > Epoch_m),
-        {ok, {false, EpochID3}} = WedgeStatus(hd(Ps)),
+        {ok, {false, EpochID3,_,_}} = WedgeStatus(hd(Ps)),
         %% The file we're assigned should be different with the epoch change.
         {ok, {_,_,File3}} = Append(hd(Ps), EpochID3),
         true = (File1 /= File3),
diff --git a/test/machi_pb_high_client_test.erl b/test/machi_pb_high_client_test.erl
index 16b125c..68df0c9 100644
--- a/test/machi_pb_high_client_test.erl
+++ b/test/machi_pb_high_client_test.erl
@@ -24,6 +24,7 @@
 -ifdef(TEST).
 -ifndef(PULSE).

+-include("machi.hrl").
 -include("machi_pb.hrl").
 -include("machi_projection.hrl").
 -include_lib("eunit/include/eunit.hrl").
@@ -55,17 +56,18 @@ smoke_test2() ->
         %% a separate test module? Or separate test func?
         {error, _} = ?C:auth(Clnt, "foo", "bar"),
-        CoC_n = "", % CoC_namespace (not implemented)
-        CoC_l = 0,  % CoC_locator (not implemented)
         Prefix = <<"prefix">>,
         Chunk1 = <<"Hello, chunk!">>,
+        NS = "",
+        NoCSum = <<>>,
+        Opts1 = #append_opts{},
         {ok, {Off1, Size1, File1}} =
-            ?C:append_chunk(Clnt, CoC_n, CoC_l, Prefix, Chunk1, none, 0),
+            ?C:append_chunk(Clnt, NS, Prefix, Chunk1, NoCSum, Opts1),
         true = is_binary(File1),
         Chunk2 = "It's another chunk",
         CSum2 = {client_sha, machi_util:checksum_chunk(Chunk2)},
         {ok, {Off2, Size2, File2}} =
-            ?C:append_chunk(Clnt, CoC_n, CoC_l, Prefix, Chunk2, CSum2, 1024),
+            ?C:append_chunk(Clnt, NS, Prefix, Chunk2, CSum2, Opts1),
         Chunk3 = ["This is a ", <<"test,">>, 32, [["Hello, world!"]]],
         File3 = File2,
         Off3 = Off2 + iolist_size(Chunk2),
@@ -76,9 +78,9 @@ smoke_test2() ->
                  {iolist_to_binary(Chunk2), File2, Off2, Size2},
                  {iolist_to_binary(Chunk3), File3, Off3, Size3}],
         [begin
-             File = binary_to_list(Fl),
+             File = Fl,
              ?assertMatch({ok, {[{File, Off, Ch, _}], []}},
-                          ?C:read_chunk(Clnt, Fl, Off, Sz, []))
+                          ?C:read_chunk(Clnt, Fl, Off, Sz, undefined))
          end || {Ch, Fl, Off, Sz} <- Reads],
         {ok, KludgeBin} = ?C:checksum_list(Clnt, File1),
@@ -102,16 +104,16 @@ smoke_test2() ->
          end || {_Ch, Fl, Off, Sz} <- Reads],
         [begin
              {ok, {[], Trimmed}} =
-                 ?C:read_chunk(Clnt, Fl, Off, Sz, [{needs_trimmed, true}]),
-             Filename = binary_to_list(Fl),
+                 ?C:read_chunk(Clnt, Fl, Off, Sz, #read_opts{needs_trimmed=true}),
+             Filename = Fl,
              ?assertEqual([{Filename, Off, Sz}], Trimmed)
          end || {_Ch, Fl, Off, Sz} <- Reads],
         LargeBytes = binary:copy(<<"x">>, 1024*1024),
         LBCsum = {client_sha, machi_util:checksum_chunk(LargeBytes)},
         {ok, {Offx, Sizex, Filex}} =
-            ?C:append_chunk(Clnt, CoC_n, CoC_l,
-                            Prefix, LargeBytes, LBCsum, 0),
+            ?C:append_chunk(Clnt, NS,
+                            Prefix, LargeBytes, LBCsum, Opts1),
         ok = ?C:trim_chunk(Clnt, Filex, Offx, Sizex),
         %% Make sure everything was trimmed
@@ -128,7 +130,7 @@ smoke_test2() ->
         [begin
              {error, trimmed} =
-                 ?C:read_chunk(Clnt, Fl, Off, Sz, [])
+                 ?C:read_chunk(Clnt, Fl, Off, Sz, undefined)
          end || {_Ch, Fl, Off, Sz} <- Reads],
         ok
     after
diff --git a/test/machi_proxy_flu1_client_test.erl b/test/machi_proxy_flu1_client_test.erl
index b8556b7..7f8dcce 100644
--- a/test/machi_proxy_flu1_client_test.erl
+++ b/test/machi_proxy_flu1_client_test.erl
@@ -36,6 +36,8 @@ api_smoke_test() ->
     DataDir = "./data.api_smoke_flu",
     W_props = [{active_mode, false},{initial_wedged, false}],
     Prefix = <<"prefix">>,
+    NSInfo = undefined,
+    NoCSum = <<>>,
     try
         {[I], _, _} = machi_test_util:start_flu_package(
@@ -43,35 +45,42 @@ api_smoke_test() ->
         {ok, Prox1} = ?MUT:start_link(I),
         try
             FakeEpoch = ?DUMMY_PV1_EPOCH,
-            [{ok, {_,_,_}} = ?MUT:append_chunk(Prox1,
-                                               FakeEpoch, Prefix, <<"data">>,
-                                               infinity) || _ <- lists:seq(1,5)],
+            [{ok, {_,_,_}} = ?MUT:append_chunk(
+                               Prox1, NSInfo, FakeEpoch,
+                               Prefix, <<"data">>, NoCSum) ||
+                _ <- lists:seq(1,5)],
             %% Stop the FLU, what happens?
             machi_test_util:stop_flu_package(),
-            [{error,partition} = ?MUT:append_chunk(Prox1,
+            [{error,partition} = ?MUT:append_chunk(Prox1, NSInfo,
                                                    FakeEpoch, Prefix, <<"data-stopped1">>,
-                                                   infinity) || _ <- lists:seq(1,3)],
+                                                   NoCSum) || _ <- lists:seq(1,3)],
             %% Start the FLU again, we should be able to do stuff immediately
             machi_test_util:start_flu_package(RegName, TcpPort, DataDir,
                                               [no_cleanup|W_props]),
             MyChunk = <<"my chunk data">>,
             {ok, {MyOff,MySize,MyFile}} =
-                ?MUT:append_chunk(Prox1, FakeEpoch, Prefix, MyChunk,
-                                  infinity),
-            {ok, {[{_, MyOff, MyChunk, _}], []}} =
-                ?MUT:read_chunk(Prox1, FakeEpoch, MyFile, MyOff, MySize, []),
-            MyChunk2 = <<"my chunk data, yeah, again">>,
+                ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch, Prefix, MyChunk,
+                                  NoCSum),
+            {ok, {[{_, MyOff, MyChunk, _MyChunkCSUM}], []}} =
+                ?MUT:read_chunk(Prox1, NSInfo, FakeEpoch, MyFile, MyOff, MySize, undefined),
+            MyChunk2_parts = [<<"my chunk ">>, "data", <<", yeah, again">>],
+            MyChunk2 = iolist_to_binary(MyChunk2_parts),
+            Opts1 = #append_opts{chunk_extra=4242},
             {ok, {MyOff2,MySize2,MyFile2}} =
-                ?MUT:append_chunk_extra(Prox1, FakeEpoch, Prefix,
-                                        MyChunk2, 4242, infinity),
-            {ok, {[{_, MyOff2, MyChunk2, _}], []}} =
-                ?MUT:read_chunk(Prox1, FakeEpoch, MyFile2, MyOff2, MySize2, []),
-            MyChunk_badcs = {<>, MyChunk},
-            {error, bad_checksum} = ?MUT:append_chunk(Prox1, FakeEpoch,
-                                                      Prefix, MyChunk_badcs),
-            {error, bad_checksum} = ?MUT:write_chunk(Prox1, FakeEpoch,
-                                                     <<"foo-file^^0^1^1">>, 99832,
-                                                     MyChunk_badcs),
+                ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch, Prefix,
+                                  MyChunk2_parts, NoCSum, Opts1, infinity),
+            [{ok, {[{_, MyOff2, MyChunk2, _}], []}} =
+                 ?MUT:read_chunk(Prox1, NSInfo, FakeEpoch, MyFile2, MyOff2, MySize2, DefaultOptions) ||
+                DefaultOptions <- [undefined, noopt, none, any_atom_at_all] ],
+
+            BadCSum = {?CSUM_TAG_CLIENT_SHA, crypto:hash(sha, "...................")},
+            {error, bad_checksum} = ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch,
+                                                      Prefix, MyChunk, BadCSum),
+            {error, bad_checksum} = ?MUT:write_chunk(Prox1, NSInfo, FakeEpoch,
+                                                     MyFile2,
+                                                     MyOff2 + size(MyChunk2),
+                                                     MyChunk, BadCSum,
+                                                     infinity),
             %% Put kick_projection_reaction() in the middle of the test so
             %% that any problems with its async nature will (hopefully)
@@ -80,9 +89,9 @@ api_smoke_test() ->
             %% Alright, now for the rest of the API, whee
             BadFile = <<"no-such-file">>,
-            {error, bad_arg} = ?MUT:checksum_list(Prox1, FakeEpoch, BadFile),
+            {error, bad_arg} = ?MUT:checksum_list(Prox1, BadFile),
             {ok, [_|_]} = ?MUT:list_files(Prox1, FakeEpoch),
-            {ok, {false, _}} = ?MUT:wedge_status(Prox1),
+            {ok, {false, _,_,_}} = ?MUT:wedge_status(Prox1),
             {ok, {0, _SomeCSum}} = ?MUT:get_latest_epochid(Prox1, public),
             {ok, #projection_v1{epoch_number=0}} =
                 ?MUT:read_latest_projection(Prox1, public),
@@ -111,6 +120,8 @@ flu_restart_test2() ->
     TcpPort = 17125,
     DataDir = "./data.api_smoke_flu2",
     W_props = [{initial_wedged, false}, {active_mode, false}],
+    NSInfo = undefined,
+    NoCSum = <<>>,
     try
         {[I], _, _} = machi_test_util:start_flu_package(
@@ -120,9 +131,8 @@ flu_restart_test2() ->
         {ok, Prox1} = ?MUT:start_link(I),
         try
             FakeEpoch = ?DUMMY_PV1_EPOCH,
            Data = <<"data!">>,
            Dataxx = <<"Fake!">>,
-            {ok, {Off1,Size1,File1}} = ?MUT:append_chunk(Prox1,
-                                           FakeEpoch, <<"prefix">>, Data,
-                                           infinity),
+            {ok, {Off1,Size1,File1}} = ?MUT:append_chunk(Prox1, NSInfo,
+                                           FakeEpoch, <<"prefix">>, Data, NoCSum),
            P_a = #p_srvr{name=a, address="localhost", port=6622},
            P1 = machi_projection:new(1, RegName, [P_a], [], [RegName], [], []),
            P1xx = P1#projection_v1{dbg2=["dbg2 changes are ok"]},
@@ -146,6 +156,7 @@ flu_restart_test2() ->
            %% makes the code a bit convoluted. (No LFE or
            %% Elixir macros here, alas, they'd be useful.)
+            AppendOpts1 = #append_opts{chunk_extra=42},
            ExpectedOps =
                [
                 fun(run) -> ?assertEqual({ok, EpochID}, ?MUT:get_epoch_id(Prox1)),
@@ -227,35 +238,37 @@ flu_restart_test2() ->
                    (stop) -> ?MUT:get_all_projections(Prox1, private)
                 end,
                 fun(run) -> {ok, {_,_,_}} =
-                                ?MUT:append_chunk(Prox1, FakeEpoch,
-                                                  <<"prefix">>, Data, infinity),
+                                ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch,
+                                                  <<"prefix">>, Data, NoCSum),
                             ok;
                    (line) -> io:format("line ~p, ", [?LINE]);
-                   (stop) -> ?MUT:append_chunk(Prox1, FakeEpoch,
-                                               <<"prefix">>, Data, infinity)
+                   (stop) -> ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch,
+                                               <<"prefix">>, Data, NoCSum)
                 end,
                 fun(run) -> {ok, {_,_,_}} =
-                                ?MUT:append_chunk_extra(Prox1, FakeEpoch,
-                                                        <<"prefix">>, Data, 42, infinity),
+                                ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch,
+                                                  <<"prefix">>, Data, NoCSum,
+                                                  AppendOpts1, infinity),
                             ok;
                    (line) -> io:format("line ~p, ", [?LINE]);
-                   (stop) -> ?MUT:append_chunk_extra(Prox1, FakeEpoch,
-                                                     <<"prefix">>, Data, 42, infinity)
+                   (stop) -> ?MUT:append_chunk(Prox1, NSInfo, FakeEpoch,
+                                               <<"prefix">>, Data, NoCSum,
+                                               AppendOpts1, infinity)
                 end,
                 fun(run) -> {ok, {[{_, Off1, Data, _}], []}} =
-                                ?MUT:read_chunk(Prox1, FakeEpoch,
-                                                File1, Off1, Size1, []),
+                                ?MUT:read_chunk(Prox1, NSInfo, FakeEpoch,
+                                                File1, Off1, Size1, undefined),
                             ok;
                    (line) -> io:format("line ~p, ", [?LINE]);
-                   (stop) -> ?MUT:read_chunk(Prox1, FakeEpoch,
-                                             File1, Off1, Size1, [])
+                   (stop) -> ?MUT:read_chunk(Prox1, NSInfo, FakeEpoch,
+                                             File1, Off1, Size1, undefined)
                 end,
                 fun(run) -> {ok, KludgeBin} =
-                                ?MUT:checksum_list(Prox1, FakeEpoch, File1),
+                                ?MUT:checksum_list(Prox1, File1),
                             true = is_binary(KludgeBin),
                             ok;
                    (line) -> io:format("line ~p, ", [?LINE]);
-                   (stop) -> ?MUT:checksum_list(Prox1, FakeEpoch, File1)
+                   (stop) -> ?MUT:checksum_list(Prox1, File1)
                 end,
                 fun(run) -> {ok, _} =
                                 ?MUT:list_files(Prox1, FakeEpoch),
@@ -271,21 +284,21 @@ flu_restart_test2() ->
                 end,
                 fun(run) -> ok =
-                                ?MUT:write_chunk(Prox1, FakeEpoch, File1, Off1,
-                                                 Data, infinity),
+                                ?MUT:write_chunk(Prox1, NSInfo, FakeEpoch, File1, Off1,
+                                                 Data, NoCSum, infinity),
                             ok;
                    (line) -> io:format("line ~p, ", [?LINE]);
-                   (stop) -> ?MUT:write_chunk(Prox1, FakeEpoch, File1, Off1,
-                                              Data, infinity)
+                   (stop) -> ?MUT:write_chunk(Prox1, NSInfo, FakeEpoch, File1, Off1,
+                                              Data, NoCSum, infinity)
                 end,
                 fun(run) -> {error, written} =
-                                ?MUT:write_chunk(Prox1, FakeEpoch, File1, Off1,
-                                                 Dataxx, infinity),
+                                ?MUT:write_chunk(Prox1, NSInfo, FakeEpoch, File1, Off1,
+                                                 Dataxx, NoCSum, infinity),
                             ok;
                    (line) -> io:format("line ~p, ", [?LINE]);
-                   (stop) -> ?MUT:write_chunk(Prox1, FakeEpoch, File1, Off1,
-                                              Dataxx, infinity)
+                   (stop) -> ?MUT:write_chunk(Prox1, NSInfo, FakeEpoch, File1, Off1,
+                                              Dataxx, NoCSum, infinity)
                 end
                ],
diff --git a/test/machi_test_util.erl b/test/machi_test_util.erl
index ff908b7..70b02af 100644
--- a/test/machi_test_util.erl
+++ b/test/machi_test_util.erl
@@ -83,7 +83,7 @@ stop_machi_sup() ->
         undefined -> ok;
         Pid ->
             catch exit(whereis(machi_sup), normal),
-            machi_util:wait_for_death(Pid, 30)
+            machi_util:wait_for_death(Pid, 100)
     end.

 clean_up(FluInfo) ->
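[Editor's note, not part of the patch] Taken together, these test changes track a single client API reshuffle: appends, writes, and reads now carry a namespace argument and an explicit per-chunk checksum; the old append_chunk_extra calls are folded into append_chunk with an #append_opts{} record plus a timeout; read options become a #read_opts{} record (or an atom such as noopt/undefined for the defaults); checksum_list no longer takes an epoch; and wedge_status returns the namespace version and name alongside the wedge flag and epoch ID. The sketch below restates that calling convention in one place. It is illustrative only: the module name, the wrapper function, and the argument values are hypothetical, while the machi_flu1_client calls, arities, and record fields are the ones exercised by the tests above.

    -module(new_api_sketch).
    -export([append_then_read/4]).
    -include("machi.hrl"). %% #append_opts{}, #read_opts{} (assumed location, as in the tests)

    %% NSInfo may be 'undefined' (default namespace) or an #ns_info{} record;
    %% an empty binary checksum asks the server to compute one.
    append_then_read(Host, TcpPort, EpochID, Prefix) ->
        NSInfo = undefined,
        NoCSum = <<>>,
        Chunk  = <<"example bytes">>,
        %% Plain append: namespace + epoch + prefix + chunk + checksum.
        {ok, {Off, Len, File}} =
            machi_flu1_client:append_chunk(Host, TcpPort, NSInfo, EpochID,
                                           Prefix, Chunk, NoCSum),
        %% Append with per-call options (what append_chunk_extra used to do).
        Opts = #append_opts{chunk_extra=1024},
        {ok, {_Off2, _Len2, _File2}} =
            machi_flu1_client:append_chunk(Host, TcpPort, NSInfo, EpochID,
                                           Prefix, Chunk, NoCSum,
                                           Opts, infinity),
        %% Reads carry the namespace too; 'noopt' selects default read options.
        {ok, {[{_File, Off, Chunk, _CSum}], _Trimmed}} =
            machi_flu1_client:read_chunk(Host, TcpPort, NSInfo, EpochID,
                                         File, Off, Len, noopt),
        %% checksum_list/3 no longer takes an epoch argument.
        {ok, _ChecksumBin} =
            machi_flu1_client:checksum_list(Host, TcpPort, File),
        ok.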