diff --git a/src/stasis/concurrentHash.c b/src/stasis/concurrentHash.c
index 3c83997..e90ec87 100644
--- a/src/stasis/concurrentHash.c
+++ b/src/stasis/concurrentHash.c
@@ -1,8 +1,356 @@
-/*
- * concurrenthash.c
- *
- *  Created on: Oct 15, 2009
- *      Author: sears
+/**
+  concurrenthash.c
+
+  @file implementation of a concurrent, fixed-size hashtable
+
+
+============================
+Discussion of implementation
+============================
+
+Before reading the rest of this (giant) comment, you probably should read the
+implementation, as it is shorter, and the text below assumes an understanding
+of the API and the general idea behind the algorithm. If you just want to use
+the hashtable (or haven't yet looked at it), you should read concurrentHash.h
+for a discussion of this data structure's API and non-standard concurrency
+primitives.
+
+This concurrent hash table implementation completely avoids the need for
+global latches, but is incapable of resizing itself at runtime. It is based
+upon three ideas:
+
+ - Islands, which we use to implement the NEAR primitive from navigational
+   databases of lore.
+
+ - Modular arithmetic and two's complement, which we use to precisely define
+   the concepts of "before" and "after" that the island implementation relies
+   upon.
+
+ - Lock (latch) crabbing and linearizability, which form the basis of the
+   argument that the concurrency control protocol is both live and safe.
+
+Implementing NEAR with islands
+==============================
+
+Ignoring wrapping and locking for a moment, inserting, looking up and
+removing an element from the hash table can be summarized as follows.
+Let us call each sequence of non-null elements in the hash table an
+"island". When looking up an entry in the hashtable, we start at the element
+the key hashes to. If this element matches, we return it. If it is null,
+then we return null. Otherwise, we repeat the process at the next entry. It
+is easy to see that we will find an element that hashes to position N iff it
+is in the same island as N, and at a position that is >= N.
+
+This trick is extremely old and general-purpose; navigational databases
+actually implemented the primitives described here, and allowed applications
+to request that some key K be placed NEAR some other key K', then look up K
+using the key K'.
+
+Insertion in this scheme is straightforward. We simply start at entry N,
+then move to N+1, N+2 and so on until we find a null entry. We store the
+element in this entry and return. Clearly, the entry is in the same island
+as N, and is in a position >= N.
+
+Removing elements is more complicated. When removing an element from an
+island, we will either replace it with a null, or replace it with a copy of an
+element that resides in a different bucket, then delete the entry from the
+second bucket.
+
+Replacing an element with null breaks an island. Say we are removing C:
+
+null,A,B,C,D,E,F,null -> null,A,B,null,D,E,F,null
+
+This is OK to do as long as D, E and F do not hash to the positions of A, B
+or C. Otherwise, they will no longer be discoverable.
+
+The alternative is to replace C with some other element from the same island,
+say E, and then apply the same algorithm to delete E:
+
+null,A,B,C,D,E,F,null -> null,A,B,E,D,?,F,null -> ...
+
+This is OK to do as long as E hashes to the position of A, B or C.
+
+These two observations lead to the following algorithm. Say we need to
+remove an element at position P:
+
+Keep checking the elements at positions P+1, P+2 and so on; call the position
+currently being examined P'. Stop when either:
+
+ A. We discover a null element (at position P'), or
+ B. We discover an element (at position P') whose key hashes to a position
+    at or before P.
+
+In case A, we know that all elements in (P,P') are mapped to positions that
+are > P. Therefore, breaking the island at position P by replacing P with
+null is allowed. So we do exactly that.
+
+In case B, we should move the element from position P' to position P and
+then apply the same deletion algorithm to position P'. This is OK because,
+if the element was discoverable at position P', it will remain discoverable
+at position P. Let's prove it. Say the element at P' hashes to position I.
+We know that I <= P due to condition B. Since the element was discoverable
+at P' before the transformation, every bucket in [I,P'] is non-null; in
+particular, because I <= P <= P', the range [I,P] contains only non-null
+elements. So, a search for that element will iterate starting from I until
+it finds the element at position P.
+
+Dealing with wraparound
+=======================
+
+Wraparound does not actually complicate things too much, as long as we
+guarantee that the number of items in the hash is less than 50% of the total
+hash capacity; to decide whether A is before or after B, we see whether there
+are more buckets in the range (A,B), or in the range (B,A). Wraparound
+guarantees that both ranges are well-defined.
+
+Once we get above 50% occupancy, it gets hard to figure out which position is
+"greater" than the other, as we can theoretically have an island that is more
+than half as long as the hash table. Based on the definition in the previous
+paragraph, the "head" of such an island would be "after" its tail. We could
+circumvent the problem by walking the entire island to see whether or not a
+null occurs between the head and an entry of interest, but this would incur
+O(n) overhead, and dramatically increase the number of latches held by any
+given thread.
+
+At any rate, hashtables with over 50% utilization are likely to have poor
+performance characteristics. Rather than risk poor CPU utilization, users
+of this hash size the bucket list conservatively, preventing it from having
+over 25% utilization. Compared to the memory overhead of pointers (and
+fragmentation) that would be required by a scheme that implemented a linked
+list per bucket, this overhead seems reasonable.
+
+In any case, the 50% utilization requirement allows us to define "after" as
+follows:
+
+  position Y is after X iff "(X - Y) mod N > N/2"
+
+where N is the number of hash buckets. We constrain N to be a power of two,
+giving us:
+
+  position Y is after X iff "(X - Y) bitwise_and (N-1) > N/2"
+
+Note that if (X-Y) is negative, the bitwise_and will truncate any leading 1
+bits so that the resulting expression is less than N. Assuming the difference
+between X and Y is less than N/2 (ie: the island that led to this range
+computation is less than N/2 buckets long), the properties of 2's complement
+arithmetic tell us that this number will be greater than N/2, and the
+expression will return true. If the difference is positive (and again, by
+assumption, less than N/2), the expression will correctly return false;
+iterating from Y to Y+1 and so on will reach X faster than iterating in the
+other direction. The hashtable used to use two expressions that special-cased
+the above reasoning, but they did not cover corner cases involving entries 0
+and maxbucketid. This led to silent hashtable corruption for over a year
+until it was eventually spotted. Modular arithmetic handles strange
+wraparound cases correctly and implicitly.
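+
+To make the discussion above concrete, here is a minimal, single-threaded
+sketch of the island operations and of the modular "after" test. This is an
+illustration only, not the code in this file: it ignores latching entirely,
+and all of the names (bucket_t, NUM_BUCKETS, ht_wrap, ht_hash, ht_is_after,
+ht_lookup, ht_insert, ht_delete) are hypothetical.
+
+  // Illustrative sketch: single-threaded, no latching, hypothetical names.
+  // NUM_BUCKETS must be a power of two, and the table must stay well under
+  // 50% full (this implementation keeps it under 25%).
+  typedef struct { long key; void *val; int in_use; } bucket_t;
+
+  #define NUM_BUCKETS 1024              // power of two
+  static bucket_t table[NUM_BUCKETS];
+
+  static long ht_wrap(long i) { return i & (NUM_BUCKETS - 1); }
+  static long ht_hash(long key) {
+    return (long)(((unsigned long)key * 2654435761UL) & (NUM_BUCKETS - 1));
+  }
+
+  // "Y is after X" in the ring sense defined above: (X - Y) mod N > N/2.
+  static int ht_is_after(long y, long x) {
+    return ((x - y) & (NUM_BUCKETS - 1)) > NUM_BUCKETS / 2;
+  }
+
+  // Lookup: probe forward from the hash bucket until we find the key or a
+  // null bucket (the end of the island).
+  static void *ht_lookup(long key) {
+    for (long i = ht_hash(key); table[i].in_use; i = ht_wrap(i + 1)) {
+      if (table[i].key == key) return table[i].val;
+    }
+    return 0;
+  }
+
+  // Insert: probe forward from the hash bucket; claim the first null bucket.
+  static void ht_insert(long key, void *val) {
+    long i = ht_hash(key);
+    while (table[i].in_use) i = ht_wrap(i + 1);
+    table[i].key = key; table[i].val = val; table[i].in_use = 1;
+  }
+
+  // Delete: the case A / case B algorithm described above.
+  static void ht_delete(long key) {
+    long p = ht_hash(key);
+    while (table[p].in_use && table[p].key != key) p = ht_wrap(p + 1);
+    if (!table[p].in_use) return;            // key not present
+    for (;;) {
+      long pp = ht_wrap(p + 1);
+      for (;;) {
+        if (!table[pp].in_use) {             // case A: break the island at p
+          table[p].in_use = 0;
+          return;
+        }
+        long home = ht_hash(table[pp].key);
+        if (!ht_is_after(home, p)) break;    // case B: pp hashes at or before p
+        pp = ht_wrap(pp + 1);
+      }
+      table[p] = table[pp];                  // move the entry at pp back to p,
+      p = pp;                                //  then delete it from pp instead
+    }
+  }
+
+The concurrent implementation in this file follows the same probe order, but
+interleaves the latch acquisitions described in the next section.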
+
+Lock crabbing and linearizability
+=================================
+
+According to Wikipedia, a history is linearizable if:
+
+ * its invocations and responses can be reordered to yield a sequential
+   history
+
+ * that sequential history is correct according to the sequential definition
+   of the object
+
+ * if a response preceded an invocation in the original history, it must
+   still precede it in the sequential reordering
+
+The first two bullets define serializability. The third strengthens the
+concept. Serializability allows the system to choose an ordering (the
+"commit order") for a set of requests. Linearizability restricts the choice
+of schedules to those that do not reorder non-overlapping requests from the
+callers' perspective. This is important if the callers communicate via
+external data structures; without the third requirement, apparent temporal
+paradoxes could arise.
+
+With this in mind, let us prove that the crabbing scheme is deadlock-free and
+linearizable.
+
+Lookup and insertion keep at most two hash table entries locked at a time;
+deletion temporarily grabs a third lock. Ignoring wraparound, this makes it
+easy to show liveness. The only way a deadlock could occur would be if there
+existed a cycle of processes, each waiting to grab a latch held by the next.
+Since each process grabs latches according to the bucket ordering, no process
+will ever hold a high-numbered latch while waiting for a lower-numbered
+latch. Therefore, the processes cannot possibly form a cycle, and no deadlock
+exists.
+
+Informally, linearizability is achieved by using a latch protocol that leads
+to an ordering of operations, and that ensures each operation sees all
+updates from operations performed before it and no updates from operations
+performed after it.
+
+Unfortunately, wraparound reintroduces the possibility of deadlock, and can
+create cycles in the "before"/"after" ordering that linearizability relies
+upon.
+
+To see why deadlock is not a problem in practice, note that, for deadlock to
+occur, we need to have threads obtain mutexes in a way that creates a cycle.
+Each thread holds at most 3 latches at a time, and no thread will ever block
+while holding a latch on a null bucket (making it difficult for the cycle to
+span multiple islands). Therefore, such cycles can be eliminated by limiting
+hashtable bucket occupancy. Similarly, the definitions of "before" and
+"after" (in the temporal, linearizability sense) are correct, up to cycles;
+the hash is linearizable so long as there is no cycle of concurrent requests
+in which each comes (by the above definition) "before" the next. It is
+likely that such a cycle could be broken arbitrarily in practice, as such
+requests would be, by definition, concurrent, and each operates on a single
+key-value pair.
+
+However, with this high-level intuition, we've gotten ahead of ourselves.
+Let us start by considering deletion in detail, as it is the most complex of
+the operations.
+
+Deletion starts by exclusively locking the element to be removed (P), and
+then potentially moves a second exclusive lock forward to P', starting with
+P+1. Each time this lock moves forward, a third lock is temporarily grabbed
+on entry P'+1, and then the lock on P' is released.
+
+Let's consider the deletion algorithm described above and reason about cases
+A and B:
+
+ A. We discover a null element (at position P'), or
+ B. We discover an element (at position P') whose key hashes to a position
+    at or before P (here "at or before" is meant in the modular sense defined
+    above).
+
+In both cases, we want to ensure that the operation we perform does not make
+any element in (P,P') undiscoverable. This is more complicated than in the
+single-threaded case, as we released locks on the elements we observed in
+this interval during crabbing.
+
+At this point in the discussion, we fall back on the definition of
+linearizability for guidance. From the point of view of any hashtable
+operation, we can partition all other operations as follows:
+they happen "before" or "after" this operation, or they touch a
+non-overlapping portion of the hashtable.
+
+Non-overlapping operations are trivially linearizable (read "somebody else's
+problem"), as linearizability across mutex acquisitions is guaranteed by
+pthreads.
+
+This leaves operations that touch the same hash buckets as our operation.
+Each operation maintains a number of cursors, and occurs in two phases.
+In phase 1, it simply crabs along from the bucket that the element of
+interest hashes to, looking for a null bucket, or the element. When our
+operation grabs a mutex in phase one, it is forced into being "before" any
+other operations that hold a bucket latch "before" our latch. (The
+terminology here is unfortunate.) Similarly, our operation is "after" any
+operations that hold a latch on a bucket "after" ours.
+
+(NOTE: These definitions do not lead to transitive "before" and "after"
+relationships. For the proof to hold, we would need to make use of the fact
+that dependency cycles cannot exist due to low occupancy, just as we do for
+deadlock freedom.)
+
+During phase 1, the latches that are held are adjacent (ie: P+1=P');
+therefore, no intervening thread can get a latch inside the range.
+Similarly, all operations obtain latches by crabbing, making it impossible
+for our definition to say that some operation is both "before" and "after"
+this operation.
+
+Phase 1 is read-only, so it has no side effects for the "before" and "after"
+ordering to account for.
+
+Phase 2 can be broken into three sub-cases. The first two, lookup and
+insertion, are trivial. Phase 1 positioned the cursor at the correct bucket
+(either the bucket containing the value, or the first null bucket at or
+after the bucket the key hashes to, respectively). Showing that lookups and
+insertions are always linearizable reduces to applying the definition of
+"before" and "after", and observing that:
+
+ (1) operations blocked on this operation must be "after" it, and must
+     reflect any update we performed, and
+
+ (2) operations this operation blocked on must be "before" it, and cannot
+     observe any modifications we made to the hashtable.
+
+We now move on to deletion, which is (as always) more complicated. Instead
+of making use of simple crabbing, deletion leaves an "anchor" mutex at the
+location of the value to be deleted, then creates a crabbing pair of mutexes
+that walk along the island, looking for something to stick in the empty
+bucket at the anchor. This allows concurrent threads to place cursors
+between the anchor and the crabbing pair. It turns out that such threads are
+"after" the deletion operation. They do not notice any modifications made by
+deletion (deletion would have moved the anchor if it had modified anything
+between the two cursors), and deletion does not observe any empty buckets
+that future operations could have created, since it never relatches buckets
+between the anchor and the crabbing pair.
+This property is not important from an application point of view, but it
+does form the basis of our reasoning about the correctness of concurrent
+deletion:
+
+Case I: the deletion is not nested inside another deletion:
+
+Because deletion keeps an exclusive lock on P, the only way for another
+thread to get into (P,P') is to operate on an entry that hashes between P
+and P', as it could not arrive inside this interval by skipping over a
+locked P.
+
+In case A (deletion encountered a null) it breaks the island by replacing
+element P with null. Recall that, in the sequential case, this is OK as long
+as (P,P') does not contain entries mapped to indexes before P. It did not
+when we walked this range in the deletion algorithm, and no such entries
+could have been inserted since, because P has been continuously latched. So
+it is safe to break the island in this case.
+
+In case B, we are moving an element from P' to P. This will cause trouble
+only if there is a search (or phase 1) operation in progress in (P,P')
+looking for the element at P'. Since the element at P' hashes at or before
+position P, and P is exclusively locked, the search operation must have been
+scheduled before the deletion operation. By linearizability, the deletion
+operation cannot discover P' ahead of the search (the crabbing pair of the
+search will not let the deletion's crabbing pair pass it). So we can be
+certain that, in case B (the element at P' hashes before or at P), there are
+no ongoing searches for that element in (P,P'), and we can therefore safely
+move it over to P.
+
+Case II: the deletion is nested inside another deletion:
+
+The concern here is that we may break an invariant by breaking an island in
+two (case A), or by moving some values around (case B).
+
+A (Inserting a null): Had that null already existed, the outer deletion
+would have terminated upon reaching it. This is OK because, in order to
+create a null, the inner traversal must first inspect the remainder of the
+island. Since the null is between the outer deletion's anchor and its crabs,
+the portion of the island that comes after the null does not contain any
+value that hashes at or before the null. The outer deletion's anchor, P, is
+before P'', the position of the new null. Since P < P'', and all values
+after P'' in the island belong after P'', such values must not belong before
+or at P; the outer delete therefore would not have moved them whether or not
+it saw the null, and either way it eventually reaches a null and safely
+breaks the island at P (case A).
+
+B (Moving a value): When we move a value from one part of the island to
+another, it remains discoverable, exactly as in the sequential case. This
+case eventually reduces to case A.
+
+Liveness, revisited
+-------------------
+
+It is easy to prove that, even though we cannot statically rank all the
+locks covering the hash table, deadlock is still impossible. For deadlock to
+occur, we would need a "ring" covering the entire hash table, consisting of
+islands connected by in-flight crabbing operations. However, no hashtable
+operation blocks on further latch acquisitions once it encounters a null
+bucket. Such a bucket exists at the end of each island, preventing any latch
+chain from spanning multiple islands.
+
+There could be more involved cases of deadlock involving application code
+that holds hashtable locks and then takes locks on some external data
+structures, or attempts to latch a second hashtable bucket. These deadlocks
+are inherent to unsafe usage of the hashtable API, and not to the underlying
+implementation. Most should be detectable by ranking all locks and assigning
+the same rank to all hashtable locks.
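+
+The following sketch illustrates the phase-1 crabbing described above, with
+one pthread mutex per bucket. As before, this is an illustration rather than
+the code in this file, and the names (lbucket_t, L_BUCKETS, ltable, lwrap,
+lhash, phase1_crab) are hypothetical. Note that at most two latches are held
+at any time, that they are acquired in probe order, and that the walk never
+blocks on a new latch while holding a latch on a null bucket; these are the
+properties the liveness argument relies on.
+
+  #include <pthread.h>
+
+  #define L_BUCKETS 1024                 // power of two, kept well under 25% full
+
+  typedef struct {
+    long key; void *val; int in_use;
+    pthread_mutex_t latch;               // one latch per bucket; initialize
+  } lbucket_t;                           //  each with pthread_mutex_init()
+
+  static lbucket_t ltable[L_BUCKETS];
+
+  static long lwrap(long i) { return i & (L_BUCKETS - 1); }
+  static long lhash(long key) {
+    return (long)(((unsigned long)key * 2654435761UL) & (L_BUCKETS - 1));
+  }
+
+  // Phase 1: returns with exactly one latch held, on either the bucket that
+  // contains key, or the first null bucket of its island. The caller then
+  // performs (or cancels) phase 2 and releases that latch.
+  static lbucket_t *phase1_crab(long key) {
+    long i = lhash(key);
+    pthread_mutex_lock(&ltable[i].latch);
+    while (ltable[i].in_use && ltable[i].key != key) {
+      long next = lwrap(i + 1);
+      pthread_mutex_lock(&ltable[next].latch);  // grab the next bucket first,
+      pthread_mutex_unlock(&ltable[i].latch);   // then release the current one
+      i = next;
+    }
+    return &ltable[i];
+  }
+
+In the real API, a hashtable_bucket_handle_t plays the role of the latched
+bucket returned here; hashtable_unlock() (declared in concurrentHash.h)
+releases it.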
+
+Conclusion
+==========
+
+As far as we can tell, the hash table implementation is correct. It is
+conservative because it caps the utilization of the hash at 25% instead of
+50% minus one element. This is OK, as the extra space is relatively cheap
+and it decreases the average length of hash collision chains.
+
+History:
+========
+
+ Created on: Oct 15, 2009
+     Author: sears
+
+-r1275 09 Nov 2009 Finalized API
+-r1410 16 Sep 2010 Discovered need for three mutexes during deletion crabbing.
+-r1429 30 Sep 2010 Added fsck logic. (To no avail)
+-r1475 14 Feb 2011 Slava found the mod bug, and wrote version 1 of the extensive
+ documentation above. I expanded it into v2, and committed it.
  */
 #define _XOPEN_SOURCE 600
 #include
@@ -229,9 +577,11 @@ void hashtable_end_op(hashtable_mode mode, hashtable_t *ht, void *val, hashtable
       pageid_t newidx = hashtable_func(ht, b1->key);
-      // Subcase 1: newidx is higher than idx2, so newidx should stay where it is.
-      // Subcase 2: newidx wrapped, so it is less than idx2, but more than half way around the ring.
-      if(idx2 < newidx || (idx2 > newidx + (ht->maxbucketid/2))) {
+      // If newidx is past idx2, lookup will never find b1->key in position
+      // idx2. Taking wraparound into account, and noticing that we never
+      // have more than maxbucketid/4 elements in the hash table, the
+      // following expression detects whether newidx is past idx2:
+      if(((idx2 - newidx) & ht->maxbucketid) > ht->maxbucketid/2) {
         // skip this b1.
         // printf("s\n"); fflush(0);
         idx = hashtable_wrap(ht, idx+1);
diff --git a/stasis/concurrentHash.h b/stasis/concurrentHash.h
index 49db084..a14d19e 100644
--- a/stasis/concurrentHash.h
+++ b/stasis/concurrentHash.h
@@ -1,6 +1,24 @@
-/*
+/**
  * concurrentHash.h
  *
+ * @file A concurrent, fixed-size hashtable that allows users to obtain latches
+ * on its keys.
+ *
+ * Operations against this hashtable proceed in two phases. In the first phase,
+ * the bucket that contains (or will contain) the requested key is located. At
+ * this point, the implementation optionally returns control to the caller,
+ * which may examine the bucket, and decide to complete or cancel the operation.
+ *
+ * Of course, like any other mutex, bucket latches allow you to write code that
+ * will deadlock. Initiating an operation against a hashtable while holding a
+ * latch on one of its buckets is unsafe, and will lead to deadlocks and other
+ * bad behavior.
+ *
+ * Notes:
+ *
+ * It would be trivial to implement an insert_begin, _finish, and _remove, but
+ * the need for such things has never come up. (See hashtable_test_and_set instead)
+ *
  *  Created on: Oct 15, 2009
  *      Author: sears
  */
@@ -36,11 +54,13 @@ void * hashtable_remove_begin(hashtable_t *ht, pageid_t p, hashtable_bucket_hand
 void hashtable_remove_finish(hashtable_t *ht, hashtable_bucket_handle_t *h);
 void hashtable_remove_cancel(hashtable_t *ht, hashtable_bucket_handle_t *h);
 
+/** Be sure to call this immediately after calling any method whose name ends in "_lock()". */
+void hashtable_unlock(hashtable_bucket_handle_t *h);
+
 /**
  * @return 0 if key not found, 1 if the key exists, >1 if the hashtable is corrupt, and the key appears multiple times.
  */
 int hashtable_debug_number_of_key_copies(hashtable_t *ht, pageid_t pageied);
-void hashtable_unlock(hashtable_bucket_handle_t *h);
 
 #endif /* CONCURRENTHASH_H_ */
diff --git a/test/stasis/check_concurrentHash.c b/test/stasis/check_concurrentHash.c
index 35d4e02..031a20f 100644
--- a/test/stasis/check_concurrentHash.c
+++ b/test/stasis/check_concurrentHash.c
@@ -58,13 +58,11 @@ terms specified in this license.
 #include
 #include
-#ifdef DBUG_TEST
-extern int dbug_choice(int);
-#endif
-
 #define LOG_NAME "check_lhtable.log"
+
 #ifdef DBUG_TEST
+extern int dbug_choice(int);
 #define NUM_OPS 4
 #define NUM_ENTRIES 4
 #define NUM_THREADS 2
@@ -73,28 +71,16 @@ extern int dbug_choice(int);
 #define myrandom(x) dbug_choice(x)
 #else
 #define NUM_OPS 10000000
-#define NUM_ENTRIES 10000
+#define NUM_ENTRIES 8192
 #define NUM_THREADS 100
 #define THREAD_ENTRIES ((NUM_ENTRIES/NUM_THREADS)-1)
 #endif
+
 hashtable_t * ht;
 
 void * worker(void * arg) {
-  int stride = *(int*) arg;
-
-  pageid_t *data = malloc(sizeof(pageid_t) * THREAD_ENTRIES);
-
-#ifdef DBUG_TEST
-  for(int i = 1; i <= THREAD_ENTRIES; i++) {
-    data[i-1] = -1 * (stride + (i * HT_ENTRIES));
-  }
-#else
-  for(int i = 1; i <= THREAD_ENTRIES; i++) {
-    data[i-1] = -1 * (stride + (i * NUM_THREADS));
-  }
-#endif
-  for(int j = 0; j < NUM_OPS/*/ NUM_THREADS*/; j++) {
-
+  pageid_t *data = (pageid_t *)arg;
+  for(int j = 0; j < NUM_OPS/ NUM_THREADS; j++) {
     int op = myrandom(2);
     int i = myrandom(THREAD_ENTRIES);
@@ -134,26 +120,53 @@ void * worker(void * arg) {
 START_TEST(singleThreadHashTest) {
 #ifdef DBUG_TEST
-  ht = hashtable_init((pageid_t)HT_ENTRIES);
+  ht = hashtable_init(HT_ENTRIES);
 #else
   ht = hashtable_init((pageid_t)((double)THREAD_ENTRIES * 1.1));
 #endif
-  int i = 0;
-  worker(&i);
+  pageid_t *data = malloc(sizeof(pageid_t) * THREAD_ENTRIES);
+
+  for(int i = 1; i <= THREAD_ENTRIES; i++) {
+    data[i-1] = -1 * (i * NUM_THREADS);
+  }
+  worker(data);
+  hashtable_deinit(ht);
+} END_TEST
+
+START_TEST(wraparoundHashTest) {
+  unsigned numEntries = NUM_OPS/ NUM_THREADS * 4 + 3;
+  unsigned power = 1;
+  while ( (1ull << power ) < numEntries ) {
+    ++power;
+  }
+#ifdef DBUG_TEST
+  ht = hashtable_init(HT_ENTRIES);
+#else
+  ht = hashtable_init(numEntries);
+#endif
+  pageid_t *data = malloc(sizeof(pageid_t) * THREAD_ENTRIES);
+
+  for(int i = 1; i <= THREAD_ENTRIES; i++) {
+    data[i-1] = -1 * (((i << power) - 6 + myrandom(13)) / 13);
+  }
+  worker(data);
   hashtable_deinit(ht);
 } END_TEST
 
 START_TEST(concurrentHashTest) {
 #ifdef DBUG_TEST
-  ht = hashtable_init((pageid_t)HT_ENTRIES);
+  ht = hashtable_init(HT_ENTRIES);
 #else
   ht = hashtable_init((pageid_t)((double)NUM_ENTRIES * 1.1));
 #endif
   pthread_t workers[NUM_THREADS];
   for(int i = 0 ; i < NUM_THREADS; i++) {
-    int * ip = malloc(sizeof(int));
-    *ip = i;
-    pthread_create(&workers[i], 0, worker, ip);
+    pageid_t *data = malloc(sizeof(pageid_t) * THREAD_ENTRIES);
+
+    for(int j = 1; j <= THREAD_ENTRIES; j++) {
+      data[j-1] = -1 * (i + (j * NUM_THREADS));
+    }
+    pthread_create(&workers[i], 0, worker, data);
   }
   for(int i = 0 ; i < NUM_THREADS; i++) {
     pthread_join(workers[i],0);
@@ -173,8 +186,10 @@ Suite * check_suite(void) {
   /* Sub tests are added, one per line, here */
   tcase_add_test(tc, singleThreadHashTest);
 #ifndef DBUG_TEST // TODO should run exactly one of these two tests under dbug. Need good way to choose which one.
+  tcase_add_test(tc, wraparoundHashTest);
   tcase_add_test(tc, concurrentHashTest);
 #endif
+
   /* --------------------------------------------- */
 
   tcase_add_checked_fixture(tc, setup, teardown);