From fb5791e7e47e179a4de5dd2bfcfac8edc3132fbf Mon Sep 17 00:00:00 2001
From: Sears Russell
Date: Tue, 15 Feb 2011 02:57:19 +0000
Subject: [PATCH] one line fix to concurrentHashTable, plus documentation and
 improved unit tests.

---
 src/stasis/concurrentHash.c        | 366 ++++++++++++++++++++++++++++-
 stasis/concurrentHash.h            |  24 +-
 test/stasis/check_concurrentHash.c |  69 +++---
 3 files changed, 422 insertions(+), 37 deletions(-)

diff --git a/src/stasis/concurrentHash.c b/src/stasis/concurrentHash.c
index 3c83997..e90ec87 100644
--- a/src/stasis/concurrentHash.c
+++ b/src/stasis/concurrentHash.c
@@ -1,8 +1,356 @@
-/*
- * concurrenthash.c
- *
- * Created on: Oct 15, 2009
- * Author: sears
+/**
+  concurrenthash.c
+
+  @file implementation of a concurrent, fixed-size hashtable

============================
Discussion of implementation
============================

Before reading the rest of this (giant) comment, you probably should read the
implementation, as it is shorter, and the text below assumes an understanding
of the API and of the general idea behind the algorithm. If you just want to
use the hashtable (or haven't yet looked at it), you should read
concurrentHash.h for a discussion of this data structure's API and its
non-standard concurrency primitives.

This concurrent hash table implementation completely avoids the need for
global latches, but is incapable of resizing itself at runtime. It is based
upon three ideas:

 - Islands, which we use to implement the NEAR primitive from navigational
   databases of lore.

 - Modular arithmetic and two's complement, which we use to precisely define
   the concepts of "before" and "after" that the island implementation relies
   upon.

 - Lock (latch) crabbing and linearizability, which form the basis of the
   argument that the concurrency control protocol is both live and safe.

Implementing NEAR with islands
==============================

Ignoring wrapping and locking for a moment, inserting, looking up and
removing an element from the hash table can be summarized as follows.
Let us call each sequence of non-null elements in the hash table an
"island". When looking up an entry in the hashtable, we start at the element
the key hashes to. If this element matches, we return it. If it is null,
then we return null. Otherwise, we repeat the process at the next entry. It
is easy to see that we will find an element that hashes to position N iff it
is in the same island as N, and in a position that is >= N.

This trick is extremely old and general-purpose; navigational databases
actually implemented the primitives described here, and allowed applications
to request that some key K be placed NEAR some other key K', then look up K
using the key K'.

Insertion in this scheme is straightforward. We simply start at entry N,
then move to N+1, N+2 and so on until we find a null entry. We store the
element in this entry and return. Clearly, the entry is in the same island
as N, and is in a position >= N.

Removing elements is more complicated. When removing an element from an
island, we will either replace it with a null, or replace it with a copy of
an element that resides in a different bucket, and then delete the entry from
the second bucket.

Replacing an element with null breaks an island. Say we are removing C:

null,A,B,C,D,E,F,null -> null,A,B,null,D,E,F,null

This is OK to do as long as D, E and F do not hash to the positions of A, B
or C. Otherwise, they will no longer be discoverable.
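To make the island mechanics concrete, here is a minimal, single-threaded
sketch of the lookup and insertion paths described above. It is an
illustration only: the toy_ names, types and layout are invented for this
example and are not the hashtable's actual internals.

  // Illustrative sketch of lookup and insertion within an island.
  #include <stddef.h>
  #include <stdint.h>

  typedef struct {
    uint64_t key;
    void    *val;
    int      used;   // 0 marks a null bucket (the end of an island)
  } toy_bucket_t;

  // nbuckets must be a power of two.
  static uint64_t toy_hash(uint64_t key, uint64_t nbuckets) {
    return key & (nbuckets - 1);
  }

  // Walk the island that starts at the key's hash position.
  static void *toy_lookup(toy_bucket_t *b, uint64_t nbuckets, uint64_t key) {
    uint64_t i = toy_hash(key, nbuckets);
    while (b[i].used) {                       // stop at the null ending the island
      if (b[i].key == key) return b[i].val;
      i = (i + 1) & (nbuckets - 1);
    }
    return NULL;
  }

  // Claim the first null bucket at or after the key's hash position.
  static void toy_insert(toy_bucket_t *b, uint64_t nbuckets, uint64_t key, void *val) {
    uint64_t i = toy_hash(key, nbuckets);
    while (b[i].used) {
      i = (i + 1) & (nbuckets - 1);
    }
    b[i].key  = key;
    b[i].val  = val;
    b[i].used = 1;
  }

As in the real hashtable, the sketch assumes the table never fills up; the
occupancy cap discussed below keeps the probe loops short and guarantees that
they terminate.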
The alternative to breaking the island is to replace C with some other
element from the same island, say E, and then apply the same algorithm to
delete E:

null,A,B,C,D,E,F,null -> null,A,B,E,D,?,F,null -> ...

This is OK to do as long as E hashes to the position of A, B or C.

These two observations lead to the following algorithm. Say we need to
remove an element at position P:

Keep checking the elements at positions P+1, P+2 and so on. Call the position
at which we stop P'. We stop when either:
  A. We discover a null element at P', or
  B. We discover at P' an element that hashes to a position <= P.

In case A, we know that all elements in (P,P') are mapped to positions
that are > P. Therefore, breaking the island at position P by replacing
P with null is allowed. So we do exactly that.

In case B, we should move the element from position P' to position P and
then apply the same deletion algorithm to position P'. This is OK
because if the element was discoverable at position P', it will remain
discoverable at position P. Let's prove it. Say the element at P' hashes to
position I. We know that I <= P due to condition B. Since the element at P'
was discoverable before the transformation, and because P' >= P, we know that
[I,P] contains only non-null elements. So, the search for that element will
iterate starting from I until it finds the element at position P.

Dealing with wraparound
=======================

Wraparound does not actually complicate things much, as long as we guarantee
that the number of items in the hash is less than 50% of the total hash
capacity; to decide whether A is before or after B, we see whether there are
more buckets in the range (A,B), or in the range (B,A). Wraparound guarantees
that both ranges are well-defined.

Once we get above 50% occupancy, it gets hard to figure out which position is
"greater" than the other one, as we can theoretically have an island that is
at least half as long as the hash table. Based on the definition in the
previous paragraph, the "head" of such an island would be "after" its tail.
We could circumvent the problem by walking the entire island to see whether
or not a null occurs between the head and an entry of interest, but this
would incur O(n) overhead, and dramatically increase the number of latches
held by any given thread.

At any rate, hashtables with over 50% utilization are likely to have poor
performance characteristics. Rather than risk poor CPU utilization, users of
this hash table size the bucket list conservatively, preventing it from
having over 25% utilization. Compared to the memory overhead of pointers (and
fragmentation) that would be required by a scheme that implemented a linked
list per bucket, this overhead seems reasonable.

In any case, the 50% utilization requirement allows us to define "after" as
follows:

  position Y is after X iff "(X - Y) mod N > N/2"

where N is the number of hash buckets. We constrain N to be a power of
two, giving us:

  position Y is after X iff "(X - Y) bitwise_and (N-1) > N/2"

Note that if (X-Y) is negative, the bitwise_and will truncate any leading 1
bits so that the resulting expression is less than N. Assuming the difference
between X and Y is less than N/2 (ie: the island that led to this range
computation is less than N/2 buckets long), the properties of two's
complement arithmetic tell us that this number will be greater than N/2, and
the expression will return true.
If the difference is positive (and again, by +assumption, less than N/2), the expression will correctly return false; +iterating from Y to Y+1 and so on will reach X faster than iterating in the +other direction. The hashtable used to use two expressions that special-cased +the above reasoning, but they did not cover corner cases involving entries 0 +and maxbucketid. This led to silent hashtable corruption for over a year until +it was eventually spotted. Modular arithmetic handles strange wraparound cases +correctly and implicitly. + +Lock crabbing and linearizability +================================= + +According to wikipedia, a history is linearizable if: + + * its invocations and responses can be reordered to yield a sequential history + + * that sequential history is correct according to the sequential definition of + the object + + * if a response preceded an invocation in the original history, it must still + precede it in the sequential reordering + +The first two bullets define serializability. The third strengthens the +concept. Serializability allows the system to choose an ordering (the +"commit order") for a set of requests. Linearizability restricts the choice +of schedules to those that do not involve reordering of concurrent requests +from the callers' perspective. This is important if the callers communicate +via external data structures; without the third requirement, apparent temporal +paradoxes could arise. + +With this in mind, let us prove that the crabbing scheme is deadlock-free and +linearizable. + +Lookup and insertion keep at most two hash table entries locked at a time; +deletion temporarily grabs a third lock. Ignoring wraparound, this makes it +easy to show liveness. The only way a deadlock could occur would be if there +existed a cycle of processes, each waiting to grab a latch held by the next. +Since each process grabs latches according to the bucket ordering, no process +will ever hold a high-numbered latch while waiting for a lower-numbered latch. +Therefore, the processes cannot possibly form a cycle, and no deadlock exists. + +Informally, linearizability is achieved by using a latch protocol that leads +to an ordering of operations, and that ensures each operation sees all updates +from operations performed before it and no updates from operations performed +after it. + +Unfortunately, wraparound introduces the possibility of deadlock and leads to +cycles of non-linearizable operations. + +To see why deadlock is not a problem in practice, note that, for deadlock to +occur, we need to have threads obtain mutexes in a way that creates a cycle. +Each thread holds at most 3 latches at a time, and no thread will ever block +while holding a latch on a null bucket (making it difficult for the cycle to +span multiple islands). Therefore, such cycles can be eliminated by limiting +hashtable bucket occupancy. Similarly, the definitions of "before" and +"after" (in the temporal, linearizability sense) are correct, up to cycles; the +hash is linearizable so long as a cycle of concurrent requests do not all come +(by the above definition) "before" each other. It is likely that such a cycle +could be broken arbitrarily in practice, as such requests would be, by +definition, concurrent, and each operates on a single key-value pair. + +However, with this high-level intuition, we've gotten ahead of ourselves. Let +us start by considering deletion in detail, as it is the most complex of the +operations. 
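Before doing that, the wraparound test defined in the previous section can be
checked with a small, self-contained program. This is an illustration only:
the toy_ names are invented, but the comparison mirrors the expression used
in the one-line fix near the bottom of this patch.

  // Standalone check of the "is Y after X?" test.
  #include <assert.h>
  #include <stdint.h>

  // nbuckets must be a power of two, and islands must stay shorter than
  // nbuckets/2 for the answer to be meaningful.
  static int toy_is_after(uint64_t x, uint64_t y, uint64_t nbuckets) {
    return ((x - y) & (nbuckets - 1)) > nbuckets / 2;
  }

  int main(void) {
    assert( toy_is_after( 2,  5, 16));  // 5 is after 2
    assert(!toy_is_after( 5,  2, 16));  // 2 is not after 5
    assert( toy_is_after(15,  1, 16));  // wraparound: 1 is "after" 15
    assert(!toy_is_after( 1, 15, 16));  // ... and 15 is not after 1
    return 0;
  }

Because x and y are unsigned, (x - y) wraps modulo 2^64, and masking with
nbuckets-1 reduces it modulo the table size, which is exactly the modular
definition of "after" given above.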
Deletion starts by exclusively locking the element to be removed (P). It then
potentially moves a second exclusive lock to P', starting with P+1 and moving
forward. Each time it moves forward, a third lock is temporarily grabbed on
entry P'+1, and then the lock on P' is released.

Let's consider the deletion algorithm described above and reason about cases
A and B:

  A. We discover a null element at P', or
  B. We discover at P' an element that hashes to a position <= P
     (here "<=" means "before or equal to").

In both cases, we want to ensure that the operation we perform does not make
any element in (P,P') undiscoverable. This is more complicated than in the
single-threaded case, as we released the locks on the elements we observed in
this interval during crabbing.

At this point in the discussion, we fall back on the definition of
linearizability for guidance. From the point of view of any hashtable
operation, we can partition all other operations as follows:
they happen "before" or "after" this operation, or they touch a
non-overlapping portion of the hashtable.

Non-overlapping operations are trivially linearizable (read "somebody else's
problem"), as linearizability across mutex acquisitions is guaranteed by
pthreads.

This leaves operations that touch the same hash buckets as our operation.
Each operation maintains a number of cursors, and occurs in two phases.
In phase 1, it simply crabs along from the bucket that the element of interest
hashes to, looking for a null bucket, or the element. When our operation grabs
a mutex in phase one, it is forced into being "before" any other operation
that holds a bucket latch "before" our latch. (The terminology here is
unfortunate.) Similarly, our operation is "after" any operation that holds a
latch on a bucket "after" ours.

(NOTE: These definitions do not lead to transitive "before" and "after"
relationships. For the proof to hold, we would need to make use of the fact
that dependency cycles cannot exist due to low occupancy, just as we do for
deadlock freedom.)

During phase 1, the latches that are held are adjacent (ie: P+1=P');
therefore, no intervening thread can get a latch inside of the range.
Similarly, all operations obtain latches by crabbing, making it impossible
for our definition to say that some operation is both "before" and "after"
this operation.

Phase 1 is read only, so the "before" and "after" definition trivially
corresponds to phase 1's (non-existent) side effects.

Phase 2 can be broken into three sub-cases. The first two, lookup and
insertion, are trivial. Phase 1 positioned the cursor at the correct bucket
(either the bucket containing the value, or the first null bucket after the
value, respectively). Showing that lookups and insertions are always
linearizable reduces to applying the definition of "before" and "after", and
observing that:

 (1) operations blocked on this operation must be "after" it, and must
     reflect any update we performed, and

 (2) operations that this operation blocked on must be "before" it, and
     cannot observe any modifications we made to the hashtable.

We now move on to deletion, which is (as always) more complicated. Instead of
making use of simple crabbing, deletion leaves an "anchor" mutex at the
location of the value to be deleted, then creates a crabbing pair of mutexes
that walk along the island, looking for something to stick in the empty
bucket at the anchor. This allows concurrent threads to place cursors between
the anchor and the crabbing pair.
It turns out that such threads are "after" the deletion operation. They do
not notice any modifications made by the deletion (deletion would have moved
the anchor if it had modified anything between the two cursors), and deletion
does not observe any empty buckets that future operations could have created,
since it never relatches buckets between the anchor and the crabbing pair.
This property is not important from an application point of view, but it does
form the basis of our reasoning about the correctness of concurrent deletion:

Case I: the deletion is not nested inside another deletion:

Because deletion keeps an exclusive lock on P, the only way for another
thread to get into (P,P') is to operate on an element that hashes between P
and P', as it could not arrive inside this interval by skipping over a locked
P.

In case A (deletion encountered a null), it breaks the island by replacing
element P with null. Recall that, in the sequential case, this is OK as long
as (P,P') does not contain entries mapped to indexes before P. It didn't when
we scanned the interval in the deletion algorithm, and no such entries could
have been inserted since then, because P has been continuously latched. So it
is safe to break the island in this case.

In case B, we are moving an element from P' to P. This will cause trouble
only if there is a search (or phase 1) operation in progress in (P, P')
looking for the element at P'. Since the element at P' hashes at or before
position P, and P is exclusively locked, the search operation must be
scheduled before the deletion operation. By linearizability, the deletion
operation cannot discover P' ahead of the search (the crabbing cursor of the
search will not let the deletion operation's lookup pass it). So we can be
certain that in case B (the element at P' hashes before or at P) there are no
ongoing searches for the element at P' in (P,P'), and we can therefore safely
move it over to P.

Case II: the deletion is nested inside another deletion:

The concern here is that we may break an invariant by breaking an island in
two (case A), or by moving some values around (case B).

A (Inserting a null): Had that null existed, the outer deletion would have
terminated upon reaching the null. This is OK because, in order to create a
null, the inner traversal must first inspect the remainder of the island.
Call the position of the new null P''. Since the null is between the outer
deletion's anchor and its crabbing pair, the portion of the island that comes
after the null does not contain any value that hashes before the null. The
outer deletion's anchor, P, is before P''. Since P < P'', and all values
after P'' in the island belong after P'', such values must not belong before
or at P, and therefore would not have been moved by the outer delete, had it
created the null.

B (Moving a value): When we move a value from one part of the island to
another, it remains discoverable, exactly as in the sequential case. This
case eventually will reduce to case A.

Liveness, revisited
-------------------

It is easy to prove that, even though we cannot statically rank all of the
locks covering the hash table, deadlock is still impossible. For deadlock to
occur, we would need a "ring" covering the entire hash table, composed of
islands connected by in-flight crabbing operations. But no hashtable
operation blocks on further latch acquisitions once it encounters an existing
null bucket. Such a bucket exists at the end of each island, preventing any
latch chain from spanning multiple islands.
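The crabbing pattern that this argument relies on can be sketched as follows.
This is a schematic phase-1 lookup only, with invented toy_ names; the real
implementation's latch and handle bookkeeping is richer, but the latch
discipline is the point: at most two adjacent bucket latches are held, and
the walk stops acquiring latches as soon as it reaches a null bucket, so no
latch chain crosses an island.

  // Schematic of phase-1 latch crabbing (illustrative names and layout).
  #include <pthread.h>
  #include <stddef.h>
  #include <stdint.h>

  typedef struct {
    pthread_mutex_t mut;
    uint64_t key;
    void    *val;
    int      used;   // 0 marks a null bucket
  } toy_latched_bucket_t;

  static void *toy_crabbing_lookup(toy_latched_bucket_t *b, uint64_t nbuckets,
                                   uint64_t key) {
    uint64_t i = key & (nbuckets - 1);        // nbuckets is a power of two
    pthread_mutex_lock(&b[i].mut);
    for (;;) {
      if (!b[i].used) {                       // null bucket: island ends here
        pthread_mutex_unlock(&b[i].mut);
        return NULL;
      }
      if (b[i].key == key) {
        void *v = b[i].val;
        pthread_mutex_unlock(&b[i].mut);
        return v;
      }
      uint64_t next = (i + 1) & (nbuckets - 1);
      pthread_mutex_lock(&b[next].mut);       // crab: take the next latch ...
      pthread_mutex_unlock(&b[i].mut);        // ... then release the previous one
      i = next;
    }
  }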
+ +There could be more involved cases of deadlock involving application code that +holds hashtable locks and then locks over some external data structures, or +attempts to latch a second hashtable bucket. These deadlocks are inherent to +unsafe usages of the hashtable API, and not the underlying implementation. +Most should be detectable by ranking all locks and assigning the same rank to +all hashtable locks. + +Conclusion +========== + +As far as we can tell, the hash table implementation is correct. It is +conservative because it caps the utilization of hash at 25% instead of +50% minus one element. This is OK as it is relatively cheap and +decreases the average size of hash collision chains. + +History: +======== + + Created on: Oct 15, 2009 + Author: sears + +-r1275 09 Nov 2009 Finalized API +-r1410 16 Sep 2010 Discovered need for three mutexes during deletion crabbing. +-r1429 30 Sep 2010 Added fsck logic. (To no avail) +-r1475 14 Feb 2011 Slava found the mod bug, and wrote version 1 of the extensive + documentation above. I expanded it into v2, and committed it. */ #define _XOPEN_SOURCE 600 #include @@ -229,9 +577,11 @@ void hashtable_end_op(hashtable_mode mode, hashtable_t *ht, void *val, hashtable pageid_t newidx = hashtable_func(ht, b1->key); - // Subcase 1: newidx is higher than idx2, so newidx should stay where it is. - // Subcase 2: newidx wrapped, so it is less than idx2, but more than half way around the ring. - if(idx2 < newidx || (idx2 > newidx + (ht->maxbucketid/2))) { + // If newidx is past idx2, lookup will never find b1->key in position + // idx2. Taking wraparound into account, and noticing that we never + // have more than maxbucketid/4 elements in hash table, the following + // expression detects if newidx is past idx2: + if(((idx2 - newidx) & ht->maxbucketid) > ht->maxbucketid/2) { // skip this b1. // printf("s\n"); fflush(0); idx = hashtable_wrap(ht, idx+1); diff --git a/stasis/concurrentHash.h b/stasis/concurrentHash.h index 49db084..a14d19e 100644 --- a/stasis/concurrentHash.h +++ b/stasis/concurrentHash.h @@ -1,6 +1,24 @@ -/* +/** * concurrentHash.h * + * @file A concurrent, fixed-size hashtable that allows users to obtain latches + * on its keys. + * + * Operations against this hashtable proceed in two phases. In the first phase, + * the bucket that contains (or will contain) the requested key is located. At + * this point, the implementation optionally returns control to the caller, + * which may examine the bucket, and decide to complete or cancel the operation. + * + * Of course, like any other mutex, bucket latches allow you to write code that + * will deadlock. Initiating an operation against a hashtable while holding a + * latch on one of its buckets is unsafe, and will lead to deadlocks and other + * bad behavior. + * + * Notes: + * + * It would be trivial to implement an insert_begin, _finish, and _remove, but + * the need for such things has never come up. 
(See hashtable_test_and_set instead) + * * Created on: Oct 15, 2009 * Author: sears */ @@ -36,11 +54,13 @@ void * hashtable_remove_begin(hashtable_t *ht, pageid_t p, hashtable_bucket_hand void hashtable_remove_finish(hashtable_t *ht, hashtable_bucket_handle_t *h); void hashtable_remove_cancel(hashtable_t *ht, hashtable_bucket_handle_t *h); +/** Be sure to call this immediately after calling an methods whose names end in "_lock()" */ +void hashtable_unlock(hashtable_bucket_handle_t *h); + /** * @return -0 if key not found, 1 if the key exists, >1 if the hashtable is corrupt, and the key appears multiple times.. */ int hashtable_debug_number_of_key_copies(hashtable_t *ht, pageid_t pageied); -void hashtable_unlock(hashtable_bucket_handle_t *h); #endif /* CONCURRENTHASH_H_ */ diff --git a/test/stasis/check_concurrentHash.c b/test/stasis/check_concurrentHash.c index 35d4e02..031a20f 100644 --- a/test/stasis/check_concurrentHash.c +++ b/test/stasis/check_concurrentHash.c @@ -58,13 +58,11 @@ terms specified in this license. #include #include -#ifdef DBUG_TEST -extern int dbug_choice(int); -#endif - #define LOG_NAME "check_lhtable.log" + #ifdef DBUG_TEST +extern int dbug_choice(int); #define NUM_OPS 4 #define NUM_ENTRIES 4 #define NUM_THREADS 2 @@ -73,28 +71,16 @@ extern int dbug_choice(int); #define myrandom(x) dbug_choice(x) #else #define NUM_OPS 10000000 -#define NUM_ENTRIES 10000 +#define NUM_ENTRIES 8192 #define NUM_THREADS 100 #define THREAD_ENTRIES ((NUM_ENTRIES/NUM_THREADS)-1) #endif + hashtable_t * ht; void * worker(void * arg) { - int stride = *(int*) arg; - - pageid_t *data = malloc(sizeof(pageid_t) * THREAD_ENTRIES); - -#ifdef DBUG_TEST - for(int i = 1; i <= THREAD_ENTRIES; i++) { - data[i-1] = -1 * (stride + (i * HT_ENTRIES)); - } -#else - for(int i = 1; i <= THREAD_ENTRIES; i++) { - data[i-1] = -1 * (stride + (i * NUM_THREADS)); - } -#endif - for(int j = 0; j < NUM_OPS/*/ NUM_THREADS*/; j++) { - + pageid_t *data = (pageid_t *)arg; + for(int j = 0; j < NUM_OPS/ NUM_THREADS; j++) { int op = myrandom(2); int i = myrandom(THREAD_ENTRIES); @@ -134,26 +120,53 @@ void * worker(void * arg) { START_TEST(singleThreadHashTest) { #ifdef DBUG_TEST - ht = hashtable_init((pageid_t)HT_ENTRIES); + ht = hashtable_init(HT_ENTRIES); #else ht = hashtable_init((pageid_t)((double)THREAD_ENTRIES * 1.1)); #endif - int i = 0; - worker(&i); + pageid_t *data = malloc(sizeof(pageid_t) * THREAD_ENTRIES); + + for(int i = 1; i <= THREAD_ENTRIES; i++) { + data[i-1] = -1 * (i * NUM_THREADS); + } + worker(data); + hashtable_deinit(ht); +} END_TEST + +START_TEST(wraparoundHashTest) { + unsigned numEntries = NUM_OPS/ NUM_THREADS * 4 + 3; + unsigned power = 1; + while ( (1ull << power ) < numEntries ) { + ++power; + } +#ifdef DBUG_TEST + ht = hashtable_init(HT_ENTRIES); +#else + ht = hashtable_init(numEntries); +#endif + pageid_t *data = malloc(sizeof(pageid_t) * THREAD_ENTRIES); + + for(int i = 1; i <= THREAD_ENTRIES; i++) { + data[i-1] = -1 * (((i << power) - 6 + myrandom(13)) / 13); + } + worker(data); hashtable_deinit(ht); } END_TEST START_TEST(concurrentHashTest) { #ifdef DBUG_TEST - ht = hashtable_init((pageid_t)HT_ENTRIES); + ht = hashtable_init(HT_ENTRIES); #else ht = hashtable_init((pageid_t)((double)NUM_ENTRIES * 1.1)); #endif pthread_t workers[NUM_THREADS]; for(int i = 0 ; i < NUM_THREADS; i++) { - int * ip = malloc(sizeof(int)); - *ip = i; - pthread_create(&workers[i], 0, worker, ip); + pageid_t *data = malloc(sizeof(pageid_t) * THREAD_ENTRIES); + + for(int j = 1; j <= THREAD_ENTRIES; j++) { + data[j-1] = -1 
* (i + (j * NUM_THREADS)); + } + pthread_create(&workers[i], 0, worker, data); } for(int i = 0 ; i < NUM_THREADS; i++) { pthread_join(workers[i],0); @@ -173,8 +186,10 @@ Suite * check_suite(void) { /* Sub tests are added, one per line, here */ tcase_add_test(tc, singleThreadHashTest); #ifndef DBUG_TEST // TODO should run exactly one of these two tests under dbug. Need good way to choose which one. + tcase_add_test(tc, wraparoundHashTest); tcase_add_test(tc, concurrentHashTest); #endif + /* --------------------------------------------- */ tcase_add_checked_fixture(tc, setup, teardown);