From a2cd1d562cbd910b0fe1095666a7889b9aefbefb Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Tue, 28 May 2013 16:14:19 -0400 Subject: [PATCH 01/30] WIP: devising a better way to cache/reuse session/cursor pairs. --- c_src/async_nif.h | 35 +++------- c_src/build_deps.sh | 11 +-- c_src/duration.h | 7 +- c_src/wterl.c | 124 ++++++++++++++++++---------------- src/riak_kv_wterl_backend.erl | 2 +- 5 files changed, 87 insertions(+), 92 deletions(-) diff --git a/c_src/async_nif.h b/c_src/async_nif.h index e7a9670..724b8d5 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -30,11 +30,11 @@ extern "C" { #include "fifo_q.h" #include "stats.h" -#ifndef __UNUSED -#define __UNUSED(v) ((void)(v)) +#ifndef UNUSED +#define UNUSED(v) ((void)(v)) #endif -#define ASYNC_NIF_MAX_WORKERS 128 +#define ASYNC_NIF_MAX_WORKERS 1024 #define ASYNC_NIF_WORKER_QUEUE_SIZE 500 #define ASYNC_NIF_MAX_QUEUED_REQS 1000 * ASYNC_NIF_MAX_WORKERS @@ -80,11 +80,11 @@ struct async_nif_state { #define ASYNC_NIF_DECL(decl, frame, pre_block, work_block, post_block) \ struct decl ## _args frame; \ static void fn_work_ ## decl (ErlNifEnv *env, ERL_NIF_TERM ref, ErlNifPid *pid, unsigned int worker_id, struct decl ## _args *args) { \ - __UNUSED(worker_id); \ + UNUSED(worker_id); \ do work_block while(0); \ } \ static void fn_post_ ## decl (struct decl ## _args *args) { \ - __UNUSED(args); \ + UNUSED(args); \ do post_block while(0); \ } \ static ERL_NIF_TERM decl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv_in[]) { \ @@ -92,7 +92,7 @@ struct async_nif_state { struct decl ## _args *args = &on_stack_args; \ struct decl ## _args *copy_of_args; \ struct async_nif_req_entry *req = NULL; \ - const char *affinity = NULL; \ + const unsigned int affinity = 0; \ ErlNifEnv *new_env = NULL; \ /* argv[0] is a ref used for selective recv */ \ const ERL_NIF_TERM *argv = argv_in + 1; \ @@ -122,7 +122,7 @@ struct async_nif_state { req->fn_post = (void (*)(void *))fn_post_ ## decl; \ int h = -1; \ if (affinity) \ - h = async_nif_str_hash_func(affinity) % async_nif->num_queues; \ + h = affinity % async_nif->num_queues; \ ERL_NIF_TERM reply = async_nif_enqueue_req(async_nif, req, h); \ if (!reply) { \ fn_post_ ## decl (args); \ @@ -218,23 +218,6 @@ async_nif_recycle_req(struct async_nif_req_entry *req, struct async_nif_state *a enif_mutex_unlock(async_nif->recycled_req_mutex); } -/** - * A string hash function. - * - * A basic hash function for strings of characters used during the - * affinity association. - * - * s a NULL terminated set of bytes to be hashed - * -> an integer hash encoding of the bytes - */ -static inline unsigned int -async_nif_str_hash_func(const char *s) -{ - unsigned int h = (unsigned int)*s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + (unsigned int)*s; - return h; -} - /** * Enqueue a request for processing by a worker thread. 
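Note on the hunks above: with the request affinity now a plain integer, queue selection reduces to a modulus, which is why the string hash can be dropped from this header. A minimal standalone restatement of that dispatch (the `num_queues` name follows the surrounding code; `queue_for` itself is hypothetical):

    /* Requests carrying the same affinity value always land on the same
       worker queue, preserving per-object ordering. */
    static unsigned int
    queue_for(unsigned int affinity, unsigned int num_queues)
    {
        return affinity % num_queues;
    }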
* @@ -366,7 +349,7 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) unsigned int num_queues = async_nif->num_queues; struct async_nif_work_queue *q = NULL; struct async_nif_req_entry *req = NULL; - __UNUSED(env); + UNUSED(env); STAT_PRINT(async_nif, qwait, "wterl"); @@ -521,7 +504,7 @@ async_nif_load() static void async_nif_upgrade(ErlNifEnv *env) { - __UNUSED(env); + UNUSED(env); // TODO: } diff --git a/c_src/build_deps.sh b/c_src/build_deps.sh index 789bcea..1a64c10 100755 --- a/c_src/build_deps.sh +++ b/c_src/build_deps.sh @@ -39,8 +39,8 @@ get_wt () git clone ${WT_REPO} && \ (cd $BASEDIR/wiredtiger && git checkout $WT_VSN || exit 1) else - git clone -b ${WT_BRANCH} ${WT_REPO} && \ - (cd $BASEDIR/wiredtiger && git checkout $WT_BRANCH origin/$WT_BRANCH || exit 1) + git clone ${WT_REPO} && \ + (cd $BASEDIR/wiredtiger && git checkout -b $WT_BRANCH origin/$WT_BRANCH || exit 1) fi mv wiredtiger $WT_DIR || exit 1 fi @@ -49,8 +49,8 @@ get_wt () [ -e $BASEDIR/wiredtiger-build.patch ] && \ (patch -p1 --forward < $BASEDIR/wiredtiger-build.patch || exit 1 ) ./autogen.sh || exit 1 - cd ./build_posix || exit 1 - [ -e Makefile ] && $MAKE distclean + [ -e $BASEDIR/$WT_DIR/build_posix/Makefile ] && \ + (cd $BASEDIR/$WT_DIR/build_posix && $MAKE distclean) wt_configure; ) } @@ -109,7 +109,8 @@ build_snappy () case "$1" in clean) - [ -d $WT_DIR/build_posix ] && (cd $WT_DIR/build_posix; make distclean) + [ -e $BASEDIR/$WT_DIR/build_posix/Makefile ] && \ + (cd $BASEDIR/$WT_DIR/build_posix && $MAKE distclean) rm -rf system $SNAPPY_DIR rm -f ${BASEDIR}/../priv/wt rm -f ${BASEDIR}/../priv/libwiredtiger-*.so diff --git a/c_src/duration.h b/c_src/duration.h index 635d0fd..fc31101 100644 --- a/c_src/duration.h +++ b/c_src/duration.h @@ -33,8 +33,7 @@ static uint64_t ts(time_scale unit) ((uint64_t)ts.tv_nsec / scale[unit].div)); } -#if 0 -//if defined(__i386__) || defined(__x86_64__) +if defined(__i386__) || defined(__x86_64__) /** * cpu_clock_ticks() @@ -55,6 +54,10 @@ static inline uint64_t cpu_clock_ticks() return (uint64_t)hi << 32 | lo; } +#endif + +#if 0 + /** * cpu_clock_ticks() * diff --git a/c_src/wterl.c b/c_src/wterl.c index f68723e..5f93268 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -35,8 +35,8 @@ # define dprint(s, ...) {} #endif -#ifndef __UNUSED -#define __UNUSED(v) ((void)(v)) +#ifndef UNUSED +#define UNUSED(v) ((void)(v)) #endif #include "wiredtiger.h" @@ -47,36 +47,37 @@ #include "stats.h" #endif +#if (ASYNC_NIF_MAX_WORKERS > 32768) +#error "WterlCtx cache won't work properly with > 32,768 workers." +#endif + static ErlNifResourceType *wterl_conn_RESOURCE; static ErlNifResourceType *wterl_cursor_RESOURCE; -/* Generators for 'cursors' a named, type-specific hash table functions. */ -KHASH_MAP_INIT_STR(cursors, WT_CURSOR*); +/* Generators for named, type-specific hash table functions. */ +KHASH_MAP_INIT_STR(uri, unsigned int); // URI -> number of cursors(URI) + + union { + unsigned int hash[]; + struct { + unsigned int:02 nest; // cuckoo's nest choosen on hash collision + unsigned int:15 off; // bitpop((bmp & (1 << off) - 1) & bmp) + unsigned int:10 depth; + } nests; + } cuckoo; + -/** - * We will have exactly one (1) WterlCtx for each async worker thread. 
As - * requests arrive we will reuse the same WterlConnHandle->contexts[worker_id] - * WterlCtx in the work block ensuring that each async worker thread a) has - * a separate WT_SESSION (because they are not thread safe) and b) when - * possible we avoid opening new cursors by first looking for one in the - * cursors hash table. In practice this means we could have (num_workers - * * num_tables) of cursors open which we need to account for when setting - * session_max in the configuration of WiredTiger so that it creates enough - * hazard pointers for this extreme case. - * - * Note: We don't protect access to this struct with a mutex because it will - * only be accessed by the same worker thread. - */ typedef struct { WT_SESSION *session; - khash_t(cursors) *cursors; + WT_CURSOR *cursor; } WterlCtx; typedef struct { WT_CONNECTION *conn; const char *session_config; ErlNifMutex *contexts_mutex; - WterlCtx contexts[ASYNC_NIF_MAX_WORKERS]; + unsigned int num_contexts; + WterlCtx **contexts; // TODO: free this } WterlConnHandle; typedef struct { @@ -268,24 +269,6 @@ __init_session_and_cursor_cache(WterlConnHandle *conn_handle, WterlCtx *ctx) return 0; } -/** - * Get the per-worker reusable WT_SESSION for a worker_id. - */ -static int -__session_for(WterlConnHandle *conn_handle, unsigned int worker_id, WT_SESSION **session) -{ - WterlCtx *ctx = &conn_handle->contexts[worker_id]; - int rc = 0; - - if (ctx->session == NULL) { - enif_mutex_lock(conn_handle->contexts_mutex); - rc = __init_session_and_cursor_cache(conn_handle, ctx); - enif_mutex_unlock(conn_handle->contexts_mutex); - } - *session = ctx->session; - return rc; -} - /** * Close all sessions and all cursors open on any objects. * @@ -346,18 +329,36 @@ __close_cursors_on(WterlConnHandle *conn_handle, const char *uri) } } +/** + * A string hash function. + * + * A basic hash function for strings of characters used during the + * affinity association. + * + * s a NULL terminated set of bytes to be hashed + * -> an integer hash encoding of the bytes + */ +static inline unsigned int +__str_hash_func(const char *s) +{ + unsigned int h = (unsigned int)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (unsigned int)*s; + return h; +} + /** * Get a reusable cursor that was opened for a particular worker within its * session. */ static int -__retain_cursor(WterlConnHandle *conn_handle, unsigned int worker_id, const char *uri, WT_CURSOR **cursor) +__retain_ctx(WterlConnHandle *conn_handle, const char *uri, WterlCtx **ctx) { /* Check to see if we have a cursor open for this uri and if so reuse it. */ WterlCtx *ctx = &conn_handle->contexts[worker_id]; khash_t(cursors) *h = NULL; khiter_t itr; int rc; + unsigned int hash = __str_hash_func(uri); // TODO: add config at some point if (ctx->session == NULL) { enif_mutex_lock(conn_handle->contexts_mutex); @@ -398,11 +399,11 @@ } static void -__release_cursor(WterlConnHandle *conn_handle, unsigned int worker_id, const char *uri, WT_CURSOR *cursor) +__release_ctx(WterlConnHandle *conn_handle, const char *uri, WterlCtx *ctx) { - __UNUSED(conn_handle); - __UNUSED(worker_id); - __UNUSED(uri); + UNUSED(conn_handle); + UNUSED(uri); - cursor->reset(cursor); + ctx->cursor->reset(ctx->cursor); } @@ -843,8 +844,7 @@ ASYNC_NIF_DECL( * of objects specified.
* * argv[0] WterlConnHandle resource - * argv[1] object name URI string - * argv[2] config string as an Erlang binary + * argv[1] config string as an Erlang binary */ ASYNC_NIF_DECL( wterl_checkpoint, @@ -870,13 +870,15 @@ ASYNC_NIF_DECL( ASYNC_NIF_REPLY(enif_make_badarg(env)); return; } + WT_CONNECTION *conn = args->conn_handle->conn; WT_SESSION *session = NULL; - int rc = __session_for(args->conn_handle, worker_id, &session); + int rc = conn->open_session(conn, NULL, args->conn_handle->session_config, &session); if (rc != 0) { ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } rc = session->checkpoint(session, (const char*)config.data); + (void)session->close(session, NULL); ASYNC_NIF_REPLY(rc == 0 ? ATOM_OK : __strerror_term(env, rc)); }, { // post @@ -1208,12 +1210,14 @@ ASYNC_NIF_DECL( return; } + WterlCtx *ctx = NULL; WT_CURSOR *cursor = NULL; - int rc = __retain_cursor(args->conn_handle, worker_id, args->uri, &cursor); + int rc = __retain_ctx(args->conn_handle, args->uri, &ctx); if (rc != 0) { ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } + cursor = ctx->cursor; WT_ITEM item_key; item_key.data = key.data; @@ -1221,7 +1225,7 @@ cursor->set_key(cursor, &item_key); rc = cursor->remove(cursor); ASYNC_NIF_REPLY(rc == 0 ? ATOM_OK : __strerror_term(env, rc)); - __release_cursor(args->conn_handle, worker_id, args->uri, cursor); + __release_ctx(args->conn_handle, args->uri, ctx); }, { // post @@ -1263,12 +1267,14 @@ ASYNC_NIF_DECL( return; } + WterlCtx *ctx = NULL; WT_CURSOR *cursor = NULL; - int rc = __retain_cursor(args->conn_handle, worker_id, args->uri, &cursor); + int rc = __retain_ctx(args->conn_handle, args->uri, &ctx); if (rc != 0) { ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } + cursor = ctx->cursor; WT_ITEM item_key; WT_ITEM item_value; @@ -1290,7 +1296,7 @@ unsigned char *bin = enif_make_new_binary(env, item_value.size, &value); memcpy(bin, item_value.data, item_value.size); ASYNC_NIF_REPLY(enif_make_tuple2(env, ATOM_OK, value)); - __release_cursor(args->conn_handle, worker_id, args->uri, cursor); + __release_ctx(args->conn_handle, args->uri, ctx); }, { // post @@ -1341,12 +1347,14 @@ ASYNC_NIF_DECL( return; } + WterlCtx *ctx = NULL; WT_CURSOR *cursor = NULL; - int rc = __retain_cursor(args->conn_handle, worker_id, args->uri, &cursor); + int rc = __retain_ctx(args->conn_handle, args->uri, &ctx); if (rc != 0) { ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } + cursor = ctx->cursor; WT_ITEM item_key; WT_ITEM item_value; @@ -1357,7 +1365,7 @@ item_value.size = value.size; cursor->set_value(cursor, &item_value); rc = cursor->insert(cursor); - __release_cursor(args->conn_handle, worker_id, args->uri, cursor); + __release_ctx(args->conn_handle, args->uri, ctx); ASYNC_NIF_REPLY(rc == 0 ?
ATOM_OK : __strerror_term(env, rc)); }, { // post @@ -2142,9 +2150,9 @@ on_load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) static int on_reload(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) { - __UNUSED(env); - __UNUSED(priv_data); - __UNUSED(load_info); + UNUSED(env); + UNUSED(priv_data); + UNUSED(load_info); return 0; // TODO: implement } @@ -2231,9 +2239,9 @@ on_unload(ErlNifEnv *env, void *priv_data) static int on_upgrade(ErlNifEnv *env, void **priv_data, void **old_priv_data, ERL_NIF_TERM load_info) { - __UNUSED(priv_data); - __UNUSED(old_priv_data); - __UNUSED(load_info); + UNUSED(priv_data); + UNUSED(old_priv_data); + UNUSED(load_info); ASYNC_NIF_UPGRADE(wterl, env); // TODO: implement return 0; } diff --git a/src/riak_kv_wterl_backend.erl b/src/riak_kv_wterl_backend.erl index 4d0448d..313da29 100644 --- a/src/riak_kv_wterl_backend.erl +++ b/src/riak_kv_wterl_backend.erl @@ -122,7 +122,7 @@ start(Partition, Config) -> [{internal_page_max, "128K"}, {leaf_page_max, "128K"}, {lsm_chunk_size, "100MB"}, - {lsm_merge_threads, "2"}, + {lsm_merge_threads, 2}, {prefix_compression, false}, {lsm_bloom_newest, true}, {lsm_bloom_oldest, true} , From f0d5baeb0e6dd69a3b7375a4f36023ce93a15aa0 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Wed, 29 May 2013 14:56:38 -0400 Subject: [PATCH 02/30] WIP: more caching work, still not done. --- c_src/common.h | 61 +++++ c_src/khash.h | 2 +- c_src/queue.h | 667 +++++++++++++++++++++++++++++++++++++++++++++++++ c_src/wterl.c | 305 ++++++++++++++++++---- 4 files changed, 981 insertions(+), 54 deletions(-) create mode 100644 c_src/common.h create mode 100644 c_src/queue.h diff --git a/c_src/common.h b/c_src/common.h new file mode 100644 index 0000000..70aea07 --- /dev/null +++ b/c_src/common.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved. + * Author: Gregory Burd + * + * This file is provided to you under the Apache License, + * Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain + * a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef __COMMON_H__ +#define __COMMON_H__ +#if defined(__cplusplus) +extern "C" { +#endif + +#if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__)) +# undef DEBUG +# define DEBUG 0 +# define DPRINTF (void) /* Vararg macros may be unsupported */ +#elif DEBUG +#include <stdio.h> +#include <stdarg.h> +#define DPRINTF(fmt, ...) \ + do { \ + fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__); \ + fflush(stderr); \ + } while(0) +#define DPUTS(arg) DPRINTF("%s", arg) +#else +#define DPRINTF(fmt, ...)
((void) 0) +#define DPUTS(arg) ((void) 0) +#endif + +#ifndef __UNUSED +#define __UNUSED(v) ((void)(v)) +#endif + +#ifndef COMPQUIET +#define COMPQUIET(n, v) do { \ + (n) = (v); \ + (n) = (n); \ +} while (0) +#endif + +#if defined(__cplusplus) +} +#endif + +#endif // __COMMON_H__ diff --git a/c_src/khash.h b/c_src/khash.h index ab157b1..69549dc 100644 --- a/c_src/khash.h +++ b/c_src/khash.h @@ -586,7 +586,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#ifdef __x86_64__ +#ifdef __x86_64__ #define KHASH_MAP_INIT_PTR(name, khval_t) \ KHASH_INIT(name, void*, khval_t, 1, kh_ptr64_hash_func, kh_ptr64_hash_equal) #else diff --git a/c_src/queue.h b/c_src/queue.h new file mode 100644 index 0000000..9235d47 --- /dev/null +++ b/c_src/queue.h @@ -0,0 +1,667 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + * $FreeBSD: src/sys/sys/queue.h,v 1.54 2002/08/05 05:18:43 alfred Exp $ + */ + +#ifndef _DB_QUEUE_H_ +#define _DB_QUEUE_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * This file defines four types of data structures: singly-linked lists, + * singly-linked tail queues, lists and tail queues. + * + * A singly-linked list is headed by a single forward pointer. The elements + * are singly linked for minimum space and pointer manipulation overhead at + * the expense of O(n) removal for arbitrary elements. New elements can be + * added to the list after an existing element or at the head of the list. + * Elements being removed from the head of the list should use the explicit + * macro for this purpose for optimum efficiency. A singly-linked list may + * only be traversed in the forward direction. Singly-linked lists are ideal + * for applications with large datasets and few or no removals or for + * implementing a LIFO queue. 
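+ *
+ * (A usage sketch, with a hypothetical `struct node`, showing the LIFO
+ * pattern the paragraph above describes:
+ *
+ *	struct node { int v; SLIST_ENTRY(node) link; };
+ *	SLIST_HEAD(stack, node) head = SLIST_HEAD_INITIALIZER(head);
+ *	struct node n1 = { 1, { NULL } };
+ *	SLIST_INSERT_HEAD(&head, &n1, link);	// push
+ *	struct node *top = SLIST_FIRST(&head);	// peek
+ *	SLIST_REMOVE_HEAD(&head, link);		// pop
+ * )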
+ * + * A singly-linked tail queue is headed by a pair of pointers, one to the + * head of the list and the other to the tail of the list. The elements are + * singly linked for minimum space and pointer manipulation overhead at the + * expense of O(n) removal for arbitrary elements. New elements can be added + * to the list after an existing element, at the head of the list, or at the + * end of the list. Elements being removed from the head of the tail queue + * should use the explicit macro for this purpose for optimum efficiency. + * A singly-linked tail queue may only be traversed in the forward direction. + * Singly-linked tail queues are ideal for applications with large datasets + * and few or no removals or for implementing a FIFO queue. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may only be traversed in the forward direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * For details on the use of these macros, see the queue(3) manual page. + * + * + * SLIST LIST STAILQ TAILQ + * _HEAD + + + + + * _HEAD_INITIALIZER + + + + + * _ENTRY + + + + + * _INIT + + + + + * _EMPTY + + + + + * _FIRST + + + + + * _NEXT + + + + + * _PREV - - - + + * _LAST - - + + + * _FOREACH + + + + + * _FOREACH_REVERSE - - - + + * _INSERT_HEAD + + + + + * _INSERT_BEFORE - + - + + * _INSERT_AFTER + + + + + * _INSERT_TAIL - - + + + * _CONCAT - - + + + * _REMOVE_HEAD + - + - + * _REMOVE + + + + + * + */ + +/* + * XXX + * We #undef all of the macros because there are incompatible versions of this + * file and these macros on various systems. What makes the problem worse is + * they are included and/or defined by system include files which we may have + * already loaded into Berkeley DB before getting here. For example, FreeBSD's + * includes its system , and VxWorks UnixLib.h defines + * several of the LIST_XXX macros. Visual C.NET 7.0 also defines some of these + * same macros in Vc7\PlatformSDK\Include\WinNT.h. Make sure we use ours. 
+ */ +#undef LIST_EMPTY +#undef LIST_ENTRY +#undef LIST_FIRST +#undef LIST_FOREACH +#undef LIST_HEAD +#undef LIST_HEAD_INITIALIZER +#undef LIST_INIT +#undef LIST_INSERT_AFTER +#undef LIST_INSERT_BEFORE +#undef LIST_INSERT_HEAD +#undef LIST_NEXT +#undef LIST_REMOVE +#undef QMD_TRACE_ELEM +#undef QMD_TRACE_HEAD +#undef QUEUE_MACRO_DEBUG +#undef SLIST_EMPTY +#undef SLIST_ENTRY +#undef SLIST_FIRST +#undef SLIST_FOREACH +#undef SLIST_FOREACH_PREVPTR +#undef SLIST_HEAD +#undef SLIST_HEAD_INITIALIZER +#undef SLIST_INIT +#undef SLIST_INSERT_AFTER +#undef SLIST_INSERT_HEAD +#undef SLIST_NEXT +#undef SLIST_REMOVE +#undef SLIST_REMOVE_HEAD +#undef STAILQ_CONCAT +#undef STAILQ_EMPTY +#undef STAILQ_ENTRY +#undef STAILQ_FIRST +#undef STAILQ_FOREACH +#undef STAILQ_HEAD +#undef STAILQ_HEAD_INITIALIZER +#undef STAILQ_INIT +#undef STAILQ_INSERT_AFTER +#undef STAILQ_INSERT_HEAD +#undef STAILQ_INSERT_TAIL +#undef STAILQ_LAST +#undef STAILQ_NEXT +#undef STAILQ_REMOVE +#undef STAILQ_REMOVE_HEAD +#undef STAILQ_REMOVE_HEAD_UNTIL +#undef TAILQ_CONCAT +#undef TAILQ_EMPTY +#undef TAILQ_ENTRY +#undef TAILQ_FIRST +#undef TAILQ_FOREACH +#undef TAILQ_FOREACH_REVERSE +#undef TAILQ_HEAD +#undef TAILQ_HEAD_INITIALIZER +#undef TAILQ_INIT +#undef TAILQ_INSERT_AFTER +#undef TAILQ_INSERT_BEFORE +#undef TAILQ_INSERT_HEAD +#undef TAILQ_INSERT_TAIL +#undef TAILQ_LAST +#undef TAILQ_NEXT +#undef TAILQ_PREV +#undef TAILQ_REMOVE +#undef TRACEBUF +#undef TRASHIT + +#define QUEUE_MACRO_DEBUG 0 +#if QUEUE_MACRO_DEBUG +/* Store the last 2 places the queue element or head was altered */ +struct qm_trace { + char * lastfile; + int lastline; + char * prevfile; + int prevline; +}; + +#define TRACEBUF struct qm_trace trace; +#define TRASHIT(x) do {(x) = (void *)-1;} while (0) + +#define QMD_TRACE_HEAD(head) do { \ + (head)->trace.prevline = (head)->trace.lastline; \ + (head)->trace.prevfile = (head)->trace.lastfile; \ + (head)->trace.lastline = __LINE__; \ + (head)->trace.lastfile = __FILE__; \ +} while (0) + +#define QMD_TRACE_ELEM(elem) do { \ + (elem)->trace.prevline = (elem)->trace.lastline; \ + (elem)->trace.prevfile = (elem)->trace.lastfile; \ + (elem)->trace.lastline = __LINE__; \ + (elem)->trace.lastfile = __FILE__; \ +} while (0) + +#else +#define QMD_TRACE_ELEM(elem) +#define QMD_TRACE_HEAD(head) +#define TRACEBUF +#define TRASHIT(x) +#endif /* QUEUE_MACRO_DEBUG */ + +/* + * Singly-linked List declarations. + */ +#define SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. 
+ */ +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) + +#define SLIST_FIRST(head) ((head)->slh_first) + +#define SLIST_FOREACH(var, head, field) \ + for ((var) = SLIST_FIRST((head)); \ + (var); \ + (var) = SLIST_NEXT((var), field)) + +#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ + for ((varp) = &SLIST_FIRST((head)); \ + ((var) = *(varp)) != NULL; \ + (varp) = &SLIST_NEXT((var), field)) + +#define SLIST_INIT(head) do { \ + SLIST_FIRST((head)) = NULL; \ +} while (0) + +#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ + SLIST_NEXT((slistelm), field) = (elm); \ +} while (0) + +#define SLIST_INSERT_HEAD(head, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ + SLIST_FIRST((head)) = (elm); \ +} while (0) + +#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#define SLIST_REMOVE(head, elm, type, field) do { \ + if (SLIST_FIRST((head)) == (elm)) { \ + SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = SLIST_FIRST((head)); \ + while (SLIST_NEXT(curelm, field) != (elm)) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_NEXT(curelm, field) = \ + SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ + } \ +} while (0) + +#define SLIST_REMOVE_HEAD(head, field) do { \ + SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ +} while (0) + +/* + * Singly-linked Tail queue declarations. + */ +#define STAILQ_HEAD(name, type) \ +struct name { \ + struct type *stqh_first;/* first element */ \ + struct type **stqh_last;/* addr of last next element */ \ +} + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } + +#define STAILQ_ENTRY(type) \ +struct { \ + struct type *stqe_next; /* next element */ \ +} + +/* + * Singly-linked Tail queue functions. + */ +#define STAILQ_CONCAT(head1, head2) do { \ + if (!STAILQ_EMPTY((head2))) { \ + *(head1)->stqh_last = (head2)->stqh_first; \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_INIT((head2)); \ + } \ +} while (0) + +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) + +#define STAILQ_FIRST(head) ((head)->stqh_first) + +#define STAILQ_FOREACH(var, head, field) \ + for ((var) = STAILQ_FIRST((head)); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_INIT(head) do { \ + STAILQ_FIRST((head)) = NULL; \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT((tqelm), field) = (elm); \ +} while (0) + +#define STAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_FIRST((head)) = (elm); \ +} while (0) + +#define STAILQ_INSERT_TAIL(head, elm, field) do { \ + STAILQ_NEXT((elm), field) = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY((head)) ? 
\ + NULL : \ + ((struct type *) \ + ((char *)((head)->stqh_last) - __offsetof(struct type, field)))) + +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +#define STAILQ_REMOVE(head, elm, type, field) do { \ + if (STAILQ_FIRST((head)) == (elm)) { \ + STAILQ_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = STAILQ_FIRST((head)); \ + while (STAILQ_NEXT(curelm, field) != (elm)) \ + curelm = STAILQ_NEXT(curelm, field); \ + if ((STAILQ_NEXT(curelm, field) = \ + STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ + } \ +} while (0) + +#define STAILQ_REMOVE_HEAD(head, field) do { \ + if ((STAILQ_FIRST((head)) = \ + STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ + if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +/* + * List declarations. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. + */ + +#define LIST_EMPTY(head) ((head)->lh_first == NULL) + +#define LIST_FIRST(head) ((head)->lh_first) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = LIST_FIRST((head)); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_INIT(head) do { \ + LIST_FIRST((head)) = NULL; \ +} while (0) + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ + LIST_NEXT((listelm), field)->field.le_prev = \ + &LIST_NEXT((elm), field); \ + LIST_NEXT((listelm), field) = (elm); \ + (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ +} while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + LIST_NEXT((elm), field) = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ +} while (0) + +#define LIST_INSERT_HEAD(head, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ + LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ + LIST_FIRST((head)) = (elm); \ + (elm)->field.le_prev = &LIST_FIRST((head)); \ +} while (0) + +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +#define LIST_REMOVE(elm, field) do { \ + if (LIST_NEXT((elm), field) != NULL) \ + LIST_NEXT((elm), field)->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = LIST_NEXT((elm), field); \ +} while (0) + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +/* + * Tail queue functions. 
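+ *
+ * (A usage sketch, with a hypothetical `struct job`, showing a FIFO work
+ * queue built from the macros below:
+ *
+ *	struct job { int id; TAILQ_ENTRY(job) entries; };
+ *	TAILQ_HEAD(jobq, job) q = TAILQ_HEAD_INITIALIZER(q);
+ *	struct job j1 = { 42, { NULL, NULL } };
+ *	TAILQ_INSERT_TAIL(&q, &j1, entries);	// enqueue at the tail
+ *	struct job *oldest = TAILQ_FIRST(&q);	// head is the oldest element
+ *	TAILQ_REMOVE(&q, oldest, entries);	// dequeue
+ * )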
+ */ +#define TAILQ_CONCAT(head1, head2, field) do { \ + if (!TAILQ_EMPTY(head2)) { \ + *(head1)->tqh_last = (head2)->tqh_first; \ + (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ + (head1)->tqh_last = (head2)->tqh_last; \ + TAILQ_INIT((head2)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_HEAD(head2); \ + } \ +} while (0) + +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_INIT(head) do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ +} while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else { \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + } \ + TAILQ_NEXT((listelm), field) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&listelm->field); \ +} while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&listelm->field); \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + TAILQ_NEXT((elm), field) = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +#define TAILQ_REMOVE(head, elm, field) do { \ + if ((TAILQ_NEXT((elm), field)) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else { \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + QMD_TRACE_HEAD(head); \ + } \ + *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ + TRASHIT((elm)->field.tqe_next); \ + TRASHIT((elm)->field.tqe_prev); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +/* + * Circular queue definitions. 
+ */ +#define CIRCLEQ_HEAD(name, type) \ +struct name { \ + struct type *cqh_first; /* first element */ \ + struct type *cqh_last; /* last element */ \ +} + +#define CIRCLEQ_HEAD_INITIALIZER(head) \ + { (void *)&head, (void *)&head } + +#define CIRCLEQ_ENTRY(type) \ +struct { \ + struct type *cqe_next; /* next element */ \ + struct type *cqe_prev; /* previous element */ \ +} + +/* + * Circular queue functions. + */ +#define CIRCLEQ_INIT(head) do { \ + (head)->cqh_first = (void *)(head); \ + (head)->cqh_last = (void *)(head); \ +} while (/*CONSTCOND*/0) + +#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ + (elm)->field.cqe_next = (listelm)->field.cqe_next; \ + (elm)->field.cqe_prev = (listelm); \ + if ((listelm)->field.cqe_next == (void *)(head)) \ + (head)->cqh_last = (elm); \ + else \ + (listelm)->field.cqe_next->field.cqe_prev = (elm); \ + (listelm)->field.cqe_next = (elm); \ +} while (/*CONSTCOND*/0) + +#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do { \ + (elm)->field.cqe_next = (listelm); \ + (elm)->field.cqe_prev = (listelm)->field.cqe_prev; \ + if ((listelm)->field.cqe_prev == (void *)(head)) \ + (head)->cqh_first = (elm); \ + else \ + (listelm)->field.cqe_prev->field.cqe_next = (elm); \ + (listelm)->field.cqe_prev = (elm); \ +} while (/*CONSTCOND*/0) + +#define CIRCLEQ_INSERT_HEAD(head, elm, field) do { \ + (elm)->field.cqe_next = (head)->cqh_first; \ + (elm)->field.cqe_prev = (void *)(head); \ + if ((head)->cqh_last == (void *)(head)) \ + (head)->cqh_last = (elm); \ + else \ + (head)->cqh_first->field.cqe_prev = (elm); \ + (head)->cqh_first = (elm); \ +} while (/*CONSTCOND*/0) + +#define CIRCLEQ_INSERT_TAIL(head, elm, field) do { \ + (elm)->field.cqe_next = (void *)(head); \ + (elm)->field.cqe_prev = (head)->cqh_last; \ + if ((head)->cqh_first == (void *)(head)) \ + (head)->cqh_first = (elm); \ + else \ + (head)->cqh_last->field.cqe_next = (elm); \ + (head)->cqh_last = (elm); \ +} while (/*CONSTCOND*/0) + +#define CIRCLEQ_REMOVE(head, elm, field) do { \ + if ((elm)->field.cqe_next == (void *)(head)) \ + (head)->cqh_last = (elm)->field.cqe_prev; \ + else \ + (elm)->field.cqe_next->field.cqe_prev = \ + (elm)->field.cqe_prev; \ + if ((elm)->field.cqe_prev == (void *)(head)) \ + (head)->cqh_first = (elm)->field.cqe_next; \ + else \ + (elm)->field.cqe_prev->field.cqe_next = \ + (elm)->field.cqe_next; \ +} while (/*CONSTCOND*/0) + +#define CIRCLEQ_FOREACH(var, head, field) \ + for ((var) = ((head)->cqh_first); \ + (var) != (const void *)(head); \ + (var) = ((var)->field.cqe_next)) + +#define CIRCLEQ_FOREACH_REVERSE(var, head, field) \ + for ((var) = ((head)->cqh_last); \ + (var) != (const void *)(head); \ + (var) = ((var)->field.cqe_prev)) + +/* + * Circular queue access methods. + */ +#define CIRCLEQ_EMPTY(head) ((head)->cqh_first == (void *)(head)) +#define CIRCLEQ_FIRST(head) ((head)->cqh_first) +#define CIRCLEQ_LAST(head) ((head)->cqh_last) +#define CIRCLEQ_NEXT(elm, field) ((elm)->field.cqe_next) +#define CIRCLEQ_PREV(elm, field) ((elm)->field.cqe_prev) + +#define CIRCLEQ_LOOP_NEXT(head, elm, field) \ + (((elm)->field.cqe_next == (void *)(head)) \ + ? ((head)->cqh_first) \ + : (elm->field.cqe_next)) +#define CIRCLEQ_LOOP_PREV(head, elm, field) \ + (((elm)->field.cqe_prev == (void *)(head)) \ + ? 
((head)->cqh_last) \ + : (elm->field.cqe_prev)) + + +#if defined(__cplusplus) +} +#endif +#endif /* !_DB_QUEUE_H_ */ diff --git a/c_src/wterl.c b/c_src/wterl.c index 5f93268..3455143 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -21,56 +21,47 @@ #include #include +#include +#include #include -#ifdef DEBUG -#include -#include -#define dprint(s, ...) do { \ - fprintf(stderr, s, ##__VA_ARGS__); \ - fprintf(stderr, "\r\n"); \ - fflush(stderr); \ - } while(0); -#else -# define dprint(s, ...) {} -#endif - -#ifndef UNUSED -#define UNUSED(v) ((void)(v)) -#endif - +#include "common.h" #include "wiredtiger.h" #include "async_nif.h" #include "khash.h" - -#ifdef WTERL_STATS #include "stats.h" -#endif -#if (ASYNC_NIF_MAX_WORKERS > 32768) -#error "WterlCtx cache won't work properly with > 32,768 workers." -#endif +#define CTX_CACHE_SIZE ASYNC_NIF_MAX_WORKERS static ErlNifResourceType *wterl_conn_RESOURCE; +static ErlNifResourceType *wterl_ctx_RESOURCE; static ErlNifResourceType *wterl_cursor_RESOURCE; -/* Generators for named, type-specific hash table functions. */ -KHASH_MAP_INIT_STR(uri, unsigned int); // URI -> number of cursors(URI) +typedef struct struct WterlCtxHandle { + WT_SESSION *session; // open session + WT_CURSOR *cursors[]; // open cursors, all reset ready to reuse +} WterlCtxHandle; - union { - unsigned int hash[]; - struct { - unsigned int:02 nest; // cuckoo's nest choosen on hash collision - unsigned int:15 off; // bitpop((bmp & (1 << off) - 1) & bmp) - unsigned int:10 depth; - } nests; - } cuckoo; +struct ctx_lru_entry { + WterlCtxHandle *ctx; + u_int64_t sig; + SLIST_HEAD(ctx, struct WterlCtxHandle*) set; + STAILQ_ENTRY(struct ctx_lru_entry) entries; +}; +KHASH_SET_INIT_INT64(ctx_idx, struct ctx_group*); -typedef struct { - WT_SESSION *session; - WT_CURSOR *cursor; -} WterlCtx; +struct ctx_cache { + size_t size; + struct lru { + STAILQ_HEAD(lru, struct ctx_lru_entry*) lru; + } lru; + struct idx { + int h; + ctx_group cgs[CTX_CACHE_SIZE]; + khash_t(ctx_idx) *ctx; + } idx; +}; typedef struct { WT_CONNECTION *conn; @@ -127,6 +118,213 @@ struct wterl_priv_data { ASYNC_NIF_INIT(wterl); +/** + * Is the cache full? + * + * Test to see if the cache is full or not. + * + * -> 0=false/not full yet, 1=true/cache is full + */ +static int +__ctx_cache_full(struct wterl_ctx_cache *cache) +{ + return cache->size == CTX_CACHE_SIZE ? 1 : 0; +} + +/** + * Evict items from the cache. + * + * Evict some number of items from the cache to make space for other items. + * + * -> number of items evicted + */ +static int +__ctx_cache_evict(struct ctx_cache *cache) +{ + // TODO: +} + +/** + * Find a matching item in the cache. + * + * See if there exists an item in the cache with a matching signature, if + * so remove it from the cache and return it for use by the callee. + * + * sig a 32-bit signature (hash) representing the session/cursor* needed + * for the operation + */ +static WterlCtxHandle * +__ctx_cache_find(wterl_ctx_cache *cache, u_int64_t sig) +{ + khiter_t k; + + kh_get(ctx_idx, cache->idx.ctx, sig); + if (k != kh_end(h)) { + /* + * This signature exists in the hashtable, that's good news. Maybe + * there is a context open and ready for us to reuse, let's check. + */ + struct ctx_group *cg = kh_value(ctx_idx, k); + if (SLIST_EMPTY(cg->set)) { // cache miss + /* + * Nope, there are no contexts available for reuse with this + * signature. + */ + return NULL; + } else { // cache hit + /* + * Yes, we've found a context available for reuse with the + * desired signature. 
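+ * (Hedged aside on the khash API as documented in khash.h: a lookup
+ * normally assigns its result, `k = kh_get(ctx_idx, cache->idx.ctx, sig);`,
+ * and the table pointer -- not the type name -- is what kh_end() and
+ * kh_value() take, e.g. `kh_value(cache->idx.ctx, k)`.)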
Remove it from the cache and return it + * to the caller. + */ + WterlCtxHandle *p = SLIST_REMOVE(cg->set); // remove from index + WterlCtxHandle *q; + STAILQ_FOREACH(q, &cache->lru, entries) { + if (p == q) { + STAILQ_REMOVE(&cache-lru, q, ctx_lru_entries, entries); + } + } + // remove from lru + cache->size--; // update cache size + return p; + } + } else { + /* + * The signature didn't match any that we're caching contexts for right + * now, so clearly there won't be any cached contexts for this either. + */ + return NULL; + } +} + +/** + * Add/Return an item to the cache. + * + * Return an item into the cache, reset the cursors it has open and put it at + * the front of the LRU. + */ +static int +__ctx_cache_add(wterl_ctx_cache *cache, WterlCtxHandle *e) +{ + khiter_t k; + + kh_get(ctx_idx, cache->idx.ctx, sig); + if (k != kh_end(h)) { + /* + * This signature exists in the bitmap, that's good news. We can just + * put this into the list of cached items for that signature. + */ + struct ctx_group *cg = kh_value(ctx_idx, k); + SLIST_INSERT_HEAD(cg->set, e, entries); // add to index + STAILQ_INSERT_HEAD(&cache->lru, e, entries); // add to lru + cache->size++; // update cache size + } else { + /* + * The signature didn't match any that we're caching contexts for right + * now, so we need to add a context group for it. + */ + if (cache->idx + return NULL; + } +} + +/** + * Produce the Morton Number from two 32-bit unsigned integers. + * e.g. p = 0101 1011 0100 0011 + * q = 1011 1100 0001 0011 + * z = 0110 0111 1101 1010 0010 0001 0000 1111 + */ +static inline u_int64_t +__interleave(u_int32_t p, u_int32_t q) +{ + static const u_int32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + static const u_int32_t S[] = {1, 2, 4, 8}; + u_int32_t x, y; + u_int64_t z; + + x = p & 0x0000FFFF; // Interleave lower 16 bits of p as x and q as y, so the + y = q & 0x0000FFFF; // bits of x are in the even positions and bits from y + z = 0; // in the odd; the first 32 bits of 'z' is the result. + + x = (x | (x << S[3])) & B[3]; + x = (x | (x << S[2])) & B[2]; + x = (x | (x << S[1])) & B[1]; + x = (x | (x << S[0])) & B[0]; + + y = (y | (y << S[3])) & B[3]; + y = (y | (y << S[2])) & B[2]; + y = (y | (y << S[1])) & B[1]; + y = (y | (y << S[0])) & B[0]; + + z = x | (y << 1); + + x = (p >> 16) & 0x0000FFFF; // Interleave the upper 16 bits of p as x and q as y + y = (q >> 16) & 0x0000FFFF; // just as before. + + x = (x | (x << S[3])) & B[3]; + x = (x | (x << S[2])) & B[2]; + x = (x | (x << S[1])) & B[1]; + x = (x | (x << S[0])) & B[0]; + + y = (y | (y << S[3])) & B[3]; + y = (y | (y << S[2])) & B[2]; + y = (y | (y << S[1])) & B[1]; + y = (y | (y << S[0])) & B[0]; + + z = (z << 16) | (x | (y << 1)); // the resulting 64-bit Morton Number. + + return z; +} + +/** + * A string hash function. + * + * A basic hash function for strings of characters used during the + * affinity association. + * + * s a NULL terminated set of bytes to be hashed + * -> an integer hash encoding of the bytes + */ +static inline unsigned int +__str_hash_func(const char *s) +{ + unsigned int h = (unsigned int)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (unsigned int)*s; + return h; +} + +/** + * Create a signature for the operation we're about to perform. + * + * Create a 32bit signature for this a combination of session configuration + * some number of cursors open on tables each potentially with a different + * configuration. 
"session_config, [{table_name, cursor_config}, ...]" + * + * session_config the string used to configure the WT_SESSION + * ... each pair of items in the varargs array is a table name, + * cursor config pair + */ +static u_int32_t +__ctx_cache_sig(const char *session_config, ...) +{ + va_list ap; + int i; + u_int64_t h; + + if (NULL == session_config) + return 0; + + h = __str_hash_fn(session_config); + + va_start (ap, count); + for (i = 0; i < count; i++) { + h = __morton(h, __str_hash_fn(va_arg(ap, const char *))); + h <<= 1; + } + va_end (ap); + return (u_int32_t)(h & 0xFFFFFFFF); +} + /** * Callback to handle error messages. * @@ -269,6 +467,24 @@ __init_session_and_cursor_cache(WterlConnHandle *conn_handle, WterlCtx *ctx) return 0; } +/** + * Get the per-worker reusable WT_SESSION for a worker_id. + */ +static int +__session_for(WterlConnHandle *conn_handle, unsigned int worker_id, WT_SESSION **session) +{ + WterlCtx *ctx = &conn_handle->contexts[worker_id]; + int rc = 0; + + if (ctx->session == NULL) { + enif_mutex_lock(conn_handle->contexts_mutex); + rc = __init_session_and_cursor_cache(conn_handle, ctx); + enif_mutex_unlock(conn_handle->contexts_mutex); + } + *session = ctx->session; + return rc; +} + /** * Close all sessions and all cursors open on any objects. * @@ -329,23 +545,6 @@ __close_cursors_on(WterlConnHandle *conn_handle, const char *uri) } } -/** - * A string hash function. - * - * A basic hash function for strings of characters used during the - * affinity association. - * - * s a NULL terminated set of bytes to be hashed - * -> an integer hash encoding of the bytes - */ -static inline unsigned int -__str_hash_func(const char *s) -{ - unsigned int h = (unsigned int)*s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + (unsigned int)*s; - return h; -} - /** * Get a reusable cursor that was opened for a particular worker within its * session. From 15fbc71ea7d8ec7c4664cc8f0a267ed2b16a0415 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Thu, 30 May 2013 14:21:34 -0400 Subject: [PATCH 03/30] WIP: pieces in place, need to work out the kinks now. 
--- c_src/async_nif.h | 2 +- c_src/duration.h | 2 +- c_src/fifo_q.h | 2 + c_src/kbtree.h | 381 ++++++++++++++++++++++++ c_src/khash.h | 2 +- c_src/wterl.c | 730 +++++++++++++++++++--------------------------- 6 files changed, 680 insertions(+), 439 deletions(-) create mode 100644 c_src/kbtree.h diff --git a/c_src/async_nif.h b/c_src/async_nif.h index 724b8d5..9483433 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -92,7 +92,7 @@ struct async_nif_state { struct decl ## _args *args = &on_stack_args; \ struct decl ## _args *copy_of_args; \ struct async_nif_req_entry *req = NULL; \ - const unsigned int affinity = 0; \ + unsigned int affinity = 0; \ ErlNifEnv *new_env = NULL; \ /* argv[0] is a ref used for selective recv */ \ const ERL_NIF_TERM *argv = argv_in + 1; \ diff --git a/c_src/duration.h b/c_src/duration.h index fc31101..083ad6b 100644 --- a/c_src/duration.h +++ b/c_src/duration.h @@ -33,7 +33,7 @@ static uint64_t ts(time_scale unit) ((uint64_t)ts.tv_nsec / scale[unit].div)); } -if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) /** * cpu_clock_ticks() diff --git a/c_src/fifo_q.h b/c_src/fifo_q.h index f37bf67..bbc4ff0 100644 --- a/c_src/fifo_q.h +++ b/c_src/fifo_q.h @@ -26,6 +26,8 @@ extern "C" { #endif +#define fifo_t(name) \ + struct fifo_q__ ## name * #define FIFO_QUEUE_TYPE(name) \ struct fifo_q__ ## name * #define DECL_FIFO_QUEUE(name, type) \ diff --git a/c_src/kbtree.h b/c_src/kbtree.h new file mode 100644 index 0000000..f628d66 --- /dev/null +++ b/c_src/kbtree.h @@ -0,0 +1,381 @@ +/*- + * Copyright 1997-1999, 2001, John-Mark Gurney. + * 2008, Attractive Chaos + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* Reference: http://attractivechaos.awardspace.com/kbtree.h + http://attractivechaos.awardspace.com/kbtree.h.html */ + +#ifndef __AC_KBTREE_H +#define __AC_KBTREE_H + +#include +#include +#include + +typedef struct { + int32_t is_internal:1, n:31; +} kbnode_t; + +#define __KB_KEY(type, x) ((type*)((char*)x + 4)) +#define __KB_PTR(btr, x) ((kbnode_t**)((char*)x + btr->off_ptr)) + +#define __KB_TREE_T(name) \ + typedef struct { \ + kbnode_t *root; \ + int off_key, off_ptr, ilen, elen; \ + int n, t; \ + int n_keys, n_nodes; \ + } kbtree_##name##_t; + +#define __KB_INIT(name, key_t) \ + kbtree_##name##_t *kb_init_##name(int size) \ + { \ + kbtree_##name##_t *b; \ + b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \ + b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) / 2; \ + if (b->t < 2) { \ + free(b); return 0; \ + } \ + b->n = 2 * b->t - 1; \ + b->off_ptr = 4 + b->n * sizeof(key_t); \ + b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) / 4 * 4; \ + b->elen = (b->off_ptr + 3) / 4 * 4; \ + b->root = (kbnode_t*)calloc(1, b->ilen); \ + ++b->n_nodes; \ + return b; \ + } + +#define __kb_destroy(b) do { \ + int i, max = 8; \ + kbnode_t *x, **top, **stack; \ + if (b) { \ + top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \ + *top++ = (b)->root; \ + while (top != stack) { \ + x = *--top; \ + if (x->is_internal == 0) { free(x); continue; } \ + for (i = 0; i <= x->n; ++i) \ + if (__KB_PTR(b, x)[i]) { \ + if (top - stack == max) { \ + max <<= 1; \ + stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \ + top = stack + (max>>1); \ + } \ + *top++ = __KB_PTR(b, x)[i]; \ + } \ + free(x); \ + } \ + } \ + free(b); free(stack); \ + } while (0) + +#define __KB_GET_AUX0(name, key_t, __cmp) \ + static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ + { \ + int tr, *rr, begin, end, n = x->n / 2; \ + if (x->n == 0) return -1; \ + if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \ + begin = 0; end = n; \ + } else { begin = n; end = x->n - 1; } \ + rr = r? r : &tr; \ + n = end; \ + while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \ + return n; \ + } + +#define __KB_GET_AUX1(name, key_t, __cmp) \ + static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ + { \ + int tr, *rr, begin = 0, end = x->n; \ + if (x->n == 0) return -1; \ + rr = r? 
r : &tr; \ + while (begin < end) { \ + int mid = (begin + end) / 2; \ + if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \ + else end = mid; \ + } \ + if (begin == x->n) { *rr = 1; return x->n - 1; } \ + if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin; \ + return begin; \ + } + +#define __KB_GET(name, key_t) \ + static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + int i, r = 0; \ + kbnode_t *x = b->root; \ + while (x) { \ + i = __kb_getp_aux_##name(x, k, &r); \ + if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i]; \ + if (x->is_internal == 0) return 0; \ + x = __KB_PTR(b, x)[i + 1]; \ + } \ + return 0; \ + } \ + static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + return kb_getp_##name(b, &k); \ + } + +#define __KB_INTERVAL(name, key_t) \ + static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \ + { \ + int i, r = 0; \ + kbnode_t *x = b->root; \ + *lower = *upper = 0; \ + while (x) { \ + i = __kb_getp_aux_##name(x, k, &r); \ + if (i >= 0 && r == 0) { \ + *lower = *upper = &__KB_KEY(key_t, x)[i]; \ + return; \ + } \ + if (i >= 0) *lower = &__KB_KEY(key_t, x)[i]; \ + if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1]; \ + if (x->is_internal == 0) return; \ + x = __KB_PTR(b, x)[i + 1]; \ + } \ + } \ + static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \ + { \ + kb_intervalp_##name(b, &k, lower, upper); \ + } + +#define __KB_PUT(name, key_t, __cmp) \ + /* x must be an internal node */ \ + static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \ + { \ + kbnode_t *z; \ + z = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen); \ + ++b->n_nodes; \ + z->is_internal = y->is_internal; \ + z->n = b->t - 1; \ + memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \ + if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \ + y->n = b->t - 1; \ + memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \ + __KB_PTR(b, x)[i + 1] = z; \ + memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \ + __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \ + ++x->n; \ + } \ + static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \ + { \ + int i = x->n - 1; \ + if (x->is_internal == 0) { \ + i = __kb_getp_aux_##name(x, k, 0); \ + if (i != x->n - 1) \ + memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + __KB_KEY(key_t, x)[i + 1] = (key_t)*k; \ + ++x->n; \ + } else { \ + i = __kb_getp_aux_##name(x, k, 0) + 1; \ + if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \ + __kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \ + if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \ + } \ + __kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \ + } \ + } \ + static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + kbnode_t *r, *s; \ + ++b->n_keys; \ + r = b->root; \ + if (r->n == 2 * b->t - 1) { \ + ++b->n_nodes; \ + s = (kbnode_t*)calloc(1, b->ilen); \ + b->root = s; s->is_internal = 1; s->n = 0; \ + __KB_PTR(b, s)[0] = r; \ + __kb_split_##name(b, s, 0, r); \ + r = s; \ + } \ + __kb_putp_aux_##name(b, r, k); \ + } \ + static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + kb_putp_##name(b, &k); \ + } + + +#define __KB_DEL(name, key_t) \ 
+ static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \ + { \ + int yn, zn, i, r = 0; \ + kbnode_t *xp, *y, *z; \ + key_t kp; \ + if (x == 0) return (key_t)*k; \ + if (s) { /* s can only be 0, 1 or 2 */ \ + r = x->is_internal == 0? 0 : s == 1? 1 : -1; \ + i = s == 1? x->n - 1 : -1; \ + } else i = __kb_getp_aux_##name(x, k, &r); \ + if (x->is_internal == 0) { \ + if (s == 2) ++i; \ + kp = __KB_KEY(key_t, x)[i]; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + --x->n; \ + return kp; \ + } \ + if (r == 0) { \ + if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \ + xp = __KB_PTR(b, x)[i]; \ + kp = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \ + return kp; \ + } else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \ + xp = __KB_PTR(b, x)[i + 1]; \ + kp = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \ + return kp; \ + } else if (yn == b->t - 1 && zn == b->t - 1) { \ + y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \ + __KB_KEY(key_t, y)[y->n++] = (key_t)*k; \ + memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \ + y->n += z->n; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ + --x->n; \ + free(z); \ + return __kb_delp_aux_##name(b, y, k, s); \ + } \ + } \ + ++i; \ + if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \ + if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \ + memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ + if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ + __KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \ + __KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \ + if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \ + --y->n; ++xp->n; \ + } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \ + __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \ + if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \ + --y->n; \ + memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \ + } else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \ + __KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \ + memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ + y->n += xp->n; \ + memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \ + --x->n; \ + free(xp); \ + xp = y; \ + } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \ + __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \ + memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \ + if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \ + xp->n += y->n; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + 
memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ + --x->n; \ + free(y); \ + } \ + } \ + return __kb_delp_aux_##name(b, xp, k, s); \ + } \ + static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + kbnode_t *x; \ + key_t ret; \ + ret = __kb_delp_aux_##name(b, b->root, k, 0); \ + --b->n_keys; \ + if (b->root->n == 0 && b->root->is_internal) { \ + --b->n_nodes; \ + x = b->root; \ + b->root = __KB_PTR(b, x)[0]; \ + free(x); \ + } \ + return ret; \ + } \ + static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + return kb_delp_##name(b, &k); \ + } + +typedef struct { + kbnode_t *x; + int i; +} __kbstack_t; + +#define __kb_traverse(key_t, b, __func) do { \ + int __kmax = 8; \ + __kbstack_t *__kstack, *__kp; \ + __kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \ + __kp->x = (b)->root; __kp->i = 0; \ + for (;;) { \ + while (__kp->x && __kp->i <= __kp->x->n) { \ + if (__kp - __kstack == __kmax - 1) { \ + __kmax <<= 1; \ + __kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \ + __kp = __kstack + (__kmax>>1) - 1; \ + } \ + (__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \ + ++__kp; \ + } \ + --__kp; \ + if (__kp >= __kstack) { \ + if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \ + ++__kp->i; \ + } else break; \ + } \ + free(__kstack); \ + } while (0) + +#define KBTREE_INIT(name, key_t, __cmp) \ + __KB_TREE_T(name) \ + __KB_INIT(name, key_t) \ + __KB_GET_AUX1(name, key_t, __cmp) \ + __KB_GET(name, key_t) \ + __KB_INTERVAL(name, key_t) \ + __KB_PUT(name, key_t, __cmp) \ + __KB_DEL(name, key_t) + +#define KB_DEFAULT_SIZE 512 + +#define kbtree_t(name) kbtree_##name##_t +#define kb_init(name, s) kb_init_##name(s) +#define kb_destroy(name, b) __kb_destroy(b) +#define kb_get(name, b, k) kb_get_##name(b, k) +#define kb_put(name, b, k) kb_put_##name(b, k) +#define kb_del(name, b, k) kb_del_##name(b, k) +#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u) +#define kb_getp(name, b, k) kb_getp_##name(b, k) +#define kb_putp(name, b, k) kb_putp_##name(b, k) +#define kb_delp(name, b, k) kb_delp_##name(b, k) +#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u) + +#define kb_size(b) ((b)->n_keys) + +#define kb_generic_cmp(a, b) (((a) > (b)) - ((a) < (b))) +#define kb_str_cmp(a, b) strcmp(a, b) + +#endif diff --git a/c_src/khash.h b/c_src/khash.h index 69549dc..ab157b1 100644 --- a/c_src/khash.h +++ b/c_src/khash.h @@ -586,7 +586,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#ifdef __x86_64__ +#ifdef __x86_64__ #define KHASH_MAP_INIT_PTR(name, khval_t) \ KHASH_INIT(name, void*, khval_t, 1, kh_ptr64_hash_func, kh_ptr64_hash_equal) #else diff --git a/c_src/wterl.c b/c_src/wterl.c index 3455143..53ee9b6 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -27,57 +27,68 @@ #include "common.h" #include "wiredtiger.h" -#include "async_nif.h" -#include "khash.h" #include "stats.h" - -#define CTX_CACHE_SIZE ASYNC_NIF_MAX_WORKERS +#include "async_nif.h" +#include "kbtree.h" +#include "queue.h" +#include "fifo_q.h" static ErlNifResourceType *wterl_conn_RESOURCE; static ErlNifResourceType *wterl_ctx_RESOURCE; static ErlNifResourceType *wterl_cursor_RESOURCE; -typedef struct struct WterlCtxHandle { +/* WiredTiger object names*/ +typedef char Uri[128]; + +typedef struct wterl_ctx { WT_SESSION 
*session; // open session - WT_CURSOR *cursors[]; // open cursors, all reset ready to reuse + WT_CURSOR **cursors; // open cursors, all reset ready to reuse + uint64_t sig; } WterlCtxHandle; -struct ctx_lru_entry { +struct cache_entry { WterlCtxHandle *ctx; - u_int64_t sig; - SLIST_HEAD(ctx, struct WterlCtxHandle*) set; - STAILQ_ENTRY(struct ctx_lru_entry) entries; + uint64_t sig; + uint64_t tstamp; }; -KHASH_SET_INIT_INT64(ctx_idx, struct ctx_group*); +#define __ctx_sig_cmp(a, b) ((((a)->sig) > ((b)->sig)) - (((a)->sig) < ((b)->sig))) +KBTREE_INIT(cache_entries, struct cache_entry*, __ctx_sig_cmp); +DECL_FIFO_QUEUE(cache_entries, struct cache_entry); -struct ctx_cache { - size_t size; - struct lru { - STAILQ_HEAD(lru, struct ctx_lru_entry*) lru; - } lru; - struct idx { - int h; - ctx_group cgs[CTX_CACHE_SIZE]; - khash_t(ctx_idx) *ctx; - } idx; -}; - -typedef struct { +typedef struct wterl_conn { WT_CONNECTION *conn; const char *session_config; - ErlNifMutex *contexts_mutex; - unsigned int num_contexts; - WterlCtx **contexts; // TODO: free this + ErlNifMutex *cache_mutex; + kbtree_t(cache_entries) *cache; + fifo_t(cache_entries) recycled_cache_entries; + SLIST_ENTRY(wterl_conn) conns; } WterlConnHandle; typedef struct { - WT_CURSOR *cursor; WT_SESSION *session; + WT_CURSOR *cursor; } WterlCursorHandle; -/* WiredTiger object names*/ -typedef char Uri[128]; +struct wterl_event_handlers { + WT_EVENT_HANDLER handlers; + ErlNifEnv *msg_env_error; + ErlNifMutex *error_mutex; + ErlNifEnv *msg_env_message; + ErlNifMutex *message_mutex; + ErlNifEnv *msg_env_progress; + ErlNifMutex *progress_mutex; + ErlNifPid to_pid; +}; + +struct wterl_priv_data { + void *async_nif_priv; // Note: must be first element in struct + ErlNifMutex *conns_mutex; + SLIST_HEAD(conns, wterl_conn) conns; + struct wterl_event_handlers eh; + char wterl_vsn[512]; + char wiredtiger_vsn[512]; +}; /* Atoms (initialized in on_load) */ static ERL_NIF_TERM ATOM_ERROR; @@ -91,57 +102,34 @@ static ERL_NIF_TERM ATOM_WTERL_VSN; static ERL_NIF_TERM ATOM_WIREDTIGER_VSN; static ERL_NIF_TERM ATOM_MSG_PID; -struct wterl_event_handlers { - WT_EVENT_HANDLER handlers; - ErlNifEnv *msg_env_error; - ErlNifMutex *error_mutex; - ErlNifEnv *msg_env_message; - ErlNifMutex *message_mutex; - ErlNifEnv *msg_env_progress; - ErlNifMutex *progress_mutex; - ErlNifPid to_pid; -}; - -/* Generators for 'conns' a named, type-specific hash table functions. */ -KHASH_MAP_INIT_PTR(conns, WterlConnHandle*); - -struct wterl_priv_data { - void *async_nif_priv; // Note: must be first element in struct - ErlNifMutex *conns_mutex; - khash_t(conns) *conns; - struct wterl_event_handlers eh; - char wterl_vsn[512]; - char wiredtiger_vsn[512]; -}; - /* Global init for async_nif. */ ASYNC_NIF_INIT(wterl); - /** - * Is the cache full? + * Is the context cache full? * - * Test to see if the cache is full or not. - * - * -> 0=false/not full yet, 1=true/cache is full + * -> 0 = no/false, anything else is true */ static int -__ctx_cache_full(struct wterl_ctx_cache *cache) +__ctx_cache_full(WterlConnHandle *conn) { - return cache->size == CTX_CACHE_SIZE ? 1 : 0; + return fifo_q_full(cache_entries, conn->recycled_cache_entries); } /** * Evict items from the cache. * - * Evict some number of items from the cache to make space for other items. + * Evict old contexts from the cache to make space for new, more frequently + * used contexts. 
* * -> number of items evicted */ static int -__ctx_cache_evict(struct ctx_cache *cache) +__ctx_cache_evict(WterlConnHandle *conn) { // TODO: + UNUSED(conn); + return 0; } /** @@ -150,51 +138,25 @@ __ctx_cache_evict(struct ctx_cache *cache) * See if there exists an item in the cache with a matching signature, if * so remove it from the cache and return it for use by the callee. * - * sig a 32-bit signature (hash) representing the session/cursor* needed + * sig a 64-bit signature (hash) representing the session/cursor* needed * for the operation */ static WterlCtxHandle * -__ctx_cache_find(wterl_ctx_cache *cache, u_int64_t sig) +__ctx_cache_find(WterlConnHandle *conn, const uint64_t sig) { - khiter_t k; + WterlCtxHandle *p = NULL; + struct cache_entry key, *e; - kh_get(ctx_idx, cache->idx.ctx, sig); - if (k != kh_end(h)) { - /* - * This signature exists in the hashtable, that's good news. Maybe - * there is a context open and ready for us to reuse, let's check. - */ - struct ctx_group *cg = kh_value(ctx_idx, k); - if (SLIST_EMPTY(cg->set)) { // cache miss - /* - * Nope, there are no contexts available for reuse with this - * signature. - */ - return NULL; - } else { // cache hit - /* - * Yes, we've found a context available for reuse with the - * desired signature. Remove it from the cache and return it - * to the caller. - */ - WterlCtxHandle *p = SLIST_REMOVE(cg->set); // remove from index - WterlCtxHandle *q; - STAILQ_FOREACH(q, &cache->lru, entries) { - if (p == q) { - STAILQ_REMOVE(&cache-lru, q, ctx_lru_entries, entries); - } - } - // remove from lru - cache->size--; // update cache size - return p; - } - } else { - /* - * The signature didn't match any that we're caching contexts for right - * now, so clearly there won't be any cached contexts for this either. - */ - return NULL; - } + key.sig = sig; + e = *kb_get(cache_entries, conn->cache, &key); + if (e) { + // cache hit, remove it from the tree + kb_del(cache_entries, conn->cache, &key); + p = e->ctx; + memset(e, 0, sizeof(struct cache_entry)); + fifo_q_put(cache_entries, conn->recycled_cache_entries, e); + } // else { cache miss, so p == NULL when we return } + return p; } /** @@ -204,43 +166,34 @@ __ctx_cache_find(wterl_ctx_cache *cache, u_int64_t sig) * the front of the LRU. */ static int -__ctx_cache_add(wterl_ctx_cache *cache, WterlCtxHandle *e) +__ctx_cache_add(WterlConnHandle *conn, WterlCtxHandle *c) { - khiter_t k; + struct cache_entry *e; - kh_get(ctx_idx, cache->idx.ctx, sig); - if (k != kh_end(h)) { - /* - * This signature exists in the bitmap, that's good news. We can just - * put this into the list of cached items for that signature. - */ - struct ctx_group *cg = kh_value(ctx_idx, k); - SLIST_INSERT_HEAD(cg->set, e, entries); // add to index - STAILQ_INSERT_HEAD(&cache->lru, e, entries); // add to lru - cache->size++; // update cache size - } else { - /* - * The signature didn't match any that we're caching contexts for right - * now, so we need to add a context group for it. - */ - if (cache->idx - return NULL; - } + if (__ctx_cache_full(conn)) + __ctx_cache_evict(conn); + + e = fifo_q_get(cache_entries, conn->recycled_cache_entries); + e->ctx = c; + e->sig = c->sig; + e->tstamp = cpu_clock_ticks(); + kb_put(cache_entries, conn->cache, e); + return 0; } /** - * Produce the Morton Number from two 32-bit unsigned integers. + * Produce the "Z-Index" or "Morton Number" from 2 32-bit unsigned integers. * e.g. 
p = 0101 1011 0100 0011 * q = 1011 1100 0001 0011 * z = 0110 0111 1101 1010 0010 0001 0000 1111 */ -static inline u_int64_t -__interleave(u_int32_t p, u_int32_t q) +static inline uint64_t +__zi(uint32_t p, uint32_t q) { - static const u_int32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; - static const u_int32_t S[] = {1, 2, 4, 8}; - u_int32_t x, y; - u_int64_t z; + static const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + static const uint32_t S[] = {1, 2, 4, 8}; + uint32_t x, y; + uint64_t z; x = p & 0x0000FFFF; // Interleave lower 16 bits of p as x and q as y, so the y = q & 0x0000FFFF; // bits of x are in the even positions and bits from y @@ -285,8 +238,8 @@ __interleave(u_int32_t p, u_int32_t q) * s a NULL terminated set of bytes to be hashed * -> an integer hash encoding of the bytes */ -static inline unsigned int -__str_hash_func(const char *s) +static inline uint32_t +__str_hash(const char *s) { unsigned int h = (unsigned int)*s; if (h) for (++s ; *s; ++s) h = (h << 5) - h + (unsigned int)*s; @@ -303,26 +256,141 @@ __str_hash_func(const char *s) * session_config the string used to configure the WT_SESSION * ... each pair of items in the varargs array is a table name, * cursor config pair + * -> number of variable arguments processed */ -static u_int32_t -__ctx_cache_sig(const char *session_config, ...) +static int +__ctx_cache_sig_(const char *c, va_list ap, uint64_t *h) { - va_list ap; - int i; - u_int64_t h; + int i = 0; - if (NULL == session_config) + if (NULL == c) return 0; - h = __str_hash_fn(session_config); + *h = __str_hash(c); - va_start (ap, count); - for (i = 0; i < count; i++) { - h = __morton(h, __str_hash_fn(va_arg(ap, const char *))); - h <<= 1; + while (*c) { + *h = __zi((uint32_t)(*h & 0xFFFFFFFF), __str_hash(va_arg(ap, const char *))); + *h <<= 1; + i++; } + return i; +} + +#if 0 +static uint64_t +__ctx_cache_sig(const char *c, ...) +{ + int i; + va_list ap; + uint64_t h; + + if (NULL == c) + return 0; + + va_start(ap, c); + i = __ctx_cache_sig_(c, ap, &h); va_end (ap); - return (u_int32_t)(h & 0xFFFFFFFF); + + return i; +} +#endif + +/** + * Get a reusable cursor that was opened for a particular worker within its + * session. + */ +static int +__retain_ctx(WterlConnHandle *conn_handle, WterlCtxHandle **ctx, + const char *session_config, ...) 
+{ + int i, count; + va_list ap; + uint64_t sig; + const char *c; + + c = session_config; + va_start(ap, session_config); + count = __ctx_cache_sig_(session_config, ap, &sig); + va_end (ap); + + enif_mutex_lock(conn_handle->cache_mutex); + *ctx = __ctx_cache_find(conn_handle, sig); + if (NULL == *ctx) { + // cache miss + WT_CONNECTION *conn = conn_handle->conn; + WT_SESSION *session = NULL; + int rc = conn->open_session(conn, NULL, session_config, &session); + if (rc != 0) + return rc; + size_t s = sizeof(WterlCtxHandle) + ((count / 2) * sizeof(WT_CURSOR*)); + *ctx = enif_alloc_resource(wterl_ctx_RESOURCE, s); + if (NULL == *ctx) { + session->close(session, NULL); + return ENOMEM; + } + memset(*ctx, 0, s); + (*ctx)->sig = sig; + (*ctx)->session = session; + WT_CURSOR **cursors = (*ctx)->cursors; + session_config = c; + va_start(ap, session_config); + for (i = 0; i < (count / 2); i++) { + const char *uri = va_arg(ap, const char *); + const char *config = va_arg(ap, const char *); + rc = session->open_cursor(session, uri, NULL, config, &cursors[i]); + if (rc != 0) { + session->close(session, NULL); // this will free cursors too + return rc; + } + } + va_end (ap); + } // else { cache hit so 'ctx' is a reusable session/cursor } + enif_mutex_unlock(conn_handle->cache_mutex); + return 0; +} + +static void +__release_ctx(WterlConnHandle *conn_handle, WterlCtxHandle *ctx) +{ + int i, c; + WT_CURSOR *cursor; + + c = sizeof(ctx->cursors) / sizeof(ctx->cursors[0]); + for (i = 0; i < c; i++) { + cursor = ctx->cursors[i]; + cursor->reset(cursor); + } + enif_mutex_lock(conn_handle->cache_mutex); + __ctx_cache_add(conn_handle, ctx); + enif_mutex_unlock(conn_handle->cache_mutex); +} + +/** + * Close all sessions and all cursors open on any objects. + * + * Note: always call within enif_mutex_lock/unlock(conn_handle->cache_mutex) + */ +void +__close_all_sessions(WterlConnHandle *conn_handle) +{ + kbtree_t(cache_entries) *t = conn_handle->cache; + +#define traverse_f(p) { kb_del(cache_entries, t, *p); } + __kb_traverse(struct cache_entry *, t, traverse_f); +#undef traverse_f +} + +/** + * Close cursors open on 'uri' object. + * + * Note: always call within enif_mutex_lock/unlock(conn_handle->cache_mutex) + */ +void +__close_cursors_on(WterlConnHandle *conn_handle, const char *uri) +{ + UNUSED(uri); + // TODO: find a way to only close those session/cursor* open on uri + __close_all_sessions(conn_handle); } /** @@ -440,172 +508,6 @@ __wterl_progress_handler(WT_EVENT_HANDLER *handler, const char *operation, uint6 return rc; } -/** - * Open a WT_SESSION for the thread context 'ctx' to use, also init the - * shared cursor hash table. - * - * Note: always call within enif_mutex_lock/unlock(conn_handle->contexts_mutex) - */ -static int -__init_session_and_cursor_cache(WterlConnHandle *conn_handle, WterlCtx *ctx) -{ - /* Create a context for this worker thread to reuse. */ - WT_CONNECTION *conn = conn_handle->conn; - int rc = conn->open_session(conn, NULL, conn_handle->session_config, &ctx->session); - if (rc != 0) { - ctx->session = NULL; - return rc; - } - - ctx->cursors = kh_init(cursors); - if (!ctx->cursors) { - ctx->session->close(ctx->session, NULL); - ctx->session = NULL; - return ENOMEM; - } - - return 0; -} - -/** - * Get the per-worker reusable WT_SESSION for a worker_id. 
- */ -static int -__session_for(WterlConnHandle *conn_handle, unsigned int worker_id, WT_SESSION **session) -{ - WterlCtx *ctx = &conn_handle->contexts[worker_id]; - int rc = 0; - - if (ctx->session == NULL) { - enif_mutex_lock(conn_handle->contexts_mutex); - rc = __init_session_and_cursor_cache(conn_handle, ctx); - enif_mutex_unlock(conn_handle->contexts_mutex); - } - *session = ctx->session; - return rc; -} - -/** - * Close all sessions and all cursors open on any objects. - * - * Note: always call within enif_mutex_lock/unlock(conn_handle->contexts_mutex) - */ -void -__close_all_sessions(WterlConnHandle *conn_handle) -{ - int i; - - for (i = 0; i < ASYNC_NIF_MAX_WORKERS; i++) { - WterlCtx *ctx = &conn_handle->contexts[i]; - if (ctx->session != NULL) { - WT_SESSION *session = ctx->session; - khash_t(cursors) *h = ctx->cursors; - khiter_t itr; - for (itr = kh_begin(h); itr != kh_end(h); ++itr) { - if (kh_exist(h, itr)) { - WT_CURSOR *cursor = kh_val(h, itr); - char *key = (char *)kh_key(h, itr); - cursor->close(cursor); - kh_del(cursors, h, itr); - enif_free(key); - kh_value(h, itr) = NULL; - } - } - kh_destroy(cursors, h); - session->close(session, NULL); - ctx->session = NULL; - } - } -} - -/** - * Close cursors open on 'uri' object. - * - * Note: always call within enif_mutex_lock/unlock(conn_handle->contexts_mutex) - */ -void -__close_cursors_on(WterlConnHandle *conn_handle, const char *uri) -{ - int i; - - for (i = 0; i < ASYNC_NIF_MAX_WORKERS; i++) { - WterlCtx *ctx = &conn_handle->contexts[i]; - if (ctx->session != NULL) { - khash_t(cursors) *h = ctx->cursors; - khiter_t itr = kh_get(cursors, h, (char *)uri); - if (itr != kh_end(h)) { - WT_CURSOR *cursor = kh_value(h, itr); - char *key = (char *)kh_key(h, itr); - cursor->close(cursor); - kh_del(cursors, h, itr); - enif_free(key); - kh_value(h, itr) = NULL; - } - } - } -} - -/** - * Get a reusable cursor that was opened for a particular worker within its - * session. - */ -static int -__retain_ctx(WterlConnHandle *conn_handle, const char *uri, WterlCtx **ctx) -{ - /* Check to see if we have a cursor open for this uri and if so reuse it. 
*/ - WterlCtx *ctx = &conn_handle->contexts[worker_id]; - khash_t(cursors) *h = NULL; - khiter_t itr; - int rc; - unsigned int h = __str_hash_func(uri); // TODO: add config at some point - - if (ctx->session == NULL) { - enif_mutex_lock(conn_handle->contexts_mutex); - rc = __init_session_and_cursor_cache(conn_handle, ctx); - enif_mutex_unlock(conn_handle->contexts_mutex); - if (rc != 0) - return rc; - } - - h = ctx->cursors; - itr = kh_get(cursors, h, (char *)uri); - if (itr != kh_end(h)) { - // key exists in hash table, retrieve it - *cursor = (WT_CURSOR*)kh_value(h, itr); - } else { - // key does not exist in hash table, create and insert one - enif_mutex_lock(conn_handle->contexts_mutex); - WT_SESSION *session = conn_handle->contexts[worker_id].session; - rc = session->open_cursor(session, uri, NULL, "overwrite,raw", cursor); - if (rc != 0) { - enif_mutex_unlock(conn_handle->contexts_mutex); - return rc; - } - - char *key = enif_alloc(sizeof(Uri)); - if (!key) { - session->close(session, NULL); - enif_mutex_unlock(conn_handle->contexts_mutex); - return ENOMEM; - } - memcpy(key, uri, 128); - int itr_status; - itr = kh_put(cursors, h, key, &itr_status); - kh_value(h, itr) = *cursor; - enif_mutex_unlock(conn_handle->contexts_mutex); - } - return 0; -} - -static void -__release_ctx(WterlConnHandle *conn_handle, const char *uri, WterlCtx *ctx) -{ - UNUSED(conn_handle); - UNUSED(worker_id); - UNUSED(uri); - cursor->reset(cursor); -} - /** * Convenience function to generate {error, {errno, Reason}} or 'not_found' * Erlang terms to return to callers. @@ -625,9 +527,9 @@ __strerror_term(ErlNifEnv* env, int rc) atom rather than the message when matching in Erlang. You've been warned. */ return enif_make_tuple2(env, ATOM_ERROR, - enif_make_tuple2(env, - enif_make_atom(env, erl_errno_id(rc)), - enif_make_string(env, wiredtiger_strerror(rc), ERL_NIF_LATIN1))); + enif_make_tuple2(env, + enif_make_atom(env, erl_errno_id(rc)), + enif_make_string(env, wiredtiger_strerror(rc), ERL_NIF_LATIN1))); } } @@ -693,30 +595,27 @@ ASYNC_NIF_DECL( return; } memcpy(sc, session_config.data, session_config.size); - conn_handle->session_config = (const char *)sc; } else { conn_handle->session_config = NULL; } - conn_handle->contexts_mutex = enif_mutex_create(NULL); - enif_mutex_lock(conn_handle->contexts_mutex); + conn_handle->cache_mutex = enif_mutex_create(NULL); + enif_mutex_lock(conn_handle->cache_mutex); conn_handle->conn = conn; - memset(conn_handle->contexts, 0, sizeof(WterlCtx) * ASYNC_NIF_MAX_WORKERS); ERL_NIF_TERM result = enif_make_resource(env, conn_handle); + /* Init tree which manages the cache of session/cursor(s) */ + conn_handle->cache = kb_init(cache_entries, ASYNC_NIF_MAX_WORKERS); // TODO: size + conn_handle->recycled_cache_entries = fifo_q_new(cache_entries, ASYNC_NIF_MAX_WORKERS); + /* Keep track of open connections so as to free when unload/reload/etc. are called. 
*/ - khash_t(conns) *h; enif_mutex_lock(args->priv->conns_mutex); - h = args->priv->conns; - int itr_status = 0; - khiter_t itr = kh_put(conns, h, conn, &itr_status); - if (itr_status != 0) // 0 indicates the key exists already - kh_value(h, itr) = conn_handle; + SLIST_INSERT_HEAD(&args->priv->conns, conn_handle, conns); enif_mutex_unlock(args->priv->conns_mutex); enif_release_resource(conn_handle); - enif_mutex_unlock(conn_handle->contexts_mutex); + enif_mutex_unlock(conn_handle->cache_mutex); ASYNC_NIF_REPLY(enif_make_tuple2(env, ATOM_OK, result)); } else @@ -753,7 +652,7 @@ ASYNC_NIF_DECL( { // work /* Free up the shared sessions and cursors. */ - enif_mutex_lock(args->conn_handle->contexts_mutex); + enif_mutex_lock(args->conn_handle->cache_mutex); __close_all_sessions(args->conn_handle); if (args->conn_handle->session_config) { enif_free((char *)args->conn_handle->session_config); @@ -763,19 +662,11 @@ ASYNC_NIF_DECL( int rc = conn->close(conn, NULL); /* Connection is closed, remove it so we don't free on unload/reload/etc. */ - khash_t(conns) *h; enif_mutex_lock(args->priv->conns_mutex); - h = args->priv->conns; - khiter_t itr; - itr = kh_get(conns, h, conn); - if (itr != kh_end(h)) { - /* key exists in table (as expected) delete it */ - kh_del(conns, h, itr); - kh_value(h, itr) = NULL; - } + SLIST_REMOVE(&args->priv->conns, args->conn_handle, wterl_conn, conns); enif_mutex_unlock(args->priv->conns_mutex); - enif_mutex_unlock(args->conn_handle->contexts_mutex); - enif_mutex_destroy(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); + enif_mutex_destroy(args->conn_handle->cache_mutex); memset(args->conn_handle, 0, sizeof(WterlConnHandle)); ASYNC_NIF_REPLY(rc == 0 ? ATOM_OK : __strerror_term(env, rc)); @@ -871,12 +762,12 @@ ASYNC_NIF_DECL( { // work /* This call requires that there be no open cursors referencing the object. */ - enif_mutex_lock(args->conn_handle->contexts_mutex); + enif_mutex_lock(args->conn_handle->cache_mutex); __close_cursors_on(args->conn_handle, args->uri); ErlNifBinary config; if (!enif_inspect_binary(env, args->config, &config)) { - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(enif_make_badarg(env)); return; } @@ -888,7 +779,7 @@ ASYNC_NIF_DECL( WT_SESSION *session = NULL; int rc = conn->open_session(conn, NULL, args->conn_handle->session_config, &session); if (rc != 0) { - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } @@ -898,7 +789,7 @@ ASYNC_NIF_DECL( this will result in EBUSY(16) "Device or resource busy". */ rc = session->drop(session, args->uri, (const char*)config.data); (void)session->close(session, NULL); - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(rc == 0 ? ATOM_OK : __strerror_term(env, rc)); }, { // post @@ -938,12 +829,12 @@ ASYNC_NIF_DECL( { // work /* This call requires that there be no open cursors referencing the object. 
*/ - enif_mutex_lock(args->conn_handle->contexts_mutex); + enif_mutex_lock(args->conn_handle->cache_mutex); __close_cursors_on(args->conn_handle, args->oldname); ErlNifBinary config; if (!enif_inspect_binary(env, args->config, &config)) { - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(enif_make_badarg(env)); return; } @@ -955,7 +846,7 @@ ASYNC_NIF_DECL( WT_SESSION *session = NULL; int rc = conn->open_session(conn, NULL, args->conn_handle->session_config, &session); if (rc != 0) { - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } @@ -966,7 +857,7 @@ ASYNC_NIF_DECL( this will result in EBUSY(16) "Device or resource busy". */ rc = session->rename(session, args->oldname, args->newname, (const char*)config.data); (void)session->close(session, NULL); - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(rc == 0 ? ATOM_OK : __strerror_term(env, rc)); }, { // post @@ -1006,12 +897,12 @@ ASYNC_NIF_DECL( { // work /* This call requires that there be no open cursors referencing the object. */ - enif_mutex_lock(args->conn_handle->contexts_mutex); + enif_mutex_lock(args->conn_handle->cache_mutex); __close_cursors_on(args->conn_handle, args->uri); ErlNifBinary config; if (!enif_inspect_binary(env, args->config, &config)) { - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(enif_make_badarg(env)); return; } @@ -1023,14 +914,14 @@ ASYNC_NIF_DECL( WT_SESSION *session = NULL; int rc = conn->open_session(conn, NULL, args->conn_handle->session_config, &session); if (rc != 0) { - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } rc = session->salvage(session, args->uri, (const char*)config.data); (void)session->close(session, NULL); - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(rc == 0 ? ATOM_OK : __strerror_term(env, rc)); }, { // post @@ -1134,17 +1025,17 @@ ASYNC_NIF_DECL( } args->config = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[4]); enif_keep_resource((void*)args->conn_handle); - affinity = args->uri; + affinity = __str_hash(args->uri); }, { // work /* This call requires that there be no open cursors referencing the object. */ - enif_mutex_lock(args->conn_handle->contexts_mutex); + enif_mutex_lock(args->conn_handle->cache_mutex); __close_cursors_on(args->conn_handle, args->uri); ErlNifBinary config; if (!enif_inspect_binary(env, args->config, &config)) { - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(enif_make_badarg(env)); return; } @@ -1157,7 +1048,7 @@ ASYNC_NIF_DECL( WT_SESSION *session = NULL; int rc = conn->open_session(conn, NULL, args->conn_handle->session_config, &session); if (rc != 0) { - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } @@ -1172,7 +1063,7 @@ ASYNC_NIF_DECL( mess. 
*/ if (!args->from_first) { if (!enif_inspect_binary(env, args->start, &start_key)) { - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(enif_make_badarg(env)); return; } @@ -1180,7 +1071,7 @@ ASYNC_NIF_DECL( rc = session->open_cursor(session, args->uri, NULL, "raw", &start); if (rc != 0) { session->close(session, NULL); - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } @@ -1190,7 +1081,7 @@ ASYNC_NIF_DECL( if (rc != 0) { start->close(start); session->close(session, NULL); - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } @@ -1205,7 +1096,7 @@ ASYNC_NIF_DECL( if (!enif_inspect_binary(env, args->stop, &stop_key)) { start->close(start); session->close(session, NULL); - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(enif_make_badarg(env)); return; } @@ -1214,7 +1105,7 @@ ASYNC_NIF_DECL( if (rc != 0) { start->close(start); session->close(session, NULL); - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } @@ -1225,7 +1116,7 @@ ASYNC_NIF_DECL( start->close(start); stop->close(stop); session->close(session, NULL); - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } @@ -1243,7 +1134,7 @@ ASYNC_NIF_DECL( start->close(start); stop->close(stop); session->close(session, NULL); - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(rc == 0 ? ATOM_OK : __strerror_term(env, rc)); }, { // post @@ -1280,12 +1171,12 @@ ASYNC_NIF_DECL( { // work /* This call requires that there be no open cursors referencing the object. */ - enif_mutex_lock(args->conn_handle->contexts_mutex); + enif_mutex_lock(args->conn_handle->cache_mutex); __close_cursors_on(args->conn_handle, args->uri); ErlNifBinary config; if (!enif_inspect_binary(env, args->config, &config)) { - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(enif_make_badarg(env)); return; } @@ -1297,14 +1188,14 @@ ASYNC_NIF_DECL( WT_SESSION *session = NULL; int rc = conn->open_session(conn, NULL, args->conn_handle->session_config, &session); if (rc != 0) { - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } rc = session->upgrade(session, args->uri, (const char*)config.data); (void)session->close(session, NULL); - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(rc == 0 ? ATOM_OK : __strerror_term(env, rc)); }, { // post @@ -1342,12 +1233,12 @@ ASYNC_NIF_DECL( { // work /* This call requires that there be no open cursors referencing the object. 
*/ - enif_mutex_lock(args->conn_handle->contexts_mutex); + enif_mutex_lock(args->conn_handle->cache_mutex); __close_all_sessions(args->conn_handle); ErlNifBinary config; if (!enif_inspect_binary(env, args->config, &config)) { - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(enif_make_badarg(env)); return; } @@ -1359,14 +1250,14 @@ ASYNC_NIF_DECL( WT_SESSION *session = NULL; int rc = conn->open_session(conn, NULL, args->conn_handle->session_config, &session); if (rc != 0) { - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } rc = session->verify(session, args->uri, (const char*)config.data); (void)session->close(session, NULL); - enif_mutex_unlock(args->conn_handle->contexts_mutex); + enif_mutex_unlock(args->conn_handle->cache_mutex); ASYNC_NIF_REPLY(rc == 0 ? ATOM_OK : __strerror_term(env, rc)); }, { // post @@ -1399,7 +1290,7 @@ ASYNC_NIF_DECL( } args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); enif_keep_resource((void*)args->conn_handle); - affinity = args->uri; + affinity = __str_hash(args->uri); }, { // work @@ -1409,14 +1300,16 @@ ASYNC_NIF_DECL( return; } - WterlCtx *ctx = NULL; + WterlCtxHandle *ctx = NULL; WT_CURSOR *cursor = NULL; - int rc = __retain_ctx(args->conn_handle, args->uri, &ctx); + int rc = __retain_ctx(args->conn_handle, &ctx, + args->conn_handle->session_config, + args->uri, "overwrite,raw"); if (rc != 0) { ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } - cursor = ctx->cursor; + cursor = ctx->cursors[0]; WT_ITEM item_key; item_key.data = key.data; @@ -1424,7 +1317,7 @@ ASYNC_NIF_DECL( cursor->set_key(cursor, &item_key); rc = cursor->remove(cursor); ASYNC_NIF_REPLY(rc == 0 ? 
ATOM_OK : __strerror_term(env, rc)); - __release_ctx(args->conn_handle, args->uri, cursor); + __release_ctx(args->conn_handle, ctx); }, { // post @@ -1456,7 +1349,7 @@ ASYNC_NIF_DECL( } args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); enif_keep_resource((void*)args->conn_handle); - affinity = args->uri; + affinity = __str_hash(args->uri); }, { // work @@ -1466,14 +1359,16 @@ ASYNC_NIF_DECL( return; } - WterlCtx *ctx = NULL + WterlCtxHandle *ctx = NULL; WT_CURSOR *cursor = NULL; - int rc = __retain_ctx(args->conn_handle, args->uri, &ctx); + int rc = __retain_ctx(args->conn_handle, &ctx, + args->conn_handle->session_config, + args->uri, "overwrite,raw"); if (rc != 0) { ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } - cursor = ctx->cursor; + cursor = ctx->cursors[0]; WT_ITEM item_key; WT_ITEM item_value; @@ -1495,7 +1390,7 @@ ASYNC_NIF_DECL( unsigned char *bin = enif_make_new_binary(env, item_value.size, &value); memcpy(bin, item_value.data, item_value.size); ASYNC_NIF_REPLY(enif_make_tuple2(env, ATOM_OK, value)); - __release_ctx(args->conn_handle, args->uri, ctx); + __release_ctx(args->conn_handle, ctx); }, { // post @@ -1531,7 +1426,7 @@ ASYNC_NIF_DECL( args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); args->value = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[3]); enif_keep_resource((void*)args->conn_handle); - affinity = args->uri; + affinity = __str_hash(args->uri); }, { // work @@ -1546,14 +1441,16 @@ ASYNC_NIF_DECL( return; } - WterlCtx *ctx = NULL; + WterlCtxHandle *ctx = NULL; WT_CURSOR *cursor = NULL; - int rc = __retain_ctx(args->conn_handle, args->uri, &ctx); + int rc = __retain_ctx(args->conn_handle, &ctx, + args->conn_handle->session_config, + args->uri, "overwrite,raw"); if (rc != 0) { ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } - cursor = ctx->cursors; + cursor = ctx->cursors[0]; WT_ITEM item_key; WT_ITEM item_value; @@ -1564,7 +1461,7 @@ ASYNC_NIF_DECL( item_value.size = value.size; cursor->set_value(cursor, &item_value); rc = cursor->insert(cursor); - __release_ctx(args->conn_handle, args->uri, ctx); + __release_ctx(args->conn_handle, ctx); ASYNC_NIF_REPLY(rc == 0 ? ATOM_OK : __strerror_term(env, rc)); }, { // post @@ -1597,7 +1494,7 @@ ASYNC_NIF_DECL( } args->config = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); enif_keep_resource((void*)args->conn_handle); - affinity = args->uri; + affinity = __str_hash(args->uri); }, { // work @@ -2301,12 +2198,7 @@ on_load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) memset(priv, 0, sizeof(struct wterl_priv_data)); priv->conns_mutex = enif_mutex_create(NULL); - priv->conns = kh_init(conns); - if (!priv->conns) { - enif_mutex_destroy(priv->conns_mutex); - enif_free(priv); - return ENOMEM; - } + SLIST_INIT(&priv->conns); struct wterl_event_handlers *eh = &priv->eh; eh->error_mutex = enif_mutex_create(NULL); @@ -2333,7 +2225,6 @@ on_load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) pointer to the async_nif's private data which we set here. 
 */
     ASYNC_NIF_LOAD(wterl, priv->async_nif_priv);
     if (!priv->async_nif_priv) {
-        kh_destroy(conns, priv->conns);
         enif_mutex_destroy(priv->conns_mutex);
         enif_free(priv);
         return ENOMEM;
@@ -2358,61 +2249,29 @@ on_reload(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info)
 static void
 on_unload(ErlNifEnv *env, void *priv_data)
 {
-    unsigned int i;
     struct wterl_priv_data *priv = (struct wterl_priv_data *)priv_data;
-    khash_t(conns) *h;
-    khiter_t itr_conns;
     WterlConnHandle *conn_handle;
+    struct cache_entry *e;
 
     enif_mutex_lock(priv->conns_mutex);
-    h = priv->conns;
-    for (itr_conns = kh_begin(h); itr_conns != kh_end(h); ++itr_conns) {
-        if (kh_exist(h, itr_conns)) {
-            conn_handle = kh_val(h, itr_conns);
-            if (conn_handle) {
-                enif_mutex_lock(conn_handle->contexts_mutex);
-                enif_free((void*)conn_handle->session_config);
-                for (i = 0; i < ASYNC_NIF_MAX_WORKERS; i++) {
-                    WterlCtx *ctx = &conn_handle->contexts[i];
-                    if (ctx->session != NULL) {
-                        WT_SESSION *session = ctx->session;
-                        khash_t(cursors) *h = ctx->cursors;
-                        khiter_t itr_cursors;
-                        for (itr_cursors = kh_begin(h); itr_cursors != kh_end(h); ++itr_cursors) {
-                            if (kh_exist(h, itr_cursors)) {
-                                WT_CURSOR *cursor = kh_val(h, itr_cursors);
-                                char *key = (char *)kh_key(h, itr_cursors);
-                                cursor->close(cursor);
-                                kh_del(cursors, h, itr_cursors);
-                                enif_free(key);
-                                kh_value(h, itr_cursors) = NULL;
-                            }
-                        }
-                        kh_destroy(cursors, h);
-                        session->close(session, NULL);
-                    }
-                }
-            }
-
-            /* This would have closed all cursors and sessions for us
-               but we do that explicitly above. */
-            conn_handle->conn->close(conn_handle->conn, NULL);
-        }
+    /* Lock the cache mutex before unloading the async_nif to prevent new
+       work from coming in while shutting down. */
+    SLIST_FOREACH(conn_handle, &priv->conns, conns) {
+        enif_mutex_lock(conn_handle->cache_mutex);
     }
 
-    /* Continue to hold the context mutex while unloading the async_nif
-       to prevent new work from coming in while shutting down. */
     ASYNC_NIF_UNLOAD(wterl, env, priv->async_nif_priv);
 
-    for (itr_conns = kh_begin(h); itr_conns != kh_end(h); ++itr_conns) {
-        if (kh_exist(h, itr_conns)) {
-            conn_handle = kh_val(h, itr_conns);
-            if (conn_handle) {
-                enif_mutex_unlock(conn_handle->contexts_mutex);
-                enif_mutex_destroy(conn_handle->contexts_mutex);
-            }
-        }
+    SLIST_FOREACH(conn_handle, &priv->conns, conns) {
+        fifo_q_foreach(cache_entries, conn_handle->recycled_cache_entries, e, {
+            WterlCtxHandle *ctx = e->ctx;
+            ctx->session->close(ctx->session, NULL);
+        });
+        fifo_q_free(cache_entries, conn_handle->recycled_cache_entries);
+        kb_destroy(cache_entries, conn_handle->cache);
+        enif_mutex_unlock(conn_handle->cache_mutex);
+        enif_mutex_destroy(conn_handle->cache_mutex);
     }
 
     /* At this point all WiredTiger state and threads are free'd/stopped so there
@@ -2429,7 +2288,6 @@ on_unload(ErlNifEnv *env, void *priv_data)
     if (eh->msg_env_progress)
         enif_free_env(eh->msg_env_progress);
 
-    kh_destroy(conns, h);
     enif_mutex_unlock(priv->conns_mutex);
     enif_mutex_destroy(priv->conns_mutex);
     enif_free(priv);

From 9468870e1f820e307251467d29a9ebbe2c06a6ff Mon Sep 17 00:00:00 2001
From: Gregory Burd
Date: Thu, 30 May 2013 17:10:51 -0400
Subject: [PATCH 04/30] WIP: use a log2 histogram to track how long items live
 in the cache; evict items that have been in the cache longer than the mean
 time.
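
A standalone sketch of the bookkeeping this patch adds, for reference (the
residency_hist/should_evict names are illustrative only; the real code folds
the histogram and hit/miss counters into WterlConnHandle and stamps entries
with cpu_clock_ticks()):

    #include <stdint.h>

    /* floor(log2(x)), 0 when x == 0; stands in for the NIF's __log2() */
    static unsigned log2_u64(uint64_t x)
    {
        unsigned r = 0;
        while (x >>= 1)
            r++;
        return r;
    }

    struct residency_hist {
        uint64_t bucket[64];  /* bucket[i]: hits whose log2(residency) == i */
        uint64_t hits;
    };

    /* On each cache hit, record how long the entry had been cached. */
    static void hist_record(struct residency_hist *h, uint64_t elapsed)
    {
        h->bucket[log2_u64(elapsed)]++;
        h->hits++;
    }

    /* Mean residency, expressed as a log2 bucket index. */
    static uint64_t hist_mean(const struct residency_hist *h)
    {
        uint64_t sum = 0;
        unsigned i;
        if (h->hits == 0)
            return 0;
        for (i = 0; i < 64; i++)
            sum += h->bucket[i] * (uint64_t)i;
        return sum / h->hits;
    }

    /* Eviction test: entries resident longer than the mean get evicted. */
    static int should_evict(const struct residency_hist *h,
                            uint64_t now, uint64_t tstamp)
    {
        return log2_u64(now - tstamp) > hist_mean(h);
    }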
---
 c_src/wterl.c                 | 166 ++++---
 src/riak_kv_wterl_backend.erl | 613 ----------------------------------
 src/temp_riak_kv_backend.erl  | 287 ----------------
 3 files changed, 120 insertions(+), 946 deletions(-)
 delete mode 100644 src/riak_kv_wterl_backend.erl
 delete mode 100644 src/temp_riak_kv_backend.erl

diff --git a/c_src/wterl.c b/c_src/wterl.c
index 53ee9b6..eba372d 100644
--- a/c_src/wterl.c
+++ b/c_src/wterl.c
@@ -40,14 +40,14 @@ static ErlNifResourceType *wterl_cursor_RESOURCE;
 /* WiredTiger object names*/
 typedef char Uri[128];
 
-typedef struct wterl_ctx {
+struct wterl_ctx {
     WT_SESSION *session;  // open session
     WT_CURSOR **cursors;  // open cursors, all reset ready to reuse
     uint64_t sig;
-} WterlCtxHandle;
+};
 
 struct cache_entry {
-    WterlCtxHandle *ctx;
+    struct wterl_ctx *ctx;
     uint64_t sig;
     uint64_t tstamp;
 };
@@ -63,6 +63,8 @@ typedef struct wterl_conn {
     kbtree_t(cache_entries) *cache;
     fifo_t(cache_entries) recycled_cache_entries;
     SLIST_ENTRY(wterl_conn) conns;
+    uint64_t histogram[64];
+    uint64_t hits, misses;
 } WterlConnHandle;
 
 typedef struct {
@@ -105,6 +107,51 @@ static ERL_NIF_TERM ATOM_MSG_PID;
 /* Global init for async_nif. */
 ASYNC_NIF_INIT(wterl);
 
+/**
+ * A string hash function.
+ *
+ * A basic hash function for strings of characters used during the
+ * affinity association.
+ *
+ * s    a NULL terminated set of bytes to be hashed
+ * ->   an integer hash encoding of the bytes
+ */
+static inline uint32_t
+__str_hash(const char *s)
+{
+    unsigned int h = (unsigned int)*s;
+    if (h) for (++s ; *s; ++s) h = (h << 5) - h + (unsigned int)*s;
+    return h;
+}
+
+/**
+ * Calculate the floor of log2 of a 64bit unsigned integer.
+ */
+#ifdef __GNUC__
+#define __log2(X) ((X) ? ((unsigned) (((8 * sizeof(uint64_t)) - 1) - __builtin_clzll((X)))) : 0u)
+#else
+static inline uint32_t __log2(uint64_t x) {
+    static const int tab64[64] = {
+        63,  0, 58,  1, 59, 47, 53,  2,
+        60, 39, 48, 27, 54, 33, 42,  3,
+        61, 51, 37, 40, 49, 18, 28, 20,
+        55, 30, 34, 11, 43, 14, 22,  4,
+        62, 57, 46, 52, 38, 26, 32, 41,
+        50, 36, 17, 19, 29, 10, 13, 21,
+        56, 45, 25, 31, 35, 16,  9, 12,
+        44, 24, 15,  8, 23,  7,  6,  5};
+    if (x == 0) return 0;
+    uint64_t v = x;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    v |= v >> 32;
+    return tab64[((uint64_t)((v - (v >> 1)) * 0x07EDD5E59A4E28C2)) >> 58];
+}
+#endif
+
 /**
  * Is the context cache full?
  *
@@ -125,11 +172,43 @@ __ctx_cache_full(WterlConnHandle *conn)
  * -> number of items evicted
 */
 static int
-__ctx_cache_evict(WterlConnHandle *conn)
+__ctx_cache_evict(WterlConnHandle *conn_handle)
 {
-    // TODO:
-    UNUSED(conn);
-    return 0;
+    uint32_t i;
+    uint64_t mean = 0, now = cpu_clock_ticks();
+    kbtree_t(cache_entries) *t = conn_handle->cache;
+
+    // Find the mean of the recorded times that items stayed in cache.
+    for (i = 0; i < 64; i++)
+        mean += (conn_handle->histogram[i] * i);
+    if (mean > 0)
+        mean /= conn_handle->hits;
+
+    // Clear out the histogram and hit/miss counters.
+    memset(conn_handle->histogram, 0, sizeof(uint64_t) * 64);
+    conn_handle->hits = 0;
+    conn_handle->misses = 0;
+
+    // Evict anything older than the mean time in queue.
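+    // Note: histogram[] buckets are indexed by log2() of ticks-in-cache, so
+    // the 'mean' computed above is in log2 units as well; it is compared
+    // against log2(elapsed) for each entry in the traversal below.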
+    i = 0;
+#define traverse_f(p)                                                   \
+    {                                                                   \
+        struct cache_entry *e = *p;                                     \
+        uint64_t elapsed = now - e->tstamp;                             \
+        if (__log2(elapsed) > mean) {                                   \
+            kb_del(cache_entries, t, e);                                \
+            e->ctx->session->close(e->ctx->session, NULL);              \
+            enif_free(e->ctx);                                          \
+            fifo_q_put(cache_entries, conn_handle->recycled_cache_entries, e); \
+            i++;                                                        \
+        }                                                               \
+    }
+    __kb_traverse(struct cache_entry *, t, traverse_f);
+#undef traverse_f
+    return i;
 }
 
 /**
@@ -141,21 +217,27 @@ __ctx_cache_evict(WterlConnHandle *conn)
  * sig  a 64-bit signature (hash) representing the session/cursor* needed
  *      for the operation
 */
-static WterlCtxHandle *
-__ctx_cache_find(WterlConnHandle *conn, const uint64_t sig)
+static struct wterl_ctx *
+__ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig)
 {
-    WterlCtxHandle *p = NULL;
+    struct wterl_ctx *p = NULL;
     struct cache_entry key, *e;
 
     key.sig = sig;
-    e = *kb_get(cache_entries, conn->cache, &key);
+    struct cache_entry **kv = kb_get(cache_entries, conn_handle->cache, &key);
+    e = kv ? *kv : NULL; // kb_get() returns NULL on a miss, don't dereference it
    if (e) {
         // cache hit, remove it from the tree
-        kb_del(cache_entries, conn->cache, &key);
+        uint64_t elapsed = cpu_clock_ticks() - e->tstamp;
+        kb_del(cache_entries, conn_handle->cache, &key);
         p = e->ctx;
         memset(e, 0, sizeof(struct cache_entry));
-        fifo_q_put(cache_entries, conn->recycled_cache_entries, e);
-    } // else { cache miss, so p == NULL when we return }
+        fifo_q_put(cache_entries, conn_handle->recycled_cache_entries, e);
+        conn_handle->hits++;
+        conn_handle->histogram[__log2(elapsed)]++;
+    } else {
+        conn_handle->misses++;
+    }
     return p;
 }
 
@@ -166,7 +247,7 @@ __ctx_cache_find(wterl_ctx_cache *cache, u_int64_t sig)
  * the front of the LRU.
 */
 static int
-__ctx_cache_add(WterlConnHandle *conn, WterlCtxHandle *c)
+__ctx_cache_add(WterlConnHandle *conn, struct wterl_ctx *c)
 {
     struct cache_entry *e;
 
@@ -229,23 +310,6 @@ __zi(uint32_t p, uint32_t q)
     return z;
 }
 
-/**
- * A string hash function.
- *
- * A basic hash function for strings of characters used during the
- * affinity association.
- *
- * s    a NULL terminated set of bytes to be hashed
- * ->   an integer hash encoding of the bytes
- */
-static inline uint32_t
-__str_hash(const char *s)
-{
-    unsigned int h = (unsigned int)*s;
-    if (h) for (++s ; *s; ++s) h = (h << 5) - h + (unsigned int)*s;
-    return h;
-}
-
 /**
  * Create a signature for the operation we're about to perform.
 *
@@ -300,7 +364,7 @@ __ctx_cache_sig(const char *c, ...)
 * session.
 */
 static int
-__retain_ctx(WterlConnHandle *conn_handle, WterlCtxHandle **ctx,
+__retain_ctx(WterlConnHandle *conn_handle, struct wterl_ctx **ctx,
              const char *session_config, ...)
{ int i, count; @@ -314,17 +378,18 @@ __retain_ctx(WterlConnHandle *conn_handle, WterlCtxHandle **ctx, va_end (ap); enif_mutex_lock(conn_handle->cache_mutex); - *ctx = __ctx_cache_find(conn_handle, sig); - if (NULL == *ctx) { + (*ctx) = __ctx_cache_find(conn_handle, sig); + if ((*ctx) == NULL) { // cache miss WT_CONNECTION *conn = conn_handle->conn; WT_SESSION *session = NULL; int rc = conn->open_session(conn, NULL, session_config, &session); if (rc != 0) return rc; - size_t s = sizeof(WterlCtxHandle) + ((count / 2) * sizeof(WT_CURSOR*)); - *ctx = enif_alloc_resource(wterl_ctx_RESOURCE, s); - if (NULL == *ctx) { + size_t s = sizeof(struct wterl_ctx) + ((count / 2) * sizeof(WT_CURSOR*)); + // TODO: *ctx = enif_alloc_resource(wterl_ctx_RESOURCE, s); + *ctx = enif_alloc(s); + if (*ctx == NULL) { session->close(session, NULL); return ENOMEM; } @@ -337,20 +402,24 @@ __retain_ctx(WterlConnHandle *conn_handle, WterlCtxHandle **ctx, for (i = 0; i < (count / 2); i++) { const char *uri = va_arg(ap, const char *); const char *config = va_arg(ap, const char *); + // TODO: error when uri or config is NULL rc = session->open_cursor(session, uri, NULL, config, &cursors[i]); if (rc != 0) { - session->close(session, NULL); // this will free cursors too + session->close(session, NULL); // this will free the cursors too return rc; } } va_end (ap); - } // else { cache hit so 'ctx' is a reusable session/cursor } + } // else { cache hit } enif_mutex_unlock(conn_handle->cache_mutex); return 0; } +/** + * Return a context to the cache for reuse. + */ static void -__release_ctx(WterlConnHandle *conn_handle, WterlCtxHandle *ctx) +__release_ctx(WterlConnHandle *conn_handle, struct wterl_ctx *ctx) { int i, c; WT_CURSOR *cursor; @@ -375,7 +444,10 @@ __close_all_sessions(WterlConnHandle *conn_handle) { kbtree_t(cache_entries) *t = conn_handle->cache; -#define traverse_f(p) { kb_del(cache_entries, t, *p); } +#define traverse_f(p) { \ + kb_del(cache_entries, t, *p); \ + enif_free(p); \ + } __kb_traverse(struct cache_entry *, t, traverse_f); #undef traverse_f } @@ -1300,7 +1372,7 @@ ASYNC_NIF_DECL( return; } - WterlCtxHandle *ctx = NULL; + struct wterl_ctx *ctx = NULL; WT_CURSOR *cursor = NULL; int rc = __retain_ctx(args->conn_handle, &ctx, args->conn_handle->session_config, @@ -1359,7 +1431,7 @@ ASYNC_NIF_DECL( return; } - WterlCtxHandle *ctx = NULL; + struct wterl_ctx *ctx = NULL; WT_CURSOR *cursor = NULL; int rc = __retain_ctx(args->conn_handle, &ctx, args->conn_handle->session_config, @@ -1441,7 +1513,7 @@ ASYNC_NIF_DECL( return; } - WterlCtxHandle *ctx = NULL; + struct wterl_ctx *ctx = NULL; WT_CURSOR *cursor = NULL; int rc = __retain_ctx(args->conn_handle, &ctx, args->conn_handle->session_config, @@ -2180,6 +2252,8 @@ on_load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) NULL, flags, NULL); wterl_cursor_RESOURCE = enif_open_resource_type(env, NULL, "wterl_cursor_resource", NULL, flags, NULL); + wterl_ctx_RESOURCE = enif_open_resource_type(env, NULL, "wterl_ctx_resource", + NULL, flags, NULL); ATOM_ERROR = enif_make_atom(env, "error"); ATOM_OK = enif_make_atom(env, "ok"); @@ -2265,7 +2339,7 @@ on_unload(ErlNifEnv *env, void *priv_data) SLIST_FOREACH(conn_handle, &priv->conns, conns) { fifo_q_foreach(cache_entries, conn_handle->recycled_cache_entries, e, { - WterlCtxHandle *ctx = e->ctx; + struct wterl_ctx *ctx = e->ctx; ctx->session->close(ctx->session, NULL); }); fifo_q_free(cache_entries, conn_handle->recycled_cache_entries); diff --git a/src/riak_kv_wterl_backend.erl b/src/riak_kv_wterl_backend.erl 
deleted file mode 100644 index 313da29..0000000 --- a/src/riak_kv_wterl_backend.erl +++ /dev/null @@ -1,613 +0,0 @@ -%% ------------------------------------------------------------------- -%% -%% riak_kv_wterl_backend: WiredTiger Driver for Riak -%% -%% Copyright (c) 2012-2013 Basho Technologies, Inc. All Rights Reserved. -%% -%% This file is provided to you under the Apache License, -%% Version 2.0 (the "License"); you may not use this file -%% except in compliance with the License. You may obtain -%% a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, -%% software distributed under the License is distributed on an -%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%% KIND, either express or implied. See the License for the -%% specific language governing permissions and limitations -%% under the License. -%% -%% ------------------------------------------------------------------- - --module(riak_kv_wterl_backend). --behavior(temp_riak_kv_backend). - -%% KV Backend API --export([api_version/0, - capabilities/1, - capabilities/2, - start/2, - stop/1, - get/3, - put/5, - delete/4, - drop/1, - fold_buckets/4, - fold_keys/4, - fold_objects/4, - is_empty/1, - status/1, - callback/3]). - --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). --compiel(export_all). --endif. - --define(API_VERSION, 1). -%% TODO: for when this backend supports 2i -%%-define(CAPABILITIES, [async_fold, indexes]). --define(CAPABILITIES, [async_fold]). - --record(state, {table :: string(), - type :: string(), - connection :: wterl:connection(), - is_empty_cursor :: wterl:cursor(), - status_cursor :: wterl:cursor()}). - --type state() :: #state{}. --type config() :: [{atom(), term()}]. - -%% =================================================================== -%% Public API -%% =================================================================== - -%% @doc Return the major version of the -%% current API. --spec api_version() -> {ok, integer()}. -api_version() -> - {ok, ?API_VERSION}. - -%% @doc Return the capabilities of the backend. --spec capabilities(state()) -> {ok, [atom()]}. -capabilities(_) -> - {ok, ?CAPABILITIES}. - -%% @doc Return the capabilities of the backend. --spec capabilities(riak_object:bucket(), state()) -> {ok, [atom()]}. -capabilities(_, _) -> - {ok, ?CAPABILITIES}. - -%% @doc Start the wterl backend --spec start(integer(), config()) -> {ok, state()} | {error, term()}. 
-start(Partition, Config) -> - AppStart = - case application:start(wterl) of - ok -> - ok; - {error, {already_started, _}} -> - ok; - {error, Reason1} -> - lager:error("Failed to start wterl: ~p", [Reason1]), - {error, Reason1} - end, - case AppStart of - ok -> - Type = - case wterl:config_value(type, Config, "lsm") of - {type, "lsm"} -> "lsm"; - {type, "table"} -> "table"; - {type, "btree"} -> "table"; - {type, BadType} -> - lager:info("wterl:start ignoring unknown type ~p, using lsm instead", [BadType]), - "lsm"; - _ -> - lager:info("wterl:start ignoring mistaken setting defaulting to lsm"), - "lsm" - end, - {ok, Connection} = establish_connection(Config, Type), - Table = Type ++ ":" ++ integer_to_list(Partition), - Compressor = - case wterl:config_value(block_compressor, Config, "snappy") of - {block_compressor, "snappy"}=C -> [C]; - {block_compressor, "none"} -> []; - {block_compressor, none} -> []; - {block_compressor, _} -> [{block_compressor, "snappy"}]; - _ -> [{block_compressor, "snappy"}] - end, - TableOpts = - case Type of - "lsm" -> - [{internal_page_max, "128K"}, - {leaf_page_max, "128K"}, - {lsm_chunk_size, "100MB"}, - {lsm_merge_threads, 2}, - {prefix_compression, false}, - {lsm_bloom_newest, true}, - {lsm_bloom_oldest, true} , - {lsm_bloom_bit_count, 128}, - {lsm_bloom_hash_count, 64}, - {lsm_bloom_config, [{leaf_page_max, "8MB"}]} - ] ++ Compressor; - "table" -> - Compressor - end, - case wterl:create(Connection, Table, TableOpts) of - ok -> - case establish_utility_cursors(Connection, Table) of - {ok, IsEmptyCursor, StatusCursor} -> - {ok, #state{table=Table, type=Type, - connection=Connection, - is_empty_cursor=IsEmptyCursor, - status_cursor=StatusCursor}}; - {error, Reason2} -> - {error, Reason2} - end; - {error, Reason3} -> - {error, Reason3} - end - end. - -%% @doc Stop the wterl backend --spec stop(state()) -> ok. -stop(_State) -> - ok. %% The connection is closed by wterl_conn:stop() - -%% @doc Retrieve an object from the wterl backend --spec get(riak_object:bucket(), riak_object:key(), state()) -> - {ok, any(), state()} | - {ok, not_found, state()} | - {error, term(), state()}. -get(Bucket, Key, #state{connection=Connection, table=Table}=State) -> - WTKey = to_object_key(Bucket, Key), - case wterl:get(Connection, Table, WTKey) of - {ok, Value} -> - {ok, Value, State}; - not_found -> - {error, not_found, State}; - {error, Reason} -> - {error, Reason, State} - end. - -%% @doc Insert an object into the wterl backend. -%% NOTE: The wterl backend does not currently support -%% secondary indexing and the_IndexSpecs parameter -%% is ignored. --type index_spec() :: {add, Index, SecondaryKey} | {remove, Index, SecondaryKey}. --spec put(riak_object:bucket(), riak_object:key(), [index_spec()], binary(), state()) -> - {ok, state()} | - {error, term(), state()}. -put(Bucket, PrimaryKey, _IndexSpecs, Val, #state{connection=Connection, table=Table}=State) -> - case wterl:put(Connection, Table, to_object_key(Bucket, PrimaryKey), Val) of - ok -> - {ok, State}; - {error, Reason} -> - {error, Reason, State} - end. - -%% @doc Delete an object from the wterl backend -%% NOTE: The wterl backend does not currently support -%% secondary indexing and the_IndexSpecs parameter -%% is ignored. --spec delete(riak_object:bucket(), riak_object:key(), [index_spec()], state()) -> - {ok, state()} | - {error, term(), state()}. 
-delete(Bucket, Key, _IndexSpecs, #state{connection=Connection, table=Table}=State) -> - case wterl:delete(Connection, Table, to_object_key(Bucket, Key)) of - ok -> - {ok, State}; - {error, Reason} -> - {error, Reason, State} - end. - -%% @doc Fold over all the buckets --spec fold_buckets(riak_kv_backend:fold_buckets_fun(), - any(), - [], - state()) -> {ok, any()} | {async, fun()}. -fold_buckets(FoldBucketsFun, Acc, Opts, #state{connection=Connection, table=Table}) -> - FoldFun = fold_buckets_fun(FoldBucketsFun), - BucketFolder = - fun() -> - case wterl:cursor_open(Connection, Table) of - {error, {enoent, _Message}} -> - Acc; - {ok, Cursor} -> - try - {FoldResult, _} = - wterl:fold_keys(Cursor, FoldFun, {Acc, []}), - FoldResult - catch - {break, AccFinal} -> - AccFinal - after - ok = wterl:cursor_close(Cursor) - end - end - end, - case lists:member(async_fold, Opts) of - true -> - {async, BucketFolder}; - false -> - {ok, BucketFolder()} - end. - -%% @doc Fold over all the keys for one or all buckets. --spec fold_keys(riak_kv_backend:fold_keys_fun(), - any(), - [{atom(), term()}], - state()) -> {ok, term()} | {async, fun()}. -fold_keys(FoldKeysFun, Acc, Opts, #state{connection=Connection, table=Table}) -> - %% Figure out how we should limit the fold: by bucket, by - %% secondary index, or neither (fold across everything.) - Bucket = lists:keyfind(bucket, 1, Opts), - Index = lists:keyfind(index, 1, Opts), - - %% Multiple limiters may exist. Take the most specific limiter. - Limiter = - if Index /= false -> Index; - Bucket /= false -> Bucket; - true -> undefined - end, - - %% Set up the fold... - FoldFun = fold_keys_fun(FoldKeysFun, Limiter), - KeyFolder = - fun() -> - case wterl:cursor_open(Connection, Table) of - {error, {enoent, _Message}} -> - Acc; - {ok, Cursor} -> - try - wterl:fold_keys(Cursor, FoldFun, Acc) - catch - {break, AccFinal} -> - AccFinal - after - ok = wterl:cursor_close(Cursor) - end - end - end, - case lists:member(async_fold, Opts) of - true -> - {async, KeyFolder}; - false -> - {ok, KeyFolder()} - end. - -%% @doc Fold over all the objects for one or all buckets. --spec fold_objects(riak_kv_backend:fold_objects_fun(), - any(), - [{atom(), term()}], - state()) -> {ok, any()} | {async, fun()}. -fold_objects(FoldObjectsFun, Acc, Opts, #state{connection=Connection, table=Table}) -> - Bucket = proplists:get_value(bucket, Opts), - FoldFun = fold_objects_fun(FoldObjectsFun, Bucket), - ObjectFolder = - fun() -> - case wterl:cursor_open(Connection, Table) of - {error, {enoent, _Message}} -> - Acc; - {ok, Cursor} -> - try - wterl:fold(Cursor, FoldFun, Acc) - catch - {break, AccFinal} -> - AccFinal - after - case wterl:cursor_close(Cursor) of - ok -> - ok; - {error, {eperm, _}} -> %% TODO: review/fix - ok; - {error, _}=E -> - E - end - end - end - end, - case lists:member(async_fold, Opts) of - true -> - {async, ObjectFolder}; - false -> - {ok, ObjectFolder()} - end. - -%% @doc Delete all objects from this wterl backend --spec drop(state()) -> {ok, state()} | {error, term(), state()}. -drop(#state{connection=Connection, table=Table}=State) -> - case wterl:drop(Connection, Table) of - ok -> - {ok, State}; - {error, {ebusy, _}} -> %% TODO: review/fix - {ok, State}; - Error -> - {error, Error, State} - end. - -%% @doc Returns true if this wterl backend contains any -%% non-tombstone values; otherwise returns false. --spec is_empty(state()) -> boolean(). 
-is_empty(#state{is_empty_cursor=Cursor}) -> - wterl:cursor_reset(Cursor), - case wterl:cursor_next(Cursor) of - not_found -> true; - {error, {eperm, _}} -> false; % TODO: review/fix this logic - _ -> false - end. - -%% @doc Get the status information for this wterl backend --spec status(state()) -> [{atom(), term()}]. -status(#state{status_cursor=Cursor}) -> - wterl:cursor_reset(Cursor), - case fetch_status(Cursor) of - {ok, Stats} -> - Stats; - {error, {eperm, _}} -> % TODO: review/fix this logic - {ok, []}; - _ -> - {ok, []} - end. - -%% @doc Register an asynchronous callback --spec callback(reference(), any(), state()) -> {ok, state()}. -callback(_Ref, _Msg, State) -> - {ok, State}. - - -%% =================================================================== -%% Internal functions -%% =================================================================== - -%% @private -max_sessions(Config) -> - RingSize = - case app_helper:get_prop_or_env(ring_creation_size, Config, riak_core) of - undefined -> 1024; - Size -> Size - end, - Est = 100 * (RingSize * erlang:system_info(schedulers)), % TODO: review/fix this logic - case Est > 1000000000 of % Note: WiredTiger uses a signed int for this - true -> 1000000000; - false -> Est - end. - -%% @private -establish_utility_cursors(Connection, Table) -> - case wterl:cursor_open(Connection, Table) of - {ok, IsEmptyCursor} -> - case wterl:cursor_open(Connection, "statistics:" ++ Table, [{statistics_fast, true}]) of - {ok, StatusCursor} -> - {ok, IsEmptyCursor, StatusCursor}; - {error, Reason1} -> - {error, Reason1} - end; - {error, Reason2} -> - {error, Reason2} - end. - -%% @private -establish_connection(Config, Type) -> - %% Get the data root directory - case app_helper:get_prop_or_env(data_root, Config, wterl) of - undefined -> - lager:error("Failed to create wterl dir: data_root is not set"), - {error, data_root_unset}; - DataRoot -> - ok = filelib:ensure_dir(filename:join(DataRoot, "x")), - - %% WT Connection Options: - %% NOTE: LSM auto-checkpoints, so we don't have too. - CheckpointSetting = - case Type =:= "lsm" of - true -> - []; - false -> - app_helper:get_prop_or_env(checkpoint, Config, wterl, [{wait, 10}]) - end, - RequestedCacheSize = app_helper:get_prop_or_env(cache_size, Config, wterl), - ConnectionOpts = - orddict:from_list( - [ wterl:config_value(create, Config, true), - wterl:config_value(sync, Config, false), - wterl:config_value(logging, Config, true), - wterl:config_value(transactional, Config, true), - wterl:config_value(session_max, Config, max_sessions(Config)), - wterl:config_value(cache_size, Config, size_cache(RequestedCacheSize)), - wterl:config_value(statistics_log, Config, [{wait, 300}]), % sec - wterl:config_value(verbose, Config, [ "salvage", "verify" - % Note: for some unknown reason, if you add these additional - % verbose flags Erlang SEGV's "size_object: bad tag for 0x80" - % no idea why... yet... you've been warned. - - %"block", "shared_cache", "reconcile", "evict", "lsm", - %"fileops", "read", "write", "readserver", "evictserver", - %"hazard", "mutex", "ckpt" - ]) ] ++ CheckpointSetting ++ proplists:get_value(wterl, Config, [])), % sec - - %% WT Session Options: - SessionOpts = [{isolation, "snapshot"}], - - case wterl_conn:open(DataRoot, ConnectionOpts, SessionOpts) of - {ok, Connection} -> - {ok, Connection}; - {error, Reason2} -> - lager:error("Failed to establish a WiredTiger connection, wterl backend unable to start: ~p\n", [Reason2]), - {error, Reason2} - end - end. 
- -%% @private -%% Return a function to fold over the buckets on this backend -fold_buckets_fun(FoldBucketsFun) -> - fun(BK, {Acc, LastBucket}) -> - case from_object_key(BK) of - {LastBucket, _} -> - {Acc, LastBucket}; - {Bucket, _} -> - {FoldBucketsFun(Bucket, Acc), Bucket}; - _ -> - throw({break, Acc}) - end - end. - -%% @private -%% Return a function to fold over keys on this backend -fold_keys_fun(FoldKeysFun, undefined) -> - %% Fold across everything... - fun(StorageKey, Acc) -> - case from_object_key(StorageKey) of - {Bucket, Key} -> - FoldKeysFun(Bucket, Key, Acc); - _ -> - throw({break, Acc}) - end - end; -fold_keys_fun(FoldKeysFun, {bucket, FilterBucket}) -> - %% Fold across a specific bucket... - fun(StorageKey, Acc) -> - case from_object_key(StorageKey) of - {Bucket, Key} when Bucket == FilterBucket -> - FoldKeysFun(Bucket, Key, Acc); - _ -> - throw({break, Acc}) - end - end; -fold_keys_fun(FoldKeysFun, {index, FilterBucket, {eq, <<"$bucket">>, _}}) -> - %% 2I exact match query on special $bucket field... - fold_keys_fun(FoldKeysFun, {bucket, FilterBucket}); -fold_keys_fun(FoldKeysFun, {index, FilterBucket, {eq, FilterField, FilterTerm}}) -> - %% Rewrite 2I exact match query as a range... - NewQuery = {range, FilterField, FilterTerm, FilterTerm}, - fold_keys_fun(FoldKeysFun, {index, FilterBucket, NewQuery}); -fold_keys_fun(FoldKeysFun, {index, FilterBucket, {range, <<"$key">>, StartKey, EndKey}}) -> - %% 2I range query on special $key field... - fun(StorageKey, Acc) -> - case from_object_key(StorageKey) of - {Bucket, Key} when FilterBucket == Bucket, - StartKey =< Key, - EndKey >= Key -> - FoldKeysFun(Bucket, Key, Acc); - _ -> - throw({break, Acc}) - end - end; -fold_keys_fun(FoldKeysFun, {index, FilterBucket, {range, FilterField, StartTerm, EndTerm}}) -> - %% 2I range query... - fun(StorageKey, Acc) -> - case from_index_key(StorageKey) of - {Bucket, Key, Field, Term} when FilterBucket == Bucket, - FilterField == Field, - StartTerm =< Term, - EndTerm >= Term -> - FoldKeysFun(Bucket, Key, Acc); - _ -> - throw({break, Acc}) - end - end; -fold_keys_fun(_FoldKeysFun, Other) -> - throw({unknown_limiter, Other}). - -%% @private -%% Return a function to fold over the objects on this backend -fold_objects_fun(FoldObjectsFun, FilterBucket) -> - %% 2I does not support fold objects at this time, so this is much - %% simpler than fold_keys_fun. - fun({StorageKey, Value}, Acc) -> - case from_object_key(StorageKey) of - {Bucket, Key} when FilterBucket == undefined; - Bucket == FilterBucket -> - FoldObjectsFun(Bucket, Key, Value, Acc); - _ -> - throw({break, Acc}) - end - end. - -to_object_key(Bucket, Key) -> - sext:encode({o, Bucket, Key}). - -from_object_key(LKey) -> - case sext:decode(LKey) of - {o, Bucket, Key} -> - {Bucket, Key}; - _ -> - undefined - end. - -from_index_key(LKey) -> - case sext:decode(LKey) of - {i, Bucket, Field, Term, Key} -> - {Bucket, Key, Field, Term}; - _ -> - undefined - end. - -%% @private -%% Return all status from wterl statistics cursor -fetch_status(Cursor) -> - {ok, fetch_status(Cursor, wterl:cursor_next_value(Cursor), [])}. -fetch_status(_Cursor, {error, _}, Acc) -> - lists:reverse(Acc); -fetch_status(_Cursor, not_found, Acc) -> - lists:reverse(Acc); -fetch_status(Cursor, {ok, Stat}, Acc) -> - [What,Val|_] = [binary_to_list(B) || B <- binary:split(Stat, [<<0>>], [global])], - fetch_status(Cursor, wterl:cursor_next_value(Cursor), [{What,Val}|Acc]). 
- -size_cache(RequestedSize) -> - Size = - case RequestedSize of - undefined -> - RunningApps = application:which_applications(), - FinalGuess = - case proplists:is_defined(sasl, RunningApps) andalso - proplists:is_defined(os_mon, RunningApps) of - true -> - Memory = memsup:get_system_memory_data(), - TotalRAM = proplists:get_value(system_total_memory, Memory), - FreeRAM = proplists:get_value(free_memory, Memory), - UsedByBeam = proplists:get_value(total, erlang:memory()), - Target = ((TotalRAM - UsedByBeam) div 4), - FirstGuess = (Target - (Target rem (1024 * 1024))), - SecondGuess = - case FirstGuess > FreeRAM of - true -> FreeRAM - (FreeRAM rem (1024 * 1024)); - _ -> FirstGuess - end, - case SecondGuess < 1073741824 of %% < 1GB? - true -> "1GB"; - false -> - ThirdGuess = SecondGuess div (1024 * 1024), - integer_to_list(ThirdGuess) ++ "MB" - end; - false -> - "1GB" - end, - application:set_env(wterl, cache_size, FinalGuess), - FinalGuess; - Value when is_list(Value) -> - Value; - Value when is_number(Value) -> - integer_to_list(Value) - end, - Size. - -%% =================================================================== -%% EUnit tests -%% =================================================================== --ifdef(TEST). - -simple_test_() -> - {ok, CWD} = file:get_cwd(), - rmdir:path(filename:join([CWD, "test/wterl-backend"])), %?assertCmd("rm -rf test/wterl-backend"), - application:set_env(wterl, data_root, "test/wterl-backend"), - temp_riak_kv_backend:standard_test(?MODULE, []). - -custom_config_test_() -> - {ok, CWD} = file:get_cwd(), - rmdir:path(filename:join([CWD, "test/wterl-backend"])), %?assertCmd("rm -rf test/wterl-backend"), - application:set_env(wterl, data_root, ""), - temp_riak_kv_backend:standard_test(?MODULE, [{data_root, "test/wterl-backend"}]). - --endif. diff --git a/src/temp_riak_kv_backend.erl b/src/temp_riak_kv_backend.erl deleted file mode 100644 index c41a38d..0000000 --- a/src/temp_riak_kv_backend.erl +++ /dev/null @@ -1,287 +0,0 @@ -%% ------------------------------------------------------------------- -%% -%% riak_kv_backend: Riak backend behaviour -%% -%% Copyright (c) 2007-2010 Basho Technologies, Inc. All Rights Reserved. -%% -%% This file is provided to you under the Apache License, -%% Version 2.0 (the "License"); you may not use this file -%% except in compliance with the License. You may obtain -%% a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, -%% software distributed under the License is distributed on an -%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%% KIND, either express or implied. See the License for the -%% specific language governing permissions and limitations -%% under the License. -%% -%% ------------------------------------------------------------------- - -%%% NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE -%%% -%%% This is a temporary copy of riak_kv_backend, just here to keep -%%% wterl development private for now. When riak_kv_wterl_backend is -%%% moved to riak_kv, delete this file. -%%% -%%% NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE - - --module(temp_riak_kv_backend). - --export([behaviour_info/1]). --export([callback_after/3]). - --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). --compile(export_all). --export([standard_test/2]). --endif. - --type fold_buckets_fun() :: fun((binary(), any()) -> any() | no_return()). 
--type fold_keys_fun() :: fun((binary(), binary(), any()) -> any() | - no_return()). --type fold_objects_fun() :: fun((binary(), binary(), term(), any()) -> - any() | - no_return()). --export_type([fold_buckets_fun/0, - fold_keys_fun/0, - fold_objects_fun/0]). - --spec behaviour_info(atom()) -> 'undefined' | [{atom(), arity()}]. -behaviour_info(callbacks) -> - [ - {api_version,0}, - {capabilities, 1}, % (State) - {capabilities, 2}, % (Bucket, State) - {start,2}, % (Partition, Config) - {stop,1}, % (State) - {get,3}, % (Bucket, Key, State) - {put,5}, % (Bucket, Key, IndexSpecs, Val, State) - {delete,4}, % (Bucket, Key, IndexSpecs, State) - {drop,1}, % (State) - {fold_buckets,4}, % (FoldBucketsFun, Acc, Opts, State), - % FoldBucketsFun(Bucket, Acc) - {fold_keys,4}, % (FoldKeysFun, Acc, Opts, State), - % FoldKeysFun(Bucket, Key, Acc) - {fold_objects,4}, % (FoldObjectsFun, Acc, Opts, State), - % FoldObjectsFun(Bucket, Key, Object, Acc) - {is_empty,1}, % (State) - {status,1}, % (State) - {callback,3}]; % (Ref, Msg, State) -> -behaviour_info(_Other) -> - undefined. - -%% Queue a callback for the backend after Time ms. --spec callback_after(integer(), reference(), term()) -> reference(). -callback_after(Time, Ref, Msg) when is_integer(Time), is_reference(Ref) -> - riak_core_vnode:send_command_after(Time, {backend_callback, Ref, Msg}). - --ifdef(TEST). - -standard_test(BackendMod, Config) -> - {spawn, - [ - {setup, - fun() -> ?MODULE:setup({BackendMod, Config}) end, - fun ?MODULE:cleanup/1, - fun(X) -> - [?MODULE:basic_store_and_fetch(X), - ?MODULE:fold_buckets(X), - ?MODULE:fold_keys(X), - ?MODULE:delete_object(X), - ?MODULE:fold_objects(X), - ?MODULE:empty_check(X) - ] - end - }]}. - -basic_store_and_fetch({Backend, State}) -> - {"basic store and fetch test", - fun() -> - [ - ?_assertMatch({ok, _}, - Backend:put(<<"b1">>, <<"k1">>, [], <<"v1">>, State)), - ?_assertMatch({ok, _}, - Backend:put(<<"b2">>, <<"k2">>, [], <<"v2">>, State)), - ?_assertMatch({ok,<<"v2">>, _}, - Backend:get(<<"b2">>, <<"k2">>, State)), - ?_assertMatch({error, not_found, _}, - Backend:get(<<"b1">>, <<"k3">>, State)) - ] - end - }. - -fold_buckets({Backend, State}) -> - {"bucket folding test", - fun() -> - FoldBucketsFun = - fun(Bucket, Acc) -> - [Bucket | Acc] - end, - - ?_assertEqual([<<"b1">>, <<"b2">>], - begin - {ok, Buckets1} = - Backend:fold_buckets(FoldBucketsFun, - [], - [], - State), - lists:sort(Buckets1) - end) - end - }. 
- -fold_keys({Backend, State}) -> - {"key folding test", - fun() -> - FoldKeysFun = - fun(Bucket, Key, Acc) -> - [{Bucket, Key} | Acc] - end, - FoldKeysFun1 = - fun(_Bucket, Key, Acc) -> - [Key | Acc] - end, - FoldKeysFun2 = - fun(Bucket, Key, Acc) -> - case Bucket =:= <<"b1">> of - true -> - [Key | Acc]; - false -> - Acc - end - end, - FoldKeysFun3 = - fun(Bucket, Key, Acc) -> - case Bucket =:= <<"b1">> of - true -> - Acc; - false -> - [Key | Acc] - end - end, - [ - ?_assertEqual([{<<"b1">>, <<"k1">>}, {<<"b2">>, <<"k2">>}], - begin - {ok, Keys1} = - Backend:fold_keys(FoldKeysFun, - [], - [], - State), - lists:sort(Keys1) - end), - ?_assertEqual({ok, [<<"k1">>]}, - Backend:fold_keys(FoldKeysFun1, - [], - [{bucket, <<"b1">>}], - State)), - ?_assertEqual([<<"k2">>], - Backend:fold_keys(FoldKeysFun1, - [], - [{bucket, <<"b2">>}], - State)), - ?_assertEqual({ok, [<<"k1">>]}, - Backend:fold_keys(FoldKeysFun2, [], [], State)), - ?_assertEqual({ok, [<<"k1">>]}, - Backend:fold_keys(FoldKeysFun2, - [], - [{bucket, <<"b1">>}], - State)), - ?_assertEqual({ok, [<<"k2">>]}, - Backend:fold_keys(FoldKeysFun3, [], [], State)), - ?_assertEqual({ok, []}, - Backend:fold_keys(FoldKeysFun3, - [], - [{bucket, <<"b1">>}], - State)) - ] - end - }. - -delete_object({Backend, State}) -> - {"object deletion test", - fun() -> - [ - ?_assertMatch({ok, _}, Backend:delete(<<"b2">>, <<"k2">>, State)), - ?_assertMatch({error, not_found, _}, - Backend:get(<<"b2">>, <<"k2">>, State)) - ] - end - }. - -fold_objects({Backend, State}) -> - {"object folding test", - fun() -> - FoldKeysFun = - fun(Bucket, Key, Acc) -> - [{Bucket, Key} | Acc] - end, - FoldObjectsFun = - fun(Bucket, Key, Value, Acc) -> - [{{Bucket, Key}, Value} | Acc] - end, - [ - ?_assertEqual([{<<"b1">>, <<"k1">>}], - begin - {ok, Keys} = - Backend:fold_keys(FoldKeysFun, - [], - [], - State), - lists:sort(Keys) - end), - - ?_assertEqual([{{<<"b1">>,<<"k1">>}, <<"v1">>}], - begin - {ok, Objects1} = - Backend:fold_objects(FoldObjectsFun, - [], - [], - State), - lists:sort(Objects1) - end), - ?_assertMatch({ok, _}, - Backend:put(<<"b3">>, <<"k3">>, [], <<"v3">>, State)), - ?_assertEqual([{{<<"b1">>,<<"k1">>},<<"v1">>}, - {{<<"b3">>,<<"k3">>},<<"v3">>}], - begin - {ok, Objects} = - Backend:fold_objects(FoldObjectsFun, - [], - [], - State), - lists:sort(Objects) - end) - ] - end - }. - -empty_check({Backend, State}) -> - {"is_empty test", - fun() -> - [ - ?_assertEqual(false, Backend:is_empty(State)), - ?_assertMatch({ok, _}, Backend:delete(<<"b1">>,<<"k1">>, State)), - ?_assertMatch({ok, _}, Backend:delete(<<"b3">>,<<"k3">>, State)), - ?_assertEqual(true, Backend:is_empty(State)) - ] - end - }. - -setup({BackendMod, Config}) -> - application:start(lager), - application:start(sasl), - application:start(os_mon), - {ok, S} = BackendMod:start(42, Config), - {BackendMod, S}. - -cleanup({BackendMod, S}) -> - ok = BackendMod:stop(S), - application:stop(lager), - application:stop(sasl), - application:stop(os_mon). - --endif. % TEST From b002294c4e706fe8b1eb3d5d70ff9a89509d2960 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Fri, 31 May 2013 20:32:02 -0400 Subject: [PATCH 05/30] WIP: all tests (but drop) passing again, need to fix that and valgrind next. 
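
For orientation: the wterl.c changes in this patch key the cache of reusable
session/cursor contexts on a 64-bit signature folded from the session config
string and each (uri, cursor config) pair a caller needs. A rough sketch of
that idea follows, with simplified names, an array in place of the real
code's va_args, and an illustrative hash/mix standing in for wterl.c's
__str_hash()/__zi():

    #include <stdint.h>

    /* Illustrative string hash (a stand-in for wterl.c's __str_hash()). */
    static uint32_t str_hash(const char *s)
    {
        uint32_t h = (s && *s) ? (uint32_t)*s : 0;
        if (h) for (++s; *s; ++s) h = (h << 5) - h + (uint32_t)*s;
        return h;
    }

    /* Fold the session config plus every (uri, cursor config) pair into a
     * single signature.  Two requests that would open an identical
     * WT_SESSION and cursor set produce the same signature, so a worker
     * can reuse any cached context whose signature matches rather than
     * opening fresh handles. */
    static uint64_t cache_sig(const char *session_config,
                              const char *pairs[][2], int count)
    {
        int i;
        uint64_t sig = session_config ? str_hash(session_config) : 0;
        for (i = 0; i < count; i++) {
            sig = (sig << 1) ^ str_hash(pairs[i][0]);  /* uri */
            sig = (sig << 1) ^ str_hash(pairs[i][1]);  /* cursor config */
        }
        return sig;
    }

The get/put/delete paths below all retain their context with the same
"overwrite,raw" cursor config, so for any given table they share one
signature and can reuse one another's cached session/cursor pairs.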
--- c_src/kbtree.h | 158 ++++++++- c_src/wterl.c | 274 ++++++++------- src/riak_kv_wterl_backend.erl | 613 ++++++++++++++++++++++++++++++++++ src/temp_riak_kv_backend.erl | 287 ++++++++++++++++ 4 files changed, 1202 insertions(+), 130 deletions(-) create mode 100644 src/riak_kv_wterl_backend.erl create mode 100644 src/temp_riak_kv_backend.erl diff --git a/c_src/kbtree.h b/c_src/kbtree.h index f628d66..09184cc 100644 --- a/c_src/kbtree.h +++ b/c_src/kbtree.h @@ -1,12 +1,11 @@ /*- * Copyright 1997-1999, 2001, John-Mark Gurney. - * 2008, Attractive Chaos - * - * All rights reserved. + * 2008-2009, Attractive Chaos * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: + * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright @@ -26,9 +25,6 @@ * SUCH DAMAGE. */ -/* Reference: http://attractivechaos.awardspace.com/kbtree.h - http://attractivechaos.awardspace.com/kbtree.h.html */ - #ifndef __AC_KBTREE_H #define __AC_KBTREE_H @@ -56,14 +52,14 @@ typedef struct { { \ kbtree_##name##_t *b; \ b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \ - b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) / 2; \ + b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \ if (b->t < 2) { \ free(b); return 0; \ } \ b->n = 2 * b->t - 1; \ b->off_ptr = 4 + b->n * sizeof(key_t); \ - b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) / 4 * 4; \ - b->elen = (b->off_ptr + 3) / 4 * 4; \ + b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \ + b->elen = (b->off_ptr + 3) >> 2 << 2; \ b->root = (kbnode_t*)calloc(1, b->ilen); \ ++b->n_nodes; \ return b; \ @@ -71,7 +67,7 @@ typedef struct { #define __kb_destroy(b) do { \ int i, max = 8; \ - kbnode_t *x, **top, **stack; \ + kbnode_t *x, **top, **stack = 0; \ if (b) { \ top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \ *top++ = (b)->root; \ @@ -93,10 +89,17 @@ typedef struct { free(b); free(stack); \ } while (0) +#define __kb_get_first(key_t, b, ret) do { \ + kbnode_t *__x = (b)->root; \ + while (__KB_PTR(b, __x)[0] != 0) \ + __x = __KB_PTR(b, __x)[0]; \ + (ret) = __KB_KEY(key_t, __x)[0]; \ + } while (0) + #define __KB_GET_AUX0(name, key_t, __cmp) \ static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ { \ - int tr, *rr, begin, end, n = x->n / 2; \ + int tr, *rr, begin, end, n = x->n >> 1; \ if (x->n == 0) return -1; \ if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \ begin = 0; end = n; \ @@ -114,7 +117,7 @@ typedef struct { if (x->n == 0) return -1; \ rr = r? 
r : &tr; \ while (begin < end) { \ - int mid = (begin + end) / 2; \ + int mid = (begin + end) >> 1; \ if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \ else end = mid; \ } \ @@ -189,7 +192,7 @@ typedef struct { i = __kb_getp_aux_##name(x, k, 0); \ if (i != x->n - 1) \ memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ - __KB_KEY(key_t, x)[i + 1] = (key_t)*k; \ + __KB_KEY(key_t, x)[i + 1] = *k; \ ++x->n; \ } else { \ i = __kb_getp_aux_##name(x, k, 0) + 1; \ @@ -227,7 +230,7 @@ typedef struct { int yn, zn, i, r = 0; \ kbnode_t *xp, *y, *z; \ key_t kp; \ - if (x == 0) return (key_t)*k; \ + if (x == 0) return *k; \ if (s) { /* s can only be 0, 1 or 2 */ \ r = x->is_internal == 0? 0 : s == 1? 1 : -1; \ i = s == 1? x->n - 1 : -1; \ @@ -252,7 +255,7 @@ typedef struct { return kp; \ } else if (yn == b->t - 1 && zn == b->t - 1) { \ y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \ - __KB_KEY(key_t, y)[y->n++] = (key_t)*k; \ + __KB_KEY(key_t, y)[y->n++] = *k; \ memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \ if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \ y->n += z->n; \ @@ -375,7 +378,130 @@ typedef struct { #define kb_size(b) ((b)->n_keys) -#define kb_generic_cmp(a, b) (((a) > (b)) - ((a) < (b))) +#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b))) #define kb_str_cmp(a, b) strcmp(a, b) #endif + +#ifdef TEST +#include +#include +#include +#include +#include +#include + +typedef const char *str_t; + +#include "kbtree.h" + +typedef struct { + unsigned key; + char *value; +} intmap_t; + +#define __intcmp(a, b) (((a).key > (b).key) - ((a).key < (b).key)) + +KBTREE_INIT(int, uint32_t, kb_generic_cmp) +KBTREE_INIT(str, str_t, kb_str_cmp) +KBTREE_INIT(intmap, intmap_t, __intcmp); + +static int data_size = 5000000; +static unsigned *int_data; +static char **str_data; +static intmap_t *intmap_data; + +void kb_init_data() +{ + int i; + char buf[256]; + printf("--- generating data... "); + srand48(11); + int_data = (unsigned*)calloc(data_size, sizeof(unsigned)); + str_data = (char**)calloc(data_size, sizeof(char*)); + intmap_data = (intmap_t*)calloc(data_size, sizeof(intmap_t)); + for (i = 0; i < data_size; ++i) { + int_data[i] = (unsigned)(data_size * drand48() / 4) * 271828183u; + sprintf(buf, "%x", int_data[i]); + str_data[i] = strdup(buf); + intmap_data[i].key = i; + intmap_data[i].value = str_data[i]; + } + printf("done!\n"); +} +void kb_destroy_data() +{ + int i; + for (i = 0; i < data_size; ++i) free(str_data[i]); + free(str_data); free(int_data); +} +void kb_tree_intmap() +{ + int i; + intmap_t *data = intmap_data; + kbtree_t(intmap) *h; + h = kb_init(intmap, KB_DEFAULT_SIZE); + for (i = 0; i < data_size; ++i) { + if (kb_get(intmap, h, data[i]) == 0) kb_put(intmap, h, data[i]); + else kb_del(intmap, h, data[i]); + } + printf("[kb_tree_intmap] size: %d\n", kb_size(h)); + __kb_destroy(h); +} +void kb_tree_int() +{ + int i; + unsigned *data = int_data; + uint32_t *l, *u; + kbtree_t(int) *h; + + h = kb_init(int, KB_DEFAULT_SIZE); + for (i = 0; i < data_size; ++i) { + if (kb_get(int, h, data[i]) == 0) kb_put(int, h, data[i]); + else kb_del(int, h, data[i]); + } + printf("[kb_tree_int] size: %d\n", kb_size(h)); + if (1) { + int cnt = 0; + uint32_t x, y; + kb_interval(int, h, 2174625464u, &l, &u); + printf("interval for 2174625464: (%u, %u)\n", l? *l : 0, u? 
*u : 0); +#define traverse_f(p) { if (cnt == 0) y = *p; ++cnt; } + __kb_traverse(uint32_t, h, traverse_f); + __kb_get_first(uint32_t, h, x); + printf("# of elements from traversal: %d\n", cnt); + printf("first element: %d == %d\n", x, y); + } + __kb_destroy(h); +} +void kb_tree_str() +{ + int i; + char **data = str_data; + kbtree_t(str) *h; + + h = kb_init(str, KB_DEFAULT_SIZE); + for (i = 0; i < data_size; ++i) { + if (kb_get(str, h, data[i]) == 0) kb_put(str, h, data[i]); + else kb_del(str, h, data[i]); + } + printf("[kb_tree_int] size: %d\n", kb_size(h)); + __kb_destroy(h); +} +void kb_timing(void (*f)(void)) +{ + clock_t t = clock(); + (*f)(); + printf("[kb_timing] %.3lf sec\n", (double)(clock() - t) / CLOCKS_PER_SEC); +} +int main(int argc, char *argv[]) +{ + if (argc > 1) data_size = atoi(argv[1]); + kb_init_data(); + kb_timing(kb_tree_int); + kb_timing(kb_tree_str); + kb_timing(kb_tree_intmap); + kb_destroy_data(); + return 0; +} +#endif diff --git a/c_src/wterl.c b/c_src/wterl.c index eba372d..81fcd7e 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -31,40 +31,37 @@ #include "async_nif.h" #include "kbtree.h" #include "queue.h" -#include "fifo_q.h" static ErlNifResourceType *wterl_conn_RESOURCE; -static ErlNifResourceType *wterl_ctx_RESOURCE; static ErlNifResourceType *wterl_cursor_RESOURCE; /* WiredTiger object names*/ typedef char Uri[128]; struct wterl_ctx { - WT_SESSION *session; // open session - WT_CURSOR **cursors; // open cursors, all reset ready to reuse + SLIST_ENTRY(wterl_ctx) entries; uint64_t sig; + uint64_t tstamp; + WT_SESSION *session; + WT_CURSOR *cursors[]; // Note: must be last in struct }; struct cache_entry { - struct wterl_ctx *ctx; uint64_t sig; - uint64_t tstamp; + SLIST_HEAD(ctxs, wterl_ctx) contexts; }; -#define __ctx_sig_cmp(a, b) ((((a)->sig) > ((b)->sig)) - (((a)->sig) < ((b)->sig))) -KBTREE_INIT(cache_entries, struct cache_entry*, __ctx_sig_cmp); -DECL_FIFO_QUEUE(cache_entries, struct cache_entry); +#define __ctx_sig_cmp(a, b) (((a).sig > (b).sig) - ((a).sig < (b).sig)) +KBTREE_INIT(cache_entries, struct cache_entry, __ctx_sig_cmp); typedef struct wterl_conn { WT_CONNECTION *conn; const char *session_config; ErlNifMutex *cache_mutex; kbtree_t(cache_entries) *cache; - fifo_t(cache_entries) recycled_cache_entries; SLIST_ENTRY(wterl_conn) conns; uint64_t histogram[64]; - uint64_t hits, misses; + uint64_t histogram_count; } WterlConnHandle; typedef struct { @@ -158,9 +155,9 @@ static inline uint32_t __log2(uint64_t x) { * -> 0 = no/false, anything else is true */ static int -__ctx_cache_full(WterlConnHandle *conn) +__ctx_cache_full(WterlConnHandle *conn_handle) { - return fifo_q_full(cache_entries, conn->recycled_cache_entries); + return kb_size(conn_handle->cache) == ASYNC_NIF_MAX_WORKERS; // TODO: } /** @@ -174,38 +171,72 @@ __ctx_cache_full(WterlConnHandle *conn) static int __ctx_cache_evict(WterlConnHandle *conn_handle) { - uint32_t i; - uint64_t mean, now = cpu_clock_ticks(); + uint32_t num_evicted, i; + uint64_t mean, now; + struct wterl_ctx *to_free[ASYNC_NIF_MAX_WORKERS]; + + now = cpu_clock_ticks(); kbtree_t(cache_entries) *t = conn_handle->cache; // Find the mean of the recorded times that items stayed in cache. 
+ mean = 0; for (i = 0; i < 64; i++) mean += (conn_handle->histogram[i] * i); if (mean > 0) - mean /= conn_handle->hits; + mean /= conn_handle->histogram_count; // Clear out the histogram and hit/misses memset(conn_handle->histogram, 0, sizeof(uint64_t) * 64); - conn_handle->hits = 0; - conn_handle->misses = 0; + conn_handle->histogram_count = 0; - // Evict anything older than the mean time in queue. - i = 0; + /* + * Evict anything older than the mean time in queue by removing those + * items from the lists at the leaf nodes of the tree. + */ + num_evicted = 0; #define traverse_f(p) \ { \ - struct cache_entry *e = *p; \ - uint64_t elapsed = e->tstamp - now; \ - if (__log2(elapsed) > mean) { \ - kb_del(cache_entries, t, e); \ - e->ctx->session->close(e->ctx->session, NULL); \ - enif_free(e->ctx); \ - fifo_q_put(cache_entries, conn_handle->recycled_cache_entries, e); \ - i++; \ + struct cache_entry *e; \ + struct wterl_ctx *c; \ + e = (struct cache_entry *)p; \ + SLIST_FOREACH(c, &e->contexts, entries) { \ + uint64_t elapsed = c->tstamp - now; \ + uint32_t log = __log2(elapsed); \ + if (log > mean) { \ + SLIST_REMOVE(&e->contexts, c, wterl_ctx, entries); \ + c->session->close(c->session, NULL); \ + to_free[num_evicted] = c; \ + num_evicted++; \ + } \ } \ } - __kb_traverse(struct cache_entry *, t, traverse_f); + __kb_traverse(struct cache_entry, t, traverse_f); #undef traverse_f - return i; + + /* + * Free up the wterl_ctx we've removed after finishing the loop. + */ + for (i = 0; i < num_evicted; i++) { + enif_free(to_free[i]); + } + + /* + * Walk the tree again looking for empty lists to prune from the + * tree. + */ +#define traverse_f(p) \ + { \ + struct cache_entry *e, query; \ + e = p; \ + query.sig = e->sig; \ + if (SLIST_EMPTY(&e->contexts)) { \ + kb_del(cache_entries, t, query); \ + } \ + } + __kb_traverse(struct cache_entry, t, traverse_f); +#undef traverse_f + + return num_evicted; } /** @@ -220,24 +251,23 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) static struct wterl_ctx * __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig) { - struct wterl_ctx *p = NULL; - struct cache_entry key, *e; + struct wterl_ctx *c = NULL; + struct cache_entry query, *result; - key.sig = sig; - e = *kb_get(cache_entries, conn_handle->cache, &key); - if (e) { - // cache hit, remove it from the tree - uint64_t elapsed = cpu_clock_ticks() - e->tstamp; - kb_del(cache_entries, conn_handle->cache, &key); - p = e->ctx; - memset(e, 0, sizeof(struct cache_entry)); - fifo_q_put(cache_entries, conn_handle->recycled_cache_entries, e); - conn_handle->hits++; - conn_handle->histogram[__log2(elapsed)]++; - } else { - conn_handle->misses++; - } - return p; + query.sig = sig; + result = kb_get(cache_entries, conn_handle->cache, query); + if (result && !SLIST_EMPTY(&result->contexts)) { + /* + * cache hit: + * remove a context from the list in the tree node + */ + c = SLIST_FIRST(&result->contexts); + SLIST_REMOVE_HEAD(&result->contexts, entries); + uint64_t elapsed = cpu_clock_ticks() - c->tstamp; + conn_handle->histogram[__log2(elapsed)]++; + conn_handle->histogram_count++; + } // else { cache miss + return c; } /** @@ -246,20 +276,29 @@ __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig) * Return an item into the cache, reset the cursors it has open and put it at * the front of the LRU. 
*/ -static int +static void __ctx_cache_add(WterlConnHandle *conn, struct wterl_ctx *c) { - struct cache_entry *e; + struct cache_entry query, *result; + /* + * Check to see if the cache is full and if so trigger eviction which will + * remove the least-recently-used half of the items from the cache. + */ if (__ctx_cache_full(conn)) __ctx_cache_evict(conn); - e = fifo_q_get(cache_entries, conn->recycled_cache_entries); - e->ctx = c; - e->sig = c->sig; - e->tstamp = cpu_clock_ticks(); - kb_put(cache_entries, conn->cache, e); - return 0; + c->tstamp = cpu_clock_ticks(); + + query.sig = c->sig; + result = kb_get(cache_entries, conn->cache, query); + if (result == NULL) { + SLIST_INIT(&query.contexts); // TODO: should this be on the heap? + SLIST_INSERT_HEAD(&query.contexts, c, entries); + kb_put(cache_entries, conn->cache, query); + } else { + SLIST_INSERT_HEAD(&result->contexts, c, entries); + } } /** @@ -322,42 +361,25 @@ __zi(uint32_t p, uint32_t q) * cursor config pair * -> number of variable arguments processed */ -static int -__ctx_cache_sig_(const char *c, va_list ap, uint64_t *h) +static uint64_t +__ctx_cache_sig(const char *c, va_list ap, int count) { int i = 0; - - if (NULL == c) - return 0; - - *h = __str_hash(c); - - while (*c) { - *h = __zi((uint32_t)(*h & 0xFFFFFFFF), __str_hash(va_arg(ap, const char *))); - *h <<= 1; - i++; - } - return i; -} - -#if 0 -static uint64_t -__ctx_cache_sig(const char *c, ...) -{ - int i; - va_list ap; uint64_t h; + const char *arg; - if (NULL == c) - return 0; + if (c) + h = __str_hash(c); + else + h = 0; - va_start(ap, c); - i = __ctx_cache_sig_(c, ap, &h); - va_end (ap); - - return i; + for (i = 0; i < (2 * count); i++) { + arg = va_arg(ap, const char *); + if (arg) h = __zi((uint32_t)(h & 0xFFFFFFFF), __str_hash(arg)); + else h = __zi((uint32_t)(h & 0xFFFFFFFF), 0); + } + return h; } -#endif /** * Get a reusable cursor that was opened for a particular worker within its @@ -365,17 +387,17 @@ __ctx_cache_sig(const char *c, ...) */ static int __retain_ctx(WterlConnHandle *conn_handle, struct wterl_ctx **ctx, - const char *session_config, ...) + int count, const char *session_config, ...) 
{ - int i, count; + int i = 0; va_list ap; uint64_t sig; - const char *c; + const char *arg; - c = session_config; + arg = session_config; va_start(ap, session_config); - count = __ctx_cache_sig_(session_config, ap, &sig); - va_end (ap); + sig = __ctx_cache_sig(session_config, ap, count); + va_end(ap); enif_mutex_lock(conn_handle->cache_mutex); (*ctx) = __ctx_cache_find(conn_handle, sig); @@ -386,9 +408,8 @@ __retain_ctx(WterlConnHandle *conn_handle, struct wterl_ctx **ctx, int rc = conn->open_session(conn, NULL, session_config, &session); if (rc != 0) return rc; - size_t s = sizeof(struct wterl_ctx) + ((count / 2) * sizeof(WT_CURSOR*)); - // TODO: *ctx = enif_alloc_resource(wterl_ctx_RESOURCE, s); - *ctx = enif_alloc(s); + size_t s = sizeof(struct wterl_ctx) + (count * sizeof(WT_CURSOR*)); + *ctx = enif_alloc(s); // TODO: enif_alloc_resource() if (*ctx == NULL) { session->close(session, NULL); return ENOMEM; @@ -396,14 +417,13 @@ __retain_ctx(WterlConnHandle *conn_handle, struct wterl_ctx **ctx, memset(*ctx, 0, s); (*ctx)->sig = sig; (*ctx)->session = session; - WT_CURSOR **cursors = (*ctx)->cursors; - session_config = c; + session_config = arg; va_start(ap, session_config); - for (i = 0; i < (count / 2); i++) { + for (i = 0; i < count; i++) { const char *uri = va_arg(ap, const char *); const char *config = va_arg(ap, const char *); // TODO: error when uri or config is NULL - rc = session->open_cursor(session, uri, NULL, config, &cursors[i]); + rc = session->open_cursor(session, uri, NULL, config, &(*ctx)->cursors[i]); if (rc != 0) { session->close(session, NULL); // this will free the cursors too return rc; @@ -424,7 +444,7 @@ __release_ctx(WterlConnHandle *conn_handle, struct wterl_ctx *ctx) int i, c; WT_CURSOR *cursor; - c = sizeof(ctx->cursors) / sizeof(ctx->cursors[0]); + c = sizeof((WT_CURSOR**)ctx->cursors) / sizeof(ctx->cursors[0]); for (i = 0; i < c; i++) { cursor = ctx->cursors[i]; cursor->reset(cursor); @@ -442,14 +462,46 @@ __release_ctx(WterlConnHandle *conn_handle, struct wterl_ctx *ctx) void __close_all_sessions(WterlConnHandle *conn_handle) { + int i, num_closed = 0; + struct wterl_ctx *to_free[ASYNC_NIF_MAX_WORKERS]; kbtree_t(cache_entries) *t = conn_handle->cache; -#define traverse_f(p) { \ - kb_del(cache_entries, t, *p); \ - enif_free(p); \ +#define traverse_f(p) \ + { \ + struct cache_entry *e; \ + struct wterl_ctx *c; \ + e = (struct cache_entry *)p; \ + SLIST_FOREACH(c, &e->contexts, entries) { \ + SLIST_REMOVE(&e->contexts, c, wterl_ctx, entries); \ + c->session->close(c->session, NULL); \ + to_free[num_closed] = c; \ + num_closed++; \ + } \ } __kb_traverse(struct cache_entry *, t, traverse_f); #undef traverse_f + + /* + * Free up the wterl_ctx we've removed after finishing the loop. + */ + for (i = 0; i < num_closed; i++) { + enif_free(to_free[i]); + } + + /* + * Walk the tree again to prune all the empty lists from the tree. + */ +#define traverse_f(p) \ + { \ + struct cache_entry *e, query; \ + e = (struct cache_entry *)p; \ + query.sig = e->sig; \ + if (SLIST_EMPTY(&e->contexts)) { \ + kb_del(cache_entries, t, query); \ + } \ + } + __kb_traverse(struct cache_entry, t, traverse_f); +#undef traverse_f } /** @@ -678,7 +730,6 @@ ASYNC_NIF_DECL( /* Init tree which manages the cache of session/cursor(s) */ conn_handle->cache = kb_init(cache_entries, ASYNC_NIF_MAX_WORKERS); // TODO: size - conn_handle->recycled_cache_entries = fifo_q_new(cache_entries, ASYNC_NIF_MAX_WORKERS); /* Keep track of open connections so as to free when unload/reload/etc. are called. 
*/ @@ -1374,7 +1425,7 @@ ASYNC_NIF_DECL( struct wterl_ctx *ctx = NULL; WT_CURSOR *cursor = NULL; - int rc = __retain_ctx(args->conn_handle, &ctx, + int rc = __retain_ctx(args->conn_handle, &ctx, 1, args->conn_handle->session_config, args->uri, "overwrite,raw"); if (rc != 0) { @@ -1433,7 +1484,7 @@ ASYNC_NIF_DECL( struct wterl_ctx *ctx = NULL; WT_CURSOR *cursor = NULL; - int rc = __retain_ctx(args->conn_handle, &ctx, + int rc = __retain_ctx(args->conn_handle, &ctx, 1, args->conn_handle->session_config, args->uri, "overwrite,raw"); if (rc != 0) { @@ -1515,7 +1566,7 @@ ASYNC_NIF_DECL( struct wterl_ctx *ctx = NULL; WT_CURSOR *cursor = NULL; - int rc = __retain_ctx(args->conn_handle, &ctx, + int rc = __retain_ctx(args->conn_handle, &ctx, 1, args->conn_handle->session_config, args->uri, "overwrite,raw"); if (rc != 0) { @@ -2252,8 +2303,6 @@ on_load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) NULL, flags, NULL); wterl_cursor_RESOURCE = enif_open_resource_type(env, NULL, "wterl_cursor_resource", NULL, flags, NULL); - wterl_ctx_RESOURCE = enif_open_resource_type(env, NULL, "wterl_ctx_resource", - NULL, flags, NULL); ATOM_ERROR = enif_make_atom(env, "error"); ATOM_OK = enif_make_atom(env, "ok"); @@ -2325,7 +2374,6 @@ on_unload(ErlNifEnv *env, void *priv_data) { struct wterl_priv_data *priv = (struct wterl_priv_data *)priv_data; WterlConnHandle *conn_handle; - struct cache_entry *e; enif_mutex_lock(priv->conns_mutex); @@ -2338,12 +2386,10 @@ on_unload(ErlNifEnv *env, void *priv_data) ASYNC_NIF_UNLOAD(wterl, env, priv->async_nif_priv); SLIST_FOREACH(conn_handle, &priv->conns, conns) { - fifo_q_foreach(cache_entries, conn_handle->recycled_cache_entries, e, { - struct wterl_ctx *ctx = e->ctx; - ctx->session->close(ctx->session, NULL); - }); - fifo_q_free(cache_entries, conn_handle->recycled_cache_entries); + __close_all_sessions(conn_handle); + conn_handle->conn->close(conn_handle->conn, NULL); kb_destroy(cache_entries, conn_handle->cache); + enif_free((void*)conn_handle->session_config); enif_mutex_unlock(conn_handle->cache_mutex); enif_mutex_destroy(conn_handle->cache_mutex); } diff --git a/src/riak_kv_wterl_backend.erl b/src/riak_kv_wterl_backend.erl new file mode 100644 index 0000000..313da29 --- /dev/null +++ b/src/riak_kv_wterl_backend.erl @@ -0,0 +1,613 @@ +%% ------------------------------------------------------------------- +%% +%% riak_kv_wterl_backend: WiredTiger Driver for Riak +%% +%% Copyright (c) 2012-2013 Basho Technologies, Inc. All Rights Reserved. +%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. +%% +%% ------------------------------------------------------------------- + +-module(riak_kv_wterl_backend). +-behavior(temp_riak_kv_backend). + +%% KV Backend API +-export([api_version/0, + capabilities/1, + capabilities/2, + start/2, + stop/1, + get/3, + put/5, + delete/4, + drop/1, + fold_buckets/4, + fold_keys/4, + fold_objects/4, + is_empty/1, + status/1, + callback/3]). + +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). 
+-compile(export_all).
+-endif.
+
+-define(API_VERSION, 1).
+%% TODO: for when this backend supports 2i
+%%-define(CAPABILITIES, [async_fold, indexes]).
+-define(CAPABILITIES, [async_fold]).
+
+-record(state, {table :: string(),
+                type :: string(),
+                connection :: wterl:connection(),
+                is_empty_cursor :: wterl:cursor(),
+                status_cursor :: wterl:cursor()}).
+
+-type state() :: #state{}.
+-type config() :: [{atom(), term()}].
+
+%% ===================================================================
+%% Public API
+%% ===================================================================
+
+%% @doc Return the major version of the
+%% current API.
+-spec api_version() -> {ok, integer()}.
+api_version() ->
+    {ok, ?API_VERSION}.
+
+%% @doc Return the capabilities of the backend.
+-spec capabilities(state()) -> {ok, [atom()]}.
+capabilities(_) ->
+    {ok, ?CAPABILITIES}.
+
+%% @doc Return the capabilities of the backend.
+-spec capabilities(riak_object:bucket(), state()) -> {ok, [atom()]}.
+capabilities(_, _) ->
+    {ok, ?CAPABILITIES}.
+
+%% @doc Start the wterl backend
+-spec start(integer(), config()) -> {ok, state()} | {error, term()}.
+start(Partition, Config) ->
+    AppStart =
+        case application:start(wterl) of
+            ok ->
+                ok;
+            {error, {already_started, _}} ->
+                ok;
+            {error, Reason1} ->
+                lager:error("Failed to start wterl: ~p", [Reason1]),
+                {error, Reason1}
+        end,
+    case AppStart of
+        ok ->
+            Type =
+                case wterl:config_value(type, Config, "lsm") of
+                    {type, "lsm"} -> "lsm";
+                    {type, "table"} -> "table";
+                    {type, "btree"} -> "table";
+                    {type, BadType} ->
+                        lager:info("wterl:start ignoring unknown type ~p, using lsm instead", [BadType]),
+                        "lsm";
+                    _ ->
+                        lager:info("wterl:start ignoring mistaken setting defaulting to lsm"),
+                        "lsm"
+                end,
+            {ok, Connection} = establish_connection(Config, Type),
+            Table = Type ++ ":" ++ integer_to_list(Partition),
+            Compressor =
+                case wterl:config_value(block_compressor, Config, "snappy") of
+                    {block_compressor, "snappy"}=C -> [C];
+                    {block_compressor, "none"} -> [];
+                    {block_compressor, none} -> [];
+                    {block_compressor, _} -> [{block_compressor, "snappy"}];
+                    _ -> [{block_compressor, "snappy"}]
+                end,
+            TableOpts =
+                case Type of
+                    "lsm" ->
+                        [{internal_page_max, "128K"},
+                         {leaf_page_max, "128K"},
+                         {lsm_chunk_size, "100MB"},
+                         {lsm_merge_threads, 2},
+                         {prefix_compression, false},
+                         {lsm_bloom_newest, true},
+                         {lsm_bloom_oldest, true},
+                         {lsm_bloom_bit_count, 128},
+                         {lsm_bloom_hash_count, 64},
+                         {lsm_bloom_config, [{leaf_page_max, "8MB"}]}
+                        ] ++ Compressor;
+                    "table" ->
+                        Compressor
+                end,
+            case wterl:create(Connection, Table, TableOpts) of
+                ok ->
+                    case establish_utility_cursors(Connection, Table) of
+                        {ok, IsEmptyCursor, StatusCursor} ->
+                            {ok, #state{table=Table, type=Type,
+                                        connection=Connection,
+                                        is_empty_cursor=IsEmptyCursor,
+                                        status_cursor=StatusCursor}};
+                        {error, Reason2} ->
+                            {error, Reason2}
+                    end;
+                {error, Reason3} ->
+                    {error, Reason3}
+            end
+    end.
+
+%% @doc Stop the wterl backend
+-spec stop(state()) -> ok.
+stop(_State) ->
+    ok. %% The connection is closed by wterl_conn:stop()
+
+%% @doc Retrieve an object from the wterl backend
+-spec get(riak_object:bucket(), riak_object:key(), state()) ->
+                 {ok, any(), state()} |
+                 {ok, not_found, state()} |
+                 {error, term(), state()}.
+get(Bucket, Key, #state{connection=Connection, table=Table}=State) ->
+    WTKey = to_object_key(Bucket, Key),
+    case wterl:get(Connection, Table, WTKey) of
+        {ok, Value} ->
+            {ok, Value, State};
+        not_found ->
+            {error, not_found, State};
+        {error, Reason} ->
+            {error, Reason, State}
+    end.
+
+%% @doc Insert an object into the wterl backend.
+%% NOTE: The wterl backend does not currently support
+%% secondary indexing and the _IndexSpecs parameter
+%% is ignored.
+-type index_spec() :: {add, Index, SecondaryKey} | {remove, Index, SecondaryKey}.
+-spec put(riak_object:bucket(), riak_object:key(), [index_spec()], binary(), state()) ->
+                 {ok, state()} |
+                 {error, term(), state()}.
+put(Bucket, PrimaryKey, _IndexSpecs, Val, #state{connection=Connection, table=Table}=State) ->
+    case wterl:put(Connection, Table, to_object_key(Bucket, PrimaryKey), Val) of
+        ok ->
+            {ok, State};
+        {error, Reason} ->
+            {error, Reason, State}
+    end.
+
+%% @doc Delete an object from the wterl backend
+%% NOTE: The wterl backend does not currently support
+%% secondary indexing and the _IndexSpecs parameter
+%% is ignored.
+-spec delete(riak_object:bucket(), riak_object:key(), [index_spec()], state()) ->
+                 {ok, state()} |
+                 {error, term(), state()}.
+delete(Bucket, Key, _IndexSpecs, #state{connection=Connection, table=Table}=State) ->
+    case wterl:delete(Connection, Table, to_object_key(Bucket, Key)) of
+        ok ->
+            {ok, State};
+        {error, Reason} ->
+            {error, Reason, State}
+    end.
+
+%% @doc Fold over all the buckets
+-spec fold_buckets(riak_kv_backend:fold_buckets_fun(),
+                   any(),
+                   [],
+                   state()) -> {ok, any()} | {async, fun()}.
+fold_buckets(FoldBucketsFun, Acc, Opts, #state{connection=Connection, table=Table}) ->
+    FoldFun = fold_buckets_fun(FoldBucketsFun),
+    BucketFolder =
+        fun() ->
+                case wterl:cursor_open(Connection, Table) of
+                    {error, {enoent, _Message}} ->
+                        Acc;
+                    {ok, Cursor} ->
+                        try
+                            {FoldResult, _} =
+                                wterl:fold_keys(Cursor, FoldFun, {Acc, []}),
+                            FoldResult
+                        catch
+                            {break, AccFinal} ->
+                                AccFinal
+                        after
+                            ok = wterl:cursor_close(Cursor)
+                        end
+                end
+        end,
+    case lists:member(async_fold, Opts) of
+        true ->
+            {async, BucketFolder};
+        false ->
+            {ok, BucketFolder()}
+    end.
+
+%% @doc Fold over all the keys for one or all buckets.
+-spec fold_keys(riak_kv_backend:fold_keys_fun(),
+                any(),
+                [{atom(), term()}],
+                state()) -> {ok, term()} | {async, fun()}.
+fold_keys(FoldKeysFun, Acc, Opts, #state{connection=Connection, table=Table}) ->
+    %% Figure out how we should limit the fold: by bucket, by
+    %% secondary index, or neither (fold across everything.)
+    Bucket = lists:keyfind(bucket, 1, Opts),
+    Index = lists:keyfind(index, 1, Opts),
+
+    %% Multiple limiters may exist. Take the most specific limiter.
+    Limiter =
+        if Index /= false -> Index;
+           Bucket /= false -> Bucket;
+           true -> undefined
+        end,
+
+    %% Set up the fold...
+    FoldFun = fold_keys_fun(FoldKeysFun, Limiter),
+    KeyFolder =
+        fun() ->
+                case wterl:cursor_open(Connection, Table) of
+                    {error, {enoent, _Message}} ->
+                        Acc;
+                    {ok, Cursor} ->
+                        try
+                            wterl:fold_keys(Cursor, FoldFun, Acc)
+                        catch
+                            {break, AccFinal} ->
+                                AccFinal
+                        after
+                            ok = wterl:cursor_close(Cursor)
+                        end
+                end
+        end,
+    case lists:member(async_fold, Opts) of
+        true ->
+            {async, KeyFolder};
+        false ->
+            {ok, KeyFolder()}
+    end.
+
+%% @doc Fold over all the objects for one or all buckets.
+-spec fold_objects(riak_kv_backend:fold_objects_fun(),
+                   any(),
+                   [{atom(), term()}],
+                   state()) -> {ok, any()} | {async, fun()}.
+fold_objects(FoldObjectsFun, Acc, Opts, #state{connection=Connection, table=Table}) ->
+    Bucket = proplists:get_value(bucket, Opts),
+    FoldFun = fold_objects_fun(FoldObjectsFun, Bucket),
+    ObjectFolder =
+        fun() ->
+                case wterl:cursor_open(Connection, Table) of
+                    {error, {enoent, _Message}} ->
+                        Acc;
+                    {ok, Cursor} ->
+                        try
+                            wterl:fold(Cursor, FoldFun, Acc)
+                        catch
+                            {break, AccFinal} ->
+                                AccFinal
+                        after
+                            case wterl:cursor_close(Cursor) of
+                                ok ->
+                                    ok;
+                                {error, {eperm, _}} -> %% TODO: review/fix
+                                    ok;
+                                {error, _}=E ->
+                                    E
+                            end
+                        end
+                end
+        end,
+    case lists:member(async_fold, Opts) of
+        true ->
+            {async, ObjectFolder};
+        false ->
+            {ok, ObjectFolder()}
+    end.
+
+%% @doc Delete all objects from this wterl backend
+-spec drop(state()) -> {ok, state()} | {error, term(), state()}.
+drop(#state{connection=Connection, table=Table}=State) ->
+    case wterl:drop(Connection, Table) of
+        ok ->
+            {ok, State};
+        {error, {ebusy, _}} -> %% TODO: review/fix
+            {ok, State};
+        Error ->
+            {error, Error, State}
+    end.
+
+%% @doc Returns true if this wterl backend contains any
+%% non-tombstone values; otherwise returns false.
+-spec is_empty(state()) -> boolean().
+is_empty(#state{is_empty_cursor=Cursor}) ->
+    wterl:cursor_reset(Cursor),
+    case wterl:cursor_next(Cursor) of
+        not_found -> true;
+        {error, {eperm, _}} -> false; % TODO: review/fix this logic
+        _ -> false
+    end.
+
+%% @doc Get the status information for this wterl backend
+-spec status(state()) -> [{atom(), term()}].
+status(#state{status_cursor=Cursor}) ->
+    wterl:cursor_reset(Cursor),
+    case fetch_status(Cursor) of
+        {ok, Stats} ->
+            Stats;
+        {error, {eperm, _}} -> % TODO: review/fix this logic
+            {ok, []};
+        _ ->
+            {ok, []}
+    end.
+
+%% @doc Register an asynchronous callback
+-spec callback(reference(), any(), state()) -> {ok, state()}.
+callback(_Ref, _Msg, State) ->
+    {ok, State}.
+
+
+%% ===================================================================
+%% Internal functions
+%% ===================================================================
+
+%% @private
+max_sessions(Config) ->
+    RingSize =
+        case app_helper:get_prop_or_env(ring_creation_size, Config, riak_core) of
+            undefined -> 1024;
+            Size -> Size
+        end,
+    Est = 100 * (RingSize * erlang:system_info(schedulers)), % TODO: review/fix this logic
+    case Est > 1000000000 of % Note: WiredTiger uses a signed int for this
+        true -> 1000000000;
+        false -> Est
+    end.
+
+%% @private
+establish_utility_cursors(Connection, Table) ->
+    case wterl:cursor_open(Connection, Table) of
+        {ok, IsEmptyCursor} ->
+            case wterl:cursor_open(Connection, "statistics:" ++ Table, [{statistics_fast, true}]) of
+                {ok, StatusCursor} ->
+                    {ok, IsEmptyCursor, StatusCursor};
+                {error, Reason1} ->
+                    {error, Reason1}
+            end;
+        {error, Reason2} ->
+            {error, Reason2}
+    end.
+
+%% @private
+establish_connection(Config, Type) ->
+    %% Get the data root directory
+    case app_helper:get_prop_or_env(data_root, Config, wterl) of
+        undefined ->
+            lager:error("Failed to create wterl dir: data_root is not set"),
+            {error, data_root_unset};
+        DataRoot ->
+            ok = filelib:ensure_dir(filename:join(DataRoot, "x")),
+
+            %% WT Connection Options:
+            %% NOTE: LSM auto-checkpoints, so we don't have to.
+ CheckpointSetting = + case Type =:= "lsm" of + true -> + []; + false -> + app_helper:get_prop_or_env(checkpoint, Config, wterl, [{wait, 10}]) + end, + RequestedCacheSize = app_helper:get_prop_or_env(cache_size, Config, wterl), + ConnectionOpts = + orddict:from_list( + [ wterl:config_value(create, Config, true), + wterl:config_value(sync, Config, false), + wterl:config_value(logging, Config, true), + wterl:config_value(transactional, Config, true), + wterl:config_value(session_max, Config, max_sessions(Config)), + wterl:config_value(cache_size, Config, size_cache(RequestedCacheSize)), + wterl:config_value(statistics_log, Config, [{wait, 300}]), % sec + wterl:config_value(verbose, Config, [ "salvage", "verify" + % Note: for some unknown reason, if you add these additional + % verbose flags Erlang SEGV's "size_object: bad tag for 0x80" + % no idea why... yet... you've been warned. + + %"block", "shared_cache", "reconcile", "evict", "lsm", + %"fileops", "read", "write", "readserver", "evictserver", + %"hazard", "mutex", "ckpt" + ]) ] ++ CheckpointSetting ++ proplists:get_value(wterl, Config, [])), % sec + + %% WT Session Options: + SessionOpts = [{isolation, "snapshot"}], + + case wterl_conn:open(DataRoot, ConnectionOpts, SessionOpts) of + {ok, Connection} -> + {ok, Connection}; + {error, Reason2} -> + lager:error("Failed to establish a WiredTiger connection, wterl backend unable to start: ~p\n", [Reason2]), + {error, Reason2} + end + end. + +%% @private +%% Return a function to fold over the buckets on this backend +fold_buckets_fun(FoldBucketsFun) -> + fun(BK, {Acc, LastBucket}) -> + case from_object_key(BK) of + {LastBucket, _} -> + {Acc, LastBucket}; + {Bucket, _} -> + {FoldBucketsFun(Bucket, Acc), Bucket}; + _ -> + throw({break, Acc}) + end + end. + +%% @private +%% Return a function to fold over keys on this backend +fold_keys_fun(FoldKeysFun, undefined) -> + %% Fold across everything... + fun(StorageKey, Acc) -> + case from_object_key(StorageKey) of + {Bucket, Key} -> + FoldKeysFun(Bucket, Key, Acc); + _ -> + throw({break, Acc}) + end + end; +fold_keys_fun(FoldKeysFun, {bucket, FilterBucket}) -> + %% Fold across a specific bucket... + fun(StorageKey, Acc) -> + case from_object_key(StorageKey) of + {Bucket, Key} when Bucket == FilterBucket -> + FoldKeysFun(Bucket, Key, Acc); + _ -> + throw({break, Acc}) + end + end; +fold_keys_fun(FoldKeysFun, {index, FilterBucket, {eq, <<"$bucket">>, _}}) -> + %% 2I exact match query on special $bucket field... + fold_keys_fun(FoldKeysFun, {bucket, FilterBucket}); +fold_keys_fun(FoldKeysFun, {index, FilterBucket, {eq, FilterField, FilterTerm}}) -> + %% Rewrite 2I exact match query as a range... + NewQuery = {range, FilterField, FilterTerm, FilterTerm}, + fold_keys_fun(FoldKeysFun, {index, FilterBucket, NewQuery}); +fold_keys_fun(FoldKeysFun, {index, FilterBucket, {range, <<"$key">>, StartKey, EndKey}}) -> + %% 2I range query on special $key field... + fun(StorageKey, Acc) -> + case from_object_key(StorageKey) of + {Bucket, Key} when FilterBucket == Bucket, + StartKey =< Key, + EndKey >= Key -> + FoldKeysFun(Bucket, Key, Acc); + _ -> + throw({break, Acc}) + end + end; +fold_keys_fun(FoldKeysFun, {index, FilterBucket, {range, FilterField, StartTerm, EndTerm}}) -> + %% 2I range query... 
+ fun(StorageKey, Acc) -> + case from_index_key(StorageKey) of + {Bucket, Key, Field, Term} when FilterBucket == Bucket, + FilterField == Field, + StartTerm =< Term, + EndTerm >= Term -> + FoldKeysFun(Bucket, Key, Acc); + _ -> + throw({break, Acc}) + end + end; +fold_keys_fun(_FoldKeysFun, Other) -> + throw({unknown_limiter, Other}). + +%% @private +%% Return a function to fold over the objects on this backend +fold_objects_fun(FoldObjectsFun, FilterBucket) -> + %% 2I does not support fold objects at this time, so this is much + %% simpler than fold_keys_fun. + fun({StorageKey, Value}, Acc) -> + case from_object_key(StorageKey) of + {Bucket, Key} when FilterBucket == undefined; + Bucket == FilterBucket -> + FoldObjectsFun(Bucket, Key, Value, Acc); + _ -> + throw({break, Acc}) + end + end. + +to_object_key(Bucket, Key) -> + sext:encode({o, Bucket, Key}). + +from_object_key(LKey) -> + case sext:decode(LKey) of + {o, Bucket, Key} -> + {Bucket, Key}; + _ -> + undefined + end. + +from_index_key(LKey) -> + case sext:decode(LKey) of + {i, Bucket, Field, Term, Key} -> + {Bucket, Key, Field, Term}; + _ -> + undefined + end. + +%% @private +%% Return all status from wterl statistics cursor +fetch_status(Cursor) -> + {ok, fetch_status(Cursor, wterl:cursor_next_value(Cursor), [])}. +fetch_status(_Cursor, {error, _}, Acc) -> + lists:reverse(Acc); +fetch_status(_Cursor, not_found, Acc) -> + lists:reverse(Acc); +fetch_status(Cursor, {ok, Stat}, Acc) -> + [What,Val|_] = [binary_to_list(B) || B <- binary:split(Stat, [<<0>>], [global])], + fetch_status(Cursor, wterl:cursor_next_value(Cursor), [{What,Val}|Acc]). + +size_cache(RequestedSize) -> + Size = + case RequestedSize of + undefined -> + RunningApps = application:which_applications(), + FinalGuess = + case proplists:is_defined(sasl, RunningApps) andalso + proplists:is_defined(os_mon, RunningApps) of + true -> + Memory = memsup:get_system_memory_data(), + TotalRAM = proplists:get_value(system_total_memory, Memory), + FreeRAM = proplists:get_value(free_memory, Memory), + UsedByBeam = proplists:get_value(total, erlang:memory()), + Target = ((TotalRAM - UsedByBeam) div 4), + FirstGuess = (Target - (Target rem (1024 * 1024))), + SecondGuess = + case FirstGuess > FreeRAM of + true -> FreeRAM - (FreeRAM rem (1024 * 1024)); + _ -> FirstGuess + end, + case SecondGuess < 1073741824 of %% < 1GB? + true -> "1GB"; + false -> + ThirdGuess = SecondGuess div (1024 * 1024), + integer_to_list(ThirdGuess) ++ "MB" + end; + false -> + "1GB" + end, + application:set_env(wterl, cache_size, FinalGuess), + FinalGuess; + Value when is_list(Value) -> + Value; + Value when is_number(Value) -> + integer_to_list(Value) + end, + Size. + +%% =================================================================== +%% EUnit tests +%% =================================================================== +-ifdef(TEST). + +simple_test_() -> + {ok, CWD} = file:get_cwd(), + rmdir:path(filename:join([CWD, "test/wterl-backend"])), %?assertCmd("rm -rf test/wterl-backend"), + application:set_env(wterl, data_root, "test/wterl-backend"), + temp_riak_kv_backend:standard_test(?MODULE, []). + +custom_config_test_() -> + {ok, CWD} = file:get_cwd(), + rmdir:path(filename:join([CWD, "test/wterl-backend"])), %?assertCmd("rm -rf test/wterl-backend"), + application:set_env(wterl, data_root, ""), + temp_riak_kv_backend:standard_test(?MODULE, [{data_root, "test/wterl-backend"}]). + +-endif. 
diff --git a/src/temp_riak_kv_backend.erl b/src/temp_riak_kv_backend.erl new file mode 100644 index 0000000..c41a38d --- /dev/null +++ b/src/temp_riak_kv_backend.erl @@ -0,0 +1,287 @@ +%% ------------------------------------------------------------------- +%% +%% riak_kv_backend: Riak backend behaviour +%% +%% Copyright (c) 2007-2010 Basho Technologies, Inc. All Rights Reserved. +%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. +%% +%% ------------------------------------------------------------------- + +%%% NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE +%%% +%%% This is a temporary copy of riak_kv_backend, just here to keep +%%% wterl development private for now. When riak_kv_wterl_backend is +%%% moved to riak_kv, delete this file. +%%% +%%% NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE NOTE + + +-module(temp_riak_kv_backend). + +-export([behaviour_info/1]). +-export([callback_after/3]). + +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). +-compile(export_all). +-export([standard_test/2]). +-endif. + +-type fold_buckets_fun() :: fun((binary(), any()) -> any() | no_return()). +-type fold_keys_fun() :: fun((binary(), binary(), any()) -> any() | + no_return()). +-type fold_objects_fun() :: fun((binary(), binary(), term(), any()) -> + any() | + no_return()). +-export_type([fold_buckets_fun/0, + fold_keys_fun/0, + fold_objects_fun/0]). + +-spec behaviour_info(atom()) -> 'undefined' | [{atom(), arity()}]. +behaviour_info(callbacks) -> + [ + {api_version,0}, + {capabilities, 1}, % (State) + {capabilities, 2}, % (Bucket, State) + {start,2}, % (Partition, Config) + {stop,1}, % (State) + {get,3}, % (Bucket, Key, State) + {put,5}, % (Bucket, Key, IndexSpecs, Val, State) + {delete,4}, % (Bucket, Key, IndexSpecs, State) + {drop,1}, % (State) + {fold_buckets,4}, % (FoldBucketsFun, Acc, Opts, State), + % FoldBucketsFun(Bucket, Acc) + {fold_keys,4}, % (FoldKeysFun, Acc, Opts, State), + % FoldKeysFun(Bucket, Key, Acc) + {fold_objects,4}, % (FoldObjectsFun, Acc, Opts, State), + % FoldObjectsFun(Bucket, Key, Object, Acc) + {is_empty,1}, % (State) + {status,1}, % (State) + {callback,3}]; % (Ref, Msg, State) -> +behaviour_info(_Other) -> + undefined. + +%% Queue a callback for the backend after Time ms. +-spec callback_after(integer(), reference(), term()) -> reference(). +callback_after(Time, Ref, Msg) when is_integer(Time), is_reference(Ref) -> + riak_core_vnode:send_command_after(Time, {backend_callback, Ref, Msg}). + +-ifdef(TEST). + +standard_test(BackendMod, Config) -> + {spawn, + [ + {setup, + fun() -> ?MODULE:setup({BackendMod, Config}) end, + fun ?MODULE:cleanup/1, + fun(X) -> + [?MODULE:basic_store_and_fetch(X), + ?MODULE:fold_buckets(X), + ?MODULE:fold_keys(X), + ?MODULE:delete_object(X), + ?MODULE:fold_objects(X), + ?MODULE:empty_check(X) + ] + end + }]}. 
+ +basic_store_and_fetch({Backend, State}) -> + {"basic store and fetch test", + fun() -> + [ + ?_assertMatch({ok, _}, + Backend:put(<<"b1">>, <<"k1">>, [], <<"v1">>, State)), + ?_assertMatch({ok, _}, + Backend:put(<<"b2">>, <<"k2">>, [], <<"v2">>, State)), + ?_assertMatch({ok,<<"v2">>, _}, + Backend:get(<<"b2">>, <<"k2">>, State)), + ?_assertMatch({error, not_found, _}, + Backend:get(<<"b1">>, <<"k3">>, State)) + ] + end + }. + +fold_buckets({Backend, State}) -> + {"bucket folding test", + fun() -> + FoldBucketsFun = + fun(Bucket, Acc) -> + [Bucket | Acc] + end, + + ?_assertEqual([<<"b1">>, <<"b2">>], + begin + {ok, Buckets1} = + Backend:fold_buckets(FoldBucketsFun, + [], + [], + State), + lists:sort(Buckets1) + end) + end + }. + +fold_keys({Backend, State}) -> + {"key folding test", + fun() -> + FoldKeysFun = + fun(Bucket, Key, Acc) -> + [{Bucket, Key} | Acc] + end, + FoldKeysFun1 = + fun(_Bucket, Key, Acc) -> + [Key | Acc] + end, + FoldKeysFun2 = + fun(Bucket, Key, Acc) -> + case Bucket =:= <<"b1">> of + true -> + [Key | Acc]; + false -> + Acc + end + end, + FoldKeysFun3 = + fun(Bucket, Key, Acc) -> + case Bucket =:= <<"b1">> of + true -> + Acc; + false -> + [Key | Acc] + end + end, + [ + ?_assertEqual([{<<"b1">>, <<"k1">>}, {<<"b2">>, <<"k2">>}], + begin + {ok, Keys1} = + Backend:fold_keys(FoldKeysFun, + [], + [], + State), + lists:sort(Keys1) + end), + ?_assertEqual({ok, [<<"k1">>]}, + Backend:fold_keys(FoldKeysFun1, + [], + [{bucket, <<"b1">>}], + State)), + ?_assertEqual([<<"k2">>], + Backend:fold_keys(FoldKeysFun1, + [], + [{bucket, <<"b2">>}], + State)), + ?_assertEqual({ok, [<<"k1">>]}, + Backend:fold_keys(FoldKeysFun2, [], [], State)), + ?_assertEqual({ok, [<<"k1">>]}, + Backend:fold_keys(FoldKeysFun2, + [], + [{bucket, <<"b1">>}], + State)), + ?_assertEqual({ok, [<<"k2">>]}, + Backend:fold_keys(FoldKeysFun3, [], [], State)), + ?_assertEqual({ok, []}, + Backend:fold_keys(FoldKeysFun3, + [], + [{bucket, <<"b1">>}], + State)) + ] + end + }. + +delete_object({Backend, State}) -> + {"object deletion test", + fun() -> + [ + ?_assertMatch({ok, _}, Backend:delete(<<"b2">>, <<"k2">>, State)), + ?_assertMatch({error, not_found, _}, + Backend:get(<<"b2">>, <<"k2">>, State)) + ] + end + }. + +fold_objects({Backend, State}) -> + {"object folding test", + fun() -> + FoldKeysFun = + fun(Bucket, Key, Acc) -> + [{Bucket, Key} | Acc] + end, + FoldObjectsFun = + fun(Bucket, Key, Value, Acc) -> + [{{Bucket, Key}, Value} | Acc] + end, + [ + ?_assertEqual([{<<"b1">>, <<"k1">>}], + begin + {ok, Keys} = + Backend:fold_keys(FoldKeysFun, + [], + [], + State), + lists:sort(Keys) + end), + + ?_assertEqual([{{<<"b1">>,<<"k1">>}, <<"v1">>}], + begin + {ok, Objects1} = + Backend:fold_objects(FoldObjectsFun, + [], + [], + State), + lists:sort(Objects1) + end), + ?_assertMatch({ok, _}, + Backend:put(<<"b3">>, <<"k3">>, [], <<"v3">>, State)), + ?_assertEqual([{{<<"b1">>,<<"k1">>},<<"v1">>}, + {{<<"b3">>,<<"k3">>},<<"v3">>}], + begin + {ok, Objects} = + Backend:fold_objects(FoldObjectsFun, + [], + [], + State), + lists:sort(Objects) + end) + ] + end + }. + +empty_check({Backend, State}) -> + {"is_empty test", + fun() -> + [ + ?_assertEqual(false, Backend:is_empty(State)), + ?_assertMatch({ok, _}, Backend:delete(<<"b1">>,<<"k1">>, State)), + ?_assertMatch({ok, _}, Backend:delete(<<"b3">>,<<"k3">>, State)), + ?_assertEqual(true, Backend:is_empty(State)) + ] + end + }. 
+
+setup({BackendMod, Config}) ->
+    application:start(lager),
+    application:start(sasl),
+    application:start(os_mon),
+    {ok, S} = BackendMod:start(42, Config),
+    {BackendMod, S}.
+
+cleanup({BackendMod, S}) ->
+    ok = BackendMod:stop(S),
+    application:stop(lager),
+    application:stop(sasl),
+    application:stop(os_mon).
+
+-endif. % TEST

From f1b7d8322da904a3385b97456819afd63ff41afe Mon Sep 17 00:00:00 2001
From: Gregory Burd
Date: Tue, 4 Jun 2013 14:45:23 -0400
Subject: [PATCH 06/30] WIP: replace the kbtree with khash, we don't need the
 tree features (yet, if ever) and hash is faster; add a most-recently-used
 stash for contexts as it's highly likely that worker threads will do many
 operations with the same shape/signature of session/cursors/tables/config
 and that path can be lock-free as well making it much faster (one would
 hope); somewhere something is stepping on read-only ErlNifBinary data and
 so a crc check is failing and causing the runtime to abort, that's the
 latest item to find/fix.

---
 c_src/async_nif.h                  |   6 +-
 c_src/common.h                     |   2 +-
 c_src/kbtree.h                     | 507 -----------------------------
 c_src/wterl.c                      | 349 ++++++++++----------
 src/wterl.erl                      |   2 +-
 tools/basho_bench_driver_wterl.erl |   6 +-
 6 files changed, 176 insertions(+), 696 deletions(-)
 delete mode 100644 c_src/kbtree.h

diff --git a/c_src/async_nif.h b/c_src/async_nif.h
index 9483433..44a0906 100644
--- a/c_src/async_nif.h
+++ b/c_src/async_nif.h
@@ -34,7 +34,7 @@ extern "C" {
 #define UNUSED(v) ((void)(v))
 #endif

-#define ASYNC_NIF_MAX_WORKERS 1024
+#define ASYNC_NIF_MAX_WORKERS 512
 #define ASYNC_NIF_WORKER_QUEUE_SIZE 500
 #define ASYNC_NIF_MAX_QUEUED_REQS 1000 * ASYNC_NIF_MAX_WORKERS

@@ -278,8 +278,8 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en
      performing the request). */
   ERL_NIF_TERM reply = enif_make_tuple2(req->env, enif_make_atom(req->env, "ok"),
                                         enif_make_atom(req->env, "enqueued"));
-  enif_mutex_unlock(q->reqs_mutex);
   enif_cond_signal(q->reqs_cnd);
+  enif_mutex_unlock(q->reqs_mutex);
   return reply;
 }

@@ -314,11 +314,11 @@ async_nif_worker_fn(void *arg)
       /* At this point the next req is ours to process and we hold the
          reqs_mutex lock.  Take the request off the queue. */
       req = fifo_q_get(reqs, q->reqs);
-      enif_mutex_unlock(q->reqs_mutex);

       /* Ensure that there is at least one other worker thread watching this
          queue. */
       enif_cond_signal(q->reqs_cnd);
+      enif_mutex_unlock(q->reqs_mutex);

       /* Perform the work. */
       req->fn_work(req->env, req->ref, &req->pid, worker_id, req->args);
diff --git a/c_src/common.h b/c_src/common.h
index 70aea07..82db007 100644
--- a/c_src/common.h
+++ b/c_src/common.h
@@ -24,7 +24,7 @@
 extern "C" {
 #endif

-
+#define DEBUG 1
 #if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__))
 # undef DEBUG
 # define DEBUG 0
diff --git a/c_src/kbtree.h b/c_src/kbtree.h
deleted file mode 100644
index 09184cc..0000000
--- a/c_src/kbtree.h
+++ /dev/null
@@ -1,507 +0,0 @@
-/*-
- * Copyright 1997-1999, 2001, John-Mark Gurney.
- *           2008-2009, Attractive Chaos
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *   1. Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef __AC_KBTREE_H -#define __AC_KBTREE_H - -#include -#include -#include - -typedef struct { - int32_t is_internal:1, n:31; -} kbnode_t; - -#define __KB_KEY(type, x) ((type*)((char*)x + 4)) -#define __KB_PTR(btr, x) ((kbnode_t**)((char*)x + btr->off_ptr)) - -#define __KB_TREE_T(name) \ - typedef struct { \ - kbnode_t *root; \ - int off_key, off_ptr, ilen, elen; \ - int n, t; \ - int n_keys, n_nodes; \ - } kbtree_##name##_t; - -#define __KB_INIT(name, key_t) \ - kbtree_##name##_t *kb_init_##name(int size) \ - { \ - kbtree_##name##_t *b; \ - b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \ - b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \ - if (b->t < 2) { \ - free(b); return 0; \ - } \ - b->n = 2 * b->t - 1; \ - b->off_ptr = 4 + b->n * sizeof(key_t); \ - b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \ - b->elen = (b->off_ptr + 3) >> 2 << 2; \ - b->root = (kbnode_t*)calloc(1, b->ilen); \ - ++b->n_nodes; \ - return b; \ - } - -#define __kb_destroy(b) do { \ - int i, max = 8; \ - kbnode_t *x, **top, **stack = 0; \ - if (b) { \ - top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \ - *top++ = (b)->root; \ - while (top != stack) { \ - x = *--top; \ - if (x->is_internal == 0) { free(x); continue; } \ - for (i = 0; i <= x->n; ++i) \ - if (__KB_PTR(b, x)[i]) { \ - if (top - stack == max) { \ - max <<= 1; \ - stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \ - top = stack + (max>>1); \ - } \ - *top++ = __KB_PTR(b, x)[i]; \ - } \ - free(x); \ - } \ - } \ - free(b); free(stack); \ - } while (0) - -#define __kb_get_first(key_t, b, ret) do { \ - kbnode_t *__x = (b)->root; \ - while (__KB_PTR(b, __x)[0] != 0) \ - __x = __KB_PTR(b, __x)[0]; \ - (ret) = __KB_KEY(key_t, __x)[0]; \ - } while (0) - -#define __KB_GET_AUX0(name, key_t, __cmp) \ - static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ - { \ - int tr, *rr, begin, end, n = x->n >> 1; \ - if (x->n == 0) return -1; \ - if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \ - begin = 0; end = n; \ - } else { begin = n; end = x->n - 1; } \ - rr = r? r : &tr; \ - n = end; \ - while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \ - return n; \ - } - -#define __KB_GET_AUX1(name, key_t, __cmp) \ - static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ - { \ - int tr, *rr, begin = 0, end = x->n; \ - if (x->n == 0) return -1; \ - rr = r? 
r : &tr; \ - while (begin < end) { \ - int mid = (begin + end) >> 1; \ - if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \ - else end = mid; \ - } \ - if (begin == x->n) { *rr = 1; return x->n - 1; } \ - if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin; \ - return begin; \ - } - -#define __KB_GET(name, key_t) \ - static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ - { \ - int i, r = 0; \ - kbnode_t *x = b->root; \ - while (x) { \ - i = __kb_getp_aux_##name(x, k, &r); \ - if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i]; \ - if (x->is_internal == 0) return 0; \ - x = __KB_PTR(b, x)[i + 1]; \ - } \ - return 0; \ - } \ - static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \ - { \ - return kb_getp_##name(b, &k); \ - } - -#define __KB_INTERVAL(name, key_t) \ - static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \ - { \ - int i, r = 0; \ - kbnode_t *x = b->root; \ - *lower = *upper = 0; \ - while (x) { \ - i = __kb_getp_aux_##name(x, k, &r); \ - if (i >= 0 && r == 0) { \ - *lower = *upper = &__KB_KEY(key_t, x)[i]; \ - return; \ - } \ - if (i >= 0) *lower = &__KB_KEY(key_t, x)[i]; \ - if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1]; \ - if (x->is_internal == 0) return; \ - x = __KB_PTR(b, x)[i + 1]; \ - } \ - } \ - static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \ - { \ - kb_intervalp_##name(b, &k, lower, upper); \ - } - -#define __KB_PUT(name, key_t, __cmp) \ - /* x must be an internal node */ \ - static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \ - { \ - kbnode_t *z; \ - z = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen); \ - ++b->n_nodes; \ - z->is_internal = y->is_internal; \ - z->n = b->t - 1; \ - memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \ - if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \ - y->n = b->t - 1; \ - memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \ - __KB_PTR(b, x)[i + 1] = z; \ - memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \ - __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \ - ++x->n; \ - } \ - static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \ - { \ - int i = x->n - 1; \ - if (x->is_internal == 0) { \ - i = __kb_getp_aux_##name(x, k, 0); \ - if (i != x->n - 1) \ - memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ - __KB_KEY(key_t, x)[i + 1] = *k; \ - ++x->n; \ - } else { \ - i = __kb_getp_aux_##name(x, k, 0) + 1; \ - if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \ - __kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \ - if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \ - } \ - __kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \ - } \ - } \ - static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ - { \ - kbnode_t *r, *s; \ - ++b->n_keys; \ - r = b->root; \ - if (r->n == 2 * b->t - 1) { \ - ++b->n_nodes; \ - s = (kbnode_t*)calloc(1, b->ilen); \ - b->root = s; s->is_internal = 1; s->n = 0; \ - __KB_PTR(b, s)[0] = r; \ - __kb_split_##name(b, s, 0, r); \ - r = s; \ - } \ - __kb_putp_aux_##name(b, r, k); \ - } \ - static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \ - { \ - kb_putp_##name(b, &k); \ - } - - -#define __KB_DEL(name, key_t) \ - 
static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \ - { \ - int yn, zn, i, r = 0; \ - kbnode_t *xp, *y, *z; \ - key_t kp; \ - if (x == 0) return *k; \ - if (s) { /* s can only be 0, 1 or 2 */ \ - r = x->is_internal == 0? 0 : s == 1? 1 : -1; \ - i = s == 1? x->n - 1 : -1; \ - } else i = __kb_getp_aux_##name(x, k, &r); \ - if (x->is_internal == 0) { \ - if (s == 2) ++i; \ - kp = __KB_KEY(key_t, x)[i]; \ - memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ - --x->n; \ - return kp; \ - } \ - if (r == 0) { \ - if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \ - xp = __KB_PTR(b, x)[i]; \ - kp = __KB_KEY(key_t, x)[i]; \ - __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \ - return kp; \ - } else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \ - xp = __KB_PTR(b, x)[i + 1]; \ - kp = __KB_KEY(key_t, x)[i]; \ - __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \ - return kp; \ - } else if (yn == b->t - 1 && zn == b->t - 1) { \ - y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \ - __KB_KEY(key_t, y)[y->n++] = *k; \ - memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \ - if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \ - y->n += z->n; \ - memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ - memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ - --x->n; \ - free(z); \ - return __kb_delp_aux_##name(b, y, k, s); \ - } \ - } \ - ++i; \ - if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \ - if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \ - memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ - if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ - __KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \ - __KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \ - if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \ - --y->n; ++xp->n; \ - } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \ - __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \ - __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \ - if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \ - --y->n; \ - memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \ - if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \ - } else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \ - __KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \ - memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ - if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ - y->n += xp->n; \ - memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \ - memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \ - --x->n; \ - free(xp); \ - xp = y; \ - } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \ - __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \ - memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \ - if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \ - xp->n += y->n; \ - memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ - 
memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ - --x->n; \ - free(y); \ - } \ - } \ - return __kb_delp_aux_##name(b, xp, k, s); \ - } \ - static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ - { \ - kbnode_t *x; \ - key_t ret; \ - ret = __kb_delp_aux_##name(b, b->root, k, 0); \ - --b->n_keys; \ - if (b->root->n == 0 && b->root->is_internal) { \ - --b->n_nodes; \ - x = b->root; \ - b->root = __KB_PTR(b, x)[0]; \ - free(x); \ - } \ - return ret; \ - } \ - static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \ - { \ - return kb_delp_##name(b, &k); \ - } - -typedef struct { - kbnode_t *x; - int i; -} __kbstack_t; - -#define __kb_traverse(key_t, b, __func) do { \ - int __kmax = 8; \ - __kbstack_t *__kstack, *__kp; \ - __kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \ - __kp->x = (b)->root; __kp->i = 0; \ - for (;;) { \ - while (__kp->x && __kp->i <= __kp->x->n) { \ - if (__kp - __kstack == __kmax - 1) { \ - __kmax <<= 1; \ - __kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \ - __kp = __kstack + (__kmax>>1) - 1; \ - } \ - (__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \ - ++__kp; \ - } \ - --__kp; \ - if (__kp >= __kstack) { \ - if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \ - ++__kp->i; \ - } else break; \ - } \ - free(__kstack); \ - } while (0) - -#define KBTREE_INIT(name, key_t, __cmp) \ - __KB_TREE_T(name) \ - __KB_INIT(name, key_t) \ - __KB_GET_AUX1(name, key_t, __cmp) \ - __KB_GET(name, key_t) \ - __KB_INTERVAL(name, key_t) \ - __KB_PUT(name, key_t, __cmp) \ - __KB_DEL(name, key_t) - -#define KB_DEFAULT_SIZE 512 - -#define kbtree_t(name) kbtree_##name##_t -#define kb_init(name, s) kb_init_##name(s) -#define kb_destroy(name, b) __kb_destroy(b) -#define kb_get(name, b, k) kb_get_##name(b, k) -#define kb_put(name, b, k) kb_put_##name(b, k) -#define kb_del(name, b, k) kb_del_##name(b, k) -#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u) -#define kb_getp(name, b, k) kb_getp_##name(b, k) -#define kb_putp(name, b, k) kb_putp_##name(b, k) -#define kb_delp(name, b, k) kb_delp_##name(b, k) -#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u) - -#define kb_size(b) ((b)->n_keys) - -#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b))) -#define kb_str_cmp(a, b) strcmp(a, b) - -#endif - -#ifdef TEST -#include -#include -#include -#include -#include -#include - -typedef const char *str_t; - -#include "kbtree.h" - -typedef struct { - unsigned key; - char *value; -} intmap_t; - -#define __intcmp(a, b) (((a).key > (b).key) - ((a).key < (b).key)) - -KBTREE_INIT(int, uint32_t, kb_generic_cmp) -KBTREE_INIT(str, str_t, kb_str_cmp) -KBTREE_INIT(intmap, intmap_t, __intcmp); - -static int data_size = 5000000; -static unsigned *int_data; -static char **str_data; -static intmap_t *intmap_data; - -void kb_init_data() -{ - int i; - char buf[256]; - printf("--- generating data... 
"); - srand48(11); - int_data = (unsigned*)calloc(data_size, sizeof(unsigned)); - str_data = (char**)calloc(data_size, sizeof(char*)); - intmap_data = (intmap_t*)calloc(data_size, sizeof(intmap_t)); - for (i = 0; i < data_size; ++i) { - int_data[i] = (unsigned)(data_size * drand48() / 4) * 271828183u; - sprintf(buf, "%x", int_data[i]); - str_data[i] = strdup(buf); - intmap_data[i].key = i; - intmap_data[i].value = str_data[i]; - } - printf("done!\n"); -} -void kb_destroy_data() -{ - int i; - for (i = 0; i < data_size; ++i) free(str_data[i]); - free(str_data); free(int_data); -} -void kb_tree_intmap() -{ - int i; - intmap_t *data = intmap_data; - kbtree_t(intmap) *h; - h = kb_init(intmap, KB_DEFAULT_SIZE); - for (i = 0; i < data_size; ++i) { - if (kb_get(intmap, h, data[i]) == 0) kb_put(intmap, h, data[i]); - else kb_del(intmap, h, data[i]); - } - printf("[kb_tree_intmap] size: %d\n", kb_size(h)); - __kb_destroy(h); -} -void kb_tree_int() -{ - int i; - unsigned *data = int_data; - uint32_t *l, *u; - kbtree_t(int) *h; - - h = kb_init(int, KB_DEFAULT_SIZE); - for (i = 0; i < data_size; ++i) { - if (kb_get(int, h, data[i]) == 0) kb_put(int, h, data[i]); - else kb_del(int, h, data[i]); - } - printf("[kb_tree_int] size: %d\n", kb_size(h)); - if (1) { - int cnt = 0; - uint32_t x, y; - kb_interval(int, h, 2174625464u, &l, &u); - printf("interval for 2174625464: (%u, %u)\n", l? *l : 0, u? *u : 0); -#define traverse_f(p) { if (cnt == 0) y = *p; ++cnt; } - __kb_traverse(uint32_t, h, traverse_f); - __kb_get_first(uint32_t, h, x); - printf("# of elements from traversal: %d\n", cnt); - printf("first element: %d == %d\n", x, y); - } - __kb_destroy(h); -} -void kb_tree_str() -{ - int i; - char **data = str_data; - kbtree_t(str) *h; - - h = kb_init(str, KB_DEFAULT_SIZE); - for (i = 0; i < data_size; ++i) { - if (kb_get(str, h, data[i]) == 0) kb_put(str, h, data[i]); - else kb_del(str, h, data[i]); - } - printf("[kb_tree_int] size: %d\n", kb_size(h)); - __kb_destroy(h); -} -void kb_timing(void (*f)(void)) -{ - clock_t t = clock(); - (*f)(); - printf("[kb_timing] %.3lf sec\n", (double)(clock() - t) / CLOCKS_PER_SEC); -} -int main(int argc, char *argv[]) -{ - if (argc > 1) data_size = atoi(argv[1]); - kb_init_data(); - kb_timing(kb_tree_int); - kb_timing(kb_tree_str); - kb_timing(kb_tree_intmap); - kb_destroy_data(); - return 0; -} -#endif diff --git a/c_src/wterl.c b/c_src/wterl.c index 81fcd7e..0774bdf 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -29,13 +29,14 @@ #include "wiredtiger.h" #include "stats.h" #include "async_nif.h" -#include "kbtree.h" +#include "khash.h" #include "queue.h" +#define MAX_CACHE_SIZE ASYNC_NIF_MAX_WORKERS + static ErlNifResourceType *wterl_conn_RESOURCE; static ErlNifResourceType *wterl_cursor_RESOURCE; -/* WiredTiger object names*/ typedef char Uri[128]; struct wterl_ctx { @@ -51,14 +52,15 @@ struct cache_entry { SLIST_HEAD(ctxs, wterl_ctx) contexts; }; -#define __ctx_sig_cmp(a, b) (((a).sig > (b).sig) - ((a).sig < (b).sig)) -KBTREE_INIT(cache_entries, struct cache_entry, __ctx_sig_cmp); +KHASH_MAP_INIT_INT64(cache_entries, struct cache_entry*); typedef struct wterl_conn { WT_CONNECTION *conn; const char *session_config; ErlNifMutex *cache_mutex; - kbtree_t(cache_entries) *cache; + khash_t(cache_entries) *cache; + uint32_t num_ctx_in_cache; + struct wterl_ctx *last_ctx_used[ASYNC_NIF_MAX_WORKERS]; SLIST_ENTRY(wterl_conn) conns; uint64_t histogram[64]; uint64_t histogram_count; @@ -149,17 +151,6 @@ static inline uint32_t __log2(uint64_t x) { } #endif -/** - * Is the context 
cache full? - * - * -> 0 = no/false, anything else is true - */ -static int -__ctx_cache_full(WterlConnHandle *conn_handle) -{ - return kb_size(conn_handle->cache) == ASYNC_NIF_MAX_WORKERS; // TODO: -} - /** * Evict items from the cache. * @@ -171,12 +162,17 @@ __ctx_cache_full(WterlConnHandle *conn_handle) static int __ctx_cache_evict(WterlConnHandle *conn_handle) { - uint32_t num_evicted, i; - uint64_t mean, now; - struct wterl_ctx *to_free[ASYNC_NIF_MAX_WORKERS]; + uint32_t mean, log, num_evicted, i; + uint64_t now, elapsed; + khash_t(cache_entries) *h = conn_handle->cache; + khiter_t itr; + struct cache_entry *e; + struct wterl_ctx *c, *n; + + if (conn_handle->num_ctx_in_cache != MAX_CACHE_SIZE) + return 0; now = cpu_clock_ticks(); - kbtree_t(cache_entries) *t = conn_handle->cache; // Find the mean of the recorded times that items stayed in cache. mean = 0; @@ -191,51 +187,32 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) /* * Evict anything older than the mean time in queue by removing those - * items from the lists at the leaf nodes of the tree. + * items from the lists stored in the tree. */ - num_evicted = 0; -#define traverse_f(p) \ - { \ - struct cache_entry *e; \ - struct wterl_ctx *c; \ - e = (struct cache_entry *)p; \ - SLIST_FOREACH(c, &e->contexts, entries) { \ - uint64_t elapsed = c->tstamp - now; \ - uint32_t log = __log2(elapsed); \ - if (log > mean) { \ - SLIST_REMOVE(&e->contexts, c, wterl_ctx, entries); \ - c->session->close(c->session, NULL); \ - to_free[num_evicted] = c; \ - num_evicted++; \ - } \ - } \ + for (itr = kh_begin(h); itr != kh_end(h); ++itr) { + if (kh_exist(h, itr)) { + e = kh_val(h, itr); + c = SLIST_FIRST(&e->contexts); + while (c != NULL) { + n = SLIST_NEXT(c, entries); + elapsed = c->tstamp - now; + log = __log2(elapsed); + if (log > mean) { + SLIST_REMOVE(&e->contexts, c, wterl_ctx, entries); + c->session->close(c->session, NULL); + enif_free(c); + num_evicted++; + } + c = n; + } + if (SLIST_EMPTY(&e->contexts)) { + kh_del(cache_entries, h, itr); + enif_free(e); + kh_value(h, itr) = NULL; + } + } } - __kb_traverse(struct cache_entry, t, traverse_f); -#undef traverse_f - - /* - * Free up the wterl_ctx we've removed after finishing the loop. - */ - for (i = 0; i < num_evicted; i++) { - enif_free(to_free[i]); - } - - /* - * Walk the tree again looking for empty lists to prune from the - * tree. 
- */ -#define traverse_f(p) \ - { \ - struct cache_entry *e, query; \ - e = p; \ - query.sig = e->sig; \ - if (SLIST_EMPTY(&e->contexts)) { \ - kb_del(cache_entries, t, query); \ - } \ - } - __kb_traverse(struct cache_entry, t, traverse_f); -#undef traverse_f - + conn_handle->num_ctx_in_cache -= num_evicted; return num_evicted; } @@ -252,21 +229,30 @@ static struct wterl_ctx * __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig) { struct wterl_ctx *c = NULL; - struct cache_entry query, *result; + struct cache_entry *e; + khash_t(cache_entries) *h; + khiter_t itr; - query.sig = sig; - result = kb_get(cache_entries, conn_handle->cache, query); - if (result && !SLIST_EMPTY(&result->contexts)) { - /* - * cache hit: - * remove a context from the list in the tree node - */ - c = SLIST_FIRST(&result->contexts); - SLIST_REMOVE_HEAD(&result->contexts, entries); - uint64_t elapsed = cpu_clock_ticks() - c->tstamp; - conn_handle->histogram[__log2(elapsed)]++; - conn_handle->histogram_count++; - } // else { cache miss + h = conn_handle->cache; + enif_mutex_lock(conn_handle->cache_mutex); + if (conn_handle->num_ctx_in_cache > 0) { + itr = kh_get(cache_entries, h, sig); + if (itr != kh_end(h)) { + e = kh_value(h, itr); + if (!SLIST_EMPTY(&e->contexts)) { + /* + * cache hit: + * remove a context from the list in the tree node + */ + c = SLIST_FIRST(&e->contexts); + SLIST_REMOVE_HEAD(&e->contexts, entries); + conn_handle->histogram[__log2(cpu_clock_ticks() - c->tstamp)]++; + conn_handle->histogram_count++; + conn_handle->num_ctx_in_cache -= 1; + } + } + } + enif_mutex_unlock(conn_handle->cache_mutex); return c; } @@ -277,28 +263,28 @@ __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig) * the front of the LRU. */ static void -__ctx_cache_add(WterlConnHandle *conn, struct wterl_ctx *c) +__ctx_cache_add(WterlConnHandle *conn_handle, struct wterl_ctx *c) { - struct cache_entry query, *result; - - /* - * Check to see if the cache is full and if so trigger eviction which will - * remove the least-recently-used half of the items from the cache. - */ - if (__ctx_cache_full(conn)) - __ctx_cache_evict(conn); + struct cache_entry *e; + khash_t(cache_entries) *h; + khiter_t itr; + int itr_status; + enif_mutex_lock(conn_handle->cache_mutex); + __ctx_cache_evict(conn_handle); c->tstamp = cpu_clock_ticks(); - - query.sig = c->sig; - result = kb_get(cache_entries, conn->cache, query); - if (result == NULL) { - SLIST_INIT(&query.contexts); // TODO: should this be on the heap? - SLIST_INSERT_HEAD(&query.contexts, c, entries); - kb_put(cache_entries, conn->cache, query); - } else { - SLIST_INSERT_HEAD(&result->contexts, c, entries); + h = conn_handle->cache; + itr = kh_get(cache_entries, h, c->sig); + if (itr == kh_end(h)) { + e = enif_alloc(sizeof(struct cache_entry)); // TODO: enomem + SLIST_INIT(&e->contexts); + itr = kh_put(cache_entries, h, c->sig, &itr_status); + kh_value(h, itr) = e; } + e = kh_value(h, itr); + SLIST_INSERT_HEAD(&e->contexts, c, entries); + conn_handle->num_ctx_in_cache += 1; + enif_mutex_unlock(conn_handle->cache_mutex); } /** @@ -352,7 +338,7 @@ __zi(uint32_t p, uint32_t q) /** * Create a signature for the operation we're about to perform. * - * Create a 32bit signature for this a combination of session configuration + * Create a 64bit signature for this a combination of session configuration * some number of cursors open on tables each potentially with a different * configuration. 
"session_config, [{table_name, cursor_config}, ...]" * @@ -386,7 +372,8 @@ __ctx_cache_sig(const char *c, va_list ap, int count) * session. */ static int -__retain_ctx(WterlConnHandle *conn_handle, struct wterl_ctx **ctx, +__retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, + struct wterl_ctx **ctx, int count, const char *session_config, ...) { int i = 0; @@ -399,39 +386,50 @@ __retain_ctx(WterlConnHandle *conn_handle, struct wterl_ctx **ctx, sig = __ctx_cache_sig(session_config, ap, count); va_end(ap); - enif_mutex_lock(conn_handle->cache_mutex); - (*ctx) = __ctx_cache_find(conn_handle, sig); - if ((*ctx) == NULL) { - // cache miss - WT_CONNECTION *conn = conn_handle->conn; - WT_SESSION *session = NULL; - int rc = conn->open_session(conn, NULL, session_config, &session); - if (rc != 0) - return rc; - size_t s = sizeof(struct wterl_ctx) + (count * sizeof(WT_CURSOR*)); - *ctx = enif_alloc(s); // TODO: enif_alloc_resource() - if (*ctx == NULL) { - session->close(session, NULL); - return ENOMEM; - } - memset(*ctx, 0, s); - (*ctx)->sig = sig; - (*ctx)->session = session; - session_config = arg; - va_start(ap, session_config); - for (i = 0; i < count; i++) { - const char *uri = va_arg(ap, const char *); - const char *config = va_arg(ap, const char *); - // TODO: error when uri or config is NULL - rc = session->open_cursor(session, uri, NULL, config, &(*ctx)->cursors[i]); - if (rc != 0) { - session->close(session, NULL); // this will free the cursors too + DPRINTF("worker: %u cache size: %u", worker_id, conn_handle->num_ctx_in_cache); + if (conn_handle->last_ctx_used[worker_id] != NULL && + conn_handle->last_ctx_used[worker_id]->sig == sig) { + (*ctx) = conn_handle->last_ctx_used[worker_id]; + DPRINTF("worker: %u reuse hit: %lu %p", worker_id, sig, *ctx); + } else { + if (conn_handle->last_ctx_used[worker_id] != NULL) + __ctx_cache_add(conn_handle, conn_handle->last_ctx_used[worker_id]); + conn_handle->last_ctx_used[worker_id] = NULL; + (*ctx) = __ctx_cache_find(conn_handle, sig); + if ((*ctx) == NULL) { + // cache miss + DPRINTF("worker: %u cache miss: %lu", worker_id, sig); + WT_CONNECTION *conn = conn_handle->conn; + WT_SESSION *session = NULL; + int rc = conn->open_session(conn, NULL, session_config, &session); + if (rc != 0) return rc; + size_t s = sizeof(struct wterl_ctx) + (count * sizeof(WT_CURSOR*)); + *ctx = enif_alloc(s); // TODO: enif_alloc_resource() + if (*ctx == NULL) { + session->close(session, NULL); + return ENOMEM; } + memset(*ctx, 0, s); + (*ctx)->sig = sig; + (*ctx)->session = session; + session_config = arg; + va_start(ap, session_config); + for (i = 0; i < count; i++) { + const char *uri = va_arg(ap, const char *); + const char *config = va_arg(ap, const char *); + // TODO: error when uri or config is NULL + rc = session->open_cursor(session, uri, NULL, config, &(*ctx)->cursors[i]); + if (rc != 0) { + session->close(session, NULL); // this will free the cursors too + return rc; + } + } + va_end(ap); + } else { // else { cache hit } + DPRINTF("worker: %u cache hit: %lu %p", worker_id, sig, *ctx); } - va_end (ap); - } // else { cache hit } - enif_mutex_unlock(conn_handle->cache_mutex); + } return 0; } @@ -439,19 +437,20 @@ __retain_ctx(WterlConnHandle *conn_handle, struct wterl_ctx **ctx, * Return a context to the cache for reuse. 
*/ static void -__release_ctx(WterlConnHandle *conn_handle, struct wterl_ctx *ctx) +__release_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx *ctx) { - int i, c; + int i, n; WT_CURSOR *cursor; - c = sizeof((WT_CURSOR**)ctx->cursors) / sizeof(ctx->cursors[0]); - for (i = 0; i < c; i++) { + DPRINTF("worker: %u cache size: %u", worker_id, conn_handle->num_ctx_in_cache); + n = sizeof((WT_CURSOR**)ctx->cursors) / sizeof(ctx->cursors[0]); + for (i = 0; i < n; i++) { cursor = ctx->cursors[i]; cursor->reset(cursor); } - enif_mutex_lock(conn_handle->cache_mutex); - __ctx_cache_add(conn_handle, ctx); - enif_mutex_unlock(conn_handle->cache_mutex); + assert(conn_handle->last_ctx_used[worker_id] == 0 || + conn_handle->last_ctx_used[worker_id] == ctx); + conn_handle->last_ctx_used[worker_id] = ctx; } /** @@ -462,46 +461,32 @@ __release_ctx(WterlConnHandle *conn_handle, struct wterl_ctx *ctx) void __close_all_sessions(WterlConnHandle *conn_handle) { - int i, num_closed = 0; - struct wterl_ctx *to_free[ASYNC_NIF_MAX_WORKERS]; - kbtree_t(cache_entries) *t = conn_handle->cache; + khash_t(cache_entries) *h = conn_handle->cache; + struct cache_entry *e; + struct wterl_ctx *c; + int i; -#define traverse_f(p) \ - { \ - struct cache_entry *e; \ - struct wterl_ctx *c; \ - e = (struct cache_entry *)p; \ - SLIST_FOREACH(c, &e->contexts, entries) { \ - SLIST_REMOVE(&e->contexts, c, wterl_ctx, entries); \ - c->session->close(c->session, NULL); \ - to_free[num_closed] = c; \ - num_closed++; \ - } \ + for (i = 0; i < ASYNC_NIF_MAX_WORKERS; i++) { + c = conn_handle->last_ctx_used[i]; + c->session->close(c->session, NULL); + enif_free(c); + conn_handle->last_ctx_used[i] = NULL; } - __kb_traverse(struct cache_entry *, t, traverse_f); -#undef traverse_f - - /* - * Free up the wterl_ctx we've removed after finishing the loop. - */ - for (i = 0; i < num_closed; i++) { - enif_free(to_free[i]); + khiter_t itr; + for (itr = kh_begin(h); itr != kh_end(h); ++itr) { + if (kh_exist(h, itr)) { + e = kh_val(h, itr); + while ((c = SLIST_FIRST(&e->contexts)) != NULL) { + SLIST_REMOVE(&e->contexts, c, wterl_ctx, entries); + c->session->close(c->session, NULL); + enif_free(c); + } + kh_del(cache_entries, h, itr); + enif_free(e); + kh_value(h, itr) = NULL; + } } - - /* - * Walk the tree again to prune all the empty lists from the tree. - */ -#define traverse_f(p) \ - { \ - struct cache_entry *e, query; \ - e = (struct cache_entry *)p; \ - query.sig = e->sig; \ - if (SLIST_EMPTY(&e->contexts)) { \ - kb_del(cache_entries, t, query); \ - } \ - } - __kb_traverse(struct cache_entry, t, traverse_f); -#undef traverse_f + conn_handle->num_ctx_in_cache = 0; } /** @@ -728,8 +713,9 @@ ASYNC_NIF_DECL( conn_handle->conn = conn; ERL_NIF_TERM result = enif_make_resource(env, conn_handle); - /* Init tree which manages the cache of session/cursor(s) */ - conn_handle->cache = kb_init(cache_entries, ASYNC_NIF_MAX_WORKERS); // TODO: size + /* Init hash table which manages the cache of session/cursor(s) */ + conn_handle->cache = kh_init(cache_entries); + conn_handle->num_ctx_in_cache = 0; /* Keep track of open connections so as to free when unload/reload/etc. are called. 
*/ @@ -1425,7 +1411,7 @@ ASYNC_NIF_DECL( struct wterl_ctx *ctx = NULL; WT_CURSOR *cursor = NULL; - int rc = __retain_ctx(args->conn_handle, &ctx, 1, + int rc = __retain_ctx(args->conn_handle, worker_id, &ctx, 1, args->conn_handle->session_config, args->uri, "overwrite,raw"); if (rc != 0) { @@ -1440,7 +1426,7 @@ ASYNC_NIF_DECL( cursor->set_key(cursor, &item_key); rc = cursor->remove(cursor); ASYNC_NIF_REPLY(rc == 0 ? ATOM_OK : __strerror_term(env, rc)); - __release_ctx(args->conn_handle, ctx); + __release_ctx(args->conn_handle, worker_id, ctx); }, { // post @@ -1484,7 +1470,7 @@ ASYNC_NIF_DECL( struct wterl_ctx *ctx = NULL; WT_CURSOR *cursor = NULL; - int rc = __retain_ctx(args->conn_handle, &ctx, 1, + int rc = __retain_ctx(args->conn_handle, worker_id, &ctx, 1, args->conn_handle->session_config, args->uri, "overwrite,raw"); if (rc != 0) { @@ -1513,7 +1499,7 @@ ASYNC_NIF_DECL( unsigned char *bin = enif_make_new_binary(env, item_value.size, &value); memcpy(bin, item_value.data, item_value.size); ASYNC_NIF_REPLY(enif_make_tuple2(env, ATOM_OK, value)); - __release_ctx(args->conn_handle, ctx); + __release_ctx(args->conn_handle, worker_id, ctx); }, { // post @@ -1566,7 +1552,7 @@ ASYNC_NIF_DECL( struct wterl_ctx *ctx = NULL; WT_CURSOR *cursor = NULL; - int rc = __retain_ctx(args->conn_handle, &ctx, 1, + int rc = __retain_ctx(args->conn_handle, worker_id, &ctx, 1, args->conn_handle->session_config, args->uri, "overwrite,raw"); if (rc != 0) { @@ -1584,7 +1570,7 @@ ASYNC_NIF_DECL( item_value.size = value.size; cursor->set_value(cursor, &item_value); rc = cursor->insert(cursor); - __release_ctx(args->conn_handle, ctx); + __release_ctx(args->conn_handle, worker_id, ctx); ASYNC_NIF_REPLY(rc == 0 ? ATOM_OK : __strerror_term(env, rc)); }, { // post @@ -2388,8 +2374,9 @@ on_unload(ErlNifEnv *env, void *priv_data) SLIST_FOREACH(conn_handle, &priv->conns, conns) { __close_all_sessions(conn_handle); conn_handle->conn->close(conn_handle->conn, NULL); - kb_destroy(cache_entries, conn_handle->cache); - enif_free((void*)conn_handle->session_config); + kh_destroy(cache_entries, conn_handle->cache); + if (conn_handle->session_config) + enif_free((void*)conn_handle->session_config); enif_mutex_unlock(conn_handle->cache_mutex); enif_mutex_destroy(conn_handle->cache_mutex); } diff --git a/src/wterl.erl b/src/wterl.erl index 45afae5..b940433 100644 --- a/src/wterl.erl +++ b/src/wterl.erl @@ -664,7 +664,7 @@ various_maintenance_test_() -> fun () -> {ok, CWD} = file:get_cwd(), ?assertMatch(ok, filelib:ensure_dir(filename:join([?TEST_DATA_DIR, "x"]))), - {ok, ConnRef} = connection_open(filename:join([CWD, ?TEST_DATA_DIR]), []), + {ok, ConnRef} = connection_open(filename:join([CWD, ?TEST_DATA_DIR]), [{create,true}]), ConnRef end, fun (ConnRef) -> diff --git a/tools/basho_bench_driver_wterl.erl b/tools/basho_bench_driver_wterl.erl index bdc6eb3..ae4dd93 100644 --- a/tools/basho_bench_driver_wterl.erl +++ b/tools/basho_bench_driver_wterl.erl @@ -26,12 +26,12 @@ new(1) -> new(Id) -> setup(Id). 
-setup(_Id) -> +setup(Id) -> %% Get the target directory Dir = basho_bench_config:get(wterl_dir, "/tmp"), Config = basho_bench_config:get(wterl, []), Uri = config_value(table_uri, Config, "lsm:test"), - ConnectionOpts = config_value(connection, Config, [{create, true}]), + ConnectionOpts = config_value(connection, Config, [{create,true},{session_max, 8192}]), SessionOpts = config_value(session, Config, []), TableOpts = config_value(table, Config, []), @@ -43,7 +43,7 @@ setup(_Id) -> {ok, Conn} -> Conn; {error, Reason0} -> - ?FAIL_MSG("Failed to establish a WiredTiger connection, wterl backend unable to start: ~p\n", [Reason0]) + ?FAIL_MSG("Failed to establish a WiredTiger connection for ~p, wterl backend unable to start: ~p\n", [Id, Reason0]) end; true -> {ok, Conn} = wterl_conn:get(), From 778ba203524be051e193614459415e2c8679ce22 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Tue, 4 Jun 2013 17:21:50 -0400 Subject: [PATCH 07/30] WIP: a bit of cleanup fixes a few mistakes --- c_src/common.h | 1 - c_src/wterl.c | 18 +++++++++++------- src/wterl.erl | 3 ++- update-version.sh | 2 +- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/c_src/common.h b/c_src/common.h index 82db007..42ac5e0 100644 --- a/c_src/common.h +++ b/c_src/common.h @@ -24,7 +24,6 @@ extern "C" { #endif -#define DEBUG 1 #if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__)) # undef DEBUG # define DEBUG 0 diff --git a/c_src/wterl.c b/c_src/wterl.c index 0774bdf..fd21196 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -189,6 +189,7 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) * Evict anything older than the mean time in queue by removing those * items from the lists stored in the tree. */ + num_evicted = 0; for (itr = kh_begin(h); itr != kh_end(h); ++itr) { if (kh_exist(h, itr)) { e = kh_val(h, itr); @@ -277,6 +278,7 @@ __ctx_cache_add(WterlConnHandle *conn_handle, struct wterl_ctx *c) itr = kh_get(cache_entries, h, c->sig); if (itr == kh_end(h)) { e = enif_alloc(sizeof(struct cache_entry)); // TODO: enomem + memset(e, 0, sizeof(struct cache_entry)); SLIST_INIT(&e->contexts); itr = kh_put(cache_entries, h, c->sig, &itr_status); kh_value(h, itr) = e; @@ -468,9 +470,11 @@ __close_all_sessions(WterlConnHandle *conn_handle) for (i = 0; i < ASYNC_NIF_MAX_WORKERS; i++) { c = conn_handle->last_ctx_used[i]; - c->session->close(c->session, NULL); - enif_free(c); - conn_handle->last_ctx_used[i] = NULL; + if (c) { + c->session->close(c->session, NULL); + enif_free(c); + conn_handle->last_ctx_used[i] = NULL; + } } khiter_t itr; for (itr = kh_begin(h); itr != kh_end(h); ++itr) { @@ -688,10 +692,11 @@ ASYNC_NIF_DECL( int rc = wiredtiger_open(args->homedir, (WT_EVENT_HANDLER*)&args->priv->eh.handlers, - config.data[0] != 0 ? (const char*)config.data : NULL, - &conn); + (config.size > 1) ? 
(const char *)config.data : NULL, + &conn); if (rc == 0) { WterlConnHandle *conn_handle = enif_alloc_resource(wterl_conn_RESOURCE, sizeof(WterlConnHandle)); + memset(conn_handle, 0, sizeof(WterlConnHandle)); if (!conn_handle) { ASYNC_NIF_REPLY(__strerror_term(env, ENOMEM)); return; @@ -1134,7 +1139,6 @@ ASYNC_NIF_DECL( } args->config = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[4]); enif_keep_resource((void*)args->conn_handle); - affinity = __str_hash(args->uri); }, { // work @@ -1603,7 +1607,6 @@ ASYNC_NIF_DECL( } args->config = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); enif_keep_resource((void*)args->conn_handle); - affinity = __str_hash(args->uri); }, { // work @@ -1637,6 +1640,7 @@ ASYNC_NIF_DECL( ASYNC_NIF_REPLY(__strerror_term(env, ENOMEM)); return; } + memset(cursor_handle, 0, sizeof(WterlCursorHandle)); cursor_handle->session = session; cursor_handle->cursor = cursor; ERL_NIF_TERM result = enif_make_resource(env, cursor_handle); diff --git a/src/wterl.erl b/src/wterl.erl index b940433..15582e9 100644 --- a/src/wterl.erl +++ b/src/wterl.erl @@ -96,7 +96,8 @@ nif_stub_error(Line) -> -spec init() -> ok | {error, any()}. init() -> erlang:load_nif(filename:join([priv_dir(), atom_to_list(?MODULE)]), - [{wterl_vsn, "a1459ce"}, {wiredtiger_vsn, "1.5.2-2-g8f2685b"}]). + [{wterl_vsn, "f1b7d8322da904a3385b97456819afd63ff41afe"}, + {wiredtiger_vsn, "1.6.1-a06b59e47db7b120575049bd7d6314df53e78e54"}]). -spec connection_open(string(), config_list()) -> {ok, connection()} | {error, term()}. -spec connection_open(string(), config_list(), config_list()) -> {ok, connection()} | {error, term()}. diff --git a/update-version.sh b/update-version.sh index 459fc0c..4d99734 100755 --- a/update-version.sh +++ b/update-version.sh @@ -3,7 +3,7 @@ # Note: also, remember to update version numbers in rpath specs so that shared libs can be found at runtime!!! wterl=`git log -n 1 --pretty=format:"%H"` -wiredtiger0=`(cd c_src/wiredtiger && git log -n 1 --pretty=format:"%H")` +wiredtiger0=`(cd c_src/wiredtiger-develop && git log -n 1 --pretty=format:"%H")` wiredtiger=`echo $wiredtiger0 | awk '{print $2}'` echo $wterl From 0fef28de9212898d69397c48e22dff92c863bb59 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Wed, 5 Jun 2013 11:41:41 -0400 Subject: [PATCH 08/30] WIP: basho_bench tests are running fine now, need more work to ensure cache is functioning properly. 
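
The async_nif change in this patch also stops the enqueue path from shopping for a less-loaded queue on every request: a request now stays on the queue it hashed to unless that queue holds more entries than there are queues, and only then are the wait-time estimates consulted. Below is a simplified sketch of that placement policy; queue_depth(), queue_full(), mean_wait() and mean_wait_all() are hypothetical stand-ins for fifo_q_size(), fifo_q_full() and STAT_MEAN_LOG2_SAMPLE(), not the real API.

#include <stdio.h>

/* Hypothetical stand-ins with fixed values so the sketch is
   self-contained and runnable. */
static unsigned int queue_depth(unsigned int qid) { return qid == 0 ? 9 : 2; }
static int queue_full(unsigned int qid) { (void)qid; return 0; }
static double mean_wait(unsigned int qid) { return qid == 0 ? 8.0 : 2.0; }
static double mean_wait_all(void) { return 4.0; }

/* Stay on the queue we hashed to unless its backlog exceeds the number
   of queues; only then consult the wait-time estimates and, if this
   queue is full or slower than average, try the next one. */
static unsigned int
pick_queue(unsigned int preferred, unsigned int num_queues)
{
    unsigned int qid = preferred % num_queues;
    for (;;) {
        if (queue_depth(qid) <= num_queues)
            return qid;                   /* short backlog: stop shopping */
        if (queue_full(qid) || mean_wait(qid) > mean_wait_all())
            qid = (qid + 1) % num_queues; /* overloaded: try a neighbor */
        else
            return qid;
    }
}

int
main(void)
{
    printf("queue %u\n", pick_queue(0, 4)); /* prints "queue 1" */
    return 0;
}

The point of the guard is to keep the common case to a single size check under the queue mutex instead of two statistical samples per enqueue.
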
--- c_src/async_nif.h | 14 +++--- c_src/duration.h | 14 +++--- c_src/wiredtiger-build.patch | 94 ++++++++++++++++++++++++++++++++++++ rebar.config | 26 +++++----- src/wterl.erl | 1 + 5 files changed, 124 insertions(+), 25 deletions(-) diff --git a/c_src/async_nif.h b/c_src/async_nif.h index 44a0906..26d556f 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -256,12 +256,14 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en enif_mutex_unlock(q->reqs_mutex); return 0; } - double await = STAT_MEAN_LOG2_SAMPLE(async_nif, qwait); - double await_inthisq = STAT_MEAN_LOG2_SAMPLE(q, qwait); - if (fifo_q_full(reqs, q->reqs) || await_inthisq > await) { - enif_mutex_unlock(q->reqs_mutex); - qid = (qid + 1) % async_nif->num_queues; - q = &async_nif->queues[qid]; + if (fifo_q_size(reqs, q->reqs) > async_nif->num_queues) { + double await = STAT_MEAN_LOG2_SAMPLE(async_nif, qwait); + double await_inthisq = STAT_MEAN_LOG2_SAMPLE(q, qwait); + if (fifo_q_full(reqs, q->reqs) || await_inthisq > await) { + enif_mutex_unlock(q->reqs_mutex); + qid = (qid + 1) % async_nif->num_queues; + q = &async_nif->queues[qid]; + } } else { break; } diff --git a/c_src/duration.h b/c_src/duration.h index 083ad6b..587a694 100644 --- a/c_src/duration.h +++ b/c_src/duration.h @@ -45,12 +45,14 @@ static inline uint64_t cpu_clock_ticks() { uint32_t lo, hi; __asm__ __volatile__ ( - "xorl %%eax, %%eax\n" - "cpuid\n" - "rdtsc\n" - : "=a" (lo), "=d" (hi) - : - : "%ebx", "%ecx" ); + "; Flush the pipeline" + "XORL %%eax, %%eax\n" + "CPUID\n" + "; Get RDTSC counter in edx:eax" + "RDTSC\n" + : "=a" (lo), "=d" (hi) + : + : "%ebx", "%ecx" ); return (uint64_t)hi << 32 | lo; } diff --git a/c_src/wiredtiger-build.patch b/c_src/wiredtiger-build.patch index 8b0c1ac..cb619ff 100644 --- a/c_src/wiredtiger-build.patch +++ b/c_src/wiredtiger-build.patch @@ -10,3 +10,97 @@ index 6d78823..2122cf8 100644 +libwiredtiger_snappy_la_CFLAGS = -I$(src_builddir)/../../system/include +libwiredtiger_snappy_la_LDFLAGS = -avoid-version -module -L$(src_builddir)/../../system/lib -Wl,-rpath,lib/wterl-0.9.0/priv:lib/wterl/priv:priv libwiredtiger_snappy_la_LIBADD = -lsnappy +diff --git a/src/support/cksum.c b/src/support/cksum.c +index 7e9befe..b924db7 100644 +--- a/src/support/cksum.c ++++ b/src/support/cksum.c +@@ -27,6 +27,13 @@ + + #include "wt_internal.h" + ++#if defined(__amd64) || defined(__x86_64) ++#define USE_HARDWARE_CRC32 1 ++#else ++#undef USE_HARDWARE_CRC32 ++#endif ++ ++#ifdef USE_HARDWARE_CRC32 + static const uint32_t g_crc_slicing[8][256] = { + #ifdef WORDS_BIGENDIAN + /* +@@ -1078,6 +1085,7 @@ static const uint32_t g_crc_slicing[8][256] = { + } + #endif + }; ++#endif /* USE_HARDWARE_CRC32 */ + + /* + * __wt_cksum -- +@@ -1106,15 +1114,29 @@ __wt_cksum(const void *chunk, size_t len) + /* Checksum one byte at a time to the first 4B boundary. */ + for (p = chunk; + ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 && +- len > 0; ++p, --len) ++ len > 0; ++p, --len) { ++#ifdef USE_HARDWARE_CRC32 ++ __asm__ __volatile__( ++ ".byte 0xF2, 0x0F, 0x38, 0xF0, 0xF1" ++ : "=S" (crc) ++ : "0" (crc), "c" (*p)); ++#else + #ifdef WORDS_BIGENDIAN + crc = g_crc_slicing[0][((crc >> 24) ^ *p) & 0xFF] ^ (crc << 8); + #else + crc = g_crc_slicing[0][(crc ^ *p) & 0xFF] ^ (crc >> 8); + #endif ++#endif ++ } + + /* Checksum in 8B chunks. 
*/ + for (nqwords = len / sizeof(uint64_t); nqwords; nqwords--) { ++#ifdef USE_HARDWARE_CRC32 ++ __asm__ __volatile__ ( ++ ".byte 0xf2, 0x48, 0x0f, 0x38, 0xf0, 0xf1;" ++ : "=S"(crc) ++ : "S"(crc), "c"(*p)); ++#else + crc ^= *(uint32_t *)p; + p += sizeof(uint32_t); + next = *(uint32_t *)p; +@@ -1139,22 +1161,32 @@ __wt_cksum(const void *chunk, size_t len) + g_crc_slicing[1][(next >> 16) & 0xFF] ^ + g_crc_slicing[0][(next >> 24)]; + #endif ++#endif + } + + /* Checksum trailing bytes one byte at a time. */ ++ for (len &= 0x7; len > 0; ++p, len--) { ++#ifdef USE_HARDWARE_CRC32 ++ __asm__ __volatile__( ++ ".byte 0xF2, 0x0F, 0x38, 0xF0, 0xF1" ++ : "=S" (crc) ++ : "0" (crc), "c" (*p)); ++#else + #ifdef WORDS_BIGENDIAN +- for (len &= 0x7; len > 0; ++p, len--) + crc = g_crc_slicing[0][((crc >> 24) ^ *p) & 0xFF] ^ (crc << 8); ++#else ++ crc = g_crc_slicing[0][(crc ^ *p) & 0xFF] ^ (crc >> 8); ++#endif ++#endif ++ } + ++#ifdef WORDS_BIGENDIAN + /* Do final byte swap to produce a result identical to little endian */ + crc = + ((crc << 24) & 0xFF000000) | + ((crc << 8) & 0x00FF0000) | + ((crc >> 8) & 0x0000FF00) | + ((crc >> 24) & 0x000000FF); +-#else +- for (len &= 0x7; len > 0; ++p, len--) +- crc = g_crc_slicing[0][(crc ^ *p) & 0xFF] ^ (crc >> 8); + #endif + return (~crc); + } diff --git a/rebar.config b/rebar.config index 36d44a6..a1cf184 100644 --- a/rebar.config +++ b/rebar.config @@ -7,24 +7,24 @@ {eunit_opts, [verbose, {report, {eunit_surefire, [{dir, "."}]}}]}. -{erl_opts, [%{d,'DEBUG',true}, - debug_info, - fail_on_warning, - warn_unused_vars, - warn_export_all, - warn_shadow_vars, - warn_unused_import, - warn_unused_function, +{erl_opts, [ + {parse_transform, lager_transform}, + debug_info, %{d,'DEBUG',true}, + %strict_validation, + %fail_on_warning, + warn_missing_spec, warn_bif_clash, - warn_unused_record, warn_deprecated_function, - warn_obsolete_guard, + warn_export_all, warn_export_vars, warn_exported_vars, + warn_obsolete_guard, + warn_shadow_vars, warn_untyped_record, - {parse_transform, lager_transform} - %warn_missing_spec, - %strict_validation + warn_unused_function, + warn_unused_import, + warn_unused_record, + warn_unused_vars ]}. {xref_checks, [undefined_function_calls, deprecated_function_calls]}. diff --git a/src/wterl.erl b/src/wterl.erl index 15582e9..8fc79f2 100644 --- a/src/wterl.erl +++ b/src/wterl.erl @@ -455,6 +455,7 @@ config_to_bin([{Key, Value} | Rest], Acc) -> [{block_compressor, {string, quoted}}, {cache_size, string}, {checkpoint, config}, + {checksum, string}, {create, bool}, {direct_io, list}, {drop, list}, From b2c0b651148d2faeb3cc1c405c9bd3056fe90f7b Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Thu, 6 Jun 2013 15:16:50 -0400 Subject: [PATCH 09/30] Fixes for OS/X and whitespace cleanup. 
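
The printf format changes in this patch trade one platform's warning for another's: uint64_t is an unsigned long long on OS X but an unsigned long on LP64 Linux, so neither %lu nor %llu is clean everywhere. A small standalone sketch of the portable alternative using the C99 inttypes.h macros (shown for reference, not a change made by this patch):

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
    uint64_t counter = 42;
    /* PRIu64 expands to the right length modifier for this platform
       ("llu" on OS X, "lu" on LP64 Linux), so the same format string
       compiles without warnings on both. */
    printf("[%" PRIu64 "] checkpoint\n", counter);
    return 0;
}
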
--- c_src/duration.h | 44 ++++++++++++++++++++++++++++++++------------ c_src/stats.h | 4 ++-- c_src/wterl.c | 4 ++-- 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/c_src/duration.h b/c_src/duration.h index 587a694..fbc97cb 100644 --- a/c_src/duration.h +++ b/c_src/duration.h @@ -13,6 +13,28 @@ #include #include +#ifdef __MACH__ +#include +#include +#endif + + +void current_utc_time(struct timespec *ts) +{ +#ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time + clock_serv_t cclock; + mach_timespec_t mts; + host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); + clock_get_time(cclock, &mts); + mach_port_deallocate(mach_task_self(), cclock); + ts->tv_sec = mts.tv_sec; + ts->tv_nsec = mts.tv_nsec; +#else + clock_gettime(CLOCK_REALTIME, ts); +#endif + +} + typedef enum { ns = 0, mcs, ms, s } time_scale; struct scale_time { const char *abbreviation; @@ -28,9 +50,9 @@ static const struct scale_time scale[] = { static uint64_t ts(time_scale unit) { struct timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); + current_utc_time(&ts); return (((uint64_t)ts.tv_sec * scale[unit].mul) + - ((uint64_t)ts.tv_nsec / scale[unit].div)); + ((uint64_t)ts.tv_nsec / scale[unit].div)); } #if defined(__i386__) || defined(__x86_64__) @@ -45,11 +67,9 @@ static inline uint64_t cpu_clock_ticks() { uint32_t lo, hi; __asm__ __volatile__ ( - "; Flush the pipeline" - "XORL %%eax, %%eax\n" + "XORL %%eax, %%eax\n" /* Flush the pipeline */ "CPUID\n" - "; Get RDTSC counter in edx:eax" - "RDTSC\n" + "RDTSC\n" /* Get RDTSC counter in edx:eax */ : "=a" (lo), "=d" (hi) : : "%ebx", "%ecx" ); @@ -90,14 +110,14 @@ static inline uint64_t elapsed(duration_t *d) #define ELAPSED_DURING(result, resolution, block) \ do { \ - DURATION(__x, resolution); \ - do block while(0); \ - *result = elapsed(&__x); \ + DURATION(__x, resolution); \ + do block while(0); \ + *result = elapsed(&__x); \ } while(0); #define CYCLES_DURING(result, block) \ do { \ - uint64_t __begin = cpu_clock_ticks(); \ - do block while(0); \ - *result = cpu_clock_ticks() - __begin; \ + uint64_t __begin = cpu_clock_ticks(); \ + do block while(0); \ + *result = cpu_clock_ticks() - __begin; \ } while(0); diff --git a/c_src/stats.h b/c_src/stats.h index 2f465be..f44319b 100644 --- a/c_src/stats.h +++ b/c_src/stats.h @@ -152,7 +152,7 @@ static unsigned int __log2_64(uint64_t x) { fprintf(stderr, " ns μs ms s ks\n"); \ fprintf(stderr, "min: "); \ if (s->min < 1000) \ - fprintf(stderr, "%lu (ns)", s->min); \ + fprintf(stderr, "%llu (ns)", s->min); \ else if (s->min < 1000000) \ fprintf(stderr, "%.2f (μs)", s->min / 1000.0); \ else if (s->min < 1000000000) \ @@ -161,7 +161,7 @@ static unsigned int __log2_64(uint64_t x) { fprintf(stderr, "%.2f (s)", s->min / 1000000000.0); \ fprintf(stderr, " max: "); \ if (s->max < 1000) \ - fprintf(stderr, "%lu (ns)", s->max); \ + fprintf(stderr, "%llu (ns)", s->max); \ else if (s->max < 1000000) \ fprintf(stderr, "%.2f (μs)", s->max / 1000.0); \ else if (s->max < 1000000000) \ diff --git a/c_src/wterl.c b/c_src/wterl.c index fd21196..8756879 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -613,9 +613,9 @@ __wterl_progress_handler(WT_EVENT_HANDLER *handler, const char *operation, uint6 enif_make_int64(msg_env, counter))); enif_clear_env(msg_env); if (!enif_send(NULL, to_pid, msg_env, msg)) - fprintf(stderr, "[%ld] %s\n", counter, operation); + fprintf(stderr, "[%llu] %s\n", counter, operation); } else { - rc = (printf("[%ld] %s\n", counter, operation) >= 0 ? 
0 : EIO); + rc = (printf("[%llu] %s\n", counter, operation) >= 0 ? 0 : EIO); } enif_mutex_unlock(eh->progress_mutex); return rc; From 2a4b8ee7d22dac63acffd16f583db021436d8905 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Mon, 10 Jun 2013 14:31:59 -0400 Subject: [PATCH 10/30] WIP: simplify the cache from hash-of-lists to list; use a CAS() operation to protect the most-recently-used (mru) list. --- Makefile | 4 +- c_src/async_nif.h | 34 +++-- c_src/cas.h | 153 +++++++++++++++++++++ c_src/common.h | 6 + c_src/duration.h | 26 ++-- c_src/stats.h | 4 +- c_src/wterl.c | 330 +++++++++++++++++++++------------------------- rebar.config | 4 +- src/wterl.erl | 8 +- 9 files changed, 353 insertions(+), 216 deletions(-) create mode 100644 c_src/cas.h diff --git a/Makefile b/Makefile index e485507..0e2e357 100644 --- a/Makefile +++ b/Makefile @@ -84,9 +84,9 @@ repl: @$(ERL) -pa ebin -pz deps/lager/ebin eunit-repl: - @$(ERL) -pa .eunit deps/lager/ebin + @$(ERL) erl -pa .eunit -pz deps/lager/ebin -ERL_TOP= /home/gburd/eng/otp_R15B01 +ERL_TOP= /home/gburd/repos/otp_R15B01 CERL= ${ERL_TOP}/bin/cerl VALGRIND_MISC_FLAGS= "--verbose --leak-check=full --show-reachable=yes --trace-children=yes --track-origins=yes --suppressions=${ERL_TOP}/erts/emulator/valgrind/suppress.standard --show-possibly-lost=no --malloc-fill=AB --free-fill=CD" diff --git a/c_src/async_nif.h b/c_src/async_nif.h index 26d556f..d282bfb 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -47,6 +47,7 @@ struct async_nif_req_entry { void *args; void (*fn_work)(ErlNifEnv*, ERL_NIF_TERM, ErlNifPid*, unsigned int, void *); void (*fn_post)(void *); + const char *func; }; DECL_FIFO_QUEUE(reqs, struct async_nif_req_entry); @@ -92,7 +93,7 @@ struct async_nif_state { struct decl ## _args *args = &on_stack_args; \ struct decl ## _args *copy_of_args; \ struct async_nif_req_entry *req = NULL; \ - unsigned int affinity = 0; \ + unsigned int affinity = 0; \ ErlNifEnv *new_env = NULL; \ /* argv[0] is a ref used for selective recv */ \ const ERL_NIF_TERM *argv = argv_in + 1; \ @@ -104,13 +105,16 @@ struct async_nif_state { enif_make_atom(env, "shutdown")); \ req = async_nif_reuse_req(async_nif); \ new_env = req->env; \ - if (!req) \ - return enif_make_tuple2(env, enif_make_atom(env, "error"), \ - enif_make_atom(env, "eagain")); \ + if (!req) { \ + async_nif_recycle_req(req, async_nif); \ + return enif_make_tuple2(env, enif_make_atom(env, "error"), \ + enif_make_atom(env, "eagain")); \ + } \ do pre_block while(0); \ copy_of_args = (struct decl ## _args *)enif_alloc(sizeof(struct decl ## _args)); \ if (!copy_of_args) { \ fn_post_ ## decl (args); \ + async_nif_recycle_req(req, async_nif); \ return enif_make_tuple2(env, enif_make_atom(env, "error"), \ enif_make_atom(env, "enomem")); \ } \ @@ -120,12 +124,14 @@ struct async_nif_state { req->args = (void*)copy_of_args; \ req->fn_work = (void (*)(ErlNifEnv *, ERL_NIF_TERM, ErlNifPid*, unsigned int, void *))fn_work_ ## decl ; \ req->fn_post = (void (*)(void *))fn_post_ ## decl; \ + req->func = __func__; \ int h = -1; \ if (affinity) \ h = affinity % async_nif->num_queues; \ ERL_NIF_TERM reply = async_nif_enqueue_req(async_nif, req, h); \ if (!reply) { \ fn_post_ ## decl (args); \ + async_nif_recycle_req(req, async_nif); \ enif_free(copy_of_args); \ return enif_make_tuple2(env, enif_make_atom(env, "error"), \ enif_make_atom(env, "shutdown")); \ @@ -212,8 +218,13 @@ async_nif_reuse_req(struct async_nif_state *async_nif) void async_nif_recycle_req(struct async_nif_req_entry *req, struct async_nif_state 
*async_nif) { + ErlNifEnv *env = NULL; STAT_TOCK(async_nif, qwait); enif_mutex_lock(async_nif->recycled_req_mutex); + env = req->env; + enif_clear_env(env); + memset(req, 0, sizeof(struct async_nif_req_entry)); + req->env = env; fifo_q_put(reqs, async_nif->recycled_reqs, req); enif_mutex_unlock(async_nif->recycled_req_mutex); } @@ -257,13 +268,13 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en return 0; } if (fifo_q_size(reqs, q->reqs) > async_nif->num_queues) { - double await = STAT_MEAN_LOG2_SAMPLE(async_nif, qwait); - double await_inthisq = STAT_MEAN_LOG2_SAMPLE(q, qwait); - if (fifo_q_full(reqs, q->reqs) || await_inthisq > await) { - enif_mutex_unlock(q->reqs_mutex); - qid = (qid + 1) % async_nif->num_queues; - q = &async_nif->queues[qid]; - } + double await = STAT_MEAN_LOG2_SAMPLE(async_nif, qwait); + double await_inthisq = STAT_MEAN_LOG2_SAMPLE(q, qwait); + if (fifo_q_full(reqs, q->reqs) || await_inthisq > await) { + enif_mutex_unlock(q->reqs_mutex); + qid = (qid + 1) % async_nif->num_queues; + q = &async_nif->queues[qid]; + } } else { break; } @@ -335,7 +346,6 @@ async_nif_worker_fn(void *arg) req->fn_post = 0; enif_free(req->args); req->args = NULL; - enif_clear_env(req->env); async_nif_recycle_req(req, async_nif); req = NULL; } diff --git a/c_src/cas.h b/c_src/cas.h new file mode 100644 index 0000000..ea81dbf --- /dev/null +++ b/c_src/cas.h @@ -0,0 +1,153 @@ +/* + * wterl: an Erlang NIF for WiredTiger + * + * Copyright (c) 2012-2013 Basho Technologies, Inc. All Rights Reserved. + * + * This file is provided to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + */ + +/* + * Most of the following source code is copied directly from: "The Lock-Free + * Library" (http://www.cl.cam.ac.uk/research/srg/netos/lock-free/) reused and + * redistrubuted in accordance with their license: + * + * Copyright (c) 2002-2003 K A Fraser, All Rights Reserved. + * + * * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CAS_H_ +#define __CAS_H_ + +#define CACHE_LINE_SIZE 64 + +#define ATOMIC_ADD_TO(_v,_x) \ +do { \ + int __val = (_v), __newval; \ + while ( (__newval = CASIO(&(_v),__val,__val+(_x))) != __val ) \ + __val = __newval; \ +} while ( 0 ) + +#define ATOMIC_SET_TO(_v,_x) \ +do { \ + int __val = (_v), __newval; \ + while ( (__newval = CASIO(&(_v),__val,__val=(_x))) != __val ) \ + __val = __newval; \ +} while ( 0 ) + +#define ALIGNED_ENIF_ALLOC(_s) \ + ((void *)(((unsigned long)enif_alloc((_s)+CACHE_LINE_SIZE*2) + \ + CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE-1))) \ + +/* + * I. Compare-and-swap. + */ + +/* + * This is a strong barrier! Reads cannot be delayed beyond a later store. + * Reads cannot be hoisted beyond a LOCK prefix. Stores always in-order. + */ +#define CAS(_a, _o, _n) \ +({ __typeof__(_o) __o = _o; \ + __asm__ __volatile__( \ + "lock cmpxchg %3,%1" \ + : "=a" (__o), "=m" (*(volatile unsigned int *)(_a)) \ + : "0" (__o), "r" (_n) ); \ + __o; \ +}) + +#define FAS(_a, _n) \ +({ __typeof__(_n) __o; \ + __asm__ __volatile__( \ + "lock xchg %0,%1" \ + : "=r" (__o), "=m" (*(volatile unsigned int *)(_a)) \ + : "0" (_n) ); \ + __o; \ +}) + +#define CAS64(_a, _o, _n) \ +({ __typeof__(_o) __o = _o; \ + __asm__ __volatile__( \ + "movl %3, %%ecx;" \ + "movl %4, %%ebx;" \ + "lock cmpxchg8b %1" \ + : "=A" (__o), "=m" (*(volatile unsigned long long *)(_a)) \ + : "0" (__o), "m" (_n >> 32), "m" (_n) \ + : "ebx", "ecx" ); \ + __o; \ +}) + +/* Update Integer location, return Old value. */ +#define CASIO CAS +#define FASIO FAS +/* Update Pointer location, return Old value. */ +#define CASPO CAS +#define FASPO FAS +/* Update 32/64-bit location, return Old value. */ +#define CAS32O CAS +#define CAS64O CAS64 + +/* + * II. Memory barriers. + * WMB(): All preceding write operations must commit before any later writes. + * RMB(): All preceding read operations must commit before any later reads. + * MB(): All preceding memory accesses must commit before any later accesses. + * + * If the compiler does not observe these barriers (but any sane compiler + * will!), then VOLATILE should be defined as 'volatile'. + */ + +#define MB() __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory") +#define WMB() __asm__ __volatile__ ("" : : : "memory") +#define RMB() MB() +#define VOLATILE /*volatile*/ + +/* On Intel, CAS is a strong barrier, but not a compile barrier. */ +#define RMB_NEAR_CAS() WMB() +#define WMB_NEAR_CAS() WMB() +#define MB_NEAR_CAS() WMB() + + +/* + * III. Cycle counter access. 
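+ *
+ * (Editor's note, an addition to the original patch.) RDTICK() below
+ * relies on the "=A" asm constraint, which pairs EDX:EAX only on
+ * 32-bit x86; for a 64-bit operand on x86-64 the pairing is not
+ * honored, so the high half of the RDTSC result can be lost. A sketch
+ * that is safe on both targets, mirroring cpu_clock_ticks() in
+ * duration.h:
+ *
+ *     uint32_t lo, hi;
+ *     __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+ *     tick_t t = ((tick_t)hi << 32) | lo;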
+ */ + +typedef unsigned long long tick_t; +#define RDTICK() \ + ({ tick_t __t; __asm__ __volatile__ ("rdtsc" : "=A" (__t)); __t; }) + +#endif /* __CAS_H_ */ diff --git a/c_src/common.h b/c_src/common.h index 42ac5e0..3364573 100644 --- a/c_src/common.h +++ b/c_src/common.h @@ -53,6 +53,12 @@ extern "C" { } while (0) #endif +#ifdef __APPLE__ +#define PRIuint64(x) (x) +#else +#define PRIuint64(x) (unsigned long long)(x) +#endif + #if defined(__cplusplus) } #endif diff --git a/c_src/duration.h b/c_src/duration.h index fbc97cb..6c05df0 100644 --- a/c_src/duration.h +++ b/c_src/duration.h @@ -52,7 +52,7 @@ static uint64_t ts(time_scale unit) struct timespec ts; current_utc_time(&ts); return (((uint64_t)ts.tv_sec * scale[unit].mul) + - ((uint64_t)ts.tv_nsec / scale[unit].div)); + ((uint64_t)ts.tv_nsec / scale[unit].div)); } #if defined(__i386__) || defined(__x86_64__) @@ -67,12 +67,12 @@ static inline uint64_t cpu_clock_ticks() { uint32_t lo, hi; __asm__ __volatile__ ( - "XORL %%eax, %%eax\n" /* Flush the pipeline */ - "CPUID\n" - "RDTSC\n" /* Get RDTSC counter in edx:eax */ - : "=a" (lo), "=d" (hi) - : - : "%ebx", "%ecx" ); + "XORL %%eax, %%eax\n" /* Flush the pipeline */ + "CPUID\n" + "RDTSC\n" /* Get RDTSC counter in edx:eax */ + : "=a" (lo), "=d" (hi) + : + : "%ebx", "%ecx" ); return (uint64_t)hi << 32 | lo; } @@ -110,14 +110,14 @@ static inline uint64_t elapsed(duration_t *d) #define ELAPSED_DURING(result, resolution, block) \ do { \ - DURATION(__x, resolution); \ - do block while(0); \ - *result = elapsed(&__x); \ + DURATION(__x, resolution); \ + do block while(0); \ + *result = elapsed(&__x); \ } while(0); #define CYCLES_DURING(result, block) \ do { \ - uint64_t __begin = cpu_clock_ticks(); \ - do block while(0); \ - *result = cpu_clock_ticks() - __begin; \ + uint64_t __begin = cpu_clock_ticks(); \ + do block while(0); \ + *result = cpu_clock_ticks() - __begin; \ } while(0); diff --git a/c_src/stats.h b/c_src/stats.h index f44319b..12f5d21 100644 --- a/c_src/stats.h +++ b/c_src/stats.h @@ -152,7 +152,7 @@ static unsigned int __log2_64(uint64_t x) { fprintf(stderr, " ns μs ms s ks\n"); \ fprintf(stderr, "min: "); \ if (s->min < 1000) \ - fprintf(stderr, "%llu (ns)", s->min); \ + fprintf(stderr, "%llu (ns)", PRIuint64(s->min)); \ else if (s->min < 1000000) \ fprintf(stderr, "%.2f (μs)", s->min / 1000.0); \ else if (s->min < 1000000000) \ @@ -161,7 +161,7 @@ static unsigned int __log2_64(uint64_t x) { fprintf(stderr, "%.2f (s)", s->min / 1000000000.0); \ fprintf(stderr, " max: "); \ if (s->max < 1000) \ - fprintf(stderr, "%llu (ns)", s->max); \ + fprintf(stderr, "%llu (ns)", PRIuint64(s->max)); \ else if (s->max < 1000000) \ fprintf(stderr, "%.2f (μs)", s->max / 1000.0); \ else if (s->max < 1000000000) \ diff --git a/c_src/wterl.c b/c_src/wterl.c index 8756879..4e26ed9 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -16,6 +16,7 @@ * under the License. 
* */ + #include "erl_nif.h" #include "erl_driver.h" @@ -29,8 +30,8 @@ #include "wiredtiger.h" #include "stats.h" #include "async_nif.h" -#include "khash.h" #include "queue.h" +#include "cas.h" #define MAX_CACHE_SIZE ASYNC_NIF_MAX_WORKERS @@ -40,27 +41,20 @@ static ErlNifResourceType *wterl_cursor_RESOURCE; typedef char Uri[128]; struct wterl_ctx { - SLIST_ENTRY(wterl_ctx) entries; + STAILQ_ENTRY(wterl_ctx) entries; uint64_t sig; uint64_t tstamp; WT_SESSION *session; WT_CURSOR *cursors[]; // Note: must be last in struct }; -struct cache_entry { - uint64_t sig; - SLIST_HEAD(ctxs, wterl_ctx) contexts; -}; - -KHASH_MAP_INIT_INT64(cache_entries, struct cache_entry*); - typedef struct wterl_conn { WT_CONNECTION *conn; const char *session_config; + STAILQ_HEAD(ctxs, wterl_ctx) cache; ErlNifMutex *cache_mutex; - khash_t(cache_entries) *cache; - uint32_t num_ctx_in_cache; - struct wterl_ctx *last_ctx_used[ASYNC_NIF_MAX_WORKERS]; + uint32_t cache_size; + struct wterl_ctx *mru_ctx[ASYNC_NIF_MAX_WORKERS]; SLIST_ENTRY(wterl_conn) conns; uint64_t histogram[64]; uint64_t histogram_count; @@ -164,22 +158,19 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) { uint32_t mean, log, num_evicted, i; uint64_t now, elapsed; - khash_t(cache_entries) *h = conn_handle->cache; - khiter_t itr; - struct cache_entry *e; struct wterl_ctx *c, *n; - if (conn_handle->num_ctx_in_cache != MAX_CACHE_SIZE) - return 0; + if (conn_handle->cache_size != MAX_CACHE_SIZE) + return 0; now = cpu_clock_ticks(); // Find the mean of the recorded times that items stayed in cache. mean = 0; for (i = 0; i < 64; i++) - mean += (conn_handle->histogram[i] * i); + mean += (conn_handle->histogram[i] * i); if (mean > 0) - mean /= conn_handle->histogram_count; + mean /= conn_handle->histogram_count; // Clear out the histogram and hit/misses memset(conn_handle->histogram, 0, sizeof(uint64_t) * 64); @@ -190,30 +181,20 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) * items from the lists stored in the tree. 
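 *
 * (Editor's gloss, not part of the original patch.) The test below
 * works in log2 space: each cache hit records its residency time, as
 * __log2(cpu_clock_ticks() - c->tstamp), into a 64-bucket histogram,
 * and mean is the average occupied bucket; a context whose idle time
 * lands in a bucket above that mean is closed and freed. Note that
 * elapsed = c->tstamp - now inverts the operands relative to the
 * histogram code; being unsigned it underflows, which would make
 * nearly every cached context look old enough to evict.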
*/ num_evicted = 0; - for (itr = kh_begin(h); itr != kh_end(h); ++itr) { - if (kh_exist(h, itr)) { - e = kh_val(h, itr); - c = SLIST_FIRST(&e->contexts); - while (c != NULL) { - n = SLIST_NEXT(c, entries); - elapsed = c->tstamp - now; - log = __log2(elapsed); - if (log > mean) { - SLIST_REMOVE(&e->contexts, c, wterl_ctx, entries); - c->session->close(c->session, NULL); - enif_free(c); - num_evicted++; - } - c = n; - } - if (SLIST_EMPTY(&e->contexts)) { - kh_del(cache_entries, h, itr); - enif_free(e); - kh_value(h, itr) = NULL; - } - } + c = STAILQ_FIRST(&conn_handle->cache); + while (c != NULL) { + n = STAILQ_NEXT(c, entries); + elapsed = c->tstamp - now; + log = __log2(elapsed); + if (log > mean) { + STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries); + c->session->close(c->session, NULL); + enif_free(c); + num_evicted++; + } + c = n; } - conn_handle->num_ctx_in_cache -= num_evicted; + conn_handle->cache_size -= num_evicted; return num_evicted; } @@ -229,29 +210,20 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) static struct wterl_ctx * __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig) { - struct wterl_ctx *c = NULL; - struct cache_entry *e; - khash_t(cache_entries) *h; - khiter_t itr; + struct wterl_ctx *c, *n; - h = conn_handle->cache; enif_mutex_lock(conn_handle->cache_mutex); - if (conn_handle->num_ctx_in_cache > 0) { - itr = kh_get(cache_entries, h, sig); - if (itr != kh_end(h)) { - e = kh_value(h, itr); - if (!SLIST_EMPTY(&e->contexts)) { - /* - * cache hit: - * remove a context from the list in the tree node - */ - c = SLIST_FIRST(&e->contexts); - SLIST_REMOVE_HEAD(&e->contexts, entries); - conn_handle->histogram[__log2(cpu_clock_ticks() - c->tstamp)]++; - conn_handle->histogram_count++; - conn_handle->num_ctx_in_cache -= 1; - } - } + c = STAILQ_FIRST(&conn_handle->cache); + while (c != NULL) { + n = STAILQ_NEXT(c, entries); + if (c->sig == sig) { + // cache hit: + STAILQ_REMOVE_HEAD(&conn_handle->cache, entries); + conn_handle->histogram[__log2(cpu_clock_ticks() - c->tstamp)]++; + conn_handle->histogram_count++; + conn_handle->cache_size -= 1; + } + c = n; } enif_mutex_unlock(conn_handle->cache_mutex); return c; @@ -266,26 +238,11 @@ __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig) static void __ctx_cache_add(WterlConnHandle *conn_handle, struct wterl_ctx *c) { - struct cache_entry *e; - khash_t(cache_entries) *h; - khiter_t itr; - int itr_status; - enif_mutex_lock(conn_handle->cache_mutex); __ctx_cache_evict(conn_handle); c->tstamp = cpu_clock_ticks(); - h = conn_handle->cache; - itr = kh_get(cache_entries, h, c->sig); - if (itr == kh_end(h)) { - e = enif_alloc(sizeof(struct cache_entry)); // TODO: enomem - memset(e, 0, sizeof(struct cache_entry)); - SLIST_INIT(&e->contexts); - itr = kh_put(cache_entries, h, c->sig, &itr_status); - kh_value(h, itr) = e; - } - e = kh_value(h, itr); - SLIST_INSERT_HEAD(&e->contexts, c, entries); - conn_handle->num_ctx_in_cache += 1; + STAILQ_INSERT_TAIL(&conn_handle->cache, c, entries); + conn_handle->cache_size += 1; enif_mutex_unlock(conn_handle->cache_mutex); } @@ -357,14 +314,14 @@ __ctx_cache_sig(const char *c, va_list ap, int count) const char *arg; if (c) - h = __str_hash(c); + h = __str_hash(c); else - h = 0; + h = 0; for (i = 0; i < (2 * count); i++) { - arg = va_arg(ap, const char *); - if (arg) h = __zi((uint32_t)(h & 0xFFFFFFFF), __str_hash(arg)); - else h = __zi((uint32_t)(h & 0xFFFFFFFF), 0); + arg = va_arg(ap, const char *); + if (arg) h = __zi((uint32_t)(h & 0xFFFFFFFF), 
__str_hash(arg)); + else h = __zi((uint32_t)(h & 0xFFFFFFFF), 0); } return h; } @@ -375,62 +332,76 @@ __ctx_cache_sig(const char *c, va_list ap, int count) */ static int __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, - struct wterl_ctx **ctx, - int count, const char *session_config, ...) + struct wterl_ctx **ctx, + int count, const char *session_config, ...) { int i = 0; va_list ap; uint64_t sig; const char *arg; + struct wterl_ctx *c; arg = session_config; va_start(ap, session_config); sig = __ctx_cache_sig(session_config, ap, count); va_end(ap); - DPRINTF("worker: %u cache size: %u", worker_id, conn_handle->num_ctx_in_cache); - if (conn_handle->last_ctx_used[worker_id] != NULL && - conn_handle->last_ctx_used[worker_id]->sig == sig) { - (*ctx) = conn_handle->last_ctx_used[worker_id]; - DPRINTF("worker: %u reuse hit: %lu %p", worker_id, sig, *ctx); - } else { - if (conn_handle->last_ctx_used[worker_id] != NULL) - __ctx_cache_add(conn_handle, conn_handle->last_ctx_used[worker_id]); - conn_handle->last_ctx_used[worker_id] = NULL; - (*ctx) = __ctx_cache_find(conn_handle, sig); - if ((*ctx) == NULL) { - // cache miss - DPRINTF("worker: %u cache miss: %lu", worker_id, sig); - WT_CONNECTION *conn = conn_handle->conn; - WT_SESSION *session = NULL; - int rc = conn->open_session(conn, NULL, session_config, &session); - if (rc != 0) - return rc; - size_t s = sizeof(struct wterl_ctx) + (count * sizeof(WT_CURSOR*)); - *ctx = enif_alloc(s); // TODO: enif_alloc_resource() - if (*ctx == NULL) { - session->close(session, NULL); - return ENOMEM; - } - memset(*ctx, 0, s); - (*ctx)->sig = sig; - (*ctx)->session = session; - session_config = arg; - va_start(ap, session_config); - for (i = 0; i < count; i++) { - const char *uri = va_arg(ap, const char *); - const char *config = va_arg(ap, const char *); - // TODO: error when uri or config is NULL - rc = session->open_cursor(session, uri, NULL, config, &(*ctx)->cursors[i]); - if (rc != 0) { - session->close(session, NULL); // this will free the cursors too - return rc; - } - } - va_end(ap); - } else { // else { cache hit } - DPRINTF("worker: %u cache hit: %lu %p", worker_id, sig, *ctx); - } + do { + c = conn_handle->mru_ctx[worker_id]; + if (CASPO(&conn_handle->mru_ctx[worker_id], c, NULL) != c) { + if (c == NULL) { + // mru miss: + *ctx = NULL; + } else { + if (c->sig == sig) { + // mru hit: + *ctx = c; + } else { + // mru missmatch: + __ctx_cache_add(conn_handle, c); + *ctx = NULL; + } + } + } else { + // CAS failed, retry... 
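+ // (Editor's note, not part of the original patch.) In C, a
+ // continue inside do { ... } while(0) jumps to the controlling
+ // expression, which is 0, so the loop exits instead of retrying;
+ // the retry promised above never happens. Patch 13 in this series
+ // reworks the loop into do { ... } while(i--) for this reason.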
+ continue; + } + } while(0); + + if (*ctx == NULL) { + // check the cache + (*ctx) = __ctx_cache_find(conn_handle, sig); + if ((*ctx) == NULL) { + // cache miss: + WT_CONNECTION *conn = conn_handle->conn; + WT_SESSION *session = NULL; + int rc = conn->open_session(conn, NULL, session_config, &session); + if (rc != 0) { + return rc; + } + size_t s = sizeof(struct wterl_ctx) + (count * sizeof(WT_CURSOR*)); + *ctx = enif_alloc(s); // TODO: enif_alloc_resource() + if (*ctx == NULL) { + session->close(session, NULL); + return ENOMEM; + } + memset(*ctx, 0, s); + (*ctx)->sig = sig; + (*ctx)->session = session; + session_config = arg; + va_start(ap, session_config); + for (i = 0; i < count; i++) { + const char *uri = va_arg(ap, const char *); + const char *config = va_arg(ap, const char *); + // TODO: error when uri or config is NULL + rc = session->open_cursor(session, uri, NULL, config, &(*ctx)->cursors[i]); + if (rc != 0) { + session->close(session, NULL); // this will free the cursors too + return rc; + } + } + va_end(ap); + } // else { cache hit } } return 0; } @@ -443,16 +414,17 @@ __release_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx { int i, n; WT_CURSOR *cursor; + struct wterl_ctx *c; - DPRINTF("worker: %u cache size: %u", worker_id, conn_handle->num_ctx_in_cache); n = sizeof((WT_CURSOR**)ctx->cursors) / sizeof(ctx->cursors[0]); for (i = 0; i < n; i++) { - cursor = ctx->cursors[i]; - cursor->reset(cursor); + cursor = ctx->cursors[i]; + cursor->reset(cursor); } - assert(conn_handle->last_ctx_used[worker_id] == 0 || - conn_handle->last_ctx_used[worker_id] == ctx); - conn_handle->last_ctx_used[worker_id] = ctx; + + do { + c = conn_handle->mru_ctx[worker_id]; + } while(CASPO(&conn_handle->mru_ctx[worker_id], c, ctx) != c); } /** @@ -463,34 +435,31 @@ __release_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx void __close_all_sessions(WterlConnHandle *conn_handle) { - khash_t(cache_entries) *h = conn_handle->cache; - struct cache_entry *e; - struct wterl_ctx *c; - int i; + struct wterl_ctx *c, *n; + int worker_id; - for (i = 0; i < ASYNC_NIF_MAX_WORKERS; i++) { - c = conn_handle->last_ctx_used[i]; - if (c) { - c->session->close(c->session, NULL); - enif_free(c); - conn_handle->last_ctx_used[i] = NULL; - } + // clear out the mru + for (worker_id = 0; worker_id < ASYNC_NIF_MAX_WORKERS; worker_id++) { + do { + c = conn_handle->mru_ctx[worker_id]; + } while(CASPO(&conn_handle->mru_ctx[worker_id], c, NULL) != c); + + if (c != NULL) { + c->session->close(c->session, NULL); + enif_free(c); + } } - khiter_t itr; - for (itr = kh_begin(h); itr != kh_end(h); ++itr) { - if (kh_exist(h, itr)) { - e = kh_val(h, itr); - while ((c = SLIST_FIRST(&e->contexts)) != NULL) { - SLIST_REMOVE(&e->contexts, c, wterl_ctx, entries); - c->session->close(c->session, NULL); - enif_free(c); - } - kh_del(cache_entries, h, itr); - enif_free(e); - kh_value(h, itr) = NULL; - } + + // clear out the cache + c = STAILQ_FIRST(&conn_handle->cache); + while (c != NULL) { + n = STAILQ_NEXT(c, entries); + STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries); + conn_handle->cache_size -= 1; + c->session->close(c->session, NULL); + enif_free(c); + c = n; } - conn_handle->num_ctx_in_cache = 0; } /** @@ -502,8 +471,8 @@ void __close_cursors_on(WterlConnHandle *conn_handle, const char *uri) { UNUSED(uri); - // TODO: find a way to only close those session/cursor* open on uri __close_all_sessions(conn_handle); + return; } /** @@ -613,9 +582,9 @@ __wterl_progress_handler(WT_EVENT_HANDLER 
*handler, const char *operation, uint6 enif_make_int64(msg_env, counter))); enif_clear_env(msg_env); if (!enif_send(NULL, to_pid, msg_env, msg)) - fprintf(stderr, "[%llu] %s\n", counter, operation); + fprintf(stderr, "[%llu] %s\n", PRIuint64(counter), operation); } else { - rc = (printf("[%llu] %s\n", counter, operation) >= 0 ? 0 : EIO); + rc = (printf("[%llu] %s\n", PRIuint64(counter), operation) >= 0 ? 0 : EIO); } enif_mutex_unlock(eh->progress_mutex); return rc; @@ -693,7 +662,7 @@ ASYNC_NIF_DECL( int rc = wiredtiger_open(args->homedir, (WT_EVENT_HANDLER*)&args->priv->eh.handlers, (config.size > 1) ? (const char *)config.data : NULL, - &conn); + &conn); if (rc == 0) { WterlConnHandle *conn_handle = enif_alloc_resource(wterl_conn_RESOURCE, sizeof(WterlConnHandle)); memset(conn_handle, 0, sizeof(WterlConnHandle)); @@ -719,8 +688,8 @@ ASYNC_NIF_DECL( ERL_NIF_TERM result = enif_make_resource(env, conn_handle); /* Init hash table which manages the cache of session/cursor(s) */ - conn_handle->cache = kh_init(cache_entries); - conn_handle->num_ctx_in_cache = 0; + STAILQ_INIT(&conn_handle->cache); + conn_handle->cache_size = 0; /* Keep track of open connections so as to free when unload/reload/etc. are called. */ @@ -1416,8 +1385,8 @@ ASYNC_NIF_DECL( struct wterl_ctx *ctx = NULL; WT_CURSOR *cursor = NULL; int rc = __retain_ctx(args->conn_handle, worker_id, &ctx, 1, - args->conn_handle->session_config, - args->uri, "overwrite,raw"); + args->conn_handle->session_config, + args->uri, "overwrite,raw"); if (rc != 0) { ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; @@ -1475,8 +1444,8 @@ ASYNC_NIF_DECL( struct wterl_ctx *ctx = NULL; WT_CURSOR *cursor = NULL; int rc = __retain_ctx(args->conn_handle, worker_id, &ctx, 1, - args->conn_handle->session_config, - args->uri, "overwrite,raw"); + args->conn_handle->session_config, + args->uri, "overwrite,raw"); if (rc != 0) { ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; @@ -1557,8 +1526,8 @@ ASYNC_NIF_DECL( struct wterl_ctx *ctx = NULL; WT_CURSOR *cursor = NULL; int rc = __retain_ctx(args->conn_handle, worker_id, &ctx, 1, - args->conn_handle->session_config, - args->uri, "overwrite,raw"); + args->conn_handle->session_config, + args->uri, "overwrite,raw"); if (rc != 0) { ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; @@ -2370,19 +2339,18 @@ on_unload(ErlNifEnv *env, void *priv_data) /* Lock the cache mutex before unloading the async_nif to prevent new work from coming in while shutting down. 
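 *
 * (Editor's gloss.) Acquiring every connection's cache_mutex first
 * forces any worker still draining a queue to block in the context
 * cache rather than touch state that is being torn down; only then is
 * the async_nif machinery unloaded and each cache freed.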
*/ SLIST_FOREACH(conn_handle, &priv->conns, conns) { - enif_mutex_lock(conn_handle->cache_mutex); + enif_mutex_lock(conn_handle->cache_mutex); } ASYNC_NIF_UNLOAD(wterl, env, priv->async_nif_priv); SLIST_FOREACH(conn_handle, &priv->conns, conns) { - __close_all_sessions(conn_handle); - conn_handle->conn->close(conn_handle->conn, NULL); - kh_destroy(cache_entries, conn_handle->cache); - if (conn_handle->session_config) - enif_free((void*)conn_handle->session_config); - enif_mutex_unlock(conn_handle->cache_mutex); - enif_mutex_destroy(conn_handle->cache_mutex); + __close_all_sessions(conn_handle); + conn_handle->conn->close(conn_handle->conn, NULL); + if (conn_handle->session_config) + enif_free((void*)conn_handle->session_config); + enif_mutex_unlock(conn_handle->cache_mutex); + enif_mutex_destroy(conn_handle->cache_mutex); } /* At this point all WiredTiger state and threads are free'd/stopped so there diff --git a/rebar.config b/rebar.config index a1cf184..46f0af2 100644 --- a/rebar.config +++ b/rebar.config @@ -12,7 +12,7 @@ debug_info, %{d,'DEBUG',true}, %strict_validation, %fail_on_warning, - warn_missing_spec, + %warn_missing_spec, warn_bif_clash, warn_deprecated_function, warn_export_all, @@ -22,7 +22,7 @@ warn_shadow_vars, warn_untyped_record, warn_unused_function, - warn_unused_import, + %warn_unused_import, warn_unused_record, warn_unused_vars ]}. diff --git a/src/wterl.erl b/src/wterl.erl index 8fc79f2..4dc5b79 100644 --- a/src/wterl.erl +++ b/src/wterl.erl @@ -96,8 +96,8 @@ nif_stub_error(Line) -> -spec init() -> ok | {error, any()}. init() -> erlang:load_nif(filename:join([priv_dir(), atom_to_list(?MODULE)]), - [{wterl_vsn, "f1b7d8322da904a3385b97456819afd63ff41afe"}, - {wiredtiger_vsn, "1.6.1-a06b59e47db7b120575049bd7d6314df53e78e54"}]). + [{wterl_vsn, "b2c0b65"}, + {wiredtiger_vsn, "1.6.1-87-gbe6742a"}]). -spec connection_open(string(), config_list()) -> {ok, connection()} | {error, term()}. -spec connection_open(string(), config_list(), config_list()) -> {ok, connection()} | {error, term()}. @@ -618,7 +618,7 @@ various_online_test_() -> end}, {"truncate entire table", fun() -> - ?assertMatch(ok, truncate(ConnRef, "table:test")), + ?assertMatch(ok, truncate(ConnRef, "table:test")), ?assertMatch(not_found, get(ConnRef, "table:test", <<"a">>)) end}, %% {"truncate range [<>..last], ensure value outside range is found after", @@ -863,7 +863,7 @@ prop_put_delete() -> DataDir = "test/wterl.putdelete.qc", Table = "table:eqc", {ok, CWD} = file:get_cwd(), - rmdir(filename:join([CWD, DataDir])), % ?cmd("rm -rf " ++ filename:join([CWD, DataDir])), + rmdir:path(filename:join([CWD, DataDir])), % ?cmd("rm -rf " ++ filename:join([CWD, DataDir])), ok = filelib:ensure_dir(filename:join([DataDir, "x"])), {ok, ConnRef} = wterl:connection_open(DataDir, [{create,true}]), try From 8ea866bf202dfaf01231058f77a6297438c42ac9 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Mon, 10 Jun 2013 16:18:55 -0400 Subject: [PATCH 11/30] Logic inversion on CAS() operation. 
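CASPO() returns the value that was in the slot before the swap, so the
swap succeeded exactly when that return value equals the expected old
value; the inverted test treated inequality as success and took the
hit/miss paths only when the claim had actually failed. A sketch of
the intended idiom (editor's addition, using the CASPO macro from
cas.h):

    struct wterl_ctx *c = conn_handle->mru_ctx[worker_id];
    if (CASPO(&conn_handle->mru_ctx[worker_id], c, NULL) == c) {
        /* Swap succeeded: this thread privately owns c, which may
           legitimately be NULL (an empty slot). */
    } else {
        /* Another thread won the race; c is stale, re-read it. */
    }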
--- c_src/wterl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_src/wterl.c b/c_src/wterl.c index 4e26ed9..20bbee6 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -348,7 +348,7 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, do { c = conn_handle->mru_ctx[worker_id]; - if (CASPO(&conn_handle->mru_ctx[worker_id], c, NULL) != c) { + if (CASPO(&conn_handle->mru_ctx[worker_id], c, NULL) == c) { if (c == NULL) { // mru miss: *ctx = NULL; From 110b4829622cc8715ac8556060de3f111048001c Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Tue, 11 Jun 2013 12:13:06 -0400 Subject: [PATCH 12/30] Some paranoia and a few fixes --- c_src/async_nif.h | 8 +++++++- c_src/wterl.c | 22 +++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/c_src/async_nif.h b/c_src/async_nif.h index d282bfb..f45e9cc 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -375,7 +375,13 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) q = &async_nif->queues[i]; enif_mutex_lock(q->reqs_mutex); } + + /* Set the shutdown flag so that worker threads will no continue + executing requests. */ async_nif->shutdown = 1; + + /* Make sure to wake up all worker threads sitting on conditional + wait for work so that they can see it's time to exit. */ for (i = 0; i < num_queues; i++) { q = &async_nif->queues[i]; enif_cond_broadcast(q->reqs_cnd); @@ -388,7 +394,7 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) enif_thread_join(async_nif->worker_entries[i].tid, &exit_value); } - /* Free req structres sitting on the recycle queue. */ + /* Free any req structures sitting unused on the recycle queue. */ enif_mutex_lock(async_nif->recycled_req_mutex); req = NULL; fifo_q_foreach(reqs, async_nif->recycled_reqs, req, { diff --git a/c_src/wterl.c b/c_src/wterl.c index 20bbee6..4c1b02c 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -189,6 +189,7 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) if (log > mean) { STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries); c->session->close(c->session, NULL); + memset(c, 0, sizeof(struct wterl_ctx)); enif_free(c); num_evicted++; } @@ -446,6 +447,7 @@ __close_all_sessions(WterlConnHandle *conn_handle) if (c != NULL) { c->session->close(c->session, NULL); + memset(c, 0, sizeof(struct wterl_ctx)); enif_free(c); } } @@ -457,6 +459,7 @@ __close_all_sessions(WterlConnHandle *conn_handle) STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries); conn_handle->cache_size -= 1; c->session->close(c->session, NULL); + memset(c, 0, sizeof(struct wterl_ctx)); enif_free(c); c = n; } @@ -2308,6 +2311,7 @@ on_load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) ASYNC_NIF_LOAD(wterl, priv->async_nif_priv); if (!priv->async_nif_priv) { enif_mutex_destroy(priv->conns_mutex); + memset(priv, 0, sizeof(struct wterl_priv_data)); enif_free(priv); return ENOMEM; } @@ -2334,21 +2338,18 @@ on_unload(ErlNifEnv *env, void *priv_data) struct wterl_priv_data *priv = (struct wterl_priv_data *)priv_data; WterlConnHandle *conn_handle; - enif_mutex_lock(priv->conns_mutex); + if (priv_data == NULL) + return; - /* Lock the cache mutex before unloading the async_nif to prevent new - work from coming in while shutting down. 
*/ + enif_mutex_lock(priv->conns_mutex); + ASYNC_NIF_UNLOAD(wterl, env, priv->async_nif_priv); SLIST_FOREACH(conn_handle, &priv->conns, conns) { enif_mutex_lock(conn_handle->cache_mutex); - } - - ASYNC_NIF_UNLOAD(wterl, env, priv->async_nif_priv); - - SLIST_FOREACH(conn_handle, &priv->conns, conns) { __close_all_sessions(conn_handle); conn_handle->conn->close(conn_handle->conn, NULL); - if (conn_handle->session_config) + if (conn_handle->session_config != NULL) { enif_free((void*)conn_handle->session_config); + } enif_mutex_unlock(conn_handle->cache_mutex); enif_mutex_destroy(conn_handle->cache_mutex); } @@ -2369,7 +2370,10 @@ on_unload(ErlNifEnv *env, void *priv_data) enif_mutex_unlock(priv->conns_mutex); enif_mutex_destroy(priv->conns_mutex); + memset(priv, 0, sizeof(struct wterl_priv_data)); enif_free(priv); + + priv_data = NULL; } static int From 4460434db13319fcb166ceb2b21cdadd714f3a13 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Wed, 12 Jun 2013 08:09:51 -0400 Subject: [PATCH 13/30] WIP: remove potential for infinite loops with CAS and fix a few issues in async --- c_src/async_nif.h | 16 ++++++++-------- c_src/common.h | 2 +- c_src/wterl.c | 33 ++++++++++++++++++++++++--------- 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/c_src/async_nif.h b/c_src/async_nif.h index f45e9cc..6fde4bb 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -34,9 +34,9 @@ extern "C" { #define UNUSED(v) ((void)(v)) #endif -#define ASYNC_NIF_MAX_WORKERS 512 +#define ASYNC_NIF_MAX_WORKERS 1024 #define ASYNC_NIF_WORKER_QUEUE_SIZE 500 -#define ASYNC_NIF_MAX_QUEUED_REQS 1000 * ASYNC_NIF_MAX_WORKERS +#define ASYNC_NIF_MAX_QUEUED_REQS ASYNC_NIF_WORKER_QUEUE_SIZE * ASYNC_NIF_MAX_WORKERS STAT_DECL(qwait, 1000); @@ -104,12 +104,12 @@ struct async_nif_state { return enif_make_tuple2(env, enif_make_atom(env, "error"), \ enif_make_atom(env, "shutdown")); \ req = async_nif_reuse_req(async_nif); \ - new_env = req->env; \ if (!req) { \ async_nif_recycle_req(req, async_nif); \ return enif_make_tuple2(env, enif_make_atom(env, "error"), \ enif_make_atom(env, "eagain")); \ } \ + new_env = req->env; \ do pre_block while(0); \ copy_of_args = (struct decl ## _args *)enif_alloc(sizeof(struct decl ## _args)); \ if (!copy_of_args) { \ @@ -267,16 +267,16 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en enif_mutex_unlock(q->reqs_mutex); return 0; } - if (fifo_q_size(reqs, q->reqs) > async_nif->num_queues) { + if (!fifo_q_full(reqs, q->reqs)) { double await = STAT_MEAN_LOG2_SAMPLE(async_nif, qwait); double await_inthisq = STAT_MEAN_LOG2_SAMPLE(q, qwait); - if (fifo_q_full(reqs, q->reqs) || await_inthisq > await) { + if (await_inthisq > await) { enif_mutex_unlock(q->reqs_mutex); qid = (qid + 1) % async_nif->num_queues; q = &async_nif->queues[qid]; + } else { + break; } - } else { - break; } // TODO: at some point add in work sheading/stealing } while(n-- > 0); @@ -467,7 +467,7 @@ async_nif_load() sizeof(struct async_nif_work_queue) * num_queues); async_nif->num_queues = num_queues; - async_nif->num_workers = 2 * num_queues; + async_nif->num_workers = ASYNC_NIF_MAX_WORKERS; async_nif->next_q = 0; async_nif->shutdown = 0; async_nif->recycled_reqs = fifo_q_new(reqs, ASYNC_NIF_MAX_QUEUED_REQS); diff --git a/c_src/common.h b/c_src/common.h index 3364573..b8324da 100644 --- a/c_src/common.h +++ b/c_src/common.h @@ -33,7 +33,7 @@ extern "C" { #include #define DPRINTF(fmt, ...) 
\ do { \ - fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__); \ + fprintf(stderr, "%s:%d (%s) " fmt "\n", __FILE__, __LINE__, __func__, __VA_ARGS__); \ fflush(stderr); \ } while(0) #define DPUTS(arg) DPRINTF("%s", arg) diff --git a/c_src/wterl.c b/c_src/wterl.c index 4c1b02c..9bc16f1 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -336,7 +336,7 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx **ctx, int count, const char *session_config, ...) { - int i = 0; + int i = 3; va_list ap; uint64_t sig; const char *arg; @@ -347,33 +347,37 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, sig = __ctx_cache_sig(session_config, ap, count); va_end(ap); + *ctx = NULL; do { c = conn_handle->mru_ctx[worker_id]; if (CASPO(&conn_handle->mru_ctx[worker_id], c, NULL) == c) { if (c == NULL) { // mru miss: + DPRINTF("[%.4u] mru miss: %llu != NULL", worker_id, PRIuint64(sig)); *ctx = NULL; } else { if (c->sig == sig) { // mru hit: + DPRINTF("[%.4u] mru hit: %llu", worker_id, PRIuint64(sig)); *ctx = c; + break; } else { // mru missmatch: + DPRINTF("[%.4u] mru missmatch: %llu != %llu", worker_id, PRIuint64(sig), PRIuint64(c->sig)); __ctx_cache_add(conn_handle, c); *ctx = NULL; } } - } else { - // CAS failed, retry... - continue; } - } while(0); + // CAS failed, retry up to 3 times + } while(i--); if (*ctx == NULL) { // check the cache (*ctx) = __ctx_cache_find(conn_handle, sig); if ((*ctx) == NULL) { // cache miss: + DPRINTF("[%.4u] cache miss: %llu [%d]", worker_id, PRIuint64(sig), conn_handle->cache_size); WT_CONNECTION *conn = conn_handle->conn; WT_SESSION *session = NULL; int rc = conn->open_session(conn, NULL, session_config, &session); @@ -402,7 +406,10 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, } } va_end(ap); - } // else { cache hit } + } else { + // cache hit: + DPRINTF("[%.4u] cache hit: %llu [%d]", worker_id, PRIuint64(sig), conn_handle->cache_size); + } } return 0; } @@ -423,9 +430,14 @@ __release_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx cursor->reset(cursor); } - do { - c = conn_handle->mru_ctx[worker_id]; - } while(CASPO(&conn_handle->mru_ctx[worker_id], c, ctx) != c); + c = conn_handle->mru_ctx[worker_id]; + if (CASPO(&conn_handle->mru_ctx[worker_id], c, ctx) != c) { + __ctx_cache_add(conn_handle, ctx); + } else { + if (c != NULL) { + __ctx_cache_add(conn_handle, c); + } + } } /** @@ -1462,15 +1474,18 @@ ASYNC_NIF_DECL( cursor->set_key(cursor, &item_key); rc = cursor->search(cursor); if (rc != 0) { + __release_ctx(args->conn_handle, worker_id, ctx); ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } rc = cursor->get_value(cursor, &item_value); if (rc != 0) { + __release_ctx(args->conn_handle, worker_id, ctx); ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } + ERL_NIF_TERM value; unsigned char *bin = enif_make_new_binary(env, item_value.size, &value); memcpy(bin, item_value.data, item_value.size); From 79523587817866b3cc6620567602121ebc6da66e Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Wed, 12 Jun 2013 09:08:09 -0400 Subject: [PATCH 14/30] WIP: cache wasn't returning items found --- c_src/async_nif.h | 2 +- c_src/common.h | 1 + c_src/wterl.c | 16 ++++++++++++---- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/c_src/async_nif.h b/c_src/async_nif.h index 6fde4bb..e34748e 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -34,7 +34,7 @@ extern "C" { #define UNUSED(v) ((void)(v)) #endif -#define ASYNC_NIF_MAX_WORKERS 1024 +#define 
ASYNC_NIF_MAX_WORKERS 128 #define ASYNC_NIF_WORKER_QUEUE_SIZE 500 #define ASYNC_NIF_MAX_QUEUED_REQS ASYNC_NIF_WORKER_QUEUE_SIZE * ASYNC_NIF_MAX_WORKERS diff --git a/c_src/common.h b/c_src/common.h index b8324da..eba4ffc 100644 --- a/c_src/common.h +++ b/c_src/common.h @@ -24,6 +24,7 @@ extern "C" { #endif +#define DEBUG 1 #if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__)) # undef DEBUG # define DEBUG 0 diff --git a/c_src/wterl.c b/c_src/wterl.c index 9bc16f1..eda611a 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -223,8 +223,10 @@ __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig) conn_handle->histogram[__log2(cpu_clock_ticks() - c->tstamp)]++; conn_handle->histogram_count++; conn_handle->cache_size -= 1; - } - c = n; + break; + } else { + c = n; + } } enif_mutex_unlock(conn_handle->cache_mutex); return c; @@ -422,7 +424,7 @@ __release_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx { int i, n; WT_CURSOR *cursor; - struct wterl_ctx *c; + struct wterl_ctx *c = NULL; n = sizeof((WT_CURSOR**)ctx->cursors) / sizeof(ctx->cursors[0]); for (i = 0; i < n; i++) { @@ -702,7 +704,7 @@ ASYNC_NIF_DECL( conn_handle->conn = conn; ERL_NIF_TERM result = enif_make_resource(env, conn_handle); - /* Init hash table which manages the cache of session/cursor(s) */ + /* Init list for cache of reuseable contexts */ STAILQ_INIT(&conn_handle->cache); conn_handle->cache_size = 0; @@ -749,6 +751,12 @@ ASYNC_NIF_DECL( }, { // work + /* First, remove this connection from our list of open connections so + we don't free it twice when asked to unload. */ + enif_mutex_lock(args->priv->conns_mutex); + SLIST_REMOVE(&args->priv->conns, args->conn_handle, wterl_conn, conns); + enif_mutex_unlock(args->priv->conns_mutex); + /* Free up the shared sessions and cursors. */ enif_mutex_lock(args->conn_handle->cache_mutex); __close_all_sessions(args->conn_handle); From ff7d1d6e2076f8f34b8db71b8408f1d3f8834586 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Fri, 14 Jun 2013 10:52:45 -0400 Subject: [PATCH 15/30] WIP: further simplifying context cache --- c_src/async_nif.h | 10 ++- c_src/duration.h | 2 +- c_src/khash.h | 2 +- c_src/stats.h | 24 +++--- c_src/wterl.c | 204 +++++++++++++++++++++++++++------------------- 5 files changed, 140 insertions(+), 102 deletions(-) diff --git a/c_src/async_nif.h b/c_src/async_nif.h index e34748e..3236f4c 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -241,7 +241,6 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en /* Identify the most appropriate worker for this request. */ unsigned int qid = 0; struct async_nif_work_queue *q = NULL; - unsigned int n = async_nif->num_queues; /* Either we're choosing a queue based on some affinity/hinted value or we need to select the next queue in the rotation and atomically update that @@ -254,6 +253,9 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en async_nif->next_q = qid; } +#if 0 // stats aren't yet thread safe, so this can go haywire... TODO: fix. + unsigned int n = async_nif->num_queues; + /* Now we inspect and interate across the set of queues trying to select one that isn't too full or too slow. */ do { @@ -281,6 +283,8 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en // TODO: at some point add in work sheading/stealing } while(n-- > 0); +#endif + /* We hold the queue's lock, and we've seletect a reasonable queue for this new request so add the request. 
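 *
 * (Editor's gloss.) Publishing the request and signaling the queue's
 * condition variable before q->reqs_mutex is released means a sleeping
 * worker on this queue cannot miss the wakeup: it either finds the
 * request already in the fifo or is woken by the signal.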
*/ STAT_TICK(q, qwait); @@ -297,7 +301,9 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en } /** - * TODO: + * Worker threads execute this function. Here each worker pulls requests of + * their respective queues, executes that work and continues doing that until + * they see the shutdown flag is set at which point they exit. */ static void * async_nif_worker_fn(void *arg) diff --git a/c_src/duration.h b/c_src/duration.h index 6c05df0..1404f41 100644 --- a/c_src/duration.h +++ b/c_src/duration.h @@ -43,7 +43,7 @@ struct scale_time { }; static const struct scale_time scale[] = { { "ns", "nanosecond", 1000000000LL, 1LL, 10, 2300000000000LL }, - { "mcs", "microsecond", 1000000LL, 1000LL, 10, 2300000000LL }, + { "μs", "microsecond", 1000000LL, 1000LL, 10, 2300000000LL }, { "ms", "millisecond", 1000LL, 1000000LL, 10, 2300000LL }, { "sec", "second", 1LL, 1000000000LL, 10, 2300LL } }; diff --git a/c_src/khash.h b/c_src/khash.h index ab157b1..69549dc 100644 --- a/c_src/khash.h +++ b/c_src/khash.h @@ -586,7 +586,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#ifdef __x86_64__ +#ifdef __x86_64__ #define KHASH_MAP_INIT_PTR(name, khval_t) \ KHASH_INIT(name, void*, khval_t, 1, kh_ptr64_hash_func, kh_ptr64_hash_equal) #else diff --git a/c_src/stats.h b/c_src/stats.h index 12f5d21..35192ec 100644 --- a/c_src/stats.h +++ b/c_src/stats.h @@ -184,27 +184,27 @@ static unsigned int __log2_64(uint64_t x) { #define STAT_INIT(var, name) \ - var->name ## _stat.min = ~0; \ - var->name ## _stat.max = 0; \ - var->name ## _stat.mean = 0.0; \ - var->name ## _stat.h = 0; \ - var->name ## _stat.d.then = 0; \ - var->name ## _stat.d.unit = ns; + (var)->name ## _stat.min = ~0; \ + (var)->name ## _stat.max = 0; \ + (var)->name ## _stat.mean = 0.0; \ + (var)->name ## _stat.h = 0; \ + (var)->name ## _stat.d.then = 0; \ + (var)->name ## _stat.d.unit = ns; -#define STAT_TICK(var, name) name ## _stat_tick(&var->name ## _stat) +#define STAT_TICK(var, name) name ## _stat_tick(&(var)->name ## _stat) -#define STAT_TOCK(var, name) name ## _stat_tock(&var->name ## _stat) +#define STAT_TOCK(var, name) name ## _stat_tock(&(var)->name ## _stat) -#define STAT_RESET(var, name) name ## _stat_reset(&var->name ## _stat) +#define STAT_RESET(var, name) name ## _stat_reset(&(var)->name ## _stat) #define STAT_MEAN_LOG2_SAMPLE(var, name) \ - name ## _stat_mean_lg2(&var->name ## _stat) + name ## _stat_mean_lg2(&(var)->name ## _stat) #define STAT_MEAN_SAMPLE(var, name) \ - name ## _stat_mean(&var->name ## _stat) + name ## _stat_mean(&(var)->name ## _stat) #define STAT_PRINT(var, name, mod) \ - name ## _stat_print_histogram(&var->name ## _stat, mod) + name ## _stat_print_histogram(&(var)->name ## _stat, mod) #if defined(__cplusplus) diff --git a/c_src/wterl.c b/c_src/wterl.c index eda611a..d1790b4 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -42,10 +42,16 @@ typedef char Uri[128]; struct wterl_ctx { STAILQ_ENTRY(wterl_ctx) entries; - uint64_t sig; uint64_t tstamp; + uint64_t sig; + size_t sig_len; WT_SESSION *session; - WT_CURSOR *cursors[]; // Note: must be last in struct + const char *session_config; + struct cursor_info { + const char *uri; + const char *config; + WT_CURSOR *cursor; + } ci[]; // Note: must be last in struct }; typedef struct wterl_conn { @@ -110,13 +116,35 @@ ASYNC_NIF_INIT(wterl); * -> an integer hash encoding of the bytes */ static inline uint32_t -__str_hash(const char *s) +__str_hash(uint32_t 
in, const char *p, size_t len) { - unsigned int h = (unsigned int)*s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + (unsigned int)*s; + uint32_t h = in; + for (++p ; len > 0; ++p, --len) + h = (h << 5) - h + (uint32_t)*p; return h; } +#if defined(__amd64) || defined(__x86_64) +/* Note: we'll use this to lower the chances that we'll have a hash + collision until I can finish a nice trie and use that to be a bit + more precise. When that's done we can skip hash/crc32 and just + use the binary position in the trie as our "signature". */ +static inline uint32_t +__crc32(uint32_t crc, const char *bytes, size_t len) +{ + const uint8_t *p; + for (p = (const uint8_t*)bytes; len > 0; ++p, --len) { + __asm__ __volatile__( + ".byte 0xF2, 0x0F, 0x38, 0xF0, 0xF1" + : "=S" (crc) + : "0" (crc), "c" (*p)); + } + return crc; +} +#else +#error unsupported platform +#endif + /** * Calculate the log2 of 64bit unsigned integers. */ @@ -189,7 +217,7 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) if (log > mean) { STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries); c->session->close(c->session, NULL); - memset(c, 0, sizeof(struct wterl_ctx)); + memset(c, 0, sizeof(*c)); enif_free(c); num_evicted++; } @@ -205,8 +233,8 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) * See if there exists an item in the cache with a matching signature, if * so remove it from the cache and return it for use by the callee. * - * sig a 64-bit signature (hash) representing the session/cursor* needed - * for the operation + * sig a 64-bit signature (hash) representing the combination of Uri and + * session+config/cursor+config pairs needed for this operation */ static struct wterl_ctx * __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig) @@ -217,7 +245,7 @@ __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig) c = STAILQ_FIRST(&conn_handle->cache); while (c != NULL) { n = STAILQ_NEXT(c, entries); - if (c->sig == sig) { + if (c->sig == sig) { // TODO: hash collisions *will* lead to SEGVs // cache hit: STAILQ_REMOVE_HEAD(&conn_handle->cache, entries); conn_handle->histogram[__log2(cpu_clock_ticks() - c->tstamp)]++; @@ -249,60 +277,13 @@ __ctx_cache_add(WterlConnHandle *conn_handle, struct wterl_ctx *c) enif_mutex_unlock(conn_handle->cache_mutex); } -/** - * Produce the "Z-Index" or "Morton Number" from 2 32-bit unsigned integers. - * e.g. p = 0101 1011 0100 0011 - * q = 1011 1100 0001 0011 - * z = 0110 0111 1101 1010 0010 0001 0000 1111 - */ -static inline uint64_t -__zi(uint32_t p, uint32_t q) -{ - static const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; - static const uint32_t S[] = {1, 2, 4, 8}; - uint32_t x, y; - uint64_t z; - - x = p & 0x0000FFFF; // Interleave lower 16 bits of p as x and q as y, so the - y = q & 0x0000FFFF; // bits of x are in the even positions and bits from y - z = 0; // in the odd; the first 32 bits of 'z' is the result. - - x = (x | (x << S[3])) & B[3]; - x = (x | (x << S[2])) & B[2]; - x = (x | (x << S[1])) & B[1]; - x = (x | (x << S[0])) & B[0]; - - y = (y | (y << S[3])) & B[3]; - y = (y | (y << S[2])) & B[2]; - y = (y | (y << S[1])) & B[1]; - y = (y | (y << S[0])) & B[0]; - - z = x | (y << 1); - - x = (p >> 16) & 0x0000FFFF; // Interleave the upper 16 bits of p as x and q as y - y = (q >> 16) & 0x0000FFFF; // just as before. 
- - x = (x | (x << S[3])) & B[3]; - x = (x | (x << S[2])) & B[2]; - x = (x | (x << S[1])) & B[1]; - x = (x | (x << S[0])) & B[0]; - - y = (y | (y << S[3])) & B[3]; - y = (y | (y << S[2])) & B[2]; - y = (y | (y << S[1])) & B[1]; - y = (y | (y << S[0])) & B[0]; - - z = (z << 16) | (x | (y << 1)); // the resulting 64-bit Morton Number. - - return z; -} - /** * Create a signature for the operation we're about to perform. * - * Create a 64bit signature for this a combination of session configuration - * some number of cursors open on tables each potentially with a different - * configuration. "session_config, [{table_name, cursor_config}, ...]" + * Create a 64-bit hash signature for this a combination of session + * configuration some number of cursors open on tables each potentially with a + * different configuration. "session_config, [{table_name, cursor_config}, + * ...]" * * session_config the string used to configure the WT_SESSION * ... each pair of items in the varargs array is a table name, @@ -310,23 +291,35 @@ __zi(uint32_t p, uint32_t q) * -> number of variable arguments processed */ static uint64_t -__ctx_cache_sig(const char *c, va_list ap, int count) +__ctx_cache_sig(const char *c, va_list ap, int count, size_t *len) { int i = 0; - uint64_t h; + uint32_t hash = 0; + uint32_t crc = 0; + uint64_t sig = 0; const char *arg; + size_t l = 0; - if (c) - h = __str_hash(c); - else - h = 0; + if (c) { + l = strlen(c); + hash = __str_hash(hash, c, l); + crc = __crc32(crc, c, l); + *len += l + 1; + } for (i = 0; i < (2 * count); i++) { arg = va_arg(ap, const char *); - if (arg) h = __zi((uint32_t)(h & 0xFFFFFFFF), __str_hash(arg)); - else h = __zi((uint32_t)(h & 0xFFFFFFFF), 0); + if (arg) { + l = strlen(c); + hash = __str_hash(hash, arg, l); + crc = __crc32(crc, arg, strlen(arg)); + *len += l + 1; + } } - return h; + sig = crc; + sig = sig << 32; + sig &= hash; + return sig; } /** @@ -339,6 +332,7 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, int count, const char *session_config, ...) 
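/* (Editor's note, not part of the original patch.) In the new
 * __ctx_cache_sig() above, the sequence sig = crc; sig = sig << 32;
 * sig &= hash; is suspect: after the shift the low 32 bits of sig are
 * zero and hash has no high bits set, so the AND yields 0 for every
 * input and all signatures collide. Combining the two halves was
 * presumably intended:
 *
 *     sig = ((uint64_t)crc << 32) | (uint64_t)hash;
 */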
{ int i = 3; + size_t sig_len = 0; va_list ap; uint64_t sig; const char *arg; @@ -346,7 +340,7 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, arg = session_config; va_start(ap, session_config); - sig = __ctx_cache_sig(session_config, ap, count); + sig = __ctx_cache_sig(session_config, ap, count, &sig_len); va_end(ap); *ctx = NULL; @@ -364,8 +358,8 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, *ctx = c; break; } else { - // mru missmatch: - DPRINTF("[%.4u] mru missmatch: %llu != %llu", worker_id, PRIuint64(sig), PRIuint64(c->sig)); + // mru mismatch: + DPRINTF("[%.4u] mru mismatch: %llu != %llu", worker_id, PRIuint64(sig), PRIuint64(c->sig)); __ctx_cache_add(conn_handle, c); *ctx = NULL; } @@ -386,7 +380,7 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, if (rc != 0) { return rc; } - size_t s = sizeof(struct wterl_ctx) + (count * sizeof(WT_CURSOR*)); + size_t s = sizeof(struct wterl_ctx) + (count * sizeof(struct cursor_info)) + sig_len; *ctx = enif_alloc(s); // TODO: enif_alloc_resource() if (*ctx == NULL) { session->close(session, NULL); @@ -395,14 +389,23 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, memset(*ctx, 0, s); (*ctx)->sig = sig; (*ctx)->session = session; + (*ctx)->sig_len = sig_len; + char *p = (char *)(*ctx) + (s - sig_len); + (*ctx)->session_config = p; + memcpy(p, session_config, strlen(session_config)); p++; session_config = arg; va_start(ap, session_config); for (i = 0; i < count; i++) { const char *uri = va_arg(ap, const char *); const char *config = va_arg(ap, const char *); - // TODO: error when uri or config is NULL - rc = session->open_cursor(session, uri, NULL, config, &(*ctx)->cursors[i]); + // TODO: what to do (if anything) when uri or config is NULL? + (*ctx)->ci[i].uri = p; + memcpy(p, uri, strlen(uri)); p++; + (*ctx)->ci[i].config = p; + memcpy(p, config, strlen(config)); p++; + rc = session->open_cursor(session, uri, NULL, config, &(*ctx)->ci[i].cursor); if (rc != 0) { + enif_free(*ctx); session->close(session, NULL); // this will free the cursors too return rc; } @@ -426,9 +429,9 @@ __release_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx WT_CURSOR *cursor; struct wterl_ctx *c = NULL; - n = sizeof((WT_CURSOR**)ctx->cursors) / sizeof(ctx->cursors[0]); + n = sizeof((WT_CURSOR**)ctx->ci) / sizeof(ctx->ci[0]); for (i = 0; i < n; i++) { - cursor = ctx->cursors[i]; + cursor = ctx->ci[i].cursor; cursor->reset(cursor); } @@ -461,7 +464,7 @@ __close_all_sessions(WterlConnHandle *conn_handle) if (c != NULL) { c->session->close(c->session, NULL); - memset(c, 0, sizeof(struct wterl_ctx)); + memset(c, 0, sizeof(*c)); enif_free(c); } } @@ -487,8 +490,37 @@ __close_all_sessions(WterlConnHandle *conn_handle) void __close_cursors_on(WterlConnHandle *conn_handle, const char *uri) { - UNUSED(uri); - __close_all_sessions(conn_handle); + struct wterl_ctx *c, *n; + int worker_id, cnt; + + // TODO: improve this... but for now it's easiest to just toss everything + // from the mru into the cache as a first step. 
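+ // (Editor's gloss, not part of the original patch.) Each mru_ctx
+ // slot is drained with the same CASPO claim used on the fast path:
+ // swap NULL in, and only the thread whose CAS returns the expected
+ // old value owns that context, so a racing worker can never be
+ // handed a context that is about to be closed.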
+ for (worker_id = 0; worker_id < ASYNC_NIF_MAX_WORKERS; worker_id++) { + do { + c = conn_handle->mru_ctx[worker_id]; + } while(CASPO(&conn_handle->mru_ctx[worker_id], c, NULL) != c); + + if (c != NULL) + __ctx_cache_add(conn_handle, c); + } + + // walk the cache, look for open cursors on matching uri + c = STAILQ_FIRST(&conn_handle->cache); + while (c != NULL) { + n = STAILQ_NEXT(c, entries); + cnt = sizeof((WT_CURSOR**)c->ci) / sizeof(c->ci[0]); + for(;cnt > 0; cnt--) { + if (!strcmp(c->ci[cnt].uri, uri)) { + STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries); + conn_handle->cache_size -= 1; + c->session->close(c->session, NULL); + memset(c, 0, sizeof(*c)); + enif_free(c); + break; + } + } + c = n; + } return; } @@ -1395,7 +1427,7 @@ ASYNC_NIF_DECL( } args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); enif_keep_resource((void*)args->conn_handle); - affinity = __str_hash(args->uri); + affinity = __str_hash(0, args->uri, strlen(args->uri)); }, { // work @@ -1414,7 +1446,7 @@ ASYNC_NIF_DECL( ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } - cursor = ctx->cursors[0]; + cursor = ctx->ci[0].cursor; WT_ITEM item_key; item_key.data = key.data; @@ -1454,7 +1486,7 @@ ASYNC_NIF_DECL( } args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); enif_keep_resource((void*)args->conn_handle); - affinity = __str_hash(args->uri); + affinity = __str_hash(0, args->uri, strlen(args->uri)); }, { // work @@ -1473,7 +1505,7 @@ ASYNC_NIF_DECL( ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } - cursor = ctx->cursors[0]; + cursor = ctx->ci[0].cursor; WT_ITEM item_key; WT_ITEM item_value; @@ -1534,7 +1566,7 @@ ASYNC_NIF_DECL( args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); args->value = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[3]); enif_keep_resource((void*)args->conn_handle); - affinity = __str_hash(args->uri); + affinity = __str_hash(0, args->uri, strlen(args->uri)); }, { // work @@ -1558,7 +1590,7 @@ ASYNC_NIF_DECL( ASYNC_NIF_REPLY(__strerror_term(env, rc)); return; } - cursor = ctx->cursors[0]; + cursor = ctx->ci[0].cursor; WT_ITEM item_key; WT_ITEM item_value; From 53307e8c018cd5be24c4983d40bd9b7aefb6443f Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Fri, 14 Jun 2013 16:57:53 -0400 Subject: [PATCH 16/30] A great deal of cleanup. EUnit and EQC tests pass. --- c_src/async_nif.h | 25 +++--- c_src/common.h | 1 - c_src/wterl.c | 190 +++++++++++++++++++++++++--------------------- src/wterl.erl | 2 +- 4 files changed, 118 insertions(+), 100 deletions(-) diff --git a/c_src/async_nif.h b/c_src/async_nif.h index 3236f4c..78c962d 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -34,7 +34,7 @@ extern "C" { #define UNUSED(v) ((void)(v)) #endif -#define ASYNC_NIF_MAX_WORKERS 128 +#define ASYNC_NIF_MAX_WORKERS 8 #define ASYNC_NIF_WORKER_QUEUE_SIZE 500 #define ASYNC_NIF_MAX_QUEUED_REQS ASYNC_NIF_WORKER_QUEUE_SIZE * ASYNC_NIF_MAX_WORKERS @@ -253,6 +253,9 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en async_nif->next_q = qid; } + q = &async_nif->queues[qid]; + enif_mutex_lock(q->reqs_mutex); + #if 0 // stats aren't yet thread safe, so this can go haywire... TODO: fix. 
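// (Editor's note.) The block disabled below is the queue-balancing
// step introduced in patch 15: it compared this queue's mean wait
// against the global mean and advanced round-robin to the next queue
// when the local queue was full or slower. It stays under #if 0 until
// the STAT_* counters can be read safely from concurrent threads.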
unsigned int n = async_nif->num_queues; @@ -277,12 +280,12 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en qid = (qid + 1) % async_nif->num_queues; q = &async_nif->queues[qid]; } else { + // q->reqs_mutex unlocked at end of function break; } } // TODO: at some point add in work sheading/stealing } while(n-- > 0); - #endif /* We hold the queue's lock, and we've seletect a reasonable queue for this @@ -400,15 +403,6 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) enif_thread_join(async_nif->worker_entries[i].tid, &exit_value); } - /* Free any req structures sitting unused on the recycle queue. */ - enif_mutex_lock(async_nif->recycled_req_mutex); - req = NULL; - fifo_q_foreach(reqs, async_nif->recycled_reqs, req, { - enif_free_env(req->env); - enif_free(req); - }); - fifo_q_free(reqs, async_nif->recycled_reqs); - /* Cleanup in-flight requests, mutexes and conditions in each work queue. */ for (i = 0; i < num_queues; i++) { q = &async_nif->queues[i]; @@ -430,6 +424,15 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) enif_cond_destroy(q->reqs_cnd); } + /* Free any req structures sitting unused on the recycle queue. */ + enif_mutex_lock(async_nif->recycled_req_mutex); + req = NULL; + fifo_q_foreach(reqs, async_nif->recycled_reqs, req, { + enif_free_env(req->env); + enif_free(req); + }); + fifo_q_free(reqs, async_nif->recycled_reqs); + enif_mutex_unlock(async_nif->recycled_req_mutex); enif_mutex_destroy(async_nif->recycled_req_mutex); memset(async_nif, 0, sizeof(struct async_nif_state) + (sizeof(struct async_nif_work_queue) * async_nif->num_queues)); diff --git a/c_src/common.h b/c_src/common.h index eba4ffc..b8324da 100644 --- a/c_src/common.h +++ b/c_src/common.h @@ -24,7 +24,6 @@ extern "C" { #endif -#define DEBUG 1 #if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__)) # undef DEBUG # define DEBUG 0 diff --git a/c_src/wterl.c b/c_src/wterl.c index d1790b4..cfb5b42 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -47,6 +47,7 @@ struct wterl_ctx { size_t sig_len; WT_SESSION *session; const char *session_config; + uint32_t num_cursors; struct cursor_info { const char *uri; const char *config; @@ -61,7 +62,6 @@ typedef struct wterl_conn { ErlNifMutex *cache_mutex; uint32_t cache_size; struct wterl_ctx *mru_ctx[ASYNC_NIF_MAX_WORKERS]; - SLIST_ENTRY(wterl_conn) conns; uint64_t histogram[64]; uint64_t histogram_count; } WterlConnHandle; @@ -84,8 +84,6 @@ struct wterl_event_handlers { struct wterl_priv_data { void *async_nif_priv; // Note: must be first element in struct - ErlNifMutex *conns_mutex; - SLIST_HEAD(conns, wterl_conn) conns; struct wterl_event_handlers eh; char wterl_vsn[512]; char wiredtiger_vsn[512]; @@ -106,6 +104,15 @@ static ERL_NIF_TERM ATOM_MSG_PID; /* Global init for async_nif. */ ASYNC_NIF_INIT(wterl); +static inline size_t +__strlen(const char *s) +{ + if (s) + return strlen(s); + else + return 0; +} + /** * A string hash function. 
* @@ -217,7 +224,6 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) if (log > mean) { STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries); c->session->close(c->session, NULL); - memset(c, 0, sizeof(*c)); enif_free(c); num_evicted++; } @@ -251,10 +257,10 @@ __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig) conn_handle->histogram[__log2(cpu_clock_ticks() - c->tstamp)]++; conn_handle->histogram_count++; conn_handle->cache_size -= 1; - break; + break; } else { - c = n; - } + c = n; + } } enif_mutex_unlock(conn_handle->cache_mutex); return c; @@ -300,28 +306,46 @@ __ctx_cache_sig(const char *c, va_list ap, int count, size_t *len) const char *arg; size_t l = 0; + *len = 0; + if (c) { - l = strlen(c); + l = __strlen(c); hash = __str_hash(hash, c, l); crc = __crc32(crc, c, l); *len += l + 1; + } else { + *len += 1; } for (i = 0; i < (2 * count); i++) { arg = va_arg(ap, const char *); if (arg) { - l = strlen(c); + l = __strlen(arg); hash = __str_hash(hash, arg, l); - crc = __crc32(crc, arg, strlen(arg)); + crc = __crc32(crc, arg, __strlen(arg)); *len += l + 1; + } else { + *len += 1; } } + sig = crc; sig = sig << 32; sig &= hash; return sig; } +static inline char * +__copy_str_into(char **p, const char *s) +{ + char *a = *p; + size_t len = __strlen(s); + memcpy(*p, s, len); + (*p)[len] = '\0'; + *p += len + 1; + return a; +} + /** * Get a reusable cursor that was opened for a particular worker within its * session. @@ -391,18 +415,16 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, (*ctx)->session = session; (*ctx)->sig_len = sig_len; char *p = (char *)(*ctx) + (s - sig_len); - (*ctx)->session_config = p; - memcpy(p, session_config, strlen(session_config)); p++; + (*ctx)->session_config = __copy_str_into(&p, session_config); + (*ctx)->num_cursors = count; session_config = arg; va_start(ap, session_config); for (i = 0; i < count; i++) { const char *uri = va_arg(ap, const char *); const char *config = va_arg(ap, const char *); // TODO: what to do (if anything) when uri or config is NULL? 
-        (*ctx)->ci[i].uri = p;
-        memcpy(p, uri, strlen(uri)); p++;
-        (*ctx)->ci[i].config = p;
-        memcpy(p, config, strlen(config)); p++;
+        (*ctx)->ci[i].uri = __copy_str_into(&p, uri);
+        (*ctx)->ci[i].config = __copy_str_into(&p, config);
         rc = session->open_cursor(session, uri, NULL, config, &(*ctx)->ci[i].cursor);
         if (rc != 0) {
           enif_free(*ctx);
@@ -425,12 +447,11 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id,
 static void
 __release_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx *ctx)
 {
-    int i, n;
+    uint32_t i;
     WT_CURSOR *cursor;
     struct wterl_ctx *c = NULL;

-    n = sizeof((WT_CURSOR**)ctx->ci) / sizeof(ctx->ci[0]);
-    for (i = 0; i < n; i++) {
+    for (i = 0; i < ctx->num_cursors; i++) {
         cursor = ctx->ci[i].cursor;
         cursor->reset(cursor);
     }
@@ -438,9 +459,13 @@ __release_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx
     c = conn_handle->mru_ctx[worker_id];
     if (CASPO(&conn_handle->mru_ctx[worker_id], c, ctx) != c) {
         __ctx_cache_add(conn_handle, ctx);
+        DPRINTF("[%.4u] reset %d cursors, returned ctx to cache", worker_id, ctx->num_cursors);
     } else {
         if (c != NULL) {
             __ctx_cache_add(conn_handle, c);
+            DPRINTF("[%.4u] reset %d cursors, returned ctx to cache", worker_id, ctx->num_cursors);
+        } else {
+            DPRINTF("[%.4u] reset %d cursors, returned ctx to mru", worker_id, ctx->num_cursors);
         }
     }
 }
@@ -464,7 +489,6 @@ __close_all_sessions(WterlConnHandle *conn_handle)

         if (c != NULL) {
             c->session->close(c->session, NULL);
-            memset(c, 0, sizeof(*c));
             enif_free(c);
         }
     }
@@ -476,7 +500,6 @@ __close_all_sessions(WterlConnHandle *conn_handle)
         STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries);
         conn_handle->cache_size -= 1;
         c->session->close(c->session, NULL);
-        memset(c, 0, sizeof(struct wterl_ctx));
         enif_free(c);
         c = n;
     }
@@ -491,30 +514,37 @@ void
 __close_cursors_on(WterlConnHandle *conn_handle, const char *uri)
 {
     struct wterl_ctx *c, *n;
-    int worker_id, cnt;
+    int worker_id, idx, cnt;

-    // TODO: improve this... but for now it's easiest to just toss everything
-    // from the mru into the cache as a first step.
+ // walk the mru first, look for open cursors on matching uri for (worker_id = 0; worker_id < ASYNC_NIF_MAX_WORKERS; worker_id++) { - do { - c = conn_handle->mru_ctx[worker_id]; - } while(CASPO(&conn_handle->mru_ctx[worker_id], c, NULL) != c); - - if (c != NULL) - __ctx_cache_add(conn_handle, c); + c = conn_handle->mru_ctx[worker_id]; + if (CASPO(&conn_handle->mru_ctx[worker_id], c, NULL) == c && c != NULL) { + cnt = c->num_cursors; + for(idx = 0; idx < cnt; idx++) { + if (!strcmp(c->ci[idx].uri, uri)) { + c->session->close(c->session, NULL); + enif_free(c); + break; + } else { + if (CASPO(&conn_handle->mru_ctx[worker_id], NULL, c) != NULL) { + __ctx_cache_add(conn_handle, c); + } + } + } + } } - // walk the cache, look for open cursors on matching uri + // next we walk the cache, look for open cursors on matching uri c = STAILQ_FIRST(&conn_handle->cache); while (c != NULL) { n = STAILQ_NEXT(c, entries); - cnt = sizeof((WT_CURSOR**)c->ci) / sizeof(c->ci[0]); - for(;cnt > 0; cnt--) { - if (!strcmp(c->ci[cnt].uri, uri)) { + cnt = c->num_cursors; + for(idx = 0; idx < cnt; idx++) { + if (!strcmp(c->ci[idx].uri, uri)) { STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries); conn_handle->cache_size -= 1; c->session->close(c->session, NULL); - memset(c, 0, sizeof(*c)); enif_free(c); break; } @@ -683,7 +713,7 @@ ASYNC_NIF_DECL( { // pre if (!(argc == 3 && - (enif_get_string(env, argv[0], args->homedir, sizeof args->homedir, ERL_NIF_LATIN1) > 0) && + (enif_get_string(env, argv[0], args->homedir, sizeof(args->homedir), ERL_NIF_LATIN1) > 0) && enif_is_binary(env, argv[1]) && enif_is_binary(env, argv[2]))) { ASYNC_NIF_RETURN_BADARG(); @@ -740,12 +770,6 @@ ASYNC_NIF_DECL( STAILQ_INIT(&conn_handle->cache); conn_handle->cache_size = 0; - /* Keep track of open connections so as to free when unload/reload/etc. - are called. */ - enif_mutex_lock(args->priv->conns_mutex); - SLIST_INSERT_HEAD(&args->priv->conns, conn_handle, conns); - enif_mutex_unlock(args->priv->conns_mutex); - enif_release_resource(conn_handle); enif_mutex_unlock(conn_handle->cache_mutex); ASYNC_NIF_REPLY(enif_make_tuple2(env, ATOM_OK, result)); @@ -783,12 +807,6 @@ ASYNC_NIF_DECL( }, { // work - /* First, remove this connection from our list of open connections so - we don't free it twice when asked to unload. */ - enif_mutex_lock(args->priv->conns_mutex); - SLIST_REMOVE(&args->priv->conns, args->conn_handle, wterl_conn, conns); - enif_mutex_unlock(args->priv->conns_mutex); - /* Free up the shared sessions and cursors. */ enif_mutex_lock(args->conn_handle->cache_mutex); __close_all_sessions(args->conn_handle); @@ -798,11 +816,6 @@ ASYNC_NIF_DECL( } WT_CONNECTION* conn = args->conn_handle->conn; int rc = conn->close(conn, NULL); - - /* Connection is closed, remove it so we don't free on unload/reload/etc. 
*/ - enif_mutex_lock(args->priv->conns_mutex); - SLIST_REMOVE(&args->priv->conns, args->conn_handle, wterl_conn, conns); - enif_mutex_unlock(args->priv->conns_mutex); enif_mutex_unlock(args->conn_handle->cache_mutex); enif_mutex_destroy(args->conn_handle->cache_mutex); memset(args->conn_handle, 0, sizeof(WterlConnHandle)); @@ -836,7 +849,7 @@ ASYNC_NIF_DECL( if (!(argc == 3 && enif_get_resource(env, argv[0], wterl_conn_RESOURCE, (void**)&args->conn_handle) && - (enif_get_string(env, argv[1], args->uri, sizeof args->uri, ERL_NIF_LATIN1) > 0) && + (enif_get_string(env, argv[1], args->uri, sizeof(args->uri), ERL_NIF_LATIN1) > 0) && enif_is_binary(env, argv[2]))) { ASYNC_NIF_RETURN_BADARG(); } @@ -890,7 +903,7 @@ ASYNC_NIF_DECL( if (!(argc == 3 && enif_get_resource(env, argv[0], wterl_conn_RESOURCE, (void**)&args->conn_handle) && - (enif_get_string(env, argv[1], args->uri, sizeof args->uri, ERL_NIF_LATIN1) > 0) && + (enif_get_string(env, argv[1], args->uri, sizeof(args->uri), ERL_NIF_LATIN1) > 0) && enif_is_binary(env, argv[2]))) { ASYNC_NIF_RETURN_BADARG(); } @@ -956,8 +969,8 @@ ASYNC_NIF_DECL( if (!(argc == 4 && enif_get_resource(env, argv[0], wterl_conn_RESOURCE, (void**)&args->conn_handle) && - (enif_get_string(env, argv[1], args->oldname, sizeof args->oldname, ERL_NIF_LATIN1) > 0) && - (enif_get_string(env, argv[2], args->newname, sizeof args->newname, ERL_NIF_LATIN1) > 0) && + (enif_get_string(env, argv[1], args->oldname, sizeof(args->oldname), ERL_NIF_LATIN1) > 0) && + (enif_get_string(env, argv[2], args->newname, sizeof(args->newname), ERL_NIF_LATIN1) > 0) && enif_is_binary(env, argv[3]))) { ASYNC_NIF_RETURN_BADARG(); } @@ -1025,7 +1038,7 @@ ASYNC_NIF_DECL( if (!(argc == 3 && enif_get_resource(env, argv[0], wterl_conn_RESOURCE, (void**)&args->conn_handle) && - (enif_get_string(env, argv[1], args->uri, sizeof args->uri, ERL_NIF_LATIN1) > 0) && + (enif_get_string(env, argv[1], args->uri, sizeof(args->uri), ERL_NIF_LATIN1) > 0) && enif_is_binary(env, argv[2]))) { ASYNC_NIF_RETURN_BADARG(); } @@ -1139,7 +1152,7 @@ ASYNC_NIF_DECL( if (!(argc == 5 && enif_get_resource(env, argv[0], wterl_conn_RESOURCE, (void**)&args->conn_handle) && - (enif_get_string(env, argv[1], args->uri, sizeof args->uri, ERL_NIF_LATIN1) > 0) && + (enif_get_string(env, argv[1], args->uri, sizeof(args->uri), ERL_NIF_LATIN1) > 0) && enif_is_binary(env, argv[4]))) { ASYNC_NIF_RETURN_BADARG(); } @@ -1298,7 +1311,7 @@ ASYNC_NIF_DECL( if (!(argc == 3 && enif_get_resource(env, argv[0], wterl_conn_RESOURCE, (void**)&args->conn_handle) && - (enif_get_string(env, argv[1], args->uri, sizeof args->uri, ERL_NIF_LATIN1) > 0) && + (enif_get_string(env, argv[1], args->uri, sizeof(args->uri), ERL_NIF_LATIN1) > 0) && enif_is_binary(env, argv[2]))) { ASYNC_NIF_RETURN_BADARG(); } @@ -1360,7 +1373,7 @@ ASYNC_NIF_DECL( if (!(argc == 3 && enif_get_resource(env, argv[0], wterl_conn_RESOURCE, (void**)&args->conn_handle) && - (enif_get_string(env, argv[1], args->uri, sizeof args->uri, ERL_NIF_LATIN1) > 0) && + (enif_get_string(env, argv[1], args->uri, sizeof(args->uri), ERL_NIF_LATIN1) > 0) && enif_is_binary(env, argv[2]))) { ASYNC_NIF_RETURN_BADARG(); } @@ -1421,13 +1434,13 @@ ASYNC_NIF_DECL( if (!(argc == 3 && enif_get_resource(env, argv[0], wterl_conn_RESOURCE, (void**)&args->conn_handle) && - (enif_get_string(env, argv[1], args->uri, sizeof args->uri, ERL_NIF_LATIN1) > 0) && + (enif_get_string(env, argv[1], args->uri, sizeof(args->uri), ERL_NIF_LATIN1) > 0) && enif_is_binary(env, argv[2]))) { ASYNC_NIF_RETURN_BADARG(); } args->key = 
enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); enif_keep_resource((void*)args->conn_handle); - affinity = __str_hash(0, args->uri, strlen(args->uri)); + affinity = __str_hash(0, args->uri, __strlen(args->uri)); }, { // work @@ -1480,13 +1493,13 @@ ASYNC_NIF_DECL( if (!(argc == 3 && enif_get_resource(env, argv[0], wterl_conn_RESOURCE, (void**)&args->conn_handle) && - (enif_get_string(env, argv[1], args->uri, sizeof args->uri, ERL_NIF_LATIN1) > 0) && + (enif_get_string(env, argv[1], args->uri, sizeof(args->uri), ERL_NIF_LATIN1) > 0) && enif_is_binary(env, argv[2]))) { ASYNC_NIF_RETURN_BADARG(); } args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); enif_keep_resource((void*)args->conn_handle); - affinity = __str_hash(0, args->uri, strlen(args->uri)); + affinity = __str_hash(0, args->uri, __strlen(args->uri)); }, { // work @@ -1558,7 +1571,7 @@ ASYNC_NIF_DECL( if (!(argc == 4 && enif_get_resource(env, argv[0], wterl_conn_RESOURCE, (void**)&args->conn_handle) && - (enif_get_string(env, argv[1], args->uri, sizeof args->uri, ERL_NIF_LATIN1) > 0) && + (enif_get_string(env, argv[1], args->uri, sizeof(args->uri), ERL_NIF_LATIN1) > 0) && enif_is_binary(env, argv[2]) && enif_is_binary(env, argv[3]))) { ASYNC_NIF_RETURN_BADARG(); @@ -1566,7 +1579,7 @@ ASYNC_NIF_DECL( args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); args->value = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[3]); enif_keep_resource((void*)args->conn_handle); - affinity = __str_hash(0, args->uri, strlen(args->uri)); + affinity = __str_hash(0, args->uri, __strlen(args->uri)); }, { // work @@ -1628,7 +1641,7 @@ ASYNC_NIF_DECL( if (!(argc == 3 && enif_get_resource(env, argv[0], wterl_conn_RESOURCE, (void**)&args->conn_handle) && - (enif_get_string(env, argv[1], args->uri, sizeof args->uri, ERL_NIF_LATIN1) > 0) && + (enif_get_string(env, argv[1], args->uri, sizeof(args->uri), ERL_NIF_LATIN1) > 0) && enif_is_binary(env, argv[2]))) { ASYNC_NIF_RETURN_BADARG(); } @@ -2298,6 +2311,26 @@ wterl_set_event_handler_pid(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) } +/** + * Called when a connection is free'd, our opportunity to clean up + * allocated resources. + */ +static void __wterl_conn_dtor(ErlNifEnv* env, void* obj) +{ + UNUSED(env); + WterlConnHandle *conn_handle = (WterlConnHandle *)obj; + + if (conn_handle->cache_mutex) { + DPRINTF("Non-NULL conn_handle (%p) to free", obj); + enif_mutex_lock(conn_handle->cache_mutex); + __close_all_sessions(conn_handle); + conn_handle->conn->close(conn_handle->conn, NULL); + enif_mutex_unlock(conn_handle->cache_mutex); + enif_mutex_destroy(conn_handle->cache_mutex); + } +} + + /** * Called as this driver is loaded by the Erlang BEAM runtime triggered by the * module's on_load directive. 
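An aside on the destructor added above: it runs only when the BEAM garbage
collects the last reference to the connection term, so the cleanup becomes
automatic once the dtor is registered with the resource type. A minimal,
self-contained sketch of that erl_nif wiring, with a hypothetical my_handle
resource standing in for WterlConnHandle (only documented erl_nif calls are
used; this is not the patch's code):

    #include "erl_nif.h"

    typedef struct { ErlNifMutex *lock; } my_handle;

    /* Called by the VM when the last reference to a my_handle term dies. */
    static void my_handle_dtor(ErlNifEnv *env, void *obj)
    {
        my_handle *h = (my_handle *)obj;
        if (h->lock)
            enif_mutex_destroy(h->lock);
    }

    static ErlNifResourceType *my_handle_type;

    static int my_on_load(ErlNifEnv *env)
    {
        /* Registering the dtor here is what makes cleanup automatic. */
        my_handle_type = enif_open_resource_type(env, NULL, "my_handle",
                                                 my_handle_dtor,
                                                 ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER,
                                                 NULL);
        return my_handle_type == NULL ? -1 : 0;
    }

Handles are then created with enif_alloc_resource() and handed back to Erlang
via enif_make_resource()/enif_release_resource(), the same pattern wterl's
connection_open follows.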
@@ -2317,7 +2350,7 @@ on_load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) const ERL_NIF_TERM* option; ErlNifResourceFlags flags = ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER; wterl_conn_RESOURCE = enif_open_resource_type(env, NULL, "wterl_conn_resource", - NULL, flags, NULL); + __wterl_conn_dtor, flags, NULL); wterl_cursor_RESOURCE = enif_open_resource_type(env, NULL, "wterl_cursor_resource", NULL, flags, NULL); @@ -2337,9 +2370,6 @@ on_load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) return ENOMEM; memset(priv, 0, sizeof(struct wterl_priv_data)); - priv->conns_mutex = enif_mutex_create(NULL); - SLIST_INIT(&priv->conns); - struct wterl_event_handlers *eh = &priv->eh; eh->error_mutex = enif_mutex_create(NULL); eh->message_mutex = enif_mutex_create(NULL); @@ -2365,7 +2395,6 @@ on_load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) pointer to the async_nif's private data which we set here. */ ASYNC_NIF_LOAD(wterl, priv->async_nif_priv); if (!priv->async_nif_priv) { - enif_mutex_destroy(priv->conns_mutex); memset(priv, 0, sizeof(struct wterl_priv_data)); enif_free(priv); return ENOMEM; @@ -2391,23 +2420,12 @@ static void on_unload(ErlNifEnv *env, void *priv_data) { struct wterl_priv_data *priv = (struct wterl_priv_data *)priv_data; - WterlConnHandle *conn_handle; if (priv_data == NULL) return; - enif_mutex_lock(priv->conns_mutex); + DPRINTF("unloading wterl NIF (%p)", priv); ASYNC_NIF_UNLOAD(wterl, env, priv->async_nif_priv); - SLIST_FOREACH(conn_handle, &priv->conns, conns) { - enif_mutex_lock(conn_handle->cache_mutex); - __close_all_sessions(conn_handle); - conn_handle->conn->close(conn_handle->conn, NULL); - if (conn_handle->session_config != NULL) { - enif_free((void*)conn_handle->session_config); - } - enif_mutex_unlock(conn_handle->cache_mutex); - enif_mutex_destroy(conn_handle->cache_mutex); - } /* At this point all WiredTiger state and threads are free'd/stopped so there is no chance that the event handler functions will be called so we can @@ -2423,8 +2441,6 @@ on_unload(ErlNifEnv *env, void *priv_data) if (eh->msg_env_progress) enif_free_env(eh->msg_env_progress); - enif_mutex_unlock(priv->conns_mutex); - enif_mutex_destroy(priv->conns_mutex); memset(priv, 0, sizeof(struct wterl_priv_data)); enif_free(priv); diff --git a/src/wterl.erl b/src/wterl.erl index 4dc5b79..f0b26d4 100644 --- a/src/wterl.erl +++ b/src/wterl.erl @@ -618,7 +618,7 @@ various_online_test_() -> end}, {"truncate entire table", fun() -> - ?assertMatch(ok, truncate(ConnRef, "table:test")), + ?assertMatch(ok, truncate(ConnRef, "table:test")), ?assertMatch(not_found, get(ConnRef, "table:test", <<"a">>)) end}, %% {"truncate range [<>..last], ensure value outside range is found after", From 50e24d0f48802a689035bf165cfdbbc904d481a8 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Tue, 18 Jun 2013 09:21:58 -0400 Subject: [PATCH 17/30] Add a longer, multi-table test. Use a release version of WiredTiger from now on. 
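One functional change rides along in this commit: the affinity string hash
becomes a rotate-and-add. A standalone sketch of the arithmetic (hypothetical
harness with simplified loop bounds; only the update step matches the diff):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* (h << 5) + (h >> 27) is exactly a 5-bit left rotation of a 32-bit h,
       since the two terms occupy disjoint bit ranges. */
    static uint32_t str_hash(uint32_t h, const char *p, size_t len)
    {
        while (len-- > 0)
            h += (h << 5) + (h >> 27) + (uint8_t)*p++;
        return h;
    }

    int main(void)
    {
        const char *uri = "table:test";
        printf("affinity(%s) = %u\n", uri, str_hash(0, uri, strlen(uri)));
        return 0;
    }

Hashing the URI keeps all operations on one table flowing to the same request
queue, so a worker there can keep reusing its cached session/cursor pair.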
---
 c_src/build_deps.sh | 10 +++++-----
 c_src/wterl.c       |  2 +-
 src/wterl.erl       | 37 +++++++++++++++++++++++++++++++++++--
 update-version.sh   |  5 ++---
 4 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/c_src/build_deps.sh b/c_src/build_deps.sh
index 1a64c10..cc0e807 100755
--- a/c_src/build_deps.sh
+++ b/c_src/build_deps.sh
@@ -11,9 +11,9 @@ unset POSIX_SHELL # clear it so if we invoke other scripts, they run as ksh as w
 set -e

 WT_REPO=http://github.com/wiredtiger/wiredtiger.git
-WT_BRANCH=develop
-WT_VSN=""
-WT_DIR=wiredtiger-$WT_BRANCH
+WT_BRANCH=
+WT_REF="tags/1.6.2"
+WT_DIR=wiredtiger-`basename $WT_REF`

 SNAPPY_VSN="1.0.4"
 SNAPPY_DIR=snappy-$SNAPPY_VSN
@@ -35,9 +35,9 @@ get_wt ()
     if [ -d $BASEDIR/$WT_DIR/.git ]; then
         (cd $BASEDIR/$WT_DIR && git pull -u) || exit 1
     else
-        if [ "X$WT_VSN" != "X" ]; then
+        if [ "X$WT_REF" != "X" ]; then
             git clone ${WT_REPO} && \
-                (cd $BASEDIR/wiredtiger && git checkout $WT_VSN || exit 1)
+                (cd $BASEDIR/wiredtiger && git checkout refs/$WT_REF || exit 1)
         else
             git clone ${WT_REPO} && \
                 (cd $BASEDIR/wiredtiger && git checkout -b $WT_BRANCH origin/$WT_BRANCH || exit 1)
diff --git a/c_src/wterl.c b/c_src/wterl.c
index cfb5b42..fd1dafd 100644
--- a/c_src/wterl.c
+++ b/c_src/wterl.c
@@ -127,7 +127,7 @@ __str_hash(uint32_t in, const char *p, size_t len)
 {
     uint32_t h = in;
     for (++p ; len > 0; ++p, --len)
-        h = (h << 5) - h + (uint32_t)*p;
+        h += (h << 5) + (h >> 27) + *p;
     return h;
 }

diff --git a/src/wterl.erl b/src/wterl.erl
index f0b26d4..0807ec8 100644
--- a/src/wterl.erl
+++ b/src/wterl.erl
@@ -96,8 +96,8 @@ nif_stub_error(Line) ->
 -spec init() -> ok | {error, any()}.
 init() ->
     erlang:load_nif(filename:join([priv_dir(), atom_to_list(?MODULE)]),
-                    [{wterl_vsn, "b2c0b65"},
-                     {wiredtiger_vsn, "1.6.1-87-gbe6742a"}]).
+                    [{wterl_vsn, "53307e8"},
+                     {wiredtiger_vsn, "1.6.2-0-g07cb0a5"}]).

 -spec connection_open(string(), config_list()) -> {ok, connection()} | {error, term()}.
 -spec connection_open(string(), config_list(), config_list()) -> {ok, connection()} | {error, term()}.
@@ -586,6 +586,39 @@ insert_delete_test() ->
     ?assertMatch(not_found, get(ConnRef, "table:test", <<"a">>)),
     ok = connection_close(ConnRef).

+many_open_tables_test_() ->
+    {timeout, 60,
+     fun() ->
+             ConnOpts = [{create,true},{cache_size,"100MB"},{session_max, 8192}],
+             DataDir = ?TEST_DATA_DIR,
+             KeyGen =
+                 fun(X) ->
+                         crypto:sha(<<X>>)
+                 end,
+             ValGen =
+                 fun() ->
+                         crypto:rand_bytes(crypto:rand_uniform(128, 4096))
+                 end,
+             TableNameGen =
+                 fun(X) ->
+                         "lsm:" ++ integer_to_list(X)
+                 end,
+             N = 1000,
+             ConnRef = open_test_conn(DataDir, ConnOpts),
+             Parent = self(),
+             [wterl:create(ConnRef, TableNameGen(X), [{checksum, "uncompressed"}]) || X <- lists:seq(0, 128)],
+             [spawn(fun() ->
+                            TableName = TableNameGen(X),
+                            [wterl:put(ConnRef, TableName, KeyGen(P), ValGen()) || P <- lists:seq(1, N)],
+                            [wterl:get(ConnRef, TableName, KeyGen(P)) || P <- lists:seq(1, N)],
+                            [wterl:delete(ConnRef, TableName, KeyGen(P)) || P <- lists:seq(1, N)],
+                            Parent ! done
+                    end) || X <- lists:seq(0, 128)],
+             [wterl:drop(ConnRef, TableNameGen(X)) || X <- lists:seq(0, 128)],
+             [receive done -> ok end || _ <- lists:seq(0, 128)],
+             ok = wterl:connection_close(ConnRef)
+     end}.
+ init_test_table() -> ConnRef = open_test_conn(?TEST_DATA_DIR), ConnRef = open_test_table(ConnRef), diff --git a/update-version.sh b/update-version.sh index 4d99734..f457d06 100755 --- a/update-version.sh +++ b/update-version.sh @@ -2,10 +2,9 @@ # Note: also, remember to update version numbers in rpath specs so that shared libs can be found at runtime!!! -wterl=`git log -n 1 --pretty=format:"%H"` -wiredtiger0=`(cd c_src/wiredtiger-develop && git log -n 1 --pretty=format:"%H")` +wterl=`git describe --always --long --tags` +wiredtiger0=`(cd c_src/wiredtiger-[0-9.]* && git describe --always --long --tags)` wiredtiger=`echo $wiredtiger0 | awk '{print $2}'` echo $wterl echo $wiredtiger - From 34e88c9234d7cd89e9fd13f4799012f1f42e527c Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Tue, 18 Jun 2013 13:12:10 -0400 Subject: [PATCH 18/30] Add some debugging output. --- c_src/async_nif.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/c_src/async_nif.h b/c_src/async_nif.h index 78c962d..88d1493 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -82,11 +82,15 @@ struct async_nif_state { struct decl ## _args frame; \ static void fn_work_ ## decl (ErlNifEnv *env, ERL_NIF_TERM ref, ErlNifPid *pid, unsigned int worker_id, struct decl ## _args *args) { \ UNUSED(worker_id); \ + DPRINTF("async_nif: calling \"%s\"", __func__); \ do work_block while(0); \ + DPRINTF("async_nif: returned from \"%s\"", __func__); \ } \ static void fn_post_ ## decl (struct decl ## _args *args) { \ UNUSED(args); \ + DPRINTF("async_nif: calling \"fn_post_%s\"", #decl); \ do post_block while(0); \ + DPRINTF("async_nif: returned from \"fn_post_%s\"", #decl); \ } \ static ERL_NIF_TERM decl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv_in[]) { \ struct decl ## _args on_stack_args; \ @@ -110,7 +114,9 @@ struct async_nif_state { enif_make_atom(env, "eagain")); \ } \ new_env = req->env; \ + DPRINTF("async_nif: calling \"%s\"", __func__); \ do pre_block while(0); \ + DPRINTF("async_nif: returned from \"%s\"", __func__); \ copy_of_args = (struct decl ## _args *)enif_alloc(sizeof(struct decl ## _args)); \ if (!copy_of_args) { \ fn_post_ ## decl (args); \ From 4ae8ffb4cd288be955a336de64393539cfb4f879 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Tue, 18 Jun 2013 13:49:25 -0400 Subject: [PATCH 19/30] Update debugging messages a bit. Fix a bug in the signature function. --- c_src/common.h | 2 +- c_src/wterl.c | 32 ++++++++++++++++++-------------- src/wterl.erl | 17 +++++++++++++++++ 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/c_src/common.h b/c_src/common.h index b8324da..df2f162 100644 --- a/c_src/common.h +++ b/c_src/common.h @@ -33,7 +33,7 @@ extern "C" { #include #define DPRINTF(fmt, ...) 
\ do { \ - fprintf(stderr, "%s:%d (%s) " fmt "\n", __FILE__, __LINE__, __func__, __VA_ARGS__); \ + fprintf(stderr, "%s:%d " fmt "\n", __FILE__, __LINE__, __VA_ARGS__); \ fflush(stderr); \ } while(0) #define DPUTS(arg) DPRINTF("%s", arg) diff --git a/c_src/wterl.c b/c_src/wterl.c index fd1dafd..c99c8a9 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -223,6 +223,7 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) log = __log2(elapsed); if (log > mean) { STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries); + DPRINTF("evicting: %llu", PRIuint64(c->sig)); c->session->close(c->session, NULL); enif_free(c); num_evicted++; @@ -329,9 +330,8 @@ __ctx_cache_sig(const char *c, va_list ap, int count, size_t *len) } } - sig = crc; - sig = sig << 32; - sig &= hash; + sig = (uint64_t)crc << 32 | hash; + //DPRINTF("sig %llu [%u:%u]", PRIuint64(sig), crc, hash); return sig; } @@ -369,21 +369,22 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, *ctx = NULL; do { + WMB_NEAR_CAS(); c = conn_handle->mru_ctx[worker_id]; - if (CASPO(&conn_handle->mru_ctx[worker_id], c, NULL) == c) { - if (c == NULL) { + if (CASPO(&conn_handle->mru_ctx[worker_id], c, 0) == c) { + if (c == 0) { // mru miss: - DPRINTF("[%.4u] mru miss: %llu != NULL", worker_id, PRIuint64(sig)); + DPRINTF("[%.4u] mru miss, empty", worker_id); *ctx = NULL; } else { if (c->sig == sig) { // mru hit: - DPRINTF("[%.4u] mru hit: %llu", worker_id, PRIuint64(sig)); + DPRINTF("[%.4u] mru hit: %llu found", worker_id, PRIuint64(sig)); *ctx = c; break; } else { // mru mismatch: - DPRINTF("[%.4u] mru mismatch: %llu != %llu", worker_id, PRIuint64(sig), PRIuint64(c->sig)); + DPRINTF("[%.4u] mru miss: %llu != %llu", worker_id, PRIuint64(sig), PRIuint64(c->sig)); __ctx_cache_add(conn_handle, c); *ctx = NULL; } @@ -397,7 +398,7 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, (*ctx) = __ctx_cache_find(conn_handle, sig); if ((*ctx) == NULL) { // cache miss: - DPRINTF("[%.4u] cache miss: %llu [%d]", worker_id, PRIuint64(sig), conn_handle->cache_size); + DPRINTF("[%.4u] cache miss: %llu [cache size: %d]", worker_id, PRIuint64(sig), conn_handle->cache_size); WT_CONNECTION *conn = conn_handle->conn; WT_SESSION *session = NULL; int rc = conn->open_session(conn, NULL, session_config, &session); @@ -435,7 +436,7 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, va_end(ap); } else { // cache hit: - DPRINTF("[%.4u] cache hit: %llu [%d]", worker_id, PRIuint64(sig), conn_handle->cache_size); + DPRINTF("[%.4u] cache hit: %llu [cache size: %d]", worker_id, PRIuint64(sig), conn_handle->cache_size); } } return 0; @@ -456,6 +457,7 @@ __release_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx cursor->reset(cursor); } + WMB_NEAR_CAS(); c = conn_handle->mru_ctx[worker_id]; if (CASPO(&conn_handle->mru_ctx[worker_id], c, ctx) != c) { __ctx_cache_add(conn_handle, ctx); @@ -484,10 +486,11 @@ __close_all_sessions(WterlConnHandle *conn_handle) // clear out the mru for (worker_id = 0; worker_id < ASYNC_NIF_MAX_WORKERS; worker_id++) { do { + WMB_NEAR_CAS(); c = conn_handle->mru_ctx[worker_id]; - } while(CASPO(&conn_handle->mru_ctx[worker_id], c, NULL) != c); + } while(CASPO(&conn_handle->mru_ctx[worker_id], c, 0) != c); - if (c != NULL) { + if (c != 0) { c->session->close(c->session, NULL); enif_free(c); } @@ -518,8 +521,9 @@ __close_cursors_on(WterlConnHandle *conn_handle, const char *uri) // walk the mru first, look for open cursors on matching uri for (worker_id = 0; worker_id < ASYNC_NIF_MAX_WORKERS; worker_id++) 
 {
+        WMB_NEAR_CAS();
         c = conn_handle->mru_ctx[worker_id];
-        if (CASPO(&conn_handle->mru_ctx[worker_id], c, NULL) == c && c != NULL) {
+        if (CASPO(&conn_handle->mru_ctx[worker_id], c, 0) == c && c != 0) {
             cnt = c->num_cursors;
             for(idx = 0; idx < cnt; idx++) {
                 if (!strcmp(c->ci[idx].uri, uri)) {
@@ -527,7 +531,7 @@ __close_cursors_on(WterlConnHandle *conn_handle, const char *uri)
                     enif_free(c);
                     break;
                 } else {
-                    if (CASPO(&conn_handle->mru_ctx[worker_id], NULL, c) != NULL) {
+                    if (CASPO(&conn_handle->mru_ctx[worker_id], 0, c) != 0) {
                         __ctx_cache_add(conn_handle, c);
                     }
                 }
diff --git a/src/wterl.erl b/src/wterl.erl
index 0807ec8..00d771b 100644
--- a/src/wterl.erl
+++ b/src/wterl.erl
@@ -586,6 +586,23 @@ insert_delete_test() ->
     ?assertMatch(not_found, get(ConnRef, "table:test", <<"a">>)),
     ok = connection_close(ConnRef).

+cursor_fold_keys_test() ->
+    ConnRef = open_test_conn(?TEST_DATA_DIR),
+    ConnRef = open_test_table(ConnRef),
+    [wterl:put(ConnRef, "table:test-fold", crypto:sha(<<X>>),
+               crypto:rand_bytes(crypto:rand_uniform(128, 4096)))
+     || X <- lists:seq(1, 2000)],
+    Cursor = wterl:cursor_open(ConnRef, "table:test-fold"),
+    try
+        {Result, _} = wterl:fold_keys(Cursor, fun(Key, Acc) -> [Key | Acc] end, [])
+    catch
+        _:_ -> wterl:cursor_close(Cursor)
+    after
+        ok = connection_close(ConnRef)
+    end.
+% ?assertMatch(lists:sort(Result),
+%              lists:sort([crypto:sha(<<X>>) || X <- lists:seq(1, 2000)])).
+
 many_open_tables_test_() ->
     {timeout, 60,
      fun() ->

From 450299dc2d7eda50666de006fcbc2f1b961dd227 Mon Sep 17 00:00:00 2001
From: Gregory Burd
Date: Tue, 18 Jun 2013 16:40:43 -0400
Subject: [PATCH 20/30] Comment out a test that's not yet working.

---
 c_src/wterl.c |  2 +-
 src/wterl.erl | 32 ++++++++++++++++----------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/c_src/wterl.c b/c_src/wterl.c
index c99c8a9..ffa24c6 100644
--- a/c_src/wterl.c
+++ b/c_src/wterl.c
@@ -195,7 +195,7 @@ __ctx_cache_evict(WterlConnHandle *conn_handle)
     uint64_t now, elapsed;
     struct wterl_ctx *c, *n;

-    if (conn_handle->cache_size != MAX_CACHE_SIZE)
+    if (conn_handle->cache_size < MAX_CACHE_SIZE)
         return 0;

diff --git a/src/wterl.erl b/src/wterl.erl
index 00d771b..495be92 100644
--- a/src/wterl.erl
+++ b/src/wterl.erl
@@ -586,22 +586,22 @@ insert_delete_test() ->
     ?assertMatch(not_found, get(ConnRef, "table:test", <<"a">>)),
     ok = connection_close(ConnRef).

-cursor_fold_keys_test() ->
-    ConnRef = open_test_conn(?TEST_DATA_DIR),
-    ConnRef = open_test_table(ConnRef),
-    [wterl:put(ConnRef, "table:test-fold", crypto:sha(<<X>>),
-               crypto:rand_bytes(crypto:rand_uniform(128, 4096)))
-     || X <- lists:seq(1, 2000)],
-    Cursor = wterl:cursor_open(ConnRef, "table:test-fold"),
-    try
-        {Result, _} = wterl:fold_keys(Cursor, fun(Key, Acc) -> [Key | Acc] end, [])
-    catch
-        _:_ -> wterl:cursor_close(Cursor)
-    after
-        ok = connection_close(ConnRef)
-    end.
+%% cursor_fold_keys_test() ->
+%%     ConnRef = open_test_conn(?TEST_DATA_DIR),
+%%     ConnRef = open_test_table(ConnRef),
+%%     [wterl:put(ConnRef, "table:test-fold", crypto:sha(<<X>>),
+%%                crypto:rand_bytes(crypto:rand_uniform(128, 4096)))
+%%      || X <- lists:seq(1, 2000)],
+%%     Cursor = wterl:cursor_open(ConnRef, "table:test-fold"),
+%%     try
+%%         {Result, _} = wterl:fold_keys(Cursor, fun(Key, Acc) -> [Key | Acc] end, [])
+%%     catch
+%%         _:_ -> wterl:cursor_close(Cursor)
+%%     after
+%%         ok = connection_close(ConnRef)
+%%     end.
+%% ?assertMatch(lists:sort(Result),
+%%              lists:sort([crypto:sha(<<X>>) || X <- lists:seq(1, 2000)])).

 many_open_tables_test_() ->
     {timeout, 60,
      fun() ->

From 0f180a6531857980c922010837214d0d8956b9f3 Mon Sep 17 00:00:00 2001
From: Gregory Burd
Date: Wed, 19 Jun 2013 14:37:30 -0400
Subject: [PATCH 21/30] Fixed a few mistakes.

---
 c_src/wterl.c                 | 37 ++++++++++++-------
 src/riak_kv_wterl_backend.erl | 76 ++++++++++++++++-------------------
 src/wterl.erl                 |  1 +
 3 files changed, 59 insertions(+), 55 deletions(-)

diff --git a/c_src/wterl.c b/c_src/wterl.c
index ffa24c6..824224d 100644
--- a/c_src/wterl.c
+++ b/c_src/wterl.c
@@ -46,8 +46,8 @@ struct wterl_ctx {
     uint64_t sig;
     size_t sig_len;
     WT_SESSION *session;
-    const char *session_config;
     uint32_t num_cursors;
+    const char *session_config;
     struct cursor_info {
         const char *uri;
         const char *config;
@@ -127,7 +127,7 @@ __str_hash(uint32_t in, const char *p, size_t len)
 {
     uint32_t h = in;
     for (++p ; len > 0; ++p, --len)
-	h += (h << 5) + (h >> 27) + *p;
+        h += (h << 5) + (h >> 27) + *p;
     return h;
 }

@@ -191,13 +191,16 @@ static int
 __ctx_cache_evict(WterlConnHandle *conn_handle)
 {
-    uint32_t mean, log, num_evicted, i;
-    uint64_t now, elapsed;
-    struct wterl_ctx *c, *n;
+    uint32_t num_evicted = 0;
+    struct wterl_ctx *c;

     if (conn_handle->cache_size < MAX_CACHE_SIZE)
         return 0;

+#if 0 // TODO: fixme once stats work again
+    uint32_t mean, log, num_evicted, i;
+    uint64_t now, elapsed;
+    struct wterl_ctx *c, *n;
     now = cpu_clock_ticks();

     // Find the mean of the recorded times that items stayed in cache.
@@ -230,6 +233,16 @@ __ctx_cache_evict(WterlConnHandle *conn_handle)
         }
         c = n;
     }
+#else
+    c = STAILQ_FIRST(&conn_handle->cache);
+    if (c) {
+        STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries);
+        DPRINTF("evicting: %llu", PRIuint64(c->sig));
+        c->session->close(c->session, NULL);
+        enif_free(c);
+        num_evicted++;
+    }
+#endif
     conn_handle->cache_size -= num_evicted;
     return num_evicted;
 }
@@ -246,22 +259,20 @@ __ctx_cache_evict(WterlConnHandle *conn_handle)
 static struct wterl_ctx *
 __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig)
 {
-    struct wterl_ctx *c, *n;
+    struct wterl_ctx *c;

     enif_mutex_lock(conn_handle->cache_mutex);
     c = STAILQ_FIRST(&conn_handle->cache);
     while (c != NULL) {
-        n = STAILQ_NEXT(c, entries);
         if (c->sig == sig) { // TODO: hash collisions *will* lead to SEGVs
             // cache hit:
-            STAILQ_REMOVE_HEAD(&conn_handle->cache, entries);
+            STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries);
             conn_handle->histogram[__log2(cpu_clock_ticks() - c->tstamp)]++;
             conn_handle->histogram_count++;
             conn_handle->cache_size -= 1;
             break;
-        } else {
-            c = n;
         }
+        c = STAILQ_NEXT(c, entries);
     }
     enif_mutex_unlock(conn_handle->cache_mutex);
     return c;
@@ -369,7 +380,6 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id,
     *ctx = NULL;

     do {
-        WMB_NEAR_CAS();
         c = conn_handle->mru_ctx[worker_id];
         if (CASPO(&conn_handle->mru_ctx[worker_id], c, 0) == c) {
             if (c == 0) {
@@ -457,7 +467,6 @@ __release_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx
         cursor->reset(cursor);
     }

-    WMB_NEAR_CAS();
     c = conn_handle->mru_ctx[worker_id];
     if (CASPO(&conn_handle->mru_ctx[worker_id], c, ctx) != c) {
         __ctx_cache_add(conn_handle, ctx);
@@ -486,7 +495,6 @@ __close_all_sessions(WterlConnHandle *conn_handle)
     // clear out the mru
     for (worker_id = 0; worker_id < ASYNC_NIF_MAX_WORKERS; worker_id++) {
         do {
-            WMB_NEAR_CAS();
             c = conn_handle->mru_ctx[worker_id];
         } while(CASPO(&conn_handle->mru_ctx[worker_id], c, 0) !=
c); @@ -521,7 +529,6 @@ __close_cursors_on(WterlConnHandle *conn_handle, const char *uri) // walk the mru first, look for open cursors on matching uri for (worker_id = 0; worker_id < ASYNC_NIF_MAX_WORKERS; worker_id++) { - WMB_NEAR_CAS(); c = conn_handle->mru_ctx[worker_id]; if (CASPO(&conn_handle->mru_ctx[worker_id], c, 0) == c && c != 0) { cnt = c->num_cursors; @@ -691,6 +698,7 @@ __strerror_term(ErlNifEnv* env, int rc) and/or may be localized to any given language (i18n). Use the errno atom rather than the message when matching in Erlang. You've been warned. */ + DPRINTF("error: %s", erl_errno_id(rc)); return enif_make_tuple2(env, ATOM_ERROR, enif_make_tuple2(env, enif_make_atom(env, erl_errno_id(rc)), @@ -1861,6 +1869,7 @@ ASYNC_NIF_DECL( WT_CURSOR* cursor = args->cursor_handle->cursor; ASYNC_NIF_REPLY(__cursor_value_ret(env, cursor, cursor->next(cursor))); + DPRINTF("env: %p cursor: %p", env, cursor); }, { // post diff --git a/src/riak_kv_wterl_backend.erl b/src/riak_kv_wterl_backend.erl index 313da29..94cf8bb 100644 --- a/src/riak_kv_wterl_backend.erl +++ b/src/riak_kv_wterl_backend.erl @@ -52,9 +52,7 @@ -record(state, {table :: string(), type :: string(), - connection :: wterl:connection(), - is_empty_cursor :: wterl:cursor(), - status_cursor :: wterl:cursor()}). + connection :: wterl:connection()}). -type state() :: #state{}. -type config() :: [{atom(), term()}]. @@ -135,15 +133,8 @@ start(Partition, Config) -> end, case wterl:create(Connection, Table, TableOpts) of ok -> - case establish_utility_cursors(Connection, Table) of - {ok, IsEmptyCursor, StatusCursor} -> - {ok, #state{table=Table, type=Type, - connection=Connection, - is_empty_cursor=IsEmptyCursor, - status_cursor=StatusCursor}}; - {error, Reason2} -> - {error, Reason2} - end; + {ok, #state{table=Table, type=Type, + connection=Connection}}; {error, Reason3} -> {error, Reason3} end @@ -329,25 +320,42 @@ drop(#state{connection=Connection, table=Table}=State) -> %% @doc Returns true if this wterl backend contains any %% non-tombstone values; otherwise returns false. -spec is_empty(state()) -> boolean(). -is_empty(#state{is_empty_cursor=Cursor}) -> - wterl:cursor_reset(Cursor), - case wterl:cursor_next(Cursor) of - not_found -> true; - {error, {eperm, _}} -> false; % TODO: review/fix this logic - _ -> false +is_empty(#state{connection=Connection, table=Table}) -> + case wterl:cursor_open(Connection, Table) of + {ok, Cursor} -> + IsEmpty = + case wterl:cursor_next(Cursor) of + not_found -> + true; + {error, {eperm, _}} -> + false; % TODO: review/fix this logic + _ -> + false + end, + wterl:cursor_close(Cursor), + IsEmpty; + {error, Reason2} -> + {error, Reason2} end. %% @doc Get the status information for this wterl backend -spec status(state()) -> [{atom(), term()}]. -status(#state{status_cursor=Cursor}) -> - wterl:cursor_reset(Cursor), - case fetch_status(Cursor) of - {ok, Stats} -> - Stats; - {error, {eperm, _}} -> % TODO: review/fix this logic - {ok, []}; - _ -> - {ok, []} +status(#state{connection=Connection, table=Table}) -> + case wterl:cursor_open(Connection, Table) of + {ok, Cursor} -> + TheStats = + case fetch_status(Cursor) of + {ok, Stats} -> + Stats; + {error, {eperm, _}} -> % TODO: review/fix this logic + {ok, []}; + _ -> + {ok, []} + end, + wterl:cursor_close(Cursor), + TheStats; + {error, Reason2} -> + {error, Reason2} end. %% @doc Register an asynchronous callback @@ -373,20 +381,6 @@ max_sessions(Config) -> false -> Est end. 
-%% @private -establish_utility_cursors(Connection, Table) -> - case wterl:cursor_open(Connection, Table) of - {ok, IsEmptyCursor} -> - case wterl:cursor_open(Connection, "statistics:" ++ Table, [{statistics_fast, true}]) of - {ok, StatusCursor} -> - {ok, IsEmptyCursor, StatusCursor}; - {error, Reason1} -> - {error, Reason1} - end; - {error, Reason2} -> - {error, Reason2} - end. - %% @private establish_connection(Config, Type) -> %% Get the data root directory diff --git a/src/wterl.erl b/src/wterl.erl index 495be92..1da6d44 100644 --- a/src/wterl.erl +++ b/src/wterl.erl @@ -95,6 +95,7 @@ nif_stub_error(Line) -> -spec init() -> ok | {error, any()}. init() -> + Module <- [wterl, wterl_conn, wterl_app, wterl_sup, wterl_ets, riak_kv_wiredtiger_backend, temp_riak_kv_backend]], erlang:load_nif(filename:join([priv_dir(), atom_to_list(?MODULE)]), [{wterl_vsn, "53307e8"}, {wiredtiger_vsn, "1.6.2-0-g07cb0a5"}]). From 060abffcff4be928ce111008546931bae3a7cb69 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Wed, 19 Jun 2013 14:42:56 -0400 Subject: [PATCH 22/30] Minor oversight, fixed. --- src/wterl.erl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/wterl.erl b/src/wterl.erl index 1da6d44..495be92 100644 --- a/src/wterl.erl +++ b/src/wterl.erl @@ -95,7 +95,6 @@ nif_stub_error(Line) -> -spec init() -> ok | {error, any()}. init() -> - Module <- [wterl, wterl_conn, wterl_app, wterl_sup, wterl_ets, riak_kv_wiredtiger_backend, temp_riak_kv_backend]], erlang:load_nif(filename:join([priv_dir(), atom_to_list(?MODULE)]), [{wterl_vsn, "53307e8"}, {wiredtiger_vsn, "1.6.2-0-g07cb0a5"}]). From a3c54b1610553f5023ac814202bf646800a710f0 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Wed, 19 Jun 2013 14:54:27 -0400 Subject: [PATCH 23/30] Cleanup a bit. --- c_src/async_nif.h | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/c_src/async_nif.h b/c_src/async_nif.h index 88d1493..ff76364 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -4,18 +4,16 @@ * Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved. * Author: Gregory Burd * - * This file is provided to you under the Apache License, - * Version 2.0 (the "License"); you may not use this file - * except in compliance with the License. You may obtain - * a copy of the License at + * This file is provided to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations * under the License. 
 */

@@ -34,7 +32,8 @@ extern "C" {
 #define UNUSED(v) ((void)(v))
 #endif

-#define ASYNC_NIF_MAX_WORKERS 8
-#define ASYNC_NIF_WORKER_QUEUE_SIZE 500
+#define ASYNC_NIF_MAX_WORKERS 1024
+#define ASYNC_NIF_WORKER_QUEUE_SIZE 2000
 #define ASYNC_NIF_MAX_QUEUED_REQS ASYNC_NIF_WORKER_QUEUE_SIZE * ASYNC_NIF_MAX_WORKERS

 STAT_DECL(qwait, 1000);
@@ -82,15 +80,15 @@ struct async_nif_state {
   struct decl ## _args frame; \
   static void fn_work_ ## decl (ErlNifEnv *env, ERL_NIF_TERM ref, ErlNifPid *pid, unsigned int worker_id, struct decl ## _args *args) { \
     UNUSED(worker_id); \
-    DPRINTF("async_nif: calling \"%s\"", __func__);		\
+    DPRINTF("async_nif: calling \"%s\"", __func__); \
     do work_block while(0); \
-    DPRINTF("async_nif: returned from \"%s\"", __func__);	\
+    DPRINTF("async_nif: returned from \"%s\"", __func__); \
   } \
   static void fn_post_ ## decl (struct decl ## _args *args) { \
     UNUSED(args); \
-    DPRINTF("async_nif: calling \"fn_post_%s\"", #decl);	\
+    DPRINTF("async_nif: calling \"fn_post_%s\"", #decl); \
     do post_block while(0); \
-    DPRINTF("async_nif: returned from \"fn_post_%s\"", #decl);	\
+    DPRINTF("async_nif: returned from \"fn_post_%s\"", #decl); \
   } \
   static ERL_NIF_TERM decl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv_in[]) { \
     struct decl ## _args on_stack_args; \
@@ -114,9 +112,9 @@ struct async_nif_state {
                               enif_make_atom(env, "eagain")); \
     } \
     new_env = req->env; \
-    DPRINTF("async_nif: calling \"%s\"", __func__);		\
+    DPRINTF("async_nif: calling \"%s\"", __func__); \
     do pre_block while(0); \
-    DPRINTF("async_nif: returned from \"%s\"", __func__);	\
+    DPRINTF("async_nif: returned from \"%s\"", __func__); \
     copy_of_args = (struct decl ## _args *)enif_alloc(sizeof(struct decl ## _args)); \
     if (!copy_of_args) { \
       fn_post_ ## decl (args); \

From c41e411a92bdf61e3b0a0bdc0a09a0a88ae30e46 Mon Sep 17 00:00:00 2001
From: Gregory Burd
Date: Tue, 25 Jun 2013 13:31:43 -0400
Subject: [PATCH 24/30] Worker threads now come and go as needed, with a lower
 bound of 2 and an upper bound of ASYNC_NIF_MAX_WORKERS. Stats were improved
 to use thread-local storage for measurements. With stats working again,
 wterl uses them to decide which cached contexts to evict. Wterl's signature
 calculation for an operation was incorrect, so the cache was not effective
 at all; this has been fixed.
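The grow/shrink policy is easiest to see in isolation. Below is a sketch of
the decision made at enqueue time, under stated assumptions: the types are
hypothetical and start_worker is a stub; the real logic lives in
async_nif_enqueue_req and async_nif_start_worker in the diff that follows.

    #include <stdio.h>

    struct work_queue {
        unsigned int workers;   /* threads currently serving this queue */
        double       avg_wait;  /* mean request wait on this queue */
    };

    /* Stub standing in for async_nif_start_worker(); 0 means started. */
    static int start_worker(struct work_queue *q) { (void)q; return 0; }

    static void maybe_grow(struct work_queue *q, double overall_avg_wait)
    {
        /* Keep a floor of two workers per queue; beyond that, add one
           only while this queue waits longer than the overall average. */
        if (q->workers < 2 || q->avg_wait > overall_avg_wait)
            if (start_worker(q) == 0)
                q->workers++;
    }

    int main(void)
    {
        struct work_queue q = {0, 0.0};
        maybe_grow(&q, 1.0);            /* below the floor, so it spawns */
        printf("workers: %u\n", q.workers);
        return 0;
    }

The shrink half lives in async_nif_worker_fn: a worker that wakes to an empty
queue exits when more than two threads are already serving that queue.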
--- c_src/async_nif.h | 189 +++++++++++++++++++------------- c_src/build_deps.sh | 9 +- c_src/cas.h | 6 +- c_src/duration.h | 2 +- c_src/stats.c | 260 ++++++++++++++++++++++++++++++++++++++++++++ c_src/stats.h | 194 +++------------------------------ c_src/wterl.c | 154 ++++++++++---------------- 7 files changed, 453 insertions(+), 361 deletions(-) create mode 100644 c_src/stats.c diff --git a/c_src/async_nif.h b/c_src/async_nif.h index ff76364..6627152 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -26,6 +26,7 @@ extern "C" { #include #include "fifo_q.h" +#include "queue.h" #include "stats.h" #ifndef UNUSED @@ -33,11 +34,9 @@ extern "C" { #endif #define ASYNC_NIF_MAX_WORKERS 1024 -#define ASYNC_NIF_WORKER_QUEUE_SIZE 2000 +#define ASYNC_NIF_WORKER_QUEUE_SIZE 1000 #define ASYNC_NIF_MAX_QUEUED_REQS ASYNC_NIF_WORKER_QUEUE_SIZE * ASYNC_NIF_MAX_WORKERS -STAT_DECL(qwait, 1000); - struct async_nif_req_entry { ERL_NIF_TERM ref; ErlNifEnv *env; @@ -45,12 +44,12 @@ struct async_nif_req_entry { void *args; void (*fn_work)(ErlNifEnv*, ERL_NIF_TERM, ErlNifPid*, unsigned int, void *); void (*fn_post)(void *); - const char *func; }; DECL_FIFO_QUEUE(reqs, struct async_nif_req_entry); struct async_nif_work_queue { STAT_DEF(qwait); + unsigned int workers; ErlNifMutex *reqs_mutex; ErlNifCond *reqs_cnd; FIFO_QUEUE_TYPE(reqs) reqs; @@ -61,13 +60,15 @@ struct async_nif_worker_entry { unsigned int worker_id; struct async_nif_state *async_nif; struct async_nif_work_queue *q; + SLIST_ENTRY(async_nif_worker_entry) entries; }; struct async_nif_state { STAT_DEF(qwait); unsigned int shutdown; - unsigned int num_workers; - struct async_nif_worker_entry worker_entries[ASYNC_NIF_MAX_WORKERS]; + ErlNifMutex *we_mutex; + unsigned int we_active; + SLIST_HEAD(joining, async_nif_worker_entry) we_joining; unsigned int num_queues; unsigned int next_q; FIFO_QUEUE_TYPE(reqs) recycled_reqs; @@ -107,7 +108,6 @@ struct async_nif_state { enif_make_atom(env, "shutdown")); \ req = async_nif_reuse_req(async_nif); \ if (!req) { \ - async_nif_recycle_req(req, async_nif); \ return enif_make_tuple2(env, enif_make_atom(env, "error"), \ enif_make_atom(env, "eagain")); \ } \ @@ -128,7 +128,6 @@ struct async_nif_state { req->args = (void*)copy_of_args; \ req->fn_work = (void (*)(ErlNifEnv *, ERL_NIF_TERM, ErlNifPid*, unsigned int, void *))fn_work_ ## decl ; \ req->fn_post = (void (*)(void *))fn_post_ ## decl; \ - req->func = __func__; \ int h = -1; \ if (affinity) \ h = affinity % async_nif->num_queues; \ @@ -195,12 +194,12 @@ async_nif_reuse_req(struct async_nif_state *async_nif) if (req) { memset(req, 0, sizeof(struct async_nif_req_entry)); env = enif_alloc_env(); - if (!env) { - enif_free(req); - req = NULL; - } else { + if (env) { req->env = env; async_nif->num_reqs++; + } else { + enif_free(req); + req = NULL; } } } @@ -208,7 +207,7 @@ async_nif_reuse_req(struct async_nif_state *async_nif) req = fifo_q_get(reqs, async_nif->recycled_reqs); } enif_mutex_unlock(async_nif->recycled_req_mutex); - STAT_TICK(async_nif, qwait); + __stat_tick(async_nif->qwait_stat); return req; } @@ -223,16 +222,61 @@ void async_nif_recycle_req(struct async_nif_req_entry *req, struct async_nif_state *async_nif) { ErlNifEnv *env = NULL; - STAT_TOCK(async_nif, qwait); + __stat_tock(async_nif->qwait_stat); enif_mutex_lock(async_nif->recycled_req_mutex); + enif_clear_env(req->env); env = req->env; - enif_clear_env(env); memset(req, 0, sizeof(struct async_nif_req_entry)); req->env = env; fifo_q_put(reqs, async_nif->recycled_reqs, req); 
 enif_mutex_unlock(async_nif->recycled_req_mutex);
 }

+static void *async_nif_worker_fn(void *);
+
+/**
+ * Start up a worker thread.
+ */
+static int
+async_nif_start_worker(struct async_nif_state *async_nif, struct async_nif_work_queue *q)
+{
+  struct async_nif_worker_entry *we;
+
+  if (0 == q)
+      return EINVAL;
+
+  enif_mutex_lock(async_nif->we_mutex);
+
+  we = SLIST_FIRST(&async_nif->we_joining);
+  while(we != NULL) {
+    struct async_nif_worker_entry *n = SLIST_NEXT(we, entries);
+    SLIST_REMOVE_HEAD(&async_nif->we_joining, entries);
+    void *exit_value = 0; /* We ignore the thread_join's exit value. */
+    enif_thread_join(we->tid, &exit_value);
+    enif_free(we);
+    async_nif->we_active--;
+    we = n;
+  }
+
+  if (async_nif->we_active == ASYNC_NIF_MAX_WORKERS) {
+      enif_mutex_unlock(async_nif->we_mutex);
+      return EAGAIN;
+  }
+
+  we = enif_alloc(sizeof(struct async_nif_worker_entry));
+  if (!we) {
+      enif_mutex_unlock(async_nif->we_mutex);
+      return ENOMEM;
+  }
+  memset(we, 0, sizeof(struct async_nif_worker_entry));
+  we->worker_id = async_nif->we_active++;
+  we->async_nif = async_nif;
+  we->q = q;
+
+  enif_mutex_unlock(async_nif->we_mutex);
+  return enif_thread_create(NULL,&we->tid, &async_nif_worker_fn, (void*)we, 0);
+}
+
 /**
  * Enqueue a request for processing by a worker thread.
  *
@@ -244,7 +288,10 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en
 {
   /* Identify the most appropriate worker for this request. */
   unsigned int qid = 0;
+  unsigned int n = async_nif->num_queues;
   struct async_nif_work_queue *q = NULL;
+  double await = 0;
+  double await_inthisq = 0;

   /* Either we're choosing a queue based on some affinity/hinted value or we
      need to select the next queue in the rotation and atomically update that
@@ -257,12 +304,6 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en
     async_nif->next_q = qid;
   }

-  q = &async_nif->queues[qid];
-  enif_mutex_lock(q->reqs_mutex);
-
-#if 0 // stats aren't yet thread safe, so this can go haywire... TODO: fix.
-  unsigned int n = async_nif->num_queues;
-
   /* Now we inspect and iterate across the set of queues trying to select one
      that isn't too full or too slow. */
   do {
@@ -277,8 +318,8 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en
       return 0;
     }
     if (!fifo_q_full(reqs, q->reqs)) {
-      double await = STAT_MEAN_LOG2_SAMPLE(async_nif, qwait);
-      double await_inthisq = STAT_MEAN_LOG2_SAMPLE(q, qwait);
+      await = __stat_mean_log2(async_nif->qwait_stat);
+      await_inthisq = __stat_mean_log2(q->qwait_stat);
       if (await_inthisq > await) {
         enif_mutex_unlock(q->reqs_mutex);
         qid = (qid + 1) % async_nif->num_queues;
@@ -288,13 +329,18 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en
         q = &async_nif->queues[qid];
       } else {
         // q->reqs_mutex unlocked at end of function
         break;
       }
     }
-    // TODO: at some point add in work shedding/stealing
   } while(n-- > 0);
-#endif

   /* We hold the queue's lock, and we've selected a reasonable queue for this
-     new request so add the request. */
-  STAT_TICK(q, qwait);
+     new request; now check to make sure there are enough workers actively
+     processing requests on this queue. */
+  if (q->workers < 2 || await_inthisq > await) {
+     if (async_nif_start_worker(async_nif, q) == 0)
+        q->workers++;
+  }
+
+  /* And finally add the request to the queue.
*/ + __stat_tick(q->qwait_stat); fifo_q_put(reqs, q->reqs, req); /* Build the term before releasing the lock so as not to race on the use of @@ -331,9 +377,14 @@ async_nif_worker_fn(void *arg) } if (fifo_q_empty(reqs, q->reqs)) { /* Queue is empty so we wait for more work to arrive. */ - STAT_RESET(q, qwait); - enif_cond_wait(q->reqs_cnd, q->reqs_mutex); - goto check_again_for_work; + __stat_reset(q->qwait_stat); + if (q->workers > 2) { + enif_mutex_unlock(q->reqs_mutex); + break; + } else { + enif_cond_wait(q->reqs_cnd, q->reqs_mutex); + goto check_again_for_work; + } } else { assert(fifo_q_size(reqs, q->reqs) > 0); assert(fifo_q_size(reqs, q->reqs) < fifo_q_capacity(reqs, q->reqs)); @@ -348,7 +399,7 @@ async_nif_worker_fn(void *arg) /* Perform the work. */ req->fn_work(req->env, req->ref, &req->pid, worker_id, req->args); - STAT_TOCK(q, qwait); + __stat_tock(q->qwait_stat); /* Now call the post-work cleanup function. */ req->fn_post(req->args); @@ -363,6 +414,10 @@ async_nif_worker_fn(void *arg) req = NULL; } } + enif_mutex_lock(async_nif->we_mutex); + SLIST_INSERT_HEAD(&async_nif->we_joining, we, entries); + enif_mutex_unlock(async_nif->we_mutex); + q->workers--; enif_thread_exit(0); return 0; } @@ -374,9 +429,10 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) unsigned int num_queues = async_nif->num_queues; struct async_nif_work_queue *q = NULL; struct async_nif_req_entry *req = NULL; + struct async_nif_worker_entry *we = NULL; UNUSED(env); - STAT_PRINT(async_nif, qwait, "wterl"); + __stat_print_histogram(async_nif->qwait_stat, "wterl"); /* Signal the worker threads, stop what you're doing and exit. To ensure that we don't race with the enqueue() process we first @@ -393,19 +449,29 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) executing requests. */ async_nif->shutdown = 1; - /* Make sure to wake up all worker threads sitting on conditional - wait for work so that they can see it's time to exit. */ for (i = 0; i < num_queues; i++) { q = &async_nif->queues[i]; - enif_cond_broadcast(q->reqs_cnd); enif_mutex_unlock(q->reqs_mutex); } /* Join for the now exiting worker threads. */ - for (i = 0; i < async_nif->num_workers; ++i) { - void *exit_value = 0; /* We ignore the thread_join's exit value. */ - enif_thread_join(async_nif->worker_entries[i].tid, &exit_value); + while(async_nif->we_active > 0) { + + for (i = 0; i < num_queues; i++) + enif_cond_broadcast(async_nif->queues[i].reqs_cnd); + + we = SLIST_FIRST(&async_nif->we_joining); + while(we != NULL) { + struct async_nif_worker_entry *n = SLIST_NEXT(we, entries); + SLIST_REMOVE_HEAD(&async_nif->we_joining, entries); + void *exit_value = 0; /* We ignore the thread_join's exit value. */ + enif_thread_join(we->tid, &exit_value); + enif_free(we); + async_nif->we_active--; + we = n; + } } + enif_mutex_destroy(async_nif->we_mutex); /* Cleanup in-flight requests, mutexes and conditions in each work queue. 
*/ for (i = 0; i < num_queues; i++) { @@ -447,7 +513,7 @@ static void * async_nif_load() { static int has_init = 0; - unsigned int i, j, num_queues; + unsigned int i, num_queues; ErlNifSysInfo info; struct async_nif_state *async_nif; @@ -477,57 +543,24 @@ async_nif_load() if (!async_nif) return NULL; memset(async_nif, 0, sizeof(struct async_nif_state) + - sizeof(struct async_nif_work_queue) * num_queues); + sizeof(struct async_nif_work_queue) * num_queues); async_nif->num_queues = num_queues; - async_nif->num_workers = ASYNC_NIF_MAX_WORKERS; + async_nif->we_active = 0; async_nif->next_q = 0; async_nif->shutdown = 0; async_nif->recycled_reqs = fifo_q_new(reqs, ASYNC_NIF_MAX_QUEUED_REQS); async_nif->recycled_req_mutex = enif_mutex_create(NULL); - STAT_INIT(async_nif, qwait); + async_nif->qwait_stat = __stat_init(1000); + async_nif->we_mutex = enif_mutex_create(NULL); + SLIST_INIT(&async_nif->we_joining); for (i = 0; i < async_nif->num_queues; i++) { struct async_nif_work_queue *q = &async_nif->queues[i]; q->reqs = fifo_q_new(reqs, ASYNC_NIF_WORKER_QUEUE_SIZE); q->reqs_mutex = enif_mutex_create(NULL); q->reqs_cnd = enif_cond_create(NULL); - STAT_INIT(q, qwait); - } - - /* Setup the thread pool management. */ - memset(async_nif->worker_entries, 0, sizeof(struct async_nif_worker_entry) * ASYNC_NIF_MAX_WORKERS); - - /* Start the worker threads. */ - for (i = 0; i < async_nif->num_workers; i++) { - struct async_nif_worker_entry *we = &async_nif->worker_entries[i]; - we->async_nif = async_nif; - we->worker_id = i; - we->q = &async_nif->queues[i % async_nif->num_queues]; - if (enif_thread_create(NULL, &async_nif->worker_entries[i].tid, - &async_nif_worker_fn, (void*)we, NULL) != 0) { - async_nif->shutdown = 1; - - for (j = 0; j < async_nif->num_queues; j++) { - struct async_nif_work_queue *q = &async_nif->queues[j]; - enif_cond_broadcast(q->reqs_cnd); - } - - while(i-- > 0) { - void *exit_value = 0; /* Ignore this. 
*/ - enif_thread_join(async_nif->worker_entries[i].tid, &exit_value); - } - - for (j = 0; j < async_nif->num_queues; j++) { - struct async_nif_work_queue *q = &async_nif->queues[j]; - enif_mutex_destroy(q->reqs_mutex); - enif_cond_destroy(q->reqs_cnd); - } - - memset(async_nif->worker_entries, 0, sizeof(struct async_nif_worker_entry) * ASYNC_NIF_MAX_WORKERS); - enif_free(async_nif); - return NULL; - } + q->qwait_stat = __stat_init(1000); } return async_nif; } diff --git a/c_src/build_deps.sh b/c_src/build_deps.sh index cc0e807..15608ef 100755 --- a/c_src/build_deps.sh +++ b/c_src/build_deps.sh @@ -36,13 +36,12 @@ get_wt () (cd $BASEDIR/$WT_DIR && git pull -u) || exit 1 else if [ "X$WT_REF" != "X" ]; then - git clone ${WT_REPO} && \ - (cd $BASEDIR/wiredtiger && git checkout refs/$WT_REF || exit 1) + git clone ${WT_REPO} ${WT_DIR} && \ + (cd $BASEDIR/$WT_DIR && git checkout refs/$WT_REF || exit 1) else - git clone ${WT_REPO} && \ - (cd $BASEDIR/wiredtiger && git checkout -b $WT_BRANCH origin/$WT_BRANCH || exit 1) + git clone ${WT_REPO} ${WT_DIR} && \ + (cd $BASEDIR/$WT_DIR && git checkout -b $WT_BRANCH origin/$WT_BRANCH || exit 1) fi - mv wiredtiger $WT_DIR || exit 1 fi [ -d $BASEDIR/$WT_DIR ] || (echo "Missing WiredTiger source directory" && exit 1) (cd $BASEDIR/$WT_DIR diff --git a/c_src/cas.h b/c_src/cas.h index ea81dbf..61c1f61 100644 --- a/c_src/cas.h +++ b/c_src/cas.h @@ -69,9 +69,9 @@ do { \ __val = __newval; \ } while ( 0 ) -#define ALIGNED_ENIF_ALLOC(_s) \ - ((void *)(((unsigned long)enif_alloc((_s)+CACHE_LINE_SIZE*2) + \ - CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE-1))) \ +#define CACHE_ALIGNED_SIZEOF(_s) \ + ((sizeof(_s)) + CACHE_LINE_SIZE*2) + \ + CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE-1))) \ /* * I. Compare-and-swap. diff --git a/c_src/duration.h b/c_src/duration.h index 1404f41..2d86385 100644 --- a/c_src/duration.h +++ b/c_src/duration.h @@ -19,7 +19,7 @@ #endif -void current_utc_time(struct timespec *ts) +static inline void current_utc_time(struct timespec *ts) { #ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time clock_serv_t cclock; diff --git a/c_src/stats.c b/c_src/stats.c new file mode 100644 index 0000000..9d56f9e --- /dev/null +++ b/c_src/stats.c @@ -0,0 +1,260 @@ +/* + * stats: + * + * Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved. + * Author: Gregory Burd + * + * This file is provided to you under the Apache License, + * Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain + * a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include "erl_nif.h" +#include "erl_driver.h" + +#include "common.h" +#include "duration.h" +#include "stats.h" + +/** + * Calculate the log2 of 64bit unsigned integers. 
+ */ +#ifdef __GCC__ +#define LOG2(X) ((unsigned) ((8 * (sizeof(uint64_t) - 1)) - __builtin_clzll((X)))) +#else +static unsigned int __log2_64(uint64_t x) { + static const int tab64[64] = { + 63, 0, 58, 1, 59, 47, 53, 2, + 60, 39, 48, 27, 54, 33, 42, 3, + 61, 51, 37, 40, 49, 18, 28, 20, + 55, 30, 34, 11, 43, 14, 22, 4, + 62, 57, 46, 52, 38, 26, 32, 41, + 50, 36, 17, 19, 29, 10, 13, 21, + 56, 45, 25, 31, 35, 16, 9, 12, + 44, 24, 15, 8, 23, 7, 6, 5}; + if (x == 0) return 0; + uint64_t v = x; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + return tab64[((uint64_t)((v - (v >> 1)) * 0x07EDD5E59A4E28C2)) >> 58]; +} +#define LOG2(X) __log2_64(X) +#endif + +double +__stat_mean(struct stat *s) +{ + uint32_t t, h; + double mean; + + if (!s) + return 0.0; + + t = s->h; + h = (s->h + 1) % s->num_samples; + mean = 0; + + while (h != t) { + mean += s->samples[h]; + h = (h + 1) % s->num_samples; + } + if (mean > 0) + mean /= (double)(s->n < s->num_samples ? s->n : s->num_samples); + return mean; +} + +double +__stat_mean_log2(struct stat *s) +{ + uint32_t i; + double mean; + + if (!s) + return 0.0; + + mean = 0; + for (i = 0; i < 64; i++) + mean += (s->histogram[i] * i); + if (mean > 0) + mean /= (double)s->n; + return mean; +} + +uint64_t +__stat_tick(struct stat *s) +{ + duration_t *d; + uint64_t t; + + if (!s) + return 0.0; + + d = (duration_t*)erl_drv_tsd_get(s->duration_key); + if (!d) { + if ((d = enif_alloc(sizeof(duration_t))) == NULL) + return 0; + memset(d, 0, sizeof(duration_t)); + erl_drv_tsd_set(s->duration_key, d); + } + t = ts(d->unit); + d->then = t; + return t; +} + +void +__stat_reset(struct stat *s) +{ + duration_t *d; + + if (!s) + return; + + s->min = ~0; + s->max = 0; + s->h = 0; + memset(s->histogram, 0, sizeof(uint64_t) * 64); + memset(s->samples, 0, sizeof(uint64_t) * s->num_samples); + d = (duration_t*)erl_drv_tsd_get(s->duration_key); + if (d) + d->then = 0; +} + +uint64_t +__stat_tock(struct stat *s) +{ + uint64_t now; + uint64_t elapsed; + uint32_t i; + duration_t *d; + + if (!s) + return 0.0; + + d = (duration_t*)erl_drv_tsd_get(s->duration_key); + if (!d) + return 0; + + now = ts(d->unit); + elapsed = now - d->then; + i = s->h; + if (s->n == s->num_samples) { + s->mean = (s->mean + __stat_mean(s)) / 2.0; + if (s->n >= 4294967295) + __stat_reset(s); + } + s->h = (s->h + 1) % s->num_samples; + s->samples[i] = elapsed; + if (elapsed < s->min) + s->min = elapsed; + if (elapsed > s->max) + s->max = elapsed; + s->histogram[LOG2(elapsed)]++; + s->n++; + d->then = ts(d->unit); + return elapsed; +} + +void +__stat_print_histogram(struct stat *s, const char *mod) +{ + uint8_t logs[64]; + uint8_t i, j, max_log = 0; + double m; + + if (!s) + return; + + m = (s->mean + __stat_mean(s) / 2.0); + + fprintf(stderr, "%s:async_nif request latency histogram:\n", mod); + for (i = 0; i < 64; i++) { + logs[i] = LOG2(s->histogram[i]); + if (logs[i] > max_log) + max_log = logs[i]; + } + for (i = max_log; i > 0; i--) { + if (!(i % 10)) + fprintf(stderr, "2^%2d ", i); + else + fprintf(stderr, " "); + for(j = 0; j < 64; j++) + fprintf(stderr, logs[j] >= i ? 
"•" : " "); + fprintf(stderr, "\n"); + } + if (max_log == 0) { + fprintf(stderr, "[empty]\n"); + } else { + fprintf(stderr, " ns μs ms s ks\n"); + fprintf(stderr, "min: "); + if (s->min < 1000) + fprintf(stderr, "%llu (ns)", PRIuint64(s->min)); + else if (s->min < 1000000) + fprintf(stderr, "%.2f (μs)", s->min / 1000.0); + else if (s->min < 1000000000) + fprintf(stderr, "%.2f (ms)", s->min / 1000000.0); + else if (s->min < 1000000000000) + fprintf(stderr, "%.2f (s)", s->min / 1000000000.0); + fprintf(stderr, " max: "); + if (s->max < 1000) + fprintf(stderr, "%llu (ns)", PRIuint64(s->max)); + else if (s->max < 1000000) + fprintf(stderr, "%.2f (μs)", s->max / 1000.0); + else if (s->max < 1000000000) + fprintf(stderr, "%.2f (ms)", s->max / 1000000.0); + else if (s->max < 1000000000000) + fprintf(stderr, "%.2f (s)", s->max / 1000000000.0); + fprintf(stderr, " mean: "); + if (m < 1000) + fprintf(stderr, "%.2f (ns)", m); + else if (m < 1000000) + fprintf(stderr, "%.2f (μs)", m / 1000.0); + else if (m < 1000000000) + fprintf(stderr, "%.2f (ms)", m / 1000000.0); + else if (m < 1000000000000) + fprintf(stderr, "%.2f (s)", m / 1000000000.0); + fprintf(stderr, "\n"); + } + fflush(stderr); +} + +void +__stat_free(struct stat *s) +{ + if (!s) + return; + + enif_free(s->samples); + enif_free(s); +} + +struct stat * +__stat_init(uint32_t n) +{ + struct stat *s = enif_alloc(sizeof(struct stat) + (sizeof(uint64_t) * n)); + if (!s) + return NULL; + memset(s, 0, sizeof(struct stat) + (sizeof(uint64_t) * n)); + s->min = ~0; + s->max = 0; + s->mean = 0.0; + s->h = 0; + s->num_samples = n; + erl_drv_tsd_key_create(NULL, &(s->duration_key)); + return s; +} diff --git a/c_src/stats.h b/c_src/stats.h index 35192ec..6d7f983 100644 --- a/c_src/stats.h +++ b/c_src/stats.h @@ -27,185 +27,25 @@ extern "C" { #endif -#include "duration.h" +#define STAT_DEF(name) struct stat *name ## _stat; -/** - * Calculate the log2 of 64bit unsigned integers. - */ -#ifdef __GCC__ -#define LOG2(X) ((unsigned) ((8 * (sizeof(uint64_t) - 1)) - __builtin_clzll((X)))) -#else -static unsigned int __log2_64(uint64_t x) { - static const int tab64[64] = { - 63, 0, 58, 1, 59, 47, 53, 2, - 60, 39, 48, 27, 54, 33, 42, 3, - 61, 51, 37, 40, 49, 18, 28, 20, - 55, 30, 34, 11, 43, 14, 22, 4, - 62, 57, 46, 52, 38, 26, 32, 41, - 50, 36, 17, 19, 29, 10, 13, 21, - 56, 45, 25, 31, 35, 16, 9, 12, - 44, 24, 15, 8, 23, 7, 6, 5}; - if (x == 0) return 0; - uint64_t v = x; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v |= v >> 32; - return tab64[((uint64_t)((v - (v >> 1)) * 0x07EDD5E59A4E28C2)) >> 58]; -} -#define LOG2(X) __log2_64(X) -#endif - -#define STAT_DEF(name) struct name ## _stat name ## _stat; - -#define STAT_DECL(name, nsamples) \ - struct name ## _stat { \ - duration_t d; \ - uint64_t histogram[64]; \ - uint32_t h, n; \ - uint64_t samples[nsamples]; \ - uint64_t min, max; \ - double mean; \ - }; \ - static inline double name ## _stat_mean(struct name ## _stat *s) { \ - uint32_t t = s->h; \ - uint32_t h = (s->h + 1) % nsamples; \ - double mean = 0; \ - while (h != t) { \ - mean += s->samples[h]; \ - h = (h + 1) % nsamples; \ - } \ - if (mean > 0) \ - mean /= (double)(s->n < nsamples ? 
s->n : nsamples); \ - return mean; \ - } \ - static inline double name ## _stat_mean_lg2(struct name ## _stat *s) { \ - uint32_t i; \ - double mean = 0; \ - for (i = 0; i < 64; i++) \ - mean += (s->histogram[i] * i); \ - if (mean > 0) \ - mean /= (double)s->n; \ - return mean; \ - } \ - static inline uint64_t name ## _stat_tick(struct name ## _stat *s) \ - { \ - uint64_t t = ts(s->d.unit); \ - s->d.then = t; \ - return t; \ - } \ - static inline void name ## _stat_reset(struct name ## _stat *s) \ - { \ - s->min = ~0; \ - s->max = 0; \ - s->h = 0; \ - memset(&s->histogram, 0, sizeof(uint64_t) * 64); \ - memset(&s->samples, 0, sizeof(uint64_t) * nsamples); \ - } \ - static inline uint64_t name ## _stat_tock(struct name ## _stat *s) \ - { \ - uint64_t now = ts(s->d.unit); \ - uint64_t elapsed = now - s->d.then; \ - uint32_t i = s->h; \ - if (s->n == nsamples) { \ - s->mean = (s->mean + name ## _stat_mean(s)) / 2.0; \ - if (s->n >= 4294967295) \ - name ## _stat_reset(s); \ - } \ - s->h = (s->h + 1) % nsamples; \ - s->samples[i] = elapsed; \ - if (elapsed < s->min) \ - s->min = elapsed; \ - if (elapsed > s->max) \ - s->max = elapsed; \ - s->histogram[LOG2(elapsed)]++; \ - s->n++; \ - s->d.then = ts(s->d.unit); \ - return elapsed; \ - } \ - static void name ## _stat_print_histogram(struct name ## _stat *s, const char *mod) \ - { \ - uint8_t logs[64]; \ - uint8_t i, j, max_log = 0; \ - double m = (s->mean + name ## _stat_mean(s) / 2.0); \ - \ - fprintf(stderr, "%s:async_nif request latency histogram:\n", mod); \ - for (i = 0; i < 64; i++) { \ - logs[i] = LOG2(s->histogram[i]); \ - if (logs[i] > max_log) \ - max_log = logs[i]; \ - } \ - for (i = max_log; i > 0; i--) { \ - if (!(i % 10)) \ - fprintf(stderr, "2^%2d ", i); \ - else \ - fprintf(stderr, " "); \ - for(j = 0; j < 64; j++) \ - fprintf(stderr, logs[j] >= i ? 
"•" : " "); \ - fprintf(stderr, "\n"); \ - } \ - if (max_log == 0) { \ - fprintf(stderr, "[empty]\n"); \ - } else { \ - fprintf(stderr, " ns μs ms s ks\n"); \ - fprintf(stderr, "min: "); \ - if (s->min < 1000) \ - fprintf(stderr, "%llu (ns)", PRIuint64(s->min)); \ - else if (s->min < 1000000) \ - fprintf(stderr, "%.2f (μs)", s->min / 1000.0); \ - else if (s->min < 1000000000) \ - fprintf(stderr, "%.2f (ms)", s->min / 1000000.0); \ - else if (s->min < 1000000000000) \ - fprintf(stderr, "%.2f (s)", s->min / 1000000000.0); \ - fprintf(stderr, " max: "); \ - if (s->max < 1000) \ - fprintf(stderr, "%llu (ns)", PRIuint64(s->max)); \ - else if (s->max < 1000000) \ - fprintf(stderr, "%.2f (μs)", s->max / 1000.0); \ - else if (s->max < 1000000000) \ - fprintf(stderr, "%.2f (ms)", s->max / 1000000.0); \ - else if (s->max < 1000000000000) \ - fprintf(stderr, "%.2f (s)", s->max / 1000000000.0); \ - fprintf(stderr, " mean: "); \ - if (m < 1000) \ - fprintf(stderr, "%.2f (ns)", m); \ - else if (m < 1000000) \ - fprintf(stderr, "%.2f (μs)", m / 1000.0); \ - else if (m < 1000000000) \ - fprintf(stderr, "%.2f (ms)", m / 1000000.0); \ - else if (m < 1000000000000) \ - fprintf(stderr, "%.2f (s)", m / 1000000000.0); \ - fprintf(stderr, "\n"); \ - } \ - fflush(stderr); \ - } - - -#define STAT_INIT(var, name) \ - (var)->name ## _stat.min = ~0; \ - (var)->name ## _stat.max = 0; \ - (var)->name ## _stat.mean = 0.0; \ - (var)->name ## _stat.h = 0; \ - (var)->name ## _stat.d.then = 0; \ - (var)->name ## _stat.d.unit = ns; - -#define STAT_TICK(var, name) name ## _stat_tick(&(var)->name ## _stat) - -#define STAT_TOCK(var, name) name ## _stat_tock(&(var)->name ## _stat) - -#define STAT_RESET(var, name) name ## _stat_reset(&(var)->name ## _stat) - -#define STAT_MEAN_LOG2_SAMPLE(var, name) \ - name ## _stat_mean_lg2(&(var)->name ## _stat) - -#define STAT_MEAN_SAMPLE(var, name) \ - name ## _stat_mean(&(var)->name ## _stat) - -#define STAT_PRINT(var, name, mod) \ - name ## _stat_print_histogram(&(var)->name ## _stat, mod) +struct stat { + ErlDrvTSDKey duration_key; + uint32_t h, n, num_samples; + uint64_t min, max; + double mean; + uint64_t histogram[64]; + uint64_t samples[]; +}; +extern double __stat_mean(struct stat *s); +extern double __stat_mean_log2(struct stat *s); +extern uint64_t __stat_tick(struct stat *s); +extern void __stat_reset(struct stat *s); +extern uint64_t __stat_tock(struct stat *s); +extern void __stat_print_histogram(struct stat *s, const char *mod); +extern void __stat_free(struct stat *s); +extern struct stat *__stat_init(uint32_t n); #if defined(__cplusplus) } diff --git a/c_src/wterl.c b/c_src/wterl.c index 824224d..5f44ae7 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -26,8 +26,10 @@ #include #include -#include "common.h" #include "wiredtiger.h" + +#include "common.h" +#include "duration.h" #include "stats.h" #include "async_nif.h" #include "queue.h" @@ -191,16 +193,14 @@ static inline uint32_t __log2(uint64_t x) { static int __ctx_cache_evict(WterlConnHandle *conn_handle) { - uint32_t num_evicted = 0; - struct wterl_ctx *c; - - if (conn_handle->cache_size < MAX_CACHE_SIZE) - return 0; - -#if 0 // TODO: fixme once stats work again + static uint16_t ncalls = 0; uint32_t mean, log, num_evicted, i; uint64_t now, elapsed; struct wterl_ctx *c, *n; + + if (conn_handle->cache_size < MAX_CACHE_SIZE && ++ncalls != 0) + return 0; + now = cpu_clock_ticks(); // Find the mean of the recorded times that items stayed in cache. 
@@ -233,16 +233,6 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) } c = n; } -#else - c = STAILQ_FIRST(&conn_handle->cache); - if (c) { - STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries); - DPRINTF("evicting: %llu", PRIuint64(c->sig)); - c->session->close(c->session, NULL); - enif_free(c); - num_evicted++; - } -#endif conn_handle->cache_size -= num_evicted; return num_evicted; } @@ -295,57 +285,6 @@ __ctx_cache_add(WterlConnHandle *conn_handle, struct wterl_ctx *c) enif_mutex_unlock(conn_handle->cache_mutex); } -/** - * Create a signature for the operation we're about to perform. - * - * Create a 64-bit hash signature for this a combination of session - * configuration some number of cursors open on tables each potentially with a - * different configuration. "session_config, [{table_name, cursor_config}, - * ...]" - * - * session_config the string used to configure the WT_SESSION - * ... each pair of items in the varargs array is a table name, - * cursor config pair - * -> number of variable arguments processed - */ -static uint64_t -__ctx_cache_sig(const char *c, va_list ap, int count, size_t *len) -{ - int i = 0; - uint32_t hash = 0; - uint32_t crc = 0; - uint64_t sig = 0; - const char *arg; - size_t l = 0; - - *len = 0; - - if (c) { - l = __strlen(c); - hash = __str_hash(hash, c, l); - crc = __crc32(crc, c, l); - *len += l + 1; - } else { - *len += 1; - } - - for (i = 0; i < (2 * count); i++) { - arg = va_arg(ap, const char *); - if (arg) { - l = __strlen(arg); - hash = __str_hash(hash, arg, l); - crc = __crc32(crc, arg, __strlen(arg)); - *len += l + 1; - } else { - *len += 1; - } - } - - sig = (uint64_t)crc << 32 | hash; - //DPRINTF("sig %llu [%u:%u]", PRIuint64(sig), crc, hash); - return sig; -} - static inline char * __copy_str_into(char **p, const char *s) { @@ -366,42 +305,63 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx **ctx, int count, const char *session_config, ...) 
{ - int i = 3; - size_t sig_len = 0; + int i = 0; + uint32_t hash = 0; + uint32_t crc = 0; + uint64_t sig = 0; + size_t l, sig_len = 0; va_list ap; - uint64_t sig; const char *arg; struct wterl_ctx *c; arg = session_config; va_start(ap, session_config); - sig = __ctx_cache_sig(session_config, ap, count, &sig_len); + if (session_config) { + l = __strlen(session_config); + hash = __str_hash(hash, session_config, l); + crc = __crc32(crc, session_config, l); + sig_len += l + 1; + DPRINTF("sig/1: %s", session_config); + } else { + sig_len += 1; + } + for (i = 0; i < (2 * count); i++) { + arg = va_arg(ap, const char *); + if (arg) { + l = __strlen(arg); + DPRINTF("sig/args: %s", arg); + hash = __str_hash(hash, arg, l); + crc = __crc32(crc, arg, l); + sig_len += l + 1; + } else { + sig_len += 1; + } + } + sig = (uint64_t)crc << 32 | hash; + DPRINTF("sig %llu [%u:%u]", PRIuint64(sig), crc, hash); va_end(ap); *ctx = NULL; - do { - c = conn_handle->mru_ctx[worker_id]; - if (CASPO(&conn_handle->mru_ctx[worker_id], c, 0) == c) { - if (c == 0) { - // mru miss: - DPRINTF("[%.4u] mru miss, empty", worker_id); - *ctx = NULL; - } else { - if (c->sig == sig) { - // mru hit: - DPRINTF("[%.4u] mru hit: %llu found", worker_id, PRIuint64(sig)); - *ctx = c; - break; - } else { - // mru mismatch: - DPRINTF("[%.4u] mru miss: %llu != %llu", worker_id, PRIuint64(sig), PRIuint64(c->sig)); - __ctx_cache_add(conn_handle, c); - *ctx = NULL; - } - } + + c = conn_handle->mru_ctx[worker_id]; + if (CASPO(&conn_handle->mru_ctx[worker_id], c, 0) == c) { + if (c == 0) { + // mru miss: + DPRINTF("[%.4u] mru miss, empty", worker_id); + *ctx = NULL; + } else { + if (c->sig == sig) { + // mru hit: + DPRINTF("[%.4u] mru hit: %llu found", worker_id, PRIuint64(sig)); + *ctx = c; + } else { + // mru mismatch: + DPRINTF("[%.4u] mru miss: %llu != %llu", worker_id, PRIuint64(sig), PRIuint64(c->sig)); + __ctx_cache_add(conn_handle, c); + *ctx = NULL; + } } - // CAS failed, retry up to 3 times - } while(i--); + } if (*ctx == NULL) { // check the cache @@ -474,9 +434,9 @@ __release_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx } else { if (c != NULL) { __ctx_cache_add(conn_handle, c); - DPRINTF("[%.4u] reset %d cursors, returnd ctx to cache", worker_id, ctx->num_cursors); + DPRINTF("[%.4u] reset %d cursors, returned ctx to cache", worker_id, ctx->num_cursors); } else { - DPRINTF("[%.4u] reset %d cursors, returnd ctx to mru", worker_id, ctx->num_cursors); + DPRINTF("[%.4u] reset %d cursors, returned ctx to mru", worker_id, ctx->num_cursors); } } } From c7b45a7c2ba97c372e966552370fe43aad7c4573 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Thu, 27 Jun 2013 10:57:41 -0400 Subject: [PATCH 25/30] Still ironing out stats. 
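
The stats code is converging on a plain tick/tock pattern: stamp a start
time when a unit of work begins, compute the elapsed time when it ends,
then fold that sample into a bounded ring of raw samples and into a
log2-bucketed histogram. A rough, self-contained sketch of that pattern
(the struct mirrors the shape of stats.h, but NUM_SAMPLES, now_ns() and
the POSIX clock_gettime() it calls are stand-ins, not the wterl code):

    #include <stdint.h>
    #include <time.h>

    #define NUM_SAMPLES 1000

    struct sample_stat {
        uint64_t then;                 /* set by tick, consumed by tock  */
        uint64_t histogram[64];        /* one bucket per power of two    */
        uint64_t samples[NUM_SAMPLES]; /* ring buffer of raw latencies   */
        uint32_t h, n;                 /* ring head, total sample count  */
    };

    static uint64_t now_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
    }

    static unsigned int log2_64(uint64_t x)
    {
        unsigned int r = 0;
        while (x >>= 1)
            r++;
        return r;
    }

    static void stat_tick(struct sample_stat *s)
    {
        s->then = now_ns();
    }

    static uint64_t stat_tock(struct sample_stat *s)
    {
        uint64_t elapsed = now_ns() - s->then;
        s->samples[s->h] = elapsed;       /* overwrite the oldest sample */
        s->h = (s->h + 1) % NUM_SAMPLES;
        s->histogram[log2_64(elapsed)]++; /* bump this latency's bucket  */
        s->n++;
        return elapsed;
    }

Keeping the duration inside struct stat itself, instead of fetching it
from thread-specific data on every tick, avoids a TSD lookup and a lazy
allocation on the hot path; the cost is that concurrent timers on one
stat share a single "then" timestamp.
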
--- c_src/async_nif.h | 1 + c_src/stats.c | 30 +++++++----------------------- c_src/stats.h | 2 +- c_src/wterl.c | 19 +++++++++++++++++-- src/wterl.erl | 2 +- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/c_src/async_nif.h b/c_src/async_nif.h index 6627152..13bbce7 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -320,6 +320,7 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en if (!fifo_q_full(reqs, q->reqs)) { await = __stat_mean_log2(async_nif->qwait_stat); await_inthisq = __stat_mean_log2(q->qwait_stat); + //DPRINTF("q:%d %f/%f", qid, await_inthisq, await); if (await_inthisq > await) { enif_mutex_unlock(q->reqs_mutex); qid = (qid + 1) % async_nif->num_queues; diff --git a/c_src/stats.c b/c_src/stats.c index 9d56f9e..7b9c28f 100644 --- a/c_src/stats.c +++ b/c_src/stats.c @@ -99,40 +99,27 @@ __stat_mean_log2(struct stat *s) uint64_t __stat_tick(struct stat *s) { - duration_t *d; uint64_t t; if (!s) return 0.0; - d = (duration_t*)erl_drv_tsd_get(s->duration_key); - if (!d) { - if ((d = enif_alloc(sizeof(duration_t))) == NULL) - return 0; - memset(d, 0, sizeof(duration_t)); - erl_drv_tsd_set(s->duration_key, d); - } - t = ts(d->unit); - d->then = t; + t = ts(s->d.unit); + s->d.then = t; return t; } void __stat_reset(struct stat *s) { - duration_t *d; - if (!s) return; - s->min = ~0; - s->max = 0; s->h = 0; + s->d.unit = ns; + s->d.then = 0; memset(s->histogram, 0, sizeof(uint64_t) * 64); memset(s->samples, 0, sizeof(uint64_t) * s->num_samples); - d = (duration_t*)erl_drv_tsd_get(s->duration_key); - if (d) - d->then = 0; } uint64_t @@ -146,10 +133,7 @@ __stat_tock(struct stat *s) if (!s) return 0.0; - d = (duration_t*)erl_drv_tsd_get(s->duration_key); - if (!d) - return 0; - + d = &s->d; now = ts(d->unit); elapsed = now - d->then; i = s->h; @@ -160,7 +144,7 @@ __stat_tock(struct stat *s) } s->h = (s->h + 1) % s->num_samples; s->samples[i] = elapsed; - if (elapsed < s->min) + if (elapsed != 0 && elapsed < s->min) s->min = elapsed; if (elapsed > s->max) s->max = elapsed; @@ -255,6 +239,6 @@ __stat_init(uint32_t n) s->mean = 0.0; s->h = 0; s->num_samples = n; - erl_drv_tsd_key_create(NULL, &(s->duration_key)); + s->d.unit = ns; return s; } diff --git a/c_src/stats.h b/c_src/stats.h index 6d7f983..0bdbe8e 100644 --- a/c_src/stats.h +++ b/c_src/stats.h @@ -30,7 +30,7 @@ extern "C" { #define STAT_DEF(name) struct stat *name ## _stat; struct stat { - ErlDrvTSDKey duration_key; + duration_t d; uint32_t h, n, num_samples; uint64_t min, max; double mean; diff --git a/c_src/wterl.c b/c_src/wterl.c index 5f44ae7..95c4cf4 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -193,12 +193,11 @@ static inline uint32_t __log2(uint64_t x) { static int __ctx_cache_evict(WterlConnHandle *conn_handle) { - static uint16_t ncalls = 0; uint32_t mean, log, num_evicted, i; uint64_t now, elapsed; struct wterl_ctx *c, *n; - if (conn_handle->cache_size < MAX_CACHE_SIZE && ++ncalls != 0) + if (conn_handle->cache_size < MAX_CACHE_SIZE) return 0; now = cpu_clock_ticks(); @@ -264,7 +263,15 @@ __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig) } c = STAILQ_NEXT(c, entries); } +#ifdef DEBUG + uint32_t sz = 0; + struct wterl_ctx *f; + STAILQ_FOREACH(f, &conn_handle->cache, entries) { + sz++; + } +#endif enif_mutex_unlock(conn_handle->cache_mutex); + DPRINTF("cache_find: [%u:%u] %s (%p)", sz, conn_handle->cache_size, c ? 
"hit" : "miss", c); return c; } @@ -282,7 +289,15 @@ __ctx_cache_add(WterlConnHandle *conn_handle, struct wterl_ctx *c) c->tstamp = cpu_clock_ticks(); STAILQ_INSERT_TAIL(&conn_handle->cache, c, entries); conn_handle->cache_size += 1; +#ifdef DEBUG + uint32_t sz = 0; + struct wterl_ctx *f; + STAILQ_FOREACH(f, &conn_handle->cache, entries) { + sz++; + } +#endif enif_mutex_unlock(conn_handle->cache_mutex); + DPRINTF("cache_add: [%u:%u] (%p)", sz, conn_handle->cache_size, c); } static inline char * diff --git a/src/wterl.erl b/src/wterl.erl index 495be92..db44807 100644 --- a/src/wterl.erl +++ b/src/wterl.erl @@ -604,7 +604,7 @@ insert_delete_test() -> %% lists:sort([crypto:sha(<>) || X <- lists:seq(1, 2000)])). many_open_tables_test_() -> - {timeout, 60, + {timeout, 120, fun() -> ConnOpts = [{create,true},{cache_size,"100MB"},{session_max, 8192}], DataDir = ?TEST_DATA_DIR, From 4300b3036fe32f2df1b4a434c31c689716912bc6 Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Mon, 1 Jul 2013 21:09:21 -0400 Subject: [PATCH 26/30] Working on triggers that start/stop worker threads. --- c_src/async_nif.h | 122 ++++++++++++++++++++++++++-------------------- c_src/cas.h | 8 ++- c_src/fifo_q.h | 95 ------------------------------------ c_src/stats.c | 8 +-- c_src/wterl.c | 6 +-- 5 files changed, 83 insertions(+), 156 deletions(-) delete mode 100644 c_src/fifo_q.h diff --git a/c_src/async_nif.h b/c_src/async_nif.h index 13bbce7..9f5b94f 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -25,7 +25,6 @@ extern "C" { #endif #include -#include "fifo_q.h" #include "queue.h" #include "stats.h" @@ -44,15 +43,17 @@ struct async_nif_req_entry { void *args; void (*fn_work)(ErlNifEnv*, ERL_NIF_TERM, ErlNifPid*, unsigned int, void *); void (*fn_post)(void *); + STAILQ_ENTRY(async_nif_req_entry) entries; }; -DECL_FIFO_QUEUE(reqs, struct async_nif_req_entry); + struct async_nif_work_queue { - STAT_DEF(qwait); - unsigned int workers; + unsigned int num_workers; ErlNifMutex *reqs_mutex; ErlNifCond *reqs_cnd; - FIFO_QUEUE_TYPE(reqs) reqs; + STAILQ_HEAD(reqs, async_nif_req_entry) reqs; + STAT_DEF(qwait); + STAT_DEF(wt); }; struct async_nif_worker_entry { @@ -64,14 +65,13 @@ struct async_nif_worker_entry { }; struct async_nif_state { - STAT_DEF(qwait); unsigned int shutdown; ErlNifMutex *we_mutex; unsigned int we_active; SLIST_HEAD(joining, async_nif_worker_entry) we_joining; unsigned int num_queues; unsigned int next_q; - FIFO_QUEUE_TYPE(reqs) recycled_reqs; + STAILQ_HEAD(recycled_reqs, async_nif_req_entry) recycled_reqs; unsigned int num_reqs; ErlNifMutex *recycled_req_mutex; struct async_nif_work_queue queues[]; @@ -130,14 +130,14 @@ struct async_nif_state { req->fn_post = (void (*)(void *))fn_post_ ## decl; \ int h = -1; \ if (affinity) \ - h = affinity % async_nif->num_queues; \ + h = ((unsigned int)affinity) % async_nif->num_queues; \ ERL_NIF_TERM reply = async_nif_enqueue_req(async_nif, req, h); \ if (!reply) { \ fn_post_ ## decl (args); \ async_nif_recycle_req(req, async_nif); \ enif_free(copy_of_args); \ return enif_make_tuple2(env, enif_make_atom(env, "error"), \ - enif_make_atom(env, "shutdown")); \ + enif_make_atom(env, "eagain")); \ } \ return reply; \ } @@ -188,7 +188,7 @@ async_nif_reuse_req(struct async_nif_state *async_nif) ErlNifEnv *env = NULL; enif_mutex_lock(async_nif->recycled_req_mutex); - if (fifo_q_empty(reqs, async_nif->recycled_reqs)) { + if (STAILQ_EMPTY(&async_nif->recycled_reqs)) { if (async_nif->num_reqs < ASYNC_NIF_MAX_QUEUED_REQS) { req = enif_alloc(sizeof(struct async_nif_req_entry)); if 
(req) { @@ -204,10 +204,10 @@ async_nif_reuse_req(struct async_nif_state *async_nif) } } } else { - req = fifo_q_get(reqs, async_nif->recycled_reqs); + req = STAILQ_FIRST(&async_nif->recycled_reqs); + STAILQ_REMOVE(&async_nif->recycled_reqs, req, async_nif_req_entry, entries); } enif_mutex_unlock(async_nif->recycled_req_mutex); - __stat_tick(async_nif->qwait_stat); return req; } @@ -222,13 +222,12 @@ void async_nif_recycle_req(struct async_nif_req_entry *req, struct async_nif_state *async_nif) { ErlNifEnv *env = NULL; - __stat_tock(async_nif->qwait_stat); enif_mutex_lock(async_nif->recycled_req_mutex); enif_clear_env(req->env); env = req->env; memset(req, 0, sizeof(struct async_nif_req_entry)); req->env = env; - fifo_q_put(reqs, async_nif->recycled_reqs, req); + STAILQ_INSERT_TAIL(&async_nif->recycled_reqs, req, entries); enif_mutex_unlock(async_nif->recycled_req_mutex); } @@ -247,6 +246,7 @@ async_nif_start_worker(struct async_nif_state *async_nif, struct async_nif_work_ enif_mutex_lock(async_nif->we_mutex); +#if 0 // TODO: we = SLIST_FIRST(&async_nif->we_joining); while(we != NULL) { struct async_nif_worker_entry *n = SLIST_NEXT(we, entries); @@ -257,6 +257,7 @@ async_nif_start_worker(struct async_nif_state *async_nif, struct async_nif_work_ async_nif->we_active--; we = n; } +#endif if (async_nif->we_active == ASYNC_NIF_MAX_WORKERS) { enif_mutex_unlock(async_nif->we_mutex); @@ -287,11 +288,10 @@ static ERL_NIF_TERM async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_entry *req, int hint) { /* Identify the most appropriate worker for this request. */ - unsigned int qid = 0; + unsigned int i, qid = 0; unsigned int n = async_nif->num_queues; struct async_nif_work_queue *q = NULL; - double await = 0; - double await_inthisq = 0; + double avg_wait_across_q, avg_wt_service_time, avg_wait_this_q = 0; /* Either we're choosing a queue based on some affinity/hinted value or we need to select the next queue in the rotation and atomically update that @@ -304,6 +304,13 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en async_nif->next_q = qid; } + avg_wait_across_q = 0; + for (i = 0; i < async_nif->num_queues; i++) { + avg_wait_across_q += __stat_mean(async_nif->queues[i].qwait_stat); + } + if (avg_wait_across_q != 0) + avg_wait_across_q /= async_nif->num_queues; + /* Now we inspect and interate across the set of queues trying to select one that isn't too full or too slow. */ do { @@ -313,36 +320,36 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en /* Now that we hold the lock, check for shutdown. As long as we hold this lock either a) we're shutting down so exit now or b) this queue will be valid until we release the lock. */ + if (async_nif->shutdown) { enif_mutex_unlock(q->reqs_mutex); return 0; } - if (!fifo_q_full(reqs, q->reqs)) { - await = __stat_mean_log2(async_nif->qwait_stat); - await_inthisq = __stat_mean_log2(q->qwait_stat); - //DPRINTF("q:%d %f/%f", qid, await_inthisq, await); - if (await_inthisq > await) { - enif_mutex_unlock(q->reqs_mutex); - qid = (qid + 1) % async_nif->num_queues; - q = &async_nif->queues[qid]; - } else { - // q->reqs_mutex unlocked at end of function - break; - } + + avg_wait_this_q = __stat_mean(q->qwait_stat); + avg_wt_service_time = __stat_mean(q->wt_stat); + DPRINTF("q:%d w:%u %f/%f(%f) %c", qid, q->num_workers, avg_wait_this_q, avg_wait_across_q, avg_wt_service_time, avg_wait_this_q <= avg_wait_across_q ? 
't' : 'f'); + if (avg_wait_this_q <= avg_wait_across_q) break; + else { + enif_mutex_unlock(q->reqs_mutex); + qid = (qid + 1) % async_nif->num_queues; + q = &async_nif->queues[qid]; } } while(n-- > 0); + if (n == 0) return 0; // All queues are full, trigger eagain + /* We hold the queue's lock, and we've seletect a reasonable queue for this new request now check to make sure there are enough workers actively processing requests on this queue. */ - if (q->workers < 2 || await_inthisq > await) { + if (q->num_workers == 0 || avg_wait_this_q >= avg_wt_service_time) { if (async_nif_start_worker(async_nif, q) == 0) - q->workers++; + q->num_workers++; } /* And finally add the request to the queue. */ __stat_tick(q->qwait_stat); - fifo_q_put(reqs, q->reqs, req); + STAILQ_INSERT_TAIL(&q->reqs, req, entries); /* Build the term before releasing the lock so as not to race on the use of the req pointer (which will soon become invalid in another thread @@ -376,10 +383,9 @@ async_nif_worker_fn(void *arg) enif_mutex_unlock(q->reqs_mutex); break; } - if (fifo_q_empty(reqs, q->reqs)) { + if (STAILQ_EMPTY(&q->reqs)) { /* Queue is empty so we wait for more work to arrive. */ - __stat_reset(q->qwait_stat); - if (q->workers > 2) { + if (q->num_workers > 2) { enif_mutex_unlock(q->reqs_mutex); break; } else { @@ -387,21 +393,23 @@ async_nif_worker_fn(void *arg) goto check_again_for_work; } } else { - assert(fifo_q_size(reqs, q->reqs) > 0); - assert(fifo_q_size(reqs, q->reqs) < fifo_q_capacity(reqs, q->reqs)); /* At this point the next req is ours to process and we hold the reqs_mutex lock. Take the request off the queue. */ - req = fifo_q_get(reqs, q->reqs); + req = STAILQ_FIRST(&q->reqs); + STAILQ_REMOVE(&q->reqs, req, async_nif_req_entry, entries); /* Ensure that there is at least one other worker thread watching this queue. */ enif_cond_signal(q->reqs_cnd); enif_mutex_unlock(q->reqs_mutex); - /* Perform the work. */ - req->fn_work(req->env, req->ref, &req->pid, worker_id, req->args); __stat_tock(q->qwait_stat); + /* Perform the work. */ + __stat_tick(q->wt_stat); + req->fn_work(req->env, req->ref, &req->pid, worker_id, req->args); + __stat_tock(q->wt_stat); + /* Now call the post-work cleanup function. */ req->fn_post(req->args); @@ -418,7 +426,11 @@ async_nif_worker_fn(void *arg) enif_mutex_lock(async_nif->we_mutex); SLIST_INSERT_HEAD(&async_nif->we_joining, we, entries); enif_mutex_unlock(async_nif->we_mutex); - q->workers--; + q->num_workers--; + if (q->num_workers == 0) { + __stat_reset(q->qwait_stat); + __stat_reset(q->wt_stat); + } enif_thread_exit(0); return 0; } @@ -433,8 +445,6 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) struct async_nif_worker_entry *we = NULL; UNUSED(env); - __stat_print_histogram(async_nif->qwait_stat, "wterl"); - /* Signal the worker threads, stop what you're doing and exit. To ensure that we don't race with the enqueue() process we first lock all the worker queues, then set shutdown to true, then @@ -452,8 +462,10 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) for (i = 0; i < num_queues; i++) { q = &async_nif->queues[i]; + __stat_print_histogram(async_nif->queues[i].qwait_stat, "wterl q-wait"); enif_mutex_unlock(q->reqs_mutex); } + __stat_print_histogram(async_nif->queues[i].qwait_stat, "wterl service time"); /* Join for the now exiting worker threads. 
*/ while(async_nif->we_active > 0) { @@ -480,7 +492,9 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) /* Worker threads are stopped, now toss anything left in the queue. */ req = NULL; - fifo_q_foreach(reqs, q->reqs, req, { + req = STAILQ_FIRST(&q->reqs); + while(req != NULL) { + struct async_nif_req_entry *n = STAILQ_NEXT(req, entries); enif_clear_env(req->env); enif_send(NULL, &req->pid, req->env, enif_make_tuple2(req->env, enif_make_atom(req->env, "error"), @@ -489,8 +503,8 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) enif_free_env(req->env); enif_free(req->args); enif_free(req); - }); - fifo_q_free(reqs, q->reqs); + req = n; + } enif_mutex_destroy(q->reqs_mutex); enif_cond_destroy(q->reqs_cnd); } @@ -498,11 +512,13 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) /* Free any req structures sitting unused on the recycle queue. */ enif_mutex_lock(async_nif->recycled_req_mutex); req = NULL; - fifo_q_foreach(reqs, async_nif->recycled_reqs, req, { + req = STAILQ_FIRST(&async_nif->recycled_reqs); + while(req != NULL) { + struct async_nif_req_entry *n = STAILQ_NEXT(req, entries); enif_free_env(req->env); enif_free(req); - }); - fifo_q_free(reqs, async_nif->recycled_reqs); + req = n; + } enif_mutex_unlock(async_nif->recycled_req_mutex); enif_mutex_destroy(async_nif->recycled_req_mutex); @@ -550,18 +566,18 @@ async_nif_load() async_nif->we_active = 0; async_nif->next_q = 0; async_nif->shutdown = 0; - async_nif->recycled_reqs = fifo_q_new(reqs, ASYNC_NIF_MAX_QUEUED_REQS); + STAILQ_INIT(&async_nif->recycled_reqs); async_nif->recycled_req_mutex = enif_mutex_create(NULL); - async_nif->qwait_stat = __stat_init(1000); async_nif->we_mutex = enif_mutex_create(NULL); SLIST_INIT(&async_nif->we_joining); for (i = 0; i < async_nif->num_queues; i++) { struct async_nif_work_queue *q = &async_nif->queues[i]; - q->reqs = fifo_q_new(reqs, ASYNC_NIF_WORKER_QUEUE_SIZE); + STAILQ_INIT(&q->reqs); q->reqs_mutex = enif_mutex_create(NULL); q->reqs_cnd = enif_cond_create(NULL); - q->qwait_stat = __stat_init(1000); + q->qwait_stat = __stat_init(100000); + q->wt_stat = __stat_init(100000); } return async_nif; } diff --git a/c_src/cas.h b/c_src/cas.h index 61c1f61..2f35cb4 100644 --- a/c_src/cas.h +++ b/c_src/cas.h @@ -55,9 +55,15 @@ #define CACHE_LINE_SIZE 64 +#define ATOMIC_INCR(_v,_newval) \ +do { \ + __typeof(_v) __val = (_v); \ + while ( (_newval = CASIO(&(_v),__val,__val+1)) != __val ) \ + __val = _newval; \ +} while ( 0 ) #define ATOMIC_ADD_TO(_v,_x) \ do { \ - int __val = (_v), __newval; \ + __typeof(_v) __val = (_v), __newval; \ while ( (__newval = CASIO(&(_v),__val,__val+(_x))) != __val ) \ __val = __newval; \ } while ( 0 ) diff --git a/c_src/fifo_q.h b/c_src/fifo_q.h deleted file mode 100644 index bbc4ff0..0000000 --- a/c_src/fifo_q.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * fifo_q: a macro-based implementation of a FIFO Queue - * - * Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved. - * Author: Gregory Burd - * - * This file is provided to you under the Apache License, - * Version 2.0 (the "License"); you may not use this file - * except in compliance with the License. You may obtain - * a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifndef __FIFO_Q_H__ -#define __FIFO_Q_H__ - -#if defined(__cplusplus) -extern "C" { -#endif - -#define fifo_t(name) \ - struct fifo_q__ ## name * -#define FIFO_QUEUE_TYPE(name) \ - struct fifo_q__ ## name * -#define DECL_FIFO_QUEUE(name, type) \ - struct fifo_q__ ## name { \ - unsigned int h, t, s; \ - type *items[]; \ - }; \ - static struct fifo_q__ ## name *fifo_q_ ## name ## _new(unsigned int n) { \ - int sz = sizeof(struct fifo_q__ ## name) + ((n+1) * sizeof(type *));\ - struct fifo_q__ ## name *q = enif_alloc(sz); \ - if (!q) \ - return 0; \ - memset(q, 0, sz); \ - q->s = n + 1; \ - return q; \ - } \ - static inline void fifo_q_ ## name ## _free(struct fifo_q__ ## name *q) { \ - memset(q, 0, sizeof(struct fifo_q__ ## name) + (q->s * sizeof(type *))); \ - enif_free(q); \ - } \ - static inline type *fifo_q_ ## name ## _put(struct fifo_q__ ## name *q, type *n) { \ - q->items[q->h] = n; \ - q->h = (q->h + 1) % q->s; \ - return n; \ - } \ - static inline type *fifo_q_ ## name ## _get(struct fifo_q__ ## name *q) { \ - type *n = q->items[q->t]; \ - q->items[q->t] = 0; \ - q->t = (q->t + 1) % q->s; \ - return n; \ - } \ - static inline unsigned int fifo_q_ ## name ## _size(struct fifo_q__ ## name *q) { \ - return (q->h - q->t + q->s) % q->s; \ - } \ - static inline unsigned int fifo_q_ ## name ## _capacity(struct fifo_q__ ## name *q) { \ - return q->s - 1; \ - } \ - static inline int fifo_q_ ## name ## _empty(struct fifo_q__ ## name *q) { \ - return (q->t == q->h); \ - } \ - static inline int fifo_q_ ## name ## _full(struct fifo_q__ ## name *q) { \ - return ((q->h + 1) % q->s) == q->t; \ - } - -#define fifo_q_new(name, size) fifo_q_ ## name ## _new(size) -#define fifo_q_free(name, queue) fifo_q_ ## name ## _free(queue) -#define fifo_q_get(name, queue) fifo_q_ ## name ## _get(queue) -#define fifo_q_put(name, queue, item) fifo_q_ ## name ## _put(queue, item) -#define fifo_q_size(name, queue) fifo_q_ ## name ## _size(queue) -#define fifo_q_capacity(name, queue) fifo_q_ ## name ## _capacity(queue) -#define fifo_q_empty(name, queue) fifo_q_ ## name ## _empty(queue) -#define fifo_q_full(name, queue) fifo_q_ ## name ## _full(queue) -#define fifo_q_foreach(name, queue, item, task) do { \ - while(!fifo_q_ ## name ## _empty(queue)) { \ - item = fifo_q_ ## name ## _get(queue); \ - do task while(0); \ - } \ - } while(0); - - -#if defined(__cplusplus) -} -#endif - -#endif // __FIFO_Q_H__ diff --git a/c_src/stats.c b/c_src/stats.c index 7b9c28f..9e6938c 100644 --- a/c_src/stats.c +++ b/c_src/stats.c @@ -75,7 +75,7 @@ __stat_mean(struct stat *s) h = (h + 1) % s->num_samples; } if (mean > 0) - mean /= (double)(s->n < s->num_samples ? s->n : s->num_samples); + mean /= (s->n < s->num_samples ? (double)s->n : (double)s->num_samples); return mean; } @@ -83,16 +83,16 @@ double __stat_mean_log2(struct stat *s) { uint32_t i; - double mean; + double mean = 0.0; if (!s) return 0.0; - mean = 0; for (i = 0; i < 64; i++) mean += (s->histogram[i] * i); if (mean > 0) - mean /= (double)s->n; + mean /= (s->n < s->num_samples ? s->n : s->num_samples); + DPRINTF("n: %u %llu %f", s->n < 64 ? 
s->n : 64, PRIuint64(s), mean); return mean; } diff --git a/c_src/wterl.c b/c_src/wterl.c index 95c4cf4..7526ef3 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -1427,7 +1427,7 @@ ASYNC_NIF_DECL( } args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); enif_keep_resource((void*)args->conn_handle); - affinity = __str_hash(0, args->uri, __strlen(args->uri)); + //affinity = __str_hash(0, args->uri, __strlen(args->uri)); }, { // work @@ -1486,7 +1486,7 @@ ASYNC_NIF_DECL( } args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); enif_keep_resource((void*)args->conn_handle); - affinity = __str_hash(0, args->uri, __strlen(args->uri)); + //affinity = __str_hash(0, args->uri, __strlen(args->uri)); }, { // work @@ -1566,7 +1566,7 @@ ASYNC_NIF_DECL( args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); args->value = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[3]); enif_keep_resource((void*)args->conn_handle); - affinity = __str_hash(0, args->uri, __strlen(args->uri)); + //affinity = __str_hash(0, args->uri, __strlen(args->uri)); }, { // work From 00e5889ac9bc767eb997a65e5849e59614197edc Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Tue, 2 Jul 2013 16:46:04 -0400 Subject: [PATCH 27/30] Changed conditions for worker thread creation. --- c_src/async_nif.h | 153 ++++++++++++++++++++++++-------------------- c_src/stats.c | 158 +++++++++++++++++++++++++--------------------- c_src/stats.h | 4 +- c_src/wterl.c | 58 ++++++++--------- src/async_nif.hrl | 8 ++- src/wterl.erl | 32 +++++++--- 6 files changed, 228 insertions(+), 185 deletions(-) diff --git a/c_src/async_nif.h b/c_src/async_nif.h index 9f5b94f..c18106c 100644 --- a/c_src/async_nif.h +++ b/c_src/async_nif.h @@ -25,6 +25,7 @@ extern "C" { #endif #include + #include "queue.h" #include "stats.h" @@ -33,6 +34,7 @@ extern "C" { #endif #define ASYNC_NIF_MAX_WORKERS 1024 +#define ASYNC_NIF_MIN_WORKERS 2 #define ASYNC_NIF_WORKER_QUEUE_SIZE 1000 #define ASYNC_NIF_MAX_QUEUED_REQS ASYNC_NIF_WORKER_QUEUE_SIZE * ASYNC_NIF_MAX_WORKERS @@ -43,17 +45,18 @@ struct async_nif_req_entry { void *args; void (*fn_work)(ErlNifEnv*, ERL_NIF_TERM, ErlNifPid*, unsigned int, void *); void (*fn_post)(void *); + uint64_t submitted; STAILQ_ENTRY(async_nif_req_entry) entries; }; struct async_nif_work_queue { unsigned int num_workers; + unsigned int depth; ErlNifMutex *reqs_mutex; ErlNifCond *reqs_cnd; STAILQ_HEAD(reqs, async_nif_req_entry) reqs; - STAT_DEF(qwait); - STAT_DEF(wt); + STAT_DEF(work); }; struct async_nif_worker_entry { @@ -65,6 +68,8 @@ struct async_nif_worker_entry { }; struct async_nif_state { + STAT_DEF(wait); + STAT_DEF(service); unsigned int shutdown; ErlNifMutex *we_mutex; unsigned int we_active; @@ -103,10 +108,12 @@ struct async_nif_state { argc -= 1; \ /* Note: !!! 
this assumes that the first element of priv_data is ours */ \ struct async_nif_state *async_nif = *(struct async_nif_state**)enif_priv_data(env); \ - if (async_nif->shutdown) \ + if (async_nif->shutdown) { \ return enif_make_tuple2(env, enif_make_atom(env, "error"), \ enif_make_atom(env, "shutdown")); \ + } \ req = async_nif_reuse_req(async_nif); \ + req->submitted = ts(ns); \ if (!req) { \ return enif_make_tuple2(env, enif_make_atom(env, "error"), \ enif_make_atom(env, "eagain")); \ @@ -137,7 +144,7 @@ struct async_nif_state { async_nif_recycle_req(req, async_nif); \ enif_free(copy_of_args); \ return enif_make_tuple2(env, enif_make_atom(env, "error"), \ - enif_make_atom(env, "eagain")); \ + enif_make_atom(env, "eagain")); \ } \ return reply; \ } @@ -246,7 +253,6 @@ async_nif_start_worker(struct async_nif_state *async_nif, struct async_nif_work_ enif_mutex_lock(async_nif->we_mutex); -#if 0 // TODO: we = SLIST_FIRST(&async_nif->we_joining); while(we != NULL) { struct async_nif_worker_entry *n = SLIST_NEXT(we, entries); @@ -257,7 +263,6 @@ async_nif_start_worker(struct async_nif_state *async_nif, struct async_nif_work_ async_nif->we_active--; we = n; } -#endif if (async_nif->we_active == ASYNC_NIF_MAX_WORKERS) { enif_mutex_unlock(async_nif->we_mutex); @@ -289,9 +294,8 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en { /* Identify the most appropriate worker for this request. */ unsigned int i, qid = 0; - unsigned int n = async_nif->num_queues; struct async_nif_work_queue *q = NULL; - double avg_wait_across_q, avg_wt_service_time, avg_wait_this_q = 0; + double avg_depth; /* Either we're choosing a queue based on some affinity/hinted value or we need to select the next queue in the rotation and atomically update that @@ -304,52 +308,66 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en async_nif->next_q = qid; } - avg_wait_across_q = 0; - for (i = 0; i < async_nif->num_queues; i++) { - avg_wait_across_q += __stat_mean(async_nif->queues[i].qwait_stat); - } - if (avg_wait_across_q != 0) - avg_wait_across_q /= async_nif->num_queues; - /* Now we inspect and interate across the set of queues trying to select one that isn't too full or too slow. */ - do { + for (i = 0; i < async_nif->num_queues; i++) { + /* Compute the average queue depth not counting queues which are empty or + the queue we're considering right now. */ + unsigned int j, n = 0; + for (j = 0; j < async_nif->num_queues; j++) { + if (j != qid && async_nif->queues[j].depth != 0) { + n++; + avg_depth += async_nif->queues[j].depth; + } + } + if (avg_depth != 0) + avg_depth /= n; + + /* Lock this queue under consideration, then check for shutdown. While + we hold this lock either a) we're shutting down so exit now or b) this + queue will be valid until we release the lock. */ q = &async_nif->queues[qid]; enif_mutex_lock(q->reqs_mutex); - - /* Now that we hold the lock, check for shutdown. As long as we hold - this lock either a) we're shutting down so exit now or b) this queue - will be valid until we release the lock. */ - if (async_nif->shutdown) { enif_mutex_unlock(q->reqs_mutex); return 0; } - avg_wait_this_q = __stat_mean(q->qwait_stat); - avg_wt_service_time = __stat_mean(q->wt_stat); - DPRINTF("q:%d w:%u %f/%f(%f) %c", qid, q->num_workers, avg_wait_this_q, avg_wait_across_q, avg_wt_service_time, avg_wait_this_q <= avg_wait_across_q ? 
't' : 'f'); - if (avg_wait_this_q <= avg_wait_across_q) break; + /* Try not to enqueue a request into a queue that isn't keeping up with + the request volume. */ + if (q->depth <= avg_depth) break; else { - enif_mutex_unlock(q->reqs_mutex); - qid = (qid + 1) % async_nif->num_queues; - q = &async_nif->queues[qid]; + enif_mutex_unlock(q->reqs_mutex); + qid = (qid + 1) % async_nif->num_queues; } - } while(n-- > 0); + } - if (n == 0) return 0; // All queues are full, trigger eagain + /* If the for loop finished then we didn't find a suitable queue for this + request, meaning we're backed up so trigger eagain. */ + if (i == async_nif->num_queues) { + enif_mutex_unlock(q->reqs_mutex); + return 0; + } - /* We hold the queue's lock, and we've seletect a reasonable queue for this - new request now check to make sure there are enough workers actively - processing requests on this queue. */ - if (q->num_workers == 0 || avg_wait_this_q >= avg_wt_service_time) { - if (async_nif_start_worker(async_nif, q) == 0) - q->num_workers++; + /* We've selected queue for this new request now check to make sure there are + enough workers actively processing requests on this queue. */ + if (q->num_workers < ASYNC_NIF_MIN_WORKERS) { + if (async_nif_start_worker(async_nif, q) == 0) q->num_workers++; + } else { + /* If more the 1/4 of the time it takes to complete a request is spent in + waiting to be serviced then we need more worker threads to service + requests. */ + double m, n; + m = __stat_mean(async_nif->wait_stat); + n = __stat_mean(q->work_stat); + DPRINTF("w: %f\ts: %f\t%f", m, n, n/(m+n)); + if (m && n && n / (m+n) < 0.75) + if (async_nif_start_worker(async_nif, q) == 0) q->num_workers++; } /* And finally add the request to the queue. */ - __stat_tick(q->qwait_stat); STAILQ_INSERT_TAIL(&q->reqs, req, entries); + q->depth++; /* Build the term before releasing the lock so as not to race on the use of the req pointer (which will soon become invalid in another thread @@ -385,36 +403,40 @@ async_nif_worker_fn(void *arg) } if (STAILQ_EMPTY(&q->reqs)) { /* Queue is empty so we wait for more work to arrive. */ - if (q->num_workers > 2) { - enif_mutex_unlock(q->reqs_mutex); - break; + if (q->num_workers > ASYNC_NIF_MIN_WORKERS) { + enif_mutex_unlock(q->reqs_mutex); + break; } else { - enif_cond_wait(q->reqs_cnd, q->reqs_mutex); - goto check_again_for_work; + enif_cond_wait(q->reqs_cnd, q->reqs_mutex); + goto check_again_for_work; } } else { /* At this point the next req is ours to process and we hold the reqs_mutex lock. Take the request off the queue. */ req = STAILQ_FIRST(&q->reqs); STAILQ_REMOVE(&q->reqs, req, async_nif_req_entry, entries); + q->depth--; /* Ensure that there is at least one other worker thread watching this queue. */ enif_cond_signal(q->reqs_cnd); enif_mutex_unlock(q->reqs_mutex); - __stat_tock(q->qwait_stat); - /* Perform the work. */ - __stat_tick(q->wt_stat); + uint64_t then = ts(q->work_stat->d.unit); + uint64_t wait = then - req->submitted; req->fn_work(req->env, req->ref, &req->pid, worker_id, req->args); - __stat_tock(q->wt_stat); + uint64_t work = ts(q->work_stat->d.unit) - then; + __stat_add(async_nif->wait_stat, wait); + __stat_add(q->work_stat, work); + __stat_add(async_nif->service_stat, wait + work); /* Now call the post-work cleanup function. */ req->fn_post(req->args); /* Clean up req for reuse. 
*/ req->ref = 0; + req->submitted = 0; req->fn_work = 0; req->fn_post = 0; enif_free(req->args); @@ -427,10 +449,6 @@ async_nif_worker_fn(void *arg) SLIST_INSERT_HEAD(&async_nif->we_joining, we, entries); enif_mutex_unlock(async_nif->we_mutex); q->num_workers--; - if (q->num_workers == 0) { - __stat_reset(q->qwait_stat); - __stat_reset(q->wt_stat); - } enif_thread_exit(0); return 0; } @@ -445,12 +463,11 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) struct async_nif_worker_entry *we = NULL; UNUSED(env); - /* Signal the worker threads, stop what you're doing and exit. To - ensure that we don't race with the enqueue() process we first - lock all the worker queues, then set shutdown to true, then - unlock. The enqueue function will take the queue mutex, then - test for shutdown condition, then enqueue only if not shutting - down. */ + /* Signal the worker threads, stop what you're doing and exit. To ensure + that we don't race with the enqueue() process we first lock all the worker + queues, then set shutdown to true, then unlock. The enqueue function will + take the queue mutex, then test for shutdown condition, then enqueue only + if not shutting down. */ for (i = 0; i < num_queues; i++) { q = &async_nif->queues[i]; enif_mutex_lock(q->reqs_mutex); @@ -462,26 +479,25 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) for (i = 0; i < num_queues; i++) { q = &async_nif->queues[i]; - __stat_print_histogram(async_nif->queues[i].qwait_stat, "wterl q-wait"); enif_mutex_unlock(q->reqs_mutex); } - __stat_print_histogram(async_nif->queues[i].qwait_stat, "wterl service time"); + __stat_print_histogram(async_nif->service_stat, "wterl"); /* Join for the now exiting worker threads. */ while(async_nif->we_active > 0) { for (i = 0; i < num_queues; i++) - enif_cond_broadcast(async_nif->queues[i].reqs_cnd); + enif_cond_broadcast(async_nif->queues[i].reqs_cnd); we = SLIST_FIRST(&async_nif->we_joining); while(we != NULL) { - struct async_nif_worker_entry *n = SLIST_NEXT(we, entries); - SLIST_REMOVE_HEAD(&async_nif->we_joining, entries); - void *exit_value = 0; /* We ignore the thread_join's exit value. */ - enif_thread_join(we->tid, &exit_value); - enif_free(we); - async_nif->we_active--; - we = n; + struct async_nif_worker_entry *n = SLIST_NEXT(we, entries); + SLIST_REMOVE_HEAD(&async_nif->we_joining, entries); + void *exit_value = 0; /* We ignore the thread_join's exit value. 
*/ + enif_thread_join(we->tid, &exit_value); + enif_free(we); + async_nif->we_active--; + we = n; } } enif_mutex_destroy(async_nif->we_mutex); @@ -570,14 +586,15 @@ async_nif_load() async_nif->recycled_req_mutex = enif_mutex_create(NULL); async_nif->we_mutex = enif_mutex_create(NULL); SLIST_INIT(&async_nif->we_joining); + async_nif->wait_stat = __stat_init(100000); + async_nif->service_stat = __stat_init(100000); for (i = 0; i < async_nif->num_queues; i++) { struct async_nif_work_queue *q = &async_nif->queues[i]; STAILQ_INIT(&q->reqs); q->reqs_mutex = enif_mutex_create(NULL); q->reqs_cnd = enif_cond_create(NULL); - q->qwait_stat = __stat_init(100000); - q->wt_stat = __stat_init(100000); + q->work_stat = __stat_init(100000); } return async_nif; } diff --git a/c_src/stats.c b/c_src/stats.c index 9e6938c..d018c76 100644 --- a/c_src/stats.c +++ b/c_src/stats.c @@ -64,18 +64,20 @@ __stat_mean(struct stat *s) double mean; if (!s) - return 0.0; + return 0.0; + enif_mutex_lock(s->mutex); t = s->h; h = (s->h + 1) % s->num_samples; mean = 0; while (h != t) { - mean += s->samples[h]; - h = (h + 1) % s->num_samples; + mean += s->samples[h]; + h = (h + 1) % s->num_samples; } if (mean > 0) - mean /= (s->n < s->num_samples ? (double)s->n : (double)s->num_samples); + mean /= (s->n < s->num_samples ? (double)s->n : (double)s->num_samples); + enif_mutex_unlock(s->mutex); return mean; } @@ -86,13 +88,14 @@ __stat_mean_log2(struct stat *s) double mean = 0.0; if (!s) - return 0.0; + return 0.0; + enif_mutex_lock(s->mutex); for (i = 0; i < 64; i++) - mean += (s->histogram[i] * i); + mean += (s->histogram[i] * i); if (mean > 0) - mean /= (s->n < s->num_samples ? s->n : s->num_samples); - DPRINTF("n: %u %llu %f", s->n < 64 ? s->n : 64, PRIuint64(s), mean); + mean /= (s->n < s->num_samples ? 
s->n : s->num_samples); + enif_mutex_unlock(s->mutex); return mean; } @@ -102,10 +105,12 @@ __stat_tick(struct stat *s) uint64_t t; if (!s) - return 0.0; + return 0.0; + enif_mutex_lock(s->mutex); t = ts(s->d.unit); s->d.then = t; + enif_mutex_unlock(s->mutex); return t; } @@ -113,45 +118,48 @@ void __stat_reset(struct stat *s) { if (!s) - return; + return; + enif_mutex_lock(s->mutex); s->h = 0; s->d.unit = ns; s->d.then = 0; memset(s->histogram, 0, sizeof(uint64_t) * 64); memset(s->samples, 0, sizeof(uint64_t) * s->num_samples); + enif_mutex_unlock(s->mutex); } -uint64_t -__stat_tock(struct stat *s) +void +__stat_add(struct stat *s, uint64_t elapsed) { - uint64_t now; - uint64_t elapsed; uint32_t i; - duration_t *d; - if (!s) - return 0.0; - - d = &s->d; - now = ts(d->unit); - elapsed = now - d->then; - i = s->h; + enif_mutex_lock(s->mutex); if (s->n == s->num_samples) { - s->mean = (s->mean + __stat_mean(s)) / 2.0; - if (s->n >= 4294967295) - __stat_reset(s); + s->mean = (s->mean + __stat_mean(s)) / 2.0; + if (s->n >= 4294967295) { + enif_mutex_unlock(s->mutex); + __stat_reset(s); + enif_mutex_lock(s->mutex); + } } + i = s->h; s->h = (s->h + 1) % s->num_samples; s->samples[i] = elapsed; if (elapsed != 0 && elapsed < s->min) - s->min = elapsed; + s->min = elapsed; if (elapsed > s->max) - s->max = elapsed; + s->max = elapsed; s->histogram[LOG2(elapsed)]++; s->n++; - d->then = ts(d->unit); - return elapsed; + enif_mutex_unlock(s->mutex); +} + +void +__stat_tock(struct stat *s) +{ + if (s) + __stat_add(s, ts(s->d.unit)); } void @@ -162,68 +170,73 @@ __stat_print_histogram(struct stat *s, const char *mod) double m; if (!s) - return; + return; + enif_mutex_lock(s->mutex); m = (s->mean + __stat_mean(s) / 2.0); fprintf(stderr, "%s:async_nif request latency histogram:\n", mod); for (i = 0; i < 64; i++) { - logs[i] = LOG2(s->histogram[i]); - if (logs[i] > max_log) - max_log = logs[i]; + logs[i] = LOG2(s->histogram[i]); + if (logs[i] > max_log) + max_log = logs[i]; } for (i = max_log; i > 0; i--) { - if (!(i % 10)) - fprintf(stderr, "2^%2d ", i); - else - fprintf(stderr, " "); - for(j = 0; j < 64; j++) - fprintf(stderr, logs[j] >= i ? "•" : " "); - fprintf(stderr, "\n"); + if (!(i % 10)) + fprintf(stderr, "2^%2d ", i); + else + fprintf(stderr, " "); + for(j = 0; j < 64; j++) + fprintf(stderr, logs[j] >= i ? 
"•" : " "); + fprintf(stderr, "\n"); } if (max_log == 0) { - fprintf(stderr, "[empty]\n"); + fprintf(stderr, "[empty]\n"); } else { - fprintf(stderr, " ns μs ms s ks\n"); - fprintf(stderr, "min: "); - if (s->min < 1000) - fprintf(stderr, "%llu (ns)", PRIuint64(s->min)); - else if (s->min < 1000000) - fprintf(stderr, "%.2f (μs)", s->min / 1000.0); - else if (s->min < 1000000000) - fprintf(stderr, "%.2f (ms)", s->min / 1000000.0); - else if (s->min < 1000000000000) - fprintf(stderr, "%.2f (s)", s->min / 1000000000.0); - fprintf(stderr, " max: "); - if (s->max < 1000) - fprintf(stderr, "%llu (ns)", PRIuint64(s->max)); - else if (s->max < 1000000) - fprintf(stderr, "%.2f (μs)", s->max / 1000.0); - else if (s->max < 1000000000) - fprintf(stderr, "%.2f (ms)", s->max / 1000000.0); - else if (s->max < 1000000000000) - fprintf(stderr, "%.2f (s)", s->max / 1000000000.0); - fprintf(stderr, " mean: "); - if (m < 1000) - fprintf(stderr, "%.2f (ns)", m); - else if (m < 1000000) - fprintf(stderr, "%.2f (μs)", m / 1000.0); - else if (m < 1000000000) - fprintf(stderr, "%.2f (ms)", m / 1000000.0); - else if (m < 1000000000000) - fprintf(stderr, "%.2f (s)", m / 1000000000.0); - fprintf(stderr, "\n"); + fprintf(stderr, " ns μs ms s ks\n"); + fprintf(stderr, "min: "); + if (s->min < 1000) + fprintf(stderr, "%llu (ns)", PRIuint64(s->min)); + else if (s->min < 1000000) + fprintf(stderr, "%.2f (μs)", s->min / 1000.0); + else if (s->min < 1000000000) + fprintf(stderr, "%.2f (ms)", s->min / 1000000.0); + else if (s->min < 1000000000000) + fprintf(stderr, "%.2f (s)", s->min / 1000000000.0); + fprintf(stderr, " max: "); + if (s->max < 1000) + fprintf(stderr, "%llu (ns)", PRIuint64(s->max)); + else if (s->max < 1000000) + fprintf(stderr, "%.2f (μs)", s->max / 1000.0); + else if (s->max < 1000000000) + fprintf(stderr, "%.2f (ms)", s->max / 1000000.0); + else if (s->max < 1000000000000) + fprintf(stderr, "%.2f (s)", s->max / 1000000000.0); + fprintf(stderr, " mean: "); + if (m < 1000) + fprintf(stderr, "%.2f (ns)", m); + else if (m < 1000000) + fprintf(stderr, "%.2f (μs)", m / 1000.0); + else if (m < 1000000000) + fprintf(stderr, "%.2f (ms)", m / 1000000.0); + else if (m < 1000000000000) + fprintf(stderr, "%.2f (s)", m / 1000000000.0); + fprintf(stderr, "\n"); } fflush(stderr); + enif_mutex_unlock(s->mutex); } void __stat_free(struct stat *s) { if (!s) - return; + return; + enif_mutex_lock(s->mutex); enif_free(s->samples); + enif_mutex_unlock(s->mutex); + enif_mutex_destroy(s->mutex); enif_free(s); } @@ -232,7 +245,7 @@ __stat_init(uint32_t n) { struct stat *s = enif_alloc(sizeof(struct stat) + (sizeof(uint64_t) * n)); if (!s) - return NULL; + return NULL; memset(s, 0, sizeof(struct stat) + (sizeof(uint64_t) * n)); s->min = ~0; s->max = 0; @@ -240,5 +253,6 @@ __stat_init(uint32_t n) s->h = 0; s->num_samples = n; s->d.unit = ns; + s->mutex = enif_mutex_create(NULL); return s; } diff --git a/c_src/stats.h b/c_src/stats.h index 0bdbe8e..f0e550f 100644 --- a/c_src/stats.h +++ b/c_src/stats.h @@ -30,6 +30,7 @@ extern "C" { #define STAT_DEF(name) struct stat *name ## _stat; struct stat { + ErlNifMutex *mutex; duration_t d; uint32_t h, n, num_samples; uint64_t min, max; @@ -41,8 +42,9 @@ struct stat { extern double __stat_mean(struct stat *s); extern double __stat_mean_log2(struct stat *s); extern uint64_t __stat_tick(struct stat *s); +extern void __stat_add(struct stat *s, uint64_t d); extern void __stat_reset(struct stat *s); -extern uint64_t __stat_tock(struct stat *s); +extern void __stat_tock(struct stat *s); extern void 
__stat_print_histogram(struct stat *s, const char *mod); extern void __stat_free(struct stat *s); extern struct stat *__stat_init(uint32_t n); diff --git a/c_src/wterl.c b/c_src/wterl.c index 7526ef3..c38654f 100644 --- a/c_src/wterl.c +++ b/c_src/wterl.c @@ -197,10 +197,12 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) uint64_t now, elapsed; struct wterl_ctx *c, *n; +#ifndef DEBUG if (conn_handle->cache_size < MAX_CACHE_SIZE) return 0; +#endif - now = cpu_clock_ticks(); + now = ts(ns); // Find the mean of the recorded times that items stayed in cache. mean = 0; @@ -226,7 +228,8 @@ __ctx_cache_evict(WterlConnHandle *conn_handle) if (log > mean) { STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries); DPRINTF("evicting: %llu", PRIuint64(c->sig)); - c->session->close(c->session, NULL); + if (c->session) + c->session->close(c->session, NULL); enif_free(c); num_evicted++; } @@ -256,22 +259,15 @@ __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig) if (c->sig == sig) { // TODO: hash collisions *will* lead to SEGVs // cache hit: STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries); - conn_handle->histogram[__log2(cpu_clock_ticks() - c->tstamp)]++; + conn_handle->histogram[__log2(ts(ns) - c->tstamp)]++; conn_handle->histogram_count++; conn_handle->cache_size -= 1; break; } c = STAILQ_NEXT(c, entries); } -#ifdef DEBUG - uint32_t sz = 0; - struct wterl_ctx *f; - STAILQ_FOREACH(f, &conn_handle->cache, entries) { - sz++; - } -#endif enif_mutex_unlock(conn_handle->cache_mutex); - DPRINTF("cache_find: [%u:%u] %s (%p)", sz, conn_handle->cache_size, c ? "hit" : "miss", c); + DPRINTF("cache_find: [%u] %s (%p)", conn_handle->cache_size, c ? "hit" : "miss", c); return c; } @@ -286,14 +282,14 @@ __ctx_cache_add(WterlConnHandle *conn_handle, struct wterl_ctx *c) { enif_mutex_lock(conn_handle->cache_mutex); __ctx_cache_evict(conn_handle); - c->tstamp = cpu_clock_ticks(); + c->tstamp = ts(ns); STAILQ_INSERT_TAIL(&conn_handle->cache, c, entries); conn_handle->cache_size += 1; #ifdef DEBUG uint32_t sz = 0; struct wterl_ctx *f; STAILQ_FOREACH(f, &conn_handle->cache, entries) { - sz++; + sz++; } #endif enif_mutex_unlock(conn_handle->cache_mutex); @@ -336,7 +332,7 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, hash = __str_hash(hash, session_config, l); crc = __crc32(crc, session_config, l); sig_len += l + 1; - DPRINTF("sig/1: %s", session_config); + DPRINTF("sig/1: %s", session_config); } else { sig_len += 1; } @@ -344,7 +340,7 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, arg = va_arg(ap, const char *); if (arg) { l = __strlen(arg); - DPRINTF("sig/args: %s", arg); + DPRINTF("sig/args: %s", arg); hash = __str_hash(hash, arg, l); crc = __crc32(crc, arg, l); sig_len += l + 1; @@ -360,21 +356,21 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, c = conn_handle->mru_ctx[worker_id]; if (CASPO(&conn_handle->mru_ctx[worker_id], c, 0) == c) { - if (c == 0) { - // mru miss: - DPRINTF("[%.4u] mru miss, empty", worker_id); - *ctx = NULL; - } else { - if (c->sig == sig) { - // mru hit: - DPRINTF("[%.4u] mru hit: %llu found", worker_id, PRIuint64(sig)); - *ctx = c; - } else { - // mru mismatch: - DPRINTF("[%.4u] mru miss: %llu != %llu", worker_id, PRIuint64(sig), PRIuint64(c->sig)); - __ctx_cache_add(conn_handle, c); - *ctx = NULL; - } + if (c == 0) { + // mru miss: + DPRINTF("[%.4u] mru miss, empty", worker_id); + *ctx = NULL; + } else { + if (c->sig == sig) { + // mru hit: + DPRINTF("[%.4u] mru hit: %llu found", worker_id, PRIuint64(sig)); + 
*ctx = c; + } else { + // mru mismatch: + DPRINTF("[%.4u] mru miss: %llu != %llu", worker_id, PRIuint64(sig), PRIuint64(c->sig)); + __ctx_cache_add(conn_handle, c); + *ctx = NULL; + } } } @@ -2309,7 +2305,7 @@ static void __wterl_conn_dtor(ErlNifEnv* env, void* obj) WterlConnHandle *conn_handle = (WterlConnHandle *)obj; if (conn_handle->cache_mutex) { - DPRINTF("Non-NULL conn_handle (%p) to free", obj); + DPRINTF("conn_handle dtor free'ing (%p)", obj); enif_mutex_lock(conn_handle->cache_mutex); __close_all_sessions(conn_handle); conn_handle->conn->close(conn_handle->conn, NULL); diff --git a/src/async_nif.hrl b/src/async_nif.hrl index 9d0f215..9034d8a 100644 --- a/src/async_nif.hrl +++ b/src/async_nif.hrl @@ -26,9 +26,6 @@ async_nif_enqueue(R, F, A) -> case erlang:apply(F, [R|A]) of {ok, enqueued} -> receive - {R, {error, eagain}} -> - %% Work unit was not queued, try again. - async_nif_enqueue(R, F, A); {R, {error, shutdown}=Error} -> %% Work unit was queued, but not executed. Error; @@ -38,6 +35,11 @@ async_nif_enqueue(R, F, A) -> {R, Reply} -> Reply end; + {error, eagain} -> + %% Work unit was not queued, try again. + async_nif_enqueue(R, F, A); + %{error, enomem} -> + %{error, shutdown} -> Other -> Other end. diff --git a/src/wterl.erl b/src/wterl.erl index db44807..9045be2 100644 --- a/src/wterl.erl +++ b/src/wterl.erl @@ -524,7 +524,7 @@ set_event_handler_pid(Pid) -define(TEST_DATA_DIR, "test/wterl.basic"). open_test_conn(DataDir) -> - open_test_conn(DataDir, [{create,true},{cache_size,"100MB"},{session_max, 8192}]). + open_test_conn(DataDir, [{create,true},{cache_size,"1GB"},{session_max, 8192}]). open_test_conn(DataDir, OpenConfig) -> {ok, CWD} = file:get_cwd(), rmdir:path(filename:join([CWD, DataDir])), %?cmd("rm -rf " ++ filename:join([CWD, DataDir])), @@ -606,7 +606,7 @@ insert_delete_test() -> many_open_tables_test_() -> {timeout, 120, fun() -> - ConnOpts = [{create,true},{cache_size,"100MB"},{session_max, 8192}], + ConnOpts = [{create,true},{cache_size,"1GB"},{session_max, 8192}], DataDir = ?TEST_DATA_DIR, KeyGen = fun(X) -> @@ -620,19 +620,31 @@ many_open_tables_test_() -> fun(X) -> "lsm:" ++ integer_to_list(X) end, - N = 1000, + NumTables = 16, N = 100, ConnRef = open_test_conn(DataDir, ConnOpts), Parent = self(), - [wterl:create(ConnRef, TableNameGen(X), [{checksum, "uncompressed"}]) || X <- lists:seq(0, 128)], + [ok = wterl:create(ConnRef, TableNameGen(X), [{checksum, "uncompressed"}]) || X <- lists:seq(0, NumTables)], [spawn(fun() -> TableName = TableNameGen(X), - [wterl:put(ConnRef, TableName, KeyGen(P), ValGen()) || P <- lists:seq(1, N)], - [wterl:get(ConnRef, TableName, KeyGen(P)) || P <- lists:seq(1, N)], - [wterl:delete(ConnRef, TableName, KeyGen(P)) || P <- lists:seq(1, N)], + [case wterl:put(ConnRef, TableName, KeyGen(P), ValGen()) of + ok -> ok; + {error, {enoent, _}} -> io:format("put failed, table missing ~p~n", [TableName]) + end || P <- lists:seq(1, N)], + [case wterl:get(ConnRef, TableName, KeyGen(P)) of + {ok, _} -> ok; + {error, {enoent, _}} -> io:format("get failed, table missing ~p~n", [TableName]) + end || P <- lists:seq(1, N)], + [case wterl:delete(ConnRef, TableName, KeyGen(P)) of + ok -> ok; + {error, {enoent, _}} -> io:format("delete failed, table missing ~p~n", [TableName]) + end || P <- lists:seq(1, N)], Parent ! 
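The mru_ctx[] handling in the wterl.c hunk above boils down to a small claim/release protocol around one pointer slot per worker. A minimal sketch, assuming the CASPO() compare-and-swap-on-pointer primitive used in the patch; the helper names mru_claim and mru_release are hypothetical, introduced only for illustration:

    /* Claim: atomically swap the worker's slot to NULL; a non-NULL return
       means this thread now owns that context exclusively. */
    static struct wterl_ctx *
    mru_claim(struct wterl_ctx **slot)
    {
        struct wterl_ctx *c = *slot;
        if (c != NULL && CASPO(slot, c, NULL) == c)
            return c;   /* won the race, and no mutex was taken */
        return NULL;    /* slot empty, or another thread won */
    }

    /* Release: try to park the context back in the slot; if some other
       context got there first, spill ours into the shared cache. */
    static void
    mru_release(WterlConnHandle *conn, struct wterl_ctx **slot, struct wterl_ctx *c)
    {
        if (CASPO(slot, NULL, c) != NULL)
            __ctx_cache_add(conn, c);
    }

Because the slot is cleared on every claim, a hit costs a single atomic exchange; only misses and signature mismatches fall back to the mutex-protected cache list.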
From 2672bab3ea0ae2c311373abd598397bdde0200ac Mon Sep 17 00:00:00 2001
From: Gregory Burd
Date: Tue, 2 Jul 2013 19:58:00 -0400
Subject: [PATCH 28/30] Cut stats overhead out of the request path

Stats overhead from hitting the clock and taking a mutex caused a massive
slowdown. Work is now assigned to a different queue only when the candidate
queue is deeper than the average of the other queues, and a thread is
created only when the depth of a queue is larger than the number of threads
already working on it.

---
 c_src/async_nif.h | 39 +++++++--------------------------------
 c_src/stats.c     | 18 ------------------
 c_src/stats.h     |  1 -
 3 files changed, 7 insertions(+), 51 deletions(-)
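Both decisions are now plain integer comparisons made under the queue lock the enqueue path already holds. A condensed sketch of the two checks the message describes; names are simplified, and qid (the candidate queue's index) is assumed here for illustration since the queue-selection loop sits outside the hunks below:

    /* Queue selection: accept the candidate queue only if it is no deeper
       than the average depth of the queues, otherwise try the next one. */
    unsigned int i, avg = 0;
    for (i = 0; i < async_nif->num_queues; i++)
        avg += async_nif->queues[i].depth;
    avg /= async_nif->num_queues;
    if (q->depth > avg)
        q = &async_nif->queues[(qid + 1) % async_nif->num_queues]; /* retry */

    /* Worker startup: grow the pool only while queued requests outnumber
       the threads already draining this queue. */
    if (q->depth > q->num_workers)
        if (async_nif_start_worker(async_nif, q) == 0)
            q->num_workers++;

Neither check reads the clock or takes an extra mutex, which is the point of the patch.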
diff --git a/c_src/async_nif.h b/c_src/async_nif.h
index c18106c..425c0f1 100644
--- a/c_src/async_nif.h
+++ b/c_src/async_nif.h
@@ -27,7 +27,6 @@ extern "C" {
 #include 
 #include "queue.h"
-#include "stats.h"
 
 #ifndef UNUSED
 #define UNUSED(v) ((void)(v))
@@ -35,7 +34,7 @@ extern "C" {
 
 #define ASYNC_NIF_MAX_WORKERS 1024
 #define ASYNC_NIF_MIN_WORKERS 2
-#define ASYNC_NIF_WORKER_QUEUE_SIZE 1000
+#define ASYNC_NIF_WORKER_QUEUE_SIZE 100
 #define ASYNC_NIF_MAX_QUEUED_REQS ASYNC_NIF_WORKER_QUEUE_SIZE * ASYNC_NIF_MAX_WORKERS
 
 struct async_nif_req_entry {
@@ -56,7 +55,6 @@ struct async_nif_work_queue {
     ErlNifMutex *reqs_mutex;
     ErlNifCond *reqs_cnd;
     STAILQ_HEAD(reqs, async_nif_req_entry) reqs;
-    STAT_DEF(work);
 };
 
 struct async_nif_worker_entry {
@@ -68,8 +66,6 @@ struct async_nif_worker_entry {
 };
 
 struct async_nif_state {
-    STAT_DEF(wait);
-    STAT_DEF(service);
     unsigned int shutdown;
     ErlNifMutex *we_mutex;
     unsigned int we_active;
@@ -349,26 +345,15 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en
         return 0;
     }
 
-    /* We've selected queue for this new request now check to make sure there are
-       enough workers actively processing requests on this queue. */
-    if (q->num_workers < ASYNC_NIF_MIN_WORKERS) {
-	if (async_nif_start_worker(async_nif, q) == 0) q->num_workers++;
-    } else {
-	/* If more the 1/4 of the time it takes to complete a request is spent in
-	   waiting to be serviced then we need more worker threads to service
-	   requests. */
-	double m, n;
-	m = __stat_mean(async_nif->wait_stat);
-	n = __stat_mean(q->work_stat);
-	DPRINTF("w: %f\ts: %f\t%f", m, n, n/(m+n));
-	if (m && n && n / (m+n) < 0.75)
-	    if (async_nif_start_worker(async_nif, q) == 0) q->num_workers++;
-    }
-
-    /* And finally add the request to the queue. */
+    /* Add the request to the queue. */
     STAILQ_INSERT_TAIL(&q->reqs, req, entries);
     q->depth++;
 
+    /* We've selected a queue for this new request; now check to make sure there are
+       enough workers actively processing requests on this queue. */
+    if (q->depth > q->num_workers)
+        if (async_nif_start_worker(async_nif, q) == 0) q->num_workers++;
+
     /* Build the term before releasing the lock so as not to race on the use of
        the req pointer (which will soon become invalid in another thread
        performing the request). */
@@ -423,13 +408,7 @@ async_nif_worker_fn(void *arg)
             enif_mutex_unlock(q->reqs_mutex);
 
             /* Perform the work. */
-            uint64_t then = ts(q->work_stat->d.unit);
-            uint64_t wait = then - req->submitted;
             req->fn_work(req->env, req->ref, &req->pid, worker_id, req->args);
-            uint64_t work = ts(q->work_stat->d.unit) - then;
-            __stat_add(async_nif->wait_stat, wait);
-            __stat_add(q->work_stat, work);
-            __stat_add(async_nif->service_stat, wait + work);
 
             /* Now call the post-work cleanup function. */
             req->fn_post(req->args);
@@ -481,7 +460,6 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif)
         q = &async_nif->queues[i];
         enif_mutex_unlock(q->reqs_mutex);
     }
-    __stat_print_histogram(async_nif->service_stat, "wterl");
 
     /* Join for the now exiting worker threads. */
     while(async_nif->we_active > 0) {
@@ -586,15 +564,12 @@ async_nif_load()
     async_nif->recycled_req_mutex = enif_mutex_create(NULL);
     async_nif->we_mutex = enif_mutex_create(NULL);
     SLIST_INIT(&async_nif->we_joining);
-    async_nif->wait_stat = __stat_init(100000);
-    async_nif->service_stat = __stat_init(100000);
 
     for (i = 0; i < async_nif->num_queues; i++) {
         struct async_nif_work_queue *q = &async_nif->queues[i];
         STAILQ_INIT(&q->reqs);
         q->reqs_mutex = enif_mutex_create(NULL);
         q->reqs_cnd = enif_cond_create(NULL);
-        q->work_stat = __stat_init(100000);
     }
     return async_nif;
 }
diff --git a/c_src/stats.c b/c_src/stats.c
index d018c76..5583374 100644
--- a/c_src/stats.c
+++ b/c_src/stats.c
@@ -66,7 +66,6 @@ __stat_mean(struct stat *s)
     if (!s)
         return 0.0;
 
-    enif_mutex_lock(s->mutex);
     t = s->h;
     h = (s->h + 1) % s->num_samples;
     mean = 0;
@@ -77,7 +76,6 @@ __stat_mean(struct stat *s)
     }
     if (mean > 0)
         mean /= (s->n < s->num_samples ? (double)s->n : (double)s->num_samples);
-    enif_mutex_unlock(s->mutex);
     return mean;
 }
 
@@ -90,12 +88,10 @@ __stat_mean_log2(struct stat *s)
     if (!s)
         return 0.0;
 
-    enif_mutex_lock(s->mutex);
     for (i = 0; i < 64; i++)
         mean += (s->histogram[i] * i);
     if (mean > 0)
         mean /= (s->n < s->num_samples ? s->n : s->num_samples);
-    enif_mutex_unlock(s->mutex);
     return mean;
 }
 
@@ -107,10 +103,8 @@ __stat_tick(struct stat *s)
     if (!s)
         return 0.0;
 
-    enif_mutex_lock(s->mutex);
     t = ts(s->d.unit);
     s->d.then = t;
-    enif_mutex_unlock(s->mutex);
     return t;
 }
 
@@ -120,13 +114,11 @@ __stat_reset(struct stat *s)
     if (!s)
         return;
 
-    enif_mutex_lock(s->mutex);
     s->h = 0;
     s->d.unit = ns;
     s->d.then = 0;
     memset(s->histogram, 0, sizeof(uint64_t) * 64);
     memset(s->samples, 0, sizeof(uint64_t) * s->num_samples);
-    enif_mutex_unlock(s->mutex);
 }
 
 void
@@ -134,13 +126,10 @@ __stat_add(struct stat *s, uint64_t elapsed)
 {
     uint32_t i;
 
-    enif_mutex_lock(s->mutex);
     if (s->n == s->num_samples) {
         s->mean = (s->mean + __stat_mean(s)) / 2.0;
         if (s->n >= 4294967295) {
-            enif_mutex_unlock(s->mutex);
             __stat_reset(s);
-            enif_mutex_lock(s->mutex);
         }
     }
     i = s->h;
@@ -152,7 +141,6 @@ __stat_add(struct stat *s, uint64_t elapsed)
         s->max = elapsed;
     s->histogram[LOG2(elapsed)]++;
     s->n++;
-    enif_mutex_unlock(s->mutex);
 }
 
 void
@@ -172,7 +160,6 @@ __stat_print_histogram(struct stat *s, const char *mod)
     if (!s)
         return;
 
-    enif_mutex_lock(s->mutex);
     m = (s->mean + __stat_mean(s) / 2.0);
 
     fprintf(stderr, "%s:async_nif request latency histogram:\n", mod);
@@ -224,7 +211,6 @@ __stat_print_histogram(struct stat *s, const char *mod)
         fprintf(stderr, "\n");
     }
     fflush(stderr);
-    enif_mutex_unlock(s->mutex);
 }
 
 void
@@ -233,10 +219,7 @@ __stat_free(struct stat *s)
     if (!s)
         return;
 
-    enif_mutex_lock(s->mutex);
     enif_free(s->samples);
-    enif_mutex_unlock(s->mutex);
-    enif_mutex_destroy(s->mutex);
     enif_free(s);
 }
 
@@ -253,6 +236,5 @@ __stat_init(uint32_t n)
     s->h = 0;
     s->num_samples = n;
     s->d.unit = ns;
-    s->mutex = enif_mutex_create(NULL);
     return s;
 }
diff --git a/c_src/stats.h b/c_src/stats.h
index f0e550f..c563491 100644
--- a/c_src/stats.h
+++ b/c_src/stats.h
@@ -30,7 +30,6 @@ extern "C" {
 #define STAT_DEF(name) struct stat *name ## _stat;
 
 struct stat {
-    ErlNifMutex *mutex;
     duration_t d;
     uint32_t h, n, num_samples;
     uint64_t min, max;

From b7275381624746993e6d9e3576c9353e4c830ce1 Mon Sep 17 00:00:00 2001
From: Gregory Burd
Date: Tue, 2 Jul 2013 22:07:34 -0400
Subject: [PATCH 29/30] Fix shutdown

---
 c_src/async_nif.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)
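The shutdown crash traced back to the drain loops: they iterate with a local pointer but unlinked entries with SLIST_REMOVE_HEAD(), which always pops whatever is at the head of we_joining, not necessarily the entry being joined, so once two threads drained the list concurrently an entry could be freed while still linked. The fix is twofold: unlink the exact element with SLIST_REMOVE(), and hold we_mutex around the drain in unload. The resulting pattern, reassembled from the hunks below as an illustrative sketch rather than a verbatim copy:

    enif_mutex_lock(async_nif->we_mutex);
    we = SLIST_FIRST(&async_nif->we_joining);
    while (we != NULL) {
        struct async_nif_worker_entry *next = SLIST_NEXT(we, entries);
        /* Unlink *this* entry, not whatever happens to be at the head. */
        SLIST_REMOVE(&async_nif->we_joining, we, async_nif_worker_entry, entries);
        void *exit_value = NULL; /* ignored */
        enif_thread_join(we->tid, &exit_value);
        enif_free(we);
        async_nif->we_active--;
        we = next;
    }
    enif_mutex_unlock(async_nif->we_mutex);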
diff --git a/c_src/async_nif.h b/c_src/async_nif.h
index 425c0f1..713270b 100644
--- a/c_src/async_nif.h
+++ b/c_src/async_nif.h
@@ -252,7 +252,7 @@ async_nif_start_worker(struct async_nif_state *async_nif, struct async_nif_work_
       we = SLIST_FIRST(&async_nif->we_joining);
       while(we != NULL) {
           struct async_nif_worker_entry *n = SLIST_NEXT(we, entries);
-          SLIST_REMOVE_HEAD(&async_nif->we_joining, entries);
+          SLIST_REMOVE(&async_nif->we_joining, we, async_nif_worker_entry, entries);
           void *exit_value = 0; /* We ignore the thread_join's exit value. */
           enif_thread_join(we->tid, &exit_value);
           enif_free(we);
@@ -451,11 +451,9 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif)
         q = &async_nif->queues[i];
         enif_mutex_lock(q->reqs_mutex);
     }
-
     /* Set the shutdown flag so that worker threads will no continue executing requests. */
     async_nif->shutdown = 1;
-
     for (i = 0; i < num_queues; i++) {
         q = &async_nif->queues[i];
         enif_mutex_unlock(q->reqs_mutex);
     }
 
     /* Join for the now exiting worker threads. */
     while(async_nif->we_active > 0) {
-        for (i = 0; i < num_queues; i++) enif_cond_broadcast(async_nif->queues[i].reqs_cnd);
-
+        enif_mutex_lock(async_nif->we_mutex);
         we = SLIST_FIRST(&async_nif->we_joining);
         while(we != NULL) {
             struct async_nif_worker_entry *n = SLIST_NEXT(we, entries);
-            SLIST_REMOVE_HEAD(&async_nif->we_joining, entries);
+            SLIST_REMOVE(&async_nif->we_joining, we, async_nif_worker_entry, entries);
             void *exit_value = 0; /* We ignore the thread_join's exit value. */
             enif_thread_join(we->tid, &exit_value);
             enif_free(we);
             async_nif->we_active--;
             we = n;
         }
+        enif_mutex_unlock(async_nif->we_mutex);
     }
 
     enif_mutex_destroy(async_nif->we_mutex);

From bc0f5dbfc7c14e890e45c420a4fcb1fa1dba2b30 Mon Sep 17 00:00:00 2001
From: Gregory Burd
Date: Tue, 2 Jul 2013 22:23:32 -0400
Subject: [PATCH 30/30] Evict the older half of the cache from the end of the
 list

Evict the older half of the items in the cache by removing them from the end
of the list; don't waste cycles computing timestamps along the way.

---
 c_src/async_nif.h |   3 -
 c_src/duration.h  | 123 ------------------------
 c_src/queue.h     |  11 +++
 c_src/stats.c     | 240 ----------------------------------------------
 c_src/stats.h     |  55 -----------
 c_src/wterl.c     |  42 ++------
 6 files changed, 18 insertions(+), 456 deletions(-)
 delete mode 100644 c_src/duration.h
 delete mode 100644 c_src/stats.c
 delete mode 100644 c_src/stats.h
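Eviction now ignores time entirely: when the cache hits its cap it simply drops half of the entries from the end of the list, so the hot path no longer reads the clock or maintains a residence-time histogram. A condensed sketch of the policy implemented below (illustrative; the DEBUG guard in the real hunk is omitted here):

    static int
    __ctx_cache_evict(WterlConnHandle *conn_handle)
    {
        uint32_t todo, num_evicted = 0;
        struct wterl_ctx *c;

        if (conn_handle->cache_size < MAX_CACHE_SIZE)
            return 0;
        todo = conn_handle->cache_size / 2;   /* drop half of the entries */
        while (todo--) {
            c = STAILQ_LAST(&conn_handle->cache, wterl_ctx, entries);
            if (c == NULL)
                break;
            STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries);
            if (c->session)
                c->session->close(c->session, NULL);
            enif_free(c);
            num_evicted++;
        }
        conn_handle->cache_size -= num_evicted;
        return num_evicted;
    }

STAILQ_LAST() recovers the element from the address of the tail's stored next-pointer, which is why queue.h grows __offsetof/__containerof in this same patch.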
diff --git a/c_src/async_nif.h b/c_src/async_nif.h
index 713270b..92ebe66 100644
--- a/c_src/async_nif.h
+++ b/c_src/async_nif.h
@@ -44,7 +44,6 @@ struct async_nif_req_entry {
     void *args;
     void (*fn_work)(ErlNifEnv*, ERL_NIF_TERM, ErlNifPid*, unsigned int, void *);
     void (*fn_post)(void *);
-    uint64_t submitted;
     STAILQ_ENTRY(async_nif_req_entry) entries;
 };
 
@@ -109,7 +108,6 @@ struct async_nif_state {
                      enif_make_atom(env, "shutdown"));              \
         }                                                           \
         req = async_nif_reuse_req(async_nif);                       \
-        req->submitted = ts(ns);                                    \
         if (!req) {                                                 \
             return enif_make_tuple2(env, enif_make_atom(env, "error"), \
                            enif_make_atom(env, "eagain"));          \
@@ -415,7 +413,6 @@ async_nif_worker_fn(void *arg)
 
             /* Clean up req for reuse. */
             req->ref = 0;
-            req->submitted = 0;
             req->fn_work = 0;
             req->fn_post = 0;
             enif_free(req->args);
diff --git a/c_src/duration.h b/c_src/duration.h
deleted file mode 100644
index 2d86385..0000000
--- a/c_src/duration.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (C) 2013, all rights reserved by Gregory Burd
- *
- * This Source Code Form is subject to the terms of the Mozilla Public License,
- * version 2 (MPLv2).  If a copy of the MPL was not distributed with this file,
- * you can obtain one at: http://mozilla.org/MPL/2.0/
- *
- * NOTES:
- *   - on some platforms this will require -lrt
- */
-#include 
-#include 
-#include 
-#include 
-
-#ifdef __MACH__
-#include 
-#include 
-#endif
-
-
-static inline void current_utc_time(struct timespec *ts)
-{
-#ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time
-    clock_serv_t cclock;
-    mach_timespec_t mts;
-    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
-    clock_get_time(cclock, &mts);
-    mach_port_deallocate(mach_task_self(), cclock);
-    ts->tv_sec = mts.tv_sec;
-    ts->tv_nsec = mts.tv_nsec;
-#else
-    clock_gettime(CLOCK_REALTIME, ts);
-#endif
-
-}
-
-typedef enum { ns = 0, mcs, ms, s } time_scale;
-struct scale_time {
-    const char *abbreviation;
-    const char *name;
-    uint64_t mul, div, overhead, ticks_per;
-};
-static const struct scale_time scale[] = {
-    { "ns",  "nanosecond",  1000000000LL, 1LL,          10, 2300000000000LL },
-    { "μs",  "microsecond", 1000000LL,    1000LL,       10, 2300000000LL },
-    { "ms",  "millisecond", 1000LL,       1000000LL,    10, 2300000LL },
-    { "sec", "second",      1LL,          1000000000LL, 10, 2300LL } };
-
-static uint64_t ts(time_scale unit)
-{
-    struct timespec ts;
-    current_utc_time(&ts);
-    return (((uint64_t)ts.tv_sec * scale[unit].mul) +
-            ((uint64_t)ts.tv_nsec / scale[unit].div));
-}
-
-#if defined(__i386__) || defined(__x86_64__)
-
-/**
- * cpu_clock_ticks()
- *
- * A measure provided by Intel x86 CPUs which provides the number of cycles
- * (aka "ticks") executed as a counter using the RDTSC instruction.
- */
-static inline uint64_t cpu_clock_ticks()
-{
-    uint32_t lo, hi;
-    __asm__ __volatile__ (
-        "XORL %%eax, %%eax\n" /* Flush the pipeline */
-        "CPUID\n"
-        "RDTSC\n"             /* Get RDTSC counter in edx:eax */
-        : "=a" (lo), "=d" (hi)
-        :
-        : "%ebx", "%ecx" );
-    return (uint64_t)hi << 32 | lo;
-}
-
-#endif
-
-#if 0
-
-/**
- * cpu_clock_ticks()
- *
- * An approximation of elapsed [ns, mcs, ms, s] from CPU clock ticks.
- */
-static uint64_t elapsed_cpu_clock_ticks(uint64_t start, time_scale unit)
-{
-    return (cpu_clock_ticks() - start - scale[unit].overhead) * scale[unit].ticks_per;
-}
-
-#endif
-
-typedef struct {
-    uint64_t then;
-    time_scale unit;
-} duration_t;
-
-static inline uint64_t elapsed(duration_t *d)
-{
-    uint64_t now = ts(d->unit);
-    uint64_t elapsed = now - d->then;
-    d->then = now;
-    return elapsed;
-}
-
-#define DURATION(name, resolution) duration_t name = \
-    {ts(resolution), resolution}
-
-#define ELAPSED_DURING(result, resolution, block)    \
-    do {                                             \
-        DURATION(__x, resolution);                   \
-        do block while(0);                           \
-        *result = elapsed(&__x);                     \
-    } while(0);
-
-#define CYCLES_DURING(result, block)                 \
-    do {                                             \
-        uint64_t __begin = cpu_clock_ticks();        \
-        do block while(0);                           \
-        *result = cpu_clock_ticks() - __begin;       \
-    } while(0);
diff --git a/c_src/queue.h b/c_src/queue.h
index 9235d47..4c6a153 100644
--- a/c_src/queue.h
+++ b/c_src/queue.h
@@ -33,6 +33,17 @@
 #ifndef _DB_QUEUE_H_
 #define _DB_QUEUE_H_
 
+#ifndef __offsetof
+#define __offsetof(st, m) \
+    ((size_t) ( (char *)&((st *)0)->m - (char *)0 ))
+#endif
+
+#ifndef __containerof
+#define __containerof(ptr, type, member) ({                  \
+    const typeof( ((type *)0)->member ) *__mptr = (ptr);     \
+    (type *)( (char *)__mptr - __offsetof(type,member) );})
+#endif
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
diff --git a/c_src/stats.c b/c_src/stats.c
deleted file mode 100644
index 5583374..0000000
--- a/c_src/stats.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * stats:
- *
- * Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved.
- * Author: Gregory Burd
- *
- * This file is provided to you under the Apache License,
- * Version 2.0 (the "License"); you may not use this file
- * except in compliance with the License.  You may obtain
- * a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include 
-
-#include "erl_nif.h"
-#include "erl_driver.h"
-
-#include "common.h"
-#include "duration.h"
-#include "stats.h"
-
-/**
- * Calculate the log2 of 64bit unsigned integers.
- */
-#ifdef __GCC__
-#define LOG2(X) ((unsigned) ((8 * (sizeof(uint64_t) - 1)) - __builtin_clzll((X))))
-#else
-static unsigned int __log2_64(uint64_t x) {
-    static const int tab64[64] = {
-        63,  0, 58,  1, 59, 47, 53,  2,
-        60, 39, 48, 27, 54, 33, 42,  3,
-        61, 51, 37, 40, 49, 18, 28, 20,
-        55, 30, 34, 11, 43, 14, 22,  4,
-        62, 57, 46, 52, 38, 26, 32, 41,
-        50, 36, 17, 19, 29, 10, 13, 21,
-        56, 45, 25, 31, 35, 16,  9, 12,
-        44, 24, 15,  8, 23,  7,  6,  5};
-    if (x == 0) return 0;
-    uint64_t v = x;
-    v |= v >> 1;
-    v |= v >> 2;
-    v |= v >> 4;
-    v |= v >> 8;
-    v |= v >> 16;
-    v |= v >> 32;
-    return tab64[((uint64_t)((v - (v >> 1)) * 0x07EDD5E59A4E28C2)) >> 58];
-}
-#define LOG2(X) __log2_64(X)
-#endif
-
-double
-__stat_mean(struct stat *s)
-{
-    uint32_t t, h;
-    double mean;
-
-    if (!s)
-        return 0.0;
-
-    t = s->h;
-    h = (s->h + 1) % s->num_samples;
-    mean = 0;
-
-    while (h != t) {
-        mean += s->samples[h];
-        h = (h + 1) % s->num_samples;
-    }
-    if (mean > 0)
-        mean /= (s->n < s->num_samples ? (double)s->n : (double)s->num_samples);
-    return mean;
-}
-
-double
-__stat_mean_log2(struct stat *s)
-{
-    uint32_t i;
-    double mean = 0.0;
-
-    if (!s)
-        return 0.0;
-
-    for (i = 0; i < 64; i++)
-        mean += (s->histogram[i] * i);
-    if (mean > 0)
-        mean /= (s->n < s->num_samples ? s->n : s->num_samples);
-    return mean;
-}
-
-uint64_t
-__stat_tick(struct stat *s)
-{
-    uint64_t t;
-
-    if (!s)
-        return 0.0;
-
-    t = ts(s->d.unit);
-    s->d.then = t;
-    return t;
-}
-
-void
-__stat_reset(struct stat *s)
-{
-    if (!s)
-        return;
-
-    s->h = 0;
-    s->d.unit = ns;
-    s->d.then = 0;
-    memset(s->histogram, 0, sizeof(uint64_t) * 64);
-    memset(s->samples, 0, sizeof(uint64_t) * s->num_samples);
-}
-
-void
-__stat_add(struct stat *s, uint64_t elapsed)
-{
-    uint32_t i;
-
-    if (s->n == s->num_samples) {
-        s->mean = (s->mean + __stat_mean(s)) / 2.0;
-        if (s->n >= 4294967295) {
-            __stat_reset(s);
-        }
-    }
-    i = s->h;
-    s->h = (s->h + 1) % s->num_samples;
-    s->samples[i] = elapsed;
-    if (elapsed != 0 && elapsed < s->min)
-        s->min = elapsed;
-    if (elapsed > s->max)
-        s->max = elapsed;
-    s->histogram[LOG2(elapsed)]++;
-    s->n++;
-}
-
-void
-__stat_tock(struct stat *s)
-{
-    if (s)
-        __stat_add(s, ts(s->d.unit));
-}
-
-void
-__stat_print_histogram(struct stat *s, const char *mod)
-{
-    uint8_t logs[64];
-    uint8_t i, j, max_log = 0;
-    double m;
-
-    if (!s)
-        return;
-
-    m = (s->mean + __stat_mean(s) / 2.0);
-
-    fprintf(stderr, "%s:async_nif request latency histogram:\n", mod);
-    for (i = 0; i < 64; i++) {
-        logs[i] = LOG2(s->histogram[i]);
-        if (logs[i] > max_log)
-            max_log = logs[i];
-    }
-    for (i = max_log; i > 0; i--) {
-        if (!(i % 10))
-            fprintf(stderr, "2^%2d ", i);
-        else
-            fprintf(stderr, "     ");
-        for(j = 0; j < 64; j++)
-            fprintf(stderr, logs[j] >= i ? "•" : " ");
-        fprintf(stderr, "\n");
-    }
-    if (max_log == 0) {
-        fprintf(stderr, "[empty]\n");
-    } else {
-        fprintf(stderr, "           ns        μs        ms        s         ks\n");
-        fprintf(stderr, "min: ");
-        if (s->min < 1000)
-            fprintf(stderr, "%llu (ns)", PRIuint64(s->min));
-        else if (s->min < 1000000)
-            fprintf(stderr, "%.2f (μs)", s->min / 1000.0);
-        else if (s->min < 1000000000)
-            fprintf(stderr, "%.2f (ms)", s->min / 1000000.0);
-        else if (s->min < 1000000000000)
-            fprintf(stderr, "%.2f (s)", s->min / 1000000000.0);
-        fprintf(stderr, "  max: ");
-        if (s->max < 1000)
-            fprintf(stderr, "%llu (ns)", PRIuint64(s->max));
-        else if (s->max < 1000000)
-            fprintf(stderr, "%.2f (μs)", s->max / 1000.0);
-        else if (s->max < 1000000000)
-            fprintf(stderr, "%.2f (ms)", s->max / 1000000.0);
-        else if (s->max < 1000000000000)
-            fprintf(stderr, "%.2f (s)", s->max / 1000000000.0);
-        fprintf(stderr, "  mean: ");
-        if (m < 1000)
-            fprintf(stderr, "%.2f (ns)", m);
-        else if (m < 1000000)
-            fprintf(stderr, "%.2f (μs)", m / 1000.0);
-        else if (m < 1000000000)
-            fprintf(stderr, "%.2f (ms)", m / 1000000.0);
-        else if (m < 1000000000000)
-            fprintf(stderr, "%.2f (s)", m / 1000000000.0);
-        fprintf(stderr, "\n");
-    }
-    fflush(stderr);
-}
-
-void
-__stat_free(struct stat *s)
-{
-    if (!s)
-        return;
-
-    enif_free(s->samples);
-    enif_free(s);
-}
-
-struct stat *
-__stat_init(uint32_t n)
-{
-    struct stat *s = enif_alloc(sizeof(struct stat) + (sizeof(uint64_t) * n));
-    if (!s)
-        return NULL;
-    memset(s, 0, sizeof(struct stat) + (sizeof(uint64_t) * n));
-    s->min = ~0;
-    s->max = 0;
-    s->mean = 0.0;
-    s->h = 0;
-    s->num_samples = n;
-    s->d.unit = ns;
-    return s;
-}
diff --git a/c_src/stats.h b/c_src/stats.h
deleted file mode 100644
index c563491..0000000
--- a/c_src/stats.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * stats:
- *
- * Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved.
- * Author: Gregory Burd
- *
- * This file is provided to you under the Apache License,
- * Version 2.0 (the "License"); you may not use this file
- * except in compliance with the License.  You may obtain
- * a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-#ifndef __STATS_H__
-#define __STATS_H__
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-#define STAT_DEF(name) struct stat *name ## _stat;
-
-struct stat {
-    duration_t d;
-    uint32_t h, n, num_samples;
-    uint64_t min, max;
-    double mean;
-    uint64_t histogram[64];
-    uint64_t samples[];
-};
-
-extern double __stat_mean(struct stat *s);
-extern double __stat_mean_log2(struct stat *s);
-extern uint64_t __stat_tick(struct stat *s);
-extern void __stat_add(struct stat *s, uint64_t d);
-extern void __stat_reset(struct stat *s);
-extern void __stat_tock(struct stat *s);
-extern void __stat_print_histogram(struct stat *s, const char *mod);
-extern void __stat_free(struct stat *s);
-extern struct stat *__stat_init(uint32_t n);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif // __STATS_H__
diff --git a/c_src/wterl.c b/c_src/wterl.c
index c38654f..8a04d57 100644
--- a/c_src/wterl.c
+++ b/c_src/wterl.c
@@ -29,8 +29,6 @@
 
 #include "wiredtiger.h"
 #include "common.h"
-#include "duration.h"
-#include "stats.h"
 #include "async_nif.h"
 #include "queue.h"
 #include "cas.h"
@@ -44,7 +42,6 @@ typedef char Uri[128];
 
 struct wterl_ctx {
     STAILQ_ENTRY(wterl_ctx) entries;
-    uint64_t tstamp;
     uint64_t sig;
     size_t sig_len;
     WT_SESSION *session;
@@ -64,8 +61,6 @@ typedef struct wterl_conn {
     ErlNifMutex *cache_mutex;
     uint32_t cache_size;
     struct wterl_ctx *mru_ctx[ASYNC_NIF_MAX_WORKERS];
-    uint64_t histogram[64];
-    uint64_t histogram_count;
 } WterlConnHandle;
 
 typedef struct {
@@ -193,47 +188,27 @@ static inline uint32_t __log2(uint64_t x) {
 static int
 __ctx_cache_evict(WterlConnHandle *conn_handle)
 {
-    uint32_t mean, log, num_evicted, i;
-    uint64_t now, elapsed;
-    struct wterl_ctx *c, *n;
+    uint32_t mean, num_evicted;
+    struct wterl_ctx *c;
 
 #ifndef DEBUG
     if (conn_handle->cache_size < MAX_CACHE_SIZE)
         return 0;
 #endif
 
-    now = ts(ns);
+    mean = conn_handle->cache_size / 2;
+    if (mean < 2) return 0;
 
-    // Find the mean of the recorded times that items stayed in cache.
-    mean = 0;
-    for (i = 0; i < 64; i++)
-        mean += (conn_handle->histogram[i] * i);
-    if (mean > 0)
-        mean /= conn_handle->histogram_count;
-
-    // Clear out the histogram and hit/misses
-    memset(conn_handle->histogram, 0, sizeof(uint64_t) * 64);
-    conn_handle->histogram_count = 0;
-
-    /*
-     * Evict anything older than the mean time in queue by removing those
-     * items from the lists stored in the tree.
-     */
     num_evicted = 0;
-    c = STAILQ_FIRST(&conn_handle->cache);
-    while (c != NULL) {
-        n = STAILQ_NEXT(c, entries);
-        elapsed = c->tstamp - now;
-        log = __log2(elapsed);
-        if (log > mean) {
+    while (mean--) {
+        c = STAILQ_LAST(&conn_handle->cache, wterl_ctx, entries);
+        if (c) {
             STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries);
-            DPRINTF("evicting: %llu", PRIuint64(c->sig));
             if (c->session)
                 c->session->close(c->session, NULL);
             enif_free(c);
             num_evicted++;
         }
-        c = n;
     }
     conn_handle->cache_size -= num_evicted;
     return num_evicted;
@@ -259,8 +234,6 @@ __ctx_cache_find(WterlConnHandle *conn_handle, const uint64_t sig)
         if (c->sig == sig) { // TODO: hash collisions *will* lead to SEGVs
             // cache hit:
             STAILQ_REMOVE(&conn_handle->cache, c, wterl_ctx, entries);
-            conn_handle->histogram[__log2(ts(ns) - c->tstamp)]++;
-            conn_handle->histogram_count++;
             conn_handle->cache_size -= 1;
             break;
         }
@@ -282,7 +255,6 @@ __ctx_cache_add(WterlConnHandle *conn_handle, struct wterl_ctx *c)
 {
     enif_mutex_lock(conn_handle->cache_mutex);
     __ctx_cache_evict(conn_handle);
-    c->tstamp = ts(ns);
     STAILQ_INSERT_TAIL(&conn_handle->cache, c, entries);
     conn_handle->cache_size += 1;
 #ifdef DEBUG