Compare commits

..

No commits in common. "master" and "gsb-rename-as-lmdb" have entirely different histories.

17 changed files with 3414 additions and 7827 deletions

1
.gitignore vendored
View file

@ -5,4 +5,3 @@ c_src/*.o
deps/ deps/
priv/ priv/
*~ *~
.rebar

View file

@ -4,16 +4,18 @@
* Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved. * Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved.
* Author: Gregory Burd <greg@basho.com> <greg@burd.me> * Author: Gregory Burd <greg@basho.com> <greg@burd.me>
* *
* This file is provided to you under the Apache License, Version 2.0 (the * This file is provided to you under the Apache License,
* "License"); you may not use this file except in compliance with the License. * Version 2.0 (the "License"); you may not use this file
* You may obtain a copy of the License at: * except in compliance with the License. You may obtain
* a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing,
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * software distributed under the License is distributed on an
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* License for the specific language governing permissions and limitations * KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License. * under the License.
*/ */
@ -25,26 +27,18 @@ extern "C" {
#endif #endif
#include <assert.h> #include <assert.h>
#include "fifo_q.h"
#include "stats.h"
#include "queue.h" #ifndef __UNUSED
#define __UNUSED(v) ((void)(v))
#ifndef UNUSED
#define UNUSED(v) ((void)(v))
#endif #endif
#define ASYNC_NIF_MAX_WORKERS 1024 #define ASYNC_NIF_MAX_WORKERS 128
#define ASYNC_NIF_MIN_WORKERS 2 #define ASYNC_NIF_WORKER_QUEUE_SIZE 500
#define ASYNC_NIF_WORKER_QUEUE_SIZE 8192 #define ASYNC_NIF_MAX_QUEUED_REQS 1000 * ASYNC_NIF_MAX_WORKERS
#define ASYNC_NIF_MAX_QUEUED_REQS ASYNC_NIF_WORKER_QUEUE_SIZE * ASYNC_NIF_MAX_WORKERS
/* Atoms (initialized in on_load) */
static ERL_NIF_TERM ATOM_EAGAIN;
static ERL_NIF_TERM ATOM_ENOMEM;
static ERL_NIF_TERM ATOM_ENQUEUED;
static ERL_NIF_TERM ATOM_ERROR;
static ERL_NIF_TERM ATOM_OK;
static ERL_NIF_TERM ATOM_SHUTDOWN;
STAT_DECL(qwait, 1000);
struct async_nif_req_entry { struct async_nif_req_entry {
ERL_NIF_TERM ref; ERL_NIF_TERM ref;
@ -53,17 +47,14 @@ struct async_nif_req_entry {
void *args; void *args;
void (*fn_work)(ErlNifEnv*, ERL_NIF_TERM, ErlNifPid*, unsigned int, void *); void (*fn_work)(ErlNifEnv*, ERL_NIF_TERM, ErlNifPid*, unsigned int, void *);
void (*fn_post)(void *); void (*fn_post)(void *);
STAILQ_ENTRY(async_nif_req_entry) entries;
}; };
DECL_FIFO_QUEUE(reqs, struct async_nif_req_entry);
struct async_nif_work_queue { struct async_nif_work_queue {
unsigned int num_workers; STAT_DEF(qwait);
unsigned int depth;
ErlNifMutex *reqs_mutex; ErlNifMutex *reqs_mutex;
ErlNifCond *reqs_cnd; ErlNifCond *reqs_cnd;
struct async_nif_work_queue *next; FIFO_QUEUE_TYPE(reqs) reqs;
STAILQ_HEAD(reqs, async_nif_req_entry) reqs;
}; };
struct async_nif_worker_entry { struct async_nif_worker_entry {
@ -71,17 +62,16 @@ struct async_nif_worker_entry {
unsigned int worker_id; unsigned int worker_id;
struct async_nif_state *async_nif; struct async_nif_state *async_nif;
struct async_nif_work_queue *q; struct async_nif_work_queue *q;
SLIST_ENTRY(async_nif_worker_entry) entries;
}; };
struct async_nif_state { struct async_nif_state {
STAT_DEF(qwait);
unsigned int shutdown; unsigned int shutdown;
ErlNifMutex *we_mutex; unsigned int num_workers;
unsigned int we_active; struct async_nif_worker_entry worker_entries[ASYNC_NIF_MAX_WORKERS];
SLIST_HEAD(joining, async_nif_worker_entry) we_joining;
unsigned int num_queues; unsigned int num_queues;
unsigned int next_q; unsigned int next_q;
STAILQ_HEAD(recycled_reqs, async_nif_req_entry) recycled_reqs; FIFO_QUEUE_TYPE(reqs) recycled_reqs;
unsigned int num_reqs; unsigned int num_reqs;
ErlNifMutex *recycled_req_mutex; ErlNifMutex *recycled_req_mutex;
struct async_nif_work_queue queues[]; struct async_nif_work_queue queues[];
@ -90,43 +80,39 @@ struct async_nif_state {
#define ASYNC_NIF_DECL(decl, frame, pre_block, work_block, post_block) \ #define ASYNC_NIF_DECL(decl, frame, pre_block, work_block, post_block) \
struct decl ## _args frame; \ struct decl ## _args frame; \
static void fn_work_ ## decl (ErlNifEnv *env, ERL_NIF_TERM ref, ErlNifPid *pid, unsigned int worker_id, struct decl ## _args *args) { \ static void fn_work_ ## decl (ErlNifEnv *env, ERL_NIF_TERM ref, ErlNifPid *pid, unsigned int worker_id, struct decl ## _args *args) { \
UNUSED(worker_id); \ __UNUSED(worker_id); \
DPRINTF("async_nif: calling \"%s\"", __func__); \
do work_block while(0); \ do work_block while(0); \
DPRINTF("async_nif: returned from \"%s\"", __func__); \
} \ } \
static void fn_post_ ## decl (struct decl ## _args *args) { \ static void fn_post_ ## decl (struct decl ## _args *args) { \
UNUSED(args); \ __UNUSED(args); \
DPRINTF("async_nif: calling \"fn_post_%s\"", #decl); \
do post_block while(0); \ do post_block while(0); \
DPRINTF("async_nif: returned from \"fn_post_%s\"", #decl); \
} \ } \
static ERL_NIF_TERM decl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv_in[]) { \ static ERL_NIF_TERM decl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv_in[]) { \
struct decl ## _args on_stack_args; \ struct decl ## _args on_stack_args; \
struct decl ## _args *args = &on_stack_args; \ struct decl ## _args *args = &on_stack_args; \
struct decl ## _args *copy_of_args; \ struct decl ## _args *copy_of_args; \
struct async_nif_req_entry *req = NULL; \ struct async_nif_req_entry *req = NULL; \
unsigned int affinity = 0; \ const char *affinity = NULL; \
ErlNifEnv *new_env = NULL; \ ErlNifEnv *new_env = NULL; \
/* argv[0] is a ref used for selective recv */ \ /* argv[0] is a ref used for selective recv */ \
const ERL_NIF_TERM *argv = argv_in + 1; \ const ERL_NIF_TERM *argv = argv_in + 1; \
argc -= 1; \ argc -= 1; \
/* Note: !!! this assumes that the first element of priv_data is ours */ \ /* Note: !!! this assumes that the first element of priv_data is ours */ \
struct async_nif_state *async_nif = *(struct async_nif_state**)enif_priv_data(env); \ struct async_nif_state *async_nif = *(struct async_nif_state**)enif_priv_data(env); \
if (async_nif->shutdown) \ if (async_nif->shutdown) \
return enif_make_tuple2(env, ATOM_ERROR, ATOM_SHUTDOWN); \ return enif_make_tuple2(env, enif_make_atom(env, "error"), \
enif_make_atom(env, "shutdown")); \
req = async_nif_reuse_req(async_nif); \ req = async_nif_reuse_req(async_nif); \
if (!req) \
return enif_make_tuple2(env, ATOM_ERROR, ATOM_ENOMEM); \
new_env = req->env; \ new_env = req->env; \
DPRINTF("async_nif: calling \"%s\"", __func__); \ if (!req) \
return enif_make_tuple2(env, enif_make_atom(env, "error"), \
enif_make_atom(env, "eagain")); \
do pre_block while(0); \ do pre_block while(0); \
DPRINTF("async_nif: returned from \"%s\"", __func__); \ copy_of_args = (struct decl ## _args *)enif_alloc(sizeof(struct decl ## _args)); \
copy_of_args = (struct decl ## _args *)malloc(sizeof(struct decl ## _args)); \
if (!copy_of_args) { \ if (!copy_of_args) { \
fn_post_ ## decl (args); \ fn_post_ ## decl (args); \
async_nif_recycle_req(req, async_nif); \ return enif_make_tuple2(env, enif_make_atom(env, "error"), \
return enif_make_tuple2(env, ATOM_ERROR, ATOM_ENOMEM); \ enif_make_atom(env, "enomem")); \
} \ } \
memcpy(copy_of_args, args, sizeof(struct decl ## _args)); \ memcpy(copy_of_args, args, sizeof(struct decl ## _args)); \
req->ref = enif_make_copy(new_env, argv_in[0]); \ req->ref = enif_make_copy(new_env, argv_in[0]); \
@ -136,13 +122,13 @@ struct async_nif_state {
req->fn_post = (void (*)(void *))fn_post_ ## decl; \ req->fn_post = (void (*)(void *))fn_post_ ## decl; \
int h = -1; \ int h = -1; \
if (affinity) \ if (affinity) \
h = ((unsigned int)affinity) % async_nif->num_queues; \ h = async_nif_str_hash_func(affinity) % async_nif->num_queues; \
ERL_NIF_TERM reply = async_nif_enqueue_req(async_nif, req, h); \ ERL_NIF_TERM reply = async_nif_enqueue_req(async_nif, req, h); \
if (!reply) { \ if (!reply) { \
fn_post_ ## decl (args); \ fn_post_ ## decl (args); \
async_nif_recycle_req(req, async_nif); \ enif_free(copy_of_args); \
free(copy_of_args); \ return enif_make_tuple2(env, enif_make_atom(env, "error"), \
return enif_make_tuple2(env, ATOM_ERROR, ATOM_EAGAIN); \ enif_make_atom(env, "shutdown")); \
} \ } \
return reply; \ return reply; \
} }
@ -150,16 +136,16 @@ struct async_nif_state {
#define ASYNC_NIF_INIT(name) \ #define ASYNC_NIF_INIT(name) \
static ErlNifMutex *name##_async_nif_coord = NULL; static ErlNifMutex *name##_async_nif_coord = NULL;
#define ASYNC_NIF_LOAD(name, env, priv) do { \ #define ASYNC_NIF_LOAD(name, priv) do { \
if (!name##_async_nif_coord) \ if (!name##_async_nif_coord) \
name##_async_nif_coord = enif_mutex_create("nif_coord load"); \ name##_async_nif_coord = enif_mutex_create(NULL); \
enif_mutex_lock(name##_async_nif_coord); \ enif_mutex_lock(name##_async_nif_coord); \
priv = async_nif_load(env); \ priv = async_nif_load(); \
enif_mutex_unlock(name##_async_nif_coord); \ enif_mutex_unlock(name##_async_nif_coord); \
} while(0); } while(0);
#define ASYNC_NIF_UNLOAD(name, env, priv) do { \ #define ASYNC_NIF_UNLOAD(name, env, priv) do { \
if (!name##_async_nif_coord) \ if (!name##_async_nif_coord) \
name##_async_nif_coord = enif_mutex_create("nif_coord unload"); \ name##_async_nif_coord = enif_mutex_create(NULL); \
enif_mutex_lock(name##_async_nif_coord); \ enif_mutex_lock(name##_async_nif_coord); \
async_nif_unload(env, priv); \ async_nif_unload(env, priv); \
enif_mutex_unlock(name##_async_nif_coord); \ enif_mutex_unlock(name##_async_nif_coord); \
@ -168,7 +154,7 @@ struct async_nif_state {
} while(0); } while(0);
#define ASYNC_NIF_UPGRADE(name, env) do { \ #define ASYNC_NIF_UPGRADE(name, env) do { \
if (!name##_async_nif_coord) \ if (!name##_async_nif_coord) \
name##_async_nif_coord = enif_mutex_create("nif_coord upgrade"); \ name##_async_nif_coord = enif_mutex_create(NULL); \
enif_mutex_lock(name##_async_nif_coord); \ enif_mutex_lock(name##_async_nif_coord); \
async_nif_upgrade(env); \ async_nif_upgrade(env); \
enif_mutex_unlock(name##_async_nif_coord); \ enif_mutex_unlock(name##_async_nif_coord); \
@ -193,26 +179,26 @@ async_nif_reuse_req(struct async_nif_state *async_nif)
ErlNifEnv *env = NULL; ErlNifEnv *env = NULL;
enif_mutex_lock(async_nif->recycled_req_mutex); enif_mutex_lock(async_nif->recycled_req_mutex);
if (STAILQ_EMPTY(&async_nif->recycled_reqs)) { if (fifo_q_empty(reqs, async_nif->recycled_reqs)) {
if (async_nif->num_reqs < ASYNC_NIF_MAX_QUEUED_REQS) { if (async_nif->num_reqs < ASYNC_NIF_MAX_QUEUED_REQS) {
req = malloc(sizeof(struct async_nif_req_entry)); req = enif_alloc(sizeof(struct async_nif_req_entry));
if (req) { if (req) {
memset(req, 0, sizeof(struct async_nif_req_entry)); memset(req, 0, sizeof(struct async_nif_req_entry));
env = enif_alloc_env(); env = enif_alloc_env();
if (env) { if (!env) {
req->env = env; enif_free(req);
__sync_fetch_and_add(&async_nif->num_reqs, 1);
} else {
free(req);
req = NULL; req = NULL;
} else {
req->env = env;
async_nif->num_reqs++;
} }
} }
} }
} else { } else {
req = STAILQ_FIRST(&async_nif->recycled_reqs); req = fifo_q_get(reqs, async_nif->recycled_reqs);
STAILQ_REMOVE(&async_nif->recycled_reqs, req, async_nif_req_entry, entries);
} }
enif_mutex_unlock(async_nif->recycled_req_mutex); enif_mutex_unlock(async_nif->recycled_req_mutex);
STAT_TICK(async_nif, qwait);
return req; return req;
} }
@ -226,59 +212,27 @@ async_nif_reuse_req(struct async_nif_state *async_nif)
void void
async_nif_recycle_req(struct async_nif_req_entry *req, struct async_nif_state *async_nif) async_nif_recycle_req(struct async_nif_req_entry *req, struct async_nif_state *async_nif)
{ {
ErlNifEnv *env = NULL; STAT_TOCK(async_nif, qwait);
enif_mutex_lock(async_nif->recycled_req_mutex); enif_mutex_lock(async_nif->recycled_req_mutex);
enif_clear_env(req->env); fifo_q_put(reqs, async_nif->recycled_reqs, req);
env = req->env;
memset(req, 0, sizeof(struct async_nif_req_entry));
req->env = env;
STAILQ_INSERT_TAIL(&async_nif->recycled_reqs, req, entries);
enif_mutex_unlock(async_nif->recycled_req_mutex); enif_mutex_unlock(async_nif->recycled_req_mutex);
} }
static void *async_nif_worker_fn(void *);
/** /**
* Start up a worker thread. * A string hash function.
*
* A basic hash function for strings of characters used during the
* affinity association.
*
* s a NULL terminated set of bytes to be hashed
* -> an integer hash encoding of the bytes
*/ */
static int static inline unsigned int
async_nif_start_worker(struct async_nif_state *async_nif, struct async_nif_work_queue *q) async_nif_str_hash_func(const char *s)
{ {
struct async_nif_worker_entry *we; unsigned int h = (unsigned int)*s;
if (h) for (++s ; *s; ++s) h = (h << 5) - h + (unsigned int)*s;
if (0 == q) return h;
return EINVAL;
enif_mutex_lock(async_nif->we_mutex);
we = SLIST_FIRST(&async_nif->we_joining);
while(we != NULL) {
struct async_nif_worker_entry *n = SLIST_NEXT(we, entries);
SLIST_REMOVE(&async_nif->we_joining, we, async_nif_worker_entry, entries);
void *exit_value = 0; /* We ignore the thread_join's exit value. */
enif_thread_join(we->tid, &exit_value);
free(we);
async_nif->we_active--;
we = n;
}
if (async_nif->we_active == ASYNC_NIF_MAX_WORKERS) {
enif_mutex_unlock(async_nif->we_mutex);
return EAGAIN;
}
we = malloc(sizeof(struct async_nif_worker_entry));
if (!we) {
enif_mutex_unlock(async_nif->we_mutex);
return ENOMEM;
}
memset(we, 0, sizeof(struct async_nif_worker_entry));
we->worker_id = async_nif->we_active++;
we->async_nif = async_nif;
we->q = q;
enif_mutex_unlock(async_nif->we_mutex);
return enif_thread_create(NULL,&we->tid, &async_nif_worker_fn, (void*)we, 0);
} }
/** /**
@ -291,9 +245,9 @@ static ERL_NIF_TERM
async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_entry *req, int hint) async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_entry *req, int hint)
{ {
/* Identify the most appropriate worker for this request. */ /* Identify the most appropriate worker for this request. */
unsigned int i, last_qid, qid = 0; unsigned int qid = 0;
struct async_nif_work_queue *q = NULL; struct async_nif_work_queue *q = NULL;
double avg_depth = 0.0; unsigned int n = async_nif->num_queues;
/* Either we're choosing a queue based on some affinity/hinted value or we /* Either we're choosing a queue based on some affinity/hinted value or we
need to select the next queue in the rotation and atomically update that need to select the next queue in the rotation and atomically update that
@ -301,76 +255,53 @@ async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_en
if (hint >= 0) { if (hint >= 0) {
qid = (unsigned int)hint; qid = (unsigned int)hint;
} else { } else {
do { qid = async_nif->next_q;
last_qid = __sync_fetch_and_add(&async_nif->next_q, 0); qid = (qid + 1) % async_nif->num_queues;
qid = (last_qid + 1) % async_nif->num_queues; async_nif->next_q = qid;
} while (!__sync_bool_compare_and_swap(&async_nif->next_q, last_qid, qid));
} }
/* Now we inspect and iterate across the set of queues trying to select one /* Now we inspect and iterate across the set of queues trying to select one
that isn't too full or too slow. */ that isn't too full or too slow. */
for (i = 0; i < async_nif->num_queues; i++) { do {
/* Compute the average queue depth not counting queues which are empty or
the queue we're considering right now. */
unsigned int j, n = 0;
for (j = 0; j < async_nif->num_queues; j++) {
if (j != qid && async_nif->queues[j].depth != 0) {
n++;
avg_depth += async_nif->queues[j].depth;
}
}
if (avg_depth) avg_depth /= n;
/* Lock this queue under consideration, then check for shutdown. While
we hold this lock either a) we're shutting down so exit now or b) this
queue will be valid until we release the lock. */
q = &async_nif->queues[qid]; q = &async_nif->queues[qid];
enif_mutex_lock(q->reqs_mutex); enif_mutex_lock(q->reqs_mutex);
/* Try not to enqueue a request into a queue that isn't keeping up with /* Now that we hold the lock, check for shutdown. As long as we hold
the request volume. */ this lock either a) we're shutting down so exit now or b) this queue
if (q->depth <= avg_depth) break; will be valid until we release the lock. */
else { if (async_nif->shutdown) {
enif_mutex_unlock(q->reqs_mutex);
return 0;
}
double await = STAT_MEAN_LOG2_SAMPLE(async_nif, qwait);
double await_inthisq = STAT_MEAN_LOG2_SAMPLE(q, qwait);
if (fifo_q_full(reqs, q->reqs) || await_inthisq > await) {
enif_mutex_unlock(q->reqs_mutex); enif_mutex_unlock(q->reqs_mutex);
qid = (qid + 1) % async_nif->num_queues; qid = (qid + 1) % async_nif->num_queues;
q = &async_nif->queues[qid];
} else {
break;
} }
} // TODO: at some point add in work shedding/stealing
} while(n-- > 0);
/* If the for loop finished then we didn't find a suitable queue for this /* We hold the queue's lock, and we've selected a reasonable queue for this
request, meaning we're backed up so trigger eagain. Note that if we left new request so add the request. */
the loop in this way we hold no lock. */ STAT_TICK(q, qwait);
if (i == async_nif->num_queues) return 0; fifo_q_put(reqs, q->reqs, req);
/* Add the request to the queue. */
STAILQ_INSERT_TAIL(&q->reqs, req, entries);
__sync_fetch_and_add(&q->depth, 1);
/* We've selected a queue for this new request now check to make sure there are
enough workers actively processing requests on this queue. */
while (q->depth > q->num_workers) {
switch(async_nif_start_worker(async_nif, q)) {
case EINVAL: case ENOMEM: default: return 0;
case EAGAIN: continue;
case 0: __sync_fetch_and_add(&q->num_workers, 1); goto done;
}
}done:;
/* Build the term before releasing the lock so as not to race on the use of /* Build the term before releasing the lock so as not to race on the use of
the req pointer (which will soon become invalid in another thread the req pointer (which will soon become invalid in another thread
performing the request). */ performing the request). */
double pct_full = (double)avg_depth / (double)ASYNC_NIF_WORKER_QUEUE_SIZE; ERL_NIF_TERM reply = enif_make_tuple2(req->env, enif_make_atom(req->env, "ok"),
ERL_NIF_TERM reply = enif_make_tuple2(req->env, ATOM_OK, enif_make_atom(req->env, "enqueued"));
enif_make_tuple2(req->env, ATOM_ENQUEUED,
enif_make_double(req->env, pct_full)));
enif_cond_signal(q->reqs_cnd);
enif_mutex_unlock(q->reqs_mutex); enif_mutex_unlock(q->reqs_mutex);
enif_cond_signal(q->reqs_cnd);
return reply; return reply;
} }
/** /**
* Worker threads execute this function. Here each worker pulls requests off * TODO:
* their respective queues, executes that work and continues doing that until
* they see the shutdown flag is set at which point they exit.
*/ */
static void * static void *
async_nif_worker_fn(void *arg) async_nif_worker_fn(void *arg)
@ -380,7 +311,6 @@ async_nif_worker_fn(void *arg)
struct async_nif_state *async_nif = we->async_nif; struct async_nif_state *async_nif = we->async_nif;
struct async_nif_work_queue *q = we->q; struct async_nif_work_queue *q = we->q;
struct async_nif_req_entry *req = NULL; struct async_nif_req_entry *req = NULL;
unsigned int tries = async_nif->num_queues;
for(;;) { for(;;) {
/* Examine the request queue, are there things to be done? */ /* Examine the request queue, are there things to be done? */
@ -390,40 +320,26 @@ async_nif_worker_fn(void *arg)
enif_mutex_unlock(q->reqs_mutex); enif_mutex_unlock(q->reqs_mutex);
break; break;
} }
if (STAILQ_EMPTY(&q->reqs)) { if (fifo_q_empty(reqs, q->reqs)) {
/* Queue is empty so we wait for more work to arrive. */ /* Queue is empty so we wait for more work to arrive. */
enif_mutex_unlock(q->reqs_mutex); STAT_RESET(q, qwait);
if (tries == 0 && q == we->q) { enif_cond_wait(q->reqs_cnd, q->reqs_mutex);
if (q->num_workers > ASYNC_NIF_MIN_WORKERS) { goto check_again_for_work;
/* At this point we've tried to find/execute work on all queues
* and there are at least MIN_WORKERS on this queue so we
* leaving this loop (break) which leads to a thread exit/join. */
break;
} else {
enif_mutex_lock(q->reqs_mutex);
enif_cond_wait(q->reqs_cnd, q->reqs_mutex);
goto check_again_for_work;
}
} else {
tries--;
__sync_fetch_and_add(&q->num_workers, -1);
q = q->next;
__sync_fetch_and_add(&q->num_workers, 1);
continue; // try next queue
}
} else { } else {
assert(fifo_q_size(reqs, q->reqs) > 0);
assert(fifo_q_size(reqs, q->reqs) < fifo_q_capacity(reqs, q->reqs));
/* At this point the next req is ours to process and we hold the /* At this point the next req is ours to process and we hold the
reqs_mutex lock. Take the request off the queue. */ reqs_mutex lock. Take the request off the queue. */
req = STAILQ_FIRST(&q->reqs); req = fifo_q_get(reqs, q->reqs);
STAILQ_REMOVE(&q->reqs, req, async_nif_req_entry, entries);
__sync_fetch_and_add(&q->depth, -1);
/* Wake up other worker thread watching this queue to help process work. */
enif_cond_signal(q->reqs_cnd);
enif_mutex_unlock(q->reqs_mutex); enif_mutex_unlock(q->reqs_mutex);
/* Ensure that there is at least one other worker thread watching this
queue. */
enif_cond_signal(q->reqs_cnd);
/* Perform the work. */ /* Perform the work. */
req->fn_work(req->env, req->ref, &req->pid, worker_id, req->args); req->fn_work(req->env, req->ref, &req->pid, worker_id, req->args);
STAT_TOCK(q, qwait);
/* Now call the post-work cleanup function. */ /* Now call the post-work cleanup function. */
req->fn_post(req->args); req->fn_post(req->args);
@ -432,16 +348,13 @@ async_nif_worker_fn(void *arg)
req->ref = 0; req->ref = 0;
req->fn_work = 0; req->fn_work = 0;
req->fn_post = 0; req->fn_post = 0;
free(req->args); enif_free(req->args);
req->args = NULL; req->args = NULL;
enif_clear_env(req->env);
async_nif_recycle_req(req, async_nif); async_nif_recycle_req(req, async_nif);
req = NULL; req = NULL;
} }
} }
enif_mutex_lock(async_nif->we_mutex);
SLIST_INSERT_HEAD(&async_nif->we_joining, we, entries);
enif_mutex_unlock(async_nif->we_mutex);
__sync_fetch_and_add(&q->num_workers, -1);
enif_thread_exit(0); enif_thread_exit(0);
return 0; return 0;
} }
@ -453,44 +366,41 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif)
unsigned int num_queues = async_nif->num_queues; unsigned int num_queues = async_nif->num_queues;
struct async_nif_work_queue *q = NULL; struct async_nif_work_queue *q = NULL;
struct async_nif_req_entry *req = NULL; struct async_nif_req_entry *req = NULL;
struct async_nif_worker_entry *we = NULL; __UNUSED(env);
UNUSED(env);
/* Signal the worker threads, stop what you're doing and exit. To ensure STAT_PRINT(async_nif, qwait, "wterl");
that we don't race with the enqueue() process we first lock all the worker
queues, then set shutdown to true, then unlock. The enqueue function will /* Signal the worker threads, stop what you're doing and exit. To
take the queue mutex, then test for shutdown condition, then enqueue only ensure that we don't race with the enqueue() process we first
if not shutting down. */ lock all the worker queues, then set shutdown to true, then
unlock. The enqueue function will take the queue mutex, then
test for shutdown condition, then enqueue only if not shutting
down. */
for (i = 0; i < num_queues; i++) { for (i = 0; i < num_queues; i++) {
q = &async_nif->queues[i]; q = &async_nif->queues[i];
enif_mutex_lock(q->reqs_mutex); enif_mutex_lock(q->reqs_mutex);
} }
/* Set the shutdown flag so that worker threads will not continue
executing requests. */
async_nif->shutdown = 1; async_nif->shutdown = 1;
for (i = 0; i < num_queues; i++) { for (i = 0; i < num_queues; i++) {
q = &async_nif->queues[i]; q = &async_nif->queues[i];
enif_cond_broadcast(q->reqs_cnd);
enif_mutex_unlock(q->reqs_mutex); enif_mutex_unlock(q->reqs_mutex);
} }
/* Join for the now exiting worker threads. */ /* Join for the now exiting worker threads. */
while(async_nif->we_active > 0) { for (i = 0; i < async_nif->num_workers; ++i) {
for (i = 0; i < num_queues; i++) void *exit_value = 0; /* We ignore the thread_join's exit value. */
enif_cond_broadcast(async_nif->queues[i].reqs_cnd); enif_thread_join(async_nif->worker_entries[i].tid, &exit_value);
enif_mutex_lock(async_nif->we_mutex);
we = SLIST_FIRST(&async_nif->we_joining);
while(we != NULL) {
struct async_nif_worker_entry *n = SLIST_NEXT(we, entries);
SLIST_REMOVE(&async_nif->we_joining, we, async_nif_worker_entry, entries);
void *exit_value = 0; /* We ignore the thread_join's exit value. */
enif_thread_join(we->tid, &exit_value);
free(we);
async_nif->we_active--;
we = n;
}
enif_mutex_unlock(async_nif->we_mutex);
} }
enif_mutex_destroy(async_nif->we_mutex);
/* Free req structures sitting on the recycle queue. */
enif_mutex_lock(async_nif->recycled_req_mutex);
req = NULL;
fifo_q_foreach(reqs, async_nif->recycled_reqs, req, {
enif_free_env(req->env);
enif_free(req);
});
fifo_q_free(reqs, async_nif->recycled_reqs);
/* Cleanup in-flight requests, mutexes and conditions in each work queue. */ /* Cleanup in-flight requests, mutexes and conditions in each work queue. */
for (i = 0; i < num_queues; i++) { for (i = 0; i < num_queues; i++) {
@ -498,44 +408,32 @@ async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif)
/* Worker threads are stopped, now toss anything left in the queue. */ /* Worker threads are stopped, now toss anything left in the queue. */
req = NULL; req = NULL;
req = STAILQ_FIRST(&q->reqs); fifo_q_foreach(reqs, q->reqs, req, {
while(req != NULL) {
struct async_nif_req_entry *n = STAILQ_NEXT(req, entries);
enif_clear_env(req->env); enif_clear_env(req->env);
enif_send(NULL, &req->pid, req->env, enif_send(NULL, &req->pid, req->env,
enif_make_tuple2(req->env, ATOM_ERROR, ATOM_SHUTDOWN)); enif_make_tuple2(req->env, enif_make_atom(req->env, "error"),
enif_make_atom(req->env, "shutdown")));
req->fn_post(req->args); req->fn_post(req->args);
enif_free_env(req->env); enif_free_env(req->env);
free(req->args); enif_free(req->args);
free(req); enif_free(req);
req = n; });
} fifo_q_free(reqs, q->reqs);
enif_mutex_destroy(q->reqs_mutex); enif_mutex_destroy(q->reqs_mutex);
enif_cond_destroy(q->reqs_cnd); enif_cond_destroy(q->reqs_cnd);
} }
/* Free any req structures sitting unused on the recycle queue. */
enif_mutex_lock(async_nif->recycled_req_mutex);
req = NULL;
req = STAILQ_FIRST(&async_nif->recycled_reqs);
while(req != NULL) {
struct async_nif_req_entry *n = STAILQ_NEXT(req, entries);
enif_free_env(req->env);
free(req);
req = n;
}
enif_mutex_unlock(async_nif->recycled_req_mutex); enif_mutex_unlock(async_nif->recycled_req_mutex);
enif_mutex_destroy(async_nif->recycled_req_mutex); enif_mutex_destroy(async_nif->recycled_req_mutex);
memset(async_nif, 0, sizeof(struct async_nif_state) + (sizeof(struct async_nif_work_queue) * async_nif->num_queues)); memset(async_nif, 0, sizeof(struct async_nif_state) + (sizeof(struct async_nif_work_queue) * async_nif->num_queues));
free(async_nif); enif_free(async_nif);
} }
static void * static void *
async_nif_load(ErlNifEnv *env) async_nif_load()
{ {
static int has_init = 0; static int has_init = 0;
unsigned int i, num_queues; unsigned int i, j, num_queues;
ErlNifSysInfo info; ErlNifSysInfo info;
struct async_nif_state *async_nif; struct async_nif_state *async_nif;
@ -543,14 +441,6 @@ async_nif_load(ErlNifEnv *env)
if (has_init) return 0; if (has_init) return 0;
else has_init = 1; else has_init = 1;
/* Init some static references to commonly used atoms. */
ATOM_EAGAIN = enif_make_atom(env, "eagain");
ATOM_ENOMEM = enif_make_atom(env, "enomem");
ATOM_ENQUEUED = enif_make_atom(env, "enqueued");
ATOM_ERROR = enif_make_atom(env, "error");
ATOM_OK = enif_make_atom(env, "ok");
ATOM_SHUTDOWN = enif_make_atom(env, "shutdown");
/* Find out how many schedulers there are. */ /* Find out how many schedulers there are. */
enif_system_info(&info, sizeof(ErlNifSysInfo)); enif_system_info(&info, sizeof(ErlNifSysInfo));
@ -568,28 +458,62 @@ async_nif_load(ErlNifEnv *env)
} }
/* Init our portion of priv_data's module-specific state. */ /* Init our portion of priv_data's module-specific state. */
async_nif = malloc(sizeof(struct async_nif_state) + async_nif = enif_alloc(sizeof(struct async_nif_state) +
sizeof(struct async_nif_work_queue) * num_queues); sizeof(struct async_nif_work_queue) * num_queues);
if (!async_nif) if (!async_nif)
return NULL; return NULL;
memset(async_nif, 0, sizeof(struct async_nif_state) + memset(async_nif, 0, sizeof(struct async_nif_state) +
sizeof(struct async_nif_work_queue) * num_queues); sizeof(struct async_nif_work_queue) * num_queues);
async_nif->num_queues = num_queues; async_nif->num_queues = num_queues;
async_nif->we_active = 0; async_nif->num_workers = 2 * num_queues;
async_nif->next_q = 0; async_nif->next_q = 0;
async_nif->shutdown = 0; async_nif->shutdown = 0;
STAILQ_INIT(&async_nif->recycled_reqs); async_nif->recycled_reqs = fifo_q_new(reqs, ASYNC_NIF_MAX_QUEUED_REQS);
async_nif->recycled_req_mutex = enif_mutex_create("recycled_req"); async_nif->recycled_req_mutex = enif_mutex_create(NULL);
async_nif->we_mutex = enif_mutex_create("we"); STAT_INIT(async_nif, qwait);
SLIST_INIT(&async_nif->we_joining);
for (i = 0; i < async_nif->num_queues; i++) { for (i = 0; i < async_nif->num_queues; i++) {
struct async_nif_work_queue *q = &async_nif->queues[i]; struct async_nif_work_queue *q = &async_nif->queues[i];
STAILQ_INIT(&q->reqs); q->reqs = fifo_q_new(reqs, ASYNC_NIF_WORKER_QUEUE_SIZE);
q->reqs_mutex = enif_mutex_create("reqs"); q->reqs_mutex = enif_mutex_create(NULL);
q->reqs_cnd = enif_cond_create("reqs"); q->reqs_cnd = enif_cond_create(NULL);
q->next = &async_nif->queues[(i + 1) % num_queues]; STAT_INIT(q, qwait);
}
/* Setup the thread pool management. */
memset(async_nif->worker_entries, 0, sizeof(struct async_nif_worker_entry) * ASYNC_NIF_MAX_WORKERS);
/* Start the worker threads. */
for (i = 0; i < async_nif->num_workers; i++) {
struct async_nif_worker_entry *we = &async_nif->worker_entries[i];
we->async_nif = async_nif;
we->worker_id = i;
we->q = &async_nif->queues[i % async_nif->num_queues];
if (enif_thread_create(NULL, &async_nif->worker_entries[i].tid,
&async_nif_worker_fn, (void*)we, NULL) != 0) {
async_nif->shutdown = 1;
for (j = 0; j < async_nif->num_queues; j++) {
struct async_nif_work_queue *q = &async_nif->queues[j];
enif_cond_broadcast(q->reqs_cnd);
}
while(i-- > 0) {
void *exit_value = 0; /* Ignore this. */
enif_thread_join(async_nif->worker_entries[i].tid, &exit_value);
}
for (j = 0; j < async_nif->num_queues; j++) {
struct async_nif_work_queue *q = &async_nif->queues[j];
enif_mutex_destroy(q->reqs_mutex);
enif_cond_destroy(q->reqs_cnd);
}
memset(async_nif->worker_entries, 0, sizeof(struct async_nif_worker_entry) * ASYNC_NIF_MAX_WORKERS);
enif_free(async_nif);
return NULL;
}
} }
return async_nif; return async_nif;
} }
@ -597,7 +521,7 @@ async_nif_load(ErlNifEnv *env)
static void static void
async_nif_upgrade(ErlNifEnv *env) async_nif_upgrade(ErlNifEnv *env)
{ {
UNUSED(env); __UNUSED(env);
// TODO: // TODO:
} }

View file

@ -24,28 +24,24 @@
extern "C" { extern "C" {
#endif #endif
#if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__)) #ifdef DEBUG
# undef DEBUG
# define DEBUG 0
# define DPRINTF (void) /* Vararg macros may be unsupported */
#elif DEBUG
#include <stdio.h> #include <stdio.h>
#include <stdarg.h> #include <stdarg.h>
#ifndef DPRINTF
#define DPRINTF(fmt, ...) \ #define DPRINTF(fmt, ...) \
do { \ do { \
fprintf(stderr, "%s:%d " fmt "\n", __FILE__, __LINE__, __VA_ARGS__); \ fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__); \
fflush(stderr); \ fflush(stderr); \
} while(0) } while(0)
#define DPUTS(arg) DPRINTF("%s", arg) #endif
#ifndef DPUTS
#define DPUTS(arg) DPRINTF("%s", arg)
#endif
#else #else
#define DPRINTF(fmt, ...) ((void) 0) #define DPRINTF(fmt, ...) ((void) 0)
#define DPUTS(arg) ((void) 0) #define DPUTS(arg) ((void) 0)
#endif #endif
#ifndef __UNUSED
#define __UNUSED(v) ((void)(v))
#endif
#ifndef COMPQUIET #ifndef COMPQUIET
#define COMPQUIET(n, v) do { \ #define COMPQUIET(n, v) do { \
(n) = (v); \ (n) = (v); \
@ -53,12 +49,11 @@ extern "C" {
} while (0) } while (0)
#endif #endif
#ifdef __APPLE__ #ifndef __UNUSED
#define PRIuint64(x) (x) #define __UNUSED(v) ((void)(v))
#else
#define PRIuint64(x) (unsigned long long)(x)
#endif #endif
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif #endif

98
c_src/duration.h Normal file
View file

@ -0,0 +1,98 @@
/*
* Copyright (C) 2013, all rights reserved by Gregory Burd <greg@burd.me>
*
* This Source Code Form is subject to the terms of the Mozilla Public License,
* version 2 (MPLv2). If a copy of the MPL was not distributed with this file,
* you can obtain one at: http://mozilla.org/MPL/2.0/
*
* NOTES:
* - on some platforms this will require -lrt
*/
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <sys/timeb.h>
/* Units in which ts() can report time.  Each value is also used as the row
 * index into scale[]. */
typedef enum { ns = 0, mcs, ms, s } time_scale;
/* Conversion parameters for one time unit. */
struct scale_time {
const char *abbreviation; /* short unit label, e.g. "ns" */
const char *name; /* long unit name, e.g. "nanosecond" */
/* mul:       factor applied to timespec.tv_sec in ts()
 * div:       divisor applied to timespec.tv_nsec in ts()
 * overhead,
 * ticks_per: only read by elapsed_cpu_clock_ticks(), which is currently
 *            compiled out with #if 0 */
uint64_t mul, div, overhead, ticks_per;
};
/* Per-unit conversion table, indexed by time_scale (ns, mcs, ms, s).
 * Columns: abbreviation, name, mul, div, overhead, ticks_per. */
static const struct scale_time scale[] = {
{ "ns", "nanosecond", 1000000000LL, 1LL, 10, 2300000000000LL },
{ "mcs", "microsecond", 1000000LL, 1000LL, 10, 2300000000LL },
{ "ms", "millisecond", 1000LL, 1000000LL, 10, 2300000LL },
{ "sec", "second", 1LL, 1000000000LL, 10, 2300LL } };
/**
 * ts()
 *
 * Read CLOCK_REALTIME and return it as one integer count of `unit`s: whole
 * seconds multiplied by scale[unit].mul plus the nanosecond part divided by
 * scale[unit].div.
 *
 * Returns 0 if clock_gettime() fails, instead of converting a timespec that
 * was never filled in.
 */
static uint64_t ts(time_scale unit)
{
    struct timespec now;

    if (clock_gettime(CLOCK_REALTIME, &now) != 0)
        return 0;
    return (((uint64_t)now.tv_sec * scale[unit].mul) +
            ((uint64_t)now.tv_nsec / scale[unit].div));
}
#if 0
//if defined(__i386__) || defined(__x86_64__)
/**
 * cpu_clock_ticks()
 *
 * A measure provided by Intel x86 CPUs which provides the number of cycles
 * (aka "ticks") executed as a counter using the RDTSC instruction.
 *
 * Returns the 64-bit value assembled from the two 32-bit halves the asm
 * block leaves in EDX (high word) and EAX (low word).
 *
 * NOTE(review): CPUID is presumably issued first to serialize execution
 * before RDTSC is sampled -- confirm against the Intel SDM.
 */
static inline uint64_t cpu_clock_ticks()
{
uint32_t lo, hi;
__asm__ __volatile__ (
"xorl %%eax, %%eax\n" /* EAX = 0 before cpuid */
"cpuid\n"
"rdtsc\n"
: "=a" (lo), "=d" (hi) /* outputs: EAX -> lo, EDX -> hi */
:
: "%ebx", "%ecx" ); /* declared clobbers */
/* The cast applies to hi before the shift, so the shift is done in 64 bits. */
return (uint64_t)hi << 32 | lo;
}
/**
 * elapsed_cpu_clock_ticks()
 *
 * An approximation of elapsed [ns, mcs, ms, s] from CPU clock ticks.
 *
 * Takes the tick count since `start`, subtracts scale[unit].overhead and
 * multiplies the remainder by scale[unit].ticks_per.
 *
 * NOTE(review): converting ticks to time would normally divide by a
 * ticks-per-unit rate rather than multiply -- verify the formula and the
 * ticks_per values before re-enabling this code.
 */
static uint64_t elapsed_cpu_clock_ticks(uint64_t start, time_scale unit)
{
return (cpu_clock_ticks() - start - scale[unit].overhead) * scale[unit].ticks_per;
}
#endif
/* A running stopwatch: the last ts() reading and the unit it was taken in. */
typedef struct {
uint64_t then; /* reading taken at creation or at the last elapsed() call */
time_scale unit; /* unit passed to ts() for every reading */
} duration_t;
/**
 * elapsed()
 *
 * Report how much time, in d->unit, has passed since d->then, then restart
 * the measurement by storing the fresh reading back into d->then.
 */
static inline uint64_t elapsed(duration_t *d)
{
    const uint64_t current = ts(d->unit);
    const uint64_t delta = current - d->then;

    d->then = current;
    return delta;
}
/* Declare a duration_t called `name`, started at the current time and
 * measuring in `resolution` units.  `resolution` is expanded twice. */
#define DURATION(name, resolution) \
    duration_t name = { ts(resolution), resolution }
/*
 * ELAPSED_DURING(result, resolution, block)
 *
 * Run `block` (a brace-enclosed compound statement) and store the time it
 * took, in `resolution` units, through the pointer `result`.
 *
 * The expansion is one statement with no trailing semicolon; the call site
 * supplies it, e.g. `ELAPSED_DURING(&t, ms, { work(); });`.  That keeps the
 * macro usable as the unbraced body of an if/else.  `result` is parenthesized
 * so that pointer expressions such as `p + 1` dereference as intended.
 */
#define ELAPSED_DURING(result, resolution, block)                       \
    do {                                                                \
        DURATION(elapsed_during_timer_, resolution);                    \
        do block while(0);                                              \
        *(result) = elapsed(&elapsed_during_timer_);                    \
    } while(0)
/*
 * CYCLES_DURING(result, block)
 *
 * Run `block` (a brace-enclosed compound statement) and store the number of
 * cpu_clock_ticks() that passed while it ran through the pointer `result`.
 *
 * The expansion is one statement with no trailing semicolon; the call site
 * supplies it.  `result` is parenthesized so that pointer expressions
 * dereference as intended.
 *
 * NOTE: cpu_clock_ticks() is currently compiled out of this header by an
 * `#if 0`, so this macro cannot be used until that function is re-enabled.
 */
#define CYCLES_DURING(result, block)                                    \
    do {                                                                \
        uint64_t cycles_during_begin_ = cpu_clock_ticks();              \
        do block while(0);                                              \
        *(result) = cpu_clock_ticks() - cycles_during_begin_;           \
    } while(0)

93
c_src/fifo_q.h Normal file
View file

@ -0,0 +1,93 @@
/*
* fifo_q: a macro-based implementation of a FIFO Queue
*
* Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved.
* Author: Gregory Burd <greg@basho.com> <greg@burd.me>
*
* This file is provided to you under the Apache License,
* Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain
* a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#ifndef __FIFO_Q_H__
#define __FIFO_Q_H__
#if defined(__cplusplus)
extern "C" {
#endif
/* Pointer type for the queue named `name`: expands to
 * `struct fifo_q__<name> *`. */
#define FIFO_QUEUE_TYPE(name) \
struct fifo_q__ ## name *
/*
 * DECL_FIFO_QUEUE(name, type)
 *
 * Declares `struct fifo_q__<name>`, a fixed-capacity ring buffer of `type *`
 * pointers, together with its static helper functions fifo_q_<name>_new,
 * _free, _put, _get, _size, _capacity, _empty and _full.
 *
 *   h  index of the slot the next _put writes
 *   t  index of the slot the next _get reads
 *   s  number of slots allocated; one slot is always left unused so that
 *      h == t means "empty" and (h + 1) % s == t means "full"
 *
 * The including file must already provide enif_alloc/enif_free and memset.
 * None of the helpers take a lock.
 *
 * _new(n)    returns a zeroed queue with capacity n, or 0 if allocation
 *            fails or n + 1 does not fit in an unsigned int.
 * _free(q)   zeroes and releases q; a null q is ignored.
 * _put(q, x) appends x and returns it, or returns 0 and leaves the queue
 *            untouched when it is already full.
 * _get(q)    removes and returns the oldest item, or returns 0 and leaves
 *            the queue untouched when it is empty.
 */
#define DECL_FIFO_QUEUE(name, type)                                     \
  struct fifo_q__ ## name {                                             \
    unsigned int h, t, s;                                               \
    type *items[];                                                      \
  };                                                                    \
  static struct fifo_q__ ## name *fifo_q_ ## name ## _new(unsigned int n) { \
    size_t sz;                                                          \
    struct fifo_q__ ## name *q;                                         \
    if (n + 1U == 0U) /* slot count would wrap to zero */               \
      return 0;                                                         \
    sz = sizeof(struct fifo_q__ ## name) +                              \
         (((size_t)n + 1) * sizeof(type *));                            \
    q = enif_alloc(sz);                                                 \
    if (!q)                                                             \
      return 0;                                                         \
    memset(q, 0, sz);                                                   \
    q->s = n + 1;                                                       \
    return q;                                                           \
  }                                                                     \
  static inline void fifo_q_ ## name ## _free(struct fifo_q__ ## name *q) { \
    if (!q)                                                             \
      return;                                                           \
    memset(q, 0, sizeof(struct fifo_q__ ## name) + (q->s * sizeof(type *))); \
    enif_free(q);                                                       \
  }                                                                     \
  static inline type *fifo_q_ ## name ## _put(struct fifo_q__ ## name *q, type *n) { \
    if (((q->h + 1) % q->s) == q->t) /* full: refuse, never overwrite */ \
      return 0;                                                         \
    q->items[q->h] = n;                                                 \
    q->h = (q->h + 1) % q->s;                                           \
    return n;                                                           \
  }                                                                     \
  static inline type *fifo_q_ ## name ## _get(struct fifo_q__ ## name *q) { \
    type *n;                                                            \
    if (q->t == q->h) /* empty: do not advance past the head */         \
      return 0;                                                         \
    n = q->items[q->t];                                                 \
    q->items[q->t] = 0;                                                 \
    q->t = (q->t + 1) % q->s;                                           \
    return n;                                                           \
  }                                                                     \
  static inline unsigned int fifo_q_ ## name ## _size(struct fifo_q__ ## name *q) { \
    return (q->h - q->t + q->s) % q->s;                                 \
  }                                                                     \
  static inline unsigned int fifo_q_ ## name ## _capacity(struct fifo_q__ ## name *q) { \
    return q->s - 1;                                                    \
  }                                                                     \
  static inline int fifo_q_ ## name ## _empty(struct fifo_q__ ## name *q) { \
    return (q->t == q->h);                                              \
  }                                                                     \
  static inline int fifo_q_ ## name ## _full(struct fifo_q__ ## name *q) { \
    return ((q->h + 1) % q->s) == q->t;                                 \
  }
/* Generic front ends: each forwards to the fifo_q_<name>_* helper generated
 * by DECL_FIFO_QUEUE(name, ...) for the queue called `name`. */
#define fifo_q_new(name, size) fifo_q_ ## name ## _new(size)
#define fifo_q_free(name, queue) fifo_q_ ## name ## _free(queue)
#define fifo_q_get(name, queue) fifo_q_ ## name ## _get(queue)
#define fifo_q_put(name, queue, item) fifo_q_ ## name ## _put(queue, item)
#define fifo_q_size(name, queue) fifo_q_ ## name ## _size(queue)
#define fifo_q_capacity(name, queue) fifo_q_ ## name ## _capacity(queue)
#define fifo_q_empty(name, queue) fifo_q_ ## name ## _empty(queue)
#define fifo_q_full(name, queue) fifo_q_ ## name ## _full(queue)
/*
 * fifo_q_foreach(name, queue, item, task)
 *
 * Drain `queue`: while it is not empty, remove the oldest entry into `item`
 * and run `task` (a brace-enclosed compound statement).  The queue is empty
 * afterwards.  `queue` is expanded once per test and once per removal, so it
 * should be free of side effects.
 *
 * The expansion is one statement with no trailing semicolon; the call site
 * supplies it, which keeps the macro usable as the unbraced body of an
 * if/else.
 */
#define fifo_q_foreach(name, queue, item, task) do {                    \
        while(!fifo_q_ ## name ## _empty(queue)) {                      \
            (item) = fifo_q_ ## name ## _get(queue);                    \
            do task while(0);                                           \
        }                                                               \
    } while(0)
#if defined(__cplusplus)
}
#endif
#endif // __FIFO_Q_H__

File diff suppressed because it is too large Load diff

View file

@ -1,10 +1,10 @@
/** @file lmdb.h /** @file lmdb.h
* @brief Lightning memory-mapped database library * @brief Lightning memory-mapped database library
* *
* @mainpage Lightning Memory-Mapped Database Manager (LMDB) * @mainpage Lightning Memory-Mapped Database Manager (MDB)
* *
* @section intro_sec Introduction * @section intro_sec Introduction
* LMDB is a Btree-based database management library modeled loosely on the * MDB is a Btree-based database management library modeled loosely on the
* BerkeleyDB API, but much simplified. The entire database is exposed * BerkeleyDB API, but much simplified. The entire database is exposed
* in a memory map, and all data fetches return data directly * in a memory map, and all data fetches return data directly
* from the mapped memory, so no malloc's or memcpy's occur during * from the mapped memory, so no malloc's or memcpy's occur during
@ -26,10 +26,10 @@
* readers, and readers don't block writers. * readers, and readers don't block writers.
* *
* Unlike other well-known database mechanisms which use either write-ahead * Unlike other well-known database mechanisms which use either write-ahead
* transaction logs or append-only data writes, LMDB requires no maintenance * transaction logs or append-only data writes, MDB requires no maintenance
* during operation. Both write-ahead loggers and append-only databases * during operation. Both write-ahead loggers and append-only databases
* require periodic checkpointing and/or compaction of their log or database * require periodic checkpointing and/or compaction of their log or database
* files otherwise they grow without bound. LMDB tracks free pages within * files otherwise they grow without bound. MDB tracks free pages within
* the database and re-uses them for new write operations, so the database * the database and re-uses them for new write operations, so the database
* size does not grow without bound in normal use. * size does not grow without bound in normal use.
* *
@ -40,9 +40,6 @@
* corrupt the database. Of course if your application code is known to * corrupt the database. Of course if your application code is known to
* be bug-free (...) then this is not an issue. * be bug-free (...) then this is not an issue.
* *
* If this is your first time using a transactional embedded key/value
* store, you may find the \ref starting page to be helpful.
*
* @section caveats_sec Caveats * @section caveats_sec Caveats
* Troubleshooting the lock file, plus semaphores on BSD systems: * Troubleshooting the lock file, plus semaphores on BSD systems:
* *
@ -51,17 +48,10 @@
* cause further writes to grow the database quickly, and * cause further writes to grow the database quickly, and
* stale locks can block further operation. * stale locks can block further operation.
* *
* Fix: Check for stale readers periodically, using the * Fix: Terminate all programs using the database, or make
* #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool. * them close it. Next database user will reset the lockfile.
* Stale writers will be cleared automatically on most systems:
* - Windows - automatic
* - BSD, systems using SysV semaphores - automatic
* - Linux, systems using POSIX mutexes with Robust option - automatic
* Otherwise just make all programs using the database close it;
* the lockfile is always reset on first open of the environment.
* *
* - On BSD systems or others configured with MDB_USE_SYSV_SEM or * - On BSD systems or others configured with MDB_USE_POSIX_SEM,
* MDB_USE_POSIX_SEM,
* startup can fail due to semaphores owned by another userid. * startup can fail due to semaphores owned by another userid.
* *
* Fix: Open and close the database as the user which owns the * Fix: Open and close the database as the user which owns the
@ -74,32 +64,13 @@
* BSD systems or when otherwise configured with MDB_USE_POSIX_SEM. * BSD systems or when otherwise configured with MDB_USE_POSIX_SEM.
* Multiple users can cause startup to fail later, as noted above. * Multiple users can cause startup to fail later, as noted above.
* *
* - There is normally no pure read-only mode, since readers need write
* access to locks and lock file. Exceptions: On read-only filesystems
* or with the #MDB_NOLOCK flag described under #mdb_env_open().
*
* - An LMDB configuration will often reserve considerable \b unused
* memory address space and maybe file size for future growth.
* This does not use actual memory or disk space, but users may need
* to understand the difference so they won't be scared off.
*
* - By default, in versions before 0.9.10, unused portions of the data
* file might receive garbage data from memory freed by other code.
* (This does not happen when using the #MDB_WRITEMAP flag.) As of
* 0.9.10 the default behavior is to initialize such memory before
* writing to the data file. Since there may be a slight performance
* cost due to this initialization, applications may disable it using
* the #MDB_NOMEMINIT flag. Applications handling sensitive data
* which must not be written should not use this flag. This flag is
* irrelevant when using #MDB_WRITEMAP.
*
* - A thread can only use one transaction at a time, plus any child * - A thread can only use one transaction at a time, plus any child
* transactions. Each transaction belongs to one thread. See below. * transactions. Each transaction belongs to one thread. See below.
* The #MDB_NOTLS flag changes this for read-only transactions. * The #MDB_NOTLS flag changes this for read-only transactions.
* *
* - Use an MDB_env* in the process which opened it, without fork()ing. * - Use an MDB_env* in the process which opened it, without fork()ing.
* *
* - Do not have open an LMDB database twice in the same process at * - Do not have open an MDB database twice in the same process at
* the same time. Not even from a plain open() call - close()ing it * the same time. Not even from a plain open() call - close()ing it
* breaks flock() advisory locking. * breaks flock() advisory locking.
* *
@ -115,17 +86,13 @@
* ...when several processes can use a database concurrently: * ...when several processes can use a database concurrently:
* *
* - Avoid aborting a process with an active transaction. * - Avoid aborting a process with an active transaction.
* The transaction becomes "long-lived" as above until a check * The transaction becomes "long-lived" as above until the lockfile
* for stale readers is performed or the lockfile is reset, * is reset, since the process may not remove it from the lockfile.
* since the process may not remove it from the lockfile.
* *
* This does not apply to write transactions if the system clears * - If you do that anyway, close the environment once in a while,
* stale writers, see above. * so the lockfile can get reset.
* *
* - If you do that anyway, do a periodic check for stale readers. Or * - Do not use MDB databases on remote filesystems, even between
* close the environment once in a while, so the lockfile can get reset.
*
* - Do not use LMDB databases on remote filesystems, even between
* processes on the same host. This breaks flock() on some OSes, * processes on the same host. This breaks flock() on some OSes,
* possibly memory map sync, and certainly sync between programs * possibly memory map sync, and certainly sync between programs
* on different hosts. * on different hosts.
@ -135,7 +102,7 @@
* *
* @author Howard Chu, Symas Corporation. * @author Howard Chu, Symas Corporation.
* *
* @copyright Copyright 2011-2016 Howard Chu, Symas Corp. All rights reserved. * @copyright Copyright 2011-2013 Howard Chu, Symas Corp. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP * modification, are permitted only as authorized by the OpenLDAP
@ -166,37 +133,18 @@
#define _LMDB_H_ #define _LMDB_H_
#include <sys/types.h> #include <sys/types.h>
#include <inttypes.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
/** Unix permissions for creating files, or dummy definition for Windows */
#ifdef _MSC_VER #ifdef _MSC_VER
typedef int mdb_mode_t; typedef int mdb_mode_t;
#else #else
typedef mode_t mdb_mode_t; typedef mode_t mdb_mode_t;
#endif #endif
#ifdef MDB_VL32 /** @defgroup mdb MDB API
typedef uint64_t mdb_size_t;
#define mdb_env_create mdb_env_create_vl32 /**< Prevent mixing with non-VL32 builds */
#else
typedef size_t mdb_size_t;
#endif
/** An abstraction for a file handle.
* On POSIX systems file handles are small integers. On Windows
* they're opaque pointers.
*/
#ifdef _WIN32
typedef void *mdb_filehandle_t;
#else
typedef int mdb_filehandle_t;
#endif
/** @defgroup mdb LMDB API
* @{ * @{
* @brief OpenLDAP Lightning Memory-Mapped Database Manager * @brief OpenLDAP Lightning Memory-Mapped Database Manager
*/ */
@ -208,7 +156,7 @@ typedef int mdb_filehandle_t;
/** Library minor version */ /** Library minor version */
#define MDB_VERSION_MINOR 9 #define MDB_VERSION_MINOR 9
/** Library patch version */ /** Library patch version */
#define MDB_VERSION_PATCH 70 #define MDB_VERSION_PATCH 6
/** Combine args a,b,c into a single integer for easy version comparisons */ /** Combine args a,b,c into a single integer for easy version comparisons */
#define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) #define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c))
@ -218,10 +166,10 @@ typedef int mdb_filehandle_t;
MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH)
/** The release date of this library version */ /** The release date of this library version */
#define MDB_VERSION_DATE "December 19, 2015" #define MDB_VERSION_DATE "January 10, 2013"
/** A stringifier for the version info */ /** A stringifier for the version info */
#define MDB_VERSTR(a,b,c,d) "LMDB " #a "." #b "." #c ": (" d ")" #define MDB_VERSTR(a,b,c,d) "MDB " #a "." #b "." #c ": (" d ")"
/** A helper for the stringifier macro */ /** A helper for the stringifier macro */
#define MDB_VERFOO(a,b,c,d) MDB_VERSTR(a,b,c,d) #define MDB_VERFOO(a,b,c,d) MDB_VERSTR(a,b,c,d)
@ -254,13 +202,13 @@ typedef struct MDB_cursor MDB_cursor;
/** @brief Generic structure used for passing keys and data in and out /** @brief Generic structure used for passing keys and data in and out
* of the database. * of the database.
* *
* Values returned from the database are valid only until a subsequent * Key sizes must be between 1 and the liblmdb build-time constant
* update operation, or the end of the transaction. Do not modify or * #MDB_MAXKEYSIZE inclusive. This currently defaults to 511. The
* free them, they commonly point into the database itself. * same applies to data sizes in databases with the #MDB_DUPSORT flag.
*
* Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive.
* The same applies to data sizes in databases with the #MDB_DUPSORT flag.
* Other data items can in theory be from 0 to 0xffffffff bytes long. * Other data items can in theory be from 0 to 0xffffffff bytes long.
*
* Values returned from the database are valid only until a subsequent
* update operation, or the end of the transaction.
*/ */
typedef struct MDB_val { typedef struct MDB_val {
size_t mv_size; /**< size of the data item */ size_t mv_size; /**< size of the data item */
@ -287,12 +235,14 @@ typedef int (MDB_cmp_func)(const MDB_val *a, const MDB_val *b);
typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *relctx); typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *relctx);
/** @defgroup mdb_env Environment Flags /** @defgroup mdb_env Environment Flags
*
* Values do not overlap Database Flags.
* @{ * @{
*/ */
/** mmap at a fixed address (experimental) */ /** mmap at a fixed address (experimental) */
#define MDB_FIXEDMAP 0x01 #define MDB_FIXEDMAP 0x01
/** no environment directory */ /** no environment directory */
#define MDB_NOSUBDIR 0x4000 #define MDB_NOSUBDIR 0x4000
/** don't fsync after commit */ /** don't fsync after commit */
#define MDB_NOSYNC 0x10000 #define MDB_NOSYNC 0x10000
/** read only */ /** read only */
@ -301,31 +251,27 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
#define MDB_NOMETASYNC 0x40000 #define MDB_NOMETASYNC 0x40000
/** use writable mmap */ /** use writable mmap */
#define MDB_WRITEMAP 0x80000 #define MDB_WRITEMAP 0x80000
/** use asynchronous msync when #MDB_WRITEMAP is used */ /** use asynchronous msync when MDB_WRITEMAP is used */
#define MDB_MAPASYNC 0x100000 #define MDB_MAPASYNC 0x100000
/** tie reader locktable slots to #MDB_txn objects instead of to threads */ /** tie reader locktable slots to #MDB_txn objects instead of to threads */
#define MDB_NOTLS 0x200000 #define MDB_NOTLS 0x200000
/** don't do any locking, caller must manage their own locks */
#define MDB_NOLOCK 0x400000
/** don't do readahead (no effect on Windows) */
#define MDB_NORDAHEAD 0x800000
/** don't initialize malloc'd memory before writing to datafile */
#define MDB_NOMEMINIT 0x1000000
/** @} */ /** @} */
/** @defgroup mdb_dbi_open Database Flags /** @defgroup mdb_dbi_open Database Flags
*
* Values do not overlap Environment Flags.
* @{ * @{
*/ */
/** use reverse string keys */ /** use reverse string keys */
#define MDB_REVERSEKEY 0x02 #define MDB_REVERSEKEY 0x02
/** use sorted duplicates */ /** use sorted duplicates */
#define MDB_DUPSORT 0x04 #define MDB_DUPSORT 0x04
/** numeric keys in native byte order: either unsigned int or size_t. /** numeric keys in native byte order.
* The keys must all be of the same size. */ * The keys must all be of the same size. */
#define MDB_INTEGERKEY 0x08 #define MDB_INTEGERKEY 0x08
/** with #MDB_DUPSORT, sorted dup items have fixed size */ /** with #MDB_DUPSORT, sorted dup items have fixed size */
#define MDB_DUPFIXED 0x10 #define MDB_DUPFIXED 0x10
/** with #MDB_DUPSORT, dups are #MDB_INTEGERKEY-style integers */ /** with #MDB_DUPSORT, dups are numeric in native byte order */
#define MDB_INTEGERDUP 0x20 #define MDB_INTEGERDUP 0x20
/** with #MDB_DUPSORT, use reverse string dups */ /** with #MDB_DUPSORT, use reverse string dups */
#define MDB_REVERSEDUP 0x40 #define MDB_REVERSEDUP 0x40
@ -353,19 +299,10 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
#define MDB_APPEND 0x20000 #define MDB_APPEND 0x20000
/** Duplicate data is being appended, don't split full pages. */ /** Duplicate data is being appended, don't split full pages. */
#define MDB_APPENDDUP 0x40000 #define MDB_APPENDDUP 0x40000
/** Store multiple data items in one call. Only for #MDB_DUPFIXED. */ /** Store multiple data items in one call. */
#define MDB_MULTIPLE 0x80000 #define MDB_MULTIPLE 0x80000
/* @} */ /* @} */
/** @defgroup mdb_copy Copy Flags
* @{
*/
/** Compacting copy: Omit free space from copy, and renumber all
* pages sequentially.
*/
#define MDB_CP_COMPACT 0x01
/* @} */
/** @brief Cursor Get operations. /** @brief Cursor Get operations.
* *
* This is the set of all operations for retrieving data * This is the set of all operations for retrieving data
@ -378,28 +315,26 @@ typedef enum MDB_cursor_op {
MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */ MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */
MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for #MDB_DUPSORT */ MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for #MDB_DUPSORT */
MDB_GET_CURRENT, /**< Return key/data at current cursor position */ MDB_GET_CURRENT, /**< Return key/data at current cursor position */
MDB_GET_MULTIPLE, /**< Return key and up to a page of duplicate data items MDB_GET_MULTIPLE, /**< Return all the duplicate data items at the current
from current cursor position. Move cursor to prepare cursor position. Only for #MDB_DUPFIXED */
for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */
MDB_LAST, /**< Position at last key/data item */ MDB_LAST, /**< Position at last key/data item */
MDB_LAST_DUP, /**< Position at last data item of current key. MDB_LAST_DUP, /**< Position at last data item of current key.
Only for #MDB_DUPSORT */ Only for #MDB_DUPSORT */
MDB_NEXT, /**< Position at next data item */ MDB_NEXT, /**< Position at next data item */
MDB_NEXT_DUP, /**< Position at next data item of current key. MDB_NEXT_DUP, /**< Position at next data item of current key.
Only for #MDB_DUPSORT */ Only for #MDB_DUPSORT */
MDB_NEXT_MULTIPLE, /**< Return key and up to a page of duplicate data items MDB_NEXT_MULTIPLE, /**< Return all duplicate data items at the next
from next cursor position. Move cursor to prepare cursor position. Only for #MDB_DUPFIXED */
for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ MDB_NEXT_NODUP, /**< Position at first data item of next key.
MDB_NEXT_NODUP, /**< Position at first data item of next key */ Only for #MDB_DUPSORT */
MDB_PREV, /**< Position at previous data item */ MDB_PREV, /**< Position at previous data item */
MDB_PREV_DUP, /**< Position at previous data item of current key. MDB_PREV_DUP, /**< Position at previous data item of current key.
Only for #MDB_DUPSORT */ Only for #MDB_DUPSORT */
MDB_PREV_NODUP, /**< Position at last data item of previous key */ MDB_PREV_NODUP, /**< Position at last data item of previous key.
Only for #MDB_DUPSORT */
MDB_SET, /**< Position at specified key */ MDB_SET, /**< Position at specified key */
MDB_SET_KEY, /**< Position at specified key, return key + data */ MDB_SET_KEY, /**< Position at specified key, return key + data */
MDB_SET_RANGE, /**< Position at first key greater than or equal to specified key. */ MDB_SET_RANGE /**< Position at first key greater than or equal to specified key. */
MDB_PREV_MULTIPLE /**< Position at previous page and return key and up to
a page of duplicate data items. Only for #MDB_DUPFIXED */
} MDB_cursor_op; } MDB_cursor_op;
/** @defgroup errors Return Codes /** @defgroup errors Return Codes
@ -417,11 +352,11 @@ typedef enum MDB_cursor_op {
#define MDB_PAGE_NOTFOUND (-30797) #define MDB_PAGE_NOTFOUND (-30797)
/** Located page was wrong type */ /** Located page was wrong type */
#define MDB_CORRUPTED (-30796) #define MDB_CORRUPTED (-30796)
/** Update of meta page failed or environment had fatal error */ /** Update of meta page failed, probably I/O error */
#define MDB_PANIC (-30795) #define MDB_PANIC (-30795)
/** Environment version mismatch */ /** Environment version mismatch */
#define MDB_VERSION_MISMATCH (-30794) #define MDB_VERSION_MISMATCH (-30794)
/** File is not a valid LMDB file */ /** File is not a valid MDB file */
#define MDB_INVALID (-30793) #define MDB_INVALID (-30793)
/** Environment mapsize reached */ /** Environment mapsize reached */
#define MDB_MAP_FULL (-30792) #define MDB_MAP_FULL (-30792)
@ -439,25 +374,11 @@ typedef enum MDB_cursor_op {
#define MDB_PAGE_FULL (-30786) #define MDB_PAGE_FULL (-30786)
/** Database contents grew beyond environment mapsize */ /** Database contents grew beyond environment mapsize */
#define MDB_MAP_RESIZED (-30785) #define MDB_MAP_RESIZED (-30785)
/** Operation and DB incompatible, or DB type changed. This can mean: /** Database flags changed or would change */
* <ul>
* <li>The operation expects an #MDB_DUPSORT / #MDB_DUPFIXED database.
* <li>Opening a named DB when the unnamed DB has #MDB_DUPSORT / #MDB_INTEGERKEY.
* <li>Accessing a data record as a database, or vice versa.
* <li>The database was dropped and recreated with different flags.
* </ul>
*/
#define MDB_INCOMPATIBLE (-30784) #define MDB_INCOMPATIBLE (-30784)
/** Invalid reuse of reader locktable slot */ /** Invalid reuse of reader locktable slot */
#define MDB_BAD_RSLOT (-30783) #define MDB_BAD_RSLOT (-30783)
/** Transaction must abort, has a child, or is invalid */ #define MDB_LAST_ERRCODE MDB_BAD_RSLOT
#define MDB_BAD_TXN (-30782)
/** Unsupported size of key/DB name/data, or wrong DUPFIXED size */
#define MDB_BAD_VALSIZE (-30781)
/** The specified DBI was changed unexpectedly */
#define MDB_BAD_DBI (-30780)
/** The last defined error code */
#define MDB_LAST_ERRCODE MDB_BAD_DBI
/** @} */ /** @} */
/** @brief Statistics for a database in the environment */ /** @brief Statistics for a database in the environment */
@ -465,23 +386,23 @@ typedef struct MDB_stat {
unsigned int ms_psize; /**< Size of a database page. unsigned int ms_psize; /**< Size of a database page.
This is currently the same for all databases. */ This is currently the same for all databases. */
unsigned int ms_depth; /**< Depth (height) of the B-tree */ unsigned int ms_depth; /**< Depth (height) of the B-tree */
mdb_size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */
mdb_size_t ms_leaf_pages; /**< Number of leaf pages */ size_t ms_leaf_pages; /**< Number of leaf pages */
mdb_size_t ms_overflow_pages; /**< Number of overflow pages */ size_t ms_overflow_pages; /**< Number of overflow pages */
mdb_size_t ms_entries; /**< Number of data items */ size_t ms_entries; /**< Number of data items */
} MDB_stat; } MDB_stat;
/** @brief Information about the environment */ /** @brief Information about the environment */
typedef struct MDB_envinfo { typedef struct MDB_envinfo {
void *me_mapaddr; /**< Address of map, if fixed */ void *me_mapaddr; /**< Address of map, if fixed */
mdb_size_t me_mapsize; /**< Size of the data memory map */ size_t me_mapsize; /**< Size of the data memory map */
mdb_size_t me_last_pgno; /**< ID of the last used page */ size_t me_last_pgno; /**< ID of the last used page */
mdb_size_t me_last_txnid; /**< ID of the last committed transaction */ size_t me_last_txnid; /**< ID of the last committed transaction */
unsigned int me_maxreaders; /**< max reader slots in the environment */ unsigned int me_maxreaders; /**< max reader slots in the environment */
unsigned int me_numreaders; /**< max reader slots used in the environment */ unsigned int me_numreaders; /**< max reader slots used in the environment */
} MDB_envinfo; } MDB_envinfo;
/** @brief Return the LMDB library version information. /** @brief Return the mdb library version information.
* *
* @param[out] major if non-NULL, the library major version number is copied here * @param[out] major if non-NULL, the library major version number is copied here
* @param[out] minor if non-NULL, the library minor version number is copied here * @param[out] minor if non-NULL, the library minor version number is copied here
@ -495,14 +416,14 @@ char *mdb_version(int *major, int *minor, int *patch);
* This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3) * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3)
* function. If the error code is greater than or equal to 0, then the string * function. If the error code is greater than or equal to 0, then the string
* returned by the system function strerror(3) is returned. If the error code * returned by the system function strerror(3) is returned. If the error code
* is less than 0, an error string corresponding to the LMDB library error is * is less than 0, an error string corresponding to the MDB library error is
* returned. See @ref errors for a list of LMDB-specific error codes. * returned. See @ref errors for a list of MDB-specific error codes.
* @param[in] err The error code * @param[in] err The error code
* @retval "error message" The description of the error * @retval "error message" The description of the error
*/ */
char *mdb_strerror(int err); char *mdb_strerror(int err);
/** @brief Create an LMDB environment handle. /** @brief Create an MDB environment handle.
* *
* This function allocates memory for a #MDB_env structure. To release * This function allocates memory for a #MDB_env structure. To release
* the allocated memory and discard the handle, call #mdb_env_close(). * the allocated memory and discard the handle, call #mdb_env_close().
@ -535,24 +456,20 @@ int mdb_env_create(MDB_env **env);
* how the operating system has allocated memory to shared libraries and other uses. * how the operating system has allocated memory to shared libraries and other uses.
* The feature is highly experimental. * The feature is highly experimental.
* <li>#MDB_NOSUBDIR * <li>#MDB_NOSUBDIR
* By default, LMDB creates its environment in a directory whose * By default, MDB creates its environment in a directory whose
* pathname is given in \b path, and creates its data and lock files * pathname is given in \b path, and creates its data and lock files
* under that directory. With this option, \b path is used as-is for * under that directory. With this option, \b path is used as-is for
* the database main data file. The database lock file is the \b path * the database main data file. The database lock file is the \b path
* with "-lock" appended. * with "-lock" appended.
* <li>#MDB_RDONLY * <li>#MDB_RDONLY
* Open the environment in read-only mode. No write operations will be * Open the environment in read-only mode. No write operations will be
* allowed. LMDB will still modify the lock file - except on read-only * allowed. MDB will still modify the lock file - except on read-only
* filesystems, where LMDB does not use locks. * filesystems, where MDB does not use locks.
* <li>#MDB_WRITEMAP * <li>#MDB_WRITEMAP
* Use a writeable memory map unless MDB_RDONLY is set. This uses * Use a writeable memory map unless MDB_RDONLY is set. This is faster
* fewer mallocs but loses protection from application bugs * and uses fewer mallocs, but loses protection from application bugs
* like wild pointer writes and other bad updates into the database. * like wild pointer writes and other bad updates into the database.
* This may be slightly faster for DBs that fit entirely in RAM, but
* is slower for DBs larger than RAM.
* Incompatible with nested transactions. * Incompatible with nested transactions.
* Do not mix processes with and without MDB_WRITEMAP on the same
* environment. This can defeat durability (#mdb_env_sync etc).
* <li>#MDB_NOMETASYNC * <li>#MDB_NOMETASYNC
* Flush system buffers to disk only once per transaction, omit the * Flush system buffers to disk only once per transaction, omit the
* metadata flush. Defer that until the system flushes files to disk, * metadata flush. Defer that until the system flushes files to disk,
@ -589,46 +506,14 @@ int mdb_env_create(MDB_env **env);
* the user synchronizes its use. Applications that multiplex many * the user synchronizes its use. Applications that multiplex many
* user threads over individual OS threads need this option. Such an * user threads over individual OS threads need this option. Such an
* application must also serialize the write transactions in an OS * application must also serialize the write transactions in an OS
* thread, since LMDB's write locking is unaware of the user threads. * thread, since MDB's write locking is unaware of the user threads.
* <li>#MDB_NOLOCK
* Don't do any locking. If concurrent access is anticipated, the
* caller must manage all concurrency itself. For proper operation
* the caller must enforce single-writer semantics, and must ensure
* that no readers are using old transactions while a writer is
* active. The simplest approach is to use an exclusive lock so that
* no readers may be active at all when a writer begins.
* <li>#MDB_NORDAHEAD
* Turn off readahead. Most operating systems perform readahead on
* read requests by default. This option turns it off if the OS
* supports it. Turning it off may help random read performance
* when the DB is larger than RAM and system RAM is full.
* The option is not implemented on Windows.
* <li>#MDB_NOMEMINIT
* Don't initialize malloc'd memory before writing to unused spaces
* in the data file. By default, memory for pages written to the data
* file is obtained using malloc. While these pages may be reused in
* subsequent transactions, freshly malloc'd pages will be initialized
* to zeroes before use. This avoids persisting leftover data from other
* code (that used the heap and subsequently freed the memory) into the
* data file. Note that many other system libraries may allocate
* and free memory from the heap for arbitrary uses. E.g., stdio may
* use the heap for file I/O buffers. This initialization step has a
* modest performance cost so some applications may want to disable
* it using this flag. This option can be a problem for applications
* which handle sensitive data like passwords, and it makes memory
* checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP,
* which writes directly to the mmap instead of using malloc for pages. The
* initialization is also skipped if #MDB_RESERVE is used; the
* caller is expected to overwrite all of the memory that was
* reserved in that case.
* This flag may be changed at any time using #mdb_env_set_flags().
* </ul> * </ul>
* @param[in] mode The UNIX permissions to set on created files and semaphores. * @param[in] mode The UNIX permissions to set on created files. This parameter
* This parameter is ignored on Windows. * is ignored on Windows.
* @return A non-zero error value on failure and 0 on success. Some possible * @return A non-zero error value on failure and 0 on success. Some possible
* errors are: * errors are:
* <ul> * <ul>
* <li>#MDB_VERSION_MISMATCH - the version of the LMDB library doesn't match the * <li>#MDB_VERSION_MISMATCH - the version of the MDB library doesn't match the
* version that created the database environment. * version that created the database environment.
* <li>#MDB_INVALID - the environment file headers are corrupted. * <li>#MDB_INVALID - the environment file headers are corrupted.
* <li>ENOENT - the directory specified by the path parameter doesn't exist. * <li>ENOENT - the directory specified by the path parameter doesn't exist.
@ -638,13 +523,9 @@ int mdb_env_create(MDB_env **env);
*/ */
int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode); int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode);
/** @brief Copy an LMDB environment to the specified path. /** @brief Copy an MDB environment to the specified path.
* *
* This function may be used to make a backup of an existing environment. * This function may be used to make a backup of an existing environment.
* No lockfile is created, since it gets recreated at need.
* @note This call can trigger significant file size growth if run in
* parallel with write transactions, because it employs a read-only
* transaction. See long-lived transactions under @ref caveats_sec.
* @param[in] env An environment handle returned by #mdb_env_create(). It * @param[in] env An environment handle returned by #mdb_env_create(). It
* must have already been opened successfully. * must have already been opened successfully.
* @param[in] path The directory in which the copy will reside. This * @param[in] path The directory in which the copy will reside. This
@ -654,65 +535,7 @@ int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t
*/ */
int mdb_env_copy(MDB_env *env, const char *path); int mdb_env_copy(MDB_env *env, const char *path);
/** @brief Copy an LMDB environment to the specified file descriptor. /** @brief Return statistics about the MDB environment.
*
* This function may be used to make a backup of an existing environment.
* No lockfile is created, since it gets recreated at need.
* @note This call can trigger significant file size growth if run in
* parallel with write transactions, because it employs a read-only
* transaction. See long-lived transactions under @ref caveats_sec.
* @param[in] env An environment handle returned by #mdb_env_create(). It
* must have already been opened successfully.
* @param[in] fd The file descriptor to write the copy to. It must
* have already been opened for write access.
* @return A non-zero error value on failure and 0 on success.
*/
int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd);
/** @brief Copy an LMDB environment to the specified path, with options.
*
* This function may be used to make a backup of an existing environment.
* No lockfile is created, since it gets recreated at need.
* @note This call can trigger significant file size growth if run in
* parallel with write transactions, because it employs a read-only
* transaction. See long-lived transactions under @ref caveats_sec.
* @param[in] env An environment handle returned by #mdb_env_create(). It
* must have already been opened successfully.
* @param[in] path The directory in which the copy will reside. This
* directory must already exist and be writable but must otherwise be
* empty.
* @param[in] flags Special options for this operation. This parameter
* must be set to 0 or by bitwise OR'ing together one or more of the
* values described here.
* <ul>
* <li>#MDB_CP_COMPACT - Perform compaction while copying: omit free
* pages and sequentially renumber all pages in output. This option
* consumes more CPU and runs more slowly than the default.
* </ul>
* @return A non-zero error value on failure and 0 on success.
*/
int mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags);
/** @brief Copy an LMDB environment to the specified file descriptor,
* with options.
*
* This function may be used to make a backup of an existing environment.
* No lockfile is created, since it gets recreated at need. See
* #mdb_env_copy2() for further details.
* @note This call can trigger significant file size growth if run in
* parallel with write transactions, because it employs a read-only
* transaction. See long-lived transactions under @ref caveats_sec.
* @param[in] env An environment handle returned by #mdb_env_create(). It
* must have already been opened successfully.
* @param[in] fd The file descriptor to write the copy to. It must
* have already been opened for write access.
* @param[in] flags Special options for this operation.
* See #mdb_env_copy2() for options.
* @return A non-zero error value on failure and 0 on success.
*/
int mdb_env_copyfd2(MDB_env *env, mdb_filehandle_t fd, unsigned int flags);
/** @brief Return statistics about the LMDB environment.
* *
* @param[in] env An environment handle returned by #mdb_env_create() * @param[in] env An environment handle returned by #mdb_env_create()
* @param[out] stat The address of an #MDB_stat structure * @param[out] stat The address of an #MDB_stat structure
@ -720,7 +543,7 @@ int mdb_env_copyfd2(MDB_env *env, mdb_filehandle_t fd, unsigned int flags);
*/ */
int mdb_env_stat(MDB_env *env, MDB_stat *stat); int mdb_env_stat(MDB_env *env, MDB_stat *stat);
/** @brief Return information about the LMDB environment. /** @brief Return information about the MDB environment.
* *
* @param[in] env An environment handle returned by #mdb_env_create() * @param[in] env An environment handle returned by #mdb_env_create()
* @param[out] stat The address of an #MDB_envinfo structure * @param[out] stat The address of an #MDB_envinfo structure
@ -731,10 +554,9 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat);
/** @brief Flush the data buffers to disk. /** @brief Flush the data buffers to disk.
* *
* Data is always written to disk when #mdb_txn_commit() is called, * Data is always written to disk when #mdb_txn_commit() is called,
* but the operating system may keep it buffered. LMDB always flushes * but the operating system may keep it buffered. MDB always flushes
* the OS buffers upon commit as well, unless the environment was * the OS buffers upon commit as well, unless the environment was
* opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. This call is * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC.
* not valid if the environment was opened with #MDB_RDONLY.
* @param[in] env An environment handle returned by #mdb_env_create() * @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] force If non-zero, force a synchronous flush. Otherwise * @param[in] force If non-zero, force a synchronous flush. Otherwise
* if the environment has the #MDB_NOSYNC flag set the flushes * if the environment has the #MDB_NOSYNC flag set the flushes
@ -742,7 +564,6 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat);
* @return A non-zero error value on failure and 0 on success. Some possible * @return A non-zero error value on failure and 0 on success. Some possible
* errors are: * errors are:
* <ul> * <ul>
* <li>EACCES - the environment is read-only.
* <li>EINVAL - an invalid parameter was specified. * <li>EINVAL - an invalid parameter was specified.
* <li>EIO - an error occurred during synchronization. * <li>EIO - an error occurred during synchronization.
* </ul> * </ul>
@ -762,8 +583,7 @@ void mdb_env_close(MDB_env *env);
/** @brief Set environment flags. /** @brief Set environment flags.
* *
* This may be used to set some flags in addition to those from * This may be used to set some flags in addition to those from
* #mdb_env_open(), or to unset these flags. If several threads * #mdb_env_open(), or to unset these flags.
* change the flags at the same time, the result is undefined.
* @param[in] env An environment handle returned by #mdb_env_create() * @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] flags The flags to change, bitwise OR'ed together * @param[in] flags The flags to change, bitwise OR'ed together
* @param[in] onoff A non-zero value sets the flags, zero clears them. * @param[in] onoff A non-zero value sets the flags, zero clears them.
@ -801,39 +621,14 @@ int mdb_env_get_flags(MDB_env *env, unsigned int *flags);
*/ */
int mdb_env_get_path(MDB_env *env, const char **path); int mdb_env_get_path(MDB_env *env, const char **path);
/** @brief Return the file descriptor for the given environment.
*
* @param[in] env An environment handle returned by #mdb_env_create()
* @param[out] fd Address of a mdb_filehandle_t to contain the descriptor.
* @return A non-zero error value on failure and 0 on success. Some possible
* errors are:
* <ul>
* <li>EINVAL - an invalid parameter was specified.
* </ul>
*/
int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd);
/** @brief Set the size of the memory map to use for this environment. /** @brief Set the size of the memory map to use for this environment.
* *
* The size should be a multiple of the OS page size. The default is * The size should be a multiple of the OS page size. The default is
* 10485760 bytes. The size of the memory map is also the maximum size * 10485760 bytes. The size of the memory map is also the maximum size
* of the database. The value should be chosen as large as possible, * of the database. The value should be chosen as large as possible,
* to accommodate future growth of the database. * to accommodate future growth of the database.
* This function should be called after #mdb_env_create() and before #mdb_env_open(). * This function may only be called after #mdb_env_create() and before #mdb_env_open().
* It may be called at later times if no transactions are active in * The size may be changed by closing and reopening the environment.
* this process. Note that the library does not check for this condition;
* the caller must ensure it explicitly.
*
* The new size takes effect immediately for the current process but
* will not be persisted to any others until a write transaction has been
* committed by the current process. Also, only mapsize increases are
* persisted into the environment.
*
* If the mapsize is increased by another process, and data has grown
* beyond the range of the current mapsize, #mdb_txn_begin() will
* return #MDB_MAP_RESIZED. This function may be called with a size
* of zero to adopt the new size.
*
* Any attempt to set a size smaller than the space already consumed * Any attempt to set a size smaller than the space already consumed
* by the environment will be silently changed to the current size of the used space. * by the environment will be silently changed to the current size of the used space.
* @param[in] env An environment handle returned by #mdb_env_create() * @param[in] env An environment handle returned by #mdb_env_create()
@ -841,11 +636,10 @@ int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd);
* @return A non-zero error value on failure and 0 on success. Some possible * @return A non-zero error value on failure and 0 on success. Some possible
* errors are: * errors are:
* <ul> * <ul>
* <li>EINVAL - an invalid parameter was specified, or the environment has * <li>EINVAL - an invalid parameter was specified, or the environment is already open.
* an active write transaction.
* </ul> * </ul>
*/ */
int mdb_env_set_mapsize(MDB_env *env, mdb_size_t size); int mdb_env_set_mapsize(MDB_env *env, size_t size);
/** @brief Set the maximum number of threads/reader slots for the environment. /** @brief Set the maximum number of threads/reader slots for the environment.
* *
@ -884,10 +678,6 @@ int mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers);
* environment. Simpler applications that use the environment as a single * environment. Simpler applications that use the environment as a single
* unnamed database can ignore this option. * unnamed database can ignore this option.
* This function may only be called after #mdb_env_create() and before #mdb_env_open(). * This function may only be called after #mdb_env_create() and before #mdb_env_open().
*
* Currently a moderate number of slots are cheap but a huge number gets
* expensive: 7-120 words per transaction, and every #mdb_dbi_open()
* does a linear search of the opened slots.
* @param[in] env An environment handle returned by #mdb_env_create() * @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] dbs The maximum number of databases * @param[in] dbs The maximum number of databases
* @return A non-zero error value on failure and 0 on success. Some possible * @return A non-zero error value on failure and 0 on success. Some possible
@ -898,47 +688,6 @@ int mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers);
*/ */
int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs);
/** @brief Get the maximum size of keys and #MDB_DUPSORT data we can write.
*
* Depends on the compile-time constant #MDB_MAXKEYSIZE. Default 511.
* See @ref MDB_val.
* @param[in] env An environment handle returned by #mdb_env_create()
* @return The maximum size of a key we can write
*/
int mdb_env_get_maxkeysize(MDB_env *env);
/** @brief Set application information associated with the #MDB_env.
*
* @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] ctx An arbitrary pointer for whatever the application needs.
* @return A non-zero error value on failure and 0 on success.
*/
int mdb_env_set_userctx(MDB_env *env, void *ctx);
/** @brief Get the application information associated with the #MDB_env.
*
* @param[in] env An environment handle returned by #mdb_env_create()
* @return The pointer set by #mdb_env_set_userctx().
*/
void *mdb_env_get_userctx(MDB_env *env);
/** @brief A callback function for most LMDB assert() failures,
* called before printing the message and aborting.
*
* @param[in] env An environment handle returned by #mdb_env_create().
* @param[in] msg The assertion message, not including newline.
*/
typedef void MDB_assert_func(MDB_env *env, const char *msg);
/** Set or reset the assert() callback of the environment.
* Disabled if liblmdb is built with NDEBUG.
* @note This hack should become obsolete as lmdb's error handling matures.
* @param[in] env An environment handle returned by #mdb_env_create().
* @param[in] func An #MDB_assert_func function, or 0.
* @return A non-zero error value on failure and 0 on success.
*/
int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func);
/** @brief Create a transaction for use with the environment. /** @brief Create a transaction for use with the environment.
* *
* The transaction handle may be discarded using #mdb_txn_abort() or #mdb_txn_commit(). * The transaction handle may be discarded using #mdb_txn_abort() or #mdb_txn_commit().
@ -950,18 +699,14 @@ int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func);
* @param[in] parent If this parameter is non-NULL, the new transaction * @param[in] parent If this parameter is non-NULL, the new transaction
* will be a nested transaction, with the transaction indicated by \b parent * will be a nested transaction, with the transaction indicated by \b parent
* as its parent. Transactions may be nested to any level. A parent * as its parent. Transactions may be nested to any level. A parent
* transaction and its cursors may not issue any other operations than * transaction may not issue any other operations besides mdb_txn_begin,
* mdb_txn_commit and mdb_txn_abort while it has active child transactions. * mdb_txn_abort, or mdb_txn_commit while it has active child transactions.
* @param[in] flags Special options for this transaction. This parameter * @param[in] flags Special options for this transaction. This parameter
* must be set to 0 or by bitwise OR'ing together one or more of the * must be set to 0 or by bitwise OR'ing together one or more of the
* values described here. * values described here.
* <ul> * <ul>
* <li>#MDB_RDONLY * <li>#MDB_RDONLY
* This transaction will not perform any write operations. * This transaction will not perform any write operations.
* <li>#MDB_NOSYNC
* Don't flush system buffers to disk when committing this transaction.
* <li>#MDB_NOMETASYNC
* Flush system buffers but omit metadata flush when committing this transaction.
* </ul> * </ul>
* @param[out] txn Address where the new #MDB_txn handle will be stored * @param[out] txn Address where the new #MDB_txn handle will be stored
* @return A non-zero error value on failure and 0 on success. Some possible * @return A non-zero error value on failure and 0 on success. Some possible
@ -970,8 +715,7 @@ int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func);
* <li>#MDB_PANIC - a fatal error occurred earlier and the environment * <li>#MDB_PANIC - a fatal error occurred earlier and the environment
* must be shut down. * must be shut down.
* <li>#MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's * <li>#MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's
* mapsize and this environment's map must be resized as well. * mapsize and the environment must be shut down.
* See #mdb_env_set_mapsize().
* <li>#MDB_READERS_FULL - a read-only transaction was requested and * <li>#MDB_READERS_FULL - a read-only transaction was requested and
* the reader lock table is full. See #mdb_env_set_maxreaders(). * the reader lock table is full. See #mdb_env_set_maxreaders().
* <li>ENOMEM - out of memory. * <li>ENOMEM - out of memory.
@ -979,23 +723,6 @@ int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func);
*/ */
int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **txn); int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **txn);
/** @brief Returns the transaction's #MDB_env
*
* @param[in] txn A transaction handle returned by #mdb_txn_begin()
*/
MDB_env *mdb_txn_env(MDB_txn *txn);
/** @brief Return the transaction's ID.
*
* This returns the identifier associated with this transaction. For a
* read-only transaction, this corresponds to the snapshot being read;
* concurrent readers will frequently have the same transaction ID.
*
* @param[in] txn A transaction handle returned by #mdb_txn_begin()
* @return A transaction ID, valid if input is an active transaction.
*/
mdb_size_t mdb_txn_id(MDB_txn *txn);
/** @brief Commit all the operations of a transaction into the database. /** @brief Commit all the operations of a transaction into the database.
* *
* The transaction handle is freed. It and its cursors must not be used * The transaction handle is freed. It and its cursors must not be used
@ -1070,23 +797,19 @@ int mdb_txn_renew(MDB_txn *txn);
* independently of whether such a database exists. * independently of whether such a database exists.
* The database handle may be discarded by calling #mdb_dbi_close(). * The database handle may be discarded by calling #mdb_dbi_close().
* The old database handle is returned if the database was already open. * The old database handle is returned if the database was already open.
* The handle may only be closed once. * The handle must only be closed once.
*
* The database handle will be private to the current transaction until * The database handle will be private to the current transaction until
* the transaction is successfully committed. If the transaction is * the transaction is successfully committed. If the transaction is
* aborted the handle will be closed automatically. * aborted the handle will be closed automatically.
* After a successful commit the handle will reside in the shared * After a successful commit the
* environment, and may be used by other transactions. * handle will reside in the shared environment, and may be used
* * by other transactions. This function must not be called from
* This function must not be called from multiple concurrent * multiple concurrent transactions. A transaction that uses this function
* transactions in the same process. A transaction that uses * must finish (either commit or abort) before any other transaction may
* this function must finish (either commit or abort) before * use this function.
* any other transaction in the process may use this function.
* *
* To use named databases (with name != NULL), #mdb_env_set_maxdbs() * To use named databases (with name != NULL), #mdb_env_set_maxdbs()
* must be called before opening the environment. Database names are * must be called before opening the environment.
* keys in the unnamed database, and may be read but not written.
*
* @param[in] txn A transaction handle returned by #mdb_txn_begin() * @param[in] txn A transaction handle returned by #mdb_txn_begin()
* @param[in] name The name of the database to open. If only a single * @param[in] name The name of the database to open. If only a single
* database is needed in the environment, this value may be NULL. * database is needed in the environment, this value may be NULL.
@ -1103,9 +826,9 @@ int mdb_txn_renew(MDB_txn *txn);
* keys may have multiple data items, stored in sorted order.) By default * keys may have multiple data items, stored in sorted order.) By default
* keys must be unique and may have only a single data item. * keys must be unique and may have only a single data item.
* <li>#MDB_INTEGERKEY * <li>#MDB_INTEGERKEY
* Keys are binary integers in native byte order, either unsigned int * Keys are binary integers in native byte order. Setting this option
* or size_t, and will be sorted as such. * requires all keys to be the same size, typically sizeof(int)
* The keys must all be of the same size. * or sizeof(size_t).
* <li>#MDB_DUPFIXED * <li>#MDB_DUPFIXED
* This flag may only be used in combination with #MDB_DUPSORT. This option * This flag may only be used in combination with #MDB_DUPSORT. This option
* tells the library that the data items for this database are all the same * tells the library that the data items for this database are all the same
@ -1113,8 +836,8 @@ int mdb_txn_renew(MDB_txn *txn);
* all data items are the same size, the #MDB_GET_MULTIPLE and #MDB_NEXT_MULTIPLE * all data items are the same size, the #MDB_GET_MULTIPLE and #MDB_NEXT_MULTIPLE
* cursor operations may be used to retrieve multiple items at once. * cursor operations may be used to retrieve multiple items at once.
* <li>#MDB_INTEGERDUP * <li>#MDB_INTEGERDUP
* This option specifies that duplicate data items are binary integers, * This option specifies that duplicate data items are also integers, and
* similar to #MDB_INTEGERKEY keys. * should be sorted as such.
* <li>#MDB_REVERSEDUP * <li>#MDB_REVERSEDUP
* This option specifies that duplicate data items should be compared as * This option specifies that duplicate data items should be compared as
* strings in reverse order. * strings in reverse order.
@ -1147,40 +870,25 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *d
*/ */
int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat); int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat);
/** @brief Retrieve the DB flags for a database handle. /** @brief Close a database handle.
*
* @param[in] txn A transaction handle returned by #mdb_txn_begin()
* @param[in] dbi A database handle returned by #mdb_dbi_open()
* @param[out] flags Address where the flags will be returned.
* @return A non-zero error value on failure and 0 on success.
*/
int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags);
/** @brief Close a database handle. Normally unnecessary. Use with care:
* *
* This call is not mutex protected. Handles should only be closed by * This call is not mutex protected. Handles should only be closed by
* a single thread, and only if no other threads are going to reference * a single thread, and only if no other threads are going to reference
* the database handle or one of its cursors any further. Do not close * the database handle or one of its cursors any further. Do not close
* a handle if an existing transaction has modified its database. * a handle if an existing transaction has modified its database.
* Doing so can cause misbehavior from database corruption to errors
* like MDB_BAD_VALSIZE (since the DB name is gone).
*
* Closing a database handle is not necessary, but lets #mdb_dbi_open()
* reuse the handle value. Usually it's better to set a bigger
* #mdb_env_set_maxdbs(), unless that value would be large.
*
* @param[in] env An environment handle returned by #mdb_env_create() * @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] dbi A database handle returned by #mdb_dbi_open() * @param[in] dbi A database handle returned by #mdb_dbi_open()
*/ */
void mdb_dbi_close(MDB_env *env, MDB_dbi dbi); void mdb_dbi_close(MDB_env *env, MDB_dbi dbi);
/** @brief Empty or delete+close a database. /** @brief Delete a database and/or free all its pages.
* *
* See #mdb_dbi_close() for restrictions about closing the DB handle. * If the \b del parameter is 1, the DB handle will be closed
* and the DB will be deleted.
* @param[in] txn A transaction handle returned by #mdb_txn_begin() * @param[in] txn A transaction handle returned by #mdb_txn_begin()
* @param[in] dbi A database handle returned by #mdb_dbi_open() * @param[in] dbi A database handle returned by #mdb_dbi_open()
* @param[in] del 0 to empty the DB, 1 to delete it from the * @param[in] del 1 to delete the DB from the environment,
* environment and close the DB handle. * 0 to just free its pages.
* @return A non-zero error value on failure and 0 on success. * @return A non-zero error value on failure and 0 on success.
*/ */
int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del); int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del);
@ -1322,13 +1030,11 @@ int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data);
* reserved space, which the caller can fill in later - before * reserved space, which the caller can fill in later - before
* the next update operation or the transaction ends. This saves * the next update operation or the transaction ends. This saves
* an extra memcpy if the data is being generated later. * an extra memcpy if the data is being generated later.
* LMDB does nothing else with this memory; the caller is expected
* to modify all of the space requested. This flag must not be
* specified if the database was opened with #MDB_DUPSORT.
* <li>#MDB_APPEND - append the given key/data pair to the end of the * <li>#MDB_APPEND - append the given key/data pair to the end of the
* database. This option allows fast bulk loading when keys are * database. No key comparisons are performed. This option allows
* already known to be in the correct order. Loading unsorted keys * fast bulk loading when keys are already known to be in the
* with this flag will cause a #MDB_KEYEXIST error. * correct order. Loading unsorted keys with this flag will cause
* data corruption.
* <li>#MDB_APPENDDUP - as above, but for sorted dup data. * <li>#MDB_APPENDDUP - as above, but for sorted dup data.
* </ul> * </ul>
* @return A non-zero error value on failure and 0 on success. Some possible * @return A non-zero error value on failure and 0 on success. Some possible
@ -1454,21 +1160,18 @@ int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data,
/** @brief Store by cursor. /** @brief Store by cursor.
* *
* This function stores key/data pairs into the database. * This function stores key/data pairs into the database.
* The cursor is positioned at the new item, or on failure usually near it. * If the function fails for any reason, the state of the cursor will be
* @note Earlier documentation incorrectly said errors would leave the * unchanged. If the function succeeds and an item is inserted into the
* state of the cursor unchanged. * database, the cursor is always positioned to refer to the newly inserted item.
* @param[in] cursor A cursor handle returned by #mdb_cursor_open() * @param[in] cursor A cursor handle returned by #mdb_cursor_open()
* @param[in] key The key operated on. * @param[in] key The key operated on.
* @param[in] data The data operated on. * @param[in] data The data operated on.
* @param[in] flags Options for this operation. This parameter * @param[in] flags Options for this operation. This parameter
* must be set to 0 or one of the values described here. * must be set to 0 or one of the values described here.
* <ul> * <ul>
* <li>#MDB_CURRENT - replace the item at the current cursor position. * <li>#MDB_CURRENT - overwrite the data of the key/data pair to which
* The \b key parameter must still be provided, and must match it. * the cursor refers with the specified data item. The \b key
* If using sorted duplicates (#MDB_DUPSORT) the data item must still * parameter is ignored.
* sort into the same place. This is intended to be used when the
* new data is the same size as the old. Otherwise it will simply
* perform a delete of the old record followed by an insert.
* <li>#MDB_NODUPDATA - enter the new key/data pair only if it does not * <li>#MDB_NODUPDATA - enter the new key/data pair only if it does not
* already appear in the database. This flag may only be specified * already appear in the database. This flag may only be specified
* if the database was opened with #MDB_DUPSORT. The function will * if the database was opened with #MDB_DUPSORT. The function will
@ -1480,33 +1183,21 @@ int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data,
* the database supports duplicates (#MDB_DUPSORT). * the database supports duplicates (#MDB_DUPSORT).
* <li>#MDB_RESERVE - reserve space for data of the given size, but * <li>#MDB_RESERVE - reserve space for data of the given size, but
* don't copy the given data. Instead, return a pointer to the * don't copy the given data. Instead, return a pointer to the
* reserved space, which the caller can fill in later - before * reserved space, which the caller can fill in later. This saves
* the next update operation or the transaction ends. This saves * an extra memcpy if the data is being generated later.
* an extra memcpy if the data is being generated later. This flag
* must not be specified if the database was opened with #MDB_DUPSORT.
* <li>#MDB_APPEND - append the given key/data pair to the end of the * <li>#MDB_APPEND - append the given key/data pair to the end of the
* database. No key comparisons are performed. This option allows * database. No key comparisons are performed. This option allows
* fast bulk loading when keys are already known to be in the * fast bulk loading when keys are already known to be in the
* correct order. Loading unsorted keys with this flag will cause * correct order. Loading unsorted keys with this flag will cause
* a #MDB_KEYEXIST error. * data corruption.
* <li>#MDB_APPENDDUP - as above, but for sorted dup data. * <li>#MDB_APPENDDUP - as above, but for sorted dup data.
* <li>#MDB_MULTIPLE - store multiple contiguous data elements in a
* single request. This flag may only be specified if the database
* was opened with #MDB_DUPFIXED. The \b data argument must be an
* array of two MDB_vals. The mv_size of the first MDB_val must be
* the size of a single data element. The mv_data of the first MDB_val
* must point to the beginning of the array of contiguous data elements.
* The mv_size of the second MDB_val must be the count of the number
* of data elements to store. On return this field will be set to
* the count of the number of elements actually written. The mv_data
* of the second MDB_val is unused.
* </ul> * </ul>
* @return A non-zero error value on failure and 0 on success. Some possible * @return A non-zero error value on failure and 0 on success. Some possible
* errors are: * errors are:
* <ul> * <ul>
* <li>#MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). * <li>#MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize().
* <li>#MDB_TXN_FULL - the transaction has too many dirty pages. * <li>#MDB_TXN_FULL - the transaction has too many dirty pages.
* <li>EACCES - an attempt was made to write in a read-only transaction. * <li>EACCES - an attempt was made to modify a read-only database.
* <li>EINVAL - an invalid parameter was specified. * <li>EINVAL - an invalid parameter was specified.
* </ul> * </ul>
*/ */
@ -1526,7 +1217,7 @@ int mdb_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data,
* @return A non-zero error value on failure and 0 on success. Some possible * @return A non-zero error value on failure and 0 on success. Some possible
* errors are: * errors are:
* <ul> * <ul>
* <li>EACCES - an attempt was made to write in a read-only transaction. * <li>EACCES - an attempt was made to modify a read-only database.
* <li>EINVAL - an invalid parameter was specified. * <li>EINVAL - an invalid parameter was specified.
* </ul> * </ul>
*/ */
@ -1544,7 +1235,7 @@ int mdb_cursor_del(MDB_cursor *cursor, unsigned int flags);
* <li>EINVAL - cursor is not initialized, or an invalid parameter was specified. * <li>EINVAL - cursor is not initialized, or an invalid parameter was specified.
* </ul> * </ul>
*/ */
int mdb_cursor_count(MDB_cursor *cursor, mdb_size_t *countp); int mdb_cursor_count(MDB_cursor *cursor, size_t *countp);
/** @brief Compare two data items according to a particular database. /** @brief Compare two data items according to a particular database.
* *
@ -1569,42 +1260,11 @@ int mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b);
* @return < 0 if a < b, 0 if a == b, > 0 if a > b * @return < 0 if a < b, 0 if a == b, > 0 if a > b
*/ */
int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b);
/** @brief A callback function used to print a message from the library.
*
* @param[in] msg The string to be printed.
* @param[in] ctx An arbitrary context pointer for the callback.
* @return < 0 on failure, >= 0 on success.
*/
typedef int (MDB_msg_func)(const char *msg, void *ctx);
/** @brief Dump the entries in the reader lock table.
*
* @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] func A #MDB_msg_func function
* @param[in] ctx Anything the message function needs
* @return < 0 on failure, >= 0 on success.
*/
int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx);
/** @brief Check for stale entries in the reader lock table.
*
* @param[in] env An environment handle returned by #mdb_env_create()
* @param[out] dead Number of stale slots that were cleared
* @return 0 on success, non-zero on failure.
*/
int mdb_reader_check(MDB_env *env, int *dead);
/** @} */ /** @} */
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
/** @page tools LMDB Command Line Tools
The following describes the command line tools that are available for LMDB.
\li \ref mdb_copy_1
\li \ref mdb_dump_1
\li \ref mdb_load_1
\li \ref mdb_stat_1
*/
#endif /* _LMDB_H_ */ #endif /* _LMDB_H_ */
/* * http://gitorious.org/mdb/mdb/blobs/raw/b389341b4b2413804726276d01676a6a9d05346f/libraries/liblmdb/lmdb.h */

View file

@ -2,7 +2,7 @@
* This file is part of LMDB - Erlang Lightning MDB API * This file is part of LMDB - Erlang Lightning MDB API
* *
* Copyright (c) 2012 by Aleph Archives. All rights reserved. * Copyright (c) 2012 by Aleph Archives. All rights reserved.
* Copyright (c) 2013 by Basho Technologies, Inc. All rights reserved. %% Copyright (c) 2013 by Basho Technologies, Inc. All rights reserved.
* *
* ------------------------------------------------------------------------- * -------------------------------------------------------------------------
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@ -28,7 +28,6 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <inttypes.h>
#include <errno.h> #include <errno.h>
#include <sys/param.h> #include <sys/param.h>
#include <erl_nif.h> #include <erl_nif.h>
@ -36,14 +35,22 @@
#include "common.h" #include "common.h"
#include "async_nif.h" #include "async_nif.h"
#include "stats.h"
#include "lmdb.h" #include "lmdb.h"
STAT_DECL(lmdb_get, 1000);
STAT_DECL(lmdb_put, 1000);
STAT_DECL(lmdb_del, 1000);
STAT_DECL(lmdb_upd, 1000);
static ErlNifResourceType *lmdb_RESOURCE; static ErlNifResourceType *lmdb_RESOURCE;
struct lmdb { struct lmdb {
MDB_env *env; MDB_env *env;
MDB_txn *txn;
MDB_cursor *cursor;
MDB_dbi dbi; MDB_dbi dbi;
STAT_DEF(lmdb_get);
STAT_DEF(lmdb_put);
STAT_DEF(lmdb_del);
STAT_DEF(lmdb_upd);
}; };
struct lmdb_priv_data { struct lmdb_priv_data {
@ -76,9 +83,6 @@ static ERL_NIF_TERM ATOM_MAP_RESIZED;
static ERL_NIF_TERM ATOM_INCOMPATIBLE; static ERL_NIF_TERM ATOM_INCOMPATIBLE;
static ERL_NIF_TERM ATOM_BAD_RSLOT; static ERL_NIF_TERM ATOM_BAD_RSLOT;
static ERL_NIF_TERM ATOM_TXN_STARTED;
static ERL_NIF_TERM ATOM_TXN_NOT_STARTED;
#define CHECK(expr, label) \ #define CHECK(expr, label) \
if (MDB_SUCCESS != (ret = (expr))) { \ if (MDB_SUCCESS != (ret = (expr))) { \
DPRINTF("CHECK(\"%s\") failed \"%s\" at %s:%d in %s()\n", \ DPRINTF("CHECK(\"%s\") failed \"%s\" at %s:%d in %s()\n", \
@ -102,7 +106,7 @@ static ERL_NIF_TERM ATOM_TXN_NOT_STARTED;
static ERL_NIF_TERM static ERL_NIF_TERM
__strerror_term(ErlNifEnv* env, int err) __strerror_term(ErlNifEnv* env, int err)
{ {
ERL_NIF_TERM term = 0; ERL_NIF_TERM term;
if (err < MDB_LAST_ERRCODE && err > MDB_KEYEXIST) { if (err < MDB_LAST_ERRCODE && err > MDB_KEYEXIST) {
switch (err) { switch (err) {
@ -209,6 +213,11 @@ ASYNC_NIF_DECL(
if ((handle = enif_alloc_resource(lmdb_RESOURCE, sizeof(struct lmdb))) == NULL) if ((handle = enif_alloc_resource(lmdb_RESOURCE, sizeof(struct lmdb))) == NULL)
FAIL_ERR(ENOMEM, err3); FAIL_ERR(ENOMEM, err3);
STAT_INIT(handle, lmdb_get);
STAT_INIT(handle, lmdb_put);
STAT_INIT(handle, lmdb_upd);
STAT_INIT(handle, lmdb_del);
CHECK(mdb_env_create(&(handle->env)), err2); CHECK(mdb_env_create(&(handle->env)), err2);
if (mdb_env_set_mapsize(handle->env, args->mapsize)) { if (mdb_env_set_mapsize(handle->env, args->mapsize)) {
@ -221,9 +230,6 @@ ASYNC_NIF_DECL(
CHECK(mdb_open(txn, NULL, 0, &(handle->dbi)), err1); CHECK(mdb_open(txn, NULL, 0, &(handle->dbi)), err1);
CHECK(mdb_txn_commit(txn), err1); CHECK(mdb_txn_commit(txn), err1);
handle->txn = NULL;
handle->cursor = NULL;
ERL_NIF_TERM term = enif_make_resource(env, handle); ERL_NIF_TERM term = enif_make_resource(env, handle);
enif_release_resource(handle); enif_release_resource(handle);
ASYNC_NIF_REPLY(enif_make_tuple(env, 2, ATOM_OK, term)); ASYNC_NIF_REPLY(enif_make_tuple(env, 2, ATOM_OK, term));
@ -265,7 +271,15 @@ ASYNC_NIF_DECL(
}, },
{ // work { // work
STAT_PRINT(args->handle, lmdb_get, "lmdb");
STAT_PRINT(args->handle, lmdb_put, "lmdb");
STAT_PRINT(args->handle, lmdb_del, "lmdb");
STAT_PRINT(args->handle, lmdb_upd, "lmdb");
mdb_env_close(args->handle->env); mdb_env_close(args->handle->env);
STAT_RESET(args->handle, lmdb_get);
STAT_RESET(args->handle, lmdb_put);
STAT_RESET(args->handle, lmdb_del);
STAT_RESET(args->handle, lmdb_upd);
args->handle->env = NULL; args->handle->env = NULL;
ASYNC_NIF_REPLY(ATOM_OK); ASYNC_NIF_REPLY(ATOM_OK);
return; return;
@ -301,6 +315,7 @@ ASYNC_NIF_DECL(
} }
if (!args->handle->env) if (!args->handle->env)
ASYNC_NIF_RETURN_BADARG(); ASYNC_NIF_RETURN_BADARG();
STAT_TICK(args->handle, lmdb_put);
enif_keep_resource((void*)args->handle); enif_keep_resource((void*)args->handle);
args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[1]); args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[1]);
args->val = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); args->val = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]);
@ -328,11 +343,7 @@ ASYNC_NIF_DECL(
mkey.mv_data = key.data; mkey.mv_data = key.data;
mdata.mv_size = val.size; mdata.mv_size = val.size;
mdata.mv_data = val.data; mdata.mv_data = val.data;
if(args->handle->txn == NULL) { CHECK(mdb_txn_begin(args->handle->env, NULL, 0, & txn), err2);
CHECK(mdb_txn_begin(args->handle->env, NULL, 0, & txn), err2);
} else {
txn = args->handle->txn;
}
ret = mdb_put(txn, args->handle->dbi, &mkey, &mdata, MDB_NOOVERWRITE); ret = mdb_put(txn, args->handle->dbi, &mkey, &mdata, MDB_NOOVERWRITE);
if (MDB_KEYEXIST == ret) { if (MDB_KEYEXIST == ret) {
@ -342,8 +353,8 @@ ASYNC_NIF_DECL(
if (ret != 0) if (ret != 0)
FAIL_ERR(ret, err1); FAIL_ERR(ret, err1);
if(args->handle->txn == NULL) CHECK(mdb_txn_commit(txn), err1);
CHECK(mdb_txn_commit(txn), err1); STAT_TOCK(args->handle, lmdb_put);
ASYNC_NIF_REPLY(ATOM_OK); ASYNC_NIF_REPLY(ATOM_OK);
return; return;
@ -358,6 +369,7 @@ ASYNC_NIF_DECL(
enif_release_resource((void*)args->handle); enif_release_resource((void*)args->handle);
}); });
/** /**
* Update an existing value indexed by key. * Update an existing value indexed by key.
* *
@ -383,6 +395,7 @@ ASYNC_NIF_DECL(
} }
if (!args->handle->env) if (!args->handle->env)
ASYNC_NIF_RETURN_BADARG(); ASYNC_NIF_RETURN_BADARG();
STAT_TICK(args->handle, lmdb_upd);
enif_keep_resource((void*)args->handle); enif_keep_resource((void*)args->handle);
args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[1]); args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[1]);
args->val = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]); args->val = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]);
@ -411,16 +424,10 @@ ASYNC_NIF_DECL(
mdata.mv_size = val.size; mdata.mv_size = val.size;
mdata.mv_data = val.data; mdata.mv_data = val.data;
if(args->handle->txn == NULL) { CHECK(mdb_txn_begin(args->handle->env, NULL, 0, & txn), err2);
CHECK(mdb_txn_begin(args->handle->env, NULL, 0, & txn), err2);
} else {
txn = args->handle->txn;
}
CHECK(mdb_put(txn, args->handle->dbi, &mkey, &mdata, 0), err1); CHECK(mdb_put(txn, args->handle->dbi, &mkey, &mdata, 0), err1);
CHECK(mdb_txn_commit(txn), err1);
if(args->handle->txn == NULL) STAT_TOCK(args->handle, lmdb_upd);
CHECK(mdb_txn_commit(txn), err1);
ASYNC_NIF_REPLY(ATOM_OK); ASYNC_NIF_REPLY(ATOM_OK);
return; return;
@ -458,6 +465,7 @@ ASYNC_NIF_DECL(
} }
if (!args->handle->env) if (!args->handle->env)
ASYNC_NIF_RETURN_BADARG(); ASYNC_NIF_RETURN_BADARG();
STAT_TICK(args->handle, lmdb_get);
enif_keep_resource((void*)args->handle); enif_keep_resource((void*)args->handle);
args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[1]); args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[1]);
}, },
@ -480,15 +488,10 @@ ASYNC_NIF_DECL(
mkey.mv_size = key.size; mkey.mv_size = key.size;
mkey.mv_data = key.data; mkey.mv_data = key.data;
if(args->handle->txn == NULL) { CHECK(mdb_txn_begin(args->handle->env, NULL, 0, &txn), err);
CHECK(mdb_txn_begin(args->handle->env, NULL, 0, & txn), err);
} else {
txn = args->handle->txn;
}
ret = mdb_get(txn, args->handle->dbi, &mkey, &mdata); ret = mdb_get(txn, args->handle->dbi, &mkey, &mdata);
if(args->handle->txn == NULL) mdb_txn_abort(txn);
mdb_txn_abort(txn);
if (MDB_NOTFOUND == ret) { if (MDB_NOTFOUND == ret) {
ASYNC_NIF_REPLY(ATOM_NOT_FOUND); ASYNC_NIF_REPLY(ATOM_NOT_FOUND);
return; return;
@ -502,6 +505,7 @@ ASYNC_NIF_DECL(
FAIL_ERR(ENOMEM, err); FAIL_ERR(ENOMEM, err);
memcpy(bin, mdata.mv_data, mdata.mv_size); memcpy(bin, mdata.mv_data, mdata.mv_size);
STAT_TOCK(args->handle, lmdb_get);
ASYNC_NIF_REPLY(enif_make_tuple(env, 2, ATOM_OK, val)); ASYNC_NIF_REPLY(enif_make_tuple(env, 2, ATOM_OK, val));
return; return;
@ -537,6 +541,7 @@ ASYNC_NIF_DECL(
} }
if (!args->handle->env) if (!args->handle->env)
ASYNC_NIF_RETURN_BADARG(); ASYNC_NIF_RETURN_BADARG();
STAT_TICK(args->handle, lmdb_del);
enif_keep_resource((void*)args->handle); enif_keep_resource((void*)args->handle);
args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[1]); args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[1]);
}, },
@ -556,22 +561,17 @@ ASYNC_NIF_DECL(
mkey.mv_size = key.size; mkey.mv_size = key.size;
mkey.mv_data = key.data; mkey.mv_data = key.data;
if(args->handle->txn == NULL) { CHECK(mdb_txn_begin(args->handle->env, NULL, 0, & txn), err);
CHECK(mdb_txn_begin(args->handle->env, NULL, 0, & txn), err);
} else {
txn = args->handle->txn;
}
ret = mdb_del(txn, args->handle->dbi, &mkey, NULL); ret = mdb_del(txn, args->handle->dbi, &mkey, NULL);
if(MDB_NOTFOUND == ret) { if(MDB_NOTFOUND == ret) {
if(args->handle->txn == NULL) mdb_txn_abort(txn);
mdb_txn_abort(txn);
ASYNC_NIF_REPLY(ATOM_NOT_FOUND); ASYNC_NIF_REPLY(ATOM_NOT_FOUND);
return; return;
} }
if(args->handle->txn == NULL)
CHECK(mdb_txn_commit(txn), err); CHECK(mdb_txn_commit(txn), err);
STAT_TOCK(args->handle, lmdb_del);
ASYNC_NIF_REPLY(ATOM_OK); ASYNC_NIF_REPLY(ATOM_OK);
return; return;
@ -630,109 +630,7 @@ ASYNC_NIF_DECL(
enif_release_resource((void*)args->handle); enif_release_resource((void*)args->handle);
}); });
ASYNC_NIF_DECL(
lmdb_txn_begin,
{ // struct
struct lmdb *handle;
},
{ // pre
if (!(argc == 1 &&
enif_get_resource(env, argv[0], lmdb_RESOURCE, (void**)&args->handle))) {
ASYNC_NIF_RETURN_BADARG();
}
if (!args->handle->env)
ASYNC_NIF_RETURN_BADARG();
enif_keep_resource((void*)args->handle);
},
{ // work
ERL_NIF_TERM err;
int ret;
if(args->handle->txn == NULL) {
CHECK(mdb_txn_begin(args->handle->env, NULL, 0, &(args->handle->txn)), err2);
ASYNC_NIF_REPLY(ATOM_OK);
} else
ASYNC_NIF_REPLY(enif_make_tuple(env, 2, ATOM_ERROR, ATOM_TXN_STARTED));
return;
err2:
ASYNC_NIF_REPLY(err);
return;
},
{ // post
enif_release_resource((void*)args->handle);
});
ASYNC_NIF_DECL(
lmdb_txn_commit,
{ // struct
struct lmdb *handle;
},
{ // pre
if (!(argc == 1 &&
enif_get_resource(env, argv[0], lmdb_RESOURCE, (void**)&args->handle))) {
ASYNC_NIF_RETURN_BADARG();
}
if (!args->handle->env)
ASYNC_NIF_RETURN_BADARG();
enif_keep_resource((void*)args->handle);
},
{ // work
ERL_NIF_TERM err;
int ret;
if(args->handle->txn != NULL) {
CHECK(mdb_txn_commit(args->handle->txn), err2);
args->handle->txn = NULL;
ASYNC_NIF_REPLY(ATOM_OK);
} else
ASYNC_NIF_REPLY(enif_make_tuple(env, 2, ATOM_ERROR, ATOM_TXN_NOT_STARTED));
return;
err2:
ASYNC_NIF_REPLY(err);
return;
},
{ // post
enif_release_resource((void*)args->handle);
});
ASYNC_NIF_DECL(
lmdb_txn_abort,
{ // struct
struct lmdb *handle;
},
{ // pre
if (!(argc == 1 &&
enif_get_resource(env, argv[0], lmdb_RESOURCE, (void**)&args->handle))) {
ASYNC_NIF_RETURN_BADARG();
}
if (!args->handle->env)
ASYNC_NIF_RETURN_BADARG();
enif_keep_resource((void*)args->handle);
},
{ // work
if(args->handle->txn != NULL) {
mdb_txn_abort(args->handle->txn);
args->handle->txn = NULL;
ASYNC_NIF_REPLY(ATOM_OK);
} else
ASYNC_NIF_REPLY(enif_make_tuple(env, 2, ATOM_ERROR, ATOM_TXN_NOT_STARTED));
return;
},
{ // post
enif_release_resource((void*)args->handle);
});
static int lmdb_load(ErlNifEnv* env, void** priv_data, ERL_NIF_TERM load_info) static int lmdb_load(ErlNifEnv* env, void** priv_data, ERL_NIF_TERM load_info)
{ {
@ -747,7 +645,7 @@ static int lmdb_load(ErlNifEnv* env, void** priv_data, ERL_NIF_TERM load_info)
/* Note: !!! the first element of our priv_data struct *must* be the /* Note: !!! the first element of our priv_data struct *must* be the
pointer to the async_nif's private data which we set here. */ pointer to the async_nif's private data which we set here. */
ASYNC_NIF_LOAD(lmdb, env, priv->async_nif_priv); ASYNC_NIF_LOAD(lmdb, priv->async_nif_priv);
if (!priv) if (!priv)
return ENOMEM; return ENOMEM;
*priv_data = priv; *priv_data = priv;
@ -773,9 +671,6 @@ static int lmdb_load(ErlNifEnv* env, void** priv_data, ERL_NIF_TERM load_info)
ATOM_INCOMPATIBLE = enif_make_atom(env, "incompatible"); ATOM_INCOMPATIBLE = enif_make_atom(env, "incompatible");
ATOM_BAD_RSLOT = enif_make_atom(env, "bad_rslot"); ATOM_BAD_RSLOT = enif_make_atom(env, "bad_rslot");
ATOM_TXN_STARTED = enif_make_atom(env, "txn_started");
ATOM_TXN_NOT_STARTED = enif_make_atom(env, "txn_not_started");
lmdb_RESOURCE = enif_open_resource_type(env, NULL, "lmdb_resource", lmdb_RESOURCE = enif_open_resource_type(env, NULL, "lmdb_resource",
NULL, flags, NULL); NULL, flags, NULL);
return (0); return (0);
@ -816,15 +711,7 @@ static ErlNifFunc nif_funcs [] = {
{"get", 3, lmdb_get}, {"get", 3, lmdb_get},
{"del", 3, lmdb_del}, {"del", 3, lmdb_del},
{"update", 4, lmdb_update}, {"update", 4, lmdb_update},
{"drop", 2, lmdb_drop}, {"drop", 2, lmdb_drop}
{"txn_begin", 2, lmdb_txn_begin},
{"txn_commit", 2, lmdb_txn_commit},
{"txn_abort", 2, lmdb_txn_abort}/*,
{"cursor_open", 2, lmdb_cursor_open},
{"cursor_close", 2, lmdb_cursor_close} */
}; };
/* driver entry point */ /* driver entry point */

View file

@ -3,7 +3,7 @@
/* $OpenLDAP$ */ /* $OpenLDAP$ */
/* This work is part of OpenLDAP Software <http://www.openldap.org/>. /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
* *
* Copyright 2000-2016 The OpenLDAP Foundation. * Copyright 2000-2013 The OpenLDAP Foundation.
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@ -20,9 +20,10 @@
#include <stdlib.h> #include <stdlib.h>
#include <errno.h> #include <errno.h>
#include <sys/types.h> #include <sys/types.h>
#include <assert.h>
#include "midl.h" #include "midl.h"
/** @defgroup internal LMDB Internals /** @defgroup internal MDB Internals
* @{ * @{
*/ */
/** @defgroup idls ID List Management /** @defgroup idls ID List Management
@ -30,7 +31,8 @@
*/ */
#define CMP(x,y) ( (x) < (y) ? -1 : (x) > (y) ) #define CMP(x,y) ( (x) < (y) ? -1 : (x) > (y) )
unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id ) #if 0 /* superseded by append/sort */
static unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id )
{ {
/* /*
* binary search of id in ids * binary search of id in ids
@ -65,11 +67,21 @@ unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id )
return cursor; return cursor;
} }
#if 0 /* superseded by append/sort */
int mdb_midl_insert( MDB_IDL ids, MDB_ID id ) int mdb_midl_insert( MDB_IDL ids, MDB_ID id )
{ {
unsigned x, i; unsigned x, i;
if (MDB_IDL_IS_RANGE( ids )) {
/* if already in range, treat as a dup */
if (id >= MDB_IDL_RANGE_FIRST(ids) && id <= MDB_IDL_RANGE_LAST(ids))
return -1;
if (id < MDB_IDL_RANGE_FIRST(ids))
ids[1] = id;
else if (id > MDB_IDL_RANGE_LAST(ids))
ids[2] = id;
return 0;
}
x = mdb_midl_search( ids, id ); x = mdb_midl_search( ids, id );
assert( x > 0 ); assert( x > 0 );
@ -85,9 +97,15 @@ int mdb_midl_insert( MDB_IDL ids, MDB_ID id )
} }
if ( ++ids[0] >= MDB_IDL_DB_MAX ) { if ( ++ids[0] >= MDB_IDL_DB_MAX ) {
/* no room */ if( id < ids[1] ) {
--ids[0]; ids[1] = id;
return -2; ids[2] = ids[ids[0]-1];
} else if ( ids[ids[0]-1] < id ) {
ids[2] = id;
} else {
ids[2] = ids[ids[0]-1];
}
ids[0] = MDB_NOID;
} else { } else {
/* insert id */ /* insert id */
@ -103,10 +121,8 @@ int mdb_midl_insert( MDB_IDL ids, MDB_ID id )
MDB_IDL mdb_midl_alloc(int num) MDB_IDL mdb_midl_alloc(int num)
{ {
MDB_IDL ids = malloc((num+2) * sizeof(MDB_ID)); MDB_IDL ids = malloc((num+2) * sizeof(MDB_ID));
if (ids) { if (ids)
*ids++ = num; *ids++ = num;
*ids = 0;
}
return ids; return ids;
} }
@ -116,18 +132,19 @@ void mdb_midl_free(MDB_IDL ids)
free(ids-1); free(ids-1);
} }
void mdb_midl_shrink( MDB_IDL *idp ) int mdb_midl_shrink( MDB_IDL *idp )
{ {
MDB_IDL ids = *idp; MDB_IDL ids = *idp;
if (*(--ids) > MDB_IDL_UM_MAX && if (*(--ids) > MDB_IDL_UM_MAX) {
(ids = realloc(ids, (MDB_IDL_UM_MAX+2) * sizeof(MDB_ID)))) ids = realloc(ids, (MDB_IDL_UM_MAX+1) * sizeof(MDB_ID));
{
*ids++ = MDB_IDL_UM_MAX; *ids++ = MDB_IDL_UM_MAX;
*idp = ids; *idp = ids;
return 1;
} }
return 0;
} }
static int mdb_midl_grow( MDB_IDL *idp, int num ) int mdb_midl_grow( MDB_IDL *idp, int num )
{ {
MDB_IDL idn = *idp-1; MDB_IDL idn = *idp-1;
/* grow it */ /* grow it */
@ -139,20 +156,6 @@ static int mdb_midl_grow( MDB_IDL *idp, int num )
return 0; return 0;
} }
int mdb_midl_need( MDB_IDL *idp, unsigned num )
{
MDB_IDL ids = *idp;
num += ids[0];
if (num > ids[-1]) {
num = (num + num/4 + (256 + 2)) & -256;
if (!(ids = realloc(ids-1, num * sizeof(MDB_ID))))
return ENOMEM;
*ids++ = num - 2;
*idp = ids;
}
return 0;
}
int mdb_midl_append( MDB_IDL *idp, MDB_ID id ) int mdb_midl_append( MDB_IDL *idp, MDB_ID id )
{ {
MDB_IDL ids = *idp; MDB_IDL ids = *idp;
@ -181,40 +184,10 @@ int mdb_midl_append_list( MDB_IDL *idp, MDB_IDL app )
return 0; return 0;
} }
int mdb_midl_append_range( MDB_IDL *idp, MDB_ID id, unsigned n )
{
MDB_ID *ids = *idp, len = ids[0];
/* Too big? */
if (len + n > ids[-1]) {
if (mdb_midl_grow(idp, n | MDB_IDL_UM_MAX))
return ENOMEM;
ids = *idp;
}
ids[0] = len + n;
ids += len;
while (n)
ids[n--] = id++;
return 0;
}
void mdb_midl_xmerge( MDB_IDL idl, MDB_IDL merge )
{
MDB_ID old_id, merge_id, i = merge[0], j = idl[0], k = i+j, total = k;
idl[0] = (MDB_ID)-1; /* delimiter for idl scan below */
old_id = idl[j];
while (i) {
merge_id = merge[i--];
for (; old_id < merge_id; old_id = idl[--j])
idl[k--] = old_id;
idl[k--] = merge_id;
}
idl[0] = total;
}
/* Quicksort + Insertion sort for small arrays */ /* Quicksort + Insertion sort for small arrays */
#define SMALL 8 #define SMALL 8
#define MIDL_SWAP(a,b) { itmp=(a); (a)=(b); (b)=itmp; } #define SWAP(a,b) { itmp=(a); (a)=(b); (b)=itmp; }
void void
mdb_midl_sort( MDB_IDL ids ) mdb_midl_sort( MDB_IDL ids )
@ -242,15 +215,15 @@ mdb_midl_sort( MDB_IDL ids )
l = istack[jstack--]; l = istack[jstack--];
} else { } else {
k = (l + ir) >> 1; /* Choose median of left, center, right */ k = (l + ir) >> 1; /* Choose median of left, center, right */
MIDL_SWAP(ids[k], ids[l+1]); SWAP(ids[k], ids[l+1]);
if (ids[l] < ids[ir]) { if (ids[l] < ids[ir]) {
MIDL_SWAP(ids[l], ids[ir]); SWAP(ids[l], ids[ir]);
} }
if (ids[l+1] < ids[ir]) { if (ids[l+1] < ids[ir]) {
MIDL_SWAP(ids[l+1], ids[ir]); SWAP(ids[l+1], ids[ir]);
} }
if (ids[l] < ids[l+1]) { if (ids[l] < ids[l+1]) {
MIDL_SWAP(ids[l], ids[l+1]); SWAP(ids[l], ids[l+1]);
} }
i = l+1; i = l+1;
j = ir; j = ir;
@ -259,7 +232,7 @@ mdb_midl_sort( MDB_IDL ids )
do i++; while(ids[i] > a); do i++; while(ids[i] > a);
do j--; while(ids[j] < a); do j--; while(ids[j] < a);
if (j < i) break; if (j < i) break;
MIDL_SWAP(ids[i],ids[j]); SWAP(ids[i],ids[j]);
} }
ids[l+1] = ids[j]; ids[l+1] = ids[j];
ids[j] = a; ids[j] = a;
@ -317,6 +290,7 @@ int mdb_mid2l_insert( MDB_ID2L ids, MDB_ID2 *id )
unsigned x, i; unsigned x, i;
x = mdb_mid2l_search( ids, id->mid ); x = mdb_mid2l_search( ids, id->mid );
assert( x > 0 );
if( x < 1 ) { if( x < 1 ) {
/* internal error */ /* internal error */
@ -354,67 +328,7 @@ int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id )
return 0; return 0;
} }
#ifdef MDB_VL32
unsigned mdb_mid3l_search( MDB_ID3L ids, MDB_ID id )
{
/*
* binary search of id in ids
* if found, returns position of id
* if not found, returns first position greater than id
*/
unsigned base = 0;
unsigned cursor = 1;
int val = 0;
unsigned n = (unsigned)ids[0].mid;
while( 0 < n ) {
unsigned pivot = n >> 1;
cursor = base + pivot + 1;
val = CMP( id, ids[cursor].mid );
if( val < 0 ) {
n = pivot;
} else if ( val > 0 ) {
base = cursor;
n -= pivot + 1;
} else {
return cursor;
}
}
if( val > 0 ) {
++cursor;
}
return cursor;
}
int mdb_mid3l_insert( MDB_ID3L ids, MDB_ID3 *id )
{
unsigned x, i;
x = mdb_mid3l_search( ids, id->mid );
if( x < 1 ) {
/* internal error */
return -2;
}
if ( x <= ids[0].mid && ids[x].mid == id->mid ) {
/* duplicate */
return -1;
}
/* insert id */
ids[0].mid++;
for (i=(unsigned)ids[0].mid; i>x; i--)
ids[i] = ids[i-1];
ids[x] = *id;
return 0;
}
#endif /* MDB_VL32 */
/** @} */ /** @} */
/** @} */ /** @} */
/* http://gitorious.org/mdb/mdb/blobs/raw/mdb.master/libraries/liblmdb/midl.c */

View file

@ -1,5 +1,5 @@
/** @file midl.h /** @file midl.h
* @brief LMDB ID List header file. * @brief mdb ID List header file.
* *
* This file was originally part of back-bdb but has been * This file was originally part of back-bdb but has been
* modified for use in libmdb. Most of the macros defined * modified for use in libmdb. Most of the macros defined
@ -11,7 +11,7 @@
/* $OpenLDAP$ */ /* $OpenLDAP$ */
/* This work is part of OpenLDAP Software <http://www.openldap.org/>. /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
* *
* Copyright 2000-2016 The OpenLDAP Foundation. * Copyright 2000-2013 The OpenLDAP Foundation.
* All rights reserved. * All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@ -27,27 +27,22 @@
#define _MDB_MIDL_H_ #define _MDB_MIDL_H_
#include <stddef.h> #include <stddef.h>
#include <inttypes.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
/** @defgroup internal LMDB Internals /** @defgroup internal MDB Internals
* @{ * @{
*/ */
/** @defgroup idls ID List Management /** @defgroup idls ID List Management
* @{ * @{
*/ */
/** A generic unsigned ID number. These were entryIDs in back-bdb. /** A generic ID number. These were entryIDs in back-bdb.
* Preferably it should have the same size as a pointer. * Preferably it should have the same size as a pointer.
*/ */
#ifdef MDB_VL32
typedef uint64_t MDB_ID;
#else
typedef size_t MDB_ID; typedef size_t MDB_ID;
#endif
/** An IDL is an ID List, a sorted array of IDs. The first /** An IDL is an ID List, a sorted array of IDs. The first
* element of the array is a counter for how many actual * element of the array is a counter for how many actual
@ -57,41 +52,67 @@ typedef size_t MDB_ID;
*/ */
typedef MDB_ID *MDB_IDL; typedef MDB_ID *MDB_IDL;
#define MDB_NOID (~(MDB_ID)0)
/* IDL sizes - likely should be even bigger /* IDL sizes - likely should be even bigger
* limiting factors: sizeof(ID), thread stack size * limiting factors: sizeof(ID), thread stack size
*/ */
#ifdef MDB_VL32
#define MDB_IDL_LOGN 10 /* DB_SIZE is 2^10, UM_SIZE is 2^11 */
#else
#define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ #define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */
#endif
#define MDB_IDL_DB_SIZE (1<<MDB_IDL_LOGN) #define MDB_IDL_DB_SIZE (1<<MDB_IDL_LOGN)
#define MDB_IDL_UM_SIZE (1<<(MDB_IDL_LOGN+1)) #define MDB_IDL_UM_SIZE (1<<(MDB_IDL_LOGN+1))
#define MDB_IDL_UM_SIZEOF (MDB_IDL_UM_SIZE * sizeof(MDB_ID))
#define MDB_IDL_DB_MAX (MDB_IDL_DB_SIZE-1) #define MDB_IDL_DB_MAX (MDB_IDL_DB_SIZE-1)
#define MDB_IDL_UM_MAX (MDB_IDL_UM_SIZE-1) #define MDB_IDL_UM_MAX (MDB_IDL_UM_SIZE-1)
#define MDB_IDL_SIZEOF(ids) (((ids)[0]+1) * sizeof(MDB_ID)) #define MDB_IDL_IS_RANGE(ids) ((ids)[0] == MDB_NOID)
#define MDB_IDL_RANGE_SIZE (3)
#define MDB_IDL_RANGE_SIZEOF (MDB_IDL_RANGE_SIZE * sizeof(MDB_ID))
#define MDB_IDL_SIZEOF(ids) ((MDB_IDL_IS_RANGE(ids) \
? MDB_IDL_RANGE_SIZE : ((ids)[0]+1)) * sizeof(MDB_ID))
#define MDB_IDL_RANGE_FIRST(ids) ((ids)[1])
#define MDB_IDL_RANGE_LAST(ids) ((ids)[2])
#define MDB_IDL_RANGE( ids, f, l ) \
do { \
(ids)[0] = MDB_NOID; \
(ids)[1] = (f); \
(ids)[2] = (l); \
} while(0)
#define MDB_IDL_ZERO(ids) \
do { \
(ids)[0] = 0; \
(ids)[1] = 0; \
(ids)[2] = 0; \
} while(0)
#define MDB_IDL_IS_ZERO(ids) ( (ids)[0] == 0 ) #define MDB_IDL_IS_ZERO(ids) ( (ids)[0] == 0 )
#define MDB_IDL_IS_ALL( range, ids ) ( (ids)[0] == MDB_NOID \
&& (ids)[1] <= (range)[1] && (range)[2] <= (ids)[2] )
#define MDB_IDL_CPY( dst, src ) (memcpy( dst, src, MDB_IDL_SIZEOF( src ) )) #define MDB_IDL_CPY( dst, src ) (memcpy( dst, src, MDB_IDL_SIZEOF( src ) ))
#define MDB_IDL_ID( bdb, ids, id ) MDB_IDL_RANGE( ids, id, ((bdb)->bi_lastid) )
#define MDB_IDL_ALL( bdb, ids ) MDB_IDL_RANGE( ids, 1, ((bdb)->bi_lastid) )
#define MDB_IDL_FIRST( ids ) ( (ids)[1] ) #define MDB_IDL_FIRST( ids ) ( (ids)[1] )
#define MDB_IDL_LAST( ids ) ( (ids)[(ids)[0]] ) #define MDB_IDL_LAST( ids ) ( MDB_IDL_IS_RANGE(ids) \
? (ids)[2] : (ids)[(ids)[0]] )
/** Current max length of an #mdb_midl_alloc()ed IDL */ #define MDB_IDL_N( ids ) ( MDB_IDL_IS_RANGE(ids) \
#define MDB_IDL_ALLOCLEN( ids ) ( (ids)[-1] ) ? ((ids)[2]-(ids)[1])+1 : (ids)[0] )
/** Append ID to IDL. The IDL must be big enough. */ #if 0 /* superseded by append/sort */
#define mdb_midl_xappend(idl, id) do { \ /** Insert an ID into an IDL.
MDB_ID *xidl = (idl), xlen = ++(xidl[0]); \ * @param[in,out] ids The IDL to insert into.
xidl[xlen] = (id); \ * @param[in] id The ID to insert.
} while (0) * @return 0 on success, -1 if the ID was already present in the IDL.
/** Search for an ID in an IDL.
* @param[in] ids The IDL to search.
* @param[in] id The ID to search for.
* @return The index of the first ID greater than or equal to \b id.
*/ */
unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id ); int mdb_midl_insert( MDB_IDL ids, MDB_ID id );
#endif
/** Allocate an IDL. /** Allocate an IDL.
* Allocates memory for an IDL of the given size. * Allocates memory for an IDL of the given size.
@ -107,44 +128,32 @@ void mdb_midl_free(MDB_IDL ids);
/** Shrink an IDL. /** Shrink an IDL.
* Return the IDL to the default size if it has grown larger. * Return the IDL to the default size if it has grown larger.
* @param[in,out] idp Address of the IDL to shrink. * @param[in,out] idp Address of the IDL to shrink.
* @return 0 on no change, non-zero if shrunk.
*/ */
void mdb_midl_shrink(MDB_IDL *idp); int mdb_midl_shrink(MDB_IDL *idp);
/** Make room for num additional elements in an IDL. /** Grow an IDL.
* @param[in,out] idp Address of the IDL. * Add room for num additional elements.
* @param[in] num Number of elements to make room for. * @param[in,out] idp Address of the IDL to grow.
* @return 0 on success, ENOMEM on failure. * @param[in] num Number of elements to add.
* @return 0 on success, -1 on failure.
*/ */
int mdb_midl_need(MDB_IDL *idp, unsigned num); int mdb_midl_grow(MDB_IDL *idp, int num);
/** Append an ID onto an IDL. /** Append an ID onto an IDL.
* @param[in,out] idp Address of the IDL to append to. * @param[in,out] idp Address of the IDL to append to.
* @param[in] id The ID to append. * @param[in] id The ID to append.
* @return 0 on success, ENOMEM if the IDL is too large. * @return 0 on success, -1 if the IDL is too large.
*/ */
int mdb_midl_append( MDB_IDL *idp, MDB_ID id ); int mdb_midl_append( MDB_IDL *idp, MDB_ID id );
/** Append an IDL onto an IDL. /** Append an IDL onto an IDL.
* @param[in,out] idp Address of the IDL to append to. * @param[in,out] idp Address of the IDL to append to.
* @param[in] app The IDL to append. * @param[in] app The IDL to append.
* @return 0 on success, ENOMEM if the IDL is too large. * @return 0 on success, -1 if the IDL is too large.
*/ */
int mdb_midl_append_list( MDB_IDL *idp, MDB_IDL app ); int mdb_midl_append_list( MDB_IDL *idp, MDB_IDL app );
/** Append an ID range onto an IDL.
* @param[in,out] idp Address of the IDL to append to.
* @param[in] id The lowest ID to append.
* @param[in] n Number of IDs to append.
* @return 0 on success, ENOMEM if the IDL is too large.
*/
int mdb_midl_append_range( MDB_IDL *idp, MDB_ID id, unsigned n );
/** Merge an IDL onto an IDL. The destination IDL must be big enough.
* @param[in] idl The IDL to merge into.
* @param[in] merge The IDL to merge.
*/
void mdb_midl_xmerge( MDB_IDL idl, MDB_IDL merge );
/** Sort an IDL. /** Sort an IDL.
* @param[in,out] ids The IDL to sort. * @param[in,out] ids The IDL to sort.
*/ */
@ -186,23 +195,11 @@ int mdb_mid2l_insert( MDB_ID2L ids, MDB_ID2 *id );
*/ */
int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ); int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id );
#ifdef MDB_VL32
typedef struct MDB_ID3 {
MDB_ID mid; /**< The ID */
void *mptr; /**< The pointer */
unsigned int mcnt; /**< Number of pages */
unsigned int mref; /**< Refcounter */
} MDB_ID3;
typedef MDB_ID3 *MDB_ID3L;
unsigned mdb_mid3l_search( MDB_ID3L ids, MDB_ID id );
int mdb_mid3l_insert( MDB_ID3L ids, MDB_ID3 *id );
#endif /* MDB_VL32 */
/** @} */ /** @} */
/** @} */ /** @} */
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif /* _MDB_MIDL_H_ */ #endif /* _MDB_MIDL_H_ */
/* http://gitorious.org/mdb/mdb/blobs/raw/mdb.master/libraries/liblmdb/midl.h */

View file

@ -1,678 +0,0 @@
/*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)queue.h 8.5 (Berkeley) 8/20/94
* $FreeBSD: src/sys/sys/queue.h,v 1.54 2002/08/05 05:18:43 alfred Exp $
*/
#ifndef _DB_QUEUE_H_
#define _DB_QUEUE_H_
/*
 * __offsetof(st, m): byte offset of member `m` inside struct type `st`,
 * computed by taking the member's address relative to a null `st *`.
 * Only defined here when the platform has not already provided it.
 */
#ifndef __offsetof
#define __offsetof(st, m) \
((size_t) ( (char *)&((st *)0)->m - (char *)0 ))
#endif
/*
 * __containerof(ptr, type, member): given `ptr`, a pointer to the `member`
 * field of some `type`, yield a pointer to the enclosing `type` object.
 * The temporary `__mptr` type-checks `ptr` against the member's type.
 * Written with a statement expression and `typeof`, so it needs a compiler
 * that supports those GNU C extensions.
 */
#ifndef __containerof
#define __containerof(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - __offsetof(type,member) );})
#endif
#if defined(__cplusplus)
extern "C" {
#endif
/*
* This file defines five types of data structures: singly-linked lists,
* singly-linked tail queues, lists, tail queues and circular queues.
*
* A singly-linked list is headed by a single forward pointer. The elements
* are singly linked for minimum space and pointer manipulation overhead at
* the expense of O(n) removal for arbitrary elements. New elements can be
* added to the list after an existing element or at the head of the list.
* Elements being removed from the head of the list should use the explicit
* macro for this purpose for optimum efficiency. A singly-linked list may
* only be traversed in the forward direction. Singly-linked lists are ideal
* for applications with large datasets and few or no removals or for
* implementing a LIFO queue.
*
* A singly-linked tail queue is headed by a pair of pointers, one to the
* head of the list and the other to the tail of the list. The elements are
* singly linked for minimum space and pointer manipulation overhead at the
* expense of O(n) removal for arbitrary elements. New elements can be added
* to the list after an existing element, at the head of the list, or at the
* end of the list. Elements being removed from the head of the tail queue
* should use the explicit macro for this purpose for optimum efficiency.
* A singly-linked tail queue may only be traversed in the forward direction.
* Singly-linked tail queues are ideal for applications with large datasets
* and few or no removals or for implementing a FIFO queue.
*
* A list is headed by a single forward pointer (or an array of forward
* pointers for a hash table header). The elements are doubly linked
* so that an arbitrary element can be removed without a need to
* traverse the list. New elements can be added to the list before
* or after an existing element or at the head of the list. A list
* may only be traversed in the forward direction.
*
* A tail queue is headed by a pair of pointers, one to the head of the
* list and the other to the tail of the list. The elements are doubly
* linked so that an arbitrary element can be removed without a need to
* traverse the list. New elements can be added to the list before or
* after an existing element, at the head of the list, or at the end of
* the list. A tail queue may be traversed in either direction.
*
* For details on the use of these macros, see the queue(3) manual page.
*
*
* SLIST LIST STAILQ TAILQ
* _HEAD + + + +
* _HEAD_INITIALIZER + + + +
* _ENTRY + + + +
* _INIT + + + +
* _EMPTY + + + +
* _FIRST + + + +
* _NEXT + + + +
* _PREV - - - +
* _LAST - - + +
* _FOREACH + + + +
* _FOREACH_REVERSE - - - +
* _INSERT_HEAD + + + +
* _INSERT_BEFORE - + - +
* _INSERT_AFTER + + + +
* _INSERT_TAIL - - + +
* _CONCAT - - + +
* _REMOVE_HEAD + - + -
* _REMOVE + + + +
*
*/
/*
* XXX
* We #undef all of the macros because there are incompatible versions of this
* file and these macros on various systems. What makes the problem worse is
* they are included and/or defined by system include files which we may have
* already loaded into Berkeley DB before getting here. For example, FreeBSD's
* <rpc/rpc.h> includes its system <sys/queue.h>, and VxWorks UnixLib.h defines
* several of the LIST_XXX macros. Visual C.NET 7.0 also defines some of these
* same macros in Vc7\PlatformSDK\Include\WinNT.h. Make sure we use ours.
*/
/*
 * Drop any previous definition of every macro this header defines, so that
 * the definitions below always win.  The CIRCLEQ_* family is included too:
 * this header defines it further down, and system <sys/queue.h> variants
 * commonly provide their own versions.
 */
#undef CIRCLEQ_EMPTY
#undef CIRCLEQ_ENTRY
#undef CIRCLEQ_FIRST
#undef CIRCLEQ_FOREACH
#undef CIRCLEQ_FOREACH_REVERSE
#undef CIRCLEQ_HEAD
#undef CIRCLEQ_HEAD_INITIALIZER
#undef CIRCLEQ_INIT
#undef CIRCLEQ_INSERT_AFTER
#undef CIRCLEQ_INSERT_BEFORE
#undef CIRCLEQ_INSERT_HEAD
#undef CIRCLEQ_INSERT_TAIL
#undef CIRCLEQ_LAST
#undef CIRCLEQ_LOOP_NEXT
#undef CIRCLEQ_LOOP_PREV
#undef CIRCLEQ_NEXT
#undef CIRCLEQ_PREV
#undef CIRCLEQ_REMOVE
#undef LIST_EMPTY
#undef LIST_ENTRY
#undef LIST_FIRST
#undef LIST_FOREACH
#undef LIST_HEAD
#undef LIST_HEAD_INITIALIZER
#undef LIST_INIT
#undef LIST_INSERT_AFTER
#undef LIST_INSERT_BEFORE
#undef LIST_INSERT_HEAD
#undef LIST_NEXT
#undef LIST_REMOVE
#undef QMD_TRACE_ELEM
#undef QMD_TRACE_HEAD
#undef QUEUE_MACRO_DEBUG
#undef SLIST_EMPTY
#undef SLIST_ENTRY
#undef SLIST_FIRST
#undef SLIST_FOREACH
#undef SLIST_FOREACH_PREVPTR
#undef SLIST_HEAD
#undef SLIST_HEAD_INITIALIZER
#undef SLIST_INIT
#undef SLIST_INSERT_AFTER
#undef SLIST_INSERT_HEAD
#undef SLIST_NEXT
#undef SLIST_REMOVE
#undef SLIST_REMOVE_HEAD
#undef STAILQ_CONCAT
#undef STAILQ_EMPTY
#undef STAILQ_ENTRY
#undef STAILQ_FIRST
#undef STAILQ_FOREACH
#undef STAILQ_HEAD
#undef STAILQ_HEAD_INITIALIZER
#undef STAILQ_INIT
#undef STAILQ_INSERT_AFTER
#undef STAILQ_INSERT_HEAD
#undef STAILQ_INSERT_TAIL
#undef STAILQ_LAST
#undef STAILQ_NEXT
#undef STAILQ_REMOVE
#undef STAILQ_REMOVE_HEAD
#undef STAILQ_REMOVE_HEAD_UNTIL
#undef TAILQ_CONCAT
#undef TAILQ_EMPTY
#undef TAILQ_ENTRY
#undef TAILQ_FIRST
#undef TAILQ_FOREACH
#undef TAILQ_FOREACH_REVERSE
#undef TAILQ_HEAD
#undef TAILQ_HEAD_INITIALIZER
#undef TAILQ_INIT
#undef TAILQ_INSERT_AFTER
#undef TAILQ_INSERT_BEFORE
#undef TAILQ_INSERT_HEAD
#undef TAILQ_INSERT_TAIL
#undef TAILQ_LAST
#undef TAILQ_NEXT
#undef TAILQ_PREV
#undef TAILQ_REMOVE
#undef TRACEBUF
#undef TRASHIT
/*
 * Optional tracing of queue mutations.  Set QUEUE_MACRO_DEBUG to a non-zero
 * value to embed a `struct qm_trace` in every TAILQ head/entry (via TRACEBUF)
 * and record the last two source locations that touched it.  With the
 * default of 0 all four helper macros expand to nothing.
 */
#define QUEUE_MACRO_DEBUG 0
#if QUEUE_MACRO_DEBUG
/* Store the last 2 places the queue element or head was altered */
struct qm_trace {
    /* __FILE__ is a string literal, hence pointer-to-const (required in C++). */
    const char *lastfile;
    int lastline;
    const char *prevfile;
    int prevline;
};
/* Member declaration spliced into TAILQ_HEAD / TAILQ_ENTRY structs. */
#define TRACEBUF struct qm_trace trace;
/* Overwrite a stale link with an obviously invalid pointer value. */
#define TRASHIT(x) do {(x) = (void *)-1;} while (0)
/* Shift the previous location down and record the current one on a head. */
#define QMD_TRACE_HEAD(head) do { \
    (head)->trace.prevline = (head)->trace.lastline; \
    (head)->trace.prevfile = (head)->trace.lastfile; \
    (head)->trace.lastline = __LINE__; \
    (head)->trace.lastfile = __FILE__; \
} while (0)
/* Same as QMD_TRACE_HEAD, for a queue entry. */
#define QMD_TRACE_ELEM(elem) do { \
    (elem)->trace.prevline = (elem)->trace.lastline; \
    (elem)->trace.prevfile = (elem)->trace.lastfile; \
    (elem)->trace.lastline = __LINE__; \
    (elem)->trace.lastfile = __FILE__; \
} while (0)
#else
#define QMD_TRACE_ELEM(elem)
#define QMD_TRACE_HEAD(head)
#define TRACEBUF
#define TRASHIT(x)
#endif /* QUEUE_MACRO_DEBUG */
/*
* Singly-linked List declarations.
*/
/* Declare `struct name`, the head of a singly-linked list of `struct type`. */
#define SLIST_HEAD(name, type) \
struct name { \
struct type *slh_first; /* first element */ \
}
/* Static initializer for an empty list head. */
#define SLIST_HEAD_INITIALIZER(head) \
{ NULL }
/* Link field to embed in each element: a single forward pointer. */
#define SLIST_ENTRY(type) \
struct { \
struct type *sle_next; /* next element */ \
}
/*
 * Singly-linked List functions.
 */
/* True when the list has no elements. */
#define SLIST_EMPTY(head) ((head)->slh_first == NULL)
/* First element (NULL when empty); expands to an lvalue. */
#define SLIST_FIRST(head) ((head)->slh_first)
/* Forward iteration; `var` is NULL after the loop completes. */
#define SLIST_FOREACH(var, head, field) \
for ((var) = SLIST_FIRST((head)); \
(var); \
(var) = SLIST_NEXT((var), field))
/*
 * Forward iteration that also keeps `varp` pointing at the link that
 * references `var` (the head's first pointer, then each sle_next).
 */
#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \
for ((varp) = &SLIST_FIRST((head)); \
((var) = *(varp)) != NULL; \
(varp) = &SLIST_NEXT((var), field))
/* Make the list empty. */
#define SLIST_INIT(head) do { \
SLIST_FIRST((head)) = NULL; \
} while (0)
/* Insert `elm` directly after `slistelm`. */
#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \
SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \
SLIST_NEXT((slistelm), field) = (elm); \
} while (0)
/* Push `elm` onto the front of the list. */
#define SLIST_INSERT_HEAD(head, elm, field) do { \
SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \
SLIST_FIRST((head)) = (elm); \
} while (0)
/* Successor of `elm`; expands to an lvalue. */
#define SLIST_NEXT(elm, field) ((elm)->field.sle_next)
/*
 * Unlink `elm`.  Walks from the head to find its predecessor; the walk has
 * no NULL check, so `elm` must actually be on the list.
 */
#define SLIST_REMOVE(head, elm, type, field) do { \
if (SLIST_FIRST((head)) == (elm)) { \
SLIST_REMOVE_HEAD((head), field); \
} \
else { \
struct type *curelm = SLIST_FIRST((head)); \
while (SLIST_NEXT(curelm, field) != (elm)) \
curelm = SLIST_NEXT(curelm, field); \
SLIST_NEXT(curelm, field) = \
SLIST_NEXT(SLIST_NEXT(curelm, field), field); \
} \
} while (0)
/* Pop the first element; dereferences it, so the list must be non-empty. */
#define SLIST_REMOVE_HEAD(head, field) do { \
SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \
} while (0)
/*
* Singly-linked Tail queue declarations.
*/
/*
 * Declare `struct name`, the head of a singly-linked tail queue: a pointer
 * to the first element plus the address of the last element's next pointer
 * (or of stqh_first itself when the queue is empty).
 */
#define STAILQ_HEAD(name, type) \
struct name { \
struct type *stqh_first;/* first element */ \
struct type **stqh_last;/* addr of last next element */ \
}
/* Static initializer for an empty queue; `head` is the head object itself. */
#define STAILQ_HEAD_INITIALIZER(head) \
{ NULL, &(head).stqh_first }
/* Link field to embed in each element: a single forward pointer. */
#define STAILQ_ENTRY(type) \
struct { \
struct type *stqe_next; /* next element */ \
}
/*
 * Singly-linked Tail queue functions.
 */
/* Append all of head2's elements to head1 and leave head2 empty. */
#define STAILQ_CONCAT(head1, head2) do { \
if (!STAILQ_EMPTY((head2))) { \
*(head1)->stqh_last = (head2)->stqh_first; \
(head1)->stqh_last = (head2)->stqh_last; \
STAILQ_INIT((head2)); \
} \
} while (0)
/* True when the queue has no elements. */
#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL)
/* First element (NULL when empty); expands to an lvalue. */
#define STAILQ_FIRST(head) ((head)->stqh_first)
/* Forward iteration; `var` is NULL after the loop completes. */
#define STAILQ_FOREACH(var, head, field) \
for ((var) = STAILQ_FIRST((head)); \
(var); \
(var) = STAILQ_NEXT((var), field))
/* Make the queue empty: the tail pointer refers back to stqh_first. */
#define STAILQ_INIT(head) do { \
STAILQ_FIRST((head)) = NULL; \
(head)->stqh_last = &STAILQ_FIRST((head)); \
} while (0)
/* Insert `elm` after `tqelm`, updating the tail if `tqelm` was last. */
#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \
if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\
(head)->stqh_last = &STAILQ_NEXT((elm), field); \
STAILQ_NEXT((tqelm), field) = (elm); \
} while (0)
/* Insert `elm` at the front, updating the tail if the queue was empty. */
#define STAILQ_INSERT_HEAD(head, elm, field) do { \
if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \
(head)->stqh_last = &STAILQ_NEXT((elm), field); \
STAILQ_FIRST((head)) = (elm); \
} while (0)
/* Append `elm` at the end of the queue. */
#define STAILQ_INSERT_TAIL(head, elm, field) do { \
STAILQ_NEXT((elm), field) = NULL; \
*(head)->stqh_last = (elm); \
(head)->stqh_last = &STAILQ_NEXT((elm), field); \
} while (0)
/*
 * Last element, or NULL when empty.  Recovers the element from stqh_last
 * (which points at that element's link field) by subtracting the offset of
 * `field` within `struct type`.
 */
#define STAILQ_LAST(head, type, field) \
(STAILQ_EMPTY((head)) ? \
NULL : \
((struct type *) \
((char *)((head)->stqh_last) - __offsetof(struct type, field))))
/* Successor of `elm`; expands to an lvalue. */
#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next)
/*
 * Unlink `elm`.  Walks from the head to find its predecessor; the walk has
 * no NULL check, so `elm` must actually be on the queue.
 */
#define STAILQ_REMOVE(head, elm, type, field) do { \
if (STAILQ_FIRST((head)) == (elm)) { \
STAILQ_REMOVE_HEAD((head), field); \
} \
else { \
struct type *curelm = STAILQ_FIRST((head)); \
while (STAILQ_NEXT(curelm, field) != (elm)) \
curelm = STAILQ_NEXT(curelm, field); \
if ((STAILQ_NEXT(curelm, field) = \
STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\
(head)->stqh_last = &STAILQ_NEXT((curelm), field);\
} \
} while (0)
/* Pop the first element; dereferences it, so the queue must be non-empty. */
#define STAILQ_REMOVE_HEAD(head, field) do { \
if ((STAILQ_FIRST((head)) = \
STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \
(head)->stqh_last = &STAILQ_FIRST((head)); \
} while (0)
/* Drop every element up to and including `elm` from the front. */
#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \
if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \
(head)->stqh_last = &STAILQ_FIRST((head)); \
} while (0)
/*
* List declarations.
*/
/* Declare `struct name`, the head of a doubly-linked list of `struct type`. */
#define LIST_HEAD(name, type) \
struct name { \
struct type *lh_first; /* first element */ \
}
/* Static initializer for an empty list head. */
#define LIST_HEAD_INITIALIZER(head) \
{ NULL }
/*
 * Link field to embed in each element.  le_prev holds the address of the
 * pointer that refers to this element (the head's lh_first or the
 * predecessor's le_next), which is what lets LIST_REMOVE work without the
 * head.
 */
#define LIST_ENTRY(type) \
struct { \
struct type *le_next; /* next element */ \
struct type **le_prev; /* address of previous next element */ \
}
/*
 * List functions.
 */
/* True when the list has no elements. */
#define LIST_EMPTY(head) ((head)->lh_first == NULL)
/* First element (NULL when empty); expands to an lvalue. */
#define LIST_FIRST(head) ((head)->lh_first)
/* Forward iteration; `var` is NULL after the loop completes. */
#define LIST_FOREACH(var, head, field) \
for ((var) = LIST_FIRST((head)); \
(var); \
(var) = LIST_NEXT((var), field))
/* Make the list empty. */
#define LIST_INIT(head) do { \
LIST_FIRST((head)) = NULL; \
} while (0)
/* Insert `elm` directly after `listelm`, fixing the successor's back link. */
#define LIST_INSERT_AFTER(listelm, elm, field) do { \
if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
LIST_NEXT((listelm), field)->field.le_prev = \
&LIST_NEXT((elm), field); \
LIST_NEXT((listelm), field) = (elm); \
(elm)->field.le_prev = &LIST_NEXT((listelm), field); \
} while (0)
/* Insert `elm` directly before `listelm`; the head is not needed. */
#define LIST_INSERT_BEFORE(listelm, elm, field) do { \
(elm)->field.le_prev = (listelm)->field.le_prev; \
LIST_NEXT((elm), field) = (listelm); \
*(listelm)->field.le_prev = (elm); \
(listelm)->field.le_prev = &LIST_NEXT((elm), field); \
} while (0)
/* Insert `elm` at the front of the list. */
#define LIST_INSERT_HEAD(head, elm, field) do { \
if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \
LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
LIST_FIRST((head)) = (elm); \
(elm)->field.le_prev = &LIST_FIRST((head)); \
} while (0)
/* Successor of `elm`; expands to an lvalue. */
#define LIST_NEXT(elm, field) ((elm)->field.le_next)
/* Unlink `elm` using only its own links; elm's link fields are left as-is. */
#define LIST_REMOVE(elm, field) do { \
if (LIST_NEXT((elm), field) != NULL) \
LIST_NEXT((elm), field)->field.le_prev = \
(elm)->field.le_prev; \
*(elm)->field.le_prev = LIST_NEXT((elm), field); \
} while (0)
/*
* Tail queue declarations.
*/
/*
 * Declare `struct name`, the head of a doubly-linked tail queue: a pointer
 * to the first element plus the address of the last element's next pointer
 * (or of tqh_first itself when the queue is empty).
 */
#define TAILQ_HEAD(name, type) \
struct name { \
    struct type *tqh_first; /* first element */ \
    struct type **tqh_last; /* addr of last next element */ \
    TRACEBUF \
}

/* Static initializer for an empty queue; `head` is the head object itself. */
#define TAILQ_HEAD_INITIALIZER(head) \
    { NULL, &(head).tqh_first }

/*
 * Link field to embed in each element.  It deliberately mirrors the layout
 * of TAILQ_HEAD (pointer, then pointer-to-pointer); TAILQ_LAST and
 * TAILQ_PREV depend on that.
 */
#define TAILQ_ENTRY(type) \
struct { \
    struct type *tqe_next; /* next element */ \
    struct type **tqe_prev; /* address of previous next element */ \
    TRACEBUF \
}

/*
 * Tail queue functions.
 */

/*
 * Append all of head2's elements to head1 and leave head2 empty.
 * Both heads are traced (the trace call used to name a non-existent
 * `head`, which broke compilation with QUEUE_MACRO_DEBUG enabled).
 */
#define TAILQ_CONCAT(head1, head2, field) do { \
    if (!TAILQ_EMPTY(head2)) { \
        *(head1)->tqh_last = (head2)->tqh_first; \
        (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \
        (head1)->tqh_last = (head2)->tqh_last; \
        TAILQ_INIT((head2)); \
        QMD_TRACE_HEAD(head1); \
        QMD_TRACE_HEAD(head2); \
    } \
} while (0)

/* True when the queue has no elements. */
#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL)

/* First element (NULL when empty); expands to an lvalue. */
#define TAILQ_FIRST(head) ((head)->tqh_first)

/* Forward iteration; `var` is NULL after the loop completes. */
#define TAILQ_FOREACH(var, head, field) \
    for ((var) = TAILQ_FIRST((head)); \
        (var); \
        (var) = TAILQ_NEXT((var), field))

/* Backward iteration; `headname` is the tag given to TAILQ_HEAD. */
#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \
    for ((var) = TAILQ_LAST((head), headname); \
        (var); \
        (var) = TAILQ_PREV((var), headname, field))

/* Make the queue empty: the tail pointer refers back to tqh_first. */
#define TAILQ_INIT(head) do { \
    TAILQ_FIRST((head)) = NULL; \
    (head)->tqh_last = &TAILQ_FIRST((head)); \
    QMD_TRACE_HEAD(head); \
} while (0)

/* Insert `elm` after `listelm`, updating the tail if `listelm` was last. */
#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
    if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL) \
        TAILQ_NEXT((elm), field)->field.tqe_prev = \
            &TAILQ_NEXT((elm), field); \
    else { \
        (head)->tqh_last = &TAILQ_NEXT((elm), field); \
        QMD_TRACE_HEAD(head); \
    } \
    TAILQ_NEXT((listelm), field) = (elm); \
    (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \
    QMD_TRACE_ELEM(&(elm)->field); \
    QMD_TRACE_ELEM(&(listelm)->field); \
} while (0)

/* Insert `elm` directly before `listelm`; the head is not needed. */
#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \
    (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \
    TAILQ_NEXT((elm), field) = (listelm); \
    *(listelm)->field.tqe_prev = (elm); \
    (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \
    QMD_TRACE_ELEM(&(elm)->field); \
    QMD_TRACE_ELEM(&(listelm)->field); \
} while (0)

/* Insert `elm` at the front, updating the tail if the queue was empty. */
#define TAILQ_INSERT_HEAD(head, elm, field) do { \
    if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \
        TAILQ_FIRST((head))->field.tqe_prev = \
            &TAILQ_NEXT((elm), field); \
    else \
        (head)->tqh_last = &TAILQ_NEXT((elm), field); \
    TAILQ_FIRST((head)) = (elm); \
    (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \
    QMD_TRACE_HEAD(head); \
    QMD_TRACE_ELEM(&(elm)->field); \
} while (0)

/* Append `elm` at the end of the queue. */
#define TAILQ_INSERT_TAIL(head, elm, field) do { \
    TAILQ_NEXT((elm), field) = NULL; \
    (elm)->field.tqe_prev = (head)->tqh_last; \
    *(head)->tqh_last = (elm); \
    (head)->tqh_last = &TAILQ_NEXT((elm), field); \
    QMD_TRACE_HEAD(head); \
    QMD_TRACE_ELEM(&(elm)->field); \
} while (0)

/*
 * Last element (NULL when empty).  Reinterprets the link that tqh_last
 * points at as a `struct headname`, so its second word (tqe_prev) can be
 * followed back to the pointer that references the last element.
 */
#define TAILQ_LAST(head, headname) \
    (*(((struct headname *)((head)->tqh_last))->tqh_last))

/* Successor of `elm`; expands to an lvalue. */
#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)

/* Predecessor of `elm` (NULL for the first), same trick as TAILQ_LAST. */
#define TAILQ_PREV(elm, headname, field) \
    (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))

/* Unlink `elm`; under QUEUE_MACRO_DEBUG its stale links are trashed. */
#define TAILQ_REMOVE(head, elm, field) do { \
    if ((TAILQ_NEXT((elm), field)) != NULL) \
        TAILQ_NEXT((elm), field)->field.tqe_prev = \
            (elm)->field.tqe_prev; \
    else { \
        (head)->tqh_last = (elm)->field.tqe_prev; \
        QMD_TRACE_HEAD(head); \
    } \
    *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \
    TRASHIT((elm)->field.tqe_next); \
    TRASHIT((elm)->field.tqe_prev); \
    QMD_TRACE_ELEM(&(elm)->field); \
} while (0)
/*
* Circular queue definitions.
*/
/*
 * Declare `struct name`, the head of a circular queue.  The head itself
 * acts as the end marker: an empty queue has both pointers referring to the
 * head (see CIRCLEQ_INIT).
 */
#define CIRCLEQ_HEAD(name, type) \
struct name { \
struct type *cqh_first; /* first element */ \
struct type *cqh_last; /* last element */ \
}
/* Static initializer for an empty queue; `head` is the head object itself. */
#define CIRCLEQ_HEAD_INITIALIZER(head) \
{ (void *)&head, (void *)&head }
/* Link fields to embed in each element. */
#define CIRCLEQ_ENTRY(type) \
struct { \
struct type *cqe_next; /* next element */ \
struct type *cqe_prev; /* previous element */ \
}
/*
 * Circular queue functions.
 */
/* Make the queue empty: both ends point back at the head. */
#define CIRCLEQ_INIT(head) do { \
(head)->cqh_first = (void *)(head); \
(head)->cqh_last = (void *)(head); \
} while (/*CONSTCOND*/0)
/* Insert `elm` after `listelm`, updating cqh_last if `listelm` was last. */
#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do { \
(elm)->field.cqe_next = (listelm)->field.cqe_next; \
(elm)->field.cqe_prev = (listelm); \
if ((listelm)->field.cqe_next == (void *)(head)) \
(head)->cqh_last = (elm); \
else \
(listelm)->field.cqe_next->field.cqe_prev = (elm); \
(listelm)->field.cqe_next = (elm); \
} while (/*CONSTCOND*/0)
/* Insert `elm` before `listelm`, updating cqh_first if `listelm` was first. */
#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do { \
(elm)->field.cqe_next = (listelm); \
(elm)->field.cqe_prev = (listelm)->field.cqe_prev; \
if ((listelm)->field.cqe_prev == (void *)(head)) \
(head)->cqh_first = (elm); \
else \
(listelm)->field.cqe_prev->field.cqe_next = (elm); \
(listelm)->field.cqe_prev = (elm); \
} while (/*CONSTCOND*/0)
/* Insert `elm` at the front of the queue. */
#define CIRCLEQ_INSERT_HEAD(head, elm, field) do { \
(elm)->field.cqe_next = (head)->cqh_first; \
(elm)->field.cqe_prev = (void *)(head); \
if ((head)->cqh_last == (void *)(head)) \
(head)->cqh_last = (elm); \
else \
(head)->cqh_first->field.cqe_prev = (elm); \
(head)->cqh_first = (elm); \
} while (/*CONSTCOND*/0)
/* Insert `elm` at the end of the queue. */
#define CIRCLEQ_INSERT_TAIL(head, elm, field) do { \
(elm)->field.cqe_next = (void *)(head); \
(elm)->field.cqe_prev = (head)->cqh_last; \
if ((head)->cqh_first == (void *)(head)) \
(head)->cqh_first = (elm); \
else \
(head)->cqh_last->field.cqe_next = (elm); \
(head)->cqh_last = (elm); \
} while (/*CONSTCOND*/0)
/* Unlink `elm`, patching the head when `elm` was first and/or last. */
#define CIRCLEQ_REMOVE(head, elm, field) do { \
if ((elm)->field.cqe_next == (void *)(head)) \
(head)->cqh_last = (elm)->field.cqe_prev; \
else \
(elm)->field.cqe_next->field.cqe_prev = \
(elm)->field.cqe_prev; \
if ((elm)->field.cqe_prev == (void *)(head)) \
(head)->cqh_first = (elm)->field.cqe_next; \
else \
(elm)->field.cqe_prev->field.cqe_next = \
(elm)->field.cqe_next; \
} while (/*CONSTCOND*/0)
/* Forward iteration; stops when `var` comes back around to the head. */
#define CIRCLEQ_FOREACH(var, head, field) \
for ((var) = ((head)->cqh_first); \
(var) != (const void *)(head); \
(var) = ((var)->field.cqe_next))
/* Backward iteration; stops when `var` comes back around to the head. */
#define CIRCLEQ_FOREACH_REVERSE(var, head, field) \
for ((var) = ((head)->cqh_last); \
(var) != (const void *)(head); \
(var) = ((var)->field.cqe_prev))
/*
 * Circular queue access methods.
 */
/* True when the queue has no elements (first pointer refers to the head). */
#define CIRCLEQ_EMPTY(head) ((head)->cqh_first == (void *)(head))
/* First / last element; these equal the head pointer when the queue is empty. */
#define CIRCLEQ_FIRST(head) ((head)->cqh_first)
#define CIRCLEQ_LAST(head) ((head)->cqh_last)
/* Raw neighbours of `elm`; at either end these yield the head, not NULL. */
#define CIRCLEQ_NEXT(elm, field) ((elm)->field.cqe_next)
#define CIRCLEQ_PREV(elm, field) ((elm)->field.cqe_prev)
/*
 * Successor of `elm`, wrapping from the last element round to the first
 * instead of yielding the head.  `elm` is parenthesized in both branches so
 * that any pointer-valued expression (e.g. `*pp`) can be passed.
 */
#define CIRCLEQ_LOOP_NEXT(head, elm, field) \
    (((elm)->field.cqe_next == (void *)(head)) \
        ? ((head)->cqh_first) \
        : ((elm)->field.cqe_next))

/*
 * Predecessor of `elm`, wrapping from the first element round to the last
 * instead of yielding the head.
 */
#define CIRCLEQ_LOOP_PREV(head, elm, field) \
    (((elm)->field.cqe_prev == (void *)(head)) \
        ? ((head)->cqh_last) \
        : ((elm)->field.cqe_prev))
#if defined(__cplusplus)
}
#endif
#endif /* !_DB_QUEUE_H_ */

217
c_src/stats.h Normal file
View file

@ -0,0 +1,217 @@
/*
* stats: measure all the things
*
* Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved.
* Author: Gregory Burd <greg@basho.com> <greg@burd.me>
*
* This file is provided to you under the Apache License,
* Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain
* a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#ifndef __STATS_H__
#define __STATS_H__
#if defined(__cplusplus)
extern "C" {
#endif
#include "duration.h"
/**
 * LOG2(X): floor(log2(X)) of a 64-bit unsigned integer, i.e. the index of
 * its highest set bit.  LOG2(0) is defined as 0 on every code path.
 *
 * GCC-compatible compilers (detected via __GNUC__, which clang also
 * defines) use the count-leading-zeros builtin; everything else falls back
 * to a de Bruijn table lookup.
 */
#if defined(__GNUC__)
static inline unsigned int __log2_64(uint64_t x) {
    /* __builtin_clzll(0) is undefined, so zero is handled explicitly. */
    return x ? (unsigned int)(63 - __builtin_clzll(x)) : 0;
}
#else
static inline unsigned int __log2_64(uint64_t x) {
    static const int tab64[64] = {
        63,  0, 58,  1, 59, 47, 53,  2,
        60, 39, 48, 27, 54, 33, 42,  3,
        61, 51, 37, 40, 49, 18, 28, 20,
        55, 30, 34, 11, 43, 14, 22,  4,
        62, 57, 46, 52, 38, 26, 32, 41,
        50, 36, 17, 19, 29, 10, 13, 21,
        56, 45, 25, 31, 35, 16,  9, 12,
        44, 24, 15,  8, 23,  7,  6,  5};
    uint64_t v = x;
    if (x == 0) return 0;
    /* Smear the highest set bit into every lower position... */
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    v |= v >> 32;
    /* ...then isolate it (v - (v >> 1)) and hash it into the table. */
    return tab64[((uint64_t)((v - (v >> 1)) * 0x07EDD5E59A4E28C2ULL)) >> 58];
}
#endif
#define LOG2(X) __log2_64(X)
/*
 * STAT_DEF(name): declare a member/variable called `<name>_stat` of type
 * `struct <name>_stat` (the struct generated by STAT_DECL(name, ...)).
 * The expansion already ends with a semicolon.
 */
#define STAT_DEF(name) struct name ## _stat name ## _stat;
/*
 * STAT_DECL(name, nsamples): generate `struct <name>_stat` plus its helper
 * functions for timing an operation.
 *
 *   d          tick timestamp (`then`) and time unit passed to ts()
 *   histogram  count of samples per floor(log2(elapsed)) bucket
 *   h          next write position in the `samples` ring
 *   n          number of samples recorded since the last reset
 *   samples    ring buffer holding the last `nsamples` elapsed times
 *   min, max   smallest / largest elapsed time seen
 *   mean       running mean folded in by <name>_stat_tock
 *
 * Generated functions:
 *   <name>_stat_mean            mean of the values in the sample ring
 *   <name>_stat_mean_lg2        mean histogram bucket index
 *   <name>_stat_tick            start a measurement, returns the timestamp
 *   <name>_stat_reset           clear samples, histogram, min/max and counters
 *   <name>_stat_tock            finish a measurement, returns elapsed time
 *   <name>_stat_print_histogram dump a histogram and min/max/mean to stderr
 *
 * The expansion uses memset and fprintf, so <string.h> and <stdio.h> must be
 * in scope where the macro is instantiated.
 */
#define STAT_DECL(name, nsamples) \
    struct name ## _stat { \
        duration_t d; \
        uint64_t histogram[64]; \
        uint32_t h, n; \
        uint64_t samples[nsamples]; \
        uint64_t min, max; \
        double mean; \
    }; \
    static inline double name ## _stat_mean(struct name ## _stat *s) { \
        uint32_t i; \
        uint32_t count = (s->n < nsamples ? s->n : nsamples); \
        double mean = 0; \
        if (count == 0) \
            return 0.0; \
        /* Sum every slot: unwritten slots are zero after a reset, and */ \
        /* once the ring is full all nsamples slots hold live samples. */ \
        for (i = 0; i < nsamples; i++) \
            mean += s->samples[i]; \
        if (mean > 0) \
            mean /= (double)count; \
        return mean; \
    } \
    static inline double name ## _stat_mean_lg2(struct name ## _stat *s) { \
        uint32_t i; \
        double mean = 0; \
        for (i = 0; i < 64; i++) \
            mean += (s->histogram[i] * i); \
        if (mean > 0 && s->n > 0) \
            mean /= (double)s->n; \
        return mean; \
    } \
    static inline uint64_t name ## _stat_tick(struct name ## _stat *s) \
    { \
        uint64_t t = ts(s->d.unit); \
        s->d.then = t; \
        return t; \
    } \
    static inline void name ## _stat_reset(struct name ## _stat *s) \
    { \
        s->min = ~0; \
        s->max = 0; \
        s->h = 0; \
        /* n counts what the histogram and ring hold, so it restarts too. */ \
        s->n = 0; \
        memset(&s->histogram, 0, sizeof(s->histogram)); \
        memset(&s->samples, 0, sizeof(s->samples)); \
    } \
    static inline uint64_t name ## _stat_tock(struct name ## _stat *s) \
    { \
        uint64_t now = ts(s->d.unit); \
        uint64_t elapsed = now - s->d.then; \
        uint32_t i; \
        if (s->n == 4294967295U) { \
            /* n is about to wrap: start over rather than corrupt the means. */ \
            name ## _stat_reset(s); \
        } else if (s->n == nsamples) { \
            /* NOTE(review): this folds the ring mean into s->mean only once, */ \
            /* when n first reaches nsamples; confirm that is the intent.    */ \
            s->mean = (s->mean + name ## _stat_mean(s)) / 2.0; \
        } \
        i = s->h; \
        s->h = (s->h + 1) % nsamples; \
        s->samples[i] = elapsed; \
        if (elapsed < s->min) \
            s->min = elapsed; \
        if (elapsed > s->max) \
            s->max = elapsed; \
        s->histogram[LOG2(elapsed)]++; \
        s->n++; \
        s->d.then = ts(s->d.unit); \
        return elapsed; \
    } \
    static inline void name ## _stat_print_histogram(struct name ## _stat *s, const char *mod) \
    { \
        uint8_t logs[64]; \
        uint8_t i, j, max_log = 0; \
        double m = 0.0; \
        \
        /* Nothing is printed until the sample ring has filled once. */ \
        if (s->n < nsamples) \
            return; \
        \
        fprintf(stderr, "\n%s:async_nif request latency histogram:\n", mod); \
        m = (s->mean + name ## _stat_mean(s)) / 2.0; \
        for (i = 0; i < 64; i++) { \
            logs[i] = LOG2(s->histogram[i]); \
            if (logs[i] > max_log) \
                max_log = logs[i]; \
        } \
        for (i = max_log; i > 0; i--) { \
            if (!(i % 10)) \
                fprintf(stderr, "2^%2d ", i); \
            else \
                fprintf(stderr, " "); \
            /* NOTE(review): the "filled" cell string is empty here, so bars */ \
            /* render as nothing; verify the intended glyph.                 */ \
            for(j = 0; j < 64; j++) \
                fputs(logs[j] >= i ? "" : " ", stderr); \
            fprintf(stderr, "\n"); \
        } \
        /* NOTE(review): logs[] never exceeds 63, so this branch looks */ \
        /* unreachable; kept as-is pending confirmation of the intent. */ \
        if (max_log == 100) { \
            fprintf(stderr, "[empty]\n"); \
        } else { \
            fprintf(stderr, " ns μs ms s ks\n"); \
            fprintf(stderr, "min: "); \
            if (s->min < 1000) \
                fprintf(stderr, "%llu (ns)", (unsigned long long)s->min); \
            else if (s->min < 1000000) \
                fprintf(stderr, "%.2f (μs)", s->min / 1000.0); \
            else if (s->min < 1000000000) \
                fprintf(stderr, "%.2f (ms)", s->min / 1000000.0); \
            else \
                fprintf(stderr, "%.2f (s)", s->min / 1000000000.0); \
            fprintf(stderr, " max: "); \
            if (s->max < 1000) \
                fprintf(stderr, "%llu (ns)", (unsigned long long)s->max); \
            else if (s->max < 1000000) \
                fprintf(stderr, "%.2f (μs)", s->max / 1000.0); \
            else if (s->max < 1000000000) \
                fprintf(stderr, "%.2f (ms)", s->max / 1000000.0); \
            else \
                fprintf(stderr, "%.2f (s)", s->max / 1000000000.0); \
            fprintf(stderr, " mean: "); \
            if (m < 1000) \
                fprintf(stderr, "%.2f (ns)", m); \
            else if (m < 1000000) \
                fprintf(stderr, "%.2f (μs)", m / 1000.0); \
            else if (m < 1000000000) \
                fprintf(stderr, "%.2f (ms)", m / 1000000.0); \
            else \
                fprintf(stderr, "%.2f (s)", m / 1000000000.0); \
            fprintf(stderr, "\n"); \
        } \
        fflush(stderr); \
    }
/*
 * STAT_INIT(var, name): put the `<name>_stat` member of `*var` into its
 * initial state: min at the maximum value, max/mean/h/n/then at zero and the
 * duration unit set to `ns`.  The sample ring and histogram are not touched
 * here.
 *
 * This still expands to a plain sequence of statements ending in ';' (not a
 * do/while block), so existing call sites written with or without a
 * trailing semicolon keep compiling; do not use it as the unbraced body of
 * an `if`.
 */
#define STAT_INIT(var, name) \
    (var)->name ## _stat.min = ~0; \
    (var)->name ## _stat.max = 0; \
    (var)->name ## _stat.mean = 0.0; \
    (var)->name ## _stat.h = 0; \
    (var)->name ## _stat.n = 0; \
    (var)->name ## _stat.d.then = 0; \
    (var)->name ## _stat.d.unit = ns;
/*
 * Convenience wrappers that call the STAT_DECL-generated functions on the
 * `<name>_stat` member of `*var`.  `var` is parenthesized so that any
 * pointer-valued expression can be passed, not just a plain identifier.
 */
/* Start a measurement; yields the timestamp taken. */
#define STAT_TICK(var, name) name ## _stat_tick(&(var)->name ## _stat)
/* Finish a measurement; yields the elapsed time. */
#define STAT_TOCK(var, name) name ## _stat_tock(&(var)->name ## _stat)
/* Clear the collected samples and histogram. */
#define STAT_RESET(var, name) name ## _stat_reset(&(var)->name ## _stat)
/* Mean histogram bucket index (log2 scale). */
#define STAT_MEAN_LOG2_SAMPLE(var, name) \
    name ## _stat_mean_lg2(&(var)->name ## _stat)
/* Mean of the values currently in the sample ring. */
#define STAT_MEAN_SAMPLE(var, name) \
    name ## _stat_mean(&(var)->name ## _stat)
/* Print the latency histogram to stderr, labelled with `mod`. */
#define STAT_PRINT(var, name, mod) \
    name ## _stat_print_histogram(&(var)->name ## _stat, mod)
#if defined(__cplusplus)
}
#endif
#endif // __STATS_H__

View file

@ -1,7 +1,7 @@
%% -*- erlang -*- %% -*- erlang -*-
%% ex: ft=erlang ts=4 sw=4 et %% ex: ft=erlang ts=4 sw=4 et
{require_otp_vsn, "R1[56]|1[78]"}. {require_otp_vsn, "R1[567]"}.
{cover_enabled, true}. {cover_enabled, true}.
@ -29,11 +29,14 @@
{eunit_opts, [verbose, {report, {eunit_surefire, [{dir, "."}]}}]}. {eunit_opts, [verbose, {report, {eunit_surefire, [{dir, "."}]}}]}.
{port_specs, [ {port_specs, [
{"priv/lmdb.so", ["c_src/*.c"]} {"unix", "priv/lmdb.so", ["c_src/*.c"]},
{"linux", "priv/lmdb.so", ["c_src/*.c"]},
{"darwin", "priv/lmdb.so", ["c_src/*.c"]},
{"win32", "priv/lmdb.dll", ["c_src/*.c"]}
]}. ]}.
{port_env, [ {port_env, [
{"DRV_CFLAGS", "$DRV_CFLAGS -O3 -fPIC -march=native -mtune=native -Wall -Wextra"} {"DRV_CFLAGS", "$DRV_CFLAGS -O3 -fPIC -march=native -mtune=native -Wall -Wextra -Werror"}
]}. ]}.
% for debugging use % for debugging use

View file

@ -1,54 +1,43 @@
%% ------------------------------------------------------------------- %% ---------------------------------------------------------------------------
%% %%
%% async_nif: An async thread-pool layer for Erlang's NIF API %% async_nif: An async thread-pool layer for Erlang's NIF API
%% %%
%% Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved. %% Copyright (c) 2012-2013 Basho Technologies, Inc. All Rights Reserved.
%% Author: Gregory Burd <greg@basho.com> <greg@burd.me> %% Author: Gregory Burd <greg@basho.com> <greg@burd.me>
%% %%
%% This file is provided to you under the Apache License, %% This file is provided to you under the Apache License, Version 2.0 (the
%% Version 2.0 (the "License"); you may not use this file %% "License"); you may not use this file except in compliance with the License.
%% except in compliance with the License. You may obtain %% You may obtain a copy of the License at:
%% a copy of the License at
%% %%
%% http://www.apache.org/licenses/LICENSE-2.0 %% http://www.apache.org/licenses/LICENSE-2.0
%% %%
%% Unless required by applicable law or agreed to in writing, %% Unless required by applicable law or agreed to in writing, software
%% software distributed under the License is distributed on an %% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY %% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
%% KIND, either express or implied. See the License for the %% License for the specific language governing permissions and limitations
%% specific language governing permissions and limitations
%% under the License. %% under the License.
%% %%
%% ------------------------------------------------------------------- %% ---------------------------------------------------------------------------
-define(ASYNC_NIF_CALL(Fun, Args), -spec async_nif_enqueue(reference(), function(), [term()]) -> term() | {error, term()}.
F = fun(F, T) -> async_nif_enqueue(R, F, A) ->
R = erlang:make_ref(), case erlang:apply(F, [R|A]) of
case erlang:apply(Fun, [R|Args]) of {ok, enqueued} ->
{ok, {enqueued, PctBusy}} -> receive
if {R, {error, eagain}} ->
PctBusy > 0.25 andalso PctBusy =< 1.0 -> %% Work unit was not queued, try again.
erlang:bump_reductions(erlang:trunc(2000 * PctBusy)); async_nif_enqueue(R, F, A);
true -> {R, {error, shutdown}=Error} ->
ok %% Work unit was queued, but not executed.
end, Error;
receive {R, {error, _Reason}=Error} ->
{R, {error, shutdown}=Error} -> %% Work unit returned an error.
%% Work unit was queued, but not executed. Error;
Error; {R, Reply} ->
{R, {error, _Reason}=Error} -> Reply
%% Work unit returned an error. end;
Error; Other ->
{R, Reply} -> Other
Reply end.
end;
{error, eagain} -> -define(ASYNC_NIF_CALL(Fun, Args), async_nif_enqueue(erlang:make_ref(), Fun, Args)).
case T of
3 -> not_found;
_ -> F(F, T + 1)
end;
Other ->
Other
end
end,
F(F, 1)).

View file

@ -32,7 +32,7 @@
%% EXPORTS %% EXPORTS
%%==================================================================== %%====================================================================
-export([ -export([
open/1, %open/1,
open/2, open/2,
open/3, open/3,
@ -40,11 +40,8 @@
put/3, put/3,
get/2, get/2,
txn_begin/1,
txn_commit/1,
txn_abort/1,
del/2, del/2,
update/3, upd/3, update/3, upd/3,
drop/1 drop/1
]). ]).
@ -83,8 +80,8 @@
%% @doc Create a new MDB database %% @doc Create a new MDB database
%% @end %% @end
%%-------------------------------------------------------------------- %%--------------------------------------------------------------------
open(DirName) -> %open(DirName) ->
open(DirName, ?MDB_MAP_SIZE). % open(DirName, ?MDB_MAP_SIZE).
open(DirName, MapSize) open(DirName, MapSize)
when is_integer(MapSize) when is_integer(MapSize)
andalso MapSize > 0 -> andalso MapSize > 0 ->
@ -119,24 +116,6 @@ get(Handle, Key)
get(_AsyncRef, _Handle, _Key) -> get(_AsyncRef, _Handle, _Key) ->
?NOT_LOADED. ?NOT_LOADED.
txn_begin(Handle) ->
?ASYNC_NIF_CALL(fun txn_begin/2, [Handle]).
txn_begin(_AsyncRef, _Handle) ->
?NOT_LOADED.
txn_commit(Handle) ->
?ASYNC_NIF_CALL(fun txn_commit/2, [Handle]).
txn_commit(_AsyncRef, _Handle) ->
?NOT_LOADED.
txn_abort(Handle) ->
?ASYNC_NIF_CALL(fun txn_abort/2, [Handle]).
txn_abort(_AsyncRef, _Handle) ->
?NOT_LOADED.
del(Handle, Key) del(Handle, Key)
when is_binary(Key) -> when is_binary(Key) ->
?ASYNC_NIF_CALL(fun del/3, [Handle, Key]). ?ASYNC_NIF_CALL(fun del/3, [Handle, Key]).
@ -198,9 +177,9 @@ open_test_db() ->
?cmd("rm -rf " ++ DataDir), ?cmd("rm -rf " ++ DataDir),
?assertMatch(ok, filelib:ensure_dir(filename:join([DataDir, "x"]))), ?assertMatch(ok, filelib:ensure_dir(filename:join([DataDir, "x"]))),
{ok, Handle} = ?MODULE:open(DataDir, 2147483648), {ok, Handle} = ?MODULE:open(DataDir, 2147483648),
[?MODULE:upd(Handle, crypto:hash(sha, <<X>>), [?MODULE:upd(Handle, crypto:sha(<<X>>),
crypto:rand_bytes(crypto:rand_uniform(128, 4096))) || crypto:rand_bytes(crypto:rand_uniform(128, 4096))) ||
X <- lists:seq(1, 10)], X <- lists:seq(1, 100)],
Handle. Handle.
basics_test_() -> basics_test_() ->

View file

@ -24,14 +24,13 @@
%% adding a "_" to the name and take the "_" out of the other's name). %% adding a "_" to the name and take the "_" out of the other's name).
{mode, max}. {mode, max}.
{duration, 480}. {duration, 10}.
{concurrent, 32}. {concurrent, 8}.
{driver, basho_bench_driver_lmdb}. {driver, basho_bench_driver_lmdb}.
{key_generator, {int_to_bin_littleendian,{uniform_int, 5000000000}}}. {key_generator, {int_to_bin_littleendian,{uniform_int, 5000000}}}.
{value_generator, {highly_compressible_bin, 2048}}. {value_generator, {fixed_bin, 1024}}.
%{value_generator, {fixed_bin, 1024}}. %{operations, [{get, 9}, {put, 9}, {delete, 2}]}.
{operations, [{get, 25}, {put, 70}, {delete, 5}]}. {operations, [{put, 1}]}.
%{operations, [{put, 1}]}.
{code_paths, ["../lmdb"]}. {code_paths, ["../lmdb"]}.
{lmdb_dir, "/home/gburd/ws/basho_bench/data"}. {lmdb_dir, "/home/gburd/ws/basho_bench/data"}.

View file

@ -1,6 +0,0 @@
#!/bin/bash
for file in lmdb.h mdb.c midl.h midl.c; do
curl -O https://raw.githubusercontent.com/LMDB/lmdb/mdb.master/libraries/liblmdb/$file
done