From f2c5ff30e75f4c888e7ad39d8f6b6884f7e1c78d Mon Sep 17 00:00:00 2001 From: Gregory Burd Date: Sun, 19 May 2013 00:19:01 -0400 Subject: [PATCH 1/2] Re-worked code to use async_nif among other things, more to come. --- Makefile | 47 +- README.md | 105 +- c_src/async_nif.h | 533 ++++++++ c_src/common.h | 61 + c_src/duration.h | 98 ++ c_src/emdb.c | 722 +++++++++++ c_src/emdb_drv.c | 479 -------- c_src/fifo_q.h | 93 ++ c_src/khash.h | 643 ++++++++++ c_src/{mdb.c => lmdb.c} | 1800 +++++++++++++++++++--------- c_src/{mdb.h => lmdb.h} | 401 +++++-- c_src/midl.c | 63 +- c_src/midl.h | 16 +- c_src/stats.h | 213 ++++ docs/20120829-LinuxCon-MDB-txt.pdf | Bin 0 -> 242407 bytes docs/mdm-slides.pdf | Bin 0 -> 628687 bytes rebar.config | 43 +- src/async_nif.hrl | 45 + src/emdb.erl | 167 ++- src/emdb_drv.erl | 137 --- src/emdb_oop.erl | 97 -- src/emdb_sup.erl | 56 + tools/basho_bench_driver_emdb.erl | 79 ++ tools/emdb.config | 37 + 24 files changed, 4452 insertions(+), 1483 deletions(-) create mode 100644 c_src/async_nif.h create mode 100644 c_src/common.h create mode 100644 c_src/duration.h create mode 100644 c_src/emdb.c delete mode 100644 c_src/emdb_drv.c create mode 100644 c_src/fifo_q.h create mode 100644 c_src/khash.h rename c_src/{mdb.c => lmdb.c} (81%) rename c_src/{mdb.h => lmdb.h} (72%) create mode 100644 c_src/stats.h create mode 100644 docs/20120829-LinuxCon-MDB-txt.pdf create mode 100644 docs/mdm-slides.pdf create mode 100644 src/async_nif.hrl delete mode 100644 src/emdb_drv.erl delete mode 100644 src/emdb_oop.erl create mode 100644 src/emdb_sup.erl create mode 100644 tools/basho_bench_driver_emdb.erl create mode 100644 tools/emdb.config diff --git a/Makefile b/Makefile index db712e7..ceae982 100644 --- a/Makefile +++ b/Makefile @@ -5,9 +5,9 @@ MODULE = emdb DIALYZER = dialyzer REBAR = rebar -.PHONY: build clean +.PHONY: compile clean -all: ebin priv build +all: ebin priv compile ebin: @mkdir -p $@ @@ -15,10 +15,51 @@ ebin: priv: @mkdir -p $@ -build: +compile: 
@$(REBAR) compile clean: @$(REBAR) clean @rm -f *~ */*~ erl_crash.dump @rm -rf ebin priv + +xref: + @$(REBAR) xref skip_deps=true + +test: eunit + +eunit: compile-for-eunit + @$(REBAR) eunit skip_deps=true + +eqc: compile-for-eqc + @$(REBAR) eqc skip_deps=true + +proper: compile-for-proper + @echo "rebar does not implement a 'proper' command" && false + +triq: compile-for-triq + @$(REBAR) triq skip_deps=true + +compile-for-eunit: + @$(REBAR) compile eunit compile_only=true + +compile-for-eqc: + @$(REBAR) -D QC -D QC_EQC compile eqc compile_only=true + +compile-for-eqcmini: + @$(REBAR) -D QC -D QC_EQCMINI compile eqc compile_only=true + +compile-for-proper: + @$(REBAR) -D QC -D QC_PROPER compile eqc compile_only=true + +compile-for-triq: + @$(REBAR) -D QC -D QC_TRIQ compile triq compile_only=true + +plt: compile + @$(DIALYZER) --build_plt --output_plt .$(TARGET).plt -pa deps/lager/ebin --apps kernel stdlib + +analyze: compile + @$(DIALYZER) --plt .$(TARGET).plt -pa deps/lager/ebin ebin + +repl: + @$(ERL) -pa ebin -pz deps/lager/ebin diff --git a/README.md b/README.md index 71237ea..1437b51 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -EMDB ==== EMDB is a NIF library for the [Memory-Mapped Database](http://highlandsun.com/hyc/mdb/) database, aka. MDB. The main purpose of this package is to provide a **very fast** Riak [backend](http://wiki.basho.com/Storage-Backends.html). +EMDB ==== EMDB is a NIF library for the [Memory-Mapped Database](http://highlandsun.com/hyc/mdb/) database, aka. MDB. The main purpose of this package is to provide a **very fast** Riak [backend](http://wiki.basho.com/Storage-Backends.html). But this module could also be used as a general key-value store to replace: @@ -26,65 +26,86 @@ But this module could also be used as a general key-value store to replace: * `drop/1`: deletes all key-value pairs in the database. 
-Usage ----- $ make +Usage +----- +$ make $ ./start.sh + %% create a new database + 1> {ok, Handle} = emdb:open("/tmp/emdb1"). - %% create a new database 1> {ok, Handle} = emdb:open("/tmp/emdb1"). + %% insert the key <<"a">> with value <<"1">> + 2> ok = emdb:put(Handle, <<"a">>, <<"1">>). - %% insert the key <<"a">> with value <<"1">> 2> ok = Handle:put(<<"a">>, <<"1">>). + %% try to re-insert the same key <<"a">> + 3> key_exist = emdb:put(Handle, <<"a">>, <<"2">>). - %% try to re-insert the same key <<"a">> 3> key_exist = Handle:put(<<"a">>, <<"2">>). + %% add a new key-value pair + 4> ok = emdb:put(Handle, <<"b">>, <<"2">>). - %% add a new key-value pair 4> ok = Handle:put(<<"b">>, <<"2">>). + %% search a non-existing key <<"c">> + 5> none = emdb:get(Handle, <<"c">>). - %% search a non-existing key <<"c">> 5> none = Handle:get(<<"c">>). + %% retrieve the value for key <<"b">> + 6> {ok, <<"2">>} = emdb:get(Handle, <<"b">>). - %% retrieve the value for key <<"b">> 6> {ok, <<"2">>} = Handle:get(<<"b">>). + %% retrieve the value for key <<"a">> + 7> {ok, <<"1">>} = emdb:get(Handle, <<"a">>). - %% retrieve the value for key <<"a">> 7> {ok, <<"1">>} = Handle:get(<<"a">>). - - %% delete key <<"b">> 8> ok = Handle:del(<<"b">>). + %% delete key <<"b">> + 8> ok = emdb:del(Handle, <<"b">>). %% search a non-existing key <<"b">> - 9> none = Handle:get(<<"b">>). + 9> none = emdb:get(Handle, <<"b">>). - %% delete a non-existing key <<"z">> 10> none = Handle:del(<<"z">>). + %% delete a non-existing key <<"z">> + 10> none = emdb:del(Handle, <<"z">>). + + %% ensure key <<"a">>'s value is still <<"1">> + 11> {ok, <<"1">>} = emdb:get(Handle, <<"a">>). - %% ensure key <<"a">>'s value is still <<"1">> 11> {ok, <<"1">>} = Handle:get(<<"a">>). %% update the value for key <<"a">> - 12> ok = Handle:update(<<"a">>, <<"7">>). + 12> ok = emdb:update(Handle, <<"a">>, <<"7">>). %% check the new value for key <<"a">> - 13> {ok, <<"7">>} = Handle:get(<<"a">>). 
+ 13> {ok, <<"7">>} = emdb:get(Handle, <<"a">>). - %% delete all key-value pairs in the database 14> ok = Handle:drop(). + %% delete all key-value pairs in the database + 14> ok = emdb:drop(Handle). - %% try to retrieve key <<"a">> value 15> none = Handle:get(<<"a">>). + %% try to retrieve key <<"a">> value + 15> none = emdb:get(Handle, <<"a">>). - %% close the database 16> ok = Handle:close(). + %% close the database + 16> ok = emdb:close(Handle). ... - - 17> q(). - -####Note: + 17> q(). + + +#### Note: The code below creates a new database with **80GB** MapSize, **avoid fsync** -after each commit (for max speed) and use the experimental **MDB_FIXEDMAP**. {ok, Handle} = emdb:open("/tmp/emdb2", 85899345920, ?MDB_NOSYNC bor ?MDB_FIXEDMAP). - - Performance ----------- For maximum speed, this library use only binaries for both keys and values. - See the impressive [microbench](http://highlandsun.com/hyc/mdb/microbench/) against: +after each commit (for max speed) and use the experimental **MDB_FIXEDMAP**. -* Google's LevelDB -* SQLite + {ok, Handle} = emdb:open("/tmp/emdb2", 85899345920, ?MDB_NOSYNC bor ?MDB_FIXEDMAP). + +Performance +----------- + +For maximum speed, this library uses only binaries for both keys and values. + +See the impressive [microbench](http://highlandsun.com/hyc/mdb/microbench/) against: +* Google's LevelDB (which is slower and can stall unlike Basho's fork of LevelDB) +* SQLite3 * Kyoto TreeDB -* BerkeleyDB +* BerkeleyDB 5.x MDB performs better on 64-bit arch. -Supported OSes -------------- +Supported Operating Systems +-------------- Should work on 32/64-bit architectures: @@ -93,14 +114,24 @@ Should work on 32/64-bit architectures: * FreeBSD * Windows - TODO ---- +TODO +---- -* Unit tests * PropEr testing +* Fold over keys and/or values +* Unit tests +* PropEr testing * Bulk "writing" +* basho_bench driver +* EQC, PULSE testing +* Key expiry +* Atomic group commit (for 2i) -Volunteers are always welcome! 
Status +Volunteers are always welcome! + +Status ------ - #### Work in progress. Don't use it in production! - LICENSE ------- - EMDB is Copyright (C) 2012 by Aleph Archives, and released under the [OpenLDAP](http://www.OpenLDAP.org/license.html) License. +LICENSE +------- + +EMDB is Copyright (C) 2012-2013 by Aleph Archives and Basho Technologies, Inc., and released under the [OpenLDAP](http://www.OpenLDAP.org/license.html) License. diff --git a/c_src/async_nif.h b/c_src/async_nif.h new file mode 100644 index 0000000..e7a9670 --- /dev/null +++ b/c_src/async_nif.h @@ -0,0 +1,533 @@ +/* + * async_nif: An async thread-pool layer for Erlang's NIF API + * + * Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved. + * Author: Gregory Burd + * + * This file is provided to you under the Apache License, + * Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain + * a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef __ASYNC_NIF_H__ +#define __ASYNC_NIF_H__ + +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include "fifo_q.h" +#include "stats.h" + +#ifndef __UNUSED +#define __UNUSED(v) ((void)(v)) +#endif + +#define ASYNC_NIF_MAX_WORKERS 128 +#define ASYNC_NIF_WORKER_QUEUE_SIZE 500 +#define ASYNC_NIF_MAX_QUEUED_REQS 1000 * ASYNC_NIF_MAX_WORKERS + +STAT_DECL(qwait, 1000); + +struct async_nif_req_entry { + ERL_NIF_TERM ref; + ErlNifEnv *env; + ErlNifPid pid; + void *args; + void (*fn_work)(ErlNifEnv*, ERL_NIF_TERM, ErlNifPid*, unsigned int, void *); + void (*fn_post)(void *); +}; +DECL_FIFO_QUEUE(reqs, struct async_nif_req_entry); + +struct async_nif_work_queue { + STAT_DEF(qwait); + ErlNifMutex *reqs_mutex; + ErlNifCond *reqs_cnd; + FIFO_QUEUE_TYPE(reqs) reqs; +}; + +struct async_nif_worker_entry { + ErlNifTid tid; + unsigned int worker_id; + struct async_nif_state *async_nif; + struct async_nif_work_queue *q; +}; + +struct async_nif_state { + STAT_DEF(qwait); + unsigned int shutdown; + unsigned int num_workers; + struct async_nif_worker_entry worker_entries[ASYNC_NIF_MAX_WORKERS]; + unsigned int num_queues; + unsigned int next_q; + FIFO_QUEUE_TYPE(reqs) recycled_reqs; + unsigned int num_reqs; + ErlNifMutex *recycled_req_mutex; + struct async_nif_work_queue queues[]; +}; + +#define ASYNC_NIF_DECL(decl, frame, pre_block, work_block, post_block) \ + struct decl ## _args frame; \ + static void fn_work_ ## decl (ErlNifEnv *env, ERL_NIF_TERM ref, ErlNifPid *pid, unsigned int worker_id, struct decl ## _args *args) { \ + __UNUSED(worker_id); \ + do work_block while(0); \ + } \ + static void fn_post_ ## decl (struct decl ## _args *args) { \ + __UNUSED(args); \ + do post_block while(0); \ + } \ + static ERL_NIF_TERM decl(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv_in[]) { \ + struct decl ## _args on_stack_args; \ + struct decl ## _args *args = &on_stack_args; \ + struct decl ## _args *copy_of_args; \ + struct async_nif_req_entry *req = NULL; \ + const 
char *affinity = NULL; \ + ErlNifEnv *new_env = NULL; \ + /* argv[0] is a ref used for selective recv */ \ + const ERL_NIF_TERM *argv = argv_in + 1; \ + argc -= 1; \ + /* Note: !!! this assumes that the first element of priv_data is ours */ \ + struct async_nif_state *async_nif = *(struct async_nif_state**)enif_priv_data(env); \ + if (async_nif->shutdown) \ + return enif_make_tuple2(env, enif_make_atom(env, "error"), \ + enif_make_atom(env, "shutdown")); \ + req = async_nif_reuse_req(async_nif); \ + new_env = req->env; \ + if (!req) \ + return enif_make_tuple2(env, enif_make_atom(env, "error"), \ + enif_make_atom(env, "eagain")); \ + do pre_block while(0); \ + copy_of_args = (struct decl ## _args *)enif_alloc(sizeof(struct decl ## _args)); \ + if (!copy_of_args) { \ + fn_post_ ## decl (args); \ + return enif_make_tuple2(env, enif_make_atom(env, "error"), \ + enif_make_atom(env, "enomem")); \ + } \ + memcpy(copy_of_args, args, sizeof(struct decl ## _args)); \ + req->ref = enif_make_copy(new_env, argv_in[0]); \ + enif_self(env, &req->pid); \ + req->args = (void*)copy_of_args; \ + req->fn_work = (void (*)(ErlNifEnv *, ERL_NIF_TERM, ErlNifPid*, unsigned int, void *))fn_work_ ## decl ; \ + req->fn_post = (void (*)(void *))fn_post_ ## decl; \ + int h = -1; \ + if (affinity) \ + h = async_nif_str_hash_func(affinity) % async_nif->num_queues; \ + ERL_NIF_TERM reply = async_nif_enqueue_req(async_nif, req, h); \ + if (!reply) { \ + fn_post_ ## decl (args); \ + enif_free(copy_of_args); \ + return enif_make_tuple2(env, enif_make_atom(env, "error"), \ + enif_make_atom(env, "shutdown")); \ + } \ + return reply; \ + } + +#define ASYNC_NIF_INIT(name) \ + static ErlNifMutex *name##_async_nif_coord = NULL; + +#define ASYNC_NIF_LOAD(name, priv) do { \ + if (!name##_async_nif_coord) \ + name##_async_nif_coord = enif_mutex_create(NULL); \ + enif_mutex_lock(name##_async_nif_coord); \ + priv = async_nif_load(); \ + enif_mutex_unlock(name##_async_nif_coord); \ + } while(0); +#define 
ASYNC_NIF_UNLOAD(name, env, priv) do { \ + if (!name##_async_nif_coord) \ + name##_async_nif_coord = enif_mutex_create(NULL); \ + enif_mutex_lock(name##_async_nif_coord); \ + async_nif_unload(env, priv); \ + enif_mutex_unlock(name##_async_nif_coord); \ + enif_mutex_destroy(name##_async_nif_coord); \ + name##_async_nif_coord = NULL; \ + } while(0); +#define ASYNC_NIF_UPGRADE(name, env) do { \ + if (!name##_async_nif_coord) \ + name##_async_nif_coord = enif_mutex_create(NULL); \ + enif_mutex_lock(name##_async_nif_coord); \ + async_nif_upgrade(env); \ + enif_mutex_unlock(name##_async_nif_coord); \ + } while(0); + +#define ASYNC_NIF_RETURN_BADARG() do { \ + async_nif_recycle_req(req, async_nif); \ + return enif_make_badarg(env); \ + } while(0); +#define ASYNC_NIF_WORK_ENV new_env + +#define ASYNC_NIF_REPLY(msg) enif_send(NULL, pid, env, enif_make_tuple2(env, ref, msg)) + +/** + * Return a request structure from the recycled req queue if one exists, + * otherwise create one. + */ +struct async_nif_req_entry * +async_nif_reuse_req(struct async_nif_state *async_nif) +{ + struct async_nif_req_entry *req = NULL; + ErlNifEnv *env = NULL; + + enif_mutex_lock(async_nif->recycled_req_mutex); + if (fifo_q_empty(reqs, async_nif->recycled_reqs)) { + if (async_nif->num_reqs < ASYNC_NIF_MAX_QUEUED_REQS) { + req = enif_alloc(sizeof(struct async_nif_req_entry)); + if (req) { + memset(req, 0, sizeof(struct async_nif_req_entry)); + env = enif_alloc_env(); + if (!env) { + enif_free(req); + req = NULL; + } else { + req->env = env; + async_nif->num_reqs++; + } + } + } + } else { + req = fifo_q_get(reqs, async_nif->recycled_reqs); + } + enif_mutex_unlock(async_nif->recycled_req_mutex); + STAT_TICK(async_nif, qwait); + return req; +} + +/** + * Store the request for future re-use. + * + * req a request entry with an ErlNifEnv* which will be cleared + * before reuse, but not until then. 
+ * async_nif a handle to our state so that we can find and use the mutex + */ +void +async_nif_recycle_req(struct async_nif_req_entry *req, struct async_nif_state *async_nif) +{ + STAT_TOCK(async_nif, qwait); + enif_mutex_lock(async_nif->recycled_req_mutex); + fifo_q_put(reqs, async_nif->recycled_reqs, req); + enif_mutex_unlock(async_nif->recycled_req_mutex); +} + +/** + * A string hash function. + * + * A basic hash function for strings of characters used during the + * affinity association. + * + * s a NULL terminated set of bytes to be hashed + * -> an integer hash encoding of the bytes + */ +static inline unsigned int +async_nif_str_hash_func(const char *s) +{ + unsigned int h = (unsigned int)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (unsigned int)*s; + return h; +} + +/** + * Enqueue a request for processing by a worker thread. + * + * Places the request into a work queue determined either by the + * provided affinity or by iterating through the available queues. + */ +static ERL_NIF_TERM +async_nif_enqueue_req(struct async_nif_state* async_nif, struct async_nif_req_entry *req, int hint) +{ + /* Identify the most appropriate worker for this request. */ + unsigned int qid = 0; + struct async_nif_work_queue *q = NULL; + unsigned int n = async_nif->num_queues; + + /* Either we're choosing a queue based on some affinity/hinted value or we + need to select the next queue in the rotation and atomically update that + global value (next_q is shared across worker threads) . */ + if (hint >= 0) { + qid = (unsigned int)hint; + } else { + qid = async_nif->next_q; + qid = (qid + 1) % async_nif->num_queues; + async_nif->next_q = qid; + } + + /* Now we inspect and interate across the set of queues trying to select one + that isn't too full or too slow. */ + do { + q = &async_nif->queues[qid]; + enif_mutex_lock(q->reqs_mutex); + + /* Now that we hold the lock, check for shutdown. 
As long as we hold + this lock either a) we're shutting down so exit now or b) this queue + will be valid until we release the lock. */ + if (async_nif->shutdown) { + enif_mutex_unlock(q->reqs_mutex); + return 0; + } + double await = STAT_MEAN_LOG2_SAMPLE(async_nif, qwait); + double await_inthisq = STAT_MEAN_LOG2_SAMPLE(q, qwait); + if (fifo_q_full(reqs, q->reqs) || await_inthisq > await) { + enif_mutex_unlock(q->reqs_mutex); + qid = (qid + 1) % async_nif->num_queues; + q = &async_nif->queues[qid]; + } else { + break; + } + // TODO: at some point add in work sheading/stealing + } while(n-- > 0); + + /* We hold the queue's lock, and we've seletect a reasonable queue for this + new request so add the request. */ + STAT_TICK(q, qwait); + fifo_q_put(reqs, q->reqs, req); + + /* Build the term before releasing the lock so as not to race on the use of + the req pointer (which will soon become invalid in another thread + performing the request). */ + ERL_NIF_TERM reply = enif_make_tuple2(req->env, enif_make_atom(req->env, "ok"), + enif_make_atom(req->env, "enqueued")); + enif_mutex_unlock(q->reqs_mutex); + enif_cond_signal(q->reqs_cnd); + return reply; +} + +/** + * TODO: + */ +static void * +async_nif_worker_fn(void *arg) +{ + struct async_nif_worker_entry *we = (struct async_nif_worker_entry *)arg; + unsigned int worker_id = we->worker_id; + struct async_nif_state *async_nif = we->async_nif; + struct async_nif_work_queue *q = we->q; + struct async_nif_req_entry *req = NULL; + + for(;;) { + /* Examine the request queue, are there things to be done? */ + enif_mutex_lock(q->reqs_mutex); + check_again_for_work: + if (async_nif->shutdown) { + enif_mutex_unlock(q->reqs_mutex); + break; + } + if (fifo_q_empty(reqs, q->reqs)) { + /* Queue is empty so we wait for more work to arrive. 
*/ + STAT_RESET(q, qwait); + enif_cond_wait(q->reqs_cnd, q->reqs_mutex); + goto check_again_for_work; + } else { + assert(fifo_q_size(reqs, q->reqs) > 0); + assert(fifo_q_size(reqs, q->reqs) < fifo_q_capacity(reqs, q->reqs)); + /* At this point the next req is ours to process and we hold the + reqs_mutex lock. Take the request off the queue. */ + req = fifo_q_get(reqs, q->reqs); + enif_mutex_unlock(q->reqs_mutex); + + /* Ensure that there is at least one other worker thread watching this + queue. */ + enif_cond_signal(q->reqs_cnd); + + /* Perform the work. */ + req->fn_work(req->env, req->ref, &req->pid, worker_id, req->args); + STAT_TOCK(q, qwait); + + /* Now call the post-work cleanup function. */ + req->fn_post(req->args); + + /* Clean up req for reuse. */ + req->ref = 0; + req->fn_work = 0; + req->fn_post = 0; + enif_free(req->args); + req->args = NULL; + enif_clear_env(req->env); + async_nif_recycle_req(req, async_nif); + req = NULL; + } + } + enif_thread_exit(0); + return 0; +} + +static void +async_nif_unload(ErlNifEnv *env, struct async_nif_state *async_nif) +{ + unsigned int i; + unsigned int num_queues = async_nif->num_queues; + struct async_nif_work_queue *q = NULL; + struct async_nif_req_entry *req = NULL; + __UNUSED(env); + + STAT_PRINT(async_nif, qwait, "wterl"); + + /* Signal the worker threads, stop what you're doing and exit. To + ensure that we don't race with the enqueue() process we first + lock all the worker queues, then set shutdown to true, then + unlock. The enqueue function will take the queue mutex, then + test for shutdown condition, then enqueue only if not shutting + down. */ + for (i = 0; i < num_queues; i++) { + q = &async_nif->queues[i]; + enif_mutex_lock(q->reqs_mutex); + } + async_nif->shutdown = 1; + for (i = 0; i < num_queues; i++) { + q = &async_nif->queues[i]; + enif_cond_broadcast(q->reqs_cnd); + enif_mutex_unlock(q->reqs_mutex); + } + + /* Join for the now exiting worker threads. 
*/ + for (i = 0; i < async_nif->num_workers; ++i) { + void *exit_value = 0; /* We ignore the thread_join's exit value. */ + enif_thread_join(async_nif->worker_entries[i].tid, &exit_value); + } + + /* Free req structres sitting on the recycle queue. */ + enif_mutex_lock(async_nif->recycled_req_mutex); + req = NULL; + fifo_q_foreach(reqs, async_nif->recycled_reqs, req, { + enif_free_env(req->env); + enif_free(req); + }); + fifo_q_free(reqs, async_nif->recycled_reqs); + + /* Cleanup in-flight requests, mutexes and conditions in each work queue. */ + for (i = 0; i < num_queues; i++) { + q = &async_nif->queues[i]; + + /* Worker threads are stopped, now toss anything left in the queue. */ + req = NULL; + fifo_q_foreach(reqs, q->reqs, req, { + enif_clear_env(req->env); + enif_send(NULL, &req->pid, req->env, + enif_make_tuple2(req->env, enif_make_atom(req->env, "error"), + enif_make_atom(req->env, "shutdown"))); + req->fn_post(req->args); + enif_free_env(req->env); + enif_free(req->args); + enif_free(req); + }); + fifo_q_free(reqs, q->reqs); + enif_mutex_destroy(q->reqs_mutex); + enif_cond_destroy(q->reqs_cnd); + } + + enif_mutex_unlock(async_nif->recycled_req_mutex); + enif_mutex_destroy(async_nif->recycled_req_mutex); + memset(async_nif, 0, sizeof(struct async_nif_state) + (sizeof(struct async_nif_work_queue) * async_nif->num_queues)); + enif_free(async_nif); +} + +static void * +async_nif_load() +{ + static int has_init = 0; + unsigned int i, j, num_queues; + ErlNifSysInfo info; + struct async_nif_state *async_nif; + + /* Don't init more than once. */ + if (has_init) return 0; + else has_init = 1; + + /* Find out how many schedulers there are. */ + enif_system_info(&info, sizeof(ErlNifSysInfo)); + + /* Size the number of work queues according to schedulers. 
*/ + if (info.scheduler_threads > ASYNC_NIF_MAX_WORKERS / 2) { + num_queues = ASYNC_NIF_MAX_WORKERS / 2; + } else { + int remainder = ASYNC_NIF_MAX_WORKERS % info.scheduler_threads; + if (remainder != 0) + num_queues = info.scheduler_threads - remainder; + else + num_queues = info.scheduler_threads; + if (num_queues < 2) + num_queues = 2; + } + + /* Init our portion of priv_data's module-specific state. */ + async_nif = enif_alloc(sizeof(struct async_nif_state) + + sizeof(struct async_nif_work_queue) * num_queues); + if (!async_nif) + return NULL; + memset(async_nif, 0, sizeof(struct async_nif_state) + + sizeof(struct async_nif_work_queue) * num_queues); + + async_nif->num_queues = num_queues; + async_nif->num_workers = 2 * num_queues; + async_nif->next_q = 0; + async_nif->shutdown = 0; + async_nif->recycled_reqs = fifo_q_new(reqs, ASYNC_NIF_MAX_QUEUED_REQS); + async_nif->recycled_req_mutex = enif_mutex_create(NULL); + STAT_INIT(async_nif, qwait); + + for (i = 0; i < async_nif->num_queues; i++) { + struct async_nif_work_queue *q = &async_nif->queues[i]; + q->reqs = fifo_q_new(reqs, ASYNC_NIF_WORKER_QUEUE_SIZE); + q->reqs_mutex = enif_mutex_create(NULL); + q->reqs_cnd = enif_cond_create(NULL); + STAT_INIT(q, qwait); + } + + /* Setup the thread pool management. */ + memset(async_nif->worker_entries, 0, sizeof(struct async_nif_worker_entry) * ASYNC_NIF_MAX_WORKERS); + + /* Start the worker threads. */ + for (i = 0; i < async_nif->num_workers; i++) { + struct async_nif_worker_entry *we = &async_nif->worker_entries[i]; + we->async_nif = async_nif; + we->worker_id = i; + we->q = &async_nif->queues[i % async_nif->num_queues]; + if (enif_thread_create(NULL, &async_nif->worker_entries[i].tid, + &async_nif_worker_fn, (void*)we, NULL) != 0) { + async_nif->shutdown = 1; + + for (j = 0; j < async_nif->num_queues; j++) { + struct async_nif_work_queue *q = &async_nif->queues[j]; + enif_cond_broadcast(q->reqs_cnd); + } + + while(i-- > 0) { + void *exit_value = 0; /* Ignore this. 
*/ + enif_thread_join(async_nif->worker_entries[i].tid, &exit_value); + } + + for (j = 0; j < async_nif->num_queues; j++) { + struct async_nif_work_queue *q = &async_nif->queues[j]; + enif_mutex_destroy(q->reqs_mutex); + enif_cond_destroy(q->reqs_cnd); + } + + memset(async_nif->worker_entries, 0, sizeof(struct async_nif_worker_entry) * ASYNC_NIF_MAX_WORKERS); + enif_free(async_nif); + return NULL; + } + } + return async_nif; +} + +static void +async_nif_upgrade(ErlNifEnv *env) +{ + __UNUSED(env); + // TODO: +} + + +#if defined(__cplusplus) +} +#endif + +#endif // __ASYNC_NIF_H__ diff --git a/c_src/common.h b/c_src/common.h new file mode 100644 index 0000000..bbb4fdd --- /dev/null +++ b/c_src/common.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved. + * Author: Gregory Burd + * + * This file is provided to you under the Apache License, + * Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain + * a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifdef DEBUG +#include +#include +#ifndef DPRINTF +#define DPRINTF(fmt, ...) \ + do { \ + fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__); \ + fflush(stderr); \ + } while(0) +#endif +#ifndef DPUTS +#define DPUTS(arg) DPRINTF("%s", arg) +#endif +#else +#define DPRINTF(fmt, ...) 
((void) 0) +#define DPUTS(arg) ((void) 0) +#endif + +#ifndef COMPQUIET +#define COMPQUIET(n, v) do { \ + (n) = (v); \ + (n) = (n); \ +} while (0) +#endif + +#ifndef __UNUSED +#define __UNUSED(v) ((void)(v)) +#endif + + +#if defined(__cplusplus) +} +#endif + +#endif // __COMMON_H__ diff --git a/c_src/duration.h b/c_src/duration.h new file mode 100644 index 0000000..635d0fd --- /dev/null +++ b/c_src/duration.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2013, all rights reserved by Gregory Burd + * + * This Source Code Form is subject to the terms of the Mozilla Public License, + * version 2 (MPLv2). If a copy of the MPL was not distributed with this file, + * you can obtain one at: http://mozilla.org/MPL/2.0/ + * + * NOTES: + * - on some platforms this will require -lrt + */ +#include +#include +#include +#include + +typedef enum { ns = 0, mcs, ms, s } time_scale; +struct scale_time { + const char *abbreviation; + const char *name; + uint64_t mul, div, overhead, ticks_per; +}; +static const struct scale_time scale[] = { + { "ns", "nanosecond", 1000000000LL, 1LL, 10, 2300000000000LL }, + { "mcs", "microsecond", 1000000LL, 1000LL, 10, 2300000000LL }, + { "ms", "millisecond", 1000LL, 1000000LL, 10, 2300000LL }, + { "sec", "second", 1LL, 1000000000LL, 10, 2300LL } }; + +static uint64_t ts(time_scale unit) +{ + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + return (((uint64_t)ts.tv_sec * scale[unit].mul) + + ((uint64_t)ts.tv_nsec / scale[unit].div)); +} + +#if 0 +//if defined(__i386__) || defined(__x86_64__) + +/** + * cpu_clock_ticks() + * + * A measure provided by Intel x86 CPUs which provides the number of cycles + * (aka "ticks") executed as a counter using the RDTSC instruction. 
+ */ +static inline uint64_t cpu_clock_ticks() +{ + uint32_t lo, hi; + __asm__ __volatile__ ( + "xorl %%eax, %%eax\n" + "cpuid\n" + "rdtsc\n" + : "=a" (lo), "=d" (hi) + : + : "%ebx", "%ecx" ); + return (uint64_t)hi << 32 | lo; +} + +/** + * cpu_clock_ticks() + * + * An approximation of elapsed [ns, mcs, ms, s] from CPU clock ticks. + */ +static uint64_t elapsed_cpu_clock_ticks(uint64_t start, time_scale unit) +{ + return (cpu_clock_ticks() - start - scale[unit].overhead) * scale[unit].ticks_per; +} + +#endif + +typedef struct { + uint64_t then; + time_scale unit; +} duration_t; + +static inline uint64_t elapsed(duration_t *d) +{ + uint64_t now = ts(d->unit); + uint64_t elapsed = now - d->then; + d->then = now; + return elapsed; +} + +#define DURATION(name, resolution) duration_t name = \ + {ts(resolution), resolution} + +#define ELAPSED_DURING(result, resolution, block) \ + do { \ + DURATION(__x, resolution); \ + do block while(0); \ + *result = elapsed(&__x); \ + } while(0); + +#define CYCLES_DURING(result, block) \ + do { \ + uint64_t __begin = cpu_clock_ticks(); \ + do block while(0); \ + *result = cpu_clock_ticks() - __begin; \ + } while(0); diff --git a/c_src/emdb.c b/c_src/emdb.c new file mode 100644 index 0000000..db6028b --- /dev/null +++ b/c_src/emdb.c @@ -0,0 +1,722 @@ +/* ------------------------------------------------------------------------- + * This file is part of EMDB - Erlang MDB API + * + * Copyright (c) 2012 by Aleph Archives. All rights reserved. + * + * ------------------------------------------------------------------------- + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * -------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "async_nif.h" +#include "stats.h" +#include "lmdb.h" + +STAT_DECL(emdb_get, 1000); +STAT_DECL(emdb_put, 1000); +STAT_DECL(emdb_del, 1000); +STAT_DECL(emdb_upd, 1000); + +static ErlNifResourceType *emdb_RESOURCE; +struct emdb { + MDB_env *env; + MDB_dbi dbi; + STAT_DEF(emdb_get); + STAT_DEF(emdb_put); + STAT_DEF(emdb_del); + STAT_DEF(emdb_upd); +}; + +struct emdb_priv_data { + void *async_nif_priv; // Note: must be first element in struct +}; + +/* Global init for async_nif. 
*/
+ASYNC_NIF_INIT(emdb);
+
+/* Atoms (initialized in emdb_load) */
+static ERL_NIF_TERM ATOM_ERROR;
+static ERL_NIF_TERM ATOM_OK;
+static ERL_NIF_TERM ATOM_NOT_FOUND;
+static ERL_NIF_TERM ATOM_EXISTS;
+static ERL_NIF_TERM ATOM_KEYEXIST;
+static ERL_NIF_TERM ATOM_NOTFOUND;
+static ERL_NIF_TERM ATOM_PAGE_NOTFOUND;
+static ERL_NIF_TERM ATOM_CORRUPTED;
+static ERL_NIF_TERM ATOM_PANIC;
+static ERL_NIF_TERM ATOM_VERSION_MISMATCH;
+static ERL_NIF_TERM ATOM_MAP_FULL;
+static ERL_NIF_TERM ATOM_DBS_FULL;
+static ERL_NIF_TERM ATOM_READERS_FULL;
+static ERL_NIF_TERM ATOM_TLS_FULL;
+static ERL_NIF_TERM ATOM_TXN_FULL;
+static ERL_NIF_TERM ATOM_CURSOR_FULL;
+static ERL_NIF_TERM ATOM_PAGE_FULL;
+static ERL_NIF_TERM ATOM_MAP_RESIZED;
+static ERL_NIF_TERM ATOM_INCOMPATIBLE;
+static ERL_NIF_TERM ATOM_BAD_RSLOT;
+
+/* Evaluate an LMDB call; on failure store the error term in `err` and jump
+   to `label`.  Expects `ret`, `err` and `env` in the calling scope.
+   Wrapped in do/while(0) so it behaves as a single statement. */
+#define CHECK(expr, label)                                              \
+  do {                                                                  \
+    if (MDB_SUCCESS != (ret = (expr))) {                                \
+      DPRINTF("CHECK(\"%s\") failed \"%s\" at %s:%d in %s()\n",         \
+              #expr, mdb_strerror(ret), __FILE__, __LINE__, __func__);  \
+      err = __strerror_term(env, ret);                                  \
+      goto label;                                                       \
+    }                                                                   \
+  } while(0)
+
+/* Store the error term for code `e` in `err` and jump to `label`. */
+#define FAIL_ERR(e, label)                                              \
+  do {                                                                  \
+    err = __strerror_term(env, (e));                                    \
+    goto label;                                                         \
+  } while(0)
+
+/**
+ * Convenience function to generate {error, {Reason, Message}}
+ *
+ * Reason is an atom: one of the LMDB-specific atoms below for MDB_* codes,
+ * otherwise the erl_errno_id() name of the errno value.  Message is the
+ * mdb_strerror() text as a string.
+ *
+ * env    NIF environment
+ * err    number of last error (MDB_* code or errno value)
+ */
+static ERL_NIF_TERM
+__strerror_term(ErlNifEnv* env, int err)
+{
+    ERL_NIF_TERM term;
+
+    /* Dispatch on the code directly.  The previous exclusive range test
+       rejected MDB_KEYEXIST and MDB_BAD_RSLOT, the two end points. */
+    switch (err) {
+    case MDB_KEYEXIST: /** key/data pair already exists */
+        term = ATOM_KEYEXIST;
+        break;
+    case MDB_NOTFOUND: /** key/data pair not found (EOF) */
+        term = ATOM_NOTFOUND;
+        break;
+    case MDB_PAGE_NOTFOUND: /** Requested page not found - this usually indicates corruption */
+        term = ATOM_PAGE_NOTFOUND;
+        break;
+    case MDB_CORRUPTED: /** Located page was wrong type */
+        term = ATOM_CORRUPTED;
+        break;
+    case MDB_PANIC: /** Update of meta page failed, probably I/O error */
+        term = ATOM_PANIC;
+        break;
+    case MDB_VERSION_MISMATCH: /** Environment version mismatch */
+        term = ATOM_VERSION_MISMATCH;
+        break;
+    case MDB_INVALID: /** File is not a valid MDB file */
+        /* Was mapped to ATOM_KEYEXIST; built inline because no static
+           atom is declared for it. */
+        term = enif_make_atom(env, "invalid");
+        break;
+    case MDB_MAP_FULL: /** Environment mapsize reached */
+        term = ATOM_MAP_FULL;
+        break;
+    case MDB_DBS_FULL: /** Environment maxdbs reached */
+        term = ATOM_DBS_FULL;
+        break;
+    case MDB_READERS_FULL: /** Environment maxreaders reached */
+        term = ATOM_READERS_FULL;
+        break;
+    case MDB_TLS_FULL: /** Too many TLS keys in use - Windows only */
+        term = ATOM_TLS_FULL;
+        break;
+    case MDB_TXN_FULL: /** Txn has too many dirty pages */
+        term = ATOM_TXN_FULL;
+        break;
+    case MDB_CURSOR_FULL: /** Cursor stack too deep - internal error */
+        term = ATOM_CURSOR_FULL;
+        break;
+    case MDB_PAGE_FULL: /** Page has not enough space - internal error */
+        term = ATOM_PAGE_FULL;
+        break;
+    case MDB_MAP_RESIZED: /** Database contents grew beyond environment mapsize */
+        term = ATOM_MAP_RESIZED;
+        break;
+    case MDB_INCOMPATIBLE: /** Database flags changed or would change */
+        term = ATOM_INCOMPATIBLE;
+        break;
+    case MDB_BAD_RSLOT: /** Invalid reuse of reader locktable slot */
+        term = ATOM_BAD_RSLOT;
+        break;
+    default: /* not an LMDB code: treat as an errno value */
+        term = enif_make_atom(env, erl_errno_id(err));
+        break;
+    }
+
+    /* We return the error atom as well as the message here because the
+       message text may differ across platforms and/or may be localized to
+       any given language (i18n).  Use the atom rather than the message
+       when matching in Erlang.  You've been warned. */
+    return enif_make_tuple(env, 2, ATOM_ERROR,
+             enif_make_tuple(env, 2, term,
+               enif_make_string(env, mdb_strerror(err), ERL_NIF_LATIN1)));
+}
+
+/**
+ * Opens a MDB database.
+ *
+ * Replies {ok, Handle} on success, otherwise the {error, {Reason, Message}}
+ * term built by __strerror_term().
+ *
+ * argv[0]    path to directory for the database files
+ * argv[1]    size of database
+ * argv[2]    flags
+ */
+ASYNC_NIF_DECL(
+  emdb_open,
+  { // struct
+
+    char dirname[MAXPATHLEN];
+    ErlNifUInt64 mapsize;
+    ErlNifUInt64 envflags;
+  },
+  { // pre
+    if (!(argc == 3 &&
+          enif_is_list(env, argv[0]) &&
+          enif_is_number(env, argv[1]) &&
+          enif_is_number(env, argv[2]))) {
+      ASYNC_NIF_RETURN_BADARG();
+    }
+    if (enif_get_string(env, argv[0], args->dirname,
+                        MAXPATHLEN, ERL_NIF_LATIN1) <= 0)
+      ASYNC_NIF_RETURN_BADARG();
+    /* enif_is_number() also accepts floats and negative integers, so the
+       conversions can fail; reject those rather than use garbage. */
+    if (!enif_get_uint64(env, argv[1], &(args->mapsize)))
+      ASYNC_NIF_RETURN_BADARG();
+    if (!enif_get_uint64(env, argv[2], &(args->envflags)))
+      ASYNC_NIF_RETURN_BADARG();
+  },
+  { // work
+
+    ERL_NIF_TERM err;
+    MDB_txn *txn;
+    struct emdb *handle;
+    int ret;
+
+    if ((handle = enif_alloc_resource(emdb_RESOURCE, sizeof(struct emdb))) == NULL)
+      FAIL_ERR(ENOMEM, err_reply);
+
+    STAT_INIT(handle, emdb_get);
+    STAT_INIT(handle, emdb_put);
+    STAT_INIT(handle, emdb_upd);
+    STAT_INIT(handle, emdb_del);
+
+    /* No environment exists yet if this fails, so skip mdb_env_close(). */
+    CHECK(mdb_env_create(&(handle->env)), err_handle);
+
+    if (mdb_env_set_mapsize(handle->env, args->mapsize)) {
+      /* Same reply as before, but no longer leaks the env and handle. */
+      mdb_env_close(handle->env);
+      enif_release_resource(handle);
+      ASYNC_NIF_REPLY(enif_make_badarg(env));
+      return;
+    }
+
+    CHECK(mdb_env_open(handle->env, args->dirname, args->envflags, 0664), err_env);
+    CHECK(mdb_txn_begin(handle->env, NULL, 0, &txn), err_env);
+    CHECK(mdb_open(txn, NULL, 0, &(handle->dbi)), err_txn);
+    /* mdb_txn_commit() frees the txn even when it fails, so a failed
+       commit must not be followed by mdb_txn_abort(). */
+    CHECK(mdb_txn_commit(txn), err_env);
+
+    ERL_NIF_TERM term = enif_make_resource(env, handle);
+    enif_release_resource(handle);
+    ASYNC_NIF_REPLY(enif_make_tuple(env, 2, ATOM_OK, term));
+    return;
+
+  err_txn:
+    mdb_txn_abort(txn);
+  err_env:
+    mdb_env_close(handle->env);
+  err_handle:
+    /* Drop our only reference so the resource is freed. */
+    enif_release_resource(handle);
+  err_reply:
+    ASYNC_NIF_REPLY(err);
+    return;
+  },
+  { // post
+
+  });
+
+
+/**
+ * Closes a MDB database.
+ *
+ * Closes the LMDB environment of the handle and replies `ok`.  The handle's
+ * env pointer is set to NULL afterwards; the pre blocks of the operations
+ * in this file return badarg for a handle whose env is NULL.
+ *
+ * argv[0]    reference to the MDB handle resource
+ */
+ASYNC_NIF_DECL(
+  emdb_close,
+  { // struct
+
+    struct emdb *handle;
+  },
+  { // pre
+
+    if (!(argc == 1 &&
+          enif_get_resource(env, argv[0], emdb_RESOURCE, (void**)&args->handle))) {
+      ASYNC_NIF_RETURN_BADARG();
+    }
+    if (!args->handle->env)
+      ASYNC_NIF_RETURN_BADARG();
+    /* Keep the resource referenced until the post block has run. */
+    enif_keep_resource((void*)args->handle);
+  },
+  { // work
+
+    STAT_PRINT(args->handle, emdb_get, "emdb");
+    STAT_PRINT(args->handle, emdb_put, "emdb");
+    STAT_PRINT(args->handle, emdb_del, "emdb");
+    STAT_PRINT(args->handle, emdb_upd, "emdb");
+    mdb_env_close(args->handle->env);
+    STAT_RESET(args->handle, emdb_get);
+    STAT_RESET(args->handle, emdb_put);
+    STAT_RESET(args->handle, emdb_del);
+    STAT_RESET(args->handle, emdb_upd);
+    /* Marks the handle as closed for later pre-block checks. */
+    args->handle->env = NULL;
+    ASYNC_NIF_REPLY(ATOM_OK);
+    return;
+  },
+  { // post
+
+    /* Balances the enif_keep_resource() in the pre block. */
+    enif_release_resource((void*)args->handle);
+  });
+
+
+/**
+ * Store a value indexed by key.
+ *
+ * The value is written with MDB_NOOVERWRITE; an existing key yields
+ * {error, exists}.
+ *
+ * argv[0]    reference to the MDB handle resource
+ * argv[1]    key as an Erlang binary
+ * argv[2]    value as an Erlang binary
+ */
+ASYNC_NIF_DECL(
+  emdb_put,
+  { // struct
+
+    struct emdb *handle;
+    ERL_NIF_TERM key;
+    ERL_NIF_TERM val;
+  },
+  { // pre
+
+    if (!(argc == 3 &&
+          enif_get_resource(env, argv[0], emdb_RESOURCE, (void**)&args->handle) &&
+          enif_is_binary(env, argv[1]) &&
+          enif_is_binary(env, argv[2]) )) {
+      ASYNC_NIF_RETURN_BADARG();
+    }
+    if (!args->handle->env)
+      ASYNC_NIF_RETURN_BADARG();
+    STAT_TICK(args->handle, emdb_put);
+    enif_keep_resource((void*)args->handle);
+    /* Copy the terms into ASYNC_NIF_WORK_ENV for use by the work block. */
+    args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[1]);
+    args->val = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]);
+  },
+  { // work
+
+    ERL_NIF_TERM err;
+    ErlNifBinary key;
+    ErlNifBinary val;
+    MDB_val mkey;
+    MDB_val mdata;
+    MDB_txn * txn;
+    int ret;
+
+    if (!enif_inspect_iolist_as_binary(env, args->key, &key)) {
+      ASYNC_NIF_REPLY(enif_make_badarg(env));
+      return;
+    }
+    if (!enif_inspect_iolist_as_binary(env, args->val, &val)) {
+
ASYNC_NIF_REPLY(enif_make_badarg(env));
+      return;
+    }
+
+    mkey.mv_size = key.size;
+    mkey.mv_data = key.data;
+    mdata.mv_size = val.size;
+    mdata.mv_data = val.data;
+    CHECK(mdb_txn_begin(args->handle->env, NULL, 0, & txn), err2);
+
+    ret = mdb_put(txn, args->handle->dbi, &mkey, &mdata, MDB_NOOVERWRITE);
+    if (MDB_KEYEXIST == ret) {
+      /* The write txn must be ended here; returning with it open leaks
+         the txn and blocks later writers. */
+      mdb_txn_abort(txn);
+      ASYNC_NIF_REPLY(enif_make_tuple(env, 2, ATOM_ERROR, ATOM_EXISTS));
+      return;
+    }
+    if (ret != 0)
+      FAIL_ERR(ret, err1);
+
+    /* mdb_txn_commit() frees the txn even on failure: skip the abort. */
+    CHECK(mdb_txn_commit(txn), err2);
+    STAT_TOCK(args->handle, emdb_put);
+    ASYNC_NIF_REPLY(ATOM_OK);
+    return;
+
+  err1:
+    mdb_txn_abort(txn);
+  err2:
+    ASYNC_NIF_REPLY(err);
+    return;
+  },
+  { // post
+
+    enif_release_resource((void*)args->handle);
+  });
+
+
+/**
+ * Update an existing value indexed by key.
+ *
+ * The value is written with mdb_put() and flags 0.
+ *
+ * argv[0]    reference to the MDB handle resource
+ * argv[1]    key as an Erlang binary
+ * argv[2]    value as an Erlang binary
+ */
+ASYNC_NIF_DECL(
+  emdb_update,
+  { // struct
+
+    struct emdb *handle;
+    ERL_NIF_TERM key;
+    ERL_NIF_TERM val;
+  },
+  { // pre
+
+    if (!(argc == 3 &&
+          enif_get_resource(env, argv[0], emdb_RESOURCE, (void**)&args->handle) &&
+          enif_is_binary(env, argv[1]) &&
+          enif_is_binary(env, argv[2]) )) {
+      ASYNC_NIF_RETURN_BADARG();
+    }
+    if (!args->handle->env)
+      ASYNC_NIF_RETURN_BADARG();
+    STAT_TICK(args->handle, emdb_upd);
+    enif_keep_resource((void*)args->handle);
+    args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[1]);
+    args->val = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[2]);
+  },
+  { // work
+
+    ERL_NIF_TERM err;
+    ErlNifBinary key;
+    ErlNifBinary val;
+    MDB_val mkey;
+    MDB_val mdata;
+    MDB_txn * txn;
+    int ret;
+
+    if (!enif_inspect_iolist_as_binary(env, args->key, &key)) {
+      ASYNC_NIF_REPLY(enif_make_badarg(env));
+      return;
+    }
+    if (!enif_inspect_iolist_as_binary(env, args->val, &val)) {
+      ASYNC_NIF_REPLY(enif_make_badarg(env));
+      return;
+    }
+
+    mkey.mv_size = key.size;
+    mkey.mv_data = key.data;
+    mdata.mv_size = val.size;
+    mdata.mv_data = val.data;
+
+
CHECK(mdb_txn_begin(args->handle->env, NULL, 0, & txn), err2);
+    CHECK(mdb_put(txn, args->handle->dbi, &mkey, &mdata, 0), err1);
+    /* mdb_txn_commit() frees the txn even on failure: skip the abort. */
+    CHECK(mdb_txn_commit(txn), err2);
+    STAT_TOCK(args->handle, emdb_upd);
+    ASYNC_NIF_REPLY(ATOM_OK);
+    return;
+
+  err1:
+    mdb_txn_abort(txn);
+  err2:
+    ASYNC_NIF_REPLY(err);
+    return;
+  },
+  { // post
+
+    enif_release_resource((void*)args->handle);
+  });
+
+
+/**
+ * Retrieve the value associated with the key.
+ *
+ * Replies {ok, Value}, not_found, or an {error, ...} term.
+ *
+ * argv[0]    reference to the MDB handle resource
+ * argv[1]    key as an Erlang binary
+ */
+ASYNC_NIF_DECL(
+  emdb_get,
+  { // struct
+
+    struct emdb *handle;
+    ERL_NIF_TERM key;
+  },
+  { // pre
+
+    if (!(argc == 2 &&
+          enif_get_resource(env, argv[0], emdb_RESOURCE, (void**)&args->handle) &&
+          enif_is_binary(env, argv[1]) )) {
+      ASYNC_NIF_RETURN_BADARG();
+    }
+    if (!args->handle->env)
+      ASYNC_NIF_RETURN_BADARG();
+    STAT_TICK(args->handle, emdb_get);
+    enif_keep_resource((void*)args->handle);
+    args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[1]);
+  },
+  { // work
+
+    ERL_NIF_TERM err;
+    ErlNifBinary key;
+    ERL_NIF_TERM val;
+    unsigned char *bin;
+    MDB_val mkey;
+    MDB_val mdata;
+    MDB_txn * txn;
+    int ret;
+
+    if (!enif_inspect_iolist_as_binary(env, args->key, &key)) {
+      ASYNC_NIF_REPLY(enif_make_badarg(env));
+      return;
+    }
+
+    mkey.mv_size = key.size;
+    mkey.mv_data = key.data;
+
+    /* A lookup only needs a read-only txn; it does not take the
+       writer lock. */
+    CHECK(mdb_txn_begin(args->handle->env, NULL, MDB_RDONLY, &txn), err);
+
+    ret = mdb_get(txn, args->handle->dbi, &mkey, &mdata);
+    if (MDB_NOTFOUND == ret) {
+      mdb_txn_abort(txn);
+      ASYNC_NIF_REPLY(ATOM_NOT_FOUND);
+      return;
+    }
+    if (ret != 0) {
+      mdb_txn_abort(txn);
+      FAIL_ERR(ret, err);
+    }
+
+    bin = enif_make_new_binary(env, mdata.mv_size, &val);
+    if (!bin) {
+      mdb_txn_abort(txn);
+      FAIL_ERR(ENOMEM, err);
+    }
+    /* mdata points into the database map and is only valid while the
+       txn is open, so copy it out before ending the txn. */
+    memcpy(bin, mdata.mv_data, mdata.mv_size);
+    mdb_txn_abort(txn);
+
+    STAT_TOCK(args->handle, emdb_get);
+    ASYNC_NIF_REPLY(enif_make_tuple(env, 2, ATOM_OK, val));
+    return;
+
+  err:
+    ASYNC_NIF_REPLY(err);
+    return;
+  },
+  { // post
+
+    enif_release_resource((void*)args->handle);
+  });
+
+
+/**
+ * Delete the value
associated with the key.
+ *
+ * Replies ok, not_found, or an {error, ...} term.
+ *
+ * argv[0]    reference to the MDB handle resource
+ * argv[1]    key as an Erlang binary
+ */
+ASYNC_NIF_DECL(
+  emdb_del,
+  { // struct
+
+    struct emdb *handle;
+    ERL_NIF_TERM key;
+  },
+  { // pre
+
+    if (!(argc == 2 &&
+          enif_get_resource(env, argv[0], emdb_RESOURCE, (void**)&args->handle) &&
+          enif_is_binary(env, argv[1]) )) {
+      ASYNC_NIF_RETURN_BADARG();
+    }
+    if (!args->handle->env)
+      ASYNC_NIF_RETURN_BADARG();
+    STAT_TICK(args->handle, emdb_del);
+    enif_keep_resource((void*)args->handle);
+    args->key = enif_make_copy(ASYNC_NIF_WORK_ENV, argv[1]);
+  },
+  { // work
+
+    ERL_NIF_TERM err;
+    ErlNifBinary key;
+    MDB_val mkey;
+    MDB_txn * txn;
+    int ret;
+
+    if (!enif_inspect_iolist_as_binary(env, args->key, &key)) {
+      ASYNC_NIF_REPLY(enif_make_badarg(env));
+      return;
+    }
+
+    mkey.mv_size = key.size;
+    mkey.mv_data = key.data;
+
+    CHECK(mdb_txn_begin(args->handle->env, NULL, 0, & txn), err);
+    ret = mdb_del(txn, args->handle->dbi, &mkey, NULL);
+
+    if(MDB_NOTFOUND == ret) {
+      mdb_txn_abort(txn);
+      ASYNC_NIF_REPLY(ATOM_NOT_FOUND);
+      return;
+    }
+    if (ret != 0) {
+      /* Any other failure used to be ignored: the txn was committed
+         and `ok` returned.  Abort and report it instead. */
+      mdb_txn_abort(txn);
+      FAIL_ERR(ret, err);
+    }
+
+    /* mdb_txn_commit() frees the txn even on failure: no abort needed. */
+    CHECK(mdb_txn_commit(txn), err);
+    STAT_TOCK(args->handle, emdb_del);
+    ASYNC_NIF_REPLY(ATOM_OK);
+    return;
+
+  err:
+    ASYNC_NIF_REPLY(err);
+    return;
+  },
+  { // post
+
+    enif_release_resource((void*)args->handle);
+  });
+
+
+/**
+ * Drop a MDB database.
+ *
+ * Calls mdb_drop() with del = 0 on the handle's database inside its own
+ * write transaction and replies `ok` or an {error, ...} term.
+ *
+ * argv[0]    reference to the MDB handle resource
+ */
+ASYNC_NIF_DECL(
+  emdb_drop,
+  { // struct
+
+    struct emdb *handle;
+  },
+  { // pre
+
+    if (!(argc == 1 &&
+          enif_get_resource(env, argv[0], emdb_RESOURCE, (void**)&args->handle))) {
+      ASYNC_NIF_RETURN_BADARG();
+    }
+    if (!args->handle->env)
+      ASYNC_NIF_RETURN_BADARG();
+    enif_keep_resource((void*)args->handle);
+  },
+  { // work
+
+    ERL_NIF_TERM err;
+    MDB_txn * txn;
+    int ret;
+
+    CHECK(mdb_txn_begin(args->handle->env, NULL, 0, & txn), err2);
+    CHECK(mdb_drop(txn, args->handle->dbi, 0), err1);
+    /* mdb_txn_commit() frees the txn even on failure: skip the abort. */
+    CHECK(mdb_txn_commit(txn), err2);
+    ASYNC_NIF_REPLY(ATOM_OK);
+    return;
+
+  err1:
+    mdb_txn_abort(txn);
+
+  err2:
+    ASYNC_NIF_REPLY(err);
+    return;
+  },
+  { // post
+
+    enif_release_resource((void*)args->handle);
+  });
+
+
+
+static int emdb_load(ErlNifEnv* env, void** priv_data, ERL_NIF_TERM load_info)
+{
+  __UNUSED(load_info);
+
+  ErlNifResourceFlags flags = ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER;
+
+  struct emdb_priv_data *priv = enif_alloc(sizeof(struct emdb_priv_data));
+  if (!priv)
+    return ENOMEM;
+  memset(priv, 0, sizeof(struct emdb_priv_data));
+
+  /* Note: !!! the first element of our priv_data struct *must* be the
+     pointer to the async_nif's private data which we set here.
*/
+  ASYNC_NIF_LOAD(emdb, priv->async_nif_priv);
+  /* `priv` itself was already checked above; what can be missing here is
+     the async_nif state.  NOTE(review): assumes ASYNC_NIF_LOAD leaves its
+     second argument NULL on failure - confirm in async_nif.h. */
+  if (!priv->async_nif_priv) {
+    enif_free(priv);
+    return ENOMEM;
+  }
+  *priv_data = priv;
+
+  ATOM_ERROR = enif_make_atom(env, "error");
+  ATOM_OK = enif_make_atom(env, "ok");
+  ATOM_NOT_FOUND = enif_make_atom(env, "not_found");
+  ATOM_EXISTS = enif_make_atom(env, "exists");
+
+  ATOM_KEYEXIST = enif_make_atom(env, "key_exist");
+  ATOM_NOTFOUND = enif_make_atom(env, "notfound");
+  /* Was never initialised although __strerror_term() returns it. */
+  ATOM_PAGE_NOTFOUND = enif_make_atom(env, "page_notfound");
+  ATOM_CORRUPTED = enif_make_atom(env, "corrupted");
+  ATOM_PANIC = enif_make_atom(env, "panic");
+  ATOM_VERSION_MISMATCH = enif_make_atom(env, "version_mismatch");
+  ATOM_MAP_FULL = enif_make_atom(env, "map_full");
+  ATOM_DBS_FULL = enif_make_atom(env, "dbs_full");
+  ATOM_READERS_FULL = enif_make_atom(env, "readers_full");
+  ATOM_TLS_FULL = enif_make_atom(env, "tls_full");
+  ATOM_TXN_FULL = enif_make_atom(env, "txn_full");
+  ATOM_CURSOR_FULL = enif_make_atom(env, "cursor_full");
+  ATOM_PAGE_FULL = enif_make_atom(env, "page_full");
+  ATOM_MAP_RESIZED = enif_make_atom(env, "map_resized");
+  ATOM_INCOMPATIBLE = enif_make_atom(env, "incompatible");
+  ATOM_BAD_RSLOT = enif_make_atom(env, "bad_rslot");
+
+  /* No destructor is registered for the resource type. */
+  emdb_RESOURCE = enif_open_resource_type(env, NULL, "emdb_resource",
+                                          NULL, flags, NULL);
+  return (0);
+}
+
+/* Reload callback: currently a no-op that reports success. */
+static int emdb_reload(ErlNifEnv* env, void** priv_data, ERL_NIF_TERM info)
+{
+  __UNUSED(env);
+  __UNUSED(priv_data);
+  __UNUSED(info);
+  return (0); // TODO:
+}
+
+
+/* Upgrade callback: only forwards to ASYNC_NIF_UPGRADE. */
+static int emdb_upgrade(ErlNifEnv* env, void** priv_data, void** old_priv, ERL_NIF_TERM load_info)
+{
+  __UNUSED(env);
+  __UNUSED(priv_data);
+  __UNUSED(old_priv);
+  __UNUSED(load_info);
+  ASYNC_NIF_UPGRADE(emdb, env);
+  return (0); // TODO:
+}
+
+
+/* Unload callback: tears down async_nif, then frees the private data
+   allocated in emdb_load(). */
+static void emdb_unload(ErlNifEnv* env, void* priv_data)
+{
+  struct emdb_priv_data *priv = (struct emdb_priv_data *)priv_data;
+  ASYNC_NIF_UNLOAD(emdb, env, priv->async_nif_priv);
+  enif_free(priv);
+  return;
+}
+
+/* NOTE(review): each arity is one more than the argc checked in the
+   matching pre block - presumably the async request reference added by
+   async_nif; confirm in async_nif.h / src/async_nif.hrl. */
+static ErlNifFunc nif_funcs [] = {
+  {"open", 4, emdb_open},
+  {"close", 2, emdb_close},
+  {"put", 4, emdb_put},
+  {"get", 3, emdb_get},
+  {"del", 3,
emdb_del}, + {"update", 4, emdb_update}, + {"drop", 2, emdb_drop} +}; + +/* driver entry point */ +ERL_NIF_INIT(emdb, + nif_funcs, + & emdb_load, + & emdb_reload, + & emdb_upgrade, + & emdb_unload) diff --git a/c_src/emdb_drv.c b/c_src/emdb_drv.c deleted file mode 100644 index 1610cce..0000000 --- a/c_src/emdb_drv.c +++ /dev/null @@ -1,479 +0,0 @@ -/* ------------------------------------------------------------------------- - * This file is part of EMDB - Erlang MDB API - * - * Copyright (c) 2012 by Aleph Archives. All rights reserved. - * - * ------------------------------------------------------------------------- - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * -------------------------------------------------------------------------*/ - -/* - * C headers - */ - -#include /* for MAXPATHLEN constant */ -#include /* for Erlang NIF interface */ -#include "uthash.h" /* for uthash */ -#include "mdb.h" /* for MDB interface */ - - - -#define FREE(p) (NULL == (p) ? 
0 : (free(p), p = NULL)) - -#define FAIL_FAST(Error, Goto) \ - do{ \ - err = Error; \ - goto Goto; \ -}while(0) - - -struct emdb_map_t { - MDB_env * env; - MDB_dbi dbi; - - UT_hash_handle hh; -}; - - -static ERL_NIF_TERM atom_ok; -static ERL_NIF_TERM atom_none; - - -static struct emdb_map_t * emdb_map = NULL; - - -/* emdb ret */ -#define EMDB_RET_KEY_EXIST "key_exist" - -/* emdb errors */ -#define EMDB_MALLOC_ERR "error_malloc" -#define EMDB_MAKE_BINARY_ERR "error_make_binary" -#define EMDB_CREATE_ERR "error_create" -#define EMDB_MAPSIZE_ERR "error_mapsize" -#define EMDB_OPEN_ERR "error_open" -#define EMDB_TXN_BEGIN_ERR "error_txn_begin" -#define EMDB_TXN_COMMIT_ERR "error_txn_commit" -#define EMDB_OPEN_DBI_ERR "error_open_dbi" -#define EMDB_INVALID_HANDLE_ERR "error_invalid_handle" -#define EMDB_PUT_ERR "error_put" -#define EMDB_UPDATE_ERR "error_update" -#define EMDB_KEY_NOT_FOUND "error_key_not_found" -#define EMDB_DROP_ERR "error_drop" - - - -/* - * Error handling callbacks - */ - -static void emdb_free (struct emdb_map_t * emdb_obj) -{ - FREE(emdb_obj); -} - - -/* - * Driver callbacks - */ - -static ERL_NIF_TERM emdb_open_nif (ErlNifEnv * env, - int argc, const ERL_NIF_TERM argv[]) -{ - char dirname [MAXPATHLEN]; - struct emdb_map_t * node; - MDB_txn * txn; - char * err; - ErlNifUInt64 mapsize; - ErlNifUInt64 envflags; - - if (enif_get_string(env, argv[0], dirname, MAXPATHLEN, ERL_NIF_LATIN1) <= 0) - return enif_make_badarg(env); - - if(! (node = calloc(1, sizeof(struct emdb_map_t)))) - FAIL_FAST(EMDB_MALLOC_ERR, err3); - - if (mdb_env_create(& (node -> env))) - FAIL_FAST(EMDB_CREATE_ERR, err2); - - if (! enif_get_uint64(env, argv[1], & mapsize)) - return enif_make_badarg(env); - - if (mdb_env_set_mapsize(node -> env, mapsize)) - FAIL_FAST(EMDB_MAPSIZE_ERR, err2); - - if (! 
enif_get_uint64(env, argv[2], & envflags)) - return enif_make_badarg(env); - - if (mdb_env_open(node -> env, dirname, envflags, 0664)) - FAIL_FAST(EMDB_OPEN_ERR, err2); - - if (mdb_txn_begin(node -> env, NULL, 0, & txn)) - FAIL_FAST(EMDB_TXN_BEGIN_ERR, err2); - - if (mdb_open(txn, NULL, 0, & (node -> dbi))) - FAIL_FAST(EMDB_OPEN_DBI_ERR, err1); - - if (mdb_txn_commit(txn)) - FAIL_FAST(EMDB_TXN_COMMIT_ERR, err1); - - HASH_ADD_PTR(emdb_map, env, node); - - return enif_make_tuple(env, 2, - atom_ok, - enif_make_ulong(env, (unsigned long) node -> env)); - - err1: - mdb_txn_abort(txn); - err2: - mdb_env_close(node -> env); - err3: - emdb_free(node); - - return enif_make_atom(env, err); -} - -static ERL_NIF_TERM emdb_close_nif (ErlNifEnv * env, - int argc, const ERL_NIF_TERM argv[]) -{ - MDB_env * handle; - struct emdb_map_t * node; - unsigned long addr; - - if (! enif_get_ulong(env, argv[0], & addr)) - return enif_make_badarg(env); - - handle = (MDB_env *) addr; - - HASH_FIND_PTR(emdb_map, & handle, node); - if (NULL == node) - return enif_make_atom(env, EMDB_INVALID_HANDLE_ERR); - - HASH_DEL(emdb_map, node); - - mdb_env_close(handle); - emdb_free(node); - - return atom_ok; - } - - -static ERL_NIF_TERM emdb_put_nif (ErlNifEnv * env, - int argc, const ERL_NIF_TERM argv[]) -{ - ErlNifBinary key; - ErlNifBinary val; - - MDB_val mkey; - MDB_val mdata; - - MDB_env * handle; - MDB_txn * txn; - - struct emdb_map_t * node; - unsigned long addr; - char * err; - int ret; - - if (! enif_get_ulong(env, argv[0], & addr)) - return enif_make_badarg(env); - - handle = (MDB_env *) addr; - - HASH_FIND_PTR(emdb_map, & handle, node); - if (NULL == node) - return enif_make_atom(env, EMDB_INVALID_HANDLE_ERR); - - if (! enif_inspect_iolist_as_binary(env, argv[1], &key)) - return enif_make_badarg(env); - - if (! 
enif_inspect_iolist_as_binary(env, argv[2], &val)) - return enif_make_badarg(env); - - if (mdb_txn_begin(handle, NULL, 0, & txn)) - FAIL_FAST(EMDB_TXN_BEGIN_ERR, err2); - - mkey.mv_size = key.size; - mkey.mv_data = key.data; - mdata.mv_size = val.size; - mdata.mv_data = val.data; - - ret = mdb_put(txn, node -> dbi, & mkey, & mdata, MDB_NOOVERWRITE); - if (MDB_KEYEXIST == ret) - FAIL_FAST(EMDB_RET_KEY_EXIST, err1); - if (ret) - FAIL_FAST(EMDB_PUT_ERR, err1); - - if (mdb_txn_commit(txn)) - FAIL_FAST(EMDB_TXN_COMMIT_ERR, err1); - - return atom_ok; - - err1: - mdb_txn_abort(txn); - err2: - return enif_make_atom(env, err); -} - - -static ERL_NIF_TERM emdb_get_nif (ErlNifEnv * env, - int argc, const ERL_NIF_TERM argv[]) -{ - ErlNifBinary key; - ErlNifBinary val = {0}; - ERL_NIF_TERM term; - - MDB_val mkey; - MDB_val mdata; - - MDB_env * handle; - MDB_txn * txn; - - struct emdb_map_t * node; - char * err; - unsigned long addr; - - if (! enif_get_ulong(env, argv[0], & addr)) - return enif_make_badarg(env); - - handle = (MDB_env *) addr; - - HASH_FIND_PTR(emdb_map, & handle, node); - if (NULL == node) - return enif_make_atom(env, EMDB_INVALID_HANDLE_ERR); - - if (! enif_inspect_iolist_as_binary(env, argv[1], &key)) - return enif_make_badarg(env); - - mkey.mv_size = key.size; - mkey.mv_data = key.data; - - if (mdb_txn_begin(handle, NULL, 0, & txn)) - FAIL_FAST(EMDB_TXN_BEGIN_ERR, err); - - if(mdb_get(txn, node -> dbi, & mkey, & mdata)) - { - mdb_txn_abort(txn); - return atom_none; - } - - val.size = mdata.mv_size; - val.data = mdata.mv_data; - - term = enif_make_binary(env, &val); - mdb_txn_abort(txn); - - if (! 
term) - FAIL_FAST(EMDB_MAKE_BINARY_ERR, err); - - return enif_make_tuple(env, 2, - atom_ok, - term); - - err: - return enif_make_atom(env, err); -} - - -static ERL_NIF_TERM emdb_del_nif (ErlNifEnv * env, - int argc, const ERL_NIF_TERM argv[]) -{ - ErlNifBinary key; - - MDB_val mkey; - - MDB_env * handle; - MDB_txn * txn; - - struct emdb_map_t * node; - char * err; - unsigned long addr; - int ret; - - if (! enif_get_ulong(env, argv[0], & addr)) - return enif_make_badarg(env); - - handle = (MDB_env *) addr; - - HASH_FIND_PTR(emdb_map, & handle, node); - if (NULL == node) - return enif_make_atom(env, EMDB_INVALID_HANDLE_ERR); - - if (! enif_inspect_iolist_as_binary(env, argv[1], &key)) - return enif_make_badarg(env); - - mkey.mv_size = key.size; - mkey.mv_data = key.data; - - if (mdb_txn_begin(handle, NULL, 0, & txn)) - FAIL_FAST(EMDB_TXN_BEGIN_ERR, err); - - ret = mdb_del(txn, node -> dbi, & mkey, NULL); - - if (mdb_txn_commit(txn)) - FAIL_FAST(EMDB_TXN_COMMIT_ERR, err); - - if(ret) - return atom_none; - - return atom_ok; - - err: - return enif_make_atom(env, err); -} - - -static ERL_NIF_TERM emdb_update_nif (ErlNifEnv * env, - int argc, const ERL_NIF_TERM argv[]) -{ - ErlNifBinary key; - ErlNifBinary val; - - MDB_val mkey; - MDB_val mdata; - - MDB_env * handle; - MDB_txn * txn; - - struct emdb_map_t * node; - unsigned long addr; - char * err; - - if (! enif_get_ulong(env, argv[0], & addr)) - return enif_make_badarg(env); - - handle = (MDB_env *) addr; - - HASH_FIND_PTR(emdb_map, & handle, node); - if (NULL == node) - return enif_make_atom(env, EMDB_INVALID_HANDLE_ERR); - - if (! enif_inspect_iolist_as_binary(env, argv[1], &key)) - return enif_make_badarg(env); - - if (! 
enif_inspect_iolist_as_binary(env, argv[2], &val)) - return enif_make_badarg(env); - - if (mdb_txn_begin(handle, NULL, 0, & txn)) - FAIL_FAST(EMDB_TXN_BEGIN_ERR, err2); - - mkey.mv_size = key.size; - mkey.mv_data = key.data; - mdata.mv_size = val.size; - mdata.mv_data = val.data; - - if (mdb_put(txn, node -> dbi, & mkey, & mdata, 0)) - FAIL_FAST(EMDB_UPDATE_ERR, err1); - - if (mdb_txn_commit(txn)) - FAIL_FAST(EMDB_TXN_COMMIT_ERR, err1); - - return atom_ok; - - err1: - mdb_txn_abort(txn); - err2: - return enif_make_atom(env, err); -} - - -static ERL_NIF_TERM emdb_drop_nif (ErlNifEnv * env, - int argc, const ERL_NIF_TERM argv[]) -{ - MDB_env * handle; - MDB_txn * txn; - struct emdb_map_t * node; - unsigned long addr; - char * err; - int ret; - - if (! enif_get_ulong(env, argv[0], & addr)) - return enif_make_badarg(env); - - handle = (MDB_env *) addr; - - HASH_FIND_PTR(emdb_map, & handle, node); - if (NULL == node) - return enif_make_atom(env, EMDB_INVALID_HANDLE_ERR); - - if (mdb_txn_begin(handle, NULL, 0, & txn)) - FAIL_FAST(EMDB_TXN_BEGIN_ERR, err2); - - ret = mdb_drop(txn, node -> dbi, 0); - if (ret) - FAIL_FAST(EMDB_DROP_ERR, err1); - - if (mdb_txn_commit(txn)) - FAIL_FAST(EMDB_TXN_COMMIT_ERR, err1); - - return atom_ok; - - err1: - mdb_txn_abort(txn); - - err2: - return enif_make_atom(env, err); - } - - -static int emdb_load(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info) - { - atom_ok = enif_make_atom(env, "ok"); - atom_none = enif_make_atom(env, "none"); - - return (0); - } - -static int emdb_reload(ErlNifEnv* env, void** priv, ERL_NIF_TERM info) -{ - return (0); -} - - -static int emdb_upgrade(ErlNifEnv* env, void** priv, void** old_priv, ERL_NIF_TERM load_info) -{ - return (0); -} - - -static void emdb_unload(ErlNifEnv* env, void* priv) -{ - return; -} - - - -static ErlNifFunc nif_funcs [] = { - {"open", 3, emdb_open_nif}, - {"close", 1, emdb_close_nif}, - {"put", 3, emdb_put_nif}, - {"get", 2, emdb_get_nif}, - {"del", 2, emdb_del_nif}, - {"update", 3, 
emdb_update_nif}, - {"drop", 1, emdb_drop_nif} -}; - -/* driver entry point */ -ERL_NIF_INIT(emdb_drv, - nif_funcs, - & emdb_load, - & emdb_reload, - & emdb_upgrade, - & emdb_unload) diff --git a/c_src/fifo_q.h b/c_src/fifo_q.h new file mode 100644 index 0000000..f37bf67 --- /dev/null +++ b/c_src/fifo_q.h @@ -0,0 +1,93 @@ +/* + * fifo_q: a macro-based implementation of a FIFO Queue + * + * Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved. + * Author: Gregory Burd + * + * This file is provided to you under the Apache License, + * Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain + * a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+#ifndef __FIFO_Q_H__
+#define __FIFO_Q_H__
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * FIFO_QUEUE_TYPE(name) names the pointer type of a queue declared with
+ * DECL_FIFO_QUEUE(name, type).
+ *
+ * DECL_FIFO_QUEUE(name, type) defines struct fifo_q__<name>, a circular
+ * buffer of `type *` items, and its functions:
+ *   h, t, s      put index, get index and number of slots
+ *   _new(n)      allocates n + 1 zeroed slots with enif_alloc(); returns 0
+ *                when the allocation fails
+ *   _free(q)     zeroes the queue memory and releases it with enif_free()
+ *   _put(q, n)   stores n at h and advances h modulo s; it does not check
+ *                whether the queue is full
+ *   _get(q)      returns the item at t, clears that slot and advances t
+ *                modulo s; it does not check whether the queue is empty
+ *   _size(q)     (h - t + s) % s
+ *   _capacity(q) s - 1
+ *   _empty(q)    t == h
+ *   _full(q)     (h + 1) % s == t
+ */
+#define FIFO_QUEUE_TYPE(name)             \
+  struct fifo_q__ ## name *
+#define DECL_FIFO_QUEUE(name, type)       \
+  struct fifo_q__ ## name {               \
+    unsigned int h, t, s;                 \
+    type *items[];                        \
+  };                                      \
+  static struct fifo_q__ ## name *fifo_q_ ## name ## _new(unsigned int n) { \
+    int sz = sizeof(struct fifo_q__ ## name) + ((n+1) * sizeof(type *));\
+    struct fifo_q__ ## name *q = enif_alloc(sz);                        \
+    if (!q)                               \
+      return 0;                           \
+    memset(q, 0, sz);                     \
+    q->s = n + 1;                         \
+    return q;                             \
+  }                                       \
+  static inline void fifo_q_ ## name ## _free(struct fifo_q__ ## name *q) { \
+    memset(q, 0, sizeof(struct fifo_q__ ## name) + (q->s * sizeof(type *))); \
+    enif_free(q);                         \
+  }                                       \
+  static inline type *fifo_q_ ## name ## _put(struct fifo_q__ ## name *q, type *n) { \
+    q->items[q->h] = n;                   \
+    q->h = (q->h + 1) % q->s;             \
+    return n;                             \
+  }                                       \
+  static inline type *fifo_q_ ## name ## _get(struct fifo_q__ ## name *q) { \
+    type *n = q->items[q->t];             \
+    q->items[q->t] = 0;                   \
+    q->t = (q->t + 1) % q->s;             \
+    return n;                             \
+  }                                       \
+  static inline unsigned int fifo_q_ ## name ## _size(struct fifo_q__ ## name *q) { \
+    return (q->h - q->t + q->s) % q->s;   \
+  }                                       \
+  static inline unsigned int fifo_q_ ## name ## _capacity(struct fifo_q__ ## name *q) { \
+    return q->s - 1;                      \
+  }                                       \
+  static inline int fifo_q_ ## name ## _empty(struct fifo_q__ ## name *q) { \
+    return (q->t == q->h);                \
+  }                                       \
+  static inline int fifo_q_ ## name ## _full(struct fifo_q__ ## name *q) { \
+    return ((q->h + 1) % q->s) == q->t;   \
+  }
+
+/* Wrappers that select the generated function for queue `name`. */
+#define fifo_q_new(name, size) fifo_q_ ## name ## _new(size)
+#define fifo_q_free(name, queue) fifo_q_ ## name ## _free(queue)
+#define fifo_q_get(name, queue) fifo_q_ ## name ## _get(queue)
+#define fifo_q_put(name, queue, item) fifo_q_ ## name ## _put(queue, item)
+#define fifo_q_size(name, queue) fifo_q_ ## name ## _size(queue)
+#define fifo_q_capacity(name, queue) fifo_q_ ## name ## _capacity(queue)
+#define
fifo_q_empty(name, queue) fifo_q_ ## name ## _empty(queue) +#define fifo_q_full(name, queue) fifo_q_ ## name ## _full(queue) +#define fifo_q_foreach(name, queue, item, task) do { \ + while(!fifo_q_ ## name ## _empty(queue)) { \ + item = fifo_q_ ## name ## _get(queue); \ + do task while(0); \ + } \ + } while(0); + + +#if defined(__cplusplus) +} +#endif + +#endif // __FIFO_Q_H__ diff --git a/c_src/khash.h b/c_src/khash.h new file mode 100644 index 0000000..ab157b1 --- /dev/null +++ b/c_src/khash.h @@ -0,0 +1,643 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* + An example: + + #include "khash.h" + KHASH_MAP_INIT_INT(32, char) + int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; + } +*/ + +/* + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor + */ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. 
+*/ + +#define AC_VERSION_KHASH_H "0.2.6" + +#include +#include +#include + +/* compiler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline +#endif + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +#ifdef KHASH_LINEAR +#define __ac_inc(k, m) 1 +#else +#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) +#endif + +#define __ac_fsize(m) ((m) < 16? 
1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef kcalloc +#define kcalloc(N,Z) calloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) malloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) realloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + +static const double __ac_HASH_UPPER = 0.77; + +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + kfree((void *)h->keys); kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t inc, k, i, last, mask; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ + while (!__ac_isempty(h->flags, i) 
&& (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + inc) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) return -1; \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) return -1; \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t inc, k, i; \ + k = __hash_func(key); \ + i = k & new_mask; \ + inc = __ac_inc(k, new_mask); \ + while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the 
existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ + inc = __ac_inc(k, mask); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + inc) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = 
site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] +*/ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function +*/ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] +*/ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function +*/ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract Pointer hash function + @param key The integer void * + @return The hash value [khint_t] +*/ +#define kh_ptr_hash_func(key) (khint32_t)(key) +/*! 
@function + @abstract Pointer comparison function +*/ +#define kh_ptr_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit pointer hash function + @param key The integer void * + @return The hash value [khint_t] +*/ +#define kh_ptr64_hash_func(key) (khint32_t)(((khint64_t)key)>>33^((khint64_t)key)^((khint64_t)key)<<11) +/*! @function + @abstract 64-bit pointer comparison function +*/ +#define kh_ptr64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value +*/ +static kh_inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] +*/ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function +*/ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] +*/ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] +*/ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] +*/ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! 
@function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] +*/ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] +*/ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] +*/ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] +*/ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] +*/ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] +*/ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! 
@function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] +*/ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. +*/ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() +*/ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] +*/ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] +*/ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] +*/ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] +*/ +#define kh_n_buckets(h) ((h)->n_buckets) + +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute +*/ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! 
@function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute +*/ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/* More convenient interfaces */ + +/*! @function + @abstract Instantiate a hash map containing (void *) keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] +*/ +#ifdef __x86_64__ +#define KHASH_MAP_INIT_PTR(name, khval_t) \ + KHASH_INIT(name, void*, khval_t, 1, kh_ptr64_hash_func, kh_ptr64_hash_equal) +#else +#define KHASH_MAP_INIT_PTR(name, khval_t) \ + KHASH_INIT(name, void*, khval_t, 1, kh_ptr_hash_func, kh_ptr_hash_equal) +#endif + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] +*/ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] +*/ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash set containing 64-bit integer keys + @param name Name of the hash table [symbol] +*/ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! 
@function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] +*/ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash set containing const char* keys + @param name Name of the hash table [symbol] +*/ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] +*/ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/c_src/mdb.c b/c_src/lmdb.c similarity index 81% rename from c_src/mdb.c rename to c_src/lmdb.c index 42e3382..6b9c0b4 100644 --- a/c_src/mdb.c +++ b/c_src/lmdb.c @@ -5,7 +5,7 @@ * BerkeleyDB API, but much simplified. */ /* - * Copyright 2011-2012 Howard Chu, Symas Corp. + * Copyright 2011-2013 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,6 +32,7 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ +#define _GNU_SOURCE 1 #include <sys/types.h> #include <sys/stat.h> #include <sys/param.h> @@ -58,9 +59,11 @@ #include <unistd.h> #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) +#include <netinet/in.h> #include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */ #endif + #if defined(__APPLE__) || defined (BSD) # define MDB_USE_POSIX_SEM 1 # define MDB_FDATASYNC fsync @@ -116,7 +119,7 @@ #define MISALIGNED_OK 1 #endif -#include "mdb.h" +#include "lmdb.h" #include "midl.h" #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) @@ -140,10 +143,11 @@ #define pthread_mutex_t HANDLE #define pthread_key_t DWORD #define pthread_self() GetCurrentThreadId() -#define pthread_key_create(x,y) (*(x) = TlsAlloc()) +#define pthread_key_create(x,y) \ + ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0) #define pthread_key_delete(x) TlsFree(x) #define pthread_getspecific(x) TlsGetValue(x) -#define pthread_setspecific(x,y) TlsSetValue(x,y) +#define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode()) #define pthread_mutex_unlock(x) ReleaseMutex(x) #define pthread_mutex_lock(x) WaitForSingleObject(x, INFINITE) #define LOCK_MUTEX_R(env) pthread_mutex_lock((env)->me_rmutex) @@ -284,6 +288,8 @@ typedef MDB_ID txnid_t; #endif #if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__)) +# undef MDB_DEBUG +# define MDB_DEBUG 0 # define DPRINTF (void) /* Vararg macros may be unsupported */ #elif MDB_DEBUG static int mdb_debug; @@ -295,6 +301,7 @@ static txnid_t mdb_debug_start; fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__))) #else # define DPRINTF(fmt, ...) ((void) 0) +# define MDB_DEBUG_SKIP #endif /** Print a debug string. * The string is printed literally, with no format processing. @@ -339,26 +346,38 @@ static txnid_t mdb_debug_start; /** The version number for a database's file format. */ #define MDB_VERSION 1 - /** The maximum size of a key in the database. 
- * While data items have essentially unbounded size, we require that - * keys all fit onto a regular page. 
This limit could be raised a bit - * further if needed; to something just under #MDB_PAGESIZE / #MDB_MINKEYS. + /** @brief The maximum size of a key in the database. + * + * We require that keys all fit onto a regular page. This limit + * could be raised a bit further if needed; to something just + * under #MDB_PAGESIZE / #MDB_MINKEYS. + * + * Note that data items in an #MDB_DUPSORT database are actually keys + * of a subDB, so they're also limited to this size. */ -#define MAXKEYSIZE 511 +#ifndef MDB_MAXKEYSIZE +#define MDB_MAXKEYSIZE 511 +#endif + + /** @brief The maximum size of a data item. + * + * We only store a 32 bit value for node sizes. + */ +#define MAXDATASIZE 0xffffffffUL #if MDB_DEBUG /** A key buffer. * @ingroup debug * This is used for printing a hex dump of a key's contents. */ -#define DKBUF char kbuf[(MAXKEYSIZE*2+1)] +#define DKBUF char kbuf[(MDB_MAXKEYSIZE*2+1)] /** Display a key in hex. * @ingroup debug * Invoke a function to display a key in hex. */ #define DKEY(x) mdb_dkey(x, kbuf) #else -#define DKBUF typedef int dummy_kbuf /* so we can put ';' after */ +#define DKBUF #define DKEY(x) 0 #endif @@ -367,7 +386,7 @@ static txnid_t mdb_debug_start; */ #define P_INVALID (~(pgno_t)0) - /** Test if a flag \b f is set in a flag word \b w. */ + /** Test if the flags \b f are set in a flag word \b w. */ #define F_ISSET(w, f) (((w) & (f)) == (f)) /** Used for offsets within a single page. @@ -389,6 +408,10 @@ typedef uint16_t indx_t; * slot's address is saved in thread-specific data so that subsequent read * transactions started by the same thread need no further locking to proceed. * + * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data. + * + * No reader table is used if the database is on a read-only filesystem. + * * Since the database uses multi-version concurrency control, readers don't * actually need any locking. 
This table is used to keep track of which * readers are using data from which old transactions, so that we'll know @@ -414,9 +437,6 @@ typedef uint16_t indx_t; * the longer we delay reclaiming old pages, the more likely it is that a * string of contiguous pages can be found after coalescing old pages from * many old transactions together. - * - * @todo We don't actually do such coalescing yet, we grab pages from one - * old transaction at a time. * @{ */ /** Number of slots in the reader table. @@ -714,13 +734,13 @@ typedef struct MDB_node { #define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks))) /** Set the \b node's key into \b key, if requested. */ -#define MDB_GET_KEY(node, key) { if ((key) != NULL) { \ +#define MDB_GET_KEY(node, key) { if ((void*)(key) != NULL) { \ (key)->mv_size = NODEKSZ(node); (key)->mv_data = NODEKEY(node); } } /** Information about a single database in the environment. */ typedef struct MDB_db { uint32_t md_pad; /**< also ksize for LEAF2 pages */ - uint16_t md_flags; /**< @ref mdb_open */ + uint16_t md_flags; /**< @ref mdb_dbi_open */ uint16_t md_depth; /**< depth of this tree */ pgno_t md_branch_pages; /**< number of internal pages */ pgno_t md_leaf_pages; /**< number of leaf pages */ @@ -729,6 +749,12 @@ typedef struct MDB_db { pgno_t md_root; /**< the root page of this tree */ } MDB_db; + /** mdb_dbi_open flags */ +#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ +#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) +#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ + MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) + /** Handle for the DB used to track free pages. */ #define FREE_DBI 0 /** Handle for the default DB. 
*/ @@ -795,8 +821,8 @@ struct MDB_txn { */ MDB_IDL mt_free_pgs; union { - MDB_ID2L dirty_list; /**< modified pages */ - MDB_reader *reader; /**< this thread's slot in the reader table */ + MDB_ID2L dirty_list; /**< for write txns: modified pages */ + MDB_reader *reader; /**< this thread's reader table slot or NULL */ } mt_u; /** Array of records for each DB known in the environment. */ MDB_dbx *mt_dbxs; @@ -808,8 +834,10 @@ struct MDB_txn { */ #define DB_DIRTY 0x01 /**< DB was written in this txn */ #define DB_STALE 0x02 /**< DB record is older than txnID */ +#define DB_NEW 0x04 /**< DB handle opened in this txn */ +#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ /** @} */ - /** Array of cursors for each DB */ + /** In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; /** Array of flags for each DB */ unsigned char *mt_dbflags; @@ -824,8 +852,11 @@ struct MDB_txn { */ #define MDB_TXN_RDONLY 0x01 /**< read-only transaction */ #define MDB_TXN_ERROR 0x02 /**< an error has occurred */ +#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ /** @} */ unsigned int mt_flags; /**< @ref mdb_txn */ + /** dirty_list maxsize - # of allocated pages allowed, including in parent txns */ + unsigned int mt_dirty_room; /** Tracks which of the two meta pages was used at the start * of this transaction. */ @@ -871,6 +902,7 @@ struct MDB_cursor { #define C_SHADOW 0x08 /**< Cursor is a dup from a parent txn */ #define C_ALLOCD 0x10 /**< Cursor was malloc'd */ #define C_SPLITTING 0x20 /**< Cursor is in page_split */ +#define C_UNTRACK 0x40 /**< Un-track cursor when closing */ /** @} */ unsigned int mc_flags; /**< @ref mdb_cursor */ MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ @@ -893,17 +925,12 @@ typedef struct MDB_xcursor { unsigned char mx_dbflag; } MDB_xcursor; - /** A set of pages freed by an earlier transaction. 
*/ -typedef struct MDB_oldpages { - /** Usually we only read one record from the FREEDB at a time, but - * in case we read more, this will chain them together. - */ - struct MDB_oldpages *mo_next; - /** The ID of the transaction in which these pages were freed. */ - txnid_t mo_txnid; - /** An #MDB_IDL of the pages */ - pgno_t mo_pages[1]; /* dynamic */ -} MDB_oldpages; + /** State of FreeDB old pages, stored in the MDB_env */ +typedef struct MDB_pgstate { + txnid_t mf_pglast; /**< ID of last old page record we used */ + pgno_t *mf_pghead; /**< old pages reclaimed from freelist */ + pgno_t *mf_pgfree; /**< memory to free when dropping me_pghead */ +} MDB_pgstate; /** The database environment. */ struct MDB_env { @@ -912,6 +939,10 @@ struct MDB_env { HANDLE me_mfd; /**< just for writing the meta pages */ /** Failed to update the meta page. Probably an I/O error. */ #define MDB_FATAL_ERROR 0x80000000U + /** Some fields are initialized. */ +#define MDB_ENV_ACTIVE 0x20000000U + /** me_txkey is set */ +#define MDB_ENV_TXKEY 0x10000000U uint32_t me_flags; /**< @ref mdb_env */ unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */ unsigned int me_maxreaders; /**< size of the reader table */ @@ -921,24 +952,28 @@ struct MDB_env { pid_t me_pid; /**< process ID of this env */ char *me_path; /**< path to the DB files */ char *me_map; /**< the memory map of the data file */ - MDB_txninfo *me_txns; /**< the memory map of the lock file */ + MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ MDB_meta *me_metas[2]; /**< pointers to the two meta pages */ MDB_txn *me_txn; /**< current write transaction */ size_t me_mapsize; /**< size of the data memory map */ off_t me_size; /**< current file size */ pgno_t me_maxpg; /**< me_mapsize / me_psize */ - txnid_t me_pgfirst; /**< ID of first old page record we used */ - txnid_t me_pglast; /**< ID of last old page record we used */ MDB_dbx *me_dbxs; /**< array of static DB info */ - uint16_t *me_dbflags; /**< 
array of DB flags */ - MDB_oldpages *me_pghead; /**< list of old page records */ - MDB_oldpages *me_pgfree; /**< list of page records to free */ + uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ pthread_key_t me_txkey; /**< thread-key for readers */ + MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ +# define me_pglast me_pgstate.mf_pglast +# define me_pghead me_pgstate.mf_pghead +# define me_pgfree me_pgstate.mf_pgfree MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ /** IDL of pages that became unused in a write txn */ MDB_IDL me_free_pgs; - /** ID2L of pages that were written during a write txn */ - MDB_ID2 me_dirty_list[MDB_IDL_UM_SIZE]; + /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ + MDB_ID2L me_dirty_list; + /** Max number of freelist items that can fit in a single overflow page */ + unsigned int me_maxfree_1pg; + /** Max size of a node on a page */ + unsigned int me_nodemax; #ifdef _WIN32 HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */ HANDLE me_wmutex; @@ -947,6 +982,13 @@ struct MDB_env { sem_t *me_wmutex; #endif }; + + /** Nested transaction */ +typedef struct MDB_ntxn { + MDB_txn mnt_txn; /* the transaction */ + MDB_pgstate mnt_pgstate; /* parent transaction's saved freestate */ +} MDB_ntxn; + /** max number of pages to commit in one writev() call */ #define MDB_COMMIT_PAGES 64 #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES @@ -974,6 +1016,9 @@ static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); static int mdb_env_pick_meta(const MDB_env *env); static int mdb_env_write_meta(MDB_txn *txn); +#if !(defined(_WIN32) || defined(MDB_USE_POSIX_SEM)) /* Drop unused excl arg */ +# define mdb_env_close0(env, excl) mdb_env_close1(env) +#endif static void mdb_env_close0(MDB_env *env, int excl); static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); @@ -987,7 
+1032,7 @@ static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); static size_t mdb_branch_size(MDB_env *env, MDB_val *key); static int mdb_rebalance(MDB_cursor *mc); -static int mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key); +static int mdb_update_key(MDB_cursor *mc, MDB_val *key); static void mdb_cursor_pop(MDB_cursor *mc); static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp); @@ -1041,9 +1086,12 @@ static char *const mdb_errstr[] = { "MDB_DBS_FULL: Environment maxdbs limit reached", "MDB_READERS_FULL: Environment maxreaders limit reached", "MDB_TLS_FULL: Thread-local storage keys full - too many environments open", - "MDB_TXN_FULL: Nested transaction has too many dirty pages - transaction too big", + "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big", "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", - "MDB_PAGE_FULL: Internal error - page has no more space" + "MDB_PAGE_FULL: Internal error - page has no more space", + "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", + "MDB_INCOMPATIBLE: Database flags changed or would change", + "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", }; char * @@ -1073,8 +1121,12 @@ mdb_dkey(MDB_val *key, char *buf) char *ptr = buf; unsigned char *c = key->mv_data; unsigned int i; - if (key->mv_size > MAXKEYSIZE) - return "MAXKEYSIZE"; + + if (!key) + return ""; + + if (key->mv_size > MDB_MAXKEYSIZE) + return "MDB_MAXKEYSIZE"; /* may want to make this a dynamic check: if the key is mostly * printable characters, print it as-is instead of converting to hex. */ @@ -1089,7 +1141,7 @@ mdb_dkey(MDB_val *key, char *buf) } /** Display all the keys in the page. 
*/ -static void +void mdb_page_list(MDB_page *mp) { MDB_node *node; @@ -1098,17 +1150,22 @@ mdb_page_list(MDB_page *mp) DKBUF; nkeys = NUMKEYS(mp); - fprintf(stderr, "numkeys %d\n", nkeys); + fprintf(stderr, "Page %zu numkeys %d\n", mp->mp_pgno, nkeys); for (i=0; i<nkeys; i++) { node = NODEPTR(mp, i); key.mv_size = node->mn_ksize; key.mv_data = node->mn_data; nsize = NODESIZE + NODEKSZ(node) + sizeof(indx_t); - if (F_ISSET(node->mn_flags, F_BIGDATA)) - nsize += sizeof(pgno_t); - else - nsize += NODEDSZ(node); - fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); + if (IS_BRANCH(mp)) { + fprintf(stderr, "key %d: page %zu, %s\n", i, NODEPGNO(node), + DKEY(&key)); + } else { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + nsize += sizeof(pgno_t); + else + nsize += NODEDSZ(node); + fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); + } } } @@ -1151,9 +1208,8 @@ static void mdb_audit(MDB_txn *txn) count = 0; for (i = 0; i<txn->mt_numdbs; i++) { - MDB_xcursor mx, *mxp; - mxp = (txn->mt_dbs[i].md_flags & MDB_DUPSORT) ? &mx : NULL; - mdb_cursor_init(&mc, txn, i, mxp); + MDB_xcursor mx; + mdb_cursor_init(&mc, txn, i, &mx); if (txn->mt_dbs[i].md_root == P_INVALID) continue; count += txn->mt_dbs[i].md_branch_pages + @@ -1194,10 +1250,7 @@ mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { - if (txn->mt_dbxs[dbi].md_dcmp) - return txn->mt_dbxs[dbi].md_dcmp(a, b); - else - return EINVAL; /* too bad you can't distinguish this from a valid result */ + return txn->mt_dbxs[dbi].md_dcmp(a, b); } /** Allocate a single page. @@ -1217,6 +1270,18 @@ mdb_page_malloc(MDB_cursor *mc) { return ret; } +/** Free a single page. + * Saves single pages to a list, for future reuse. + * (This is not used for multi-page overflow pages.) 
+ */ +static void +mdb_page_free(MDB_env *env, MDB_page *mp) +{ + mp->mp_next = env->me_dpages; + VGMEMP_FREE(env, mp); + env->me_dpages = mp; +} + /** Allocate pages for writing. 
* If there are free pages available from older transactions, they * will be re-used first. Otherwise a new page will be allocated. @@ -1234,84 +1299,90 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) MDB_page *np; pgno_t pgno = P_INVALID; MDB_ID2 mid; + txnid_t oldest = 0, last; int rc; *mp = NULL; + + /* If our dirty list is already full, we can't do anything */ + if (txn->mt_dirty_room == 0) + return MDB_TXN_FULL; + /* The free list won't have any content at all until txn 2 has * committed. The pages freed by txn 2 will be unreferenced * after txn 3 commits, and so will be safe to re-use in txn 4. */ if (txn->mt_txnid > 3) { - if (!txn->mt_env->me_pghead && txn->mt_dbs[FREE_DBI].md_root != P_INVALID) { /* See if there's anything in the free DB */ - int j; MDB_reader *r; MDB_cursor m2; MDB_node *leaf; MDB_val data; - txnid_t *kptr, last; + txnid_t *kptr; mdb_cursor_init(&m2, txn, FREE_DBI, NULL); - if (!txn->mt_env->me_pgfirst) { + if (!txn->mt_env->me_pglast) { mdb_page_search(&m2, NULL, 0); leaf = NODEPTR(m2.mc_pg[m2.mc_top], 0); kptr = (txnid_t *)NODEKEY(leaf); last = *kptr; } else { MDB_val key; - int exact; again: - exact = 0; last = txn->mt_env->me_pglast + 1; leaf = NULL; key.mv_data = &last; key.mv_size = sizeof(last); - rc = mdb_cursor_set(&m2, &key, &data, MDB_SET, &exact); + rc = mdb_cursor_set(&m2, &key, &data, MDB_SET_RANGE, NULL); if (rc) goto none; last = *(txnid_t *)key.mv_data; } - /* Unusable if referred by a meta page or reader... */ - j = 1; - if (last < txn->mt_txnid-1) { - j = txn->mt_env->me_txns->mti_numreaders; - r = txn->mt_env->me_txns->mti_readers + j; - for (j = -j; j && (lastmt_txnid - 1; + nr = txn->mt_env->me_txns->mti_numreaders; + r = txn->mt_env->me_txns->mti_readers; + for (i=0; i last) { /* It's usable, grab it. 
*/ - MDB_oldpages *mop; - pgno_t *idl; + pgno_t *idl, *mop; - if (!txn->mt_env->me_pgfirst) { + if (!txn->mt_env->me_pglast) { mdb_node_read(txn, leaf, &data); } - txn->mt_env->me_pglast = last; - if (!txn->mt_env->me_pgfirst) - txn->mt_env->me_pgfirst = last; idl = (MDB_ID *) data.mv_data; /* We might have a zero-length IDL due to freelist growth * during a prior commit */ - if (!idl[0]) goto again; - mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - sizeof(pgno_t)); + if (!idl[0]) { + txn->mt_env->me_pglast = last; + goto again; + } + mop = malloc(MDB_IDL_SIZEOF(idl)); if (!mop) return ENOMEM; - mop->mo_next = txn->mt_env->me_pghead; - mop->mo_txnid = last; - txn->mt_env->me_pghead = mop; - memcpy(mop->mo_pages, idl, MDB_IDL_SIZEOF(idl)); + txn->mt_env->me_pglast = last; + txn->mt_env->me_pghead = txn->mt_env->me_pgfree = mop; + memcpy(mop, idl, MDB_IDL_SIZEOF(idl)); #if MDB_DEBUG > 1 { unsigned int i; DPRINTF("IDL read txn %zu root %zu num %zu", - mop->mo_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]); + last, txn->mt_dbs[FREE_DBI].md_root, idl[0]); for (i=0; imt_env->me_pghead) { - MDB_oldpages *mop = txn->mt_env->me_pghead; + pgno_t *mop = txn->mt_env->me_pghead; if (num > 1) { - /* FIXME: For now, always use fresh pages. We - * really ought to search the free list for a - * contiguous range. - */ - ; + MDB_cursor m2; + int retry = 1, readit = 0, n2 = num-1; + unsigned int i, j, k; + + /* If current list is too short, must fetch more and coalesce */ + if (mop[0] < (unsigned)num) + readit = 1; + + mdb_cursor_init(&m2, txn, FREE_DBI, NULL); + do { +#ifdef MDB_PARANOID /* Seems like we can ignore this now */ + /* If on freelist, don't try to read more. If what we have + * right now isn't enough just use new pages. + * TODO: get all of this working. Many circular dependencies... 
+ */ + if (mc->mc_dbi == FREE_DBI) { + retry = 0; + readit = 0; + } +#endif + if (readit) { + MDB_val key, data; + pgno_t *idl, *mop2; + + last = txn->mt_env->me_pglast + 1; + + /* We haven't hit the readers list yet? */ + if (!oldest) { + MDB_reader *r; + unsigned int nr; + txnid_t mr; + + oldest = txn->mt_txnid - 1; + nr = txn->mt_env->me_txns->mti_numreaders; + r = txn->mt_env->me_txns->mti_readers; + for (i=0; i0 || j>0) { + if (i && idl[i] < mop[j]) + mop2[k--] = idl[i--]; + else + mop2[k--] = mop[j--]; + } + txn->mt_env->me_pglast = last; + free(txn->mt_env->me_pgfree); + txn->mt_env->me_pghead = txn->mt_env->me_pgfree = mop2; + mop = mop2; + /* Keep trying to read until we have enough */ + if (mop[0] < (unsigned)num) { + continue; + } + } + + /* current list has enough pages, but are they contiguous? */ + for (i=mop[0]; i>=(unsigned)num; i--) { + if (mop[i-n2] == mop[i] + n2) { + pgno = mop[i]; + i -= n2; + /* move any stragglers down */ + for (j=i+num; j<=mop[0]; j++) + mop[i++] = mop[j]; + mop[0] -= num; + break; + } + } + + /* Stop if we succeeded, or no retries */ + if (!retry || pgno != P_INVALID) + break; + readit = 1; + + } while (1); } else { /* peel pages off tail, so we only have to truncate the list */ - pgno = MDB_IDL_LAST(mop->mo_pages); - if (MDB_IDL_IS_RANGE(mop->mo_pages)) { - mop->mo_pages[2]++; - if (mop->mo_pages[2] > mop->mo_pages[1]) - mop->mo_pages[0] = 0; - } else { - mop->mo_pages[0]--; - } - if (MDB_IDL_IS_ZERO(mop->mo_pages)) { - txn->mt_env->me_pghead = mop->mo_next; - if (mc->mc_dbi == FREE_DBI) { - mop->mo_next = txn->mt_env->me_pgfree; - txn->mt_env->me_pgfree = mop; - } else { - free(mop); - } - } + pgno = MDB_IDL_LAST(mop); + mop[0]--; + } + if (MDB_IDL_IS_ZERO(mop)) { + free(txn->mt_env->me_pgfree); + txn->mt_env->me_pghead = txn->mt_env->me_pgfree = NULL; } } } @@ -1391,6 +1546,7 @@ none: } else { mdb_mid2l_insert(txn->mt_u.dirty_list, &mid); } + txn->mt_dirty_room--; *mp = np; return MDB_SUCCESS; @@ -1399,6 +1555,7 @@ none: 
/** Copy a page: avoid copying unused portions of the page. * @param[in] dst page to copy into * @param[in] src page to copy from + * @param[in] psize size of a page */ static void mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) @@ -1495,6 +1652,7 @@ finish: return 0; } } + assert(mc->mc_txn->mt_u.dirty_list[0].mid < MDB_IDL_UM_MAX); /* No - copy it */ np = mdb_page_malloc(mc); if (!np) @@ -1515,7 +1673,8 @@ mdb_env_sync(MDB_env *env, int force) int rc = 0; if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { if (env->me_flags & MDB_WRITEMAP) { - int flags = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; + int flags = ((env->me_flags & MDB_MAPASYNC) && !force) + ? MS_ASYNC : MS_SYNC; if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) rc = ErrCode(); #ifdef _WIN32 @@ -1554,7 +1713,7 @@ mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) mc->mc_dbflag = &dst->mt_dbflags[i]; mc->mc_snum = m2->mc_snum; mc->mc_top = m2->mc_top; - mc->mc_flags = m2->mc_flags | C_SHADOW; + mc->mc_flags = m2->mc_flags | (C_SHADOW|C_ALLOCD); for (j=0; jmc_snum; j++) { mc->mc_pg[j] = m2->mc_pg[j]; mc->mc_ki[j] = m2->mc_ki[j]; @@ -1590,30 +1749,34 @@ mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) return MDB_SUCCESS; } -/** Merge shadow cursors back into parent's */ +/** Close this write txn's cursors, after optionally merging its shadow + * cursors back into parent's. + * @param[in] txn the transaction handle. + * @param[in] merge 0 to not merge cursors, C_SHADOW to merge. + * @return 0 on success, non-zero on failure. 
+ */ static void -mdb_cursor_merge(MDB_txn *txn) +mdb_cursors_close(MDB_txn *txn, unsigned merge) { - MDB_dbi i; - for (i=0; imt_numdbs; i++) { - if (txn->mt_cursors[i]) { - MDB_cursor *mc; - while ((mc = txn->mt_cursors[i])) { - txn->mt_cursors[i] = mc->mc_next; - if (mc->mc_flags & C_SHADOW) { + MDB_cursor **cursors = txn->mt_cursors, *mc, *next; + int i, j; + + for (i = txn->mt_numdbs; --i >= 0; ) { + for (mc = cursors[i]; mc; mc = next) { + next = mc->mc_next; + if (mc->mc_flags & merge) { MDB_cursor *m2 = mc->mc_orig; - unsigned int j; m2->mc_snum = mc->mc_snum; m2->mc_top = mc->mc_top; - for (j=0; jmc_snum; j++) { + for (j = mc->mc_snum; --j >= 0; ) { m2->mc_pg[j] = mc->mc_pg[j]; m2->mc_ki[j] = mc->mc_ki[j]; } } if (mc->mc_flags & C_ALLOCD) free(mc); - } } + cursors[i] = NULL; } } @@ -1622,48 +1785,62 @@ mdb_txn_reset0(MDB_txn *txn); /** Common code for #mdb_txn_begin() and #mdb_txn_renew(). * @param[in] txn the transaction handle to initialize - * @return 0 on success, non-zero on failure. This can only - * fail for read-only transactions, and then only if the - * reader table is full. + * @return 0 on success, non-zero on failure. */ static int mdb_txn_renew0(MDB_txn *txn) { MDB_env *env = txn->mt_env; unsigned int i; + uint16_t x; + int rc, new_notls = 0; /* Setup db info */ txn->mt_numdbs = env->me_numdbs; txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ if (txn->mt_flags & MDB_TXN_RDONLY) { - MDB_reader *r = pthread_getspecific(env->me_txkey); - if (!r) { - pid_t pid = env->me_pid; - pthread_t tid = pthread_self(); + if (!env->me_txns) { + i = mdb_env_pick_meta(env); + txn->mt_txnid = env->me_metas[i]->mm_txnid; + txn->mt_u.reader = NULL; + } else { + MDB_reader *r = (env->me_flags & MDB_NOTLS) ? 
txn->mt_u.reader : + pthread_getspecific(env->me_txkey); + if (r) { + if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) + return MDB_BAD_RSLOT; + } else { + pid_t pid = env->me_pid; + pthread_t tid = pthread_self(); - LOCK_MUTEX_R(env); - for (i=0; ime_txns->mti_numreaders; i++) - if (env->me_txns->mti_readers[i].mr_pid == 0) - break; - if (i == env->me_maxreaders) { + LOCK_MUTEX_R(env); + for (i=0; ime_txns->mti_numreaders; i++) + if (env->me_txns->mti_readers[i].mr_pid == 0) + break; + if (i == env->me_maxreaders) { + UNLOCK_MUTEX_R(env); + return MDB_READERS_FULL; + } + env->me_txns->mti_readers[i].mr_pid = pid; + env->me_txns->mti_readers[i].mr_tid = tid; + if (i >= env->me_txns->mti_numreaders) + env->me_txns->mti_numreaders = i+1; + /* Save numreaders for un-mutexed mdb_env_close() */ + env->me_numreaders = env->me_txns->mti_numreaders; UNLOCK_MUTEX_R(env); - return MDB_READERS_FULL; + r = &env->me_txns->mti_readers[i]; + new_notls = (env->me_flags & MDB_NOTLS); + if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { + r->mr_pid = 0; + return rc; + } } - env->me_txns->mti_readers[i].mr_pid = pid; - env->me_txns->mti_readers[i].mr_tid = tid; - if (i >= env->me_txns->mti_numreaders) - env->me_txns->mti_numreaders = i+1; - /* Save numreaders for un-mutexed mdb_env_close() */ - env->me_numreaders = env->me_txns->mti_numreaders; - UNLOCK_MUTEX_R(env); - r = &env->me_txns->mti_readers[i]; - pthread_setspecific(env->me_txkey, r); + txn->mt_txnid = r->mr_txnid = env->me_txns->mti_txnid; + txn->mt_u.reader = r; } - txn->mt_txnid = r->mr_txnid = env->me_txns->mti_txnid; txn->mt_toggle = txn->mt_txnid & 1; txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1; - txn->mt_u.reader = r; } else { LOCK_MUTEX_W(env); @@ -1675,6 +1852,7 @@ mdb_txn_renew0(MDB_txn *txn) if (txn->mt_txnid == mdb_debug_start) mdb_debug = 1; #endif + txn->mt_dirty_room = MDB_IDL_UM_MAX; txn->mt_u.dirty_list = env->me_dirty_list; txn->mt_u.dirty_list[0].mid = 0; 
txn->mt_free_pgs = env->me_free_pgs; @@ -1684,11 +1862,21 @@ mdb_txn_renew0(MDB_txn *txn) /* Copy the DB info and flags */ memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db)); - for (i=2; imt_numdbs; i++) - txn->mt_dbs[i].md_flags = env->me_dbflags[i]; - txn->mt_dbflags[0] = txn->mt_dbflags[1] = 0; - if (txn->mt_numdbs > 2) - memset(txn->mt_dbflags+2, DB_STALE, txn->mt_numdbs-2); + for (i=2; imt_numdbs; i++) { + x = env->me_dbflags[i]; + txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; + txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0; + } + txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID; + + if (env->me_maxpg < txn->mt_next_pgno) { + mdb_txn_reset0(txn); + if (new_notls) { + txn->mt_u.reader->mr_pid = 0; + txn->mt_u.reader = NULL; + } + return MDB_MAP_RESIZED; + } return MDB_SUCCESS; } @@ -1698,7 +1886,7 @@ mdb_txn_renew(MDB_txn *txn) { int rc; - if (! (txn && txn->mt_flags & MDB_TXN_RDONLY)) + if (!txn || txn->mt_dbxs) /* A reset txn has mt_dbxs==NULL */ return EINVAL; if (txn->mt_env->me_flags & MDB_FATAL_ERROR) { @@ -1719,7 +1907,8 @@ int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) { MDB_txn *txn; - int rc, size; + MDB_ntxn *ntxn; + int rc, size, tsize = sizeof(MDB_txn); if (env->me_flags & MDB_FATAL_ERROR) { DPUTS("environment had fatal error, must shutdown!"); @@ -1735,8 +1924,9 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) { return EINVAL; } + tsize = sizeof(MDB_ntxn); } - size = sizeof(MDB_txn) + env->me_maxdbs * (sizeof(MDB_db)+1); + size = tsize + env->me_maxdbs * (sizeof(MDB_db)+1); if (!(flags & MDB_RDONLY)) size += env->me_maxdbs * sizeof(MDB_cursor *); @@ -1744,7 +1934,7 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) DPRINTF("calloc: %s", strerror(ErrCode())); return ENOMEM; } - txn->mt_dbs = (MDB_db *)(txn+1); + txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); if (flags & MDB_RDONLY) { txn->mt_flags |= 
MDB_TXN_RDONLY; txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs); @@ -1755,19 +1945,18 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) txn->mt_env = env; if (parent) { - txn->mt_free_pgs = mdb_midl_alloc(); - if (!txn->mt_free_pgs) { - free(txn); - return ENOMEM; - } + unsigned int i; txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); - if (!txn->mt_u.dirty_list) { - free(txn->mt_free_pgs); + if (!txn->mt_u.dirty_list || + !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) + { + free(txn->mt_u.dirty_list); free(txn); return ENOMEM; } txn->mt_txnid = parent->mt_txnid; txn->mt_toggle = parent->mt_toggle; + txn->mt_dirty_room = parent->mt_dirty_room; txn->mt_u.dirty_list[0].mid = 0; txn->mt_free_pgs[0] = 0; txn->mt_next_pgno = parent->mt_next_pgno; @@ -1776,9 +1965,25 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) txn->mt_numdbs = parent->mt_numdbs; txn->mt_dbxs = parent->mt_dbxs; memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); - memcpy(txn->mt_dbflags, parent->mt_dbflags, txn->mt_numdbs); - mdb_cursor_shadow(parent, txn); + /* Copy parent's mt_dbflags, but clear DB_NEW */ + for (i=0; imt_numdbs; i++) + txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; rc = 0; + ntxn = (MDB_ntxn *)txn; + ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ + if (env->me_pghead) { + size = MDB_IDL_SIZEOF(env->me_pghead); + env->me_pghead = malloc(size); + if (env->me_pghead) + memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); + else + rc = ENOMEM; + } + env->me_pgfree = env->me_pghead; + if (!rc) + rc = mdb_cursor_shadow(parent, txn); + if (rc) + mdb_txn_reset0(txn); } else { rc = mdb_txn_renew0(txn); } @@ -1794,41 +1999,64 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) return rc; } +/** Export or close DBI handles opened in this txn. 
*/ +static void +mdb_dbis_update(MDB_txn *txn, int keep) +{ + int i; + MDB_dbi n = txn->mt_numdbs; + MDB_env *env = txn->mt_env; + unsigned char *tdbflags = txn->mt_dbflags; + + for (i = n; --i >= 2;) { + if (tdbflags[i] & DB_NEW) { + if (keep) { + env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; + } else { + char *ptr = env->me_dbxs[i].md_name.mv_data; + env->me_dbxs[i].md_name.mv_data = NULL; + env->me_dbxs[i].md_name.mv_size = 0; + env->me_dbflags[i] = 0; + free(ptr); + } + } + } + if (keep && env->me_numdbs < n) + env->me_numdbs = n; +} + /** Common code for #mdb_txn_reset() and #mdb_txn_abort(). + * May be called twice for readonly txns: First reset it, then abort. * @param[in] txn the transaction handle to reset */ static void mdb_txn_reset0(MDB_txn *txn) { MDB_env *env = txn->mt_env; + unsigned int i; + + /* Close any DBI handles opened in this txn */ + mdb_dbis_update(txn, 0); if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - txn->mt_u.reader->mr_txnid = (txnid_t)-1; - } else { - MDB_oldpages *mop; - MDB_page *dp; - unsigned int i; - - /* close(free) all cursors */ - for (i=0; imt_numdbs; i++) { - if (txn->mt_cursors[i]) { - MDB_cursor *mc; - while ((mc = txn->mt_cursors[i])) { - txn->mt_cursors[i] = mc->mc_next; - if (mc->mc_flags & C_ALLOCD) - free(mc); - } - } + if (txn->mt_u.reader) { + txn->mt_u.reader->mr_txnid = (txnid_t)-1; + if (!(env->me_flags & MDB_NOTLS)) + txn->mt_u.reader = NULL; /* txn does not own reader */ } + txn->mt_numdbs = 0; /* close nothing if called again */ + txn->mt_dbxs = NULL; /* mark txn as reset */ + } else { + MDB_page *dp; + + mdb_cursors_close(txn, 0); if (!(env->me_flags & MDB_WRITEMAP)) { /* return all dirty pages to dpage list */ for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) { dp = txn->mt_u.dirty_list[i].mptr; if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { - dp->mp_next = txn->mt_env->me_dpages; - VGMEMP_FREE(txn->mt_env, dp); - txn->mt_env->me_dpages = dp; + mdb_page_free(txn->mt_env, dp); } else { /* large pages 
just get freed directly */ VGMEMP_FREE(txn->mt_env, dp); @@ -1837,8 +2065,11 @@ mdb_txn_reset0(MDB_txn *txn) } } + free(env->me_pgfree); + if (txn->mt_parent) { txn->mt_parent->mt_child = NULL; + env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; mdb_midl_free(txn->mt_free_pgs); free(txn->mt_u.dirty_list); return; @@ -1847,11 +2078,7 @@ mdb_txn_reset0(MDB_txn *txn) env->me_free_pgs = txn->mt_free_pgs; } - while ((mop = txn->mt_env->me_pghead)) { - txn->mt_env->me_pghead = mop->mo_next; - free(mop); - } - txn->mt_env->me_pgfirst = 0; + txn->mt_env->me_pghead = txn->mt_env->me_pgfree = NULL; txn->mt_env->me_pglast = 0; env->me_txn = NULL; @@ -1870,6 +2097,10 @@ mdb_txn_reset(MDB_txn *txn) txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *) txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); + /* This call is only valid for read-only txns */ + if (!(txn->mt_flags & MDB_TXN_RDONLY)) + return; + mdb_txn_reset0(txn); } @@ -1887,6 +2118,10 @@ mdb_txn_abort(MDB_txn *txn) mdb_txn_abort(txn->mt_child); mdb_txn_reset0(txn); + /* Free reader slot tied to this txn (if MDB_NOTLS && writable FS) */ + if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader) + txn->mt_u.reader->mr_pid = 0; + free(txn); } @@ -1900,26 +2135,26 @@ mdb_txn_commit(MDB_txn *txn) MDB_page *dp; MDB_env *env; pgno_t next, freecnt; + txnid_t oldpg_txnid, id; MDB_cursor mc; assert(txn != NULL); assert(txn->mt_env != NULL); if (txn->mt_child) { - mdb_txn_commit(txn->mt_child); + rc = mdb_txn_commit(txn->mt_child); txn->mt_child = NULL; + if (rc) { + mdb_txn_abort(txn); + return rc; + } } env = txn->mt_env; if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - if (txn->mt_numdbs > env->me_numdbs) { - /* update the DB flags */ - MDB_dbi i; - for (i = env->me_numdbs; imt_numdbs; i++) - env->me_dbflags[i] = txn->mt_dbs[i].md_flags; - env->me_numdbs = i; - } + mdb_dbis_update(txn, 1); + txn->mt_numdbs = 2; /* so txn_abort() doesn't close any new handles */ mdb_txn_abort(txn); return MDB_SUCCESS; } 
@@ -1932,52 +2167,71 @@ mdb_txn_commit(MDB_txn *txn) return EINVAL; } - /* Merge (and close) our cursors with parent's */ - mdb_cursor_merge(txn); - if (txn->mt_parent) { - MDB_db *ip, *jp; - MDB_dbi i; - unsigned x, y; + MDB_txn *parent = txn->mt_parent; + unsigned x, y, len; MDB_ID2L dst, src; - /* Update parent's DB table */ - ip = &txn->mt_parent->mt_dbs[2]; - jp = &txn->mt_dbs[2]; - for (i = 2; i < txn->mt_numdbs; i++) { - if (ip->md_root != jp->md_root) - *ip = *jp; - ip++; jp++; - } - txn->mt_parent->mt_numdbs = txn->mt_numdbs; - /* Append our free list to parent's */ - mdb_midl_append_list(&txn->mt_parent->mt_free_pgs, - txn->mt_free_pgs); + if (mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs)) { + mdb_txn_abort(txn); + return ENOMEM; + } mdb_midl_free(txn->mt_free_pgs); - /* Merge our dirty list with parent's */ + parent->mt_next_pgno = txn->mt_next_pgno; + parent->mt_flags = txn->mt_flags; + + /* Merge our cursors into parent's and close them */ + mdb_cursors_close(txn, C_SHADOW); + + /* Update parent's DB table. 
*/ + memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + txn->mt_parent->mt_numdbs = txn->mt_numdbs; + txn->mt_parent->mt_dbflags[0] = txn->mt_dbflags[0]; + txn->mt_parent->mt_dbflags[1] = txn->mt_dbflags[1]; + for (i=2; imt_numdbs; i++) { + /* preserve parent's DB_NEW status */ + x = txn->mt_parent->mt_dbflags[i] & DB_NEW; + txn->mt_parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; + } + dst = txn->mt_parent->mt_u.dirty_list; src = txn->mt_u.dirty_list; - x = mdb_mid2l_search(dst, src[1].mid); - for (y=1; y<=src[0].mid; y++) { - while (x <= dst[0].mid && dst[x].mid != src[y].mid) x++; - if (x > dst[0].mid) - break; - free(dst[x].mptr); - dst[x].mptr = src[y].mptr; - } + /* Find len = length of merging our dirty list with parent's */ x = dst[0].mid; - for (; y<=src[0].mid; y++) { - if (++x >= MDB_IDL_UM_MAX) { - mdb_txn_abort(txn); - return MDB_TXN_FULL; + dst[0].mid = 0; /* simplify loops */ + if (parent->mt_parent) { + len = x + src[0].mid; + y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; + for (i = x; y && i; y--) { + pgno_t yp = src[y].mid; + while (yp < dst[i].mid) + i--; + if (yp == dst[i].mid) { + i--; + len--; + } } - dst[x] = src[y]; + } else { /* Simplify the above for single-ancestor case */ + len = MDB_IDL_UM_MAX - txn->mt_dirty_room; } - dst[0].mid = x; + /* Merge our dirty list with parent's */ + y = src[0].mid; + for (i = len; y; dst[i--] = src[y--]) { + pgno_t yp = src[y].mid; + while (yp < dst[x].mid) + dst[i--] = dst[x--]; + if (yp == dst[x].mid) + free(dst[x--].mptr); + } + assert(i == x); + dst[0].mid = len; free(txn->mt_u.dirty_list); + parent->mt_dirty_room = txn->mt_dirty_room; + txn->mt_parent->mt_child = NULL; + free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pgfree); free(txn); return MDB_SUCCESS; } @@ -1988,15 +2242,15 @@ mdb_txn_commit(MDB_txn *txn) return EINVAL; } - if (!txn->mt_u.dirty_list[0].mid) + mdb_cursors_close(txn, 0); + + if (!txn->mt_u.dirty_list[0].mid && !(txn->mt_flags & MDB_TXN_DIRTY)) goto done; 
DPRINTF("committing txn %zu %p on mdbenv %p, root page %zu", txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); - /* Update DB root pointers. Their pages have already been - * touched so this is all in-place and cannot fail. - */ + /* Update DB root pointers */ if (txn->mt_numdbs > 2) { MDB_dbi i; MDB_val data; @@ -2006,130 +2260,149 @@ mdb_txn_commit(MDB_txn *txn) for (i = 2; i < txn->mt_numdbs; i++) { if (txn->mt_dbflags[i] & DB_DIRTY) { data.mv_data = &txn->mt_dbs[i]; - mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0); + rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0); + if (rc) + goto fail; } } } + /* Save the freelist as of this transaction to the freeDB. This + * can change the freelist, so keep trying until it stabilizes. + * + * env->me_pglast and the length of txn->mt_free_pgs cannot decrease, + * except the code below can decrease env->me_pglast to split pghead. + * Page numbers cannot disappear from txn->mt_free_pgs. New pages + * can only appear in env->me_pghead when env->me_pglast increases. + * Until then, the me_pghead pointer won't move but can become NULL. 
+ */ + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + oldpg_txnid = id = 0; + freecnt = 0; /* should only be one record now */ - if (env->me_pghead) { + if (env->me_pghead || env->me_pglast) { /* make sure first page of freeDB is touched and on freelist */ - mdb_page_search(&mc, NULL, MDB_PS_MODIFY); + rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) { +fail: + mdb_txn_abort(txn); + return rc; + } } /* Delete IDLs we used from the free list */ - if (env->me_pgfirst) { - txnid_t cur; + if (env->me_pglast) { MDB_val key; - int exact = 0; - key.mv_size = sizeof(cur); - for (cur = env->me_pgfirst; cur <= env->me_pglast; cur++) { - key.mv_data = &cur; - - mdb_cursor_set(&mc, &key, NULL, MDB_SET, &exact); + do { +free_pgfirst: + rc = mdb_cursor_first(&mc, &key, NULL); + if (rc) + goto fail; + oldpg_txnid = *(txnid_t *)key.mv_data; +again: + assert(oldpg_txnid <= env->me_pglast); + id = 0; rc = mdb_cursor_del(&mc, 0); - if (rc) { - mdb_txn_abort(txn); - return rc; - } - } - env->me_pgfirst = 0; - env->me_pglast = 0; + if (rc) + goto fail; + } while (oldpg_txnid < env->me_pglast); } - /* save to free list */ + /* Save IDL of pages freed by this txn, to freeDB */ free2: - freecnt = txn->mt_free_pgs[0]; - if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) { + if (freecnt != txn->mt_free_pgs[0]) { MDB_val key, data; /* make sure last page of freeDB is touched and on freelist */ - key.mv_size = MAXKEYSIZE+1; + key.mv_size = MDB_MAXKEYSIZE+1; key.mv_data = NULL; - mdb_page_search(&mc, &key, MDB_PS_MODIFY); + rc = mdb_page_search(&mc, &key, MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + goto fail; - mdb_midl_sort(txn->mt_free_pgs); #if MDB_DEBUG > 1 { unsigned int i; MDB_IDL idl = txn->mt_free_pgs; + mdb_midl_sort(txn->mt_free_pgs); DPRINTF("IDL write txn %zu root %zu num %zu", txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]); - for (i=0; imt_txnid; - data.mv_data = txn->mt_free_pgs; /* The free list can still grow during this call, - * despite the 
pre-emptive touches above. So check - * and make sure the entire thing got written. + * despite the pre-emptive touches above. So retry + * until the reserved space remains big enough. */ do { + assert(freecnt < txn->mt_free_pgs[0]); freecnt = txn->mt_free_pgs[0]; data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs); - rc = mdb_cursor_put(&mc, &key, &data, 0); - if (rc) { - mdb_txn_abort(txn); - return rc; - } + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + goto fail; } while (freecnt != txn->mt_free_pgs[0]); + mdb_midl_sort(txn->mt_free_pgs); + memcpy(data.mv_data, txn->mt_free_pgs, data.mv_size); + if (oldpg_txnid < env->me_pglast || (!env->me_pghead && id)) + goto free_pgfirst; /* used up freeDB[oldpg_txnid] */ } - /* should only be one record now */ -again: + + /* Put back page numbers we took from freeDB but did not use */ if (env->me_pghead) { + for (;;) { MDB_val key, data; - MDB_oldpages *mop; - pgno_t orig; - txnid_t id; + pgno_t orig, *mop; mop = env->me_pghead; - id = mop->mo_txnid; + id = env->me_pglast; key.mv_size = sizeof(id); key.mv_data = &id; - data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages); - data.mv_data = mop->mo_pages; - orig = mop->mo_pages[0]; /* These steps may grow the freelist again * due to freed overflow pages... 
*/ - mdb_cursor_put(&mc, &key, &data, 0); - if (mop == env->me_pghead && env->me_pghead->mo_txnid == id) { - /* could have been used again here */ - if (mop->mo_pages[0] != orig) { - data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages); - data.mv_data = mop->mo_pages; - id = mop->mo_txnid; - mdb_cursor_put(&mc, &key, &data, 0); - } - env->me_pghead = NULL; - free(mop); - } else { - /* was completely used up */ - mdb_cursor_del(&mc, 0); - if (env->me_pghead) - goto again; - } - env->me_pgfirst = 0; - env->me_pglast = 0; - } - - while (env->me_pgfree) { - MDB_oldpages *mop = env->me_pgfree; - env->me_pgfree = mop->mo_next; - free(mop);; + i = 2; + do { + orig = mop[0]; + if (orig > env->me_maxfree_1pg && id > 4) + orig = env->me_maxfree_1pg; /* Do not use more than 1 page */ + data.mv_size = (orig + 1) * sizeof(pgno_t); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + goto fail; + assert(!env->me_pghead || env->me_pglast); + /* mop could have been used again here */ + if (id != env->me_pglast || env->me_pghead == NULL) + goto again; /* was completely used up */ + assert(mop == env->me_pghead); + } while (mop[0] < orig && --i); + memcpy(data.mv_data, mop, data.mv_size); + if (mop[0] <= orig) + break; + *(pgno_t *)data.mv_data = orig; + mop[orig] = mop[0] - orig; + env->me_pghead = mop += orig; + /* Save more oldpages at the previous txnid. 
*/ + assert(env->me_pglast == id && id == oldpg_txnid); + env->me_pglast = --oldpg_txnid; + } } /* Check for growth of freelist again */ if (freecnt != txn->mt_free_pgs[0]) goto free2; + free(env->me_pgfree); + env->me_pghead = env->me_pgfree = NULL; + if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) { if (mdb_midl_shrink(&txn->mt_free_pgs)) env->me_free_pgs = txn->mt_free_pgs; @@ -2247,9 +2520,7 @@ again: for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) { dp = txn->mt_u.dirty_list[i].mptr; if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { - dp->mp_next = txn->mt_env->me_dpages; - VGMEMP_FREE(txn->mt_env, dp); - txn->mt_env->me_dpages = dp; + mdb_page_free(txn->mt_env, dp); } else { VGMEMP_FREE(txn->mt_env, dp); free(dp); @@ -2266,14 +2537,9 @@ sync: } done: + env->me_pglast = 0; env->me_txn = NULL; - if (txn->mt_numdbs > env->me_numdbs) { - /* update the DB flags */ - MDB_dbi i; - for (i = env->me_numdbs; imt_numdbs; i++) - env->me_dbflags[i] = txn->mt_dbs[i].md_flags; - env->me_numdbs = i; - } + mdb_dbis_update(txn, 1); UNLOCK_MUTEX_W(env); free(txn); @@ -2293,47 +2559,61 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) MDB_pagebuf pbuf; MDB_page *p; MDB_meta *m; - int rc, err; + int i, rc, err; /* We don't know the page size yet, so use a minimum value. + * Read both meta pages so we can use the latest one. 
*/ + for (i=0; i<2; i++) { #ifdef _WIN32 - if (!ReadFile(env->me_fd, &pbuf, MDB_PAGESIZE, (DWORD *)&rc, NULL) || rc == 0) + if (!ReadFile(env->me_fd, &pbuf, MDB_PAGESIZE, (DWORD *)&rc, NULL) || rc == 0) #else - if ((rc = read(env->me_fd, &pbuf, MDB_PAGESIZE)) == 0) + if ((rc = read(env->me_fd, &pbuf, MDB_PAGESIZE)) == 0) #endif - { - return ENOENT; - } - else if (rc != MDB_PAGESIZE) { - err = ErrCode(); - if (rc > 0) - err = MDB_INVALID; - DPRINTF("read: %s", strerror(err)); - return err; - } + { + return ENOENT; + } + else if (rc != MDB_PAGESIZE) { + err = ErrCode(); + if (rc > 0) + err = MDB_INVALID; + DPRINTF("read: %s", strerror(err)); + return err; + } - p = (MDB_page *)&pbuf; + p = (MDB_page *)&pbuf; - if (!F_ISSET(p->mp_flags, P_META)) { - DPRINTF("page %zu not a meta page", p->mp_pgno); - return MDB_INVALID; + if (!F_ISSET(p->mp_flags, P_META)) { + DPRINTF("page %zu not a meta page", p->mp_pgno); + return MDB_INVALID; + } + + m = METADATA(p); + if (m->mm_magic != MDB_MAGIC) { + DPUTS("meta has invalid magic"); + return MDB_INVALID; + } + + if (m->mm_version != MDB_VERSION) { + DPRINTF("database is version %u, expected version %u", + m->mm_version, MDB_VERSION); + return MDB_VERSION_MISMATCH; + } + + if (i) { + if (m->mm_txnid > meta->mm_txnid) + memcpy(meta, m, sizeof(*m)); + } else { + memcpy(meta, m, sizeof(*m)); +#ifdef _WIN32 + if (SetFilePointer(env->me_fd, meta->mm_psize, NULL, FILE_BEGIN) != meta->mm_psize) +#else + if (lseek(env->me_fd, meta->mm_psize, SEEK_SET) != meta->mm_psize) +#endif + return ErrCode(); + } } - - m = METADATA(p); - if (m->mm_magic != MDB_MAGIC) { - DPUTS("meta has invalid magic"); - return MDB_INVALID; - } - - if (m->mm_version != MDB_VERSION) { - DPRINTF("database is version %u, expected version %u", - m->mm_version, MDB_VERSION); - return MDB_VERSION_MISMATCH; - } - - memcpy(meta, m, sizeof(*m)); return 0; } @@ -2356,6 +2636,7 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) meta->mm_magic = MDB_MAGIC; meta->mm_version = 
MDB_VERSION; + meta->mm_mapsize = env->me_mapsize; meta->mm_psize = psize; meta->mm_last_pg = 1; meta->mm_flags = env->me_flags & 0xffff; @@ -2381,10 +2662,12 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) #ifdef _WIN32 { DWORD len; + SetFilePointer(env->me_fd, 0, NULL, FILE_BEGIN); rc = WriteFile(env->me_fd, p, psize * 2, &len, NULL); rc = (len == psize * 2) ? MDB_SUCCESS : ErrCode(); } #else + lseek(env->me_fd, 0, SEEK_SET); rc = write(env->me_fd, p, psize * 2); rc = (rc == (int)psize * 2) ? MDB_SUCCESS : ErrCode(); #endif @@ -2400,10 +2683,11 @@ static int mdb_env_write_meta(MDB_txn *txn) { MDB_env *env; - MDB_meta meta, metab; + MDB_meta meta, metab, *mp; off_t off; int rc, len, toggle; char *ptr; + HANDLE mfd; #ifdef _WIN32 OVERLAPPED ov; #endif @@ -2416,9 +2700,12 @@ mdb_env_write_meta(MDB_txn *txn) toggle, txn->mt_dbs[MAIN_DBI].md_root); env = txn->mt_env; + mp = env->me_metas[toggle]; if (env->me_flags & MDB_WRITEMAP) { - MDB_meta *mp = env->me_metas[toggle]; + /* Persist any increases of mapsize config */ + if (env->me_mapsize > mp->mm_mapsize) + mp->mm_mapsize = env->me_mapsize; mp->mm_dbs[0] = txn->mt_dbs[0]; mp->mm_dbs[1] = txn->mt_dbs[1]; mp->mm_last_pg = txn->mt_next_pgno - 1; @@ -2439,7 +2726,13 @@ mdb_env_write_meta(MDB_txn *txn) metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg; ptr = (char *)&meta; - off = offsetof(MDB_meta, mm_dbs[0].md_depth); + if (env->me_mapsize > mp->mm_mapsize) { + /* Persist any increases of mapsize config */ + meta.mm_mapsize = env->me_mapsize; + off = offsetof(MDB_meta, mm_mapsize); + } else { + off = offsetof(MDB_meta, mm_dbs[0].md_depth); + } len = sizeof(MDB_meta) - off; ptr += off; @@ -2453,17 +2746,18 @@ mdb_env_write_meta(MDB_txn *txn) off += PAGEHDRSZ; /* Write to the SYNC fd */ + mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ? 
+ env->me_fd : env->me_mfd; #ifdef _WIN32 { memset(&ov, 0, sizeof(ov)); ov.Offset = off; - WriteFile(env->me_mfd, ptr, len, (DWORD *)&rc, &ov); + WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov); } #else - rc = pwrite(env->me_mfd, ptr, len, off); + rc = pwrite(mfd, ptr, len, off); #endif if (rc != len) { - int r2; rc = ErrCode(); DPUTS("write failed, disk error?"); /* On a failure, the pagecache still contains the new data. @@ -2475,7 +2769,7 @@ mdb_env_write_meta(MDB_txn *txn) #ifdef _WIN32 WriteFile(env->me_fd, ptr, len, NULL, &ov); #else - r2 = pwrite(env->me_fd, ptr, len, off); + pwrite(env->me_fd, ptr, len, off); #endif fail: env->me_flags |= MDB_FATAL_ERROR; @@ -2512,13 +2806,8 @@ mdb_env_create(MDB_env **env) if (!e) return ENOMEM; - e->me_free_pgs = mdb_midl_alloc(); - if (!e->me_free_pgs) { - free(e); - return ENOMEM; - } e->me_maxreaders = DEFAULT_READERS; - e->me_maxdbs = 2; + e->me_maxdbs = e->me_numdbs = 2; e->me_fd = INVALID_HANDLE_VALUE; e->me_lfd = INVALID_HANDLE_VALUE; e->me_mfd = INVALID_HANDLE_VALUE; @@ -2548,7 +2837,7 @@ mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) { if (env->me_map) return EINVAL; - env->me_maxdbs = dbs; + env->me_maxdbs = dbs + 2; /* Named databases + main and free DB */ return MDB_SUCCESS; } @@ -2573,14 +2862,13 @@ mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) /** Further setup required for opening an MDB environment */ static int -mdb_env_open2(MDB_env *env, unsigned int flags) +mdb_env_open2(MDB_env *env) { + unsigned int flags = env->me_flags; int i, newenv = 0, prot; MDB_meta meta; MDB_page *p; - env->me_flags = flags; - memset(&meta, 0, sizeof(meta)); if ((i = mdb_env_read_header(env, &meta)) != 0) { @@ -2590,8 +2878,19 @@ mdb_env_open2(MDB_env *env, unsigned int flags) newenv = 1; } + /* Was a mapsize configured? */ if (!env->me_mapsize) { + /* If this is a new environment, take the default, + * else use the size recorded in the existing env. + */ env->me_mapsize = newenv ? 
DEFAULT_MAPSIZE : meta.mm_mapsize; + } else if (env->me_mapsize < meta.mm_mapsize) { + /* If the configured size is smaller, make sure it's + * still big enough. Silently round up to minimum if not. + */ + size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; + if (env->me_mapsize < minsize) + env->me_mapsize = minsize; } #ifdef _WIN32 @@ -2624,12 +2923,11 @@ mdb_env_open2(MDB_env *env, unsigned int flags) } #else i = MAP_SHARED; - if (meta.mm_address && (flags & MDB_FIXEDMAP)) - i |= MAP_FIXED; prot = PROT_READ; if (flags & MDB_WRITEMAP) { prot |= PROT_WRITE; - ftruncate(env->me_fd, env->me_mapsize); + if (ftruncate(env->me_fd, env->me_mapsize) < 0) + return ErrCode(); } env->me_map = mmap(meta.mm_address, env->me_mapsize, prot, i, env->me_fd, 0); @@ -2637,18 +2935,34 @@ mdb_env_open2(MDB_env *env, unsigned int flags) env->me_map = NULL; return ErrCode(); } -#endif + /* Turn off readahead. It's harmful when the DB is larger than RAM. */ +#ifdef MADV_RANDOM + madvise(env->me_map, env->me_mapsize, MADV_RANDOM); +#else +#ifdef POSIX_MADV_RANDOM + posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); +#endif /* POSIX_MADV_RANDOM */ +#endif /* MADV_RANDOM */ +#endif /* _WIN32 */ if (newenv) { - meta.mm_mapsize = env->me_mapsize; if (flags & MDB_FIXEDMAP) meta.mm_address = env->me_map; i = mdb_env_init_meta(env, &meta); if (i != MDB_SUCCESS) { return i; } + } else if (meta.mm_address && env->me_map != meta.mm_address) { + /* Can happen because the address argument to mmap() is just a + * hint. mmap() can pick another, e.g. if the range is in use. + * The MAP_FIXED flag would prevent that, but then mmap could + * instead unmap existing pages to make room for the new map. + */ + return EBUSY; /* TODO: Make a new MDB_* error code? 
*/ } env->me_psize = meta.mm_psize; + env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + env->me_nodemax = (env->me_psize - PAGEHDRSZ) / MDB_MINKEYS; env->me_maxpg = env->me_mapsize / env->me_psize; @@ -2759,9 +3073,12 @@ mdb_env_share_locks(MDB_env *env, int *excl) * then release the existing exclusive lock. */ memset(&ov, 0, sizeof(ov)); - LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov); - UnlockFile(env->me_lfd, 0, 0, 1, 0); - *excl = 0; + if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { + rc = ErrCode(); + } else { + UnlockFile(env->me_lfd, 0, 0, 1, 0); + *excl = 0; + } } #else { @@ -2794,7 +3111,9 @@ mdb_env_excl_lock(MDB_env *env, int *excl) } else { OVERLAPPED ov; memset(&ov, 0, sizeof(ov)); - if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { + if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { + *excl = 0; + } else { rc = ErrCode(); } } @@ -2805,7 +3124,9 @@ mdb_env_excl_lock(MDB_env *env, int *excl) lock_info.l_whence = SEEK_SET; lock_info.l_start = 0; lock_info.l_len = 1; - if (!fcntl(env->me_lfd, F_SETLK, &lock_info)) { + while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && + (rc = ErrCode()) == EINTR) ; + if (!rc) { *excl = 1; } else # ifdef MDB_USE_POSIX_SEM @@ -2903,48 +3224,69 @@ mdb_hash_hex(MDB_val *val, char *hexbuf) * @param[in] lpath The pathname of the file used for the lock region. * @param[in] mode The Unix permissions for the file, if we create it. * @param[out] excl Resulting file lock type: -1 none, 0 shared, 1 exclusive + * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive * @return 0 on success, non-zero on failure. 
*/ static int mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) { +#ifdef _WIN32 +# define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT +#else +# define MDB_ERRCODE_ROFS EROFS +#ifdef O_CLOEXEC /* Linux: Open file and set FD_CLOEXEC atomically */ +# define MDB_CLOEXEC O_CLOEXEC +#else + int fdflags; +# define MDB_CLOEXEC 0 +#endif +#endif int rc; off_t size, rsize; - *excl = -1; - #ifdef _WIN32 - if ((env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE, + env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) { - goto fail_errno; - } - /* Try to get exclusive lock. If we succeed, then - * nobody is using the lock region and we should initialize it. - */ - if ((rc = mdb_env_excl_lock(env, excl))) goto fail; - size = GetFileSize(env->me_lfd, NULL); - + FILE_ATTRIBUTE_NORMAL, NULL); #else -#if !(O_CLOEXEC) - { - int fdflags; - if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT, mode)) == -1) - goto fail_errno; - /* Lose record locks when exec*() */ - if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) - fcntl(env->me_lfd, F_SETFD, fdflags); - } -#else /* O_CLOEXEC on Linux: Open file and set FD_CLOEXEC atomically */ - if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT|O_CLOEXEC, mode)) == -1) + env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode); +#endif + if (env->me_lfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) { + return MDB_SUCCESS; + } goto fail_errno; + } +#if ! 
((MDB_CLOEXEC) || defined(_WIN32)) + /* Lose record locks when exec*() */ + if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) + fcntl(env->me_lfd, F_SETFD, fdflags); #endif + if (!(env->me_flags & MDB_NOTLS)) { + rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); + if (rc) + goto fail; + env->me_flags |= MDB_ENV_TXKEY; +#ifdef _WIN32 + /* Windows TLS callbacks need help finding their TLS info. */ + if (mdb_tls_nkeys >= MAX_TLS_KEYS) { + rc = MDB_TLS_FULL; + goto fail; + } + mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey; +#endif + } + /* Try to get exclusive lock. If we succeed, then * nobody is using the lock region and we should initialize it. */ if ((rc = mdb_env_excl_lock(env, excl))) goto fail; +#ifdef _WIN32 + size = GetFileSize(env->me_lfd, NULL); +#else size = lseek(env->me_lfd, 0, SEEK_END); #endif rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); @@ -2996,7 +3338,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd; mdb_sec_inited = 1; } - GetFileInformationByHandle(env->me_lfd, &stbuf); + if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno; idbuf.volume = stbuf.dwVolumeSerialNumber; idbuf.nhigh = stbuf.nFileIndexHigh; idbuf.nlow = stbuf.nFileIndexLow; @@ -3095,14 +3437,20 @@ fail: #define DATANAME "/data.mdb" /** The suffix of the lock file when no subdir is used */ #define LOCKSUFF "-lock" + /** Only a subset of the @ref mdb_env flags can be changed + * at runtime. Changing other flags requires closing the + * environment and re-opening it with the new flags. 
+ */ +#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC) +#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|MDB_NOTLS) int -mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode) +mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) { - int oflags, rc, len, excl; + int oflags, rc, len, excl = -1; char *lpath, *dpath; - if (env->me_fd != INVALID_HANDLE_VALUE) + if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) return EINVAL; len = strlen(path); @@ -3124,13 +3472,31 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode) sprintf(dpath, "%s" DATANAME, path); } - rc = mdb_env_setup_locks(env, lpath, mode, &excl); + rc = MDB_SUCCESS; + flags |= env->me_flags; + if (flags & MDB_RDONLY) { + /* silently ignore WRITEMAP when we're only getting read access */ + flags &= ~MDB_WRITEMAP; + } else { + if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && + (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) + rc = ENOMEM; + } + env->me_flags = flags |= MDB_ENV_ACTIVE; if (rc) goto leave; - /* silently ignore WRITEMAP if we're only getting read access */ - if (F_ISSET(flags, MDB_RDONLY) && F_ISSET(flags, MDB_WRITEMAP)) - flags ^= MDB_WRITEMAP; + env->me_path = strdup(path); + env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); + env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); + if (!(env->me_dbxs && env->me_path && env->me_dbflags)) { + rc = ENOMEM; + goto leave; + } + + rc = mdb_env_setup_locks(env, lpath, mode, &excl); + if (rc) + goto leave; #ifdef _WIN32 if (F_ISSET(flags, MDB_RDONLY)) { @@ -3156,11 +3522,13 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode) goto leave; } - if ((rc = mdb_env_open2(env, flags)) == MDB_SUCCESS) { - if (flags & (MDB_RDONLY|MDB_NOSYNC|MDB_NOMETASYNC|MDB_WRITEMAP)) { + if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { + if (flags & 
(MDB_RDONLY|MDB_WRITEMAP)) { env->me_mfd = env->me_fd; } else { - /* synchronous fd for meta writes */ + /* Synchronous fd for meta writes. Needed even with + * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. + */ #ifdef _WIN32 env->me_mfd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, @@ -3174,27 +3542,9 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode) } } DPRINTF("opened dbenv %p", (void *) env); - pthread_key_create(&env->me_txkey, mdb_env_reader_dest); - env->me_numdbs = 2; /* this notes that me_txkey was set */ -#ifdef _WIN32 - /* Windows TLS callbacks need help finding their TLS info. */ - if (mdb_tls_nkeys < MAX_TLS_KEYS) - mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey; - else { - rc = MDB_TLS_FULL; - goto leave; - } -#endif if (excl > 0) { rc = mdb_env_share_locks(env, &excl); - if (rc) - goto leave; } - env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); - env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); - env->me_path = strdup(path); - if (!env->me_dbxs || !env->me_dbflags || !env->me_path) - rc = ENOMEM; } leave: @@ -3205,20 +3555,27 @@ leave: return rc; } -/** Destroy resources from mdb_env_open() and clear our readers */ +/** Destroy resources from mdb_env_open(), clear our readers & DBIs */ static void mdb_env_close0(MDB_env *env, int excl) { int i; - if (env->me_lfd == INVALID_HANDLE_VALUE) /* 1st field to get inited */ + if (!(env->me_flags & MDB_ENV_ACTIVE)) return; + /* Doing this here since me_dbxs may not exist during mdb_env_close */ + for (i = env->me_maxdbs; --i > MAIN_DBI; ) + free(env->me_dbxs[i].md_name.mv_data); + free(env->me_dbflags); free(env->me_dbxs); free(env->me_path); + free(env->me_dirty_list); + if (env->me_free_pgs) + mdb_midl_free(env->me_free_pgs); - if (env->me_numdbs) { + if (env->me_flags & MDB_ENV_TXKEY) { pthread_key_delete(env->me_txkey); #ifdef _WIN32 /* Delete our key from the global list */ @@ -3272,9 +3629,146 @@ mdb_env_close0(MDB_env 
*env, int excl) #endif munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); } - close(env->me_lfd); + if (env->me_lfd != INVALID_HANDLE_VALUE) { +#ifdef _WIN32 + if (excl >= 0) { + /* Unlock the lockfile. Windows would have unlocked it + * after closing anyway, but not necessarily at once. + */ + UnlockFile(env->me_lfd, 0, 0, 1, 0); + } +#endif + close(env->me_lfd); + } - env->me_lfd = INVALID_HANDLE_VALUE; /* Mark env as reset */ + env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); +} + +int +mdb_env_copy(MDB_env *env, const char *path) +{ + MDB_txn *txn = NULL; + int rc, len; + size_t wsize; + char *lpath, *ptr; + HANDLE newfd = INVALID_HANDLE_VALUE; + + if (env->me_flags & MDB_NOSUBDIR) { + lpath = (char *)path; + } else { + len = strlen(path); + len += sizeof(DATANAME); + lpath = malloc(len); + if (!lpath) + return ENOMEM; + sprintf(lpath, "%s" DATANAME, path); + } + + /* The destination path must exist, but the destination file must not. + * We don't want the OS to cache the writes, since the source data is + * already in the OS cache. + */ +#ifdef _WIN32 + newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, + FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); +#else + newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL +#ifdef O_DIRECT + |O_DIRECT +#endif + , 0666); +#endif + if (!(env->me_flags & MDB_NOSUBDIR)) + free(lpath); + if (newfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } + +#ifdef F_NOCACHE /* __APPLE__ */ + rc = fcntl(newfd, F_NOCACHE, 1); + if (rc) { + rc = ErrCode(); + goto leave; + } +#endif + + /* Do the lock/unlock of the reader mutex before starting the + * write txn. Otherwise other read txns could block writers. 
+ */ + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + goto leave; + + if (env->me_txns) { + /* We must start the actual read txn after blocking writers */ + mdb_txn_reset0(txn); + + /* Temporarily block writers until we snapshot the meta pages */ + LOCK_MUTEX_W(env); + + rc = mdb_txn_renew0(txn); + if (rc) { + UNLOCK_MUTEX_W(env); + goto leave; + } + } + + wsize = env->me_psize * 2; +#ifdef _WIN32 + { + DWORD len; + rc = WriteFile(newfd, env->me_map, wsize, &len, NULL); + rc = (len == wsize) ? MDB_SUCCESS : ErrCode(); + } +#else + rc = write(newfd, env->me_map, wsize); + rc = (rc == (int)wsize) ? MDB_SUCCESS : ErrCode(); +#endif + if (env->me_txns) + UNLOCK_MUTEX_W(env); + + if (rc) + goto leave; + + ptr = env->me_map + wsize; + wsize = txn->mt_next_pgno * env->me_psize - wsize; +#define MAX_WRITE 2147483648U +#ifdef _WIN32 + while (wsize > 0) { + DWORD len, w2; + if (wsize > MAX_WRITE) + w2 = MAX_WRITE; + else + w2 = wsize; + rc = WriteFile(newfd, ptr, w2, &len, NULL); + rc = (len == w2) ? MDB_SUCCESS : ErrCode(); + if (rc) break; + wsize -= w2; + ptr += w2; + } +#else + while (wsize > 0) { + size_t w2; + ssize_t wres; + if (wsize > MAX_WRITE) + w2 = MAX_WRITE; + else + w2 = wsize; + wres = write(newfd, ptr, w2); + rc = (wres > 0) ? 
MDB_SUCCESS : ErrCode(); + if (rc) break; + wsize -= wres; + ptr += wres; + } +#endif + +leave: + mdb_txn_abort(txn); + if (newfd != INVALID_HANDLE_VALUE) + close(newfd); + + return rc; } void @@ -3293,7 +3787,6 @@ mdb_env_close(MDB_env *env) } mdb_env_close0(env, 0); - mdb_midl_free(env->me_free_pgs); free(env); } @@ -3504,7 +3997,7 @@ static void mdb_cursor_pop(MDB_cursor *mc) { if (mc->mc_snum) { -#if MDB_DEBUG +#ifndef MDB_DEBUG_SKIP MDB_page *top = mc->mc_pg[mc->mc_top]; #endif mc->mc_snum--; @@ -3546,28 +4039,31 @@ mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret) { MDB_page *p = NULL; - if (txn->mt_env->me_flags & MDB_WRITEMAP) { - if (pgno < txn->mt_next_pgno) - p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); - goto done; + if (!((txn->mt_flags & MDB_TXN_RDONLY) | + (txn->mt_env->me_flags & MDB_WRITEMAP))) + { + MDB_txn *tx2 = txn; + do { + MDB_ID2L dl = tx2->mt_u.dirty_list; + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + p = dl[x].mptr; + goto done; + } + } + } while ((tx2 = tx2->mt_parent) != NULL); } - if (!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY) && txn->mt_u.dirty_list[0].mid) { - unsigned x; - x = mdb_mid2l_search(txn->mt_u.dirty_list, pgno); - if (x <= txn->mt_u.dirty_list[0].mid && txn->mt_u.dirty_list[x].mid == pgno) { - p = txn->mt_u.dirty_list[x].mptr; - } - } - if (!p) { - if (pgno < txn->mt_next_pgno) - p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); - } -done: - *ret = p; - if (!p) { + + if (pgno < txn->mt_next_pgno) { + p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); + } else { DPRINTF("page %zu not found", pgno); assert(p != NULL); } + +done: + *ret = p; return (p != NULL) ? MDB_SUCCESS : MDB_PAGE_NOTFOUND; } @@ -3578,8 +4074,7 @@ done: * @param[in,out] mc the cursor for this operation. * @param[in] key the key to search for. If NULL, search for the lowest * page. (This is used by #mdb_cursor_first().) 
- * @param[in] flags If MDB_PS_MODIFY set, visited pages are updated with new page numbers. - * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. + * @param[in] modify If true, visited pages are updated with new page numbers. * @return 0 on success, non-zero on failure. */ static int @@ -3600,7 +4095,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) if (key == NULL) /* Initialize cursor to first page. */ i = 0; - else if (key->mv_size > MAXKEYSIZE && key->mv_data == NULL) { + else if (key->mv_size > MDB_MAXKEYSIZE && key->mv_data == NULL) { /* cursor to last page */ i = NUMKEYS(mp)-1; } else { @@ -3649,6 +4144,28 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) return MDB_SUCCESS; } +/** Search for the lowest key under the current branch page. + * This just bypasses a NUMKEYS check in the current page + * before calling mdb_page_search_root(), because the callers + * are all in situations where the current page is known to + * be underfilled. + */ +static int +mdb_page_search_lowest(MDB_cursor *mc) +{ + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NODEPTR(mp, 0); + int rc; + + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp))) + return rc; + + mc->mc_ki[mc->mc_top] = 0; + if ((rc = mdb_cursor_push(mc, mp))) + return rc; + return mdb_page_search_root(mc, NULL, 0); +} + /** Search for the page a given key should be in. * Pushes parent pages on the cursor stack. This function just sets up * the search; it finds the root page for \b mc's database and sets this @@ -3657,7 +4174,8 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) * @param[in,out] mc the cursor for this operation. * @param[in] key the key to search for. If NULL, search for the lowest * page. (This is used by #mdb_cursor_first().) - * @param[in] modify If true, visited pages are updated with new page numbers. + * @param[in] flags If MDB_PS_MODIFY set, visited pages are updated with new page numbers. 
+ * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. * @return 0 on success, non-zero on failure. */ static int @@ -3686,16 +4204,25 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) if (*mc->mc_dbflag & DB_STALE) { MDB_val data; int exact = 0; + uint16_t flags; MDB_node *leaf = mdb_node_search(&mc2, &mc->mc_dbx->md_name, &exact); if (!exact) return MDB_NOTFOUND; mdb_node_read(mc->mc_txn, leaf, &data); + memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), + sizeof(uint16_t)); + /* The txn may not know this DBI, or another process may + * have dropped and recreated the DB with other flags. + */ + if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags) + return MDB_INCOMPATIBLE; memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); } if (flags & MDB_PS_MODIFY) dbflag = DB_DIRTY; - *mc->mc_dbflag = dbflag; + *mc->mc_dbflag &= ~DB_STALE; + *mc->mc_dbflag |= dbflag; } } root = mc->mc_db->md_root; @@ -3773,10 +4300,10 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi, assert(data); DPRINTF("===> get db %u key [%s]", dbi, DKEY(key)); - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs) + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) return EINVAL; - if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { + if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { return EINVAL; } @@ -3811,8 +4338,12 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) : (mc->mc_ki[mc->mc_top] == 0)) { DPRINTF("no more keys left, moving to %s sibling", move_right ? 
"right" : "left"); - if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) + if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) { + /* undo cursor_pop before returning */ + mc->mc_top++; + mc->mc_snum++; return rc; + } } else { if (move_right) mc->mc_ki[mc->mc_top]++; @@ -3825,9 +4356,11 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp))) - return rc;; + return rc; mdb_cursor_push(mc, mp); + if (!move_right) + mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; return MDB_SUCCESS; } @@ -3893,7 +4426,7 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) mdb_xcursor_init1(mc, leaf); } if (data) { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data) != MDB_SUCCESS)) + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) return rc; if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -3966,7 +4499,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) mdb_xcursor_init1(mc, leaf); } if (data) { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data) != MDB_SUCCESS)) + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) return rc; if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -4211,20 +4744,20 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) if (!(mc->mc_flags & C_EOF)) { - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - MDB_val lkey; + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + MDB_val lkey; + + lkey.mv_size = MDB_MAXKEYSIZE+1; + lkey.mv_data = NULL; + rc = mdb_page_search(mc, &lkey, 0); + if (rc != MDB_SUCCESS) + return rc; + } + assert(IS_LEAF(mc->mc_pg[mc->mc_top])); - lkey.mv_size = MAXKEYSIZE+1; - lkey.mv_data = NULL; - rc = mdb_page_search(mc, &lkey, 0); - if (rc != MDB_SUCCESS) - return rc; } - assert(IS_LEAF(mc->mc_pg[mc->mc_top])); - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; mc->mc_flags |= C_INITIALIZED|C_EOF; - } leaf 
= NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { @@ -4262,7 +4795,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, switch (op) { case MDB_GET_CURRENT: - if (!mc->mc_flags & C_INITIALIZED) { + if (!(mc->mc_flags & C_INITIALIZED)) { rc = EINVAL; } else { MDB_page *mp = mc->mc_pg[mc->mc_top]; @@ -4298,7 +4831,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, case MDB_SET: case MDB_SET_KEY: case MDB_SET_RANGE: - if (key == NULL || key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { + if (key == NULL || key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { rc = EINVAL; } else if (op == MDB_SET_RANGE) rc = mdb_cursor_set(mc, key, data, op, NULL); @@ -4352,11 +4885,12 @@ fetchm: case MDB_PREV: case MDB_PREV_DUP: case MDB_PREV_NODUP: - if (!(mc->mc_flags & C_INITIALIZED) || (mc->mc_flags & C_EOF)) { + if (!(mc->mc_flags & C_INITIALIZED)) { rc = mdb_cursor_last(mc, key, data); - mc->mc_flags &= ~C_EOF; - } else - rc = mdb_cursor_prev(mc, key, data, op); + mc->mc_flags |= C_INITIALIZED; + mc->mc_ki[mc->mc_top]++; + } + rc = mdb_cursor_prev(mc, key, data, op); break; case MDB_FIRST: rc = mdb_cursor_first(mc, key, data); @@ -4404,11 +4938,12 @@ mdb_cursor_touch(MDB_cursor *mc) if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { MDB_cursor mc2; - mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); + MDB_xcursor mcx; + mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); if (rc) return rc; - *mc->mc_dbflag = DB_DIRTY; + *mc->mc_dbflag |= DB_DIRTY; } for (mc->mc_top = 0; mc->mc_top < mc->mc_snum; mc->mc_top++) { rc = mdb_page_touch(mc); @@ -4432,13 +4967,24 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, size_t nsize; int rc, rc2; MDB_pagebuf pbuf; - char dbuf[MAXKEYSIZE+1]; + char dbuf[MDB_MAXKEYSIZE+1]; unsigned int nflags; DKBUF; if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY)) return EACCES; + if (flags != 
MDB_CURRENT && (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE)) + return EINVAL; + + if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT) && data->mv_size > MDB_MAXKEYSIZE) + return EINVAL; + +#if SIZE_MAX > MAXDATASIZE + if (data->mv_size > MAXDATASIZE) + return EINVAL; +#endif + DPRINTF("==> put db %u key [%s], size %zu, data size %zu", mc->mc_dbi, DKEY(key), key ? key->mv_size:0, data->mv_size); @@ -4459,7 +5005,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, mdb_cursor_push(mc, np); mc->mc_db->md_root = np->mp_pgno; mc->mc_db->md_depth++; - *mc->mc_dbflag = DB_DIRTY; + *mc->mc_dbflag |= DB_DIRTY; if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) == MDB_DUPFIXED) np->mp_flags |= P_LEAF2; @@ -4478,7 +5024,8 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, rc = MDB_NOTFOUND; mc->mc_ki[mc->mc_top]++; } else { - rc = 0; + /* new key is <= last key */ + rc = MDB_KEYEXIST; } } } else { @@ -4587,8 +5134,7 @@ reuse: } offset += offset & 1; if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + NODEDSZ(leaf) + - offset >= (mc->mc_txn->mt_env->me_psize - PAGEHDRSZ) / - MDB_MINKEYS) { + offset >= mc->mc_txn->mt_env->me_nodemax) { /* yes, convert it */ dummy.md_flags = 0; if (mc->mc_db->md_flags & MDB_DUPFIXED) { @@ -4653,9 +5199,10 @@ current: /* Is the ov page writable and large enough? */ if ((omp->mp_flags & P_DIRTY) && ovpages >= dpages) { /* yes, overwrite it. Note in this case we don't - * bother to try shrinking the node if the new data + * bother to try shrinking the page if the new data * is smaller than the overflow threshold. 
*/ + SETDSZ(leaf, data->mv_size); if (F_ISSET(flags, MDB_RESERVE)) data->mv_data = METADATA(omp); else @@ -4678,8 +5225,10 @@ current: */ if (F_ISSET(flags, MDB_RESERVE)) data->mv_data = NODEDATA(leaf); - else + else if (data->mv_size) memcpy(NODEDATA(leaf), data->mv_data, data->mv_size); + else + memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); goto done; } mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0); @@ -4764,6 +5313,8 @@ put_sub: } } } + /* we've done our job */ + dkey.mv_size = 0; } if (flags & MDB_APPENDDUP) xflags |= MDB_APPEND; @@ -4788,6 +5339,11 @@ put_sub: } } done: + /* If we succeeded and the key didn't exist before, make sure + * the cursor is marked valid. + */ + if (!rc && insert) + mc->mc_flags |= C_INITIALIZED; return rc; } @@ -4800,7 +5356,7 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY)) return EACCES; - if (!mc->mc_flags & C_INITIALIZED) + if (!(mc->mc_flags & C_INITIALIZED)) return EINVAL; rc = mdb_cursor_touch(mc); @@ -4896,7 +5452,7 @@ mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) size_t sz; sz = LEAFSIZE(key, data); - if (sz >= env->me_psize / MDB_MINKEYS) { + if (sz >= env->me_nodemax) { /* put on overflow page */ sz -= data->mv_size - sizeof(pgno_t); } @@ -4921,7 +5477,7 @@ mdb_branch_size(MDB_env *env, MDB_val *key) size_t sz; sz = INDXSIZE(key); - if (sz >= env->me_psize / MDB_MINKEYS) { + if (sz >= env->me_nodemax) { /* put on overflow page */ /* not implemented */ /* sz -= key->size - sizeof(pgno_t); */ @@ -4989,7 +5545,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, if (F_ISSET(flags, F_BIGDATA)) { /* Data already on overflow page. */ node_size += sizeof(pgno_t); - } else if (node_size + data->mv_size >= mc->mc_txn->mt_env->me_psize / MDB_MINKEYS) { + } else if (node_size + data->mv_size >= mc->mc_txn->mt_env->me_nodemax) { int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); int rc; /* Put data on overflow page. 
*/ @@ -5244,8 +5800,8 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) } DPRINTF("Sub-db %u for db %u root page %zu", mx->mx_cursor.mc_dbi, mc->mc_dbi, mx->mx_db.md_root); - mx->mx_dbflag = (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY)) ? - DB_DIRTY : 0; + mx->mx_dbflag = DB_VALID | (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY) ? + DB_DIRTY : 0); mx->mx_dbx.md_name.mv_data = NODEKEY(node); mx->mx_dbx.md_name.mv_size = node->mn_ksize; #if UINT_MAX < SIZE_MAX @@ -5288,10 +5844,9 @@ int mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) { MDB_cursor *mc; - MDB_xcursor *mx = NULL; size_t size = sizeof(MDB_cursor); - if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs) + if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) return EINVAL; /* Allow read access to the freelist */ @@ -5302,13 +5857,11 @@ mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) size += sizeof(MDB_xcursor); if ((mc = malloc(size)) != NULL) { - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { - mx = (MDB_xcursor *)(mc + 1); - } - mdb_cursor_init(mc, txn, dbi, mx); + mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); if (txn->mt_cursors) { mc->mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = mc; + mc->mc_flags |= C_UNTRACK; } mc->mc_flags |= C_ALLOCD; } else { @@ -5323,13 +5876,19 @@ mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { + unsigned flags; + if (txn == NULL || mc == NULL || mc->mc_dbi >= txn->mt_numdbs) return EINVAL; - if (txn->mt_cursors) + if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) return EINVAL; + flags = mc->mc_flags; + mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); + + mc->mc_flags |= (flags & C_ALLOCD); return MDB_SUCCESS; } @@ -5362,7 +5921,7 @@ mdb_cursor_close(MDB_cursor *mc) { if (mc != NULL) { /* remove from txn, if tracked */ - if (mc->mc_txn->mt_cursors) { + if ((mc->mc_flags & C_UNTRACK) && 
mc->mc_txn->mt_cursors) { MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; if (*prev == mc) @@ -5383,32 +5942,34 @@ mdb_cursor_txn(MDB_cursor *mc) MDB_dbi mdb_cursor_dbi(MDB_cursor *mc) { - if (!mc) return 0; + assert(mc != NULL); return mc->mc_dbi; } /** Replace the key for a node with a new key. - * @param[in] mp The page containing the node to operate on. - * @param[in] indx The index of the node to operate on. + * @param[in] mc Cursor pointing to the node to operate on. * @param[in] key The new key to use. * @return 0 on success, non-zero on failure. */ static int -mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key) +mdb_update_key(MDB_cursor *mc, MDB_val *key) { + MDB_page *mp; MDB_node *node; char *base; size_t len; int delta, delta0; - indx_t ptr, i, numkeys; + indx_t ptr, i, numkeys, indx; DKBUF; + indx = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; node = NODEPTR(mp, indx); ptr = mp->mp_ptrs[indx]; #if MDB_DEBUG { MDB_val k2; - char kbuf2[(MAXKEYSIZE*2+1)]; + char kbuf2[(MDB_MAXKEYSIZE*2+1)]; k2.mv_data = NODEKEY(node); k2.mv_size = node->mn_ksize; DPRINTF("update key %u (ofs %u) [%s] to [%s] on page %zu", @@ -5427,8 +5988,12 @@ mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key) delta += (delta & 1); if (delta) { if (delta > 0 && SIZELEFT(mp) < delta) { - DPRINTF("OUCH! Not enough room, delta = %d", delta); - return MDB_PAGE_FULL; + pgno_t pgno; + /* not enough space left, do a delete and split */ + DPRINTF("Not enough room, delta = %d, splitting...", delta); + pgno = NODEPGNO(node); + mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0); + return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); } numkeys = NUMKEYS(mp); @@ -5455,17 +6020,20 @@ mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key) return MDB_SUCCESS; } +static void +mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); + /** Move a node from csrc to cdst. 
*/ static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) { - int rc; MDB_node *srcnode; MDB_val key, data; pgno_t srcpg; + MDB_cursor mn; + int rc; unsigned short flags; - DKBUF; /* Mark src and dst as dirty. */ @@ -5490,7 +6058,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) unsigned int snum = csrc->mc_snum; MDB_node *s2; /* must find the lowest key below src */ - mdb_page_search_root(csrc, NULL, 0); + mdb_page_search_lowest(csrc); if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { key.mv_size = csrc->mc_db->md_pad; key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); @@ -5513,7 +6081,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) MDB_node *s2; MDB_val bkey; /* must find the lowest key below dst */ - mdb_page_search_root(cdst, NULL, 0); + mdb_page_search_lowest(cdst); if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) { bkey.mv_size = cdst->mc_db->md_pad; bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size); @@ -5524,7 +6092,11 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) } cdst->mc_snum = snum--; cdst->mc_top = snum; - rc = mdb_update_key(cdst->mc_pg[cdst->mc_top], 0, &bkey); + mdb_cursor_copy(cdst, &mn); + mn.mc_ki[snum] = 0; + rc = mdb_update_key(&mn, &bkey); + if (rc) + return rc; } DPRINTF("moving %s node %u [%s] on page %zu to node %u on page %zu", @@ -5580,14 +6152,19 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) } DPRINTF("update separator for source page %zu to [%s]", csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)); - if ((rc = mdb_update_key(csrc->mc_pg[csrc->mc_top-1], csrc->mc_ki[csrc->mc_top-1], - &key)) != MDB_SUCCESS) + mdb_cursor_copy(csrc, &mn); + mn.mc_snum--; + mn.mc_top--; + if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) return rc; } if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { MDB_val nullkey; + indx_t ix = csrc->mc_ki[csrc->mc_top]; nullkey.mv_size = 0; - rc = mdb_update_key(csrc->mc_pg[csrc->mc_top], 0, &nullkey); + csrc->mc_ki[csrc->mc_top] = 0; + rc = mdb_update_key(csrc, &nullkey); + 
csrc->mc_ki[csrc->mc_top] = ix; assert(rc == MDB_SUCCESS); } } @@ -5603,14 +6180,19 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) } DPRINTF("update separator for destination page %zu to [%s]", cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)); - if ((rc = mdb_update_key(cdst->mc_pg[cdst->mc_top-1], cdst->mc_ki[cdst->mc_top-1], - &key)) != MDB_SUCCESS) + mdb_cursor_copy(cdst, &mn); + mn.mc_snum--; + mn.mc_top--; + if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) return rc; } if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { MDB_val nullkey; + indx_t ix = cdst->mc_ki[cdst->mc_top]; nullkey.mv_size = 0; - rc = mdb_update_key(cdst->mc_pg[cdst->mc_top], 0, &nullkey); + cdst->mc_ki[cdst->mc_top] = 0; + rc = mdb_update_key(cdst, &nullkey); + cdst->mc_ki[cdst->mc_top] = ix; assert(rc == MDB_SUCCESS); } } @@ -5663,7 +6245,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) unsigned int snum = csrc->mc_snum; MDB_node *s2; /* must find the lowest key below src */ - mdb_page_search_root(csrc, NULL, 0); + mdb_page_search_lowest(csrc); if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { key.mv_size = csrc->mc_db->md_pad; key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); @@ -5695,7 +6277,10 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) mdb_node_del(csrc->mc_pg[csrc->mc_top-1], csrc->mc_ki[csrc->mc_top-1], 0); if (csrc->mc_ki[csrc->mc_top-1] == 0) { key.mv_size = 0; - if ((rc = mdb_update_key(csrc->mc_pg[csrc->mc_top-1], 0, &key)) != MDB_SUCCESS) + csrc->mc_top--; + rc = mdb_update_key(csrc, &key); + csrc->mc_top++; + if (rc) return rc; } @@ -5764,9 +6349,10 @@ mdb_rebalance(MDB_cursor *mc) { MDB_node *node; int rc; - unsigned int ptop; + unsigned int ptop, minkeys; MDB_cursor mn; + minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top])); #if MDB_DEBUG { pgno_t pgno; @@ -5777,7 +6363,8 @@ mdb_rebalance(MDB_cursor *mc) } #endif - if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD) { + if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= 
FILL_THRESHOLD && + NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { #if MDB_DEBUG pgno_t pgno; COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno); @@ -5789,6 +6376,10 @@ mdb_rebalance(MDB_cursor *mc) if (mc->mc_snum < 2) { MDB_page *mp = mc->mc_pg[0]; + if (IS_SUBP(mp)) { + DPUTS("Can't rebalance a subpage, ignoring"); + return MDB_SUCCESS; + } if (NUMKEYS(mp) == 0) { DPUTS("tree is completely empty"); mc->mc_db->md_root = P_INVALID; @@ -5844,6 +6435,8 @@ mdb_rebalance(MDB_cursor *mc) if (m3->mc_snum < mc->mc_snum) continue; if (m3->mc_pg[0] == mp) { m3->mc_pg[0] = mc->mc_pg[0]; + m3->mc_snum = 1; + m3->mc_top = 0; } } } @@ -5893,20 +6486,21 @@ mdb_rebalance(MDB_cursor *mc) DPRINTF("found neighbor page %zu (%u keys, %.1f%% full)", mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10); - /* If the neighbor page is above threshold and has at least two - * keys, move one key from it. - * - * Otherwise we should try to merge them. + /* If the neighbor page is above threshold and has enough keys, + * move one key from it. Otherwise we should try to merge them. + * (A branch page must never have less than 2 keys.) */ - if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) >= 2) + minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top])); + if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) return mdb_node_move(&mn, mc); - else { /* FIXME: if (has_enough_room()) */ - mc->mc_flags &= ~C_INITIALIZED; + else { if (mc->mc_ki[ptop] == 0) - return mdb_page_merge(&mn, mc); + rc = mdb_page_merge(&mn, mc); else - return mdb_page_merge(mc, &mn); + rc = mdb_page_merge(mc, &mn); + mc->mc_flags &= ~C_INITIALIZED; } + return rc; } /** Complete a delete operation started by #mdb_cursor_del(). 
*/ @@ -5934,6 +6528,9 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) rc = mdb_rebalance(mc); if (rc != MDB_SUCCESS) mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + /* if mc points past last node in page, invalidate */ + else if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + mc->mc_flags &= ~C_INITIALIZED; return rc; } @@ -5953,14 +6550,14 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi, DPRINTF("====> delete db %u key [%s]", dbi, DKEY(key)); - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs) + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) return EINVAL; if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { return EACCES; } - if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { + if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { return EINVAL; } @@ -5976,8 +6573,20 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi, xdata = NULL; } rc = mdb_cursor_set(&mc, key, xdata, op, &exact); - if (rc == 0) + if (rc == 0) { + /* let mdb_page_split know about this cursor if needed: + * delete will trigger a rebalance; if it needs to move + * a node from one page to another, it will have to + * update the parent's separator key(s). If the new sepkey + * is larger than the current one, the parent page may + * run out of space, triggering a split. We need this + * cursor to be consistent until the end of the rebalance. + */ + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; rc = mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA); + txn->mt_cursors[dbi] = mc.mc_next; + } return rc; } @@ -6325,12 +6934,21 @@ newsep: } } else { mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. 
+ */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; i<ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + mc->mc_pg[ptop] = mn.mc_pg[ptop]; + mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; + } } } /* return tmp page to freelist */ - copy->mp_next = mc->mc_txn->mt_env->me_dpages; - VGMEMP_FREE(mc->mc_txn->mt_env, copy); - mc->mc_txn->mt_env->me_dpages = copy; + mdb_page_free(mc->mc_txn->mt_env, copy); done: { /* Adjust other cursors pointing to mp */ @@ -6394,29 +7012,24 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, assert(key != NULL); assert(data != NULL); - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs) + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) return EINVAL; if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { return EACCES; } - if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { + if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { return EINVAL; } - if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND)) != flags) + if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) return EINVAL; mdb_cursor_init(&mc, txn, dbi, &mx); return mdb_cursor_put(&mc, key, data, flags); } -/** Only a subset of the @ref mdb_env flags can be changed - * at runtime. Changing other flags requires closing the environment - * and re-opening it with the new flags. - */ -#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC) int mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) { @@ -6480,12 +7093,30 @@ mdb_env_stat(MDB_env *env, MDB_stat *arg) return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg); } +int +mdb_env_info(MDB_env *env, MDB_envinfo *arg) +{ + int toggle; + + if (env == NULL || arg == NULL) + return EINVAL; + + toggle = mdb_env_pick_meta(env); + arg->me_mapaddr = (env->me_flags & MDB_FIXEDMAP) ?
env->me_map : 0; + arg->me_mapsize = env->me_mapsize; + arg->me_maxreaders = env->me_maxreaders; + arg->me_numreaders = env->me_numreaders; + arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg; + arg->me_last_txnid = env->me_metas[toggle]->mm_txnid; + return MDB_SUCCESS; +} + /** Set the default comparison functions for a database. * Called immediately after a database is opened to set the defaults. * The user can then override them with #mdb_set_compare() or * #mdb_set_dupsort(). * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_open() + * @param[in] dbi A database handle returned by #mdb_dbi_open() */ static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi) @@ -6503,7 +7134,7 @@ mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi) : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn)); } -int mdb_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) +int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) { MDB_val key, data; MDB_dbi i; @@ -6516,11 +7147,20 @@ int mdb_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) mdb_default_cmp(txn, FREE_DBI); } + if ((flags & VALID_FLAGS) != flags) + return EINVAL; + /* main DB? 
*/ if (!name) { *dbi = MAIN_DBI; - if (flags & (MDB_DUPSORT|MDB_REVERSEKEY|MDB_INTEGERKEY)) - txn->mt_dbs[MAIN_DBI].md_flags |= (flags & (MDB_DUPSORT|MDB_REVERSEKEY|MDB_INTEGERKEY)); + if (flags & PERSISTENT_FLAGS) { + uint16_t f2 = flags & PERSISTENT_FLAGS; + /* make sure flag changes get committed */ + if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { + txn->mt_dbs[MAIN_DBI].md_flags |= f2; + txn->mt_flags |= MDB_TXN_DIRTY; + } + } mdb_default_cmp(txn, MAIN_DBI); return MDB_SUCCESS; } @@ -6545,11 +7185,11 @@ int mdb_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) } /* If no free slot and max hit, fail */ - if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs - 1) + if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) return MDB_DBS_FULL; /* Find the DB info */ - dbflag = 0; + dbflag = DB_NEW|DB_VALID; exact = 0; key.mv_size = len; key.mv_data = (void *)name; @@ -6567,9 +7207,9 @@ int mdb_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) data.mv_data = &dummy; memset(&dummy, 0, sizeof(dummy)); dummy.md_root = P_INVALID; - dummy.md_flags = flags & 0xffff; + dummy.md_flags = flags & PERSISTENT_FLAGS; rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); - dbflag = DB_DIRTY; + dbflag |= DB_DIRTY; } /* OK, got info, add to table */ @@ -6585,7 +7225,6 @@ int mdb_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) mdb_default_cmp(txn, slot); if (!unused) { txn->mt_numdbs++; - txn->mt_env->me_numdbs++; } } @@ -6600,14 +7239,15 @@ int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); } -void mdb_close(MDB_env *env, MDB_dbi dbi) +void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) { char *ptr; - if (dbi <= MAIN_DBI || dbi >= env->me_numdbs) + if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs) return; ptr = env->me_dbxs[dbi].md_name.mv_data; env->me_dbxs[dbi].md_name.mv_data = NULL; env->me_dbxs[dbi].md_name.mv_size = 0; + 
env->me_dbflags[dbi] = 0; free(ptr); } @@ -6628,7 +7268,7 @@ mdb_drop0(MDB_cursor *mc, int subs) unsigned int i; /* LEAF2 pages have no nodes, cannot have sub-DBs */ - if (!subs || IS_LEAF2(mc->mc_pg[mc->mc_top])) + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) mdb_cursor_pop(mc); mdb_cursor_copy(mc, &mx); @@ -6636,7 +7276,15 @@ mdb_drop0(MDB_cursor *mc, int subs) if (IS_LEAF(mc->mc_pg[mc->mc_top])) { for (i=0; i<NUMKEYS(mc->mc_pg[mc->mc_top]); i++) { ni = NODEPTR(mc->mc_pg[mc->mc_top], i); - if (ni->mn_flags & F_SUBDATA) { + if (ni->mn_flags & F_BIGDATA) { + int j, ovpages = OVPAGES(NODEDSZ(ni), mc->mc_txn->mt_env->me_psize); + pgno_t pg; + memcpy(&pg, NODEDATA(ni), sizeof(pg)); + for (j=0; j<ovpages; j++) { + mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg); + pg++; + } + } else if (subs && (ni->mn_flags & F_SUBDATA)) { mdb_xcursor_init1(mc, ni); rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); if (rc) @@ -6654,14 +7302,18 @@ mdb_drop0(MDB_cursor *mc, int subs) } if (!mc->mc_top) break; + mc->mc_ki[mc->mc_top] = i; rc = mdb_cursor_sibling(mc, 1); if (rc) { /* no more siblings, go back to beginning - * of previous level. (stack was already popped - * by mdb_cursor_sibling) + * of previous level.
*/ - for (i=1; i<mc->mc_top; i++) + mdb_cursor_pop(mc); + mc->mc_ki[0] = 0; + for (i=1; i<mc->mc_snum; i++) { + mc->mc_ki[i] = 0; mc->mc_pg[i] = mx.mc_pg[i]; + } } } /* free it */ @@ -6676,7 +7328,7 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) MDB_cursor *mc; int rc; - if (!txn || !dbi || dbi >= txn->mt_numdbs) + if (!txn || !dbi || dbi >= txn->mt_numdbs || (unsigned)del > 1 || !(txn->mt_dbflags[dbi] & DB_VALID)) return EINVAL; if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) @@ -6693,8 +7345,10 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) /* Can't delete the main DB */ if (del && dbi > MAIN_DBI) { rc = mdb_del(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL); - if (!rc) - mdb_close(txn->mt_env, dbi); + if (!rc) { + txn->mt_dbflags[dbi] = DB_STALE; + mdb_dbi_close(txn->mt_env, dbi); + } } else { /* reset the DB record, mark it dirty */ txn->mt_dbflags[dbi] |= DB_DIRTY; @@ -6705,19 +7359,7 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) txn->mt_dbs[dbi].md_entries = 0; txn->mt_dbs[dbi].md_root = P_INVALID; - if (!txn->mt_u.dirty_list[0].mid) { - MDB_cursor m2; - MDB_val key, data; - /* make sure we have at least one dirty page in this txn - * otherwise these changes will be ignored.
- */ - key.mv_size = sizeof(txnid_t); - key.mv_data = &txn->mt_txnid; - data.mv_size = sizeof(MDB_ID); - data.mv_data = txn->mt_free_pgs; - mdb_cursor_init(&m2, txn, FREE_DBI, NULL); - rc = mdb_cursor_put(&m2, &key, &data, 0); - } + txn->mt_flags |= MDB_TXN_DIRTY; } leave: mdb_cursor_close(mc); @@ -6726,7 +7368,7 @@ leave: int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs) + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) return EINVAL; txn->mt_dbxs[dbi].md_cmp = cmp; @@ -6735,7 +7377,7 @@ int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs) + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) return EINVAL; txn->mt_dbxs[dbi].md_dcmp = cmp; @@ -6744,7 +7386,7 @@ int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) { - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs) + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) return EINVAL; txn->mt_dbxs[dbi].md_rel = rel; @@ -6753,7 +7395,7 @@ int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) { - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs) + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) return EINVAL; txn->mt_dbxs[dbi].md_relctx = ctx; @@ -6761,3 +7403,5 @@ int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) } /** @} */ + +/* http://gitorious.org/mdb/mdb/blobs/raw/mdb.master/libraries/liblmdb/mdb.c */ diff --git a/c_src/mdb.h b/c_src/lmdb.h similarity index 72% rename from c_src/mdb.h rename to c_src/lmdb.h index 925bf17..5b25ef6 100644 --- a/c_src/mdb.h +++ b/c_src/lmdb.h @@ -1,7 +1,9 @@ -/** @file mdb.h - * 
@brief memory-mapped database library +/** @file lmdb.h + * @brief Lightning memory-mapped database library * - * @mainpage MDB Memory-Mapped Database Manager + * @mainpage Lightning Memory-Mapped Database Manager (MDB) + * + * @section intro_sec Introduction * MDB is a Btree-based database management library modeled loosely on the * BerkeleyDB API, but much simplified. The entire database is exposed * in a memory map, and all data fetches return data directly @@ -38,9 +40,69 @@ * corrupt the database. Of course if your application code is known to * be bug-free (...) then this is not an issue. * + * @section caveats_sec Caveats + * Troubleshooting the lock file, plus semaphores on BSD systems: + * + * - A broken lockfile can cause sync issues. + * Stale reader transactions left behind by an aborted program + * cause further writes to grow the database quickly, and + * stale locks can block further operation. + * + * Fix: Terminate all programs using the database, or make + * them close it. Next database user will reset the lockfile. + * + * - On BSD systems or others configured with MDB_USE_POSIX_SEM, + * startup can fail due to semaphores owned by another userid. + * + * Fix: Open and close the database as the user which owns the + * semaphores (likely last user) or as root, while no other + * process is using the database. + * + * Restrictions/caveats (in addition to those listed for some functions): + * + * - Only the database owner should normally use the database on + * BSD systems or when otherwise configured with MDB_USE_POSIX_SEM. + * Multiple users can cause startup to fail later, as noted above. + * + * - A thread can only use one transaction at a time, plus any child + * transactions. Each transaction belongs to one thread. See below. + * The #MDB_NOTLS flag changes this for read-only transactions. + * + * - Use an MDB_env* in the process which opened it, without fork()ing. 
+ * + * - Do not have an MDB database open twice in the same process at + * the same time. Not even from a plain open() call - close()ing it + * breaks flock() advisory locking. + * + * - Avoid long-lived transactions. Read transactions prevent + * reuse of pages freed by newer write transactions, thus the + * database can grow quickly. Write transactions prevent + * other write transactions, since writes are serialized. + * + * - Avoid suspending a process with active transactions. These + * would then be "long-lived" as above. Also read transactions + * suspended when writers commit could sometimes see wrong data. + * + * ...when several processes can use a database concurrently: + * + * - Avoid aborting a process with an active transaction. + * The transaction becomes "long-lived" as above until the lockfile + * is reset, since the process may not remove it from the lockfile. + * + * - If you do that anyway, close the environment once in a while, + * so the lockfile can get reset. + * + * - Do not use MDB databases on remote filesystems, even between + * processes on the same host. This breaks flock() on some OSes, + * possibly memory map sync, and certainly sync between programs + * on different hosts. + * + * - Opening a database can fail if another process is opening or + * closing it at exactly the same time. + * * @author Howard Chu, Symas Corporation. * - * @copyright Copyright 2011-2012 Howard Chu, Symas Corp. All rights reserved. + * @copyright Copyright 2011-2013 Howard Chu, Symas Corp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP @@ -50,7 +112,7 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. * - * @par Derived From: + * @par Derived From: * This code is derived from btree.c written by Martin Hedenfalk.
* * Copyright (c) 2009, 2010 Martin Hedenfalk @@ -67,8 +129,8 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#ifndef _MDB_H_ -#define _MDB_H_ +#ifndef _LMDB_H_ +#define _LMDB_H_ #include <sys/types.h> @@ -76,8 +138,15 @@ extern "C" { #endif -/** @defgroup public Public API +#ifdef _MSC_VER +typedef int mdb_mode_t; +#else +typedef mode_t mdb_mode_t; +#endif + +/** @defgroup mdb MDB API * @{ + * @brief OpenLDAP Lightning Memory-Mapped Database Manager */ /** @defgroup Version Version Macros * @{ @@ -87,7 +156,7 @@ extern "C" { /** Library minor version */ #define MDB_VERSION_MINOR 9 /** Library patch version */ -#define MDB_VERSION_PATCH 4 +#define MDB_VERSION_PATCH 6 /** Combine args a,b,c into a single integer for easy version comparisons */ #define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) @@ -97,7 +166,7 @@ extern "C" { MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) /** The release date of this library version */ -#define MDB_VERSION_DATE "September 14, 2012" +#define MDB_VERSION_DATE "January 10, 2013" /** A stringifier for the version info */ #define MDB_VERSTR(a,b,c,d) "MDB " #a "." #b "." #c ": (" d ")" @@ -130,7 +199,17 @@ typedef unsigned int MDB_dbi; /** @brief Opaque structure for navigating through a database */ typedef struct MDB_cursor MDB_cursor; -/** @brief Generic structure used for passing keys and data in and out of the database. */ +/** @brief Generic structure used for passing keys and data in and out + * of the database. + * + * Key sizes must be between 1 and the liblmdb build-time constant + * #MDB_MAXKEYSIZE inclusive. This currently defaults to 511. The + * same applies to data sizes in databases with the #MDB_DUPSORT flag. + * Other data items can in theory be from 0 to 0xffffffff bytes long. + * + * Values returned from the database are valid only until a subsequent + * update operation, or the end of the transaction.
+ */ typedef struct MDB_val { size_t mv_size; /**< size of the data item */ void *mv_data; /**< address of the data item */ @@ -156,12 +235,14 @@ typedef int (MDB_cmp_func)(const MDB_val *a, const MDB_val *b); typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *relctx); /** @defgroup mdb_env Environment Flags + * + * Values do not overlap Database Flags. * @{ */ - /** mmap at a fixed address */ -#define MDB_FIXEDMAP 0x01 + /** mmap at a fixed address (experimental) */ +#define MDB_FIXEDMAP 0x01 /** no environment directory */ -#define MDB_NOSUBDIR 0x02 +#define MDB_NOSUBDIR 0x4000 /** don't fsync after commit */ #define MDB_NOSYNC 0x10000 /** read only */ @@ -170,11 +251,15 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_NOMETASYNC 0x40000 /** use writable mmap */ #define MDB_WRITEMAP 0x80000 - /** use asynchronous msync */ + /** use asynchronous msync when MDB_WRITEMAP is used */ #define MDB_MAPASYNC 0x100000 + /** tie reader locktable slots to #MDB_txn objects instead of to threads */ +#define MDB_NOTLS 0x200000 /** @} */ -/** @defgroup mdb_open Database Flags +/** @defgroup mdb_dbi_open Database Flags + * + * Values do not overlap Environment Flags. 
* @{ */ /** use reverse string keys */ @@ -281,13 +366,19 @@ typedef enum MDB_cursor_op { #define MDB_READERS_FULL (-30790) /** Too many TLS keys in use - Windows only */ #define MDB_TLS_FULL (-30789) - /** Nested txn has too many dirty pages */ + /** Txn has too many dirty pages */ #define MDB_TXN_FULL (-30788) /** Cursor stack too deep - internal error */ #define MDB_CURSOR_FULL (-30787) /** Page has not enough space - internal error */ #define MDB_PAGE_FULL (-30786) -#define MDB_LAST_ERRCODE MDB_PAGE_FULL + /** Database contents grew beyond environment mapsize */ +#define MDB_MAP_RESIZED (-30785) + /** Database flags changed or would change */ +#define MDB_INCOMPATIBLE (-30784) + /** Invalid reuse of reader locktable slot */ +#define MDB_BAD_RSLOT (-30783) +#define MDB_LAST_ERRCODE MDB_BAD_RSLOT /** @} */ /** @brief Statistics for a database in the environment */ @@ -301,6 +392,16 @@ typedef struct MDB_stat { size_t ms_entries; /**< Number of data items */ } MDB_stat; +/** @brief Information about the environment */ +typedef struct MDB_envinfo { + void *me_mapaddr; /**< Address of map, if fixed */ + size_t me_mapsize; /**< Size of the data memory map */ + size_t me_last_pgno; /**< ID of the last used page */ + size_t me_last_txnid; /**< ID of the last committed transaction */ + unsigned int me_maxreaders; /**< max reader slots in the environment */ + unsigned int me_numreaders; /**< max reader slots used in the environment */ +} MDB_envinfo; + /** @brief Return the mdb library version information. * * @param[out] major if non-NULL, the library major version number is copied here @@ -344,6 +445,7 @@ int mdb_env_create(MDB_env **env); * @param[in] flags Special options for this environment. This parameter * must be set to 0 or by bitwise OR'ing together one or more of the * values described here. + * Flags set by mdb_env_set_flags() are also used. *
    *
  • #MDB_FIXEDMAP * use a fixed address for the mmap region. This flag must be specified @@ -359,24 +461,52 @@ int mdb_env_create(MDB_env **env); * under that directory. With this option, \b path is used as-is for * the database main data file. The database lock file is the \b path * with "-lock" appended. - *
  • #MDB_NOSYNC - * Don't perform a synchronous flush after committing a transaction. This means - * transactions will exhibit the ACI (atomicity, consistency, and isolation) - * properties, but not D (durability); that is database integrity will be - * maintained but it is possible some number of the most recently committed - * transactions may be undone after a system crash. The number of transactions - * at risk is governed by how often the system flushes dirty buffers to disk - * and how often #mdb_env_sync() is called. This flag may be changed - * at any time using #mdb_env_set_flags(). - *
  • #MDB_NOMETASYNC - * Don't perform a synchronous flush of the meta page after committing - * a transaction. This is similar to the #MDB_NOSYNC case, but safer - * because the transaction data is still flushed. The meta page for any - * transaction N will be flushed by the data flush of transaction N+1. - * In case of a system crash, the last committed transaction may be - * lost. This flag may be changed at any time using #mdb_env_set_flags(). *
  • #MDB_RDONLY - * Open the environment in read-only mode. No write operations will be allowed. + * Open the environment in read-only mode. No write operations will be + * allowed. MDB will still modify the lock file - except on read-only + * filesystems, where MDB does not use locks. + *
  • #MDB_WRITEMAP + * Use a writeable memory map unless MDB_RDONLY is set. This is faster + * and uses fewer mallocs, but loses protection from application bugs + * like wild pointer writes and other bad updates into the database. + * Incompatible with nested transactions. + *
  • #MDB_NOMETASYNC + * Flush system buffers to disk only once per transaction, omit the + * metadata flush. Defer that until the system flushes files to disk, + * or next non-MDB_RDONLY commit or #mdb_env_sync(). This optimization + * maintains database integrity, but a system crash may undo the last + * committed transaction. I.e. it preserves the ACI (atomicity, + * consistency, isolation) but not D (durability) database property. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_NOSYNC + * Don't flush system buffers to disk when committing a transaction. + * This optimization means a system crash can corrupt the database or + * lose the last transactions if buffers are not yet flushed to disk. + * The risk is governed by how often the system flushes dirty buffers + * to disk and how often #mdb_env_sync() is called. However, if the + * filesystem preserves write order and the #MDB_WRITEMAP flag is not + * used, transactions exhibit ACI (atomicity, consistency, isolation) + * properties and only lose D (durability). I.e. database integrity + * is maintained, but a system crash may undo the final transactions. + * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with no + * hint for when to write transactions to disk, unless #mdb_env_sync() + * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_MAPASYNC + * When using #MDB_WRITEMAP, use asynchronous flushes to disk. + * As with #MDB_NOSYNC, a system crash can then corrupt the + * database or lose the last transactions. Calling #mdb_env_sync() + * ensures on-disk database integrity until next commit. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_NOTLS + * Don't use Thread-Local Storage. Tie reader locktable slots to + * #MDB_txn objects instead of to threads. I.e. #mdb_txn_reset() keeps + * the slot reserved for the #MDB_txn object. A thread may use parallel + * read-only transactions. A read-only transaction may span threads if + * the user synchronizes its use. Applications that multiplex many + * user threads over individual OS threads need this option. Such an + * application must also serialize the write transactions in an OS + * thread, since MDB's write locking is unaware of the user threads. *
* @param[in] mode The UNIX permissions to set on created files. This parameter * is ignored on Windows. @@ -385,13 +515,25 @@ int mdb_env_create(MDB_env **env); *
    *
  • #MDB_VERSION_MISMATCH - the version of the MDB library doesn't match the * version that created the database environment. - *
  • EINVAL - the environment file headers are corrupted. + *
  • #MDB_INVALID - the environment file headers are corrupted. *
  • ENOENT - the directory specified by the path parameter doesn't exist. *
  • EACCES - the user didn't have permission to access the environment files. *
  • EAGAIN - the environment was locked by another process. *
*/ -int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode); +int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode); + + /** @brief Copy an MDB environment to the specified path. + * + * This function may be used to make a backup of an existing environment. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] path The directory in which the copy will reside. This + * directory must already exist and be writable but must otherwise be + * empty. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copy(MDB_env *env, const char *path); /** @brief Return statistics about the MDB environment. * @@ -401,16 +543,24 @@ int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mod */ int mdb_env_stat(MDB_env *env, MDB_stat *stat); + /** @brief Return information about the MDB environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] stat The address of an #MDB_envinfo structure + * where the information will be copied + */ +int mdb_env_info(MDB_env *env, MDB_envinfo *stat); + /** @brief Flush the data buffers to disk. * * Data is always written to disk when #mdb_txn_commit() is called, * but the operating system may keep it buffered. MDB always flushes * the OS buffers upon commit as well, unless the environment was - * opened with #MDB_NOSYNC. + * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] force If non-zero, force the flush to occur. Otherwise + * @param[in] force If non-zero, force a synchronous flush. Otherwise * if the environment has the #MDB_NOSYNC flag set the flushes - * will be omitted. + * will be omitted, and with #MDB_MAPASYNC they will be asynchronous. * @return A non-zero error value on failure and 0 on success. 
Some possible * errors are: *