libdb/util/db_tuner.c
2012-11-14 15:13:24 -05:00

1335 lines
35 KiB
C

/*
* See the file LICENSE for redistribution information.
*
* Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*
* This utility chooses a reasonable page size for a BTREE database.
*
* Here we assume that:
* 1) This set of records is already in a BTREE database, which may have been
* configured with an unreasonable page size.
* 2) Treat the database as if it were compacted.
* 3) The goal is to optimize the database for the current content,
* rather than for ongoing insertions.
*
* The page size of a BTREE can be 512, 1024, 2048, 4096, 8192, 16384,
* 32768, 65536, totally 8 different cases. So we have 8 possible BTREE,
* each with different pagesize to contain them. Without actually creating
* those 8 databases, this utility tries to simulate situations, that is,
* for each pagesize, how many leaf pages, over flow pages and duplicate
* pages are needed, and what's the distribution of each kind of pages
* based on their fill factor.
*
* db_tuner contains 2 parts:
*
* I) Simulation of 8 different pagesized databases.
* This includes, the number of leaf pages, overflow pages caused by
* big key/data in leaf layers, and duplicate pages, the distribution
* of each kind of pages in different fill factor ranges.
*
* This is achieved by retrieving those records from existing btree and
* inserting them into different kind of pages. Since the records from
* the btree are sorted, they are inserted into the end of each page.
* If the page becomes full, that is, there is not enough space left, only
* the new record is put into the next new page.
*
* II) Recommend the best page size.
* From our simulation results, this utility chooses a page size based on
* the number of overflow pages and storage (on-disk space).
* If there is no overflow pages, then choose the one resulting in
* the smallest storage as the recommended page size. Otherwise,
* choose the one that results in a reasonable small number of overflow pages.
*/
#include "db_config.h"
#include <assert.h>
#include <math.h>
#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/btree.h"
#ifndef lint
static const char copyright[] =
"Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.\n";
#endif
/*
* Fill factor distribution division,
* e.g., [0.000 - 0.099], ..., [0.900 - 0.999], [1.000- 1.000]
*/
#define DIST_DIVISION 11
/* Error return codes in db_tuner, distinct from those in src\dbinc\db.in. */
/* Dist >= DIST_DIVISION. */
#define EXIT_DIST_OUTRANGE (-31000)
/* "Insert" zero needed. */
#define EXIT_INSERT_ZERO_NEED (-31001)
/* On-page duplicate set = 0. */
#define EXIT_INSERT_ZERO_ONPGDUP (-31002)
/* Follows are some special "insert" types based on the data. */
/* Insert as normal case. */
#define INSERT_NORMAL 0x0001
/* Nothing is inserted. */
#define INSERT_NOTHING 0x0002
/* No key but a slot is inserted. */
#define INSERT_SLOT 0x0003
/* A B_DUPLICATE point is inserted. */
#define INSERT_BDUPLICATE 0x0004
/*
* Page size of BTREE can be from DB_MIN_PGSIZE (512) to DB_MAX_PGSIZE(64K),
* and all the pagesize is the power of 2, so we have 8 possible cases.
*
* 8 is from ((int)(log(DB_MAX_PGSIZE) - log(DB_MIN_PGSIZE) + 1)).
*/
#define NUM_PGSIZES 8
/*
 * Structure used to store statistics of the assessment.
 *
 * The first index of every array selects the simulated page size: slot i
 * stands for a page size of (1 << i) * DB_MIN_PGSIZE.  The second index of
 * the *_dist arrays is the fill factor bucket computed by the
 * __tuner_record_*_pg functions.
 */
typedef struct __tuner_ff_stat {
/* Number of simulated leaf pages recorded in each fill factor bucket. */
uintmax_t pgsize_leaf_dist[NUM_PGSIZES][DIST_DIVISION];
/* Number of simulated overflow pages recorded in each fill factor bucket. */
uintmax_t pgsize_ovfl_dist[NUM_PGSIZES][DIST_DIVISION];
/* Number of simulated duplicate pages recorded in each fill factor bucket. */
uintmax_t pgsize_dup_dist[NUM_PGSIZES][DIST_DIVISION];
/* Info used to track stats across page in a traverse. */
/* Bytes used on the simulated leaf page being filled; 0 = none started. */
u_int32_t pg_leaf_offset[NUM_PGSIZES];
/* Bytes used on the simulated duplicate page being filled; 0 = none started. */
u_int32_t pg_dup_offset[NUM_PGSIZES];
}TUNER_FF_STAT;
/*
 * Forward declarations of the functions defined in this file.  The
 * definitions use old-style (K&R) parameter lists; __P is defined outside
 * this file and presumably hides the prototype from compilers that cannot
 * handle it -- confirm against the Berkeley DB headers.
 */
static int __tuner_analyze_btree __P((DB_ENV *, DB *, u_int32_t));
static int __tuner_ff_stat_callback __P((DBC *, PAGE *, void *, int *));
static int __tuner_generate_fillfactor_stats __P((DB_ENV *, DB *,
TUNER_FF_STAT *));
static int __tuner_insert_dupdata __P((DB *, u_int32_t, int, TUNER_FF_STAT *));
static int __tuner_insert_kvpair __P((DB *, u_int32_t, u_int32_t, int, int,
int, TUNER_FF_STAT *));
static int __tuner_leaf_page __P((DBC *, PAGE *, TUNER_FF_STAT *));
static int __tuner_leaf_dupdata __P((DBC*, PAGE *, int, int, u_int32_t,
TUNER_FF_STAT *));
static int __tuner_leaf_dupdata_entries __P((DBC *, PAGE *, int, int, int, int,
TUNER_FF_STAT *));
static int __tuner_opd_data_entries __P((DBC *, PAGE *, int, int,
TUNER_FF_STAT *));
static int __tuner_opd_data __P((DBC *, PAGE *, int, int, TUNER_FF_STAT *));
static int __tuner_opd_page __P((DBC *, PAGE *, TUNER_FF_STAT *));
static int __tuner_print_btree_fillfactor __P((u_int32_t, TUNER_FF_STAT *));
static int __tuner_record_dup_pg __P((int, TUNER_FF_STAT *));
static int __tuner_record_last_opd __P((int, TUNER_FF_STAT *));
static int __tuner_record_leaf_pg __P((int, TUNER_FF_STAT *));
static int __tuner_record_ovfl_pg __P((u_int32_t, int, TUNER_FF_STAT *));
static int get_opd_size __P((DBC*, PAGE*, u_int32_t*));
static int item_size __P((DB *, PAGE *, db_indx_t));
static int item_space __P((DB *, PAGE *, db_indx_t));
int main __P((int, char *[]));
static int open_db __P((DB **, DB_ENV *, char *, char *));
static int sum_opd_page_data_entries __P((DB *, PAGE *));
static int usage __P((void));
static int version_check __P((void));
/* Program name: used as the error prefix and in usage/version messages. */
const char *progname = "db_tuner";
/*
 * main --
 *	Parse the command line, open the environment and the database, run
 *	the page size analysis if the database is a btree, then close the
 *	handles.
 *
 * Options: -c cachesize, -d file (required), -h home, -s database, -v.
 *
 * Returns EXIT_SUCCESS only when every step, including closing the
 * handles, succeeded; any failure yields EXIT_FAILURE.
 */
int
main(argc, argv)
	int argc;
	char *argv[];
{
	extern char *optarg;
	DB *dbp;
	DB_ENV *dbenv;
	DBTYPE dbtype;
	char *dbname, *home, *subdb;
	int ch, is_set_dbfile, ret, t_ret;
	u_int32_t cachesize, verbose;

	if ((ret = version_check()) != 0)
		return (ret);

	dbenv = NULL;
	dbp = NULL;
	cachesize = 0;
	dbname = home = subdb = NULL;
	is_set_dbfile = verbose = 0;
	dbtype = DB_UNKNOWN;

	while ((ch = getopt(argc, argv, "c:d:h:vs:")) != EOF)
		switch (ch) {
		case 'c':
			cachesize = atoi(optarg);
			break;
		case 'd':
			dbname = optarg;
			is_set_dbfile = 1;
			break;
		case 'h':
			home = optarg;
			break;
		case 's':
			subdb = optarg;
			break;
		case 'v':
			verbose = 1;
			break;
		default:
			usage();
		}

	/* Handle possible interruptions. */
	__db_util_siginit();

	/* The database file is mandatory; usage() does not return. */
	if (!is_set_dbfile)
		usage();

	if ((ret = db_env_create(&dbenv, 0)) != 0) {
		fprintf(stderr, "%s: db_env_create: %s\n",
		    progname, db_strerror(ret));
		goto err;
	}

	dbenv->set_errfile(dbenv, stderr);
	dbenv->set_errpfx(dbenv, progname);

	if ((cachesize != 0) && (ret =
	    dbenv->set_cachesize(dbenv, (u_int32_t)0, cachesize, 1)) != 0) {
		dbenv->err(dbenv, ret, "DB_ENV->set_cachesize:");
		goto err;
	}

	/*
	 * If attaching to a pre-existing environment fails, create a
	 * private one and try again.
	 */
	if ((ret = dbenv->open(dbenv, home, DB_USE_ENVIRON, 0)) != 0 &&
	    (ret == DB_VERSION_MISMATCH || (ret = dbenv->open(dbenv, home,
	    DB_CREATE | DB_INIT_MPOOL | DB_USE_ENVIRON | DB_PRIVATE,
	    0)) != 0)) {
		dbenv->err(dbenv, ret, "DB_ENV->open:");
		goto err;
	}

	if ((ret = open_db(&dbp, dbenv, dbname, subdb)) != 0) {
		dbenv->err(dbenv, ret, "open_db:");
		goto err;
	}

	if ((ret = dbp->get_type(dbp, &dbtype)) != 0) {
		dbenv->err(dbenv, ret, "DB->get_type:");
		goto err;
	}

	switch (dbtype) {
	case DB_BTREE:
		if ((ret = __tuner_analyze_btree(dbenv, dbp, verbose)) != 0)
			dbenv->err(dbenv, ret, "__tuner_analyze_btree fails.");
		break;
	default:
		dbenv->errx(dbenv, DB_STR("5001",
		    "%s: Unsupported database type"), progname);
		/* Nothing was analyzed, so do not report success. */
		ret = EXIT_FAILURE;
	}

err:
	/*
	 * Use a separate variable for the close results: a successful close
	 * must not erase an error recorded earlier in ret.
	 */
	if (dbp != NULL && (t_ret = dbp->close(dbp, 0)) != 0) {
		dbenv->err(dbenv, t_ret, "DB->close: %s", dbname);
		if (ret == 0)
			ret = t_ret;
	}
	if (dbenv != NULL && (t_ret = dbenv->close(dbenv, 0)) != 0) {
		fprintf(stderr, "%s: dbenv->close: %s\n", progname,
		    db_strerror(t_ret));
		if (ret == 0)
			ret = t_ret;
	}

	/* Resend any caught signal. */
	__db_util_sigresend();

	return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}
/*
 * __tuner_analyze_btree --
 *	Build the simulated per-page-size statistics for the database and
 *	hand them to the report/recommendation printer.
 *
 * Returns the error from the statistics pass if it fails, otherwise
 * EXIT_SUCCESS (the printer's return value is ignored).
 */
static int
__tuner_analyze_btree(dbenv, dbp, verbose)
	DB_ENV *dbenv;
	DB *dbp;
	u_int32_t verbose;
{
	TUNER_FF_STAT ff_stats;
	int rc;

	/* Every counter and every running offset starts at zero. */
	memset(&ff_stats, 0, sizeof(ff_stats));

	rc = __tuner_generate_fillfactor_stats(dbenv, dbp, &ff_stats);
	if (rc != 0) {
		dbenv->err(dbenv, rc,
		    "__tuner_generate_fillfactor_stats fails.");
		return (rc);
	}

	(void)__tuner_print_btree_fillfactor(verbose, &ff_stats);

	return (EXIT_SUCCESS);
}
/*
 * __tuner_generate_fillfactor_stats --
 *	Walk the btree with __bam_traverse, feeding every page to
 *	__tuner_ff_stat_callback, then record the simulated leaf and
 *	duplicate pages that are still being filled for each page size.
 *
 * Returns 0 on success.  The first error wins; a cursor close failure is
 * reported only when nothing else failed.
 */
static int
__tuner_generate_fillfactor_stats(dbenv, dbp, stats)
	DB_ENV *dbenv;
	DB *dbp;
	TUNER_FF_STAT *stats;
{
	DBC *cursor;
	int close_rc, pgsz_idx, rc;

	close_rc = 0;

	if ((rc = dbp->cursor(dbp, NULL, &cursor, 0)) != 0) {
		dbenv->err(dbenv, rc, "DB_ENV->cursor:");
		return (rc);
	}

	/*
	 * Internal Berkeley DB function that triggers the callback for each
	 * page in a btree database.
	 */
	rc = __bam_traverse(cursor, DB_LOCK_READ, PGNO_INVALID,
	    __tuner_ff_stat_callback, (void *)stats);
	if (rc != 0)
		dbenv->err(dbenv, rc, "__bam_traverse:");
	else {
		/*
		 * Pages still open at the end of the walk have not been
		 * counted yet; a non-zero offset marks such a page.
		 */
		for (pgsz_idx = 0; pgsz_idx < NUM_PGSIZES; ++pgsz_idx) {
			if (stats->pg_leaf_offset[pgsz_idx] > 0) {
				rc = __tuner_record_leaf_pg(pgsz_idx, stats);
				if (rc != 0) {
					dbenv->err(dbenv, rc,
					    "__tuner_record_leaf");
					break;
				}
			}
			if (stats->pg_dup_offset[pgsz_idx] > 0) {
				rc = __tuner_record_dup_pg(pgsz_idx, stats);
				if (rc != 0) {
					dbenv->err(dbenv, rc,
					    "__tuner_record_dup_pg");
					break;
				}
			}
		}
	}

	if (cursor != NULL && (close_rc = cursor->close(cursor)) != 0)
		dbenv->err(dbenv, close_rc, "DBC->close:");

	if (rc == 0 && close_rc != 0)
		rc = close_rc;

	return (rc);
}
/*
* This callback is used in __bam_traverse. When traversing each page in
* the BTREE, it retrieves each record for simulation.
*/
static int
__tuner_ff_stat_callback(dbc, h, cookie, putp)
DBC *dbc;
PAGE *h;
void *cookie;
int *putp;
{
DB_ENV *dbenv;
int ret;
dbenv = dbc->dbenv;
*putp = 0;
switch (TYPE(h)) {
case P_LBTREE:
if ((ret = __tuner_leaf_page(dbc, h, cookie)) != 0) {
dbenv->err(dbenv, ret, "__tuner_leaf_page");
return (ret);
}
break;
case P_LDUP:
case P_LRECNO:
/* Coming a new off-page duplicate set.*/
if (h->prev_pgno == PGNO_INVALID &&
(ret = __tuner_opd_page(dbc, h, cookie)) != 0) {
dbenv->err(dbenv, ret, "__tuner_opd_page:");
return (ret);
}
break;
case P_IBTREE:
case P_IRECNO:
case P_OVERFLOW:
break;
default:
return (EXIT_FAILURE);
}
return (EXIT_SUCCESS);
}
/*
 * __tuner_leaf_page --
 *	Deal with one leaf page of the existing database:
 *	1: find each on-page duplicate set and calculate its total size;
 *	2: let __tuner_leaf_dupdata decide, per simulated page size, whether
 *	   the set stays on-page or moves off-page, and "move" it;
 *	3: "move" the remaining key/data pairs to the simulated leaf pages.
 *
 * Entries are key/data pairs, so the index advances P_INDX at a time.
 * Deleted items are skipped.
 *
 * Returns 0 on success, EXIT_INSERT_ZERO_ONPGDUP if a duplicate set adds
 * up to no space at all, or the first error from a helper.
 */
static int
__tuner_leaf_page(dbc, h, cookie)
	DBC *dbc;
	PAGE *h;
	TUNER_FF_STAT *cookie;
{
	DB *dbp;
	DB_ENV *dbenv;
	db_indx_t findx, lindx, indx, *inp, top;
	u_int32_t data_sz, key_sz, onpd_sz;
	int i, ret, in_data_type;

	dbp = dbc->dbp;
	dbenv = dbp->dbenv;

	/*
	 * Use some macros from db_page.h to retrieve information from the
	 * page.  P_INP retrieves the offset to the start of page index array.
	 * NUM_ENT retrieves the number of items on the page.
	 */
	inp = P_INP(dbp, h);
	top = NUM_ENT(h);
	ret = 0;

	/*
	 * The loop has no increment clause: every path through the body has
	 * to advance indx itself before it continues, otherwise the same
	 * entry is examined forever.
	 */
	for (indx = 0; indx < top;) {
		/*
		 * If on-page duplicate, first calculate the total size,
		 * including one key and all data.  Two neighbouring pairs
		 * whose key slots hold the same offset share one key.
		 */
		onpd_sz = 0;
		if ((indx + P_INDX) < top && inp[indx] == inp[indx + P_INDX]) {
			/* Ignore deleted items. */
			if (B_DISSET(GET_BKEYDATA(dbp, h, indx)->type)) {
				indx += P_INDX;
				continue;
			}

			/* Count the key once. */
			onpd_sz += item_space(dbp, h, indx);

			for (findx = indx;
			    indx < top && inp[findx] == inp[indx];
			    indx += P_INDX) {
				/* Ignore deleted items. */
				if (B_DISSET(GET_BKEYDATA(dbp, h,
				    indx + O_INDX)->type))
					continue;
				/* Count all the data items. */
				onpd_sz += item_space(dbp, h, indx + O_INDX);
			}

			/* Indx range of on-page duplicate set: [findx, lindx) */
			lindx = indx;

			if (onpd_sz == 0)
				return (EXIT_INSERT_ZERO_ONPGDUP);

			/* "Move" on-page duplicate set to simulated pages. */
			if ((ret = __tuner_leaf_dupdata(dbc, h, findx, lindx,
			    onpd_sz, cookie)) != 0) {
				dbenv->err(dbenv, ret, "__tuner_leaf_dupdata");
				return (ret);
			}
		} else {
			in_data_type = INSERT_NORMAL;

			/* Ignore deleted items (key or data flagged). */
			if (B_DISSET(GET_BKEYDATA(dbp, h, indx)->type) ||
			    B_DISSET(GET_BKEYDATA(dbp, h,
			    indx + O_INDX)->type)) {
				indx += P_INDX;
				continue;
			}

			/* First consider key. */
			key_sz = item_size(dbp, h, indx);

			/* Next consider data. */
			if (B_TYPE(GET_BKEYDATA(dbp, h,
			    indx + O_INDX)->type) == B_DUPLICATE) {
				/*
				 * Off-page duplicate set is not handled here
				 * but on the duplicate pages.  Here only the
				 * key is inserted into the "simulated"
				 * leaf page.
				 */
				in_data_type = INSERT_NOTHING;
				data_sz = 0;
			} else
				data_sz = item_size(dbp, h, indx + O_INDX);

			for (i = 0; i < NUM_PGSIZES; ++i) {
				if ((ret = __tuner_insert_kvpair(dbp, key_sz,
				    data_sz, i, INSERT_NORMAL, in_data_type,
				    cookie)) != 0) {
					dbenv->err(dbenv, ret,
					    "__tuner_insert_kvpair");
					/*
					 * Stop here: carrying on would let a
					 * later success overwrite the error.
					 */
					return (ret);
				}
			}

			indx += P_INDX;
		}
	}

	return (ret);
}
/*
 * __tuner_leaf_dupdata --
 *	"Move" one on-page duplicate set, entries [findx, lindx) of page h,
 *	into every simulated database.
 *
 * For each page size the set becomes an off-page duplicate tree when its
 * total size (dup_sz) is at least a quarter of the page size; otherwise it
 * stays on the simulated leaf page.  After an off-page set has been moved,
 * its last duplicate page is recorded and the duplicate offset is reset
 * for the next set.
 *
 * Returns EXIT_SUCCESS, or the first error from a helper.
 */
static int
__tuner_leaf_dupdata(dbc, h, findx, lindx, dup_sz, cookie)
	DBC *dbc;
	PAGE *h;
	int findx, lindx;
	u_int32_t dup_sz;
	TUNER_FF_STAT *cookie;
{
	DB_ENV *dbenv;
	u_int32_t page_bytes;
	int goes_offpage, pgsz_idx, rc;

	dbenv = dbc->dbenv;

	for (pgsz_idx = 0; pgsz_idx < NUM_PGSIZES; ++pgsz_idx) {
		page_bytes = (1 << pgsz_idx) * DB_MIN_PGSIZE;

		/* Does this duplicate set go to an off-page tree? */
		goes_offpage = dup_sz >= (page_bytes / 4);

		/* "Move" the set to the simulated pages of this size. */
		rc = __tuner_leaf_dupdata_entries(dbc, h, findx, lindx,
		    pgsz_idx, goes_offpage, cookie);
		if (rc != 0) {
			dbenv->err(dbenv, rc, "__tuner_leaf_dupdata_entries");
			return (rc);
		}

		if (!goes_offpage)
			continue;

		/* Close the finished off-page set. */
		rc = __tuner_record_last_opd(pgsz_idx, cookie);
		if (rc != 0) {
			dbenv->err(dbenv, rc, "__tuner_record_last_opd");
			return (rc);
		}
	}

	return (EXIT_SUCCESS);
}
/*
 * __tuner_leaf_dupdata_entries --
 *	"Move" the on-page duplicate set [findx, lindx) of page h to the
 *	simulated database with page size (1 << indx_pgsz) * DB_MIN_PGSIZE.
 *
 * The key is taken into account only with the first pair of the set.  When
 * the set goes off-page (is_opd), a key + B_DUPLICATE pair is inserted on
 * the simulated leaf page and every data item goes to the simulated
 * duplicate pages.  Otherwise every data item is inserted on the leaf
 * page: with the real key for the first pair, with a bare index slot for
 * the others.  Deleted items are skipped.
 *
 * Returns EXIT_SUCCESS, or the first error from an insert helper.
 */
static int
__tuner_leaf_dupdata_entries(dbc, h, findx, lindx, indx_pgsz, is_opd, cookie)
	DBC *dbc;
	PAGE *h;
	int findx, lindx, indx_pgsz, is_opd;
	TUNER_FF_STAT *cookie;
{
	DB *dbp;
	DB_ENV *dbenv;
	db_indx_t cur;
	u_int32_t data_len, key_len;
	int key_mode, rc;

	dbp = dbc->dbp;
	dbenv = dbp->dbenv;

	for (cur = findx; cur < lindx; cur += P_INDX) {
		if (cur == findx) {
			/* Ignore deleted items. */
			if (B_DISSET(GET_BKEYDATA(dbp, h, cur)->type))
				continue;

			key_len = item_size(dbp, h, cur);
			key_mode = INSERT_NORMAL;

			/*
			 * Off-page set: the leaf page holds the key plus a
			 * B_DUPLICATE reference (INSERT_BDUPLICATE).
			 */
			if (is_opd) {
				rc = __tuner_insert_kvpair(dbp, key_len, 0,
				    indx_pgsz, INSERT_NORMAL,
				    INSERT_BDUPLICATE, cookie);
				if (rc != 0) {
					dbenv->err(dbenv, rc,
					    "__tuner_insert_kvpair");
					return (rc);
				}
			}
		} else {
			/* Later pairs share the key: only a slot is needed. */
			key_len = 0;
			key_mode = INSERT_SLOT;
		}

		/* Ignore deleted items. */
		if (B_DISSET(GET_BKEYDATA(dbp, h, cur + O_INDX)->type))
			continue;

		data_len = item_size(dbp, h, cur + O_INDX);

		if (is_opd) {
			rc = __tuner_insert_dupdata(dbp, data_len,
			    indx_pgsz, cookie);
			if (rc != 0) {
				dbenv->err(dbenv, rc,
				    "__tuner_insert_dupdata");
				return (rc);
			}
			continue;
		}

		rc = __tuner_insert_kvpair(dbp, key_len, data_len,
		    indx_pgsz, key_mode, INSERT_NORMAL, cookie);
		if (rc != 0) {
			dbenv->err(dbenv, rc, "__tuner_insert_kvpair");
			return (rc);
		}
	}

	return (EXIT_SUCCESS);
}
/*
 * __tuner_opd_page --
 *	Handle one off-page duplicate set of the existing database; h is the
 *	first page of the set.
 *
 * The total size of the set is computed once.  Then, for each simulated
 * page size, the set is replayed either as an off-page set (size at least
 * a quarter of the page) or as on-page duplicates.
 *
 * Returns 0 on success, or the first error from a helper.
 */
static int
__tuner_opd_page(dbc, h, cookie)
	DBC *dbc;
	PAGE *h;
	TUNER_FF_STAT *cookie;
{
	DB_ENV *dbenv;
	u_int32_t page_bytes, set_bytes;
	int pgsz_idx, rc, stays_offpage;

	dbenv = dbc->dbenv;
	set_bytes = 0;

	/* First: total size of the duplicate set. */
	rc = get_opd_size(dbc, h, &set_bytes);
	if (rc != 0) {
		dbenv->err(dbenv, rc, "get_opd_size:");
		return (rc);
	}

	/* Second: replay the set on the "simulated" pages of each size. */
	for (pgsz_idx = 0; pgsz_idx < NUM_PGSIZES; ++pgsz_idx) {
		page_bytes = (1 << pgsz_idx) * DB_MIN_PGSIZE;
		stays_offpage = set_bytes >= (page_bytes / 4);

		rc = __tuner_opd_data(dbc, h, pgsz_idx, stays_offpage, cookie);
		if (rc != 0) {
			dbenv->err(dbenv, rc, "__tuner_opd_data:");
			return (rc);
		}
	}

	return (rc);
}
/*
 * __tuner_opd_data --
 *	"Move" all the data of one off-page duplicate set into the simulated
 *	database with page size (1 << indx_pgsz) * DB_MIN_PGSIZE, either as
 *	an off-page set (is_opd) or as on-page duplicates.
 *
 * h is the first page of the set and belongs to the caller.  The following
 * pages are reached through next_pgno, fetched from the memory pool and
 * given back here, including on the error path.
 *
 * Returns EXIT_SUCCESS, or the first error encountered.
 */
static int
__tuner_opd_data(dbc, h, indx_pgsz, is_opd, cookie)
	DBC *dbc;
	PAGE *h;
	int indx_pgsz, is_opd;
	TUNER_FF_STAT *cookie;
{
	DB *dbp;
	DB_ENV *dbenv;
	DB_MPOOLFILE *mpf;
	PAGE *p;
	db_pgno_t next_pgno;
	int ret;

	dbp = dbc->dbp;
	dbenv = dbp->dbenv;
	mpf = dbp->mpf;
	p = h;

	/*
	 * __tuner_leaf_page has inserted one key for each opd already,
	 * so here only a B_DUPLICATE point is inserted into the simulated
	 * leaf page if this duplicate set stays off-page.
	 */
	if (is_opd) {
		ret = __tuner_insert_kvpair(dbp, 0, 0, indx_pgsz,
		    INSERT_NOTHING, INSERT_BDUPLICATE, cookie);
		if (ret != 0) {
			dbenv->err(dbenv, ret, "__tuner_insert_kvpair");
			return (ret);
		}
	}

	/* Next insert all the data of this duplicate set, page by page. */
	for (;;) {
		/* Process the page currently visited, not the chain head. */
		ret = __tuner_opd_data_entries(dbc, p, indx_pgsz, is_opd,
		    cookie);
		if (ret != 0) {
			dbenv->err(dbenv, ret, "__tuner_opd_data_entries");
			/* Do not leak a page fetched by this function. */
			if (p != h)
				(void)mpf->put(mpf, p, dbc->priority, 0);
			return (ret);
		}

		/* Read the link before the page is given back. */
		next_pgno = p->next_pgno;

		if (p != h && (ret = mpf->put(mpf, p, dbc->priority, 0)) != 0) {
			dbenv->err(dbenv, ret, "DB_MPOOLFILE->put:");
			return (ret);
		}

		if (next_pgno == PGNO_INVALID)
			break;

		if ((ret = mpf->get(mpf, &next_pgno, dbc->txn, 0, &p)) != 0) {
			dbenv->err(dbenv, ret, "DB_MPOOLFILE->get:");
			return (ret);
		}
	}

	/* Record the last simulated duplicate page of an off-page set. */
	if (is_opd && (ret = __tuner_record_last_opd(indx_pgsz, cookie)) != 0) {
		dbenv->err(dbenv, ret, "__tuner_record_last_opd");
		return (ret);
	}

	return (EXIT_SUCCESS);
}
/*
 * __tuner_opd_data_entries --
 *	"Move" every live data item of one off-page duplicate page (h) to
 *	the simulated database with page size index indx_pgsz.
 *
 * When the set stays off-page (is_opd) the item goes to the simulated
 * duplicate pages.  Otherwise it is inserted on the simulated leaf page
 * together with a bare key slot, because the key itself is accounted for
 * by __tuner_leaf_page.  Deleted items are skipped.
 *
 * Returns EXIT_SUCCESS, or the first error from an insert helper.
 */
static int
__tuner_opd_data_entries(dbc, h, indx_pgsz, is_opd, cookie)
	DBC *dbc;
	PAGE *h;
	int indx_pgsz, is_opd;
	TUNER_FF_STAT *cookie;
{
	DB *dbp;
	DB_ENV *dbenv;
	db_indx_t slot;
	u_int32_t item_len;
	int rc;

	dbp = dbc->dbp;
	dbenv = dbp->dbenv;

	for (slot = 0; slot < NUM_ENT(h); slot += O_INDX) {
		/* Ignore deleted items. */
		if (B_DISSET(GET_BKEYDATA(dbp, h, slot)->type))
			continue;

		item_len = item_size(dbp, h, slot);

		if (!is_opd) {
			/* Only the data item and a key slot are inserted. */
			rc = __tuner_insert_kvpair(dbp, 0, item_len,
			    indx_pgsz, INSERT_SLOT, INSERT_NORMAL,
			    cookie);
			if (rc != 0) {
				dbenv->err(dbenv, rc,
				    "__tuner_insert_kvpair");
				return (rc);
			}
			continue;
		}

		rc = __tuner_insert_dupdata(dbp, item_len,
		    indx_pgsz, cookie);
		if (rc != 0) {
			dbenv->err(dbenv, rc,
			    "__tuner_insert_dupdata");
			return (rc);
		}
	}

	return (EXIT_SUCCESS);
}
/*
 * __tuner_insert_kvpair --
 *	Try to insert a key and data pair into the simulated leaf page of
 *	page size index indx_pgsz.  Key and data are always stored (or
 *	referenced) on the same leaf page.
 *
 * An item larger than B_MINKEY_TO_OVFLSIZE(dbp, 2, pgsize) costs
 * BOVERFLOW_PSIZE on the leaf page and is additionally recorded on the
 * simulated overflow pages.  Otherwise the cost depends on the mode:
 *	in_key:  INSERT_NOTHING  - no key, no slot
 *		 INSERT_SLOT     - only a slot in the index array
 *		 INSERT_NORMAL   - the key itself
 *	in_data: INSERT_NOTHING  - nothing (the data is handled elsewhere)
 *		 INSERT_BDUPLICATE - a B_DUPLICATE reference
 *		 INSERT_NORMAL   - the data item itself
 *
 * If the pair does not fit on the current simulated page, that page is
 * recorded and the pair starts a new one.
 *
 * Returns EXIT_SUCCESS, EXIT_INSERT_ZERO_NEED when nothing at all would be
 * inserted, or the error from a record helper.
 */
static int
__tuner_insert_kvpair(dbp, key_sz, data_sz, indx_pgsz, in_key, in_data, stats)
	DB *dbp;
	u_int32_t key_sz, data_sz;
	int indx_pgsz, in_key, in_data;
	TUNER_FF_STAT *stats;
{
	DB_ENV *dbenv;
	u_int32_t bytes, page_bytes;
	int rc;

	dbenv = dbp->dbenv;
	page_bytes = (1 << indx_pgsz) * DB_MIN_PGSIZE;
	bytes = 0;

	/* Space taken by the key part. */
	if (key_sz > B_MINKEY_TO_OVFLSIZE(dbp, 2, page_bytes)) {
		bytes += BOVERFLOW_PSIZE;
		/* Add big key into ovfl pages. */
		rc = __tuner_record_ovfl_pg(key_sz, indx_pgsz, stats);
		if (rc != 0) {
			dbenv->err(dbenv, rc, "__tuner_record_ovfl_pg:key_sz");
			return (rc);
		}
	} else {
		switch (in_key) {
		case INSERT_SLOT:
			bytes += sizeof(db_indx_t);
			break;
		case INSERT_NORMAL:
			bytes += BKEYDATA_PSIZE(key_sz);
			break;
		default:
			/* INSERT_NOTHING: no key and no slot. */
			break;
		}
	}

	/* Space taken by the data part. */
	if (data_sz > B_MINKEY_TO_OVFLSIZE(dbp, 2, page_bytes)) {
		bytes += BOVERFLOW_PSIZE;
		/* Add big data into ovfl pages. */
		rc = __tuner_record_ovfl_pg(data_sz, indx_pgsz, stats);
		if (rc != 0) {
			dbenv->err(dbenv, rc, "__tuner_record_ovfl_pg");
			return (rc);
		}
	} else {
		switch (in_data) {
		case INSERT_BDUPLICATE:
			bytes += BOVERFLOW_PSIZE;
			break;
		case INSERT_NORMAL:
			bytes += BKEYDATA_PSIZE(data_sz);
			break;
		default:
			/* INSERT_NOTHING: the data is counted elsewhere. */
			break;
		}
	}

	if (bytes == 0)
		return (EXIT_INSERT_ZERO_NEED);

	/* First leaf page: account for the page overhead. */
	if (stats->pg_leaf_offset[indx_pgsz] == 0)
		stats->pg_leaf_offset[indx_pgsz] = SIZEOF_PAGE;

	if ((stats->pg_leaf_offset[indx_pgsz] + bytes) <= page_bytes) {
		stats->pg_leaf_offset[indx_pgsz] += bytes;
		return (EXIT_SUCCESS);
	}

	/* Not enough space: record the current page, open a new one. */
	rc = __tuner_record_leaf_pg(indx_pgsz, stats);
	if (rc != 0) {
		dbenv->err(dbenv, rc, "__tuner_record_leaf_pg");
		return (rc);
	}
	stats->pg_leaf_offset[indx_pgsz] = bytes + SIZEOF_PAGE;

	return (EXIT_SUCCESS);
}
/*
 * __tuner_insert_dupdata --
 *	Try to insert one duplicate data item into the simulated off-page
 *	duplicate page of page size index indx_pgsz.
 *
 * An item larger than B_MINKEY_TO_OVFLSIZE(dbp, 2, pgsize) costs
 * BOVERFLOW_PSIZE on the duplicate page and is recorded on the simulated
 * overflow pages; any other item costs BKEYDATA_PSIZE(data_sz).  If the
 * item does not fit on the current simulated page, that page is recorded
 * and the item starts a new one.
 *
 * Returns EXIT_SUCCESS, EXIT_INSERT_ZERO_NEED when the item needs no
 * space, or the error from a record helper.
 */
static int
__tuner_insert_dupdata(dbp, data_sz, indx_pgsz, stats)
	DB *dbp;
	u_int32_t data_sz;
	int indx_pgsz;
	TUNER_FF_STAT *stats;
{
	DB_ENV *dbenv;
	u_int32_t bytes, page_bytes;
	int rc;

	dbenv = dbp->dbenv;
	page_bytes = (1 << indx_pgsz) * DB_MIN_PGSIZE;

	if (data_sz > B_MINKEY_TO_OVFLSIZE(dbp, 2, page_bytes)) {
		bytes = BOVERFLOW_PSIZE;
		/* Add big data into ovfl pages. */
		rc = __tuner_record_ovfl_pg(data_sz, indx_pgsz, stats);
		if (rc != 0) {
			dbenv->err(dbenv, rc, "__tuner_record_ovfl_pg");
			return (rc);
		}
	} else
		bytes = BKEYDATA_PSIZE(data_sz);

	if (bytes == 0)
		return (EXIT_INSERT_ZERO_NEED);

	/* First duplicate page: account for the page overhead. */
	if (stats->pg_dup_offset[indx_pgsz] == 0)
		stats->pg_dup_offset[indx_pgsz] = SIZEOF_PAGE;

	if ((stats->pg_dup_offset[indx_pgsz] + bytes) <= page_bytes) {
		stats->pg_dup_offset[indx_pgsz] += bytes;
		return (EXIT_SUCCESS);
	}

	/* Not enough space: record the current page, open a new one. */
	rc = __tuner_record_dup_pg(indx_pgsz, stats);
	if (rc != 0) {
		dbenv->err(dbenv, rc, "__tuner_record_dup_pg");
		return (rc);
	}
	stats->pg_dup_offset[indx_pgsz] = bytes + SIZEOF_PAGE;

	return (EXIT_SUCCESS);
}
/*
 * __tuner_record_ovfl_pg --
 *	Insert a big item of the given size into the simulated overflow
 *	pages of page size index indx_pgsz.
 *
 * Every page offers (pgsize - SIZEOF_PAGE) bytes.  The completely filled
 * pages are added to the last ("full") distribution bucket.  The bytes
 * left over, if any, occupy one more page, which is added to the bucket
 * matching its fill factor.  An item that is an exact multiple of the
 * usable space has no leftover and therefore needs no extra page.
 *
 * Returns EXIT_SUCCESS, or EXIT_DIST_OUTRANGE if the computed bucket is
 * outside the distribution table.
 */
static int
__tuner_record_ovfl_pg(size, indx_pgsz, stats)
	u_int32_t size;
	int indx_pgsz;
	TUNER_FF_STAT *stats;
{
	u_int32_t pgsize, usable;
	int dist;

	pgsize = (1 << indx_pgsz) * DB_MIN_PGSIZE;
	usable = pgsize - SIZEOF_PAGE;

	/* Update OVFLPAGE list: 1. Add to "full" ovfl pages. */
	stats->pgsize_ovfl_dist[indx_pgsz][DIST_DIVISION - 1] +=
	    (size / usable);

	/* Update OVFLPAGE list: 2. Add the remainder, if there is one. */
	size = size % usable;
	if (size == 0)
		return (EXIT_SUCCESS);

	dist = (int)(((double)(size + SIZEOF_PAGE) *
	    (DIST_DIVISION - 1)) / pgsize);

	/* assert(dist < DIST_DIVISION); */
	if (dist >= DIST_DIVISION)
		return (EXIT_DIST_OUTRANGE);

	++stats->pgsize_ovfl_dist[indx_pgsz][dist];

	return (EXIT_SUCCESS);
}
/*
 * __tuner_record_leaf_pg --
 *	Count the simulated leaf page currently being filled for page size
 *	index indx_pgsz: its fill factor (bytes used / page size) selects
 *	one of the DIST_DIVISION buckets, whose counter is incremented.
 *
 * The running offset is left untouched.
 *
 * Returns EXIT_SUCCESS, or EXIT_DIST_OUTRANGE if the bucket is outside
 * the distribution table.
 */
static int
__tuner_record_leaf_pg(indx_pgsz, stats)
	int indx_pgsz;
	TUNER_FF_STAT *stats;
{
	u_int32_t page_bytes;
	double scaled_fill;
	int bucket;

	page_bytes = (1 << indx_pgsz) * DB_MIN_PGSIZE;

	/* Fill factor scaled to the bucket range, then truncated. */
	scaled_fill = ((double)stats->pg_leaf_offset[indx_pgsz] *
	    (DIST_DIVISION - 1)) / page_bytes;
	bucket = (int)scaled_fill;

	/* assert(bucket < DIST_DIVISION); */
	if (bucket >= DIST_DIVISION)
		return (EXIT_DIST_OUTRANGE);

	/* One more page in that fill factor range. */
	stats->pgsize_leaf_dist[indx_pgsz][bucket] += 1;

	return (EXIT_SUCCESS);
}
/*
 * __tuner_record_dup_pg --
 *	Count the simulated duplicate page currently being filled for page
 *	size index indx_pgsz: its fill factor (bytes used / page size)
 *	selects one of the DIST_DIVISION buckets, whose counter is
 *	incremented.
 *
 * The running offset is left untouched.
 *
 * Returns EXIT_SUCCESS, or EXIT_DIST_OUTRANGE if the bucket is outside
 * the distribution table.
 */
static int
__tuner_record_dup_pg(indx_pgsz, stats)
	int indx_pgsz;
	TUNER_FF_STAT *stats;
{
	u_int32_t page_bytes;
	double scaled_fill;
	int bucket;

	page_bytes = (1 << indx_pgsz) * DB_MIN_PGSIZE;

	/* Fill factor scaled to the bucket range, then truncated. */
	scaled_fill = ((double)stats->pg_dup_offset[indx_pgsz] *
	    (DIST_DIVISION - 1)) / page_bytes;
	bucket = (int)scaled_fill;

	/* assert(bucket < DIST_DIVISION); */
	if (bucket >= DIST_DIVISION)
		return (EXIT_DIST_OUTRANGE);

	/* One more page in that fill factor range. */
	stats->pgsize_dup_dist[indx_pgsz][bucket] += 1;

	return (EXIT_SUCCESS);
}
/*
 * __tuner_record_last_opd --
 *	Close an off-page duplicate set for page size index indx_pgsz: if a
 *	simulated duplicate page is in progress it is recorded, then the
 *	offset is reset to zero so the next set starts on a fresh page.
 *
 * Returns EXIT_SUCCESS, or the error from __tuner_record_dup_pg (in which
 * case the offset is not reset).
 */
static int
__tuner_record_last_opd(indx_pgsz, stats)
	int indx_pgsz;
	TUNER_FF_STAT *stats;
{
	int rc;

	if (stats->pg_dup_offset[indx_pgsz] != 0) {
		rc = __tuner_record_dup_pg(indx_pgsz, stats);
		if (rc != 0)
			return (rc);
	}

	/* Reset offset to zero for new opd set. */
	stats->pg_dup_offset[indx_pgsz] = 0;

	return (EXIT_SUCCESS);
}
/*
 * get_opd_size --
 *	Compute the total size of an off-page duplicate set.  The result is
 *	used to decide whether the set goes on-page or off-page in the
 *	simulation.
 *
 * The set lives on a chain of pages linked through next_pgno.  Starting
 * from h, each page is visited and the space of its live data items is
 * summed.  Pages fetched here are given back to the memory pool; h itself
 * is left to the caller.
 *
 * On success *opd_sz receives the total and EXIT_SUCCESS is returned.  On
 * a memory pool error that error is returned and *opd_sz is not written.
 */
static int
get_opd_size(dbc, h, opd_sz)
	DBC *dbc;
	PAGE *h;
	u_int32_t *opd_sz;
{
	DB *dbp;
	DB_ENV *dbenv;
	DB_MPOOLFILE *mpf;
	PAGE *cur;
	db_pgno_t following;
	u_int32_t total;
	int rc;

	dbp = dbc->dbp;
	dbenv = dbp->dbenv;
	mpf = dbp->mpf;

	total = 0;
	rc = 0;

	for (cur = h;;) {
		total += sum_opd_page_data_entries(dbp, cur);

		/* Remember the link before the page is released. */
		following = cur->next_pgno;

		if (cur != h) {
			rc = mpf->put(mpf, cur, dbc->priority, 0);
			if (rc != 0) {
				dbenv->err(dbenv, rc, "DB_MPOOLFILE->put:");
				return (rc);
			}
		}

		if (following == PGNO_INVALID)
			break;

		rc = mpf->get(mpf, &following, dbc->txn, 0, &cur);
		if (rc != 0) {
			dbenv->err(dbenv, rc, "DB_MPOOLFILE->get:");
			return (rc);
		}
	}

	*opd_sz = total;

	return (EXIT_SUCCESS);
}
/*
 * sum_opd_page_data_entries --
 *	Return the space, as computed by item_space, used by all the data
 *	items on page h that are not flagged as deleted.
 */
static int
sum_opd_page_data_entries(dbp, h)
	DB *dbp;
	PAGE *h;
{
	db_indx_t slot;
	u_int32_t total;

	total = 0;
	slot = 0;
	while (slot < NUM_ENT(h)) {
		/* Deleted items do not count. */
		if (!B_DISSET(GET_BKEYDATA(dbp, h, slot)->type))
			total += item_space(dbp, h, slot);
		slot += O_INDX;
	}

	return total;
}
/*
 * item_space --
 *	Return BKEYDATA_PSIZE applied to the length of item indx on page h.
 *	A B_KEYDATA item contributes its stored length (len); any other
 *	item is read as an overflow reference and contributes its total
 *	length (tlen).
 */
static int
item_space(dbp, h, indx)
	DB *dbp;
	PAGE *h;
	db_indx_t indx;
{
	if (B_TYPE(GET_BKEYDATA(dbp, h, indx)->type) == B_KEYDATA)
		return (BKEYDATA_PSIZE(GET_BKEYDATA(dbp, h, indx)->len));

	return (BKEYDATA_PSIZE(GET_BOVERFLOW(dbp, h, indx)->tlen));
}
/*
 * item_size --
 *	Return the actual length of item indx on page h: the stored length
 *	(len) of a B_KEYDATA item, otherwise the total length (tlen) read
 *	from the item as an overflow reference.
 */
static int
item_size(dbp, h, indx)
	DB *dbp;
	PAGE *h;
	db_indx_t indx;
{
	if (B_TYPE(GET_BKEYDATA(dbp, h, indx)->type) == B_KEYDATA)
		return (GET_BKEYDATA(dbp, h, indx)->len);

	return (GET_BOVERFLOW(dbp, h, indx)->tlen);
}
/*
 * __tuner_print_btree_fillfactor --
 *	Print the simulation results and the recommended page size.
 *
 * For every simulated page size the overflow, leaf and duplicate page
 * counts are summed over all fill factor buckets; the storage is the
 * total page count times the page size.  When verbose is 1 the per-size
 * fill factor distributions and a summary table are printed first.
 *
 * Recommendation:
 *   - If the smallest page size needs no overflow page, the page size
 *     with the smallest non-zero storage is chosen.
 *   - Otherwise the largest page size is chosen for which stepping up from
 *     the next smaller size removes more overflow pages than 2% of that
 *     size's total page count.
 *
 * Always returns EXIT_SUCCESS.
 */
static int
__tuner_print_btree_fillfactor(verbose, stats)
	u_int32_t verbose;
	TUNER_FF_STAT *stats;
{
	const char * DIVIDE_LINE1 = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=";
	const char * DIVIDE_LINE2 = "-----------|";
	const char * DIVIDE_LINE3 = "---------------------------------------";
	double shift_point;
	int best_indx, i, j;
	u_int32_t pgsize;
	u_int64_t minispace, ttpgcnt[NUM_PGSIZES], ttspace[NUM_PGSIZES];
	uintmax_t dup_cnt[NUM_PGSIZES], leaf_cnt[NUM_PGSIZES],
	    ovfl_cnt[NUM_PGSIZES];

	best_indx = 0;

	/* Totals per page size. */
	for (i = 0; i < NUM_PGSIZES; ++i) {
		pgsize = (1 << i) * DB_MIN_PGSIZE;
		ovfl_cnt[i] = leaf_cnt[i] = dup_cnt[i] = ttpgcnt[i] = 0;

		for (j = 0; j < DIST_DIVISION; ++j) {
			ovfl_cnt[i] += stats->pgsize_ovfl_dist[i][j];
			leaf_cnt[i] += stats->pgsize_leaf_dist[i][j];
			dup_cnt[i] += stats->pgsize_dup_dist[i][j];
		}

		ttpgcnt[i] = ovfl_cnt[i] + leaf_cnt[i] + dup_cnt[i];
		ttspace[i] = pgsize * ttpgcnt[i];
	}

	if (verbose == 1) {
		printf("\n %50s \n",
		    "===========Simulation Results===========");
		printf("\n %s\n %s\n %s\n",
		    "leaf_pg:\t percentage of leaf page in that range",
		    "dup_pg:\t percentage of duplicate page in that range",
		    "ovfl_pg:\t percentage of over flow page in that range");

		/* One fill factor distribution table per page size. */
		for (i = 0; i < NUM_PGSIZES; ++i) {
			printf("\n\n%s%s\n", DIVIDE_LINE1, DIVIDE_LINE1);
			printf("page size = %d\n", (1 << i) * DB_MIN_PGSIZE);
			printf("%s%s\n", DIVIDE_LINE1, DIVIDE_LINE1);
			printf("%s\n", DIVIDE_LINE3);
			printf("%s %s %s %s\n", "fill factor",
			    "leaf_pg", "dup_pg", "ovfl_pg");

			for (j = 0; j < DIST_DIVISION; ++j) {
				/*
				 * The last bucket holds only completely full
				 * pages, so its range has zero width.
				 */
				if (j == (DIST_DIVISION - 1))
					shift_point = 0.000;
				else
					shift_point = 0.099;

				printf("\n[%2.1f-%4.3f]\t",
				    (double)j/(DIST_DIVISION - 1),
				    ((double)j/(DIST_DIVISION - 1) +
				    shift_point));

				if (leaf_cnt[i] == 0 ||
				    stats->pgsize_leaf_dist[i][j] == 0)
					printf("%3.2f\t", (double)0);
				else
					printf("%3.2f%%\t", (double)
					    (stats->pgsize_leaf_dist[i][j] *
					    100) / leaf_cnt[i]);

				if (dup_cnt[i] == 0 ||
				    stats->pgsize_dup_dist[i][j] == 0)
					printf("%3.2f\t", (double)0);
				else
					printf("%3.2f%%\t", (double)
					    (stats->pgsize_dup_dist[i][j] *
					    100) / dup_cnt[i]);

				if (ovfl_cnt[i] == 0 ||
				    stats->pgsize_ovfl_dist[i][j] == 0)
					printf("%3.2f\t", (double)0);
				else
					printf("%3.2f%%\t", (double)
					    (stats->pgsize_ovfl_dist[i][j] *
					    100) / ovfl_cnt[i]);
			}
		}

		printf("\n\n\n\n %55s\n\n",
		    "=====Summary of simulated statistic=====");
		printf(" %s\n %s\n %s\n %s\n %s\n %s\n\n",
		    "pgsize: \tpage size", "storage: \ton-disk space",
		    "pgcnt: \ttotal number of all pages "
		    "(e.g, sum of ovflcnt, leafcnt, dupcnt)",
		    "ovflcnt: \tnumber of over flow pages",
		    "leafcnt: \tnumber of leaf pages",
		    "dupcnt: \tnumber of duplicate pages");
		printf("%s%s%s%s%s%s\n", DIVIDE_LINE2, DIVIDE_LINE2,
		    DIVIDE_LINE2, DIVIDE_LINE2, DIVIDE_LINE2, DIVIDE_LINE2);
		printf(" %10s| %10s| %10s| %10s| %10s| %10s|\n", "pgsize",
		    "storage", "pgcnt", "ovflcnt", "leafcnt", "dupcnt");
		printf("%s%s%s%s%s%s\n", DIVIDE_LINE2, DIVIDE_LINE2,
		    DIVIDE_LINE2, DIVIDE_LINE2, DIVIDE_LINE2, DIVIDE_LINE2);

		/* Values are printed as 32 bits; flag any that do not fit. */
		for (i = 0; i < NUM_PGSIZES; ++i) {
			printf(" %10d|", (1 << i) * DB_MIN_PGSIZE);
			printf(" %10u|", (u_int32_t)ttspace[i]);
			if (ttspace[i] != (u_int32_t)ttspace[i])
				printf("(truncated value reported)");
			printf(" %10u|", (u_int32_t)ttpgcnt[i]);
			if (ttpgcnt[i] != (u_int32_t)ttpgcnt[i])
				printf("(truncated value reported)");
			printf(" %10u|", (u_int32_t)ovfl_cnt[i]);
			if (ovfl_cnt[i] != (u_int32_t)ovfl_cnt[i])
				printf("(truncated value reported)");
			printf(" %10u|", (u_int32_t)leaf_cnt[i]);
			if (leaf_cnt[i] != (u_int32_t)leaf_cnt[i])
				printf("(truncated value reported)");
			printf(" %10u|", (u_int32_t)dup_cnt[i]);
			if (dup_cnt[i] != (u_int32_t)dup_cnt[i])
				printf("(truncated value reported)");
			printf("\n");
		}
		printf("%s%s%s%s%s%s\n", DIVIDE_LINE2, DIVIDE_LINE2,
		    DIVIDE_LINE2, DIVIDE_LINE2, DIVIDE_LINE2, DIVIDE_LINE2);
	}

	/*
	 * Choose a page size based on the overflow calculation.  If there
	 * is no overflow consideration, then use the smallest on-disk
	 * space as a recommended page size.
	 */
	if (ovfl_cnt[0] == 0) {
		minispace = ttspace[0];
		for (i = 1; i < NUM_PGSIZES; ++i)
			if ((ttspace[i] != 0) && (minispace > ttspace[i])) {
				minispace = ttspace[i];
				best_indx = i;
			}
	} else
		for (i = 1; i < NUM_PGSIZES; ++i)
			/*
			 * The counts are unsigned: compare before
			 * subtracting so that an increase can never wrap
			 * around into a huge "reduction".
			 */
			if (ovfl_cnt[i - 1] > ovfl_cnt[i] &&
			    (ovfl_cnt[i - 1] - ovfl_cnt[i]) >
			    0.02 * ttpgcnt[i])
				best_indx = i;

	printf("\n\nFor your input database, we recommend page size = %d \n \t"
	    "out of 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536 for you.\n",
	    (1 << best_indx) * DB_MIN_PGSIZE);

	return (EXIT_SUCCESS);
}
/*
 * open_db --
 *	Create a database handle in dbenv and open the existing database
 *	dbname (optionally the sub-database subdb) read-only, letting the
 *	library detect its type.
 *
 * As soon as the handle exists it is stored in *dbpp, even if the open
 * then fails, so the caller can close it.  Returns 0 on success, or the
 * error from db_create or DB->open.
 */
static int
open_db(dbpp, dbenv, dbname, subdb)
	DB **dbpp;
	DB_ENV *dbenv;
	char *dbname;
	char *subdb;
{
	DB *handle;
	int rc;

	rc = db_create(&handle, dbenv, 0);
	if (rc != 0) {
		dbenv->err(dbenv, rc, "db_create fails.\n");
		return (rc);
	}

	*dbpp = handle;

	/* Open a database for read-only. */
	rc = handle->open(handle,
	    NULL, dbname, subdb, DB_UNKNOWN, DB_RDONLY, 0);
	if (rc != 0)
		dbenv->err(dbenv, rc, "DB->open");

	return (rc);
}
/*
 * usage --
 *	Print the command line synopsis to stderr and terminate the program
 *	with EXIT_FAILURE.  This function does not return.
 */
static int
usage()
{
	const char *synopsis =
	    "[-c cachesize] -d file [-h home] [-s database] [-v verbose]";

	fprintf(stderr, "usage: %s %s\n", progname, synopsis);

	exit(EXIT_FAILURE);
}
/*
 * version_check --
 *	Make sure the Berkeley DB library loaded at run time has the same
 *	major and minor version as the headers this program was compiled
 *	against.  The patch level is not compared.
 *
 * Returns EXIT_SUCCESS on a match; otherwise prints a message to stderr
 * and returns EXIT_FAILURE.
 */
static int
version_check()
{
	int lib_major, lib_minor, lib_patch;

	(void)db_version(&lib_major, &lib_minor, &lib_patch);

	if (lib_major == DB_VERSION_MAJOR && lib_minor == DB_VERSION_MINOR)
		return (EXIT_SUCCESS);

	fprintf(stderr, DB_STR_A("5002",
	    "%s: version %d.%d doesn't match library version %d.%d\n",
	    "%s %d %d %d %d"), progname, DB_VERSION_MAJOR,
	    DB_VERSION_MINOR, lib_major, lib_minor);

	return (EXIT_FAILURE);
}