This commit is contained in:
Gregory Burd 2024-05-20 11:05:46 -04:00
parent 1b7fafa0e1
commit 3b4106743b

View file

@ -16,6 +16,9 @@
typedef size_t pgno_t;
#define INITIAL_AMOUNT 1024 * 2
bool recording = true;
char *
bytes_as(double bytes, char *s, size_t size)
{
@ -50,7 +53,152 @@ toss(size_t max)
return level;
}
bool recording = true;
/*
 * Number of characters needed to base64-encode `inlen` bytes, not counting
 * the terminating NUL: every started group of 3 input bytes yields 4
 * output characters.
 */
static size_t
b64_encoded_size(size_t inlen)
{
  size_t groups = inlen / 3;

  if (inlen % 3 != 0)
    groups++;
  return groups * 4;
}

/* The standard base64 alphabet, indexed by 6-bit value. */
static const char b64chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/*
 * Base64-encode `len` bytes starting at `in`.
 *
 * Returns a newly malloc'ed, NUL-terminated string that the caller must
 * free(), or NULL when `in` is NULL, `len` is 0, the encoded size would not
 * fit in a size_t, or the allocation fails.
 */
static char *
b64_encode(const unsigned char *in, size_t len)
{
  char *out;
  size_t elen;
  size_t i;
  size_t j;
  size_t v;

  if (in == NULL || len == 0)
    return NULL;

  /*
   * Largest `len` for which b64_encoded_size(len) + 1 still fits in a
   * size_t; anything bigger would wrap and under-allocate the buffer.
   */
  if (len > ((size_t)-1 - 1) / 4 * 3)
    return NULL;

  elen = b64_encoded_size(len);
  out = malloc(elen + 1);
  if (out == NULL)
    return NULL;
  out[elen] = '\0';

  for (i = 0, j = 0; i < len; i += 3, j += 4) {
    /* Pack up to three input bytes into a 24-bit group, zero-filling. */
    v = in[i];
    v = i + 1 < len ? (v << 8) | in[i + 1] : v << 8;
    v = i + 2 < len ? (v << 8) | in[i + 2] : v << 8;

    out[j] = b64chars[(v >> 18) & 0x3F];
    out[j + 1] = b64chars[(v >> 12) & 0x3F];
    /* Positions that carry no input bits are emitted as '=' padding. */
    if (i + 1 < len) {
      out[j + 2] = b64chars[(v >> 6) & 0x3F];
    } else {
      out[j + 2] = '=';
    }
    if (i + 2 < len) {
      out[j + 3] = b64chars[v & 0x3F];
    } else {
      out[j + 3] = '=';
    }
  }
  return out;
}
/*
 * Number of bytes a base64 string decodes to: 3 bytes per 4-character quad,
 * minus one byte per trailing '=' padding character.
 *
 * At most two padding characters are counted (valid base64 never has more)
 * and the result never drops below zero, so malformed input cannot make the
 * size wrap around. Returns 0 for a NULL string. The value is only
 * meaningful for well-formed input; b64_decode() performs the validation.
 */
static size_t
b64_decoded_size(const char *in)
{
  size_t len;
  size_t ret;
  size_t pad = 0;

  if (in == NULL)
    return 0;

  len = strlen(in);
  ret = len / 4 * 3;
  while (pad < 2 && pad < len && pad < ret && in[len - 1 - pad] == '=')
    pad++;
  return ret - pad;
}

/*
 * Disabled helper kept for reference: it builds an inverse lookup table
 * indexed by (character - 43), the same indexing b64invs[] below uses.
 * The table it builds is local and discarded.
 */
#if 0
static void
b64_generate_decode_table()
{
int inv[80];
size_t i;
memset(inv, -1, sizeof(inv));
for (i = 0; i < sizeof(b64chars) - 1; i++) {
inv[b64chars[i] - 43] = i;
}
}
#endif

/*
 * Inverse base64 alphabet: b64invs[c - 43] is the 6-bit value of character
 * c for c in ['+' (43), 'z' (122)], or -1 when c is not an alphabet
 * character (this includes '=').
 */
static const int b64invs[] = { 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
  14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
  47, 48, 49, 50, 51 };

/* Returns 1 when `c` is a base64 alphabet character or '=', else 0. */
static int
b64_isvalidchar(char c)
{
  if (c >= '0' && c <= '9')
    return 1;
  if (c >= 'A' && c <= 'Z')
    return 1;
  if (c >= 'a' && c <= 'z')
    return 1;
  if (c == '+' || c == '/' || c == '=')
    return 1;
  return 0;
}

/*
 * Decode the NUL-terminated base64 string `in` into `out`, which has room
 * for `outlen` bytes. The output is raw bytes and is not NUL-terminated.
 *
 * Returns 1 on success and 0 on failure. Failure cases: a NULL argument,
 * a length that is not a multiple of 4, a character outside the base64
 * alphabet, '=' anywhere other than the last one or two positions of the
 * string, or an output buffer smaller than b64_decoded_size(in).
 */
static int
b64_decode(const char *in, unsigned char *out, size_t outlen)
{
  size_t len;
  size_t i;
  size_t j;
  unsigned int v;

  if (in == NULL || out == NULL)
    return 0;

  len = strlen(in);
  if (len % 4 != 0)
    return 0;

  for (i = 0; i < len; i++) {
    if (!b64_isvalidchar(in[i]))
      return 0;
    /*
     * Padding is only legal in the final two characters. Anywhere else
     * it would index b64invs[] to -1 and corrupt the decoded group.
     */
    if (in[i] == '=' && i + 2 < len)
      return 0;
  }
  /* "xx=y" is malformed: once padding starts it must run to the end. */
  if (len > 0 && in[len - 2] == '=' && in[len - 1] != '=')
    return 0;

  if (outlen < b64_decoded_size(in))
    return 0;

  for (i = 0, j = 0; i < len; i += 4, j += 3) {
    int has_third = in[i + 2] != '=';
    int has_fourth = in[i + 3] != '=';

    /* Rebuild the 24-bit group; padded positions contribute zero bits. */
    v = (unsigned int)b64invs[in[i] - 43];
    v = (v << 6) | (unsigned int)b64invs[in[i + 1] - 43];
    v = (v << 6) | (has_third ? (unsigned int)b64invs[in[i + 2] - 43] : 0u);
    v = (v << 6) | (has_fourth ? (unsigned int)b64invs[in[i + 3] - 43] : 0u);

    out[j] = (v >> 16) & 0xFF;
    if (has_third)
      out[j + 1] = (v >> 8) & 0xFF;
    if (has_fourth)
      out[j + 2] = v & 0xFF;
  }
  return 1;
}
static void
record_set_mutation(FILE *out, pgno_t pg)
@ -104,20 +252,38 @@ record_merge_mutation(FILE *out, void *handle)
}
}
/*
 * When recording is enabled, write one line to `out` of the form
 *
 *   checkpoint <capacity> <buffer_size> <encoded_size> <base64 data>
 *
 * describing the sparsemap behind `handle`. Does nothing when `recording`
 * is false.
 *
 * b64_encode() can return NULL (for example for an empty buffer); in that
 * case the record is written with an encoded size of 0 and an empty
 * payload rather than handing a NULL pointer to "%s". The encoded buffer
 * is released before returning.
 */
static void
record_checkpoint(FILE *out, void *handle)
{
  if (!recording)
    return;

  sparsemap_t *map = (sparsemap_t *)handle;
  size_t capacity = sparsemap_get_capacity(map);
  size_t buffer_size = sparsemap_get_size(map);
  char *encoded = b64_encode(sparsemap_get_data(map), buffer_size);
  size_t encoded_size = encoded != NULL ? b64_encoded_size(buffer_size) : 0;

  fprintf(out, "checkpoint %zu %zu %zu %s\n", capacity, buffer_size, encoded_size, encoded != NULL ? encoded : "");
  free(encoded);
}
/* sparsemap ------------------------------------------------------------- */
static sparsemap_idx_t
_sparsemap_set(sparsemap_t **map, sparsemap_idx_t idx, bool value)
_sparsemap_set(sparsemap_t **_map, sparsemap_idx_t idx, bool value)
{
sparsemap_t *map = *_map, *new_map = NULL;
do {
sparsemap_idx_t l = sparsemap_set(*map, idx, value);
sparsemap_idx_t l = sparsemap_set(map, idx, value);
if (l != idx) {
if (errno == ENOSPC) {
*map = sparsemap_set_data_size(*map, NULL, sparsemap_get_capacity(*map) + 64);
assert(*map != NULL);
size_t capacity = sparsemap_get_capacity(map) + 64;
new_map = sparsemap_set_data_size(map, NULL, capacity);
assert(new_map != NULL);
errno = 0;
*_map = new_map;
} else {
assert(false);
perror("Unable to grow sparsemap");
}
} else {
return l;
@ -166,8 +332,7 @@ __sm_find_span(void *handle, unsigned len)
{
sparsemap_t *map = (sparsemap_t *)handle;
pgno_t pgno = (pgno_t)sparsemap_span(map, 0, len, true);
assert(SPARSEMAP_NOT_FOUND(pgno) == false);
return pgno;
return SPARSEMAP_NOT_FOUND(pgno) ? -1 : pgno;
}
static bool
@ -269,6 +434,8 @@ __sm_count(void *handle)
/* midl ------------------------------------------------------------------ */
static bool __midl_validate(void *handle);
static void *
__midl_alloc(size_t capacity)
{
@ -287,12 +454,16 @@ __midl_free(void *handle)
static pgno_t
__midl_set(void **handle, pgno_t pg)
{
assert(__midl_validate(*handle));
MDB_IDL *_list = (MDB_IDL *)handle, list = *_list;
if (list[0] + 1 == list[-1]) {
mdb_midl_need(_list, list[-1] + 1);
assert(mdb_midl_need(_list, list[-1] + 1) == 0);
list = *_list;
}
mdb_midl_insert(list, pg);
mdb_midl_xappend(list, pg);
mdb_midl_sort(list);
//assert(mdb_midl_insert(list, pg) == 0);
assert(__midl_validate(*handle));
return pg;
}
@ -307,13 +478,17 @@ __midl_is_set(void *handle, pgno_t pg)
static pgno_t
__midl_clear(void **handle, pgno_t pg)
{
assert(__midl_validate(*handle));
MDB_IDL list = *(MDB_IDL *)handle;
unsigned len = list[0];
list[0] = len -= 1;
for (unsigned j = pg - 1; j < len;)
list[++j] = list[++pg];
#ifdef MDB_DEBUG
for (unsigned j = len + 1; j <= list[-1]; j++)
list[j] = 0;
#endif
assert(__midl_validate(*handle));
return pg;
}
@ -323,8 +498,7 @@ __midl_find_span(void *handle, unsigned len)
MDB_IDL list = (MDB_IDL)handle;
/* Seek a big enough contiguous page range. Prefer
* pages at the tail, just truncating the list.
*/
pages at the tail, just truncating the list. */
int retry = 1;
unsigned i = 0;
pgno_t pgno = 0, *mop = list;
@ -339,15 +513,18 @@ __midl_find_span(void *handle, unsigned len)
} while (--i > n2);
if (--retry < 0)
break;
} else {
return -1;
}
} while (1);
search_done:;
return pgno;
return retry < 0 ? -1 : pgno;
}
static bool
__midl_take_span(void **handle, pgno_t pg, unsigned len)
{
assert(__midl_validate(*handle));
MDB_IDL list = *(MDB_IDL *)handle;
int i = list[list[0]] == pg ? list[0] : mdb_midl_search(list, pg);
unsigned j, num = len;
@ -358,24 +535,29 @@ __midl_take_span(void **handle, pgno_t pg, unsigned len)
/* Move any stragglers down */
for (j = i - num; j < mop_len;)
mop[++j] = mop[++i];
/* Set all unused values in the array to 0
#ifdef MDB_DEBUG
for (j = mop_len + 1; j <= mop[-1]; j++)
mop[j] = 0; */
mop[j] = 0;
#endif
assert(__midl_validate(*handle));
return true;
}
static bool
__midl_release_span(void **handle, pgno_t pg, unsigned len)
{
assert(__midl_validate(*handle));
MDB_IDL *_list = (MDB_IDL *)handle, list = *_list;
if (list[0] + len >= list[-1]) {
mdb_midl_need(_list, list[-1] + len);
assert(mdb_midl_need(_list, list[-1] + len) == 0);
list = *_list;
}
for (size_t i = pg; i < pg + len; i++) {
mdb_midl_insert(list, i);
mdb_midl_xappend(list, i);
// assert(mdb_midl_insert(list, i) == 0);
}
mdb_midl_sort(list);
assert(__midl_validate(*handle));
return true;
}
@ -410,15 +592,15 @@ __midl_is_empty(void *handle, pgno_t pg, unsigned len)
static bool
__midl_merge(void **handle, void *other_handle)
{
MDB_IDL *_list = (MDB_IDL *)handle, list = *_list;
MDB_IDL other = (MDB_IDL)other_handle;
assert(__midl_validate(*handle));
MDB_IDL *_list = (MDB_IDL *)handle, list = *_list, other = (MDB_IDL)other_handle;
if (list[0] + other[0] >= list[-1]) {
mdb_midl_need(_list, list[-1] + other[0]);
assert(mdb_midl_need(_list, list[-1] + other[0]) == 0);
list = *_list;
}
mdb_midl_append_list(_list, other);
list = *_list;
mdb_midl_sort(list);
mdb_midl_xmerge(list, other_handle);
mdb_midl_sort(*_list);
assert(__midl_validate(*handle));
return true;
}
@ -440,11 +622,19 @@ static bool
__midl_validate(void *handle)
{
MDB_IDL list = (MDB_IDL)handle;
pgno_t id = 1;
while (id < list[0]) {
if (list[id] >= list[id + 1])
return false;
id++;
if (list[0] > list[-1]) {
return false;
}
if (list[0] > 1) {
// check for duplicates
for (pgno_t i = 2; i < list[0]; i++) {
if (list[i] == list[i - 1]) {
return false;
}
// ensure ordering
if (list[i] > list[i - 1])
return false;
}
}
return true;
}
@ -501,7 +691,7 @@ __roar_find_span(void *handle, unsigned len)
}
offset++;
} while (offset <= max);
return offset;
return offset > max ? -1 : offset;
}
static bool
@ -526,24 +716,14 @@ static bool
__roar_is_span(void *handle, pgno_t pg, unsigned len)
{
roaring_bitmap_t *rbm = (roaring_bitmap_t *)handle;
for (pgno_t i = pg; i < pg + len; i++) {
if (roaring_bitmap_contains(rbm, i) != true) {
return false;
}
}
return true;
return roaring_bitmap_contains_range(rbm, pg, pg + len);
}
static bool
__roar_is_empty(void *handle, pgno_t pg, unsigned len)
{
roaring_bitmap_t *rbm = (roaring_bitmap_t *)handle;
for (pgno_t i = 0; i < len; i++) {
if (roaring_bitmap_contains(rbm, pg + i) != false) {
return false;
}
}
return true;
return !roaring_bitmap_contains_range(rbm, pg, pg + len);
}
static bool
@ -558,15 +738,13 @@ __roar_merge(void **handle, void *other_handle)
static size_t
__roar_size(void *handle)
{
// TODO
return 0;
return roaring_bitmap_frozen_size_in_bytes((roaring_bitmap_t *)handle);
}
static size_t
__roar_count(void *handle)
{
// TODO
return 0;
return roaring_bitmap_get_cardinality((roaring_bitmap_t *)handle);
}
static bool
@ -583,20 +761,35 @@ typedef enum { SM, ML, RB } container_impl_t;
/*
 * Operations table for one container implementation. Each member other
 * than `name` is a function pointer taking an opaque handle; members that
 * take `void **handle` receive the address of the handle.
 */
typedef struct container {
/* name of the container implementation */
const char *name;
/* allocate a new container */
void *(*alloc)(size_t capacity);
/* free the container */
void (*free)(void *handle);
/* add pg to the container */
pgno_t (*set)(void **handle, pgno_t pg);
/* is pg in the container? */
bool (*is_set)(void *handle, pgno_t pg);
/* remove pg from the container */
pgno_t (*clear)(void **handle, pgno_t pg);
/* find a set of contiguous pages of length len and return the smallest pgno */
pgno_t (*find_span)(void *handle, unsigned len);
/* remove the span [pg, pg + len) from the container */
bool (*take_span)(void **handle, pgno_t pg, unsigned len);
/* add the span [pg, pg + len) into the container */
bool (*release_span)(void **handle, pgno_t pg, unsigned len);
/* are the pgno in the span [pg, pg + len) in the container? */
bool (*is_span)(void *handle, pgno_t pg, unsigned len);
/* are the pgno in the span [pg, pg + len) not in the container? */
bool (*is_empty)(void *handle, pgno_t pg, unsigned len);
/* is the span the first one (brute force check) */
bool (*is_first)(void *handle, pgno_t pg, unsigned len);
/* ensure that all pgno contained in other_handle are also in handle */
bool (*merge)(void **handle, void *other_handle);
/* the size of the container in bytes */
size_t (*size)(void *handle);
/* the number of items in the container */
size_t (*count)(void *handle);
/* perform internal validation on the container (optional) */
bool (*validate)(void *handle);
} container_t;
@ -667,48 +860,57 @@ FILE *fp;
#define invoke(type, fn, ...) containers[type].fn(handles[type], __VA_ARGS__)
#define mutate(type, fn, ...) (type == 0) ? record_##fn##_mutation(fp, __VA_ARGS__) : (void)0, containers[type].fn(&handles[type], __VA_ARGS__)
#define foreach(set) for (unsigned type = 0; type < (sizeof((set)) / sizeof((set)[0])); type++)
#define compare(set) \
#define checkpoint(set) \
for (unsigned type = 1; type < (sizeof((set)) / sizeof((set)[0])); type++) { \
verify_eq(0, handles[0], type, handles[type]); \
}
} \
record_checkpoint(fp, handles[0])
bool
verify_sm_eq_rb(sparsemap_t *map, roaring_bitmap_t *rbm)
{
bool ret = true;
uint64_t max = roaring_bitmap_maximum(rbm);
roaring_uint32_iterator_t iter;
roaring_iterator_init(rbm, &iter);
for (uint64_t i = 0; i <= max; i++) {
if (i == iter.current_value) {
assert(sparsemap_is_set(map, i) == true);
if (sparsemap_is_set(map, i) == false) {
fprintf(stdout, "- %zu ", i);
ret = false;
}
roaring_uint32_iterator_advance(&iter);
} else {
assert(sparsemap_is_set(map, i) == false);
if (sparsemap_is_set(map, i) == true) {
fprintf(stdout, "+ %zu ", i);
ret = false;
}
}
}
return true;
return ret;
}
bool
verify_sm_eq_ml(sparsemap_t *map, MDB_IDL list)
{
bool ret = true;
for (MDB_ID i = 1; i <= list[0]; i++) {
pgno_t pg = list[i];
unsigned skipped = i == 1 ? 0 : list[i - 1] - list[i] - 1;
if (skipped) {
for (MDB_ID j = list[i - 1]; j > list[i]; j--) {
if (sparsemap_is_set(map, pg - j) != false) {
__diag("%zu\n", pg - j);
return false;
fprintf(stdout, "+ %zu ", pg - j);
ret = false;
}
}
}
if (sparsemap_is_set(map, pg) != true) {
__diag("%zu\n", pg);
return false;
fprintf(stdout, "- %zu ", pg);
ret = false;
}
}
return true;
return ret;
}
bool
@ -769,8 +971,6 @@ stats(size_t iterations, sparsemap_t *map, MDB_IDL list)
td_quantile(b_span_merge, .999));
}
#define INITIAL_AMOUNT 1024 * 2
#define SHORT_OPT "r:fa:bh"
#define LONG_OPT "record:,force,amount:,buffer,help"
@ -791,7 +991,7 @@ main(int argc, char *argv[])
int opt;
const char *record_file = NULL;
int force_flag = 0;
size_t left, amt = INITIAL_AMOUNT;
size_t left, iteration = 0, amt = INITIAL_AMOUNT;
bool buffer = true;
fp = stdout;
@ -862,16 +1062,20 @@ main(int argc, char *argv[])
}
cast(type, validate);
}
compare(types);
checkpoint(types);
left = amt;
while (true) {
iteration++;
// pick an amount [1, 16] of pages to find, preferring smaller sizes
unsigned len = toss(15) + 1;
pgno_t loc[num_types];
foreach(types)
{
loc[type] = invoke(type, find_span, len);
if (loc[type] == -1) {
goto larger_please;
}
}
for (unsigned n = 0; n < num_types; n++) {
foreach(types)
@ -890,14 +1094,13 @@ main(int argc, char *argv[])
assert(mutate(type, take_span, loc[which_loc], len));
cast(type, validate);
}
compare(types);
checkpoint(types);
left -= len;
// Once we've used 1/10th of the free list, let's replenish it a bit.
if (amt - left > amt / 10) {
if (toss(15) > 13) {
do {
pgno_t pgno;
size_t len, retries = amt;
size_t len, retries = amt / 10;
// Find a hole in the map to replenish.
do {
len = toss(15) + 1;
@ -911,7 +1114,7 @@ main(int argc, char *argv[])
{
assert(invoke(type, is_empty, pgno, len));
}
compare(types);
checkpoint(types);
foreach(types)
{
assert(invoke(type, is_span, pgno, len) == false);
@ -919,7 +1122,7 @@ main(int argc, char *argv[])
assert(invoke(type, is_span, pgno, len) == true);
cast(type, validate);
}
compare(types);
checkpoint(types);
left += len;
}
} while (amt - left > amt / 100);
@ -930,7 +1133,7 @@ main(int argc, char *argv[])
pgno_t max;
larger_please:
new_amt = 1024 + (xorshift32() % 2048) + toss(1024);
new_offset = sparsemap_get_ending_offset(handles[SM]);
new_offset = sparsemap_get_ending_offset(handles[SM]) + 1;
// Build a new container to merge with the existing one.
foreach(types)
@ -939,6 +1142,7 @@ main(int argc, char *argv[])
for (size_t i = 0; i < new_amt; i++) {
// We don't want to record and we're using new_handles not
// handles, so call fn directly.
assert(containers[type].is_set(handles[type], i + new_offset) == false);
assert(containers[type].is_set(new_handles[type], i + new_offset) == false);
containers[type].set(&new_handles[type], i + new_offset);
assert(containers[type].is_set(new_handles[type], i + new_offset) == true);
@ -949,7 +1153,7 @@ main(int argc, char *argv[])
assert(mutate(type, merge, new_handles[type]));
cast(type, validate);
}
compare(types);
checkpoint(types);
left += new_amt;
amt += new_amt;
foreach(types)