compare against roaring bitmaps

This commit is contained in:
Gregory Burd 2024-05-03 15:15:39 -04:00
parent 57a8f99a32
commit b9612f12cc
14 changed files with 28989 additions and 84 deletions

3
.gitignore vendored
View file

@ -3,7 +3,7 @@
**/*.o
tests/test
examples/ex_?
examples/soak
tests/soak
.cache
hints.txt
tmp/
@ -28,6 +28,7 @@ compile_commands.json
*.dat
*.fsm
*.db
.vscode/
# Created by https://www.gitignore.io/api/jetbrains
# Edit at https://www.gitignore.io/?templates=jetbrains

View file

@ -3,23 +3,25 @@ OBJS = sparsemap.o
STATIC_LIB = libsparsemap.a
SHARED_LIB = libsparsemap.so
#CFLAGS = -Wall -Wextra -Wpedantic -Of -std=c11 -Iinclude/ -fPIC
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
#CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -O0 -g -std=c11 -Iinclude/ -fPIC
#CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -Ofast -g -std=c11 -Iinclude/ -fPIC
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
CFLAGS = -Wall -Wextra -Wpedantic -Ofast -g -std=c11 -Iinclude/ -fPIC
#CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -fsanitize=all -fhardened -std=c11 -Iinclude/ -fPIC
LIBS = -lm
#CFLAGS = -Wall -Wextra -Wpedantic -Of -std=c11 -Iinclude/ -fPIC $(LIBS)
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC $(LIBS)
CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -O0 -g -std=c11 -Iinclude/ -fPIC $(LIBS)
#CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -Ofast -g -std=c11 -Iinclude/ -fPIC $(LIBS)
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC $(LIBS)
#CFLAGS = -Wall -Wextra -Wpedantic -Ofast -g -std=c11 -Iinclude/ -fPIC $(LIBS)
#CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC $(LIBS)
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -fsanitize=all -fhardened -std=c11 -Iinclude/ -fPIC $(LIBS)
#TEST_FLAGS = -DDEBUG -Wall -Wextra -Wpedantic -O0 -g -std=c11 -Iinclude/ -Itests/ -fPIC
#TEST_FLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -Itests/ -fPIC
TEST_FLAGS = -Wall -Wextra -Wpedantic -Ofast -g -std=c11 -Iinclude/ -Itests/ -fPIC
#TEST_FLAGS = -DDEBUG -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC
TEST_FLAGS = -DDEBUG -Wall -Wextra -Wpedantic -O0 -g -std=c11 -Iinclude/ -Itests/ -fPIC $(LIBS)
#TEST_FLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -Itests/ -fPIC $(LIBS)
#TEST_FLAGS = -Wall -Wextra -Wpedantic -Ofast -g -std=c11 -Iinclude/ -Itests/ -fPIC $(LIBS)
#TEST_FLAGS = -DDEBUG -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC $(LIBS)
TESTS = tests/test
TEST_OBJS = tests/test.o tests/munit.o tests/tdigest.o tests/common.o
EXAMPLES = examples/ex_1 examples/ex_2 examples/ex_3 examples/ex_4 examples/soak
TESTS = tests/test tests/soak
TEST_OBJS = tests/test.o lib/munit.o lib/tdigest.o lib/common.o
LIB_OBJS = lib/munit.o lib/tdigest.o lib/common.o lib/roaring.o
EXAMPLES = examples/ex_1 examples/ex_2 examples/ex_3 examples/ex_4
.PHONY: all shared static clean test examples mls
@ -35,10 +37,10 @@ $(STATIC_LIB): $(OBJS)
# Link the shared library from the object files listed in $(OBJS).
# $^ (every prerequisite) is used instead of $? (only the prerequisites that
# are newer than the target): a link step needs all of its objects each time,
# otherwise an incremental rebuild could link only the recently changed ones.
$(SHARED_LIB): $(OBJS)
	$(CC) $(CFLAGS) -o $@ $^ -shared
examples: $(STATIC_LIB) $(EXAMPLES) examples/common.o
examples: $(STATIC_LIB) $(EXAMPLES) $(TEST_OBJS)
soak: examples/soak.c
examples/soak
soak: tests/soak.c
tests/soak
mls: examples/mls
@ -47,7 +49,7 @@ test: $(TESTS)
check: test
env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/test
tests/test: $(TEST_OBJS) $(STATIC_LIB)
tests/test: $(TEST_OBJS) $(LIB_OBJS) $(STATIC_LIB)
$(CC) $^ -lm -o $@ $(TEST_FLAGS)
clean:
@ -58,34 +60,34 @@ clean:
rm -f $(EXAMPLES) examples/*.o
format:
clang-format -i src/sparsemap.c include/sparsemap.h examples/ex_*.c examples/soak.c tests/test.c tests/common.c tests/common.h
clang-format -i src/sparsemap.c include/sparsemap.h examples/ex_*.c tests/soak.c tests/test.c lib/common.c include/common.h
# clang-format -i include/*.h src/*.c tests/*.c tests/*.h examples/*.c
%.o: src/%.c
$(CC) $(CFLAGS) -c -o $@ $^
lib/%.o: tests/%.c
$(CC) $(CFLAGS) -c -o $@ $^
tests/%.o: tests/%.c
$(CC) $(CFLAGS) -c -o $@ $^
examples/%.o: examples/%.c
$(CC) $(CFLAGS) -c -o $@ $^
examples/common.o: tests/common.c
$(CC) $(CFLAGS) -c -o $@ $^
examples/ex_1: examples/common.o examples/ex_1.o $(STATIC_LIB)
examples/ex_1: $(LIB_OBJS) examples/ex_1.o $(STATIC_LIB)
$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS)
examples/ex_2: examples/common.o examples/ex_2.o $(STATIC_LIB)
examples/ex_2: $(LIB_OBJS) examples/ex_2.o $(STATIC_LIB)
$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS)
examples/ex_3: examples/common.o examples/ex_3.o $(STATIC_LIB)
examples/ex_3: $(LIB_OBJS) examples/ex_3.o $(STATIC_LIB)
$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS)
examples/ex_4: examples/common.o examples/ex_4.o $(STATIC_LIB)
examples/ex_4: $(LIB_OBJS) examples/ex_4.o $(STATIC_LIB)
$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS)
examples/soak: examples/common.o tests/tdigest.o examples/soak.o $(STATIC_LIB)
tests/soak: $(LIB_OBJS) tests/soak.o $(STATIC_LIB)
$(CC) $^ -lm -o $@ $(CFLAGS) $(TEST_FLAGS)
todo:

View file

@ -1,5 +1,9 @@
# Sparsemap
Bitsets, also called bitmaps, are commonly used as fast data structures.
Unfortunately, they can use too much memory. To compensate, we often use
compressed bitmaps.
`sparsemap` is a sparse, compressed bitmap. In the best case, it can store 2048
bits in just 8 bytes. In the worst case, it stores the 2048 bits uncompressed and
requires an additional 8 bytes of overhead.
@ -14,7 +18,7 @@ On the lowest level stores bits in sm_bitvec_t's (a uint32_t or uint64_t).
Each sm_bitvec_t has an additional descriptor (2 bits). A single word prepended
to each sm_bitvec_t describes its condition. The descriptor word and the
sm_bitvec_t's have the same size.) The descriptor of a sm_bitvec_t
sm_bitvec_t's have the same size. The descriptor of a sm_bitvec_t
specifies whether the sm_bitvec_t consists only of set bits ("1"), unset
bits ("0") or has a mixed payload. In the first and second case the
sm_bitvec_t is not stored.
@ -45,7 +49,8 @@ offset 0, the second starts at offset 8192).
## Usage instructions
The file `examples/ex_1.c` has example code.
Copy the files `src/sparsemap.c` and `include/sparsemap.h` into your project.
Review the `examples/*` and `tests/*` code.
## Final words
@ -58,7 +63,10 @@ However, if the sequence is not consecutive and has gaps, it's possible that
the compression is inefficient, and the size (in the worst case) is identical
to an uncompressed bit vector (sometimes higher due to the bytes required for
metadata). In such cases, other compression schemes are more efficient (e.g.
http://lemire.me/blog/archives/2008/08/20/the-mythical-bitmap-index/).
http://lemire.me/blog/archives/2008/08/20/the-mythical-bitmap-index/). We
include in `lib` the amalgamated (git `2dc8070`) and well-known
[Roaring Bitmaps](https://github.com/RoaringBitmap/CRoaring/tree/master) and
use it in the soak test to ensure our results are as accurate as theirs.
This library was originally created for [hamsterdb](http://hamsterdb.com) in
C++ and then translated to C and further improved by Greg Burd <greg@burd.me>

View file

@ -1,11 +1,10 @@
#include <assert.h>
#include <common.h>
#include <sparsemap.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include "../include/sparsemap.h"
#include "../tests/common.h"
int
main(void)
{

View file

@ -1,11 +1,10 @@
#include <assert.h>
#include <common.h>
#include <sparsemap.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "../include/sparsemap.h"
#include "../tests/common.h"
#define TEST_ARRAY_SIZE 1024
int

View file

@ -5,7 +5,6 @@
# nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
nixpkgs.url = "github:NixOS/nixpkgs/23.11";
utils.url = "github:numtide/flake-utils";
utils.inputs.nixpkgs.follows = "nixpkgs";
};
outputs = { self, nixpkgs, ... }
@ -18,38 +17,39 @@
config.allowUnfree = true;
};
in {
devShell = pkgs.mkShell rec {
name = "sparsemap";
packages = with pkgs; [
act
autoconf
clang
ed
gcc
gdb
gettext
graphviz-nox
libtool
m4
perl
pkg-config
python3
ripgrep
valgrind
];
flake-utils.inputs.systems.follows = "system";
devShell = pkgs.mkShell rec {
name = "sparsemap";
packages = with pkgs; [
act
autoconf
clang
ed
gcc
gdb
gettext
graphviz-nox
libtool
m4
perl
pkg-config
python3
ripgrep
valgrind
];
buildInputs = with pkgs; [
libbacktrace
glibc.out
glibc.static
];
buildInputs = with pkgs; [
libbacktrace
glibc.out
glibc.static
];
shellHook = let
icon = "f121";
in ''
shellHook = let
icon = "f121";
in ''
export PS1="$(echo -e '\u${icon}') {\[$(tput sgr0)\]\[\033[38;5;228m\]\w\[$(tput sgr0)\]\[\033[38;5;15m\]} (${name}) \\$ \[$(tput sgr0)\]"
'';
};
DOCKER_BUILDKIT = 1;
});
};
DOCKER_BUILDKIT = 1;
});
}

2908
include/roaring.h Normal file

File diff suppressed because it is too large Load diff

25883
lib/roaring.c Normal file

File diff suppressed because it is too large Load diff

View file

@ -1270,6 +1270,7 @@ sparsemap_merge(sparsemap_t *map, sparsemap_t *other)
uint8_t *src, *dst;
size_t src_count = __sm_get_chunk_map_count(other), dst_count = __sm_get_chunk_map_count(map), max_chunk_count = src_count + dst_count;
// TODO: ensure there is space, or ENOSPC
dst = __sm_get_chunk_map_data(map, 0);
src = __sm_get_chunk_map_data(other, 0);
for (size_t i = 0; i < max_chunk_count && src_count; i++) {

View file

@ -6,9 +6,10 @@
#include <stdlib.h>
#include <string.h>
#include "../include/common.h"
#include "../include/roaring.h"
#include "../include/sparsemap.h"
#include "../tests/common.h"
#include "../tests/tdigest.h"
#include "../include/tdigest.h"
/* midl.h ------------------------------------------------------------------ */
/** @defgroup idls ID List Management
@ -511,6 +512,17 @@ verify_empty_midl(MDB_IDL list, pgno_t pg, unsigned len)
return true;
}
/*
 * Check that every page in the half-open range [pg, pg + len) is present in
 * the Roaring bitmap `rbm`.
 *
 * Returns false as soon as one page in the range is not contained in the
 * bitmap, and true when all of them are (which includes the case len == 0,
 * where nothing is examined).
 */
bool
verify_span_roaring(roaring_bitmap_t *rbm, pgno_t pg, unsigned len)
{
  pgno_t cur = pg;

  while (cur < pg + len) {
    if (!roaring_bitmap_contains(rbm, cur)) {
      return false;
    }
    cur++;
  }
  return true;
}
bool
verify_span_sparsemap(sparsemap_t *map, pgno_t pg, unsigned len)
{
@ -533,6 +545,17 @@ verify_empty_sparsemap(sparsemap_t *map, pgno_t pg, unsigned len)
return true;
}
/*
 * Check that none of the `len` pages starting at `pg` are present in the
 * Roaring bitmap `rbm`.
 *
 * Returns false at the first page in the span that the bitmap contains,
 * and true when the whole span is absent (or when len == 0).
 */
bool
verify_empty_roaring(roaring_bitmap_t *rbm, pgno_t pg, unsigned len)
{
  pgno_t off = 0;

  while (off < len) {
    if (roaring_bitmap_contains(rbm, pg + off)) {
      return false;
    }
    off++;
  }
  return true;
}
bool
verify_sm_is_first_available_span(sparsemap_t *map, sparsemap_idx_t idx, size_t len, bool value)
{
@ -548,6 +571,23 @@ verify_sm_is_first_available_span(sparsemap_t *map, sparsemap_idx_t idx, size_t
return false;
}
/*
 * Cross-check a sparsemap against a Roaring bitmap.
 *
 * Every index from 0 through the largest value reported by
 * roaring_bitmap_maximum() is visited.  An index equal to the iterator's
 * current value must be set in `map`; any other index must be unset.
 * Indices above that maximum are not examined.
 *
 * Mismatches are reported through assert(), not through the return value:
 * the function returns true whenever it returns at all.
 *
 * NOTE(review): this relies on the Roaring iterator producing values in
 * ascending order -- confirm against the bundled CRoaring version.
 */
bool
verify_sm_eq_rm(sparsemap_t *map, roaring_bitmap_t *rbm)
{
  roaring_uint32_iterator_t iter;
  const uint64_t last = roaring_bitmap_maximum(rbm);
  uint64_t i = 0;

  roaring_iterator_init(rbm, &iter);
  while (i <= last) {
    if (i != iter.current_value) {
      /* Not the next member of the Roaring bitmap: must be clear. */
      assert(sparsemap_is_set(map, i) == false);
    } else {
      /* Member of the Roaring bitmap: must be set, then move on. */
      assert(sparsemap_is_set(map, i) == true);
      roaring_uint32_iterator_advance(&iter);
    }
    i++;
  }
  return true;
}
bool
verify_sm_eq_ml(sparsemap_t *map, MDB_IDL list)
{
@ -637,7 +677,6 @@ int
main(void)
{
size_t replenish = 0, iterations = 0;
bool prefer_mdb_idl_location = (bool)xorshift32() % 2;
// disable buffering
#ifdef DEBUG
@ -657,6 +696,7 @@ main(void)
sparsemap_idx_t amt = INITIAL_AMOUNT;
MDB_IDL list = mdb_midl_alloc(amt);
sparsemap_t *map = sparsemap(INITIAL_AMOUNT);
roaring_bitmap_t *rbm = roaring_bitmap_create();
// start with 2GiB of 4KiB free pages to track:
// - MDB_IDL requires one int for each free page
@ -665,16 +705,19 @@ main(void)
for (sparsemap_idx_t pg = 0; pg < amt; pg++) {
// We list every free (unallocated) page in the IDL, while...
mdb_midl_xappend(list, pg);
// ... true (unset in the bitmap) indicates free in the bitmap.
// ... true (unset in the bitmap) indicates free in the bitmap, ...
assert(_sparsemap_set(&map, pg, true) == pg);
assert(roaring_bitmap_add_checked(rbm, pg));
}
mdb_midl_sort(list);
roaring_bitmap_run_optimize(rbm);
assert(verify_sm_eq_ml(map, list));
assert(verify_sm_eq_rm(map, rbm));
double b, e;
while (1) {
unsigned mi;
pgno_t ml, sl;
pgno_t ml, sl, rl;
// get an amount [1, 16] of pages to find preferring smaller sizes
unsigned n = toss(15) + 1;
@ -707,6 +750,7 @@ main(void)
}
assert(verify_span_midl(list, ml, n));
assert(verify_span_sparsemap(map, ml, n));
assert(verify_span_roaring(rbm, ml, n));
// find a set of pages using the Sparsemap
{
@ -720,9 +764,30 @@ main(void)
}
assert(verify_span_midl(list, sl, n));
assert(verify_span_sparsemap(map, sl, n));
assert(verify_span_roaring(rbm, sl, n));
// find a set of pages using the Roaring Bitmap
{
b = nsts();
uint64_t max = roaring_bitmap_maximum(rbm);
uint64_t offset = roaring_bitmap_minimum(rbm);
do {
if (n == 1 || roaring_bitmap_range_cardinality(rbm, offset, offset + n) == n) {
break;
}
offset++;
} while (offset <= max);
rl = offset;
e = nsts();
}
assert(verify_span_midl(list, rl, n));
assert(verify_span_sparsemap(map, rl, n));
assert(verify_span_roaring(rbm, rl, n));
bool prefer_mdb_idl_loc = (bool)xorshift32() % 2;
// acquire the set of pages within the list
if (prefer_mdb_idl_location) {
if (prefer_mdb_idl_loc) {
b = nsts();
unsigned j, num = n;
int i = mi;
@ -755,7 +820,7 @@ main(void)
}
// acquire the set of pages within the sparsemap
if (prefer_mdb_idl_location) {
if (prefer_mdb_idl_loc) {
b = nsts();
for (pgno_t i = ml; i < ml + n; i++) {
assert(_sparsemap_set(&map, i, false) == i);
@ -771,7 +836,20 @@ main(void)
td_add(b_span_take, e - b, 1);
}
// acquire the set of pages within the roaring bitmap
if (prefer_mdb_idl_loc) {
b = nsts();
roaring_bitmap_remove_range(rbm, ml, ml + n);
e = nsts();
} else {
b = nsts();
roaring_bitmap_remove_range(rbm, sl, sl + n);
e = nsts();
}
roaring_bitmap_run_optimize(rbm);
assert(verify_sm_eq_ml(map, list));
assert(verify_sm_eq_rm(map, rbm));
// Once we've used a tenth of the free list, let's replenish it a bit.
if (list[0] < amt / 10) {
@ -790,7 +868,9 @@ main(void)
if (SPARSEMAP_FOUND(pgno)) {
assert(verify_empty_midl(list, pgno, len));
assert(verify_empty_sparsemap(map, pgno, len));
assert(verify_empty_roaring(rbm, pgno, len));
assert(verify_sm_eq_ml(map, list));
assert(verify_sm_eq_rm(map, rbm));
if (list[-1] - list[0] < len) {
mdb_midl_need(&list, list[-1] + len);
}
@ -801,13 +881,16 @@ main(void)
assert(verify_midl_contains(list, i) == true);
assert(_sparsemap_set(&map, i, true) == i);
assert(sparsemap_is_set(map, i) == true);
assert(roaring_bitmap_add_checked(rbm, i) == true);
}
mdb_midl_sort(list);
assert(verify_midl_nodups(list));
assert(verify_span_midl(list, pgno, len));
assert(verify_span_sparsemap(map, pgno, len));
assert(verify_span_roaring(rbm, pgno, len));
}
assert(verify_sm_eq_ml(map, list));
assert(verify_sm_eq_rm(map, rbm));
replenish++;
} while (list[0] < amt - 32);
}
@ -821,10 +904,10 @@ main(void)
size_t len = COUNT;
// The largest page is at list[1] because this is a reverse sorted list.
pgno_t pg = list[0] ? list[1] + 1 : 0;
// if (toss(6) + 1 < 7) {
if (true) { // disable shrinking for now...
if (true) { // disable shrinking for now... (toss(6) + 1 < 7)
MDB_IDL new_list = mdb_midl_alloc(len);
sparsemap_t *new_map = sparsemap(INITIAL_AMOUNT);
roaring_bitmap_t *new_rbm = roaring_bitmap_create();
for (size_t i = 0; i < len; i++) {
pgno_t gp = (pg + len) - i;
new_list[i + 1] = gp;
@ -832,8 +915,11 @@ main(void)
assert(verify_midl_contains(new_list, gp) == true);
assert(_sparsemap_set(&new_map, gp, true) == gp);
assert(sparsemap_is_set(new_map, gp));
assert(roaring_bitmap_add_checked(new_rbm, gp));
assert(roaring_bitmap_contains(new_rbm, gp));
}
assert(verify_sm_eq_ml(new_map, new_list));
assert(verify_sm_eq_rm(new_map, new_rbm));
{
b = nsts();
mdb_midl_append_list(&list, new_list);
@ -856,19 +942,37 @@ main(void)
assert(sparsemap_is_set(map, gp));
}
free(new_map);
{
b = nsts();
roaring_bitmap_or_inplace(rbm, new_rbm);
e = nsts();
}
for (size_t i = 0; i < len; i++) {
pgno_t gp = (pg + len) - i;
assert(roaring_bitmap_contains(rbm, gp));
}
roaring_free(new_rbm);
} else {
if (list[-1] > INITIAL_AMOUNT) {
// ... a fraction of the time, remove COUNT / 2 of 4KiB pages.
pgno_t pg;
for (size_t i = 0; i < COUNT; i++) {
pg = list[list[0] - i];
assert(sparsemap_is_set(map, pg) == true);
assert(_sparsemap_set(&map, pg, false) == pg);
{
pgno_t pg;
for (size_t i = 0; i < COUNT; i++) {
pg = list[list[0] - i];
assert(sparsemap_is_set(map, pg) == true);
assert(_sparsemap_set(&map, pg, false) == pg);
}
}
{
roaring_bitmap_remove_range_closed(rbm, list[list[0] - COUNT], list[list[0]]);
}
{
mdb_midl_shrink_to(&list, list[0] - COUNT);
}
mdb_midl_shrink_to(&list, list[0] - COUNT);
assert(list[list[0]] != pg);
assert(verify_midl_nodups(list));
verify_sm_eq_ml(map, list);
verify_sm_eq_rm(map, rbm);
}
}
}