select/rank for unset as well as set bits (#4)

Reviewed-on: #4
Co-authored-by: Greg Burd <greg@burd.me>
Co-committed-by: Greg Burd <greg@burd.me>
This commit is contained in:
Gregory Burd 2024-04-24 20:32:09 +00:00 committed by Gregory Burd
parent 5d5c7f1584
commit b3dfd745e7
16 changed files with 2316 additions and 594 deletions

2
.envrc
View file

@ -1,5 +1,5 @@
if ! has nix_direnv_version || ! nix_direnv_version 3.0.4; then if ! has nix_direnv_version || ! nix_direnv_version 3.0.4; then
source_url "https://raw.githubusercontent.com/nix-community/nix-direnv/3.0.4/direnvrc" "sha256-DzlYZ33mWF/Gs8DDeyjr8mnVmQGx7ASYqA5WlxwvBG4=" source_url "https://raw.githubusercontent.com/nix-community/nix-direnv/3.0.4/direnvrc" "sha256-DzlYZ33mWF/Gs8DDeyjr8mnVmQGx7ASYqA5WlxwvBG4="
fi fi
watch_file devShell.nix shell.nix flake.nix watch_file shell.nix flake.nix
use flake || use nix use flake || use nix

View file

@ -5,14 +5,15 @@ SHARED_LIB = libsparsemap.so
#CFLAGS = -Wall -Wextra -Wpedantic -Of -std=c11 -Iinclude/ -fPIC #CFLAGS = -Wall -Wextra -Wpedantic -Of -std=c11 -Iinclude/ -fPIC
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC #CFLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DSPARSEMAP_ASSERT -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC #CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -fsanitize=all -fhardened -std=c11 -Iinclude/ -fPIC #CFLAGS = -Wall -Wextra -Wpedantic -Og -g -fsanitize=all -fhardened -std=c11 -Iinclude/ -fPIC
TEST_FLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -Itests/ -fPIC TEST_FLAGS = -DDEBUG -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -Itests/ -fPIC
#TEST_FLAGS = -DDEBUG -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC
TESTS = tests/test TESTS = tests/test
TEST_OBJS = tests/test.o tests/munit.o tests/common.o TEST_OBJS = tests/test.o tests/munit.o tests/tdigest.o tests/common.o
EXAMPLES = examples/ex_1 examples/ex_2 examples/ex_3 examples/ex_4 EXAMPLES = examples/ex_1 examples/ex_2 examples/ex_3 examples/ex_4
.PHONY: all shared static clean test examples mls .PHONY: all shared static clean test examples mls
@ -39,7 +40,7 @@ check: test
env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/test env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/test
tests/test: $(TEST_OBJS) $(STATIC_LIB) tests/test: $(TEST_OBJS) $(STATIC_LIB)
$(CC) $^ -o $@ $(TEST_FLAGS) $(CC) $^ -lm -o $@ $(TEST_FLAGS)
clean: clean:
rm -f $(OBJS) rm -f $(OBJS)
@ -76,4 +77,7 @@ examples/ex_3: examples/common.o examples/ex_3.o $(STATIC_LIB)
examples/ex_4: examples/common.o examples/ex_4.o $(STATIC_LIB) examples/ex_4: examples/common.o examples/ex_4.o $(STATIC_LIB)
$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS) $(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS)
todo:
rg -i 'todo|gsb|abort'
# cp src/sparsemap.c /tmp && clang-tidy src/sparsemap.c -fix -fix-errors -checks="readability-braces-around-statements" -- -DDEBUG -DSPARSEMAP_DIAGNOSTIC -DSPARSEMAP_ASSERT -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC # cp src/sparsemap.c /tmp && clang-tidy src/sparsemap.c -fix -fix-errors -checks="readability-braces-around-statements" -- -DDEBUG -DSPARSEMAP_DIAGNOSTIC -DSPARSEMAP_ASSERT -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC

View file

@ -60,5 +60,6 @@ to an uncompressed bit vector (sometimes higher due to the bytes required for
metadata). In such cases, other compression schemes are more efficient (i.e. metadata). In such cases, other compression schemes are more efficient (i.e.
http://lemire.me/blog/archives/2008/08/20/the-mythical-bitmap-index/). http://lemire.me/blog/archives/2008/08/20/the-mythical-bitmap-index/).
This library was originally created for hamsterdb [http://hamsterdb.com] in This library was originally created for [hamsterdb](http://hamsterdb.com) in
C++ and then translated to C99 code by Greg Burd <greg@burd.me>. C++ and then translated to C and further improved by Greg Burd <greg@burd.me>
for use in LMDB and OpenLDAP.

View file

@ -1,4 +1,7 @@
#include <assert.h> #include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include "../include/sparsemap.h" #include "../include/sparsemap.h"
@ -14,16 +17,18 @@
/* !!! Duplicated here for testing purposes. Keep in sync, or suffer. !!! */ /* !!! Duplicated here for testing purposes. Keep in sync, or suffer. !!! */
struct sparsemap { struct sparsemap {
uint8_t *m_data;
size_t m_capacity; size_t m_capacity;
size_t m_data_used; size_t m_data_used;
uint8_t *m_data;
}; };
int int
main() main()
{ {
size_t size = 4; size_t size = 4;
setbuf(stderr, 0); // disable buffering setvbuf(stdout, NULL, _IONBF, 0); // Disable buffering for stdout
setvbuf(stderr, NULL, _IONBF, 0); // Disable buffering for stdout
__diag("Please wait a moment..."); __diag("Please wait a moment...");
sparsemap_t mmap, *map = &mmap; sparsemap_t mmap, *map = &mmap;
uint8_t buffer[1024]; uint8_t buffer[1024];
@ -135,7 +140,7 @@ main()
sparsemap_set(map, i, true); sparsemap_set(map, i, true);
} }
for (int i = 0; i < 100000; i++) { for (int i = 0; i < 100000; i++) {
assert(sparsemap_select(map, i) == (unsigned)i); assert(sparsemap_select(map, i, true) == (unsigned)i);
} }
sparsemap_clear(map); sparsemap_clear(map);
@ -145,7 +150,7 @@ main()
sparsemap_set(map, i, true); sparsemap_set(map, i, true);
} }
for (int i = 1; i < 513; i++) { for (int i = 1; i < 513; i++) {
assert(sparsemap_select(map, i - 1) == (unsigned)i); assert(sparsemap_select(map, i - 1, true) == (unsigned)i);
} }
sparsemap_clear(map); sparsemap_clear(map);
@ -155,7 +160,7 @@ main()
sparsemap_set(map, i * 10, true); sparsemap_set(map, i * 10, true);
} }
for (size_t i = 0; i < 8; i++) { for (size_t i = 0; i < 8; i++) {
assert(sparsemap_select(map, i) == i * 10); assert(sparsemap_select(map, i, true) == (sparsemap_idx_t)i * 10);
} }
// split and move, aligned to MiniMap capacity // split and move, aligned to MiniMap capacity

View file

@ -1,9 +1,7 @@
#include <assert.h> #include <assert.h>
#include <stdarg.h> #include <stdbool.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include "../include/sparsemap.h" #include "../include/sparsemap.h"
@ -16,29 +14,28 @@
} while (0) } while (0)
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
#define SEED
int int
main(void) main(void)
{ {
int i = 0; int i;
// disable buffering // disable buffering
setbuf(stderr, 0); setvbuf(stdout, NULL, _IONBF, 0); // Disable buffering for stdout
setvbuf(stderr, NULL, _IONBF, 0); // Disable buffering for stdout
// start with a 1KiB buffer, 1024 bits // start with a 1KiB buffer, 1024 bits
uint8_t *buf = calloc(1024, sizeof(uint8_t)); uint8_t *buf = calloc(1024, sizeof(uint8_t));
// create the sparse bitmap // create the sparse bitmap
sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 1024); sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 1024);
// Set every other bit (pathologically worst case) to see what happens // Set every other bit (pathologically worst case) to see what happens
// when the map is full. // when the map is full.
for (i = 0; i < 7744; i++) { for (i = 0; i < 7744; i++) {
if (i % 2) if (!i % 2) {
continue; sparsemap_set(map, i, true);
sparsemap_set(map, i, true); assert(sparsemap_is_set(map, i) == true);
assert(sparsemap_is_set(map, i) == true); }
} }
// On 1024 KiB of buffer with every other bit set the map holds 7744 bits // On 1024 KiB of buffer with every other bit set the map holds 7744 bits
// and then runs out of space. This next _set() call will fail/abort. // and then runs out of space. This next _set() call will fail/abort.

View file

@ -1,9 +1,7 @@
#include <assert.h> #include <assert.h>
#include <stdarg.h> #include <stdbool.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include "../include/sparsemap.h" #include "../include/sparsemap.h"
#include "../tests/common.h" #include "../tests/common.h"
@ -11,7 +9,7 @@
int int
main(void) main(void)
{ {
int i = 0; int i;
int array[1024] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, int array[1024] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
@ -60,7 +58,7 @@ main(void)
uint8_t *buf = calloc(1024, sizeof(uint8_t)); uint8_t *buf = calloc(1024, sizeof(uint8_t));
// create the sparse bitmap // create the sparse bitmap
sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 1024); sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 1024);
// set all the bits on in a random order // set all the bits on in a random order
for (i = 0; i < 1024; i++) { for (i = 0; i < 1024; i++) {

View file

@ -24,7 +24,7 @@ main(void)
uint8_t *buf = calloc((size_t)3 * 1024, sizeof(uint8_t)); uint8_t *buf = calloc((size_t)3 * 1024, sizeof(uint8_t));
// create the sparse bitmap // create the sparse bitmap
sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 3 * 1024); sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 3 * 1024);
// create an array of ints // create an array of ints
setup_test_array(array, TEST_ARRAY_SIZE, 1024 * 3); setup_test_array(array, TEST_ARRAY_SIZE, 1024 * 3);
@ -60,7 +60,7 @@ main(void)
assert(sparsemap_is_set(map, array[i]) == true); assert(sparsemap_is_set(map, array[i]) == true);
} }
has_span(map, array, TEST_ARRAY_SIZE, (int)len); has_span(map, array, TEST_ARRAY_SIZE, (int)len);
size_t l = sparsemap_span(map, 0, len); size_t l = sparsemap_span(map, 0, len, true);
if (l != (size_t)-1) { if (l != (size_t)-1) {
__diag("Found span in map starting at %lu of length %lu\n", l, len); __diag("Found span in map starting at %lu of length %lu\n", l, len);
__diag("is_span(%lu, %lu) == %s\n", l, len, is_span(array, TEST_ARRAY_SIZE, l, len) ? "yes" : "no"); __diag("is_span(%lu, %lu) == %s\n", l, len, is_span(array, TEST_ARRAY_SIZE, l, len) ? "yes" : "no");

View file

@ -1,43 +1,25 @@
{ {
"nodes": { "nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1710146030,
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1712192574, "lastModified": 1701282334,
"narHash": "sha256-LbbVOliJKTF4Zl2b9salumvdMXuQBr2kuKP5+ZwbYq4=", "narHash": "sha256-MxCVrXY6v4QmfTwIysjjaX0XUhqBbxTWWB4HXtDYsdk=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "f480f9d09e4b4cf87ee6151eba068197125714de", "rev": "057f9aecfb71c4437d2b27d3323df7f93c010b7e",
"type": "github" "type": "github"
}, },
"original": { "original": {
"owner": "NixOS", "owner": "NixOS",
"ref": "nixpkgs-unstable", "ref": "23.11",
"repo": "nixpkgs", "repo": "nixpkgs",
"type": "github" "type": "github"
} }
}, },
"root": { "root": {
"inputs": { "inputs": {
"flake-utils": "flake-utils", "nixpkgs": "nixpkgs",
"nixpkgs": "nixpkgs" "utils": "utils"
} }
}, },
"systems": { "systems": {
@ -54,6 +36,24 @@
"repo": "default", "repo": "default",
"type": "github" "type": "github"
} }
},
"utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1710146030,
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
} }
}, },
"root": "root", "root": "root",

100
flake.nix
View file

@ -1,59 +1,55 @@
{ {
description = "A Concurrent Skip List library for key/value pairs."; description = "A sparse bitmapped index library in C.";
inputs = { inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; # nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
flake-utils.url = "github:numtide/flake-utils"; nixpkgs.url = "github:NixOS/nixpkgs/23.11";
utils.url = "github:numtide/flake-utils";
utils.inputs.nixpkgs.follows = "nixpkgs";
}; };
outputs = outputs = { self, nixpkgs, ... }
{ self @inputs: inputs.utils.lib.eachSystem [
, nixpkgs "x86_64-linux" "i686-linux" "aarch64-linux" "x86_64-darwin"
, flake-utils ] (system:
, ... let pkgs = import nixpkgs {
}: inherit system;
flake-utils.lib.eachDefaultSystem (system: overlays = [];
let config.allowUnfree = true;
pkgs = import nixpkgs {
inherit system;
config = { allowUnfree = true; };
};
supportedSystems = [ "x86_64-linux" ];
forAllSystems = nixpkgs.lib.genAttrs supportedSystems;
nixpkgsFor = forAllSystems (system: import nixpkgs {
inherit system;
overlays = [ self.overlay ];
});
in {
pkgs = import nixpkgs {
inherit system;
devShell = nixpkgs.legacyPackages.${system} {
pkgs.mkShell = {
nativeBuildInputs = with pkgs.buildPackages; [
act
autoconf
clang
ed
gcc
gdb
gettext
graphviz-nox
libtool
m4
perl
pkg-config
python3
ripgrep
valgrind
];
buildInputs = with pkgs; [
libbacktrace
glibc.out
glibc.static
];
};
DOCKER_BUILDKIT = 1;
}; };
}; in {
}); devShell = pkgs.mkShell rec {
name = "sparsemap";
packages = with pkgs; [
act
autoconf
clang
ed
gcc
gdb
gettext
graphviz-nox
libtool
m4
perl
pkg-config
python3
ripgrep
valgrind
];
buildInputs = with pkgs; [
libbacktrace
glibc.out
glibc.static
];
shellHook = let
icon = "f121";
in ''
export PS1="$(echo -e '\u${icon}') {\[$(tput sgr0)\]\[\033[38;5;228m\]\w\[$(tput sgr0)\]\[\033[38;5;15m\]} (${name}) \\$ \[$(tput sgr0)\]"
'';
};
DOCKER_BUILDKIT = 1;
});
} }

View file

@ -69,14 +69,14 @@
#ifndef SPARSEMAP_H #ifndef SPARSEMAP_H
#define SPARSEMAP_H #define SPARSEMAP_H
#include <sys/types.h>
#include <assert.h>
#include <limits.h> #include <limits.h>
#include <stdbool.h> #include <stdbool.h>
#include <stddef.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h>
#include <string.h> #if defined(__cplusplus)
extern "C" {
#endif
/* /*
* The public interface for a sparse bit-mapped index, a "sparse map". * The public interface for a sparse bit-mapped index, a "sparse map".
@ -88,55 +88,119 @@
*/ */
typedef struct sparsemap sparsemap_t; typedef struct sparsemap sparsemap_t;
typedef long int sparsemap_idx_t;
#define SPARSEMAP_IDX_MAX ((1UL << (sizeof(long) * CHAR_BIT - 1)) - 1)
#define SPARSEMAP_IDX_MIN (-(SPARSEMAP_IDX_MAX)-1)
#define SPARSEMAP_NOT_FOUND(_x) ((_x) == SPARSEMAP_IDX_MAX || (_x) == SPARSEMAP_IDX_MIN)
typedef uint32_t sm_idx_t; typedef uint32_t sm_idx_t;
typedef uint64_t sm_bitvec_t; typedef uint64_t sm_bitvec_t;
/* Allocate on a sparsemap_t on the heap and initialize it. */ /**
sparsemap_t *sparsemap(uint8_t *data, size_t size); * Create a new, empty sparsemap_t with a buffer of |size|.
* Default when set to 0 is 1024.
*/
sparsemap_t *sparsemap(size_t size);
/* Initialize sparsemap_t with data. */ /**
* Allocate on a sparsemap_t on the heap to wrap the provided fixed-size
* buffer (heap or stack allocated).
*/
sparsemap_t *sparsemap_wrap(uint8_t *data, size_t size);
/**
* Initialize a (possibly stack allocated) sparsemap_t with data (potentially
* also on the stack).
*/
void sparsemap_init(sparsemap_t *map, uint8_t *data, size_t size); void sparsemap_init(sparsemap_t *map, uint8_t *data, size_t size);
/* Clears the whole buffer. */ /**
void sparsemap_clear(sparsemap_t *map); * Opens an existing sparsemap contained within the specified buffer.
*/
/* Opens an existing sparsemap at the specified buffer. */
void sparsemap_open(sparsemap_t *, uint8_t *data, size_t data_size); void sparsemap_open(sparsemap_t *, uint8_t *data, size_t data_size);
/* Resizes the data range. */ /**
void sparsemap_set_data_size(sparsemap_t *map, size_t data_size); * Resets values and empties the buffer making it ready to accept new data.
*/
void sparsemap_clear(sparsemap_t *map);
/* Calculate remaining capacity, full when 0. */ /**
* Resizes the data range within the limits of the provided buffer, the map may
* move to a new address returned iff the map was created with the sparsemap() API.
* Take care to use the new reference (think: realloc()). NOTE: If the returned
* value equals NULL then the map was not resized.
*/
sparsemap_t *sparsemap_set_data_size(sparsemap_t *map, size_t data_size);
/**
* Calculate remaining capacity, approaches 0 when full.
*/
double sparsemap_capacity_remaining(sparsemap_t *map); double sparsemap_capacity_remaining(sparsemap_t *map);
/* Returns the size of the underlying byte array. */ /**
* Returns the capacity of the underlying byte array.
*/
size_t sparsemap_get_capacity(sparsemap_t *map); size_t sparsemap_get_capacity(sparsemap_t *map);
/* Returns the value of a bit at index |idx|. */ /**
bool sparsemap_is_set(sparsemap_t *map, size_t idx); * Returns the value of a bit at index |idx|, either on/true/1 or off/false/0.
* When |idx| is negative it is an error.
*/
bool sparsemap_is_set(sparsemap_t *map, sparsemap_idx_t idx);
/* Sets the bit at index |idx| to true or false, depending on |value|. */ /**
void sparsemap_set(sparsemap_t *map, size_t idx, bool value); * Sets the bit at index |idx| to true or false, depending on |value|.
* When |idx| is negative is it an error. Returns the |idx| supplied or
* SPARSEMAP_IDX_MAX on error with |errno| set to ENOSP when the map is full.
*/
sparsemap_idx_t sparsemap_set(sparsemap_t *map, sparsemap_idx_t idx, bool value);
/* Returns the offset of the very first bit. */ /**
sm_idx_t sparsemap_get_start_offset(sparsemap_t *map); * Returns the offset of the very first/last bit in the map.
*/
sm_idx_t sparsemap_get_starting_offset(sparsemap_t *map);
/* Returns the used size in the data buffer. */ /**
* Returns the used size in the data buffer in bytes.
*/
size_t sparsemap_get_size(sparsemap_t *map); size_t sparsemap_get_size(sparsemap_t *map);
/* Decompresses the whole bitmap; calls scanner for all bits. */ /**
void sparsemap_scan(sparsemap_t *map, void (*scanner)(sm_idx_t[], size_t), size_t skip); * Decompresses the whole bitmap; calls scanner for all bits with a set of
* |n| vectors |vec| each a sm_bitmap_t which can be masked and read using
* bit operators to read the values for each position in the bitmap index.
* Setting |skip| will start the scan after "skip" bits.
*/
void sparsemap_scan(sparsemap_t *map, void (*scanner)(sm_idx_t vec[], size_t n), size_t skip);
/* Appends all chunk maps from |map| starting at |sstart| to |other|, then /**
reduces the chunk map-count appropriately. */ * Appends all chunk maps from |map| starting at |offset| to |other|, then
void sparsemap_split(sparsemap_t *map, size_t sstart, sparsemap_t *other); * reduces the chunk map-count appropriately.
*/
void sparsemap_split(sparsemap_t *map, sparsemap_idx_t offset, sparsemap_t *other);
/* Returns the index of the n'th set bit; uses a 0-based index. */ /**
size_t sparsemap_select(sparsemap_t *map, size_t n); * Finds the offset of the n'th bit either set (|value| is true) or unset
* (|value| is false) from the start (positive |n|), or end (negative |n|),
* of the bitmap and returns that (uses a 0-based index). Returns -inf or +inf
* if not found (where "inf" is SPARSEMAP_IDX_MAX and "-inf" is SPARSEMAP_IDX_MIN).
*/
sparsemap_idx_t sparsemap_select(sparsemap_t *map, sparsemap_idx_t n, bool value);
/* Counts the set bits in the range [offset, idx]. */ /**
size_t sparsemap_rank(sparsemap_t *map, size_t offset, size_t idx); * Counts the set (|value| is true) or unset (|value| is false) bits starting
* at |x| bits (0-based) in the range [x, y] (inclusive on either end).
*/
size_t sparsemap_rank(sparsemap_t *map, size_t x, size_t y, bool value);
size_t sparsemap_span(sparsemap_t *map, size_t loc, size_t len); /**
* Finds the first span (i.e. a contiguous set of bits), in the bitmap that
* are set (|value| is true) or unset (|value| is false) and returns the
* starting offset for the span (0-based).
*/
size_t sparsemap_span(sparsemap_t *map, sparsemap_idx_t idx, size_t len, bool value);
#if defined(__cplusplus)
}
#endif #endif
#endif /* !defined(SPARSEMAP_H) */

File diff suppressed because it is too large Load diff

View file

@ -1,13 +1,20 @@
#include <sys/types.h> #define _POSIX_C_SOURCE 199309L
#define X86_INTRIN
#include <assert.h> #include <assert.h>
#include <pthread.h> #include <pthread.h> // If using threads
#include <sparsemap.h>
#include <stdbool.h> #include <stdbool.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h> #include <unistd.h>
#ifdef X86_INTRIN
#include <errno.h>
#include <x86intrin.h>
#endif
#include "../include/sparsemap.h"
#include "common.h" #include "common.h"
#pragma GCC diagnostic push #pragma GCC diagnostic push
@ -22,84 +29,25 @@
uint64_t uint64_t
tsc(void) tsc(void)
{ {
#ifdef X86_INTRIN
return __rdtsc();
#else
uint32_t low, high; uint32_t low, high;
__asm__ volatile("rdtsc" : "=a"(low), "=d"(high)); __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
return ((uint64_t)high << 32) | low; return ((uint64_t)high << 32) | low;
} #endif
static uint64_t
get_tsc_frequency()
{
uint32_t high, low;
__asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
__asm__ volatile("rdtsc");
return ((uint64_t)high << 32) | low;
} }
double double
tsc_ticks_to_ns(uint64_t tsc_ticks) nsts()
{ {
static uint64_t tsc_freq = 0; struct timespec ts;
if (tsc_freq == 0) {
tsc_freq = get_tsc_frequency();
}
return (double)tsc_ticks / (double)tsc_freq * 1e9;
}
void if (clock_gettime(CLOCK_REALTIME, &ts) == -1) {
est_sift_up(uint64_t *heap, int child_index) perror("clock_gettime");
{ return -1.0; // Return -1.0 on error
while (child_index > 0) {
int parent_index = (child_index - 1) / 2;
if (heap[parent_index] > heap[child_index]) {
// Swap parent and child
uint64_t temp = heap[parent_index];
heap[parent_index] = heap[child_index];
heap[child_index] = temp;
child_index = parent_index;
} else {
break; // Heap property satisfied
}
}
}
void
est_sift_down(uint64_t *heap, int heap_size, int parent_index)
{
int child_index = 2 * parent_index + 1; // Left child
while (child_index < heap_size) {
// Right child exists and is smaller than left child
if (child_index + 1 < heap_size && heap[child_index + 1] < heap[child_index]) {
child_index++;
}
// If the smallest child is smaller than the parent, swap them
if (heap[child_index] < heap[parent_index]) {
uint64_t temp = heap[child_index];
heap[child_index] = heap[parent_index];
heap[parent_index] = temp;
parent_index = child_index;
child_index = 2 * parent_index + 1;
} else {
break; // Heap property satisfied
}
}
}
void
est_insert_value(uint64_t *heap, int heap_max_size, int *heap_size, uint64_t value)
{
if (*heap_size < heap_max_size) { // Heap not full, insert value
heap[*heap_size] = value;
est_sift_up(heap, *heap_size);
(*heap_size)++;
} else {
// Heap is full, replace root with new value with a certain probability
// This is a very naive approach to maintain a sample of the input
if (rand() % 2) {
heap[0] = value;
est_sift_down(heap, heap_max_size, 0);
}
} }
return ts.tv_sec + ts.tv_nsec / 1e9;
} }
int __xorshift32_state = 0; int __xorshift32_state = 0;
@ -170,7 +118,7 @@ has_sequential_set(int a[], int l, int r)
int int
ensure_sequential_set(int a[], int l, int r) ensure_sequential_set(int a[], int l, int r)
{ {
if (!a || l == 0 || r < 1 || r > l) { if (!a || l == 0 || r < 1 || r > l - 1) {
return 0; return 0;
} }
@ -199,21 +147,6 @@ ensure_sequential_set(int a[], int l, int r)
return value; return value;
} }
int
create_sequential_set_in_empty_map(sparsemap_t *map, int s, int r)
{
int placed_at;
if (s >= r + 1) {
placed_at = 0;
} else {
placed_at = random_uint32() % (s - r - 1);
}
for (int i = placed_at; i < placed_at + r; i++) {
sparsemap_set(map, i, true);
}
return placed_at;
}
void void
print_array(int *array, int l) print_array(int *array, int l)
{ {
@ -340,6 +273,20 @@ is_unique(int a[], int l, int value)
return 1; // Unique return 1; // Unique
} }
int
whats_set_uint64(uint64_t number, int pos[64])
{
int length = 0;
for (int i = 0; i < 64; i++) {
if (number & ((uint64_t)1 << i)) {
pos[length++] = i;
}
}
return length;
}
void void
setup_test_array(int a[], int l, int max_value) setup_test_array(int a[], int l, int max_value)
{ {
@ -364,15 +311,6 @@ bitmap_from_uint32(sparsemap_t *map, uint32_t number)
} }
} }
void
bitmap_from_uint64(sparsemap_t *map, uint64_t number)
{
for (int i = 0; i < 64; i++) {
bool bit = number & (1 << i);
sparsemap_set(map, i, bit);
}
}
uint32_t uint32_t
rank_uint64(uint64_t number, int n, int p) rank_uint64(uint64_t number, int n, int p)
{ {
@ -400,22 +338,53 @@ rank_uint64(uint64_t number, int n, int p)
return count; return count;
} }
int void
whats_set_uint64(uint64_t number, int pos[64]) print_bits(char *name, uint64_t value)
{ {
int length = 0; if (name) {
printf("%s\t", name);
for (int i = 0; i < 64; i++) { }
if (number & ((uint64_t)1 << i)) { for (int i = 63; i >= 0; i--) {
pos[length++] = i; printf("%ld", (value >> i) & 1);
if (i % 8 == 0) {
printf(" "); // Add space for better readability
} }
} }
printf("\n");
return length;
} }
void void
whats_set(sparsemap_t *map, int m) sm_bitmap_from_uint64(sparsemap_t *map, uint64_t number)
{
for (int i = 0; i < 64; i++) {
bool bit = number & ((uint64_t)1 << i);
sparsemap_set(map, i, bit);
}
}
sparsemap_idx_t
sm_add_span(sparsemap_t *map, int map_size, int span_length)
{
int attempts = map_size / span_length;
sparsemap_idx_t placed_at;
do {
placed_at = random_uint32() % (map_size - span_length - 1);
if (sm_occupied(map, placed_at, span_length, true)) {
attempts--;
} else {
break;
}
} while (attempts);
for (int i = placed_at; i < placed_at + span_length; i++) {
if (sparsemap_set(map, i, true) != i) {
return placed_at; // TODO error?
}
}
return placed_at;
}
void
sm_whats_set(sparsemap_t *map, int m)
{ {
logf("what's set in the range [0, %d): ", m); logf("what's set in the range [0, %d): ", m);
for (int i = 0; i < m; i++) { for (int i = 0; i < m; i++) {
@ -425,3 +394,25 @@ whats_set(sparsemap_t *map, int m)
} }
logf("\n"); logf("\n");
} }
bool
sm_is_span(sparsemap_t *map, sparsemap_idx_t m, int len, bool value)
{
for (sparsemap_idx_t i = m; i < m + len; i++) {
if (sparsemap_is_set(map, i) != value) {
return false;
}
}
return true;
}
bool
sm_occupied(sparsemap_t *map, sparsemap_idx_t m, int len, bool value)
{
for (sparsemap_idx_t i = m; i < (sparsemap_idx_t)len; i++) {
if (sparsemap_is_set(map, i) == value) {
return true;
}
}
return false;
}

View file

@ -23,20 +23,9 @@
#define XORSHIFT_SEED_VALUE ((unsigned int)time(NULL) ^ getpid()) #define XORSHIFT_SEED_VALUE ((unsigned int)time(NULL) ^ getpid())
#endif #endif
#define EST_MEDIAN_DECL(decl, size) \
uint64_t heap_##decl[size] = { 0 }; \
int heap_##decl##_max_size = size; \
int heap_##decl##_size = 0;
#define EST_MEDIAN_ADD(decl, value) est_insert_value(heap_##decl, heap_##decl##_max_size, &heap_##decl##_size, (value));
#define EST_MEDIAN_GET(decl) heap_##decl[0]
uint64_t tsc(void); uint64_t tsc(void);
double tsc_ticks_to_ns(uint64_t tsc_ticks); double tsc_ticks_to_ns(uint64_t tsc_ticks);
void est_sift_up(uint64_t *heap, int child_index); double nsts();
void est_sift_down(uint64_t *heap, int heap_size, int parent_index);
void est_insert_value(uint64_t *heap, int heap_max_size, int *heap_size, uint64_t value);
void xorshift32_seed(); void xorshift32_seed();
uint32_t xorshift32(); uint32_t xorshift32();
@ -52,11 +41,16 @@ int is_unique(int a[], int l, int value);
void setup_test_array(int a[], int l, int max_value); void setup_test_array(int a[], int l, int max_value);
void shuffle(int *array, size_t n); void shuffle(int *array, size_t n);
int ensure_sequential_set(int a[], int l, int r); int ensure_sequential_set(int a[], int l, int r);
int create_sequential_set_in_empty_map(sparsemap_t *map, int s, int r); sparsemap_idx_t sm_add_span(sparsemap_t *map, int map_size, int span_length);
void print_bits(char *name, uint64_t value);
void bitmap_from_uint32(sparsemap_t *map, uint32_t number); void bitmap_from_uint32(sparsemap_t *map, uint32_t number);
void bitmap_from_uint64(sparsemap_t *map, uint64_t number); void sm_bitmap_from_uint64(sparsemap_t *map, uint64_t number);
uint32_t rank_uint64(uint64_t number, int n, int p); uint32_t rank_uint64(uint64_t number, int n, int p);
int whats_set_uint64(uint64_t number, int bitPositions[64]); int whats_set_uint64(uint64_t number, int bitPositions[64]);
void whats_set(sparsemap_t *map, int m); void sm_whats_set(sparsemap_t *map, int m);
bool sm_is_span(sparsemap_t *map, sparsemap_idx_t m, int len, bool value);
bool sm_occupied(sparsemap_t *map, sparsemap_idx_t m, int len, bool value);

680
tests/tdigest.c Normal file
View file

@ -0,0 +1,680 @@
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <math.h>
#include "tdigest.h"
#include <errno.h>
#include <stdint.h>
#ifndef TD_MALLOC_INCLUDE
#define TD_MALLOC_INCLUDE "td_malloc.h"
#endif
#ifndef TD_ALLOC_H
#define TD_ALLOC_H
#define __td_malloc malloc
#define __td_calloc calloc
#define __td_realloc realloc
#define __td_free free
#endif
#define __td_max(x, y) (((x) > (y)) ? (x) : (y))
#define __td_min(x, y) (((x) < (y)) ? (x) : (y))
static inline double weighted_average_sorted(double x1, double w1, double x2, double w2) {
const double x = (x1 * w1 + x2 * w2) / (w1 + w2);
return __td_max(x1, __td_min(x, x2));
}
static inline bool _tdigest_long_long_add_safe(long long a, long long b) {
if (b < 0) {
return (a >= __LONG_LONG_MAX__ - b);
} else {
return (a <= __LONG_LONG_MAX__ - b);
}
}
static inline double weighted_average(double x1, double w1, double x2, double w2) {
if (x1 <= x2) {
return weighted_average_sorted(x1, w1, x2, w2);
} else {
return weighted_average_sorted(x2, w2, x1, w1);
}
}
static inline void swap(double *arr, int i, int j) {
const double temp = arr[i];
arr[i] = arr[j];
arr[j] = temp;
}
static inline void swap_l(long long *arr, int i, int j) {
const long long temp = arr[i];
arr[i] = arr[j];
arr[j] = temp;
}
static unsigned int partition(double *means, long long *weights, unsigned int start,
unsigned int end, unsigned int pivot_idx) {
const double pivotMean = means[pivot_idx];
swap(means, pivot_idx, end);
swap_l(weights, pivot_idx, end);
int i = start - 1;
for (unsigned int j = start; j < end; j++) {
// If current element is smaller than the pivot
if (means[j] < pivotMean) {
// increment index of smaller element
i++;
swap(means, i, j);
swap_l(weights, i, j);
}
}
swap(means, i + 1, end);
swap_l(weights, i + 1, end);
return i + 1;
}
/**
* Standard quick sort except that sorting rearranges parallel arrays
*
* @param means Values to sort on
* @param weights The auxillary values to sort.
* @param start The beginning of the values to sort
* @param end The value after the last value to sort
*/
static void td_qsort(double *means, long long *weights, unsigned int start, unsigned int end) {
if (start < end) {
// two elements can be directly compared
if ((end - start) == 1) {
if (means[start] > means[end]) {
swap(means, start, end);
swap_l(weights, start, end);
}
return;
}
// generating a random number as a pivot was very expensive vs the array size
// const unsigned int pivot_idx = start + rand()%(end - start + 1);
const unsigned int pivot_idx = (end + start) / 2; // central pivot
const unsigned int new_pivot_idx = partition(means, weights, start, end, pivot_idx);
if (new_pivot_idx > start) {
td_qsort(means, weights, start, new_pivot_idx - 1);
}
td_qsort(means, weights, new_pivot_idx + 1, end);
}
}
static inline size_t cap_from_compression(double compression) {
if ((size_t)compression > ((SIZE_MAX / sizeof(double) / 6) - 10)) {
return 0;
}
return (6 * (size_t)(compression)) + 10;
}
static inline bool should_td_compress(td_histogram_t *h) {
return ((h->merged_nodes + h->unmerged_nodes) >= (h->cap - 1));
}
static inline int next_node(td_histogram_t *h) { return h->merged_nodes + h->unmerged_nodes; }
int td_compress(td_histogram_t *h);
static inline int _check_overflow(const double v) {
// double-precision overflow detected on h->unmerged_weight
if (v == INFINITY) {
return EDOM;
}
return 0;
}
static inline int _check_td_overflow(const double new_unmerged_weight,
const double new_total_weight) {
// double-precision overflow detected on h->unmerged_weight
if (new_unmerged_weight == INFINITY) {
return EDOM;
}
if (new_total_weight == INFINITY) {
return EDOM;
}
const double denom = 2 * MM_PI * new_total_weight * log(new_total_weight);
if (denom == INFINITY) {
return EDOM;
}
return 0;
}
int td_centroid_count(td_histogram_t *h) { return next_node(h); }
void td_reset(td_histogram_t *h) {
if (!h) {
return;
}
h->min = __DBL_MAX__;
h->max = -h->min;
h->merged_nodes = 0;
h->merged_weight = 0;
h->unmerged_nodes = 0;
h->unmerged_weight = 0;
h->total_compressions = 0;
}
int td_init(double compression, td_histogram_t **result) {
const size_t capacity = cap_from_compression(compression);
if (capacity < 1) {
return 1;
}
td_histogram_t *histogram;
histogram = (td_histogram_t *)__td_malloc(sizeof(td_histogram_t));
if (!histogram) {
return 1;
}
histogram->cap = capacity;
histogram->compression = (double)compression;
td_reset(histogram);
histogram->nodes_mean = (double *)__td_calloc(capacity, sizeof(double));
if (!histogram->nodes_mean) {
td_free(histogram);
return 1;
}
histogram->nodes_weight = (long long *)__td_calloc(capacity, sizeof(long long));
if (!histogram->nodes_weight) {
td_free(histogram);
return 1;
}
*result = histogram;
return 0;
}
td_histogram_t *td_new(double compression) {
td_histogram_t *mdigest = NULL;
td_init(compression, &mdigest);
return mdigest;
}
void td_free(td_histogram_t *histogram) {
if (histogram->nodes_mean) {
__td_free((void *)(histogram->nodes_mean));
}
if (histogram->nodes_weight) {
__td_free((void *)(histogram->nodes_weight));
}
__td_free((void *)(histogram));
}
int td_merge(td_histogram_t *into, td_histogram_t *from) {
if (td_compress(into) != 0)
return EDOM;
if (td_compress(from) != 0)
return EDOM;
const int pos = from->merged_nodes + from->unmerged_nodes;
for (int i = 0; i < pos; i++) {
const double mean = from->nodes_mean[i];
const long long weight = from->nodes_weight[i];
if (td_add(into, mean, weight) != 0) {
return EDOM;
}
}
return 0;
}
long long td_size(td_histogram_t *h) { return h->merged_weight + h->unmerged_weight; }
double td_cdf(td_histogram_t *h, double val) {
td_compress(h);
// no data to examine
if (h->merged_nodes == 0) {
return NAN;
}
// bellow lower bound
if (val < h->min) {
return 0;
}
// above upper bound
if (val > h->max) {
return 1;
}
if (h->merged_nodes == 1) {
// exactly one centroid, should have max==min
const double width = h->max - h->min;
if (val - h->min <= width) {
// min and max are too close together to do any viable interpolation
return 0.5;
} else {
// interpolate if somehow we have weight > 0 and max != min
return (val - h->min) / width;
}
}
const int n = h->merged_nodes;
// check for the left tail
const double left_centroid_mean = h->nodes_mean[0];
const double left_centroid_weight = (double)h->nodes_weight[0];
const double merged_weight_d = (double)h->merged_weight;
if (val < left_centroid_mean) {
// note that this is different than h->nodes_mean[0] > min
// ... this guarantees we divide by non-zero number and interpolation works
const double width = left_centroid_mean - h->min;
if (width > 0) {
// must be a sample exactly at min
if (val == h->min) {
return 0.5 / merged_weight_d;
} else {
return (1 + (val - h->min) / width * (left_centroid_weight / 2 - 1)) /
merged_weight_d;
}
} else {
// this should be redundant with the check val < h->min
return 0;
}
}
// and the right tail
const double right_centroid_mean = h->nodes_mean[n - 1];
const double right_centroid_weight = (double)h->nodes_weight[n - 1];
if (val > right_centroid_mean) {
const double width = h->max - right_centroid_mean;
if (width > 0) {
if (val == h->max) {
return 1 - 0.5 / merged_weight_d;
} else {
// there has to be a single sample exactly at max
const double dq = (1 + (h->max - val) / width * (right_centroid_weight / 2 - 1)) /
merged_weight_d;
return 1 - dq;
}
} else {
return 1;
}
}
// we know that there are at least two centroids and mean[0] < x < mean[n-1]
// that means that there are either one or more consecutive centroids all at exactly x
// or there are consecutive centroids, c0 < x < c1
double weightSoFar = 0;
for (int it = 0; it < n - 1; it++) {
// weightSoFar does not include weight[it] yet
if (h->nodes_mean[it] == val) {
// we have one or more centroids == x, treat them as one
// dw will accumulate the weight of all of the centroids at x
double dw = 0;
while (it < n && h->nodes_mean[it] == val) {
dw += (double)h->nodes_weight[it];
it++;
}
return (weightSoFar + dw / 2) / (double)h->merged_weight;
} else if (h->nodes_mean[it] <= val && val < h->nodes_mean[it + 1]) {
const double node_weight = (double)h->nodes_weight[it];
const double node_weight_next = (double)h->nodes_weight[it + 1];
const double node_mean = h->nodes_mean[it];
const double node_mean_next = h->nodes_mean[it + 1];
// landed between centroids ... check for floating point madness
if (node_mean_next - node_mean > 0) {
// note how we handle singleton centroids here
// the point is that for singleton centroids, we know that their entire
// weight is exactly at the centroid and thus shouldn't be involved in
// interpolation
double leftExcludedW = 0;
double rightExcludedW = 0;
if (node_weight == 1) {
if (node_weight_next == 1) {
// two singletons means no interpolation
// left singleton is in, right is out
return (weightSoFar + 1) / merged_weight_d;
} else {
leftExcludedW = 0.5;
}
} else if (node_weight_next == 1) {
rightExcludedW = 0.5;
}
double dw = (node_weight + node_weight_next) / 2;
// adjust endpoints for any singleton
double dwNoSingleton = dw - leftExcludedW - rightExcludedW;
double base = weightSoFar + node_weight / 2 + leftExcludedW;
return (base + dwNoSingleton * (val - node_mean) / (node_mean_next - node_mean)) /
merged_weight_d;
} else {
// this is simply caution against floating point madness
// it is conceivable that the centroids will be different
// but too near to allow safe interpolation
double dw = (node_weight + node_weight_next) / 2;
return (weightSoFar + dw) / merged_weight_d;
}
} else {
weightSoFar += (double)h->nodes_weight[it];
}
}
return 1 - 0.5 / merged_weight_d;
}
static double td_internal_iterate_centroids_to_index(const td_histogram_t *h, const double index,
const double left_centroid_weight,
const int total_centroids, double *weightSoFar,
int *node_pos) {
if (left_centroid_weight > 1 && index < left_centroid_weight / 2) {
// there is a single sample at min so we interpolate with less weight
return h->min + (index - 1) / (left_centroid_weight / 2 - 1) * (h->nodes_mean[0] - h->min);
}
// usually the last centroid will have unit weight so this test will make it moot
if (index > h->merged_weight - 1) {
return h->max;
}
// if the right-most centroid has more than one sample, we still know
// that one sample occurred at max so we can do some interpolation
const double right_centroid_weight = (double)h->nodes_weight[total_centroids - 1];
const double right_centroid_mean = h->nodes_mean[total_centroids - 1];
if (right_centroid_weight > 1 &&
(double)h->merged_weight - index <= right_centroid_weight / 2) {
return h->max - ((double)h->merged_weight - index - 1) / (right_centroid_weight / 2 - 1) *
(h->max - right_centroid_mean);
}
for (; *node_pos < total_centroids - 1; (*node_pos)++) {
const int i = *node_pos;
const double node_weight = (double)h->nodes_weight[i];
const double node_weight_next = (double)h->nodes_weight[i + 1];
const double node_mean = h->nodes_mean[i];
const double node_mean_next = h->nodes_mean[i + 1];
const double dw = (node_weight + node_weight_next) / 2;
if (*weightSoFar + dw > index) {
// centroids i and i+1 bracket our current point
// check for unit weight
double leftUnit = 0;
if (node_weight == 1) {
if (index - *weightSoFar < 0.5) {
// within the singleton's sphere
return node_mean;
} else {
leftUnit = 0.5;
}
}
double rightUnit = 0;
if (node_weight_next == 1) {
if (*weightSoFar + dw - index <= 0.5) {
// no interpolation needed near singleton
return node_mean_next;
}
rightUnit = 0.5;
}
const double z1 = index - *weightSoFar - leftUnit;
const double z2 = *weightSoFar + dw - index - rightUnit;
return weighted_average(node_mean, z2, node_mean_next, z1);
}
*weightSoFar += dw;
}
// weightSoFar = totalWeight - weight[total_centroids-1]/2 (very nearly)
// so we interpolate out to max value ever seen
const double z1 = index - h->merged_weight - right_centroid_weight / 2.0;
const double z2 = right_centroid_weight / 2 - z1;
return weighted_average(right_centroid_mean, z1, h->max, z2);
}
double td_quantile(td_histogram_t *h, double q) {
td_compress(h);
// q should be in [0,1]
if (q < 0.0 || q > 1.0 || h->merged_nodes == 0) {
return NAN;
}
// with one data point, all quantiles lead to Rome
if (h->merged_nodes == 1) {
return h->nodes_mean[0];
}
// if values were stored in a sorted array, index would be the offset we are interested in
const double index = q * (double)h->merged_weight;
// beyond the boundaries, we return min or max
// usually, the first centroid will have unit weight so this will make it moot
if (index < 1) {
return h->min;
}
// we know that there are at least two centroids now
const int n = h->merged_nodes;
// if the left centroid has more than one sample, we still know
// that one sample occurred at min so we can do some interpolation
const double left_centroid_weight = (double)h->nodes_weight[0];
// in between extremes we interpolate between centroids
double weightSoFar = left_centroid_weight / 2;
int i = 0;
return td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n, &weightSoFar,
&i);
}
int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length) {
td_compress(h);
if (NULL == quantiles || NULL == values) {
return EINVAL;
}
const int n = h->merged_nodes;
if (n == 0) {
for (size_t i = 0; i < length; i++) {
values[i] = NAN;
}
return 0;
}
if (n == 1) {
for (size_t i = 0; i < length; i++) {
const double requested_quantile = quantiles[i];
// q should be in [0,1]
if (requested_quantile < 0.0 || requested_quantile > 1.0) {
values[i] = NAN;
} else {
// with one data point, all quantiles lead to Rome
values[i] = h->nodes_mean[0];
}
}
return 0;
}
// we know that there are at least two centroids now
// if the left centroid has more than one sample, we still know
// that one sample occurred at min so we can do some interpolation
const double left_centroid_weight = (double)h->nodes_weight[0];
// in between extremes we interpolate between centroids
double weightSoFar = left_centroid_weight / 2;
int node_pos = 0;
// to avoid allocations we use the values array for intermediate computation
// i.e. to store the expected cumulative count at each percentile
for (size_t qpos = 0; qpos < length; qpos++) {
const double index = quantiles[qpos] * (double)h->merged_weight;
values[qpos] = td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n,
&weightSoFar, &node_pos);
}
return 0;
}
static double td_internal_trimmed_mean(const td_histogram_t *h, const double leftmost_weight,
const double rightmost_weight) {
double count_done = 0;
double trimmed_sum = 0;
double trimmed_count = 0;
for (int i = 0; i < h->merged_nodes; i++) {
const double n_weight = (double)h->nodes_weight[i];
// Assume the whole centroid falls into the range
double count_add = n_weight;
// If we haven't reached the low threshold yet, skip appropriate part of the centroid.
count_add -= __td_min(__td_max(0, leftmost_weight - count_done), count_add);
// If we have reached the upper threshold, ignore the overflowing part of the centroid.
count_add = __td_min(__td_max(0, rightmost_weight - count_done), count_add);
// consider the whole centroid processed
count_done += n_weight;
// increment the sum / count
trimmed_sum += h->nodes_mean[i] * count_add;
trimmed_count += count_add;
// break once we cross the high threshold
if (count_done >= rightmost_weight)
break;
}
return trimmed_sum / trimmed_count;
}
double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut) {
td_compress(h);
// proportion_to_cut should be in [0,1]
if (h->merged_nodes == 0 || proportion_to_cut < 0.0 || proportion_to_cut > 1.0) {
return NAN;
}
// with one data point, all values lead to Rome
if (h->merged_nodes == 1) {
return h->nodes_mean[0];
}
/* translate the percentiles to counts */
const double leftmost_weight = floor((double)h->merged_weight * proportion_to_cut);
const double rightmost_weight = ceil((double)h->merged_weight * (1.0 - proportion_to_cut));
return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight);
}
double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut) {
td_compress(h);
// leftmost_cut and rightmost_cut should be in [0,1]
if (h->merged_nodes == 0 || leftmost_cut < 0.0 || leftmost_cut > 1.0 || rightmost_cut < 0.0 ||
rightmost_cut > 1.0) {
return NAN;
}
// with one data point, all values lead to Rome
if (h->merged_nodes == 1) {
return h->nodes_mean[0];
}
/* translate the percentiles to counts */
const double leftmost_weight = floor((double)h->merged_weight * leftmost_cut);
const double rightmost_weight = ceil((double)h->merged_weight * rightmost_cut);
return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight);
}
int td_add(td_histogram_t *h, double mean, long long weight) {
if (should_td_compress(h)) {
const int overflow_res = td_compress(h);
if (overflow_res != 0)
return overflow_res;
}
const int pos = next_node(h);
if (pos >= h->cap)
return EDOM;
if (_tdigest_long_long_add_safe(h->unmerged_weight, weight) == false)
return EDOM;
const long long new_unmerged_weight = h->unmerged_weight + weight;
if (_tdigest_long_long_add_safe(new_unmerged_weight, h->merged_weight) == false)
return EDOM;
const long long new_total_weight = new_unmerged_weight + h->merged_weight;
// double-precision overflow detected
const int overflow_res =
_check_td_overflow((double)new_unmerged_weight, (double)new_total_weight);
if (overflow_res != 0)
return overflow_res;
if (mean < h->min) {
h->min = mean;
}
if (mean > h->max) {
h->max = mean;
}
h->nodes_mean[pos] = mean;
h->nodes_weight[pos] = weight;
h->unmerged_nodes++;
h->unmerged_weight = new_unmerged_weight;
return 0;
}
int td_compress(td_histogram_t *h) {
if (h->unmerged_nodes == 0) {
return 0;
}
int N = h->merged_nodes + h->unmerged_nodes;
td_qsort(h->nodes_mean, h->nodes_weight, 0, N - 1);
const double total_weight = (double)h->merged_weight + (double)h->unmerged_weight;
// double-precision overflow detected
const int overflow_res = _check_td_overflow((double)h->unmerged_weight, (double)total_weight);
if (overflow_res != 0)
return overflow_res;
if (total_weight <= 1)
return 0;
const double denom = 2 * MM_PI * total_weight * log(total_weight);
if (_check_overflow(denom) != 0)
return EDOM;
// Compute the normalizer given compression and number of points.
const double normalizer = h->compression / denom;
if (_check_overflow(normalizer) != 0)
return EDOM;
int cur = 0;
double weight_so_far = 0;
for (int i = 1; i < N; i++) {
const double proposed_weight = (double)h->nodes_weight[cur] + (double)h->nodes_weight[i];
const double z = proposed_weight * normalizer;
// quantile up to cur
const double q0 = weight_so_far / total_weight;
// quantile up to cur + i
const double q2 = (weight_so_far + proposed_weight) / total_weight;
// Convert a quantile to the k-scale
const bool should_add = (z <= (q0 * (1 - q0))) && (z <= (q2 * (1 - q2)));
// next point will fit
// so merge into existing centroid
if (should_add) {
h->nodes_weight[cur] += h->nodes_weight[i];
const double delta = h->nodes_mean[i] - h->nodes_mean[cur];
const double weighted_delta = (delta * h->nodes_weight[i]) / h->nodes_weight[cur];
h->nodes_mean[cur] += weighted_delta;
} else {
weight_so_far += h->nodes_weight[cur];
cur++;
h->nodes_weight[cur] = h->nodes_weight[i];
h->nodes_mean[cur] = h->nodes_mean[i];
}
if (cur != i) {
h->nodes_weight[i] = 0;
h->nodes_mean[i] = 0.0;
}
}
h->merged_nodes = cur + 1;
h->merged_weight = total_weight;
h->unmerged_nodes = 0;
h->unmerged_weight = 0;
h->total_compressions++;
return 0;
}
double td_min(td_histogram_t *h) { return h->min; }
double td_max(td_histogram_t *h) { return h->max; }
int td_compression(td_histogram_t *h) { return h->compression; }
const long long *td_centroids_weight(td_histogram_t *h) { return h->nodes_weight; }
const double *td_centroids_mean(td_histogram_t *h) { return h->nodes_mean; }
long long td_centroids_weight_at(td_histogram_t *h, int pos) { return h->nodes_weight[pos]; }
double td_centroids_mean_at(td_histogram_t *h, int pos) {
if (pos < 0 || pos > h->merged_nodes) {
return NAN;
}
return h->nodes_mean[pos];
}

258
tests/tdigest.h Normal file
View file

@ -0,0 +1,258 @@
#pragma once
#include <stdlib.h>
/**
* Adaptive histogram based on something like streaming k-means crossed with Q-digest.
* The implementation is a direct descendent of MergingDigest
* https://github.com/tdunning/t-digest/
*
* Copyright (c) 2021 Redis, All rights reserved.
* Copyright (c) 2018 Andrew Werner, All rights reserved.
*
* The special characteristics of this algorithm are:
*
* - smaller summaries than Q-digest
*
* - provides part per million accuracy for extreme quantiles and typically &lt;1000 ppm accuracy
* for middle quantiles
*
* - fast
*
* - simple
*
* - easy to adapt for use with map-reduce
*/
#define MM_PI 3.14159265358979323846
struct td_histogram {
// compression is a setting used to configure the size of centroids when merged.
double compression;
double min;
double max;
// cap is the total size of nodes
int cap;
// merged_nodes is the number of merged nodes at the front of nodes.
int merged_nodes;
// unmerged_nodes is the number of buffered nodes.
int unmerged_nodes;
// we run the merge in reverse every other merge to avoid left-to-right bias in merging
long long total_compressions;
long long merged_weight;
long long unmerged_weight;
double *nodes_mean;
long long *nodes_weight;
};
typedef struct td_histogram td_histogram_t;
#ifdef __cplusplus
extern "C" {
#endif
/**
* Allocate the memory, initialise the t-digest, and return the histogram as output parameter.
* @param compression The compression parameter.
* 100 is a common value for normal uses.
* 1000 is extremely large.
* The number of centroids retained will be a smallish (usually less than 10) multiple of this
* number.
* @return the histogram on success, NULL if allocation failed.
*/
td_histogram_t *td_new(double compression);
/**
* Allocate the memory and initialise the t-digest.
*
* @param compression The compression parameter.
* 100 is a common value for normal uses.
* 1000 is extremely large.
* The number of centroids retained will be a smallish (usually less than 10) multiple of this
* number.
* @param result Output parameter to capture allocated histogram.
* @return 0 on success, 1 if allocation failed.
*/
int td_init(double compression, td_histogram_t **result);
/**
* Frees the memory associated with the t-digest.
*
* @param h The histogram you want to free.
*/
void td_free(td_histogram_t *h);
/**
* Reset a histogram to zero - empty out a histogram and re-initialise it
*
* If you want to re-use an existing histogram, but reset everything back to zero, this
* is the routine to use.
*
* @param h The histogram you want to reset to empty.
*
*/
void td_reset(td_histogram_t *h);
/**
* Adds a sample to a histogram.
*
* @param val The value to add.
* @param weight The weight of this point.
* @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided
* weight.
*
*/
int td_add(td_histogram_t *h, double val, long long weight);
/**
* Re-examines a t-digest to determine whether some centroids are redundant. If your data are
* perversely ordered, this may be a good idea. Even if not, this may save 20% or so in space.
*
* The cost is roughly the same as adding as many data points as there are centroids. This
* is typically &lt; 10 * compression, but could be as high as 100 * compression.
* This is a destructive operation that is not thread-safe.
*
* @param h The histogram you want to compress.
* @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided
* weight. If overflow is detected the histogram is not changed.
*
*/
int td_compress(td_histogram_t *h);
/**
* Merges all of the values from 'from' to 'this' histogram.
*
* @param h "This" pointer
* @param from Histogram to copy values from.
* * @return 0 on success, EDOM if overflow was detected as a consequence of merging the the
* provided histogram. If overflow is detected the original histogram is not detected.
*/
int td_merge(td_histogram_t *h, td_histogram_t *from);
/**
* Returns the fraction of all points added which are &le; x.
*
* @param x The cutoff for the cdf.
* @return The fraction of all data which is less or equal to x.
*/
double td_cdf(td_histogram_t *h, double x);
/**
* Returns an estimate of the cutoff such that a specified fraction of the data
* added to this TDigest would be less than or equal to the cutoff.
*
* @param q The desired fraction
* @return The value x such that cdf(x) == q;
*/
double td_quantile(td_histogram_t *h, double q);
/**
* Returns an estimate of the cutoff such that a specified fraction of the data
* added to this TDigest would be less than or equal to the cutoffs.
*
* @param quantiles The ordered percentiles array to get the values for.
* @param values Destination array containing the values at the given quantiles.
* The values array should be allocated by the caller.
* @return 0 on success, ENOMEM if the provided destination array is null.
*/
int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length);
/**
* Returns the trimmed mean ignoring values outside given cutoff upper and lower limits.
*
* @param leftmost_cut Fraction to cut off of the left tail of the distribution.
* @param rightmost_cut Fraction to cut off of the right tail of the distribution.
* @return The trimmed mean ignoring values outside given cutoff upper and lower limits;
*/
double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut);
/**
* Returns the trimmed mean ignoring values outside given a symmetric cutoff limits.
*
* @param proportion_to_cut Fraction to cut off of the left and right tails of the distribution.
* @return The trimmed mean ignoring values outside given cutoff upper and lower limits;
*/
double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut);
/**
* Returns the current compression factor.
*
* @return The compression factor originally used to set up the TDigest.
*/
int td_compression(td_histogram_t *h);
/**
* Returns the number of points that have been added to this TDigest.
*
* @return The sum of the weights on all centroids.
*/
long long td_size(td_histogram_t *h);
/**
* Returns the number of centroids being used by this TDigest.
*
* @return The number of centroids being used.
*/
int td_centroid_count(td_histogram_t *h);
/**
* Get minimum value from the histogram. Will return __DBL_MAX__ if the histogram
* is empty.
*
* @param h "This" pointer
*/
double td_min(td_histogram_t *h);
/**
* Get maximum value from the histogram. Will return - __DBL_MAX__ if the histogram
* is empty.
*
* @param h "This" pointer
*/
double td_max(td_histogram_t *h);
/**
* Get the full centroids weight array for 'this' histogram.
*
* @param h "This" pointer
*
* @return The full centroids weight array.
*/
const long long *td_centroids_weight(td_histogram_t *h);
/**
* Get the full centroids mean array for 'this' histogram.
*
* @param h "This" pointer
*
* @return The full centroids mean array.
*/
const double *td_centroids_mean(td_histogram_t *h);
/**
* Get the centroid weight for 'this' histogram and 'pos'.
*
* @param h "This" pointer
* @param pos centroid position.
*
* @return The centroid weight.
*/
long long td_centroids_weight_at(td_histogram_t *h, int pos);
/**
* Get the centroid mean for 'this' histogram and 'pos'.
*
* @param h "This" pointer
* @param pos centroid position.
*
* @return The centroid mean.
*/
double td_centroids_mean_at(td_histogram_t *h, int pos);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large Load diff