cleanup

parent 5d5c7f1584
commit 6d2e355e19

12 changed files with 1409 additions and 317 deletions
Makefile | 7

@@ -12,7 +12,7 @@ CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DSPARSEMAP_ASSERT -Wall -Wextra -Wpedantic -Og
 TEST_FLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -Itests/ -fPIC
 
 TESTS = tests/test
-TEST_OBJS = tests/test.o tests/munit.o tests/common.o
+TEST_OBJS = tests/test.o tests/munit.o tests/tdigest.o tests/common.o
 EXAMPLES = examples/ex_1 examples/ex_2 examples/ex_3 examples/ex_4
 
 .PHONY: all shared static clean test examples mls
@@ -39,7 +39,7 @@ check: test
 	env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/test
 
 tests/test: $(TEST_OBJS) $(STATIC_LIB)
-	$(CC) $^ -o $@ $(TEST_FLAGS)
+	$(CC) $^ -lm -o $@ $(TEST_FLAGS)
 
 clean:
 	rm -f $(OBJS)
@@ -76,4 +76,7 @@ examples/ex_3: examples/common.o examples/ex_3.o $(STATIC_LIB)
 examples/ex_4: examples/common.o examples/ex_4.o $(STATIC_LIB)
 	$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS)
 
+todo:
+	rg -i 'todo|gsb'
+
 # cp src/sparsemap.c /tmp && clang-tidy src/sparsemap.c -fix -fix-errors -checks="readability-braces-around-statements" -- -DDEBUG -DSPARSEMAP_DIAGNOSTIC -DSPARSEMAP_ASSERT -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
@@ -1,4 +1,7 @@
 #include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
 #include <stdio.h>
 
 #include "../include/sparsemap.h"
@@ -23,7 +26,9 @@ int
 main()
 {
   size_t size = 4;
-  setbuf(stderr, 0); // disable buffering
+  setvbuf(stdout, NULL, _IONBF, 0); // Disable buffering for stdout
+  setvbuf(stderr, NULL, _IONBF, 0); // Disable buffering for stdout
+
   __diag("Please wait a moment...");
   sparsemap_t mmap, *map = &mmap;
   uint8_t buffer[1024];
@@ -135,7 +140,7 @@ main()
     sparsemap_set(map, i, true);
   }
   for (int i = 0; i < 100000; i++) {
-    assert(sparsemap_select(map, i) == (unsigned)i);
+    assert(sparsemap_select(map, i, true) == (unsigned)i);
   }
 
   sparsemap_clear(map);
@@ -145,7 +150,7 @@ main()
     sparsemap_set(map, i, true);
   }
   for (int i = 1; i < 513; i++) {
-    assert(sparsemap_select(map, i - 1) == (unsigned)i);
+    assert(sparsemap_select(map, i - 1, true) == (unsigned)i);
   }
 
   sparsemap_clear(map);
@@ -155,7 +160,7 @@ main()
     sparsemap_set(map, i * 10, true);
   }
   for (size_t i = 0; i < 8; i++) {
-    assert(sparsemap_select(map, i) == i * 10);
+    assert(sparsemap_select(map, i, true) == (sm_loc_t)i * 10);
   }
 
   // split and move, aligned to MiniMap capacity
@@ -1,9 +1,7 @@
 #include <assert.h>
-#include <stdarg.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
-#include <unistd.h>
 
 #include "../include/sparsemap.h"
 
@@ -16,29 +14,28 @@
 } while (0)
 #pragma GCC diagnostic pop
 
-#define SEED
-
 int
 main(void)
 {
-  int i = 0;
+  int i;
 
   // disable buffering
-  setbuf(stderr, 0);
+  setvbuf(stdout, NULL, _IONBF, 0); // Disable buffering for stdout
+  setvbuf(stderr, NULL, _IONBF, 0); // Disable buffering for stdout
 
   // start with a 1KiB buffer, 1024 bits
   uint8_t *buf = calloc(1024, sizeof(uint8_t));
 
   // create the sparse bitmap
-  sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 1024);
+  sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 1024);
 
   // Set every other bit (pathologically worst case) to see what happens
   // when the map is full.
   for (i = 0; i < 7744; i++) {
-    if (i % 2)
-      continue;
-    sparsemap_set(map, i, true);
-    assert(sparsemap_is_set(map, i) == true);
+    if (!i % 2) {
+      sparsemap_set(map, i, true);
+      assert(sparsemap_is_set(map, i) == true);
+    }
   }
   // On 1024 KiB of buffer with every other bit set the map holds 7744 bits
   // and then runs out of space. This next _set() call will fail/abort.
@@ -1,9 +1,7 @@
 #include <assert.h>
-#include <stdarg.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
-#include <unistd.h>
 
 #include "../include/sparsemap.h"
 #include "../tests/common.h"
@@ -11,7 +9,7 @@
 int
 main(void)
 {
-  int i = 0;
+  int i;
   int array[1024] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
     38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
     77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
@@ -60,7 +58,7 @@ main(void)
   uint8_t *buf = calloc(1024, sizeof(uint8_t));
 
   // create the sparse bitmap
-  sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 1024);
+  sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 1024);
 
   // set all the bits on in a random order
   for (i = 0; i < 1024; i++) {
@@ -24,7 +24,7 @@ main(void)
   uint8_t *buf = calloc((size_t)3 * 1024, sizeof(uint8_t));
 
   // create the sparse bitmap
-  sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 3 * 1024);
+  sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 3 * 1024);
 
   // create an array of ints
   setup_test_array(array, TEST_ARRAY_SIZE, 1024 * 3);
@@ -60,7 +60,7 @@ main(void)
     assert(sparsemap_is_set(map, array[i]) == true);
   }
   has_span(map, array, TEST_ARRAY_SIZE, (int)len);
-  size_t l = sparsemap_span(map, 0, len);
+  size_t l = sparsemap_span(map, 0, len, true);
   if (l != (size_t)-1) {
     __diag("Found span in map starting at %lu of length %lu\n", l, len);
     __diag("is_span(%lu, %lu) == %s\n", l, len, is_span(array, TEST_ARRAY_SIZE, l, len) ? "yes" : "no");
@@ -69,14 +69,14 @@
 #ifndef SPARSEMAP_H
 #define SPARSEMAP_H
 
-#include <sys/types.h>
 
-#include <assert.h>
 #include <limits.h>
 #include <stdbool.h>
+#include <stddef.h>
 #include <stdint.h>
-#include <stdio.h>
-#include <string.h>
+#if defined(__cplusplus)
+extern "C" {
+#endif
 
 /*
  * The public interface for a sparse bit-mapped index, a "sparse map".
@@ -88,55 +88,114 @@
  */
 
 typedef struct sparsemap sparsemap_t;
+typedef long sm_loc_t;
+#define SM_LOC_MAX LONG_MAX
+#define SM_LOC_MIN LONG_MIN
 typedef uint32_t sm_idx_t;
 typedef uint64_t sm_bitvec_t;
 
-/* Allocate on a sparsemap_t on the heap and initialize it. */
-sparsemap_t *sparsemap(uint8_t *data, size_t size);
+/**
+ * Create a new, empty sparsemap_t with a buffer of |size|.
+ * Default when set to 0 is 1024.
+ */
+sparsemap_t *sparsemap(size_t size);
 
-/* Initialize sparsemap_t with data. */
+/**
+ * Allocate on a sparsemap_t on the heap to wrap the provided fixed-size
+ * buffer (heap or stack allocated).
+ */
+sparsemap_t *sparsemap_wrap(uint8_t *data, size_t size);
+
+/**
+ * Initialize a (possibly stack allocated) sparsemap_t with data (potentially
+ * also on the stack).
+ */
 void sparsemap_init(sparsemap_t *map, uint8_t *data, size_t size);
 
-/* Clears the whole buffer. */
-void sparsemap_clear(sparsemap_t *map);
-/* Opens an existing sparsemap at the specified buffer. */
+/**
+ * Opens an existing sparsemap contained within the specified buffer.
+ */
 void sparsemap_open(sparsemap_t *, uint8_t *data, size_t data_size);
 
-/* Resizes the data range. */
+/**
+ * Resets values and empties the buffer making it ready to accept new data.
+ */
+void sparsemap_clear(sparsemap_t *map);
+
+/**
+ * Resizes the data range within the limits of the provided buffer.
+ */
 void sparsemap_set_data_size(sparsemap_t *map, size_t data_size);
 
-/* Calculate remaining capacity, full when 0. */
+/**
+ * Calculate remaining capacity, approaches 0 when full.
+ */
 double sparsemap_capacity_remaining(sparsemap_t *map);
 
-/* Returns the size of the underlying byte array. */
+/**
+ * Returns the capacity of the underlying byte array.
+ */
 size_t sparsemap_get_capacity(sparsemap_t *map);
 
-/* Returns the value of a bit at index |idx|. */
-bool sparsemap_is_set(sparsemap_t *map, size_t idx);
+/**
+ * Returns the value of a bit at index |idx|, either on/true/1 or off/false/0.
+ * When |idx| is negative it is an error.
+ */
+bool sparsemap_is_set(sparsemap_t *map, sm_loc_t idx);
 
-/* Sets the bit at index |idx| to true or false, depending on |value|. */
-void sparsemap_set(sparsemap_t *map, size_t idx, bool value);
+/**
+ * Sets the bit at index |idx| to true or false, depending on |value|.
+ * When |idx| is negative is it an error.
+ */
+void sparsemap_set(sparsemap_t *map, sm_loc_t idx, bool value);
 
-/* Returns the offset of the very first bit. */
-sm_idx_t sparsemap_get_start_offset(sparsemap_t *map);
+/**
+ * Returns the offset of the very first/last bit in the map.
+ */
+sm_idx_t sparsemap_get_starting_offset(sparsemap_t *map);
 
-/* Returns the used size in the data buffer. */
+/**
+ * Returns the used size in the data buffer in bytes.
+ */
 size_t sparsemap_get_size(sparsemap_t *map);
 
-/* Decompresses the whole bitmap; calls scanner for all bits. */
-void sparsemap_scan(sparsemap_t *map, void (*scanner)(sm_idx_t[], size_t), size_t skip);
+/**
+ * Decompresses the whole bitmap; calls scanner for all bits with a set of
+ * |n| vectors |vec| each a sm_bitmap_t which can be masked and read using
+ * bit operators to read the values for each position in the bitmap index.
+ * Setting |skip| will start the scan after "skip" bits.
+ */
+void sparsemap_scan(sparsemap_t *map, void (*scanner)(sm_idx_t vec[], size_t n), size_t skip);
 
-/* Appends all chunk maps from |map| starting at |sstart| to |other|, then
-   reduces the chunk map-count appropriately. */
-void sparsemap_split(sparsemap_t *map, size_t sstart, sparsemap_t *other);
+/**
+ * Appends all chunk maps from |map| starting at |offset| to |other|, then
+ * reduces the chunk map-count appropriately.
+ */
+void sparsemap_split(sparsemap_t *map, sm_loc_t offset, sparsemap_t *other);
 
-/* Returns the index of the n'th set bit; uses a 0-based index. */
-size_t sparsemap_select(sparsemap_t *map, size_t n);
+/**
+ * Finds the offset of the n'th bit either set (|value| is true) or unset
+ * (|value| is false) from the start (positive |n|), or end (negative |n|),
+ * of the bitmap and returns that (uses a 0-based index). Returns -inf or +inf
+ * if not found (where "inf" is SM_LOC_MAX and "-inf" is SM_LOC_MIN).
+ */
+sm_loc_t sparsemap_select(sparsemap_t *map, sm_loc_t n, bool value);
 
-/* Counts the set bits in the range [offset, idx]. */
-size_t sparsemap_rank(sparsemap_t *map, size_t offset, size_t idx);
+/**
+ * Counts the set (|value| is true) or unset (|value| is false) bits starting
+ * at |x| bits (0-based) in the range [x, y] (inclusive on either end).
+ */
+size_t sparsemap_rank(sparsemap_t *map, size_t x, size_t y, bool value);
 
-size_t sparsemap_span(sparsemap_t *map, size_t loc, size_t len);
+/**
+ * Finds the first span (i.e. a contiguous set of bits), in the bitmap that
+ * are set (|value| is true) or unset (|value| is false) and returns the
+ * starting offset for the span (0-based).
+ */
+size_t sparsemap_span(sparsemap_t *map, sm_loc_t idx, size_t len, bool value);
 
+#if defined(__cplusplus)
+}
 #endif
 
+#endif /* !defined(SPARSEMAP_H) */
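To make the revised public API easier to read, here is a small usage sketch of the three construction paths the header now documents: a fully heap-owned map, a wrapped caller-owned buffer, and a stack-initialized map. This is illustrative only and is not code from this commit; releasing the heap handles with free() is an assumption based on the calloc() calls in src/sparsemap.c below.

#include <stdlib.h>
#include "../include/sparsemap.h"

int main(void) {
  /* 1. Heap-owned: the library allocates its own 1 KiB buffer internally. */
  sparsemap_t *a = sparsemap(1024);

  /* 2. Wrap a caller-owned buffer; only the handle is heap allocated. */
  uint8_t buf[1024] = { 0 };
  sparsemap_t *b = sparsemap_wrap(buf, sizeof(buf));

  /* 3. Fully caller-owned: the map struct and its buffer both live on the stack. */
  sparsemap_t c;
  uint8_t data[512] = { 0 };
  sparsemap_init(&c, data, sizeof(data));

  sparsemap_set(a, 7, true);
  sparsemap_set(b, 7, true);
  sparsemap_set(&c, 7, true);

  free(a); /* assumed: both handles come from a single calloc() */
  free(b);
  return 0;
}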
src/sparsemap.c | 269

@@ -20,12 +20,18 @@
  * SOFTWARE.
  */
 
+#include <sys/types.h>
+
 #include <assert.h>
 #include <popcount.h>
 #include <sparsemap.h>
 #include <stdbool.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <stdlib.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <string.h>
 
 #ifdef SPARSEMAP_DIAGNOSTIC
 #pragma GCC diagnostic push
@@ -83,7 +89,7 @@ enum __SM_CHUNK_INFO {
   /* sm_bitvec_t is not used (2#01) */
   SM_PAYLOAD_NONE = 1,
 
-  /* a mask for checking flags (2 bits) */
+  /* a mask for checking flags (2 bits, 2#11) */
   SM_FLAG_MASK = 3,
 
   /* return code for set(): ok, no further action required */
@@ -388,10 +394,11 @@ __sm_chunk_map_set(__sm_chunk_t *map, size_t idx, bool value, size_t *pos, sm_bi
  * value of |n|.
  */
 static size_t
-__sm_chunk_map_select(__sm_chunk_t *map, size_t n, ssize_t *pnew_n)
+__sm_chunk_map_select(__sm_chunk_t *map, size_t n, ssize_t *pnew_n, bool value)
 {
   size_t ret = 0;
   register uint8_t *p;
+  (void)value; // TODO
 
   p = (uint8_t *)map->m_data;
   for (size_t i = 0; i < sizeof(sm_bitvec_t); i++, p++) {
@@ -446,7 +453,7 @@ __sm_chunk_map_select(__sm_chunk_t *map, size_t n, ssize_t *pnew_n)
  * '*offset' has been reached 0.
  */
 static size_t
-__sm_chunk_map_rank(__sm_chunk_t *map, size_t *offset, size_t idx)
+__sm_chunk_map_rank(__sm_chunk_t *map, size_t *offset, size_t idx, sm_bitvec_t *vec)
 {
   size_t ret = 0;
 
@@ -466,6 +473,7 @@ __sm_chunk_map_rank(__sm_chunk_t *map, size_t *offset, size_t idx)
           *offset = 0;
         }
       } else {
+        *vec = 0;
         return (ret);
       }
     } else if (flags == SM_PAYLOAD_ONES) {
@@ -480,6 +488,7 @@ __sm_chunk_map_rank(__sm_chunk_t *map, size_t *offset, size_t idx)
           *offset = 0;
         }
       } else {
+        *vec = UINT64_MAX;
         return (ret + idx);
       }
     } else if (flags == SM_PAYLOAD_MIXED) {
@@ -493,8 +502,11 @@ __sm_chunk_map_rank(__sm_chunk_t *map, size_t *offset, size_t idx)
         /* Create a mask for the range between offset and idx inclusive [*offset, idx]. */
         uint64_t offset_mask = (((uint64_t)1 << *offset) - 1);
         uint64_t idx_mask = idx >= 63 ? UINT64_MAX : ((uint64_t)1 << (idx + 1)) - 1;
-        ret += popcountll(w & (idx_mask - offset_mask));
+        sm_bitvec_t mw = w & (idx_mask - offset_mask);
+        ret += popcountll(mw);
         *offset = *offset > idx ? *offset - idx : 0;
+        *vec = mw;
+        (*vec) <<= *offset;
         return (ret);
       }
     }
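The SM_PAYLOAD_MIXED branch above builds a mask covering the inclusive bit range [*offset, idx] by subtracting two low-bit masks and then popcounting the masked word. A small standalone illustration of that arithmetic, using the compiler builtin in place of the project's popcountll() wrapper:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint64_t w = 0xF0F0F0F0F0F0F0F0ULL;  /* example payload vector */
  uint64_t offset = 2, idx = 5;        /* count set bits in [2, 5] */
  uint64_t offset_mask = (((uint64_t)1 << offset) - 1);                      /* bits below 2 */
  uint64_t idx_mask = idx >= 63 ? UINT64_MAX : ((uint64_t)1 << (idx + 1)) - 1; /* bits 0..5 */
  uint64_t mw = w & (idx_mask - offset_mask);                                /* keeps bits 2..5 */
  printf("rank contribution: %d\n", __builtin_popcountll(mw));               /* prints 2 here */
  return 0;
}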
@@ -593,7 +605,7 @@ __sm_get_chunk_map_count(sparsemap_t *map)
 static inline uint8_t *
 __sm_get_chunk_map_data(sparsemap_t *map, size_t offset)
 {
-  return (uint8_t *)(&map->m_data[SM_SIZEOF_OVERHEAD + offset]);
+  return (&map->m_data[SM_SIZEOF_OVERHEAD + offset]);
 }
 
 /**
@@ -644,33 +656,51 @@ __sm_get_aligned_offset(size_t idx)
 }
 
 /**
- * Returns the byte offset of a __sm_chunk_t in m_data
+ * Returns the byte offset of a __sm_chunk_t in m_data.
  */
 static ssize_t
-__sm_get_chunk_map_offset(sparsemap_t *map, size_t idx)
+__sm_get_chunk_map_offset(sparsemap_t *map, sm_loc_t idx)
 {
-  size_t count;
+  int count;
 
   count = __sm_get_chunk_map_count(map);
   if (count == 0) {
     return (-1);
   }
 
-  uint8_t *start = __sm_get_chunk_map_data(map, 0);
-  uint8_t *p = start;
-
-  for (size_t i = 0; i < count - 1; i++) {
-    sm_idx_t start = *(sm_idx_t *)p;
-    __sm_assert(start == __sm_get_aligned_offset(start));
-    __sm_chunk_t chunk;
-    __sm_chunk_map_init(&chunk, p + sizeof(sm_idx_t));
-    if (start >= idx || idx < start + __sm_chunk_map_get_capacity(&chunk)) {
-      break;
-    }
-    p += sizeof(sm_idx_t) + __sm_chunk_map_get_size(&chunk);
-  }
-
-  return ((ssize_t)(p - start));
+  if (idx > 0 || idx == 0) {
+    uint8_t *start = __sm_get_chunk_map_data(map, 0);
+    uint8_t *p = start;
+
+    for (sm_loc_t i = 0; i < count - 1; i++) {
+      sm_idx_t s = *(sm_idx_t *)p;
+      __sm_assert(s == __sm_get_aligned_offset(start));
+      __sm_chunk_t chunk;
+      __sm_chunk_map_init(&chunk, p + sizeof(sm_idx_t));
+      if (s >= idx || (unsigned long)idx < s + __sm_chunk_map_get_capacity(&chunk)) {
+        break;
+      }
+      p += sizeof(sm_idx_t) + __sm_chunk_map_get_size(&chunk);
+    }
+
+    return ((ssize_t)(p - start));
+  } else {
+    uint8_t *end = __sm_get_chunk_map_data(map, count - 1);
+    uint8_t *p = end;
+
+    for (sm_loc_t i = count - 1; i >= 0; i--) {
+      sm_idx_t e = *(sm_idx_t *)p;
+      __sm_assert(e == __sm_get_aligned_offset(end));
+      __sm_chunk_t chunk;
+      __sm_chunk_map_init(&chunk, p + sizeof(sm_idx_t));
+      if (e >= idx || (unsigned long)idx < e + __sm_chunk_map_get_capacity(&chunk)) {
+        break;
+      }
+      p += sizeof(sm_idx_t) + __sm_chunk_map_get_size(&chunk);
+    }
+
+    return ((ssize_t)(p - end));
+  }
 }
 
 /**
@@ -743,11 +773,21 @@ sparsemap_clear(sparsemap_t *map)
   __sm_set_chunk_map_count(map, 0);
 }
 
-/**
- * Allocate on a sparsemap_t on the heap and initialize it.
- */
 sparsemap_t *
-sparsemap(uint8_t *data, size_t size)
+sparsemap(size_t size)
+{
+  if (size == 0) {
+    size = 1024;
+  }
+  sparsemap_t *map = (sparsemap_t *)calloc(1, sizeof(sparsemap_t) + (size * sizeof(uint8_t)));
+  if (map) {
+    sparsemap_init(map, (uint8_t *)map + sizeof(sparsemap_t), size);
+  }
+  return map;
+}
+
+sparsemap_t *
+sparsemap_wrap(uint8_t *data, size_t size)
 {
   sparsemap_t *map = (sparsemap_t *)calloc(1, sizeof(sparsemap_t));
   if (map) {
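The new sparsemap() constructor packs the struct and its data buffer into a single calloc() and then points sparsemap_init() at the bytes immediately after the header. A generic sketch of that single-allocation pattern, using a hypothetical blob_t type rather than the library's internals:

#include <stdlib.h>

typedef struct blob {
  size_t size;
  /* payload bytes follow the header within the same allocation */
} blob_t;

static blob_t *blob_new(size_t size) {
  blob_t *b = calloc(1, sizeof(blob_t) + size); /* header + payload in one block */
  if (b) {
    b->size = size;
    unsigned char *payload = (unsigned char *)b + sizeof(blob_t); /* same trick as (uint8_t *)map + sizeof(sparsemap_t) */
    payload[0] = 0;
  }
  return b;
}

int main(void) {
  blob_t *b = blob_new(1024);
  free(b); /* one free() releases header and payload together */
  return 0;
}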
@@ -756,9 +796,6 @@ sparsemap(uint8_t *data, size_t size)
   return map;
 }
 
-/**
- * Initialize sparsemap_t with data.
- */
 void
 sparsemap_init(sparsemap_t *map, uint8_t *data, size_t size)
 {
@@ -768,9 +805,6 @@ sparsemap_init(sparsemap_t *map, uint8_t *data, size_t size)
   sparsemap_clear(map);
 }
 
-/**
- * Opens an existing sparsemap at the specified buffer.
- */
 void
 sparsemap_open(sparsemap_t *map, uint8_t *data, size_t data_size)
 {
@@ -779,9 +813,7 @@ sparsemap_open(sparsemap_t *map, uint8_t *data, size_t data_size)
   map->m_capacity = data_size;
 }
 
-/**
- * Resizes the data range.
- *
+/*
  * TODO/NOTE: This is a dangerous operation because we cannot verify that
  * data_size is not exceeding the size of the underlying buffer.
  */
@@ -791,10 +823,6 @@ sparsemap_set_data_size(sparsemap_t *map, size_t data_size)
   map->m_capacity = data_size;
 }
 
-/**
- * Calculates the remaining capacity as an integer that approaches 0 to
- * indicate full.
- */
 double
 sparsemap_capacity_remaining(sparsemap_t *map)
 {
@@ -807,23 +835,21 @@ sparsemap_capacity_remaining(sparsemap_t *map)
   return 100 - (((double)map->m_data_used / (double)map->m_capacity) * 100);
 }
 
-/**
- * Returns the size of the underlying byte array.
- */
 size_t
 sparsemap_get_capacity(sparsemap_t *map)
 {
   return (map->m_capacity);
 }
 
-/**
- * Returns the value of a bit at index |idx|.
- */
 bool
-sparsemap_is_set(sparsemap_t *map, size_t idx)
+sparsemap_is_set(sparsemap_t *map, sm_loc_t idx)
 {
   __sm_assert(sparsemap_get_size(map) >= SM_SIZEOF_OVERHEAD);
 
+  if (idx < 0) {
+    return (false);
+  }
+
   /* Get the __sm_chunk_t which manages this index */
   ssize_t offset = __sm_get_chunk_map_offset(map, idx);
 
@@ -839,8 +865,8 @@ sparsemap_is_set(sparsemap_t *map, size_t idx)
   __sm_chunk_map_init(&chunk, p + sizeof(sm_idx_t));
 
   /* Determine if the bit is out of bounds of the __sm_chunk_t; if yes then
      the bit is not set. */
-  if (idx < start || idx - start >= __sm_chunk_map_get_capacity(&chunk)) {
+  if (idx < start || (unsigned long)idx - start >= __sm_chunk_map_get_capacity(&chunk)) {
     return (false);
   }
 
@@ -848,14 +874,15 @@ sparsemap_is_set(sparsemap_t *map, size_t idx)
   return (__sm_chunk_map_is_set(&chunk, idx - start));
 }
 
-/**
- * Sets the bit at index |idx| to true or false, depending on |value|.
- */
 void
-sparsemap_set(sparsemap_t *map, size_t idx, bool value)
+sparsemap_set(sparsemap_t *map, sm_loc_t idx, bool value)
 {
   __sm_assert(sparsemap_get_size(map) >= SM_SIZEOF_OVERHEAD);
 
+  if (idx < 0) {
+    return;
+  }
+
   /* Get the __sm_chunk_t which manages this index */
   ssize_t offset = __sm_get_chunk_map_offset(map, idx);
   bool dont_grow = false;
@@ -917,7 +944,7 @@ sparsemap_set(sparsemap_t *map, size_t idx, bool value)
   else {
     __sm_chunk_t chunk;
     __sm_chunk_map_init(&chunk, p + sizeof(sm_idx_t));
-    if (idx - start >= __sm_chunk_map_get_capacity(&chunk)) {
+    if (idx - (unsigned long)start >= __sm_chunk_map_get_capacity(&chunk)) {
       if (value == false) {
         /* nothing to do */
         return;
@@ -931,7 +958,7 @@ sparsemap_set(sparsemap_t *map, size_t idx, bool value)
       __sm_insert_data(map, offset, &buf[0], sizeof(buf));
 
       start += __sm_chunk_map_get_capacity(&chunk);
-      if ((size_t)start + SM_CHUNK_MAX_CAPACITY < idx) {
+      if ((size_t)start + SM_CHUNK_MAX_CAPACITY < (unsigned long)idx) {
         start = __sm_get_fully_aligned_offset(idx);
       }
       *(sm_idx_t *)p = start;
@@ -984,16 +1011,15 @@ sparsemap_set(sparsemap_t *map, size_t idx, bool value)
   __sm_assert(sparsemap_get_size(map) >= SM_SIZEOF_OVERHEAD);
 }
 
-/**
- * Returns the offset of the very first bit.
- */
 sm_idx_t
-sparsemap_get_start_offset(sparsemap_t *map)
+sparsemap_get_starting_offset(sparsemap_t *map)
 {
-  if (__sm_get_chunk_map_count(map) == 0) {
+  size_t count = __sm_get_chunk_map_count(map);
+  if (count == 0) {
     return (0);
   }
-  return (*(sm_idx_t *)__sm_get_chunk_map_data(map, 0));
+  sm_idx_t *chunk = (sm_idx_t *)__sm_get_chunk_map_data(map, 0);
+  return *chunk;
 }
 
 /**
@@ -1032,14 +1058,13 @@ sparsemap_scan(sparsemap_t *map, void (*scanner)(sm_idx_t[], size_t), size_t ski
   }
 }
 
-/**
- * Appends all chunk maps from |sstart| to |other|, then reduces the chunk
- * map-count appropriately. |sstart| must be BitVector-aligned!
- */
 void
-sparsemap_split(sparsemap_t *map, size_t sstart, sparsemap_t *other)
+sparsemap_split(sparsemap_t *map, sm_loc_t offset, sparsemap_t *other)
 {
-  assert(sstart % SM_BITS_PER_VECTOR == 0);
+  assert(offset % SM_BITS_PER_VECTOR == 0);
 
+  if (offset < 0)
+    return;
+
   /* |dst| points to the destination buffer */
   uint8_t *dst = __sm_get_chunk_map_end(other);
@@ -1047,9 +1072,9 @@ sparsemap_split(sparsemap_t *map, size_t sstart, sparsemap_t *other)
   /* |src| points to the source-chunk map */
   uint8_t *src = __sm_get_chunk_map_data(map, 0);
 
-  /* |sstart| is relative to the beginning of this sparsemap_t; best
+  /* |offset| is relative to the beginning of this sparsemap_t; best
      make it absolute. */
-  sstart += *(sm_idx_t *)src;
+  offset += *(sm_idx_t *)src;
 
   bool in_middle = false;
   uint8_t *prev = src;
@@ -1058,14 +1083,14 @@ sparsemap_split(sparsemap_t *map, size_t sstart, sparsemap_t *other)
     sm_idx_t start = *(sm_idx_t *)src;
     __sm_chunk_t chunk;
     __sm_chunk_map_init(&chunk, src + sizeof(sm_idx_t));
-    if (start == sstart) {
+    if (start == offset) {
       break;
     }
-    if (start + __sm_chunk_map_get_capacity(&chunk) > sstart) {
+    if (start + __sm_chunk_map_get_capacity(&chunk) > (unsigned long)offset) {
       in_middle = true;
       break;
     }
-    if (start > sstart) {
+    if (start > offset) {
       src = prev;
       i--;
       break;
@@ -1083,12 +1108,12 @@ sparsemap_split(sparsemap_t *map, size_t sstart, sparsemap_t *other)
   /* Now copy all the remaining chunks. */
   int moved = 0;
 
-  /* If |sstart| is in the middle of a chunk then this chunk has to be split */
+  /* If |offset| is in the middle of a chunk then this chunk has to be split */
   if (in_middle) {
     uint8_t buf[sizeof(sm_idx_t) + sizeof(sm_bitvec_t) * 2] = { 0 };
     memcpy(dst, &buf[0], sizeof(buf));
 
-    *(sm_idx_t *)dst = sstart;
+    *(sm_idx_t *)dst = offset;
     dst += sizeof(sm_idx_t);
 
     /* the |other| sparsemap_t now has one additional chunk */
@@ -1104,11 +1129,11 @@ sparsemap_split(sparsemap_t *map, size_t sstart, sparsemap_t *other)
 
     __sm_chunk_t d_chunk;
     __sm_chunk_map_init(&d_chunk, dst);
-    __sm_chunk_map_set_capacity(&d_chunk, capacity - (sstart % capacity));
+    __sm_chunk_map_set_capacity(&d_chunk, capacity - (offset % capacity));
 
     /* Now copy the bits. */
-    size_t d = sstart;
-    for (size_t j = sstart % capacity; j < capacity; j++, d++) {
+    sm_loc_t d = offset;
+    for (size_t j = offset % capacity; j < capacity; j++, d++) {
       if (__sm_chunk_map_is_set(&s_chunk, j)) {
         sparsemap_set(other, d, true);
       }
@@ -1120,7 +1145,7 @@ sparsemap_split(sparsemap_t *map, size_t sstart, sparsemap_t *other)
     i++;
 
     /* Reduce the capacity of the source-chunk map. */
-    __sm_chunk_map_set_capacity(&s_chunk, sstart % capacity);
+    __sm_chunk_map_set_capacity(&s_chunk, offset % capacity);
   }
 
   /* Now continue with all remaining minimaps. */
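For orientation, a rough sketch of how sparsemap_split() might be driven with the renamed |offset| parameter. It is illustrative only: the split offset must stay aligned to the 64-bit vector size per the assert above, and the exact post-split ownership of bits is taken from the header comment, not verified here.

#include <stdlib.h>
#include "../include/sparsemap.h"

int main(void) {
  sparsemap_t *left = sparsemap(1024);
  sparsemap_t *right = sparsemap(1024);

  for (int i = 0; i < 256; i++) {
    sparsemap_set(left, i, true);
  }

  /* 128 is a multiple of 64, satisfying the vector-alignment assertion. */
  sparsemap_split(left, 128, right);

  /* Chunks at and above the split offset are appended to |right|. */
  (void)sparsemap_is_set(right, 200);

  free(left);
  free(right);
  return 0;
}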
@@ -1152,93 +1177,99 @@ sparsemap_split
   assert(sparsemap_get_size(other) > SM_SIZEOF_OVERHEAD);
 }
 
-/**
- * Returns the index of the n'th set bit; uses a 0-based index,
- * i.e. n == 0 for the first bit which is set, n == 1 for the second bit etc.
- */
-size_t
-sparsemap_select(sparsemap_t *map, size_t n)
+sm_loc_t
+sparsemap_select(sparsemap_t *map, sm_loc_t n, bool value)
 {
   assert(sparsemap_get_size(map) >= SM_SIZEOF_OVERHEAD);
   size_t result;
   size_t count = __sm_get_chunk_map_count(map);
-  uint8_t *p = __sm_get_chunk_map_data(map, 0);
+  if (n >= 0) {
+    uint8_t *p = __sm_get_chunk_map_data(map, 0);
 
-  for (size_t i = 0; i < count; i++) {
-    result = *(sm_idx_t *)p;
-    p += sizeof(sm_idx_t);
-    __sm_chunk_t chunk;
-    __sm_chunk_map_init(&chunk, p);
+    for (size_t i = 0; i < count; i++) {
+      result = *(sm_idx_t *)p;
+      p += sizeof(sm_idx_t);
+      __sm_chunk_t chunk;
+      __sm_chunk_map_init(&chunk, p);
 
-    ssize_t new_n = (ssize_t)n;
-    size_t index = __sm_chunk_map_select(&chunk, n, &new_n);
-    if (new_n == -1) {
-      return (result + index);
+      ssize_t new_n = (ssize_t)n;
+      size_t index = __sm_chunk_map_select(&chunk, n, &new_n, value);
+      if (new_n == -1) {
+        return (result + index);
+      }
+      n = new_n;
+
+      p += __sm_chunk_map_get_size(&chunk);
     }
-    n = new_n;
-
-    p += __sm_chunk_map_get_size(&chunk);
-  }
 #ifdef DEBUG
-  assert(!"shouldn't be here");
+    assert(!"shouldn't be here");
 #endif
-  return (size_t)-1;
+    return SM_LOC_MAX;
+  } else {
+    return SM_LOC_MIN; // TODO... sparsemap_select(map, -n, value);
+  }
 }
 
-/**
- * Counts the set bits starting at 'offset' until and including 'idx', meaning
- * [offset, idx] inclusive.
- */
 size_t
-sparsemap_rank(sparsemap_t *map, size_t offset, size_t idx)
+sparsemap_rank_vec(sparsemap_t *map, size_t x, size_t y, bool value, sm_bitvec_t *vec)
 {
+  (void)value; //TODO
   assert(sparsemap_get_size(map) >= SM_SIZEOF_OVERHEAD);
   size_t result = 0, prev = 0, count = __sm_get_chunk_map_count(map);
   uint8_t *p = __sm_get_chunk_map_data(map, 0);
 
   for (size_t i = 0; i < count; i++) {
     sm_idx_t start = *(sm_idx_t *)p;
-    if (start > idx) {
+    if (start > y) {
       return (result);
     }
-    offset -= start - prev;
+    x -= start - prev;
     prev = start;
     p += sizeof(sm_idx_t);
     __sm_chunk_t chunk;
     __sm_chunk_map_init(&chunk, p);
 
-    result += __sm_chunk_map_rank(&chunk, &offset, idx - start);
+    result += __sm_chunk_map_rank(&chunk, &x, y - start, vec);
     p += __sm_chunk_map_get_size(&chunk);
   }
   return (result);
+  // TODO: sparsemap_rank(map, x, y, false)
 }
 
-/**
- * Finds a span of set bits of at least |len| after |loc|. Returns the index of
- * the n'th set bit that starts a span of at least |len| bits set to true.
- */
 size_t
-sparsemap_span(sparsemap_t *map, size_t loc, size_t len)
+sparsemap_rank(sparsemap_t *map, size_t x, size_t y, bool value)
 {
-  size_t offset, nth = 0, count;
-  (void)loc; // TODO
+  sm_bitvec_t vec;
+  return sparsemap_rank_vec(map, x, y, value, &vec);
+}
 
-  offset = sparsemap_select(map, 0);
+size_t
+sparsemap_span(sparsemap_t *map, sm_loc_t idx, size_t len, bool value)
+{
+  size_t count, nth = 0;
+  sm_bitvec_t vec = 0;
+  sm_loc_t offset;
+
+  offset = sparsemap_select(map, nth++, value);
   if (len == 1) {
     return offset;
   }
   do {
-    count = sparsemap_rank(map, offset, offset + len);
+    count = sparsemap_rank_vec(map, offset, offset + len, value, &vec);
     if (count == len) {
       return offset;
     } else {
-      count = len;
-      while (--count && sparsemap_is_set(map, offset)) {
+      // TODO: what is nth when len > SM_BITS_PER_VECTOR?
+      int c = len > SM_BITS_PER_VECTOR ? SM_BITS_PER_VECTOR : len;
+      for (int b = 0; b < c && (vec & 1 << b); b++) {
         nth++;
       }
     }
-    offset = sparsemap_select(map, nth);
-  } while (offset != ((size_t)-1));
+    if (count)
+      nth++;
+    /* Use select to potentially jump very far forward in the map. */
+    offset = sparsemap_select(map, nth, value);
+  } while (offset != SM_LOC_MAX);
 
-  return offset;
+  return idx > 0 ? SM_LOC_MAX : SM_LOC_MIN;
 }
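A quick sketch of the intended select/rank/span semantics as described by the header comments in this commit. Only the positive-|n|, |value| == true paths are exercised, since the negative index and unset-bit paths are still marked TODO above; the expected values are assumptions derived from those comments, not output from a run.

#include <assert.h>
#include <stdlib.h>
#include "../include/sparsemap.h"

int main(void) {
  sparsemap_t *map = sparsemap(1024);

  sparsemap_set(map, 10, true);
  sparsemap_set(map, 11, true);
  sparsemap_set(map, 12, true);
  sparsemap_set(map, 20, true);

  assert(sparsemap_select(map, 0, true) == 10);  /* 0-based: first set bit */
  assert(sparsemap_select(map, 3, true) == 20);  /* fourth set bit */
  assert(sparsemap_rank(map, 0, 12, true) == 3); /* set bits in the inclusive range [0, 12] */
  assert(sparsemap_span(map, 0, 3, true) == 10); /* first run of 3 consecutive set bits */

  free(map);
  return 0;
}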
@@ -1,13 +1,19 @@
-#include <sys/types.h>
+#define _POSIX_C_SOURCE 199309L
+#define X86_INTRIN
+
 #include <assert.h>
-#include <pthread.h>
-#include <sparsemap.h>
+#include <pthread.h> // If using threads
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
+#include <time.h>
 #include <unistd.h>
+#ifdef X86_INTRIN
+#include <x86intrin.h>
+#endif
+
+#include "../include/sparsemap.h"
 #include "common.h"
 
 #pragma GCC diagnostic push
@@ -22,84 +28,25 @@
 uint64_t
 tsc(void)
 {
-  uint32_t low, high;
-  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
-  return ((uint64_t)high << 32) | low;
-}
-
-static uint64_t
-get_tsc_frequency()
-{
-  uint32_t high, low;
-  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
-  __asm__ volatile("rdtsc");
-  return ((uint64_t)high << 32) | low;
-}
-
-double
-tsc_ticks_to_ns(uint64_t tsc_ticks)
-{
-  static uint64_t tsc_freq = 0;
-  if (tsc_freq == 0) {
-    tsc_freq = get_tsc_frequency();
-  }
-  return (double)tsc_ticks / (double)tsc_freq * 1e9;
-}
-
-void
-est_sift_up(uint64_t *heap, int child_index)
-{
-  while (child_index > 0) {
-    int parent_index = (child_index - 1) / 2;
-    if (heap[parent_index] > heap[child_index]) {
-      // Swap parent and child
-      uint64_t temp = heap[parent_index];
-      heap[parent_index] = heap[child_index];
-      heap[child_index] = temp;
-      child_index = parent_index;
-    } else {
-      break; // Heap property satisfied
-    }
-  }
-}
-
-void
-est_sift_down(uint64_t *heap, int heap_size, int parent_index)
-{
-  int child_index = 2 * parent_index + 1; // Left child
-  while (child_index < heap_size) {
-    // Right child exists and is smaller than left child
-    if (child_index + 1 < heap_size && heap[child_index + 1] < heap[child_index]) {
-      child_index++;
-    }
-    // If the smallest child is smaller than the parent, swap them
-    if (heap[child_index] < heap[parent_index]) {
-      uint64_t temp = heap[child_index];
-      heap[child_index] = heap[parent_index];
-      heap[parent_index] = temp;
-      parent_index = child_index;
-      child_index = 2 * parent_index + 1;
-    } else {
-      break; // Heap property satisfied
-    }
-  }
-}
-
-void
-est_insert_value(uint64_t *heap, int heap_max_size, int *heap_size, uint64_t value)
-{
-  if (*heap_size < heap_max_size) { // Heap not full, insert value
-    heap[*heap_size] = value;
-    est_sift_up(heap, *heap_size);
-    (*heap_size)++;
-  } else {
-    // Heap is full, replace root with new value with a certain probability
-    // This is a very naive approach to maintain a sample of the input
-    if (rand() % 2) {
-      heap[0] = value;
-      est_sift_down(heap, heap_max_size, 0);
-    }
-  }
-}
+#ifdef X86_INTRIN
+  return __rdtsc();
+#else
+  uint32_t low, high;
+  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
+  return ((uint64_t)high << 32) | low;
+#endif
+}
+
+double
+nsts()
+{
+  struct timespec ts;
+
+  if (clock_gettime(CLOCK_REALTIME, &ts) == -1) {
+    perror("clock_gettime");
+    return -1.0; // Return -1.0 on error
+  }
+  return ts.tv_sec + ts.tv_nsec / 1e9;
+}
 
 int __xorshift32_state = 0;
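The new nsts() helper returns a wall-clock timestamp in seconds as a double (or -1.0 on error), which makes ad-hoc timing in the tests straightforward. A minimal sketch of one way it could be used, assuming the caller includes tests/common.h:

#include <stdio.h>
#include "common.h"

int main(void) {
  double t0 = nsts();
  for (volatile int i = 0; i < 1000000; i++) {
    /* workload being measured */
  }
  double t1 = nsts();
  printf("elapsed: %.9f seconds\n", t1 - t0);
  return 0;
}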
@@ -23,20 +23,9 @@
 #define XORSHIFT_SEED_VALUE ((unsigned int)time(NULL) ^ getpid())
 #endif
 
-#define EST_MEDIAN_DECL(decl, size) \
-  uint64_t heap_##decl[size] = { 0 }; \
-  int heap_##decl##_max_size = size; \
-  int heap_##decl##_size = 0;
-
-#define EST_MEDIAN_ADD(decl, value) est_insert_value(heap_##decl, heap_##decl##_max_size, &heap_##decl##_size, (value));
-
-#define EST_MEDIAN_GET(decl) heap_##decl[0]
-
 uint64_t tsc(void);
 double tsc_ticks_to_ns(uint64_t tsc_ticks);
-void est_sift_up(uint64_t *heap, int child_index);
-void est_sift_down(uint64_t *heap, int heap_size, int parent_index);
-void est_insert_value(uint64_t *heap, int heap_max_size, int *heap_size, uint64_t value);
+double nsts();
 
 void xorshift32_seed();
 uint32_t xorshift32();
tests/tdigest.c | 680 (new file)
@ -0,0 +1,680 @@
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include "tdigest.h"
|
||||||
|
#include <errno.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#ifndef TD_MALLOC_INCLUDE
|
||||||
|
#define TD_MALLOC_INCLUDE "td_malloc.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef TD_ALLOC_H
|
||||||
|
#define TD_ALLOC_H
|
||||||
|
#define __td_malloc malloc
|
||||||
|
#define __td_calloc calloc
|
||||||
|
#define __td_realloc realloc
|
||||||
|
#define __td_free free
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define __td_max(x, y) (((x) > (y)) ? (x) : (y))
|
||||||
|
#define __td_min(x, y) (((x) < (y)) ? (x) : (y))
|
||||||
|
|
||||||
|
static inline double weighted_average_sorted(double x1, double w1, double x2, double w2) {
|
||||||
|
const double x = (x1 * w1 + x2 * w2) / (w1 + w2);
|
||||||
|
return __td_max(x1, __td_min(x, x2));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool _tdigest_long_long_add_safe(long long a, long long b) {
|
||||||
|
if (b < 0) {
|
||||||
|
return (a >= __LONG_LONG_MAX__ - b);
|
||||||
|
} else {
|
||||||
|
return (a <= __LONG_LONG_MAX__ - b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline double weighted_average(double x1, double w1, double x2, double w2) {
|
||||||
|
if (x1 <= x2) {
|
||||||
|
return weighted_average_sorted(x1, w1, x2, w2);
|
||||||
|
} else {
|
||||||
|
return weighted_average_sorted(x2, w2, x1, w1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void swap(double *arr, int i, int j) {
|
||||||
|
const double temp = arr[i];
|
||||||
|
arr[i] = arr[j];
|
||||||
|
arr[j] = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void swap_l(long long *arr, int i, int j) {
|
||||||
|
const long long temp = arr[i];
|
||||||
|
arr[i] = arr[j];
|
||||||
|
arr[j] = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
static unsigned int partition(double *means, long long *weights, unsigned int start,
|
||||||
|
unsigned int end, unsigned int pivot_idx) {
|
||||||
|
const double pivotMean = means[pivot_idx];
|
||||||
|
swap(means, pivot_idx, end);
|
||||||
|
swap_l(weights, pivot_idx, end);
|
||||||
|
|
||||||
|
int i = start - 1;
|
||||||
|
|
||||||
|
for (unsigned int j = start; j < end; j++) {
|
||||||
|
// If current element is smaller than the pivot
|
||||||
|
if (means[j] < pivotMean) {
|
||||||
|
// increment index of smaller element
|
||||||
|
i++;
|
||||||
|
swap(means, i, j);
|
||||||
|
swap_l(weights, i, j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
swap(means, i + 1, end);
|
||||||
|
swap_l(weights, i + 1, end);
|
||||||
|
return i + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Standard quick sort except that sorting rearranges parallel arrays
|
||||||
|
*
|
||||||
|
* @param means Values to sort on
|
||||||
|
* @param weights The auxillary values to sort.
|
||||||
|
* @param start The beginning of the values to sort
|
||||||
|
* @param end The value after the last value to sort
|
||||||
|
*/
|
||||||
|
static void td_qsort(double *means, long long *weights, unsigned int start, unsigned int end) {
|
||||||
|
if (start < end) {
|
||||||
|
// two elements can be directly compared
|
||||||
|
if ((end - start) == 1) {
|
||||||
|
if (means[start] > means[end]) {
|
||||||
|
swap(means, start, end);
|
||||||
|
swap_l(weights, start, end);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// generating a random number as a pivot was very expensive vs the array size
|
||||||
|
// const unsigned int pivot_idx = start + rand()%(end - start + 1);
|
||||||
|
const unsigned int pivot_idx = (end + start) / 2; // central pivot
|
||||||
|
const unsigned int new_pivot_idx = partition(means, weights, start, end, pivot_idx);
|
||||||
|
if (new_pivot_idx > start) {
|
||||||
|
td_qsort(means, weights, start, new_pivot_idx - 1);
|
||||||
|
}
|
||||||
|
td_qsort(means, weights, new_pivot_idx + 1, end);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline size_t cap_from_compression(double compression) {
|
||||||
|
if ((size_t)compression > ((SIZE_MAX / sizeof(double) / 6) - 10)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (6 * (size_t)(compression)) + 10;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool should_td_compress(td_histogram_t *h) {
|
||||||
|
return ((h->merged_nodes + h->unmerged_nodes) >= (h->cap - 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int next_node(td_histogram_t *h) { return h->merged_nodes + h->unmerged_nodes; }
|
||||||
|
|
||||||
|
int td_compress(td_histogram_t *h);
|
||||||
|
|
||||||
|
static inline int _check_overflow(const double v) {
|
||||||
|
// double-precision overflow detected on h->unmerged_weight
|
||||||
|
if (v == INFINITY) {
|
||||||
|
return EDOM;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int _check_td_overflow(const double new_unmerged_weight,
|
||||||
|
const double new_total_weight) {
|
||||||
|
// double-precision overflow detected on h->unmerged_weight
|
||||||
|
if (new_unmerged_weight == INFINITY) {
|
||||||
|
return EDOM;
|
||||||
|
}
|
||||||
|
if (new_total_weight == INFINITY) {
|
||||||
|
return EDOM;
|
||||||
|
}
|
||||||
|
const double denom = 2 * MM_PI * new_total_weight * log(new_total_weight);
|
||||||
|
if (denom == INFINITY) {
|
||||||
|
return EDOM;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int td_centroid_count(td_histogram_t *h) { return next_node(h); }
|
||||||
|
|
||||||
|
void td_reset(td_histogram_t *h) {
|
||||||
|
if (!h) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
h->min = __DBL_MAX__;
|
||||||
|
h->max = -h->min;
|
||||||
|
h->merged_nodes = 0;
|
||||||
|
h->merged_weight = 0;
|
||||||
|
h->unmerged_nodes = 0;
|
||||||
|
h->unmerged_weight = 0;
|
||||||
|
h->total_compressions = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int td_init(double compression, td_histogram_t **result) {
|
||||||
|
|
||||||
|
const size_t capacity = cap_from_compression(compression);
|
||||||
|
if (capacity < 1) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
td_histogram_t *histogram;
|
||||||
|
histogram = (td_histogram_t *)__td_malloc(sizeof(td_histogram_t));
|
||||||
|
if (!histogram) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
histogram->cap = capacity;
|
||||||
|
histogram->compression = (double)compression;
|
||||||
|
td_reset(histogram);
|
||||||
|
histogram->nodes_mean = (double *)__td_calloc(capacity, sizeof(double));
|
||||||
|
if (!histogram->nodes_mean) {
|
||||||
|
td_free(histogram);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
histogram->nodes_weight = (long long *)__td_calloc(capacity, sizeof(long long));
|
||||||
|
if (!histogram->nodes_weight) {
|
||||||
|
td_free(histogram);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
*result = histogram;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
td_histogram_t *td_new(double compression) {
|
||||||
|
td_histogram_t *mdigest = NULL;
|
||||||
|
td_init(compression, &mdigest);
|
||||||
|
return mdigest;
|
||||||
|
}
|
||||||
|
|
||||||
|
void td_free(td_histogram_t *histogram) {
|
||||||
|
if (histogram->nodes_mean) {
|
||||||
|
__td_free((void *)(histogram->nodes_mean));
|
||||||
|
}
|
||||||
|
if (histogram->nodes_weight) {
|
||||||
|
__td_free((void *)(histogram->nodes_weight));
|
||||||
|
}
|
||||||
|
__td_free((void *)(histogram));
|
||||||
|
}
|
||||||
|
|
||||||
|
int td_merge(td_histogram_t *into, td_histogram_t *from) {
|
||||||
|
if (td_compress(into) != 0)
|
||||||
|
return EDOM;
|
||||||
|
if (td_compress(from) != 0)
|
||||||
|
return EDOM;
|
||||||
|
const int pos = from->merged_nodes + from->unmerged_nodes;
|
||||||
|
for (int i = 0; i < pos; i++) {
|
||||||
|
const double mean = from->nodes_mean[i];
|
||||||
|
const long long weight = from->nodes_weight[i];
|
||||||
|
if (td_add(into, mean, weight) != 0) {
|
||||||
|
return EDOM;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
long long td_size(td_histogram_t *h) { return h->merged_weight + h->unmerged_weight; }

double td_cdf(td_histogram_t *h, double val) {
    td_compress(h);
    // no data to examine
    if (h->merged_nodes == 0) {
        return NAN;
    }
    // below lower bound
    if (val < h->min) {
        return 0;
    }
    // above upper bound
    if (val > h->max) {
        return 1;
    }
    if (h->merged_nodes == 1) {
        // exactly one centroid, should have max==min
        const double width = h->max - h->min;
        if (val - h->min <= width) {
            // min and max are too close together to do any viable interpolation
            return 0.5;
        } else {
            // interpolate if somehow we have weight > 0 and max != min
            return (val - h->min) / width;
        }
    }
    const int n = h->merged_nodes;
    // check for the left tail
    const double left_centroid_mean = h->nodes_mean[0];
    const double left_centroid_weight = (double)h->nodes_weight[0];
    const double merged_weight_d = (double)h->merged_weight;
    if (val < left_centroid_mean) {
        // note that this is different than h->nodes_mean[0] > min
        // ... this guarantees we divide by non-zero number and interpolation works
        const double width = left_centroid_mean - h->min;
        if (width > 0) {
            // must be a sample exactly at min
            if (val == h->min) {
                return 0.5 / merged_weight_d;
            } else {
                return (1 + (val - h->min) / width * (left_centroid_weight / 2 - 1)) /
                       merged_weight_d;
            }
        } else {
            // this should be redundant with the check val < h->min
            return 0;
        }
    }
    // and the right tail
    const double right_centroid_mean = h->nodes_mean[n - 1];
    const double right_centroid_weight = (double)h->nodes_weight[n - 1];
    if (val > right_centroid_mean) {
        const double width = h->max - right_centroid_mean;
        if (width > 0) {
            if (val == h->max) {
                return 1 - 0.5 / merged_weight_d;
            } else {
                // there has to be a single sample exactly at max
                const double dq = (1 + (h->max - val) / width * (right_centroid_weight / 2 - 1)) /
                                  merged_weight_d;
                return 1 - dq;
            }
        } else {
            return 1;
        }
    }
    // we know that there are at least two centroids and mean[0] < x < mean[n-1]
    // that means that there are either one or more consecutive centroids all at exactly x
    // or there are consecutive centroids, c0 < x < c1
    double weightSoFar = 0;
    for (int it = 0; it < n - 1; it++) {
        // weightSoFar does not include weight[it] yet
        if (h->nodes_mean[it] == val) {
            // we have one or more centroids == x, treat them as one
            // dw will accumulate the weight of all of the centroids at x
            double dw = 0;
            while (it < n && h->nodes_mean[it] == val) {
                dw += (double)h->nodes_weight[it];
                it++;
            }
            return (weightSoFar + dw / 2) / (double)h->merged_weight;
        } else if (h->nodes_mean[it] <= val && val < h->nodes_mean[it + 1]) {
            const double node_weight = (double)h->nodes_weight[it];
            const double node_weight_next = (double)h->nodes_weight[it + 1];
            const double node_mean = h->nodes_mean[it];
            const double node_mean_next = h->nodes_mean[it + 1];
            // landed between centroids ... check for floating point madness
            if (node_mean_next - node_mean > 0) {
                // note how we handle singleton centroids here
                // the point is that for singleton centroids, we know that their entire
                // weight is exactly at the centroid and thus shouldn't be involved in
                // interpolation
                double leftExcludedW = 0;
                double rightExcludedW = 0;
                if (node_weight == 1) {
                    if (node_weight_next == 1) {
                        // two singletons means no interpolation
                        // left singleton is in, right is out
                        return (weightSoFar + 1) / merged_weight_d;
                    } else {
                        leftExcludedW = 0.5;
                    }
                } else if (node_weight_next == 1) {
                    rightExcludedW = 0.5;
                }
                double dw = (node_weight + node_weight_next) / 2;

                // adjust endpoints for any singleton
                double dwNoSingleton = dw - leftExcludedW - rightExcludedW;

                double base = weightSoFar + node_weight / 2 + leftExcludedW;
                return (base + dwNoSingleton * (val - node_mean) / (node_mean_next - node_mean)) /
                       merged_weight_d;
            } else {
                // this is simply caution against floating point madness
                // it is conceivable that the centroids will be different
                // but too near to allow safe interpolation
                double dw = (node_weight + node_weight_next) / 2;
                return (weightSoFar + dw) / merged_weight_d;
            }
        } else {
            weightSoFar += (double)h->nodes_weight[it];
        }
    }
    return 1 - 0.5 / merged_weight_d;
}

static double td_internal_iterate_centroids_to_index(const td_histogram_t *h, const double index,
                                                     const double left_centroid_weight,
                                                     const int total_centroids, double *weightSoFar,
                                                     int *node_pos) {
    if (left_centroid_weight > 1 && index < left_centroid_weight / 2) {
        // there is a single sample at min so we interpolate with less weight
        return h->min + (index - 1) / (left_centroid_weight / 2 - 1) * (h->nodes_mean[0] - h->min);
    }

    // usually the last centroid will have unit weight so this test will make it moot
    if (index > h->merged_weight - 1) {
        return h->max;
    }

    // if the right-most centroid has more than one sample, we still know
    // that one sample occurred at max so we can do some interpolation
    const double right_centroid_weight = (double)h->nodes_weight[total_centroids - 1];
    const double right_centroid_mean = h->nodes_mean[total_centroids - 1];
    if (right_centroid_weight > 1 &&
        (double)h->merged_weight - index <= right_centroid_weight / 2) {
        return h->max - ((double)h->merged_weight - index - 1) / (right_centroid_weight / 2 - 1) *
                            (h->max - right_centroid_mean);
    }

    for (; *node_pos < total_centroids - 1; (*node_pos)++) {
        const int i = *node_pos;
        const double node_weight = (double)h->nodes_weight[i];
        const double node_weight_next = (double)h->nodes_weight[i + 1];
        const double node_mean = h->nodes_mean[i];
        const double node_mean_next = h->nodes_mean[i + 1];
        const double dw = (node_weight + node_weight_next) / 2;
        if (*weightSoFar + dw > index) {
            // centroids i and i+1 bracket our current point
            // check for unit weight
            double leftUnit = 0;
            if (node_weight == 1) {
                if (index - *weightSoFar < 0.5) {
                    // within the singleton's sphere
                    return node_mean;
                } else {
                    leftUnit = 0.5;
                }
            }
            double rightUnit = 0;
            if (node_weight_next == 1) {
                if (*weightSoFar + dw - index <= 0.5) {
                    // no interpolation needed near singleton
                    return node_mean_next;
                }
                rightUnit = 0.5;
            }
            const double z1 = index - *weightSoFar - leftUnit;
            const double z2 = *weightSoFar + dw - index - rightUnit;
            return weighted_average(node_mean, z2, node_mean_next, z1);
        }
        *weightSoFar += dw;
    }

    // weightSoFar = totalWeight - weight[total_centroids-1]/2 (very nearly)
    // so we interpolate out to max value ever seen
    const double z1 = index - h->merged_weight - right_centroid_weight / 2.0;
    const double z2 = right_centroid_weight / 2 - z1;
    return weighted_average(right_centroid_mean, z1, h->max, z2);
}

double td_quantile(td_histogram_t *h, double q) {
    td_compress(h);
    // q should be in [0,1]
    if (q < 0.0 || q > 1.0 || h->merged_nodes == 0) {
        return NAN;
    }
    // with one data point, all quantiles lead to Rome
    if (h->merged_nodes == 1) {
        return h->nodes_mean[0];
    }

    // if values were stored in a sorted array, index would be the offset we are interested in
    const double index = q * (double)h->merged_weight;

    // beyond the boundaries, we return min or max
    // usually, the first centroid will have unit weight so this will make it moot
    if (index < 1) {
        return h->min;
    }

    // we know that there are at least two centroids now
    const int n = h->merged_nodes;

    // if the left centroid has more than one sample, we still know
    // that one sample occurred at min so we can do some interpolation
    const double left_centroid_weight = (double)h->nodes_weight[0];

    // in between extremes we interpolate between centroids
    double weightSoFar = left_centroid_weight / 2;
    int i = 0;
    return td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n, &weightSoFar,
                                                  &i);
}

int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length) {
    td_compress(h);

    if (NULL == quantiles || NULL == values) {
        return EINVAL;
    }

    const int n = h->merged_nodes;
    if (n == 0) {
        for (size_t i = 0; i < length; i++) {
            values[i] = NAN;
        }
        return 0;
    }
    if (n == 1) {
        for (size_t i = 0; i < length; i++) {
            const double requested_quantile = quantiles[i];

            // q should be in [0,1]
            if (requested_quantile < 0.0 || requested_quantile > 1.0) {
                values[i] = NAN;
            } else {
                // with one data point, all quantiles lead to Rome
                values[i] = h->nodes_mean[0];
            }
        }
        return 0;
    }

    // we know that there are at least two centroids now
    // if the left centroid has more than one sample, we still know
    // that one sample occurred at min so we can do some interpolation
    const double left_centroid_weight = (double)h->nodes_weight[0];

    // in between extremes we interpolate between centroids
    double weightSoFar = left_centroid_weight / 2;
    int node_pos = 0;

    // to avoid allocations we use the values array for intermediate computation
    // i.e. to store the expected cumulative count at each percentile
    for (size_t qpos = 0; qpos < length; qpos++) {
        const double index = quantiles[qpos] * (double)h->merged_weight;
        values[qpos] = td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n,
                                                              &weightSoFar, &node_pos);
    }
    return 0;
}

static double td_internal_trimmed_mean(const td_histogram_t *h, const double leftmost_weight,
                                       const double rightmost_weight) {
    double count_done = 0;
    double trimmed_sum = 0;
    double trimmed_count = 0;
    for (int i = 0; i < h->merged_nodes; i++) {

        const double n_weight = (double)h->nodes_weight[i];
        // Assume the whole centroid falls into the range
        double count_add = n_weight;

        // If we haven't reached the low threshold yet, skip appropriate part of the centroid.
        count_add -= __td_min(__td_max(0, leftmost_weight - count_done), count_add);

        // If we have reached the upper threshold, ignore the overflowing part of the centroid.
        count_add = __td_min(__td_max(0, rightmost_weight - count_done), count_add);

        // consider the whole centroid processed
        count_done += n_weight;

        // increment the sum / count
        trimmed_sum += h->nodes_mean[i] * count_add;
        trimmed_count += count_add;

        // break once we cross the high threshold
        if (count_done >= rightmost_weight)
            break;
    }

    return trimmed_sum / trimmed_count;
}

double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut) {
    td_compress(h);
    // proportion_to_cut should be in [0,1]
    if (h->merged_nodes == 0 || proportion_to_cut < 0.0 || proportion_to_cut > 1.0) {
        return NAN;
    }
    // with one data point, all values lead to Rome
    if (h->merged_nodes == 1) {
        return h->nodes_mean[0];
    }

    /* translate the percentiles to counts */
    const double leftmost_weight = floor((double)h->merged_weight * proportion_to_cut);
    const double rightmost_weight = ceil((double)h->merged_weight * (1.0 - proportion_to_cut));

    return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight);
}

double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut) {
    td_compress(h);
    // leftmost_cut and rightmost_cut should be in [0,1]
    if (h->merged_nodes == 0 || leftmost_cut < 0.0 || leftmost_cut > 1.0 || rightmost_cut < 0.0 ||
        rightmost_cut > 1.0) {
        return NAN;
    }
    // with one data point, all values lead to Rome
    if (h->merged_nodes == 1) {
        return h->nodes_mean[0];
    }

    /* translate the percentiles to counts */
    const double leftmost_weight = floor((double)h->merged_weight * leftmost_cut);
    const double rightmost_weight = ceil((double)h->merged_weight * rightmost_cut);

    return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight);
}

int td_add(td_histogram_t *h, double mean, long long weight) {
    if (should_td_compress(h)) {
        const int overflow_res = td_compress(h);
        if (overflow_res != 0)
            return overflow_res;
    }
    const int pos = next_node(h);
    if (pos >= h->cap)
        return EDOM;
    if (_tdigest_long_long_add_safe(h->unmerged_weight, weight) == false)
        return EDOM;
    const long long new_unmerged_weight = h->unmerged_weight + weight;
    if (_tdigest_long_long_add_safe(new_unmerged_weight, h->merged_weight) == false)
        return EDOM;
    const long long new_total_weight = new_unmerged_weight + h->merged_weight;
    // double-precision overflow detected
    const int overflow_res =
        _check_td_overflow((double)new_unmerged_weight, (double)new_total_weight);
    if (overflow_res != 0)
        return overflow_res;

    if (mean < h->min) {
        h->min = mean;
    }
    if (mean > h->max) {
        h->max = mean;
    }
    h->nodes_mean[pos] = mean;
    h->nodes_weight[pos] = weight;
    h->unmerged_nodes++;
    h->unmerged_weight = new_unmerged_weight;
    return 0;
}

int td_compress(td_histogram_t *h) {
    if (h->unmerged_nodes == 0) {
        return 0;
    }
    int N = h->merged_nodes + h->unmerged_nodes;
    td_qsort(h->nodes_mean, h->nodes_weight, 0, N - 1);
    const double total_weight = (double)h->merged_weight + (double)h->unmerged_weight;
    // double-precision overflow detected
    const int overflow_res = _check_td_overflow((double)h->unmerged_weight, (double)total_weight);
    if (overflow_res != 0)
        return overflow_res;
    if (total_weight <= 1)
        return 0;
    const double denom = 2 * MM_PI * total_weight * log(total_weight);
    if (_check_overflow(denom) != 0)
        return EDOM;

    // Compute the normalizer given compression and number of points.
    const double normalizer = h->compression / denom;
    if (_check_overflow(normalizer) != 0)
        return EDOM;
    int cur = 0;
    double weight_so_far = 0;

    for (int i = 1; i < N; i++) {
        const double proposed_weight = (double)h->nodes_weight[cur] + (double)h->nodes_weight[i];
        const double z = proposed_weight * normalizer;
        // quantile up to cur
        const double q0 = weight_so_far / total_weight;
        // quantile up to cur + i
        const double q2 = (weight_so_far + proposed_weight) / total_weight;
        // Convert a quantile to the k-scale
        const bool should_add = (z <= (q0 * (1 - q0))) && (z <= (q2 * (1 - q2)));
        // next point will fit
        // so merge into existing centroid
        if (should_add) {
            h->nodes_weight[cur] += h->nodes_weight[i];
            const double delta = h->nodes_mean[i] - h->nodes_mean[cur];
            const double weighted_delta = (delta * h->nodes_weight[i]) / h->nodes_weight[cur];
            h->nodes_mean[cur] += weighted_delta;
        } else {
            weight_so_far += h->nodes_weight[cur];
            cur++;
            h->nodes_weight[cur] = h->nodes_weight[i];
            h->nodes_mean[cur] = h->nodes_mean[i];
        }
        if (cur != i) {
            h->nodes_weight[i] = 0;
            h->nodes_mean[i] = 0.0;
        }
    }
    h->merged_nodes = cur + 1;
    h->merged_weight = total_weight;
    h->unmerged_nodes = 0;
    h->unmerged_weight = 0;
    h->total_compressions++;
    return 0;
}
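
/* Note on the merge criterion above: with W = total_weight and
 * delta = compression / (2 * pi * W * log(W)), an adjacent pair of centroids with
 * combined weight w is merged only when w * delta <= q * (1 - q) holds at both ends
 * of the span it would cover (q0 and q2 in the loop). Because q * (1 - q) shrinks
 * toward the tails and peaks at q = 0.5, centroids stay tiny near the extreme
 * quantiles and are allowed to grow large in the middle of the distribution. */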

double td_min(td_histogram_t *h) { return h->min; }

double td_max(td_histogram_t *h) { return h->max; }

int td_compression(td_histogram_t *h) { return h->compression; }

const long long *td_centroids_weight(td_histogram_t *h) { return h->nodes_weight; }

const double *td_centroids_mean(td_histogram_t *h) { return h->nodes_mean; }

long long td_centroids_weight_at(td_histogram_t *h, int pos) { return h->nodes_weight[pos]; }

double td_centroids_mean_at(td_histogram_t *h, int pos) {
    if (pos < 0 || pos > h->merged_nodes) {
        return NAN;
    }
    return h->nodes_mean[pos];
}
258 tests/tdigest.h (new file)
@ -0,0 +1,258 @@
#pragma once
#include <stdlib.h>

/**
 * Adaptive histogram based on something like streaming k-means crossed with Q-digest.
 * The implementation is a direct descendent of MergingDigest
 * https://github.com/tdunning/t-digest/
 *
 * Copyright (c) 2021 Redis, All rights reserved.
 * Copyright (c) 2018 Andrew Werner, All rights reserved.
 *
 * The special characteristics of this algorithm are:
 *
 * - smaller summaries than Q-digest
 *
 * - provides part per million accuracy for extreme quantiles and typically <1000 ppm accuracy
 *   for middle quantiles
 *
 * - fast
 *
 * - simple
 *
 * - easy to adapt for use with map-reduce
 */

#define MM_PI 3.14159265358979323846

struct td_histogram {
    // compression is a setting used to configure the size of centroids when merged.
    double compression;

    double min;
    double max;

    // cap is the total size of nodes
    int cap;
    // merged_nodes is the number of merged nodes at the front of nodes.
    int merged_nodes;
    // unmerged_nodes is the number of buffered nodes.
    int unmerged_nodes;

    // we run the merge in reverse every other merge to avoid left-to-right bias in merging
    long long total_compressions;

    long long merged_weight;
    long long unmerged_weight;

    double *nodes_mean;
    long long *nodes_weight;
};

typedef struct td_histogram td_histogram_t;

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Allocate the memory, initialise the t-digest, and return the histogram as output parameter.
 * @param compression The compression parameter.
 * 100 is a common value for normal uses.
 * 1000 is extremely large.
 * The number of centroids retained will be a smallish (usually less than 10) multiple of this
 * number.
 * @return the histogram on success, NULL if allocation failed.
 */
td_histogram_t *td_new(double compression);

/**
 * Allocate the memory and initialise the t-digest.
 *
 * @param compression The compression parameter.
 * 100 is a common value for normal uses.
 * 1000 is extremely large.
 * The number of centroids retained will be a smallish (usually less than 10) multiple of this
 * number.
 * @param result Output parameter to capture the allocated histogram.
 * @return 0 on success, 1 if allocation failed.
 */
int td_init(double compression, td_histogram_t **result);

/**
 * Frees the memory associated with the t-digest.
 *
 * @param h The histogram you want to free.
 */
void td_free(td_histogram_t *h);

/**
 * Reset a histogram to zero - empty out a histogram and re-initialise it.
 *
 * If you want to re-use an existing histogram, but reset everything back to zero, this
 * is the routine to use.
 *
 * @param h The histogram you want to reset to empty.
 */
void td_reset(td_histogram_t *h);

/**
 * Adds a sample to a histogram.
 *
 * @param val The value to add.
 * @param weight The weight of this point.
 * @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided
 * weight.
 */
int td_add(td_histogram_t *h, double val, long long weight);

/**
 * Re-examines a t-digest to determine whether some centroids are redundant. If your data are
 * perversely ordered, this may be a good idea. Even if not, this may save 20% or so in space.
 *
 * The cost is roughly the same as adding as many data points as there are centroids. This
 * is typically < 10 * compression, but could be as high as 100 * compression.
 * This is a destructive operation that is not thread-safe.
 *
 * @param h The histogram you want to compress.
 * @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided
 * weight. If overflow is detected the histogram is not changed.
 */
int td_compress(td_histogram_t *h);

/**
 * Merges all of the values from 'from' into 'this' histogram.
 *
 * @param h "This" pointer
 * @param from Histogram to copy values from.
 * @return 0 on success, EDOM if overflow was detected as a consequence of merging the
 * provided histogram. If overflow is detected the original histogram is not changed.
 */
int td_merge(td_histogram_t *h, td_histogram_t *from);

/**
 * Returns the fraction of all points added which are ≤ x.
 *
 * @param x The cutoff for the cdf.
 * @return The fraction of all data which is less than or equal to x.
 */
double td_cdf(td_histogram_t *h, double x);

/**
 * Returns an estimate of the cutoff such that a specified fraction of the data
 * added to this TDigest would be less than or equal to the cutoff.
 *
 * @param q The desired fraction.
 * @return The value x such that cdf(x) == q;
 */
double td_quantile(td_histogram_t *h, double q);

/**
 * Returns an estimate of the cutoffs such that the specified fractions of the data
 * added to this TDigest would be less than or equal to the cutoffs.
 *
 * @param quantiles The ordered percentiles array to get the values for.
 * @param values Destination array containing the values at the given quantiles.
 * The values array should be allocated by the caller.
 * @return 0 on success, EINVAL if the provided quantiles or destination array is null.
 */
int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length);

/**
 * Returns the trimmed mean ignoring values outside given cutoff upper and lower limits.
 *
 * @param leftmost_cut Fraction to cut off of the left tail of the distribution.
 * @param rightmost_cut Fraction to cut off of the right tail of the distribution.
 * @return The trimmed mean ignoring values outside given cutoff upper and lower limits;
 */
double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut);

/**
 * Returns the trimmed mean ignoring values outside the given symmetric cutoff limits.
 *
 * @param proportion_to_cut Fraction to cut off of the left and right tails of the distribution.
 * @return The trimmed mean ignoring values outside given cutoff upper and lower limits;
 */
double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut);

/**
 * Returns the current compression factor.
 *
 * @return The compression factor originally used to set up the TDigest.
 */
int td_compression(td_histogram_t *h);

/**
 * Returns the number of points that have been added to this TDigest.
 *
 * @return The sum of the weights on all centroids.
 */
long long td_size(td_histogram_t *h);

/**
 * Returns the number of centroids being used by this TDigest.
 *
 * @return The number of centroids being used.
 */
int td_centroid_count(td_histogram_t *h);

/**
 * Get the minimum value from the histogram. Will return __DBL_MAX__ if the histogram
 * is empty.
 *
 * @param h "This" pointer
 */
double td_min(td_histogram_t *h);

/**
 * Get the maximum value from the histogram. Will return -__DBL_MAX__ if the histogram
 * is empty.
 *
 * @param h "This" pointer
 */
double td_max(td_histogram_t *h);

/**
 * Get the full centroids weight array for 'this' histogram.
 *
 * @param h "This" pointer
 *
 * @return The full centroids weight array.
 */
const long long *td_centroids_weight(td_histogram_t *h);

/**
 * Get the full centroids mean array for 'this' histogram.
 *
 * @param h "This" pointer
 *
 * @return The full centroids mean array.
 */
const double *td_centroids_mean(td_histogram_t *h);

/**
 * Get the centroid weight for 'this' histogram and 'pos'.
 *
 * @param h "This" pointer
 * @param pos centroid position.
 *
 * @return The centroid weight.
 */
long long td_centroids_weight_at(td_histogram_t *h, int pos);

/**
 * Get the centroid mean for 'this' histogram and 'pos'.
 *
 * @param h "This" pointer
 * @param pos centroid position.
 *
 * @return The centroid mean.
 */
double td_centroids_mean_at(td_histogram_t *h, int pos);

#ifdef __cplusplus
}
#endif
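
A minimal usage sketch of the API above (illustrative only, not part of the commit; the compression value and sample data are arbitrary, and it assumes the tests/ include path and linking against tests/tdigest.c with -lm as in the Makefile):

#include <stdio.h>
#include "tdigest.h"

int main(void) {
    td_histogram_t *h = td_new(100); /* compression of 100 is the common default */
    if (!h)
        return 1;
    for (int i = 1; i <= 100000; i++) {
        td_add(h, (double)i, 1); /* unit-weight samples 1..100000 */
    }
    td_compress(h); /* optional here; the query functions compress as needed */
    printf("p50  ~ %f\n", td_quantile(h, 0.5));
    printf("p99  ~ %f\n", td_quantile(h, 0.99));
    printf("cdf(250) ~ %f\n", td_cdf(h, 250.0));
    printf("trimmed mean (5%% off each tail) ~ %f\n", td_trimmed_mean_symmetric(h, 0.05));
    td_free(h);
    return 0;
}

Note that td_quantile() and td_cdf() call td_compress() internally, so the explicit call is only needed before reading the centroid arrays directly.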
239 tests/test.c
@ -10,6 +10,7 @@
#define MUNIT_NO_FORK (1)
#define MUNIT_ENABLE_ASSERT_ALIASES (1)

+#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

@ -32,6 +33,8 @@ struct user_data {
int foo;
};

+/* -------------------------- Supporting Functions for Testing */
+
void
populate_map(sparsemap_t *map, int size, int max_value)
{

@ -42,7 +45,8 @@ populate_map(sparsemap_t *map, int size, int max_value)
shuffle(array, size);
for (int i = 0; i < size; i++) {
sparsemap_set(map, array[i], true);
-munit_assert_true(sparsemap_is_set(map, array[i]));
+bool set = sparsemap_is_set(map, array[i]);
+munit_assert_true(set);
}
}

@ -65,6 +69,8 @@ test_api_tear_down(void *fixture)
free(map);
}

+/* -------------------------- API Tests */
+
static MunitResult
test_api_static_init(const MunitParameter params[], void *data)
{

@ -232,7 +238,7 @@ test_api_remaining_capacity(const MunitParameter params[], void *data)
}

static void *
-test_api_get_range_size_setup(const MunitParameter params[], void *user_data)
+test_api_get_capacity_setup(const MunitParameter params[], void *user_data)
{
uint8_t *buf = munit_calloc(1024, sizeof(uint8_t));
sparsemap_t *map = (sparsemap_t *)test_api_setup(params, user_data);

@ -243,14 +249,14 @@ test_api_get_range_size_setup(const MunitParameter params[], void *user_data)
return (void *)map;
}
static void
-test_api_get_range_size_tear_down(void *fixture)
+test_api_get_capacity_tear_down(void *fixture)
{
sparsemap_t *map = (sparsemap_t *)fixture;
free(map->m_data);
test_api_tear_down(fixture);
}
static MunitResult
-test_api_get_range_size(const MunitParameter params[], void *data)
+test_api_get_capacity(const MunitParameter params[], void *data)
{
sparsemap_t *map = (sparsemap_t *)data;
(void)params;

@ -259,8 +265,7 @@ test_api_get_range_size(const MunitParameter params[], void *data)

sparsemap_set(map, 42, true);
assert_true(sparsemap_is_set(map, 42));
-size_t size = sparsemap_get_capacity(map);
-assert_true(size == 1024);
+assert_true(sparsemap_get_capacity(map) == 1024);

return MUNIT_OK;
}

@ -337,7 +342,7 @@ test_api_set(const MunitParameter params[], void *data)
}

static void *
-test_api_get_start_offset_setup(const MunitParameter params[], void *user_data)
+test_api_get_starting_offset_setup(const MunitParameter params[], void *user_data)
{
uint8_t *buf = munit_calloc(1024, sizeof(uint8_t));
sparsemap_t *map = (sparsemap_t *)test_api_setup(params, user_data);

@ -348,14 +353,14 @@ test_api_get_start_offset_setup(const MunitParameter params[], void *user_data)
return (void *)map;
}
static void
-test_api_get_start_offset_tear_down(void *fixture)
+test_api_get_starting_offset_tear_down(void *fixture)
{
sparsemap_t *map = (sparsemap_t *)fixture;
free(map->m_data);
test_api_tear_down(fixture);
}
static MunitResult
-test_api_get_start_offset(const MunitParameter params[], void *data)
+test_api_get_starting_offset(const MunitParameter params[], void *data)
{
sparsemap_t *map = (sparsemap_t *)data;
(void)params;

@ -364,7 +369,7 @@ test_api_get_start_offset(const MunitParameter params[], void *data)

sparsemap_set(map, 42, true);
assert_true(sparsemap_is_set(map, 42));
-size_t offset = sparsemap_get_start_offset(map);
+size_t offset = sparsemap_get_starting_offset(map);
assert_true(offset == 0);

return MUNIT_OK;

@ -513,10 +518,41 @@ test_api_select(const MunitParameter params[], void *data)

/* NOTE: select() is 0-based, to get the bit position of the 1st logical bit set
   call select(map, 0), to get the 18th, select(map, 17), etc. */
-assert_true(sparsemap_select(map, 0) == 1);
+assert_true(sparsemap_select(map, 0, true) == 1);
-assert_true(sparsemap_select(map, 4) == 6);
+assert_true(sparsemap_select(map, 4, true) == 6);
-assert_true(sparsemap_select(map, 17) == 26);
+assert_true(sparsemap_select(map, 17, true) == 26);
+
+#if 0 // TODO
+size_t f = sparsemap_select(map, 0, false);
+for (int i = 0; i <= f; i++) {
+assert_false(sparsemap_is_set(map, i % 2));
+}
+
+sparsemap_clear(map);
+
+for (int i = 0; i < 1000; i++) {
+sparsemap_set(map, i, i % 2 ? true : false);
+}
+
+f = sparsemap_select(map, 0, false);
+assert_true(f == 1000);
+
+sparsemap_clear(map);
+
+sparsemap_set(map, 42, true);
+sparsemap_set(map, 420, true);
+sparsemap_set(map, 4200, true);
+
+f = sparsemap_select(map, 0, false);
+assert_true(f == 0);
+
+f = sparsemap_select(map, -1, true);
+assert_true(f == 4200);
+f = sparsemap_select(map, -2, true);
+assert_true(f == 420);
+f = sparsemap_select(map, -3, true);
+assert_true(f == 42);
+#endif
return MUNIT_OK;
}

@ -559,15 +595,15 @@ test_api_rank(const MunitParameter params[], void *data)
   range as [start, end] of [0, 9] counts the bits set in the first 10
   positions (starting from the LSB) in the index. */
r1 = rank_uint64((uint64_t)-1, 0, 9);
-r2 = sparsemap_rank(map, 0, 9);
+r2 = sparsemap_rank(map, 0, 9, true);
assert_true(r1 == r2);
-assert_true(sparsemap_rank(map, 0, 9) == 10);
+assert_true(sparsemap_rank(map, 0, 9, true) == 10);
-assert_true(sparsemap_rank(map, 1000, 1050) == 0);
+assert_true(sparsemap_rank(map, 1000, 1050, true) == 0);

for (int i = 0; i < 10; i++) {
for (int j = i; j < 10; j++) {
r1 = rank_uint64((uint64_t)-1, i, j);
-r2 = sparsemap_rank(map, i, j);
+r2 = sparsemap_rank(map, i, j, true);
assert_true(r1 == r2);
}
}

@ -603,26 +639,26 @@ test_api_span(const MunitParameter params[], void *data)
int located_at, placed_at, amt = 10000;

placed_at = create_sequential_set_in_empty_map(map, amt, 1);
-located_at = sparsemap_span(map, 0, 1);
+located_at = sparsemap_span(map, 0, 1, true);
assert_true(located_at == placed_at);

sparsemap_clear(map);

placed_at = create_sequential_set_in_empty_map(map, amt, 50);
-located_at = sparsemap_span(map, 0, 50);
+located_at = sparsemap_span(map, 0, 50, true);
assert_true(located_at == placed_at);

sparsemap_clear(map);

placed_at = create_sequential_set_in_empty_map(map, amt, 50);
-located_at = sparsemap_span(map, placed_at / 2, 50);
+located_at = sparsemap_span(map, placed_at / 2, 50, true);
assert_true(located_at == placed_at);

/* TODO
sparsemap_clear(map);

placed_at = create_sequential_set_in_empty_map(map, amt, amt - 1);
-located_at = sparsemap_span(map, 0, amt - 1);
+located_at = sparsemap_span(map, 0, amt - 1, true);
assert_true(located_at == placed_at);
*/

@ -637,11 +673,11 @@ static MunitTest api_test_suite[] = {
{ (char *)"/set_data_size", test_api_set_data_size, test_api_set_data_size_setup, test_api_set_data_size_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
{ (char *)"/remaining_capacity", test_api_remaining_capacity, test_api_remaining_capacity_setup, test_api_remaining_capacity_tear_down,
MUNIT_TEST_OPTION_NONE, NULL },
-{ (char *)"/get_range_size", test_api_get_range_size, test_api_get_range_size_setup, test_api_get_range_size_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
+{ (char *)"/get_capacity", test_api_get_capacity, test_api_get_capacity_setup, test_api_get_capacity_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
{ (char *)"/is_set", test_api_is_set, test_api_is_set_setup, test_api_is_set_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
{ (char *)"/set", test_api_set, test_api_set_setup, test_api_set_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
-{ (char *)"/get_start_offset", test_api_get_start_offset, test_api_get_start_offset_setup, test_api_get_start_offset_tear_down, MUNIT_TEST_OPTION_NONE,
-NULL },
+{ (char *)"/get_starting_offset", test_api_get_starting_offset, test_api_get_starting_offset_setup, test_api_get_starting_offset_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
+//TODO { (char *)"/get_ending_offset", test_api_get_ending_offset, test_api_get_ending_offset_setup, test_api_get_ending_offset_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
{ (char *)"/get_size", test_api_get_size, test_api_get_size_setup, test_api_get_size_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
{ (char *)"/scan", test_api_scan, test_api_scan_setup, test_api_scan_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
{ (char *)"/split", test_api_split, test_api_split_setup, test_api_split_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
@ -652,6 +688,98 @@ static MunitTest api_test_suite[] = {
};
// clang-format on

+/* -------------------------- Scale Tests */
+
+static void *
+test_scale_best_case_setup(const MunitParameter params[], void *user_data)
+{
+uint8_t *buf = munit_calloc(1024, sizeof(uint8_t));
+sparsemap_t *map = (sparsemap_t *)test_api_setup(params, user_data);
+
+sparsemap_init(map, buf, 1024);
+
+return (void *)map;
+}
+static void
+test_scale_best_case_tear_down(void *fixture)
+{
+sparsemap_t *map = (sparsemap_t *)fixture;
+free(map->m_data);
+test_api_tear_down(fixture);
+}
+static MunitResult
+test_scale_best_case(const MunitParameter params[], void *data)
+{
+sparsemap_t *map = (sparsemap_t *)data;
+(void)params;
+
+assert_ptr_not_null(map);
+
+/* Best case a map can contain 2048 bits in 8 bytes.
+   So, in a 1KiB buffer you have:
+   (1024 KiB / 8 bytes) * 2048 = 268,435,456 bits
+   or 1.09 TiB of 4KiB pages. Let's investigate, and find out if that's the case.
+
+   TODO: Actually, 172032 are stored before SEGV, or 706 MiB of 4KiB pages.
+*/
+
+/* Set every bit on, that should be the best case. */
+for (int i = 0; i < 268435456; i++) {
+/* ANSI esc code to clear line, carriage return, then print on the same line */
+// printf("\033[2K\r%d", i);
+// fflush(stdout);
+sparsemap_set(map, i, true);
+}
+
+return MUNIT_OK;
+}
+
+static void *
+test_scale_worst_case_setup(const MunitParameter params[], void *user_data)
+{
+uint8_t *buf = munit_calloc(1024, sizeof(uint8_t));
+sparsemap_t *map = (sparsemap_t *)test_api_setup(params, user_data);
+
+sparsemap_init(map, buf, 1024);
+
+return (void *)map;
+}
+static void
+test_scale_worst_case_tear_down(void *fixture)
+{
+sparsemap_t *map = (sparsemap_t *)fixture;
+free(map->m_data);
+test_api_tear_down(fixture);
+}
+static MunitResult
+test_scale_worst_case(const MunitParameter params[], void *data)
+{
+sparsemap_t *map = (sparsemap_t *)data;
+(void)params;
+
+assert_ptr_not_null(map);
+
+/* Worst case a map can contain 2048 bits in 256 + 8 = 264 bytes.
+   So, in a 1KiB buffer you have:
+   (1024 KiB / 264 bytes) * 2048 = 8,134,407.75758 bits
+   or 33.3 GiB of 4KiB pages. Let's investigate, and find out if that's the case.
+
+   TODO: actually 7744 are stored before SEGV, or 31MiB of 4KiB pages.
+*/
+
+/* Set every other bit, that has to be the "worst case" for this index. */
+for (int i = 0; i < 8134407; i += 2) {
+/* ANSI esc code to clear line, carriage return, then print on the same line */
+// printf("\033[2K\r%d", i);
+// fflush(stdout);
+sparsemap_set(map, i, true);
+}
+
+return MUNIT_OK;
+}
+
+/* -------------------------- Performance Tests */
+
static void *
test_perf_span_solo_setup(const MunitParameter params[], void *user_data)
{

@ -669,37 +797,33 @@ test_perf_span_solo_tear_down(void *fixture)
free(map->m_data);
test_api_tear_down(fixture);
}
-EST_MEDIAN_DECL(solo, 10000)
static MunitResult
test_perf_span_solo(const MunitParameter params[], void *data)
{
sparsemap_t *map = (sparsemap_t *)data;
-uint64_t stop, start;
+// double stop, start;
(void)params;
-int located_at, placed_at, amt = 1000;
+int located_at, placed_at, amt = 500;

assert_ptr_not_null(map);

for (int i = 1; i < amt; i++) {
-for (int j = 1; j < amt / 10; j++) {
+for (int j = 1; j <= 100; j++) {
sparsemap_clear(map);
placed_at = create_sequential_set_in_empty_map(map, amt, j);
// logf("i = %d, j = %d\tplaced_at %d\n", i, j, placed_at);
// whats_set(map, 5000);
-start = tsc();
+// start = nsts();
-located_at = sparsemap_span(map, 0, j);
+located_at = sparsemap_span(map, 0, j, true);
-stop = tsc();
+// stop = nsts();
-// fprintf(stdout, "%ll - %ll = %ll\n", stop, start, stop - start);
+// double amt = (stop - start) * 1e6;
-EST_MEDIAN_ADD(solo, stop - start);
+// if (amt > 0) {
+// fprintf(stdout, "%0.8f\n", amt);
+// }
if (placed_at != located_at)
logf("a: i = %d, j = %d\tplaced_at %d located_at %d\n", i, j, placed_at, located_at);
}
}
-uint64_t est = EST_MEDIAN_GET(solo);
-// fprintf(stdout, "median time %zu or %f ns\n", est, tsc_ticks_to_ns(est)); // measured 228
-assert_true(est < 500);
-fflush(stdout);

return MUNIT_OK;
}

@ -720,45 +844,45 @@ test_perf_span_tainted_tear_down(void *fixture)
free(map->m_data);
test_api_tear_down(fixture);
}
-EST_MEDIAN_DECL(tainted, 10000)
static MunitResult
test_perf_span_tainted(const MunitParameter params[], void *data)
{
sparsemap_t *map = (sparsemap_t *)data;
-uint64_t stop, start;
+// double stop, start;
(void)params;

assert_ptr_not_null(map);

-int located_at, placed_at, amt = 1000;
+int located_at, placed_at, amt = 500;
for (int i = 1; i < amt; i++) {
-for (int j = 1; j < amt / 10; j++) {
+for (int j = 100; j <= 10; j++) {
sparsemap_clear(map);
populate_map(map, 1024, 1 * 1024);
placed_at = create_sequential_set_in_empty_map(map, amt, j);
-start = tsc();
+// start = nsts();
-located_at = sparsemap_span(map, 0, j);
+located_at = sparsemap_span(map, 0, j, true);
-stop = tsc();
+// stop = nsts();
-EST_MEDIAN_ADD(tainted, stop - start);
+// double amt = (stop - start) * 1e6;
+// if (amt > 0) {
+// fprintf(stdout, "%0.8f\n", amt);
+// }
if (located_at >= placed_at)
logf("b: i = %d, j = %d\tplaced_at %d located_at %d\n", i, j, placed_at, located_at);
-// assert_true(located_at >= placed_at);
-// start = tsc();
-// located_at = sparsemap_span(map, (placed_at < j ? 0 : placed_at / 2), i);
-// stop = tsc();
-// EST_MEDIAN_ADD(solo, stop - start);
-// assert_true(placed_at == located_at);
}
}
-uint64_t est = EST_MEDIAN_GET(tainted);
-// fprintf(stdout, "median time %zu or %f ns\n", est, tsc_ticks_to_ns(est)); // measured 228
-assert_true(est < 500);

return MUNIT_OK;
}

// clang-format off
-static MunitTest performance_test_suite[] = {
+static MunitTest scale_test_suite[] = {
+{ (char *)"/best-case", test_scale_best_case, test_scale_best_case_setup, test_scale_best_case_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
+{ (char *)"/worst-case", test_scale_worst_case, test_scale_worst_case_setup, test_scale_worst_case_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
+{ NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL } };
+// clang-format on
+
+// clang-format off
+static MunitTest perf_test_suite[] = {
{ (char *)"/span/solo", test_perf_span_solo, test_perf_span_solo_setup, test_perf_span_solo_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
{ (char *)"/span/tainted", test_perf_span_tainted, test_perf_span_tainted_setup, test_perf_span_tainted_tear_down, MUNIT_TEST_OPTION_NONE, NULL },
{ NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL } };
@ -766,7 +890,8 @@ static MunitTest performance_test_suite[] = {

// clang-format off
static MunitSuite other_test_suite[] = {
-{ "/performance", performance_test_suite, NULL, 1, MUNIT_SUITE_OPTION_NONE },
+{ "/perf", perf_test_suite, NULL, 1, MUNIT_SUITE_OPTION_NONE },
+{ "/scale", scale_test_suite, NULL, 1, MUNIT_SUITE_OPTION_NONE },
{ NULL, NULL, NULL, 0, MUNIT_SUITE_OPTION_NONE } };
// clang-format on
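
The capacity figures quoted in the scale-test comments can be reproduced with a small standalone sketch (illustrative only, not part of the commit). Note that the 268,435,456 and 8,134,407.75758 figures appear to correspond to a 1 MiB (1024 KiB) buffer at 8 bytes per fully-set 2048-bit chunk and 264 bytes per alternating-bit chunk, while the test fixtures themselves allocate a 1 KiB buffer, which likely explains the early SEGVs noted in the TODOs:

#include <stdio.h>

int main(void) {
    const double buffer_bytes = 1024.0 * 1024.0; /* the 1024 KiB figure used in the comments */
    const double bits_per_chunk = 2048.0;
    const double best_case_bytes_per_chunk = 8.0;    /* every bit set */
    const double worst_case_bytes_per_chunk = 264.0; /* 256 + 8 bytes, every other bit set */

    double best = buffer_bytes / best_case_bytes_per_chunk * bits_per_chunk;
    double worst = buffer_bytes / worst_case_bytes_per_chunk * bits_per_chunk;
    printf("best case:  %.0f bits\n", best);  /* 268435456 */
    printf("worst case: %.3f bits\n", worst); /* ~8134407.758 */
    return 0;
}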