select/rank for unset as well as set bits (#4)

Reviewed-on: #4 Co-authored-by: Greg Burd <greg@burd.me> Co-committed-by: Greg Burd <greg@burd.me>
2024-04-24 20:32:09 +00:00 · 2024-04-24 20:32:09 +00:00 · b3dfd745e7
commit b3dfd745e7
parent 5d5c7f1584
16 changed files with 2316 additions and 594 deletions
--- a/.envrc
+++ b/.envrc
@ -1,5 +1,5 @@
 if ! has nix_direnv_version || ! nix_direnv_version 3.0.4; then
    source_url "https://raw.githubusercontent.com/nix-community/nix-direnv/3.0.4/direnvrc" "sha256-DzlYZ33mWF/Gs8DDeyjr8mnVmQGx7ASYqA5WlxwvBG4="
 fi
-watch_file devShell.nix shell.nix flake.nix
+watch_file shell.nix flake.nix
 use flake || use nix
--- a/14
+++ b/14
@ -5,14 +5,15 @@ SHARED_LIB = libsparsemap.so

 #CFLAGS = -Wall -Wextra -Wpedantic -Of -std=c11 -Iinclude/ -fPIC
 #CFLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
-CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DSPARSEMAP_ASSERT -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
-#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC
+CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
+#CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC
 #CFLAGS = -Wall -Wextra -Wpedantic -Og -g -fsanitize=all -fhardened -std=c11 -Iinclude/ -fPIC

-TEST_FLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -Itests/ -fPIC
+TEST_FLAGS = -DDEBUG -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -Itests/ -fPIC
+#TEST_FLAGS = -DDEBUG -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC

 TESTS = tests/test
-TEST_OBJS = tests/test.o tests/munit.o tests/common.o
+TEST_OBJS = tests/test.o tests/munit.o tests/tdigest.o tests/common.o
 EXAMPLES = examples/ex_1 examples/ex_2 examples/ex_3 examples/ex_4

 .PHONY: all shared static clean test examples mls
@ -39,7 +40,7 @@ check: test
 	env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/test

 tests/test: $(TEST_OBJS) $(STATIC_LIB)
-	$(CC) $^ -o $@ $(TEST_FLAGS)
+	$(CC) $^ -lm -o $@ $(TEST_FLAGS)

 clean:
 	rm -f $(OBJS)
@ -76,4 +77,7 @@ examples/ex_3: examples/common.o examples/ex_3.o $(STATIC_LIB)
 examples/ex_4: examples/common.o examples/ex_4.o $(STATIC_LIB)
 	$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS)

+todo:
+	rg -i 'todo|gsb|abort'
+
 # cp src/sparsemap.c /tmp && clang-tidy src/sparsemap.c -fix -fix-errors -checks="readability-braces-around-statements" -- -DDEBUG -DSPARSEMAP_DIAGNOSTIC -DSPARSEMAP_ASSERT -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
--- a/README.md
+++ b/README.md
@ -60,5 +60,6 @@ to an uncompressed bit vector (sometimes higher due to the bytes required for
 metadata). In such cases, other compression schemes are more efficient (i.e.
 http://lemire.me/blog/archives/2008/08/20/the-mythical-bitmap-index/).

-This library was originally created for hamsterdb [http://hamsterdb.com] in
-C++ and then translated to C99 code by Greg Burd <greg@burd.me>.
+This library was originally created for [hamsterdb](http://hamsterdb.com) in
+C++ and then translated to C and further improved by Greg Burd <greg@burd.me>
+for use in LMDB and OpenLDAP.
--- a/examples/ex_1.c
+++ b/examples/ex_1.c
@ -1,4 +1,7 @@
 #include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
 #include <stdio.h>

 #include "../include/sparsemap.h"
@ -14,16 +17,18 @@

 /* !!! Duplicated here for testing purposes. Keep in sync, or suffer. !!! */
 struct sparsemap {
-  uint8_t *m_data;
  size_t m_capacity;
  size_t m_data_used;
+  uint8_t *m_data;
 };

 int
 main()
 {
  size_t size = 4;
-  setbuf(stderr, 0); // disable buffering
+  setvbuf(stdout, NULL, _IONBF, 0); // Disable buffering for stdout
+  setvbuf(stderr, NULL, _IONBF, 0); // Disable buffering for stdout
+
  __diag("Please wait a moment...");
  sparsemap_t mmap, *map = &mmap;
  uint8_t buffer[1024];
@ -135,7 +140,7 @@ main()
    sparsemap_set(map, i, true);
  }
  for (int i = 0; i < 100000; i++) {
-    assert(sparsemap_select(map, i) == (unsigned)i);
+    assert(sparsemap_select(map, i, true) == (unsigned)i);
  }

  sparsemap_clear(map);
@ -145,7 +150,7 @@ main()
    sparsemap_set(map, i, true);
  }
  for (int i = 1; i < 513; i++) {
-    assert(sparsemap_select(map, i - 1) == (unsigned)i);
+    assert(sparsemap_select(map, i - 1, true) == (unsigned)i);
  }

  sparsemap_clear(map);
@ -155,7 +160,7 @@ main()
    sparsemap_set(map, i * 10, true);
  }
  for (size_t i = 0; i < 8; i++) {
-    assert(sparsemap_select(map, i) == i * 10);
+    assert(sparsemap_select(map, i, true) == (sparsemap_idx_t)i * 10);
  }

  // split and move, aligned to MiniMap capacity
--- a/examples/ex_2.c
+++ b/examples/ex_2.c
@ -1,9 +1,7 @@
 #include <assert.h>
-#include <stdarg.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
-#include <unistd.h>

 #include "../include/sparsemap.h"

@ -16,29 +14,28 @@
  } while (0)
 #pragma GCC diagnostic pop

-#define SEED
-
 int
 main(void)
 {
-  int i = 0;
+  int i;

  // disable buffering
-  setbuf(stderr, 0);
+  setvbuf(stdout, NULL, _IONBF, 0); // Disable buffering for stdout
+  setvbuf(stderr, NULL, _IONBF, 0); // Disable buffering for stdout

  // start with a 1KiB buffer, 1024 bits
  uint8_t *buf = calloc(1024, sizeof(uint8_t));

  // create the sparse bitmap
-  sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 1024);
+  sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 1024);

  // Set every other bit (pathologically worst case) to see what happens
  // when the map is full.
  for (i = 0; i < 7744; i++) {
-    if (i % 2)
-      continue;
-    sparsemap_set(map, i, true);
-    assert(sparsemap_is_set(map, i) == true);
+    if (!i % 2) {
+      sparsemap_set(map, i, true);
+      assert(sparsemap_is_set(map, i) == true);
+    }
  }
  // On 1024 KiB of buffer with every other bit set the map holds 7744 bits
  // and then runs out of space.  This next _set() call will fail/abort.
--- a/examples/ex_3.c
+++ b/examples/ex_3.c
@ -1,9 +1,7 @@
 #include <assert.h>
-#include <stdarg.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
-#include <unistd.h>

 #include "../include/sparsemap.h"
 #include "../tests/common.h"
@ -11,7 +9,7 @@
 int
 main(void)
 {
-  int i = 0;
+  int i;
  int array[1024] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
    38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
    77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
@ -60,7 +58,7 @@ main(void)
  uint8_t *buf = calloc(1024, sizeof(uint8_t));

  // create the sparse bitmap
-  sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 1024);
+  sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 1024);

  // set all the bits on in a random order
  for (i = 0; i < 1024; i++) {
--- a/examples/ex_4.c
+++ b/examples/ex_4.c
@ -24,7 +24,7 @@ main(void)
  uint8_t *buf = calloc((size_t)3 * 1024, sizeof(uint8_t));

  // create the sparse bitmap
-  sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 3 * 1024);
+  sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 3 * 1024);

  // create an array of ints
  setup_test_array(array, TEST_ARRAY_SIZE, 1024 * 3);
@ -60,7 +60,7 @@ main(void)
      assert(sparsemap_is_set(map, array[i]) == true);
    }
    has_span(map, array, TEST_ARRAY_SIZE, (int)len);
-    size_t l = sparsemap_span(map, 0, len);
+    size_t l = sparsemap_span(map, 0, len, true);
    if (l != (size_t)-1) {
      __diag("Found span in map starting at %lu of length %lu\n", l, len);
      __diag("is_span(%lu, %lu) == %s\n", l, len, is_span(array, TEST_ARRAY_SIZE, l, len) ? "yes" : "no");
--- a/flake.lock
+++ b/flake.lock
@ -1,43 +1,25 @@
 {
  "nodes": {
-    "flake-utils": {
-      "inputs": {
-        "systems": "systems"
-      },
-      "locked": {
-        "lastModified": 1710146030,
-        "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
-        "type": "github"
-      },
-      "original": {
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "type": "github"
-      }
-    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1712192574,
-        "narHash": "sha256-LbbVOliJKTF4Zl2b9salumvdMXuQBr2kuKP5+ZwbYq4=",
+        "lastModified": 1701282334,
+        "narHash": "sha256-MxCVrXY6v4QmfTwIysjjaX0XUhqBbxTWWB4HXtDYsdk=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "f480f9d09e4b4cf87ee6151eba068197125714de",
+        "rev": "057f9aecfb71c4437d2b27d3323df7f93c010b7e",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
-        "ref": "nixpkgs-unstable",
+        "ref": "23.11",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
-        "flake-utils": "flake-utils",
-        "nixpkgs": "nixpkgs"
+        "nixpkgs": "nixpkgs",
+        "utils": "utils"
      }
    },
    "systems": {
@ -54,6 +36,24 @@
        "repo": "default",
        "type": "github"
      }
+    },
+    "utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1710146030,
+        "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
    }
  },
  "root": "root",
--- a/flake.nix
+++ b/flake.nix
@ -1,59 +1,55 @@
 {
-  description = "A Concurrent Skip List library for key/value pairs.";
+  description = "A sparse bitmapped index library in C.";

  inputs = {
-    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
-    flake-utils.url = "github:numtide/flake-utils";
+    # nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
+    nixpkgs.url = "github:NixOS/nixpkgs/23.11";
+    utils.url = "github:numtide/flake-utils";
+    utils.inputs.nixpkgs.follows = "nixpkgs";
  };

-  outputs =
-    { self
-    , nixpkgs
-    , flake-utils
-    , ...
-    }:
-    flake-utils.lib.eachDefaultSystem (system:
-      let
-        pkgs = import nixpkgs {
-          inherit system;
-          config = { allowUnfree = true; };
-        };
-        supportedSystems = [ "x86_64-linux" ];
-        forAllSystems = nixpkgs.lib.genAttrs supportedSystems;
-        nixpkgsFor = forAllSystems (system: import nixpkgs {
-          inherit system;
-          overlays = [ self.overlay ];
-        });
-      in {
-        pkgs = import nixpkgs {
-          inherit system;
-          devShell = nixpkgs.legacyPackages.${system} {
-            pkgs.mkShell = {
-              nativeBuildInputs = with pkgs.buildPackages; [
-                act
-                autoconf
-                clang
-                ed
-                gcc
-                gdb
-                gettext
-                graphviz-nox
-                libtool
-                m4
-                perl
-                pkg-config
-                python3
-                ripgrep
-                valgrind
-              ];
-              buildInputs = with pkgs; [
-                libbacktrace
-                glibc.out
-                glibc.static
-              ];
-            };
-            DOCKER_BUILDKIT = 1;
+  outputs = { self, nixpkgs, ... }
+    @inputs: inputs.utils.lib.eachSystem [
+      "x86_64-linux" "i686-linux" "aarch64-linux" "x86_64-darwin"
+    ] (system:
+      let pkgs = import nixpkgs {
+            inherit system;
+            overlays = [];
+            config.allowUnfree = true;
          };
-        };
-      });
+      in {
+      devShell = pkgs.mkShell rec {
+        name = "sparsemap";
+        packages = with pkgs; [
+          act
+          autoconf
+          clang
+          ed
+          gcc
+          gdb
+          gettext
+          graphviz-nox
+          libtool
+          m4
+          perl
+          pkg-config
+          python3
+          ripgrep
+          valgrind
+        ];
+
+        buildInputs = with pkgs; [
+          libbacktrace
+          glibc.out
+          glibc.static
+        ];
+
+        shellHook = let
+          icon = "f121";
+        in ''
+        export PS1="$(echo -e '\u${icon}') {\[$(tput sgr0)\]\[\033[38;5;228m\]\w\[$(tput sgr0)\]\[\033[38;5;15m\]} (${name}) \\$ \[$(tput sgr0)\]"
+        '';
+      };
+      DOCKER_BUILDKIT = 1;
+    });
 }
--- a/include/sparsemap.h
+++ b/include/sparsemap.h
@ -69,14 +69,14 @@
 #ifndef SPARSEMAP_H
 #define SPARSEMAP_H

-#include <sys/types.h>
-
-#include <assert.h>
 #include <limits.h>
 #include <stdbool.h>
+#include <stddef.h>
 #include <stdint.h>
-#include <stdio.h>
-#include <string.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif

 /*
 * The public interface for a sparse bit-mapped index, a "sparse map".
@ -88,55 +88,119 @@
 */

 typedef struct sparsemap sparsemap_t;
+typedef long int sparsemap_idx_t;
+#define SPARSEMAP_IDX_MAX ((1UL << (sizeof(long) * CHAR_BIT - 1)) - 1)
+#define SPARSEMAP_IDX_MIN (-(SPARSEMAP_IDX_MAX)-1)
+#define SPARSEMAP_NOT_FOUND(_x) ((_x) == SPARSEMAP_IDX_MAX || (_x) == SPARSEMAP_IDX_MIN)
 typedef uint32_t sm_idx_t;
 typedef uint64_t sm_bitvec_t;

-/* Allocate on a sparsemap_t on the heap and initialize it. */
-sparsemap_t *sparsemap(uint8_t *data, size_t size);
+/**
+ * Create a new, empty sparsemap_t with a buffer of |size|.
+ * Default when set to 0 is 1024.
+ */
+sparsemap_t *sparsemap(size_t size);

-/* Initialize sparsemap_t with data. */
+/**
+ * Allocate on a sparsemap_t on the heap to wrap the provided fixed-size
+ * buffer (heap or stack allocated).
+ */
+sparsemap_t *sparsemap_wrap(uint8_t *data, size_t size);
+
+/**
+ * Initialize a (possibly stack allocated) sparsemap_t with data (potentially
+ * also on the stack).
+ */
 void sparsemap_init(sparsemap_t *map, uint8_t *data, size_t size);

-/* Clears the whole buffer. */
-void sparsemap_clear(sparsemap_t *map);
-
-/* Opens an existing sparsemap at the specified buffer. */
+/**
+ * Opens an existing sparsemap contained within the specified buffer.
+ */
 void sparsemap_open(sparsemap_t *, uint8_t *data, size_t data_size);

-/* Resizes the data range. */
-void sparsemap_set_data_size(sparsemap_t *map, size_t data_size);
+/**
+ * Resets values and empties the buffer making it ready to accept new data.
+ */
+void sparsemap_clear(sparsemap_t *map);

-/* Calculate remaining capacity, full when 0. */
+/**
+ * Resizes the data range within the limits of the provided buffer, the map may
+ * move to a new address returned iff the map was created with the sparsemap() API.
+ * Take care to use the new reference (think: realloc()).  NOTE: If the returned
+ * value equals NULL then the map was not resized.
+ */
+sparsemap_t *sparsemap_set_data_size(sparsemap_t *map, size_t data_size);
+
+/**
+ * Calculate remaining capacity, approaches 0 when full.
+ */
 double sparsemap_capacity_remaining(sparsemap_t *map);

-/* Returns the size of the underlying byte array. */
+/**
+ * Returns the capacity of the underlying byte array.
+ */
 size_t sparsemap_get_capacity(sparsemap_t *map);

-/* Returns the value of a bit at index |idx|. */
-bool sparsemap_is_set(sparsemap_t *map, size_t idx);
+/**
+ * Returns the value of a bit at index |idx|, either on/true/1 or off/false/0.
+ * When |idx| is negative it is an error.
+ */
+bool sparsemap_is_set(sparsemap_t *map, sparsemap_idx_t idx);

-/* Sets the bit at index |idx| to true or false, depending on |value|. */
-void sparsemap_set(sparsemap_t *map, size_t idx, bool value);
+/**
+ * Sets the bit at index |idx| to true or false, depending on |value|.
+ * When |idx| is negative is it an error.  Returns the |idx| supplied or
+ * SPARSEMAP_IDX_MAX on error with |errno| set to ENOSP when the map is full.
+ */
+sparsemap_idx_t sparsemap_set(sparsemap_t *map, sparsemap_idx_t idx, bool value);

-/* Returns the offset of the very first bit. */
-sm_idx_t sparsemap_get_start_offset(sparsemap_t *map);
+/**
+ * Returns the offset of the very first/last bit in the map.
+ */
+sm_idx_t sparsemap_get_starting_offset(sparsemap_t *map);

-/* Returns the used size in the data buffer. */
+/**
+ * Returns the used size in the data buffer in bytes.
+ */
 size_t sparsemap_get_size(sparsemap_t *map);

-/* Decompresses the whole bitmap; calls scanner for all bits. */
-void sparsemap_scan(sparsemap_t *map, void (*scanner)(sm_idx_t[], size_t), size_t skip);
+/**
+ * Decompresses the whole bitmap; calls scanner for all bits with a set of
+ * |n| vectors |vec| each a sm_bitmap_t which can be masked and read using
+ * bit operators to read the values for each position in the bitmap index.
+ * Setting |skip| will start the scan after "skip" bits.
+ */
+void sparsemap_scan(sparsemap_t *map, void (*scanner)(sm_idx_t vec[], size_t n), size_t skip);

-/* Appends all chunk maps from |map| starting at |sstart| to |other|, then
-   reduces the chunk map-count appropriately. */
-void sparsemap_split(sparsemap_t *map, size_t sstart, sparsemap_t *other);
+/**
+ * Appends all chunk maps from |map| starting at |offset| to |other|, then
+ * reduces the chunk map-count appropriately.
+ */
+void sparsemap_split(sparsemap_t *map, sparsemap_idx_t offset, sparsemap_t *other);

-/* Returns the index of the n'th set bit; uses a 0-based index. */
-size_t sparsemap_select(sparsemap_t *map, size_t n);
+/**
+ * Finds the offset of the n'th bit either set (|value| is true) or unset
+ * (|value| is false) from the start (positive |n|), or end (negative |n|),
+ * of the bitmap and returns that (uses a 0-based index).  Returns -inf or +inf
+ * if not found (where "inf" is SPARSEMAP_IDX_MAX and "-inf" is SPARSEMAP_IDX_MIN).
+ */
+sparsemap_idx_t sparsemap_select(sparsemap_t *map, sparsemap_idx_t n, bool value);

-/* Counts the set bits in the range [offset, idx]. */
-size_t sparsemap_rank(sparsemap_t *map, size_t offset, size_t idx);
+/**
+ * Counts the set (|value| is true) or unset (|value| is false) bits starting
+ * at |x| bits (0-based) in the range [x, y] (inclusive on either end).
+ */
+size_t sparsemap_rank(sparsemap_t *map, size_t x, size_t y, bool value);

-size_t sparsemap_span(sparsemap_t *map, size_t loc, size_t len);
+/**
+ * Finds the first span (i.e. a contiguous set of bits), in the bitmap that
+ * are set (|value| is true) or unset (|value| is false) and returns the
+ * starting offset for the span (0-based).
+ */
+size_t sparsemap_span(sparsemap_t *map, sparsemap_idx_t idx, size_t len, bool value);

+#if defined(__cplusplus)
+}
 #endif
+
+#endif /* !defined(SPARSEMAP_H) */
--- a/src/sparsemap.c
+++ b/src/sparsemap.c
--- a/tests/common.c
+++ b/tests/common.c
@ -1,13 +1,20 @@
-#include <sys/types.h>
+#define _POSIX_C_SOURCE 199309L
+#define X86_INTRIN

 #include <assert.h>
-#include <pthread.h>
-#include <sparsemap.h>
+#include <pthread.h> // If using threads
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
+#include <time.h>
 #include <unistd.h>
+#ifdef X86_INTRIN
+#include <errno.h>
+#include <x86intrin.h>
+#endif

+#include "../include/sparsemap.h"
 #include "common.h"

 #pragma GCC diagnostic push
@ -22,84 +29,25 @@
 uint64_t
 tsc(void)
 {
+#ifdef X86_INTRIN
+  return __rdtsc();
+#else
  uint32_t low, high;
  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
  return ((uint64_t)high << 32) | low;
-}
-
-static uint64_t
-get_tsc_frequency()
-{
-  uint32_t high, low;
-  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
-  __asm__ volatile("rdtsc");
-  return ((uint64_t)high << 32) | low;
+#endif
 }

 double
-tsc_ticks_to_ns(uint64_t tsc_ticks)
+nsts()
 {
-  static uint64_t tsc_freq = 0;
-  if (tsc_freq == 0) {
-    tsc_freq = get_tsc_frequency();
-  }
-  return (double)tsc_ticks / (double)tsc_freq * 1e9;
-}
+  struct timespec ts;

-void
-est_sift_up(uint64_t *heap, int child_index)
-{
-  while (child_index > 0) {
-    int parent_index = (child_index - 1) / 2;
-    if (heap[parent_index] > heap[child_index]) {
-      // Swap parent and child
-      uint64_t temp = heap[parent_index];
-      heap[parent_index] = heap[child_index];
-      heap[child_index] = temp;
-      child_index = parent_index;
-    } else {
-      break; // Heap property satisfied
-    }
-  }
-}
-
-void
-est_sift_down(uint64_t *heap, int heap_size, int parent_index)
-{
-  int child_index = 2 * parent_index + 1; // Left child
-  while (child_index < heap_size) {
-    // Right child exists and is smaller than left child
-    if (child_index + 1 < heap_size && heap[child_index + 1] < heap[child_index]) {
-      child_index++;
-    }
-    // If the smallest child is smaller than the parent, swap them
-    if (heap[child_index] < heap[parent_index]) {
-      uint64_t temp = heap[child_index];
-      heap[child_index] = heap[parent_index];
-      heap[parent_index] = temp;
-      parent_index = child_index;
-      child_index = 2 * parent_index + 1;
-    } else {
-      break; // Heap property satisfied
-    }
-  }
-}
-
-void
-est_insert_value(uint64_t *heap, int heap_max_size, int *heap_size, uint64_t value)
-{
-  if (*heap_size < heap_max_size) { // Heap not full, insert value
-    heap[*heap_size] = value;
-    est_sift_up(heap, *heap_size);
-    (*heap_size)++;
-  } else {
-    // Heap is full, replace root with new value with a certain probability
-    // This is a very naive approach to maintain a sample of the input
-    if (rand() % 2) {
-      heap[0] = value;
-      est_sift_down(heap, heap_max_size, 0);
-    }
+  if (clock_gettime(CLOCK_REALTIME, &ts) == -1) {
+    perror("clock_gettime");
+    return -1.0; // Return -1.0 on error
  }
+  return ts.tv_sec + ts.tv_nsec / 1e9;
 }

 int __xorshift32_state = 0;
@ -170,7 +118,7 @@ has_sequential_set(int a[], int l, int r)
 int
 ensure_sequential_set(int a[], int l, int r)
 {
-  if (!a || l == 0 || r < 1 || r > l) {
+  if (!a || l == 0 || r < 1 || r > l - 1) {
    return 0;
  }

@ -199,21 +147,6 @@ ensure_sequential_set(int a[], int l, int r)
  return value;
 }

-int
-create_sequential_set_in_empty_map(sparsemap_t *map, int s, int r)
-{
-  int placed_at;
-  if (s >= r + 1) {
-    placed_at = 0;
-  } else {
-    placed_at = random_uint32() % (s - r - 1);
-  }
-  for (int i = placed_at; i < placed_at + r; i++) {
-    sparsemap_set(map, i, true);
-  }
-  return placed_at;
-}
-
 void
 print_array(int *array, int l)
 {
@ -340,6 +273,20 @@ is_unique(int a[], int l, int value)
  return 1; // Unique
 }

+int
+whats_set_uint64(uint64_t number, int pos[64])
+{
+  int length = 0;
+
+  for (int i = 0; i < 64; i++) {
+    if (number & ((uint64_t)1 << i)) {
+      pos[length++] = i;
+    }
+  }
+
+  return length;
+}
+
 void
 setup_test_array(int a[], int l, int max_value)
 {
@ -364,15 +311,6 @@ bitmap_from_uint32(sparsemap_t *map, uint32_t number)
  }
 }

-void
-bitmap_from_uint64(sparsemap_t *map, uint64_t number)
-{
-  for (int i = 0; i < 64; i++) {
-    bool bit = number & (1 << i);
-    sparsemap_set(map, i, bit);
-  }
-}
-
 uint32_t
 rank_uint64(uint64_t number, int n, int p)
 {
@ -400,22 +338,53 @@ rank_uint64(uint64_t number, int n, int p)
  return count;
 }

-int
-whats_set_uint64(uint64_t number, int pos[64])
+void
+print_bits(char *name, uint64_t value)
 {
-  int length = 0;
-
-  for (int i = 0; i < 64; i++) {
-    if (number & ((uint64_t)1 << i)) {
-      pos[length++] = i;
+  if (name) {
+    printf("%s\t", name);
+  }
+  for (int i = 63; i >= 0; i--) {
+    printf("%ld", (value >> i) & 1);
+    if (i % 8 == 0) {
+      printf(" "); // Add space for better readability
    }
  }
-
-  return length;
+  printf("\n");
 }

 void
-whats_set(sparsemap_t *map, int m)
+sm_bitmap_from_uint64(sparsemap_t *map, uint64_t number)
+{
+  for (int i = 0; i < 64; i++) {
+    bool bit = number & ((uint64_t)1 << i);
+    sparsemap_set(map, i, bit);
+  }
+}
+
+sparsemap_idx_t
+sm_add_span(sparsemap_t *map, int map_size, int span_length)
+{
+  int attempts = map_size / span_length;
+  sparsemap_idx_t placed_at;
+  do {
+    placed_at = random_uint32() % (map_size - span_length - 1);
+    if (sm_occupied(map, placed_at, span_length, true)) {
+      attempts--;
+    } else {
+      break;
+    }
+  } while (attempts);
+  for (int i = placed_at; i < placed_at + span_length; i++) {
+    if (sparsemap_set(map, i, true) != i) {
+      return placed_at; // TODO error?
+    }
+  }
+  return placed_at;
+}
+
+void
+sm_whats_set(sparsemap_t *map, int m)
 {
  logf("what's set in the range [0, %d): ", m);
  for (int i = 0; i < m; i++) {
@ -425,3 +394,25 @@ whats_set(sparsemap_t *map, int m)
  }
  logf("\n");
 }
+
+bool
+sm_is_span(sparsemap_t *map, sparsemap_idx_t m, int len, bool value)
+{
+  for (sparsemap_idx_t i = m; i < m + len; i++) {
+    if (sparsemap_is_set(map, i) != value) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool
+sm_occupied(sparsemap_t *map, sparsemap_idx_t m, int len, bool value)
+{
+  for (sparsemap_idx_t i = m; i < (sparsemap_idx_t)len; i++) {
+    if (sparsemap_is_set(map, i) == value) {
+      return true;
+    }
+  }
+  return false;
+}
--- a/tests/common.h
+++ b/tests/common.h
@ -23,20 +23,9 @@
 #define XORSHIFT_SEED_VALUE ((unsigned int)time(NULL) ^ getpid())
 #endif

-#define EST_MEDIAN_DECL(decl, size)   \
-  uint64_t heap_##decl[size] = { 0 }; \
-  int heap_##decl##_max_size = size;  \
-  int heap_##decl##_size = 0;
-
-#define EST_MEDIAN_ADD(decl, value) est_insert_value(heap_##decl, heap_##decl##_max_size, &heap_##decl##_size, (value));
-
-#define EST_MEDIAN_GET(decl) heap_##decl[0]
-
 uint64_t tsc(void);
 double tsc_ticks_to_ns(uint64_t tsc_ticks);
-void est_sift_up(uint64_t *heap, int child_index);
-void est_sift_down(uint64_t *heap, int heap_size, int parent_index);
-void est_insert_value(uint64_t *heap, int heap_max_size, int *heap_size, uint64_t value);
+double nsts();

 void xorshift32_seed();
 uint32_t xorshift32();
@ -52,11 +41,16 @@ int is_unique(int a[], int l, int value);
 void setup_test_array(int a[], int l, int max_value);
 void shuffle(int *array, size_t n);
 int ensure_sequential_set(int a[], int l, int r);
-int create_sequential_set_in_empty_map(sparsemap_t *map, int s, int r);
+sparsemap_idx_t sm_add_span(sparsemap_t *map, int map_size, int span_length);
+
+void print_bits(char *name, uint64_t value);

 void bitmap_from_uint32(sparsemap_t *map, uint32_t number);
-void bitmap_from_uint64(sparsemap_t *map, uint64_t number);
+void sm_bitmap_from_uint64(sparsemap_t *map, uint64_t number);
 uint32_t rank_uint64(uint64_t number, int n, int p);
 int whats_set_uint64(uint64_t number, int bitPositions[64]);

-void whats_set(sparsemap_t *map, int m);
+void sm_whats_set(sparsemap_t *map, int m);
+
+bool sm_is_span(sparsemap_t *map, sparsemap_idx_t m, int len, bool value);
+bool sm_occupied(sparsemap_t *map, sparsemap_idx_t m, int len, bool value);
--- a/tests/tdigest.c
+++ b/tests/tdigest.c
@ -0,0 +1,680 @@
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <math.h>
+#include "tdigest.h"
+#include <errno.h>
+#include <stdint.h>
+
+#ifndef TD_MALLOC_INCLUDE
+#define TD_MALLOC_INCLUDE "td_malloc.h"
+#endif
+
+#ifndef TD_ALLOC_H
+#define TD_ALLOC_H
+#define __td_malloc malloc
+#define __td_calloc calloc
+#define __td_realloc realloc
+#define __td_free free
+#endif
+
+#define __td_max(x, y) (((x) > (y)) ? (x) : (y))
+#define __td_min(x, y) (((x) < (y)) ? (x) : (y))
+
+static inline double weighted_average_sorted(double x1, double w1, double x2, double w2) {
+    const double x = (x1 * w1 + x2 * w2) / (w1 + w2);
+    return __td_max(x1, __td_min(x, x2));
+}
+
+static inline bool _tdigest_long_long_add_safe(long long a, long long b) {
+    if (b < 0) {
+        return (a >= __LONG_LONG_MAX__ - b);
+    } else {
+        return (a <= __LONG_LONG_MAX__ - b);
+    }
+}
+
+static inline double weighted_average(double x1, double w1, double x2, double w2) {
+    if (x1 <= x2) {
+        return weighted_average_sorted(x1, w1, x2, w2);
+    } else {
+        return weighted_average_sorted(x2, w2, x1, w1);
+    }
+}
+
+static inline void swap(double *arr, int i, int j) {
+    const double temp = arr[i];
+    arr[i] = arr[j];
+    arr[j] = temp;
+}
+
+static inline void swap_l(long long *arr, int i, int j) {
+    const long long temp = arr[i];
+    arr[i] = arr[j];
+    arr[j] = temp;
+}
+
+static unsigned int partition(double *means, long long *weights, unsigned int start,
+                              unsigned int end, unsigned int pivot_idx) {
+    const double pivotMean = means[pivot_idx];
+    swap(means, pivot_idx, end);
+    swap_l(weights, pivot_idx, end);
+
+    int i = start - 1;
+
+    for (unsigned int j = start; j < end; j++) {
+        // If current element is smaller than the pivot
+        if (means[j] < pivotMean) {
+            // increment index of smaller element
+            i++;
+            swap(means, i, j);
+            swap_l(weights, i, j);
+        }
+    }
+    swap(means, i + 1, end);
+    swap_l(weights, i + 1, end);
+    return i + 1;
+}
+
+/**
+ * Standard quick sort except that sorting rearranges parallel arrays
+ *
+ * @param means  Values to sort on
+ * @param weights The auxillary values to sort.
+ * @param start  The beginning of the values to sort
+ * @param end    The value after the last value to sort
+ */
+static void td_qsort(double *means, long long *weights, unsigned int start, unsigned int end) {
+    if (start < end) {
+        // two elements can be directly compared
+        if ((end - start) == 1) {
+            if (means[start] > means[end]) {
+                swap(means, start, end);
+                swap_l(weights, start, end);
+            }
+            return;
+        }
+        // generating a random number as a pivot was very expensive vs the array size
+        // const unsigned int pivot_idx = start + rand()%(end - start + 1);
+        const unsigned int pivot_idx = (end + start) / 2; // central pivot
+        const unsigned int new_pivot_idx = partition(means, weights, start, end, pivot_idx);
+        if (new_pivot_idx > start) {
+            td_qsort(means, weights, start, new_pivot_idx - 1);
+        }
+        td_qsort(means, weights, new_pivot_idx + 1, end);
+    }
+}
+
+static inline size_t cap_from_compression(double compression) {
+    if ((size_t)compression > ((SIZE_MAX / sizeof(double) / 6) - 10)) {
+        return 0;
+    }
+
+    return (6 * (size_t)(compression)) + 10;
+}
+
+static inline bool should_td_compress(td_histogram_t *h) {
+    return ((h->merged_nodes + h->unmerged_nodes) >= (h->cap - 1));
+}
+
+static inline int next_node(td_histogram_t *h) { return h->merged_nodes + h->unmerged_nodes; }
+
+int td_compress(td_histogram_t *h);
+
+static inline int _check_overflow(const double v) {
+    // double-precision overflow detected on h->unmerged_weight
+    if (v == INFINITY) {
+        return EDOM;
+    }
+    return 0;
+}
+
+static inline int _check_td_overflow(const double new_unmerged_weight,
+                                     const double new_total_weight) {
+    // double-precision overflow detected on h->unmerged_weight
+    if (new_unmerged_weight == INFINITY) {
+        return EDOM;
+    }
+    if (new_total_weight == INFINITY) {
+        return EDOM;
+    }
+    const double denom = 2 * MM_PI * new_total_weight * log(new_total_weight);
+    if (denom == INFINITY) {
+        return EDOM;
+    }
+
+    return 0;
+}
+
+int td_centroid_count(td_histogram_t *h) { return next_node(h); }
+
+void td_reset(td_histogram_t *h) {
+    if (!h) {
+        return;
+    }
+    h->min = __DBL_MAX__;
+    h->max = -h->min;
+    h->merged_nodes = 0;
+    h->merged_weight = 0;
+    h->unmerged_nodes = 0;
+    h->unmerged_weight = 0;
+    h->total_compressions = 0;
+}
+
+int td_init(double compression, td_histogram_t **result) {
+
+    const size_t capacity = cap_from_compression(compression);
+    if (capacity < 1) {
+        return 1;
+    }
+    td_histogram_t *histogram;
+    histogram = (td_histogram_t *)__td_malloc(sizeof(td_histogram_t));
+    if (!histogram) {
+        return 1;
+    }
+    histogram->cap = capacity;
+    histogram->compression = (double)compression;
+    td_reset(histogram);
+    histogram->nodes_mean = (double *)__td_calloc(capacity, sizeof(double));
+    if (!histogram->nodes_mean) {
+        td_free(histogram);
+        return 1;
+    }
+    histogram->nodes_weight = (long long *)__td_calloc(capacity, sizeof(long long));
+    if (!histogram->nodes_weight) {
+        td_free(histogram);
+        return 1;
+    }
+    *result = histogram;
+
+    return 0;
+}
+
+td_histogram_t *td_new(double compression) {
+    td_histogram_t *mdigest = NULL;
+    td_init(compression, &mdigest);
+    return mdigest;
+}
+
+void td_free(td_histogram_t *histogram) {
+    if (histogram->nodes_mean) {
+        __td_free((void *)(histogram->nodes_mean));
+    }
+    if (histogram->nodes_weight) {
+        __td_free((void *)(histogram->nodes_weight));
+    }
+    __td_free((void *)(histogram));
+}
+
+int td_merge(td_histogram_t *into, td_histogram_t *from) {
+    if (td_compress(into) != 0)
+        return EDOM;
+    if (td_compress(from) != 0)
+        return EDOM;
+    const int pos = from->merged_nodes + from->unmerged_nodes;
+    for (int i = 0; i < pos; i++) {
+        const double mean = from->nodes_mean[i];
+        const long long weight = from->nodes_weight[i];
+        if (td_add(into, mean, weight) != 0) {
+            return EDOM;
+        }
+    }
+    return 0;
+}
+
+long long td_size(td_histogram_t *h) { return h->merged_weight + h->unmerged_weight; }
+
+double td_cdf(td_histogram_t *h, double val) {
+    td_compress(h);
+    // no data to examine
+    if (h->merged_nodes == 0) {
+        return NAN;
+    }
+    // bellow lower bound
+    if (val < h->min) {
+        return 0;
+    }
+    // above upper bound
+    if (val > h->max) {
+        return 1;
+    }
+    if (h->merged_nodes == 1) {
+        // exactly one centroid, should have max==min
+        const double width = h->max - h->min;
+        if (val - h->min <= width) {
+            // min and max are too close together to do any viable interpolation
+            return 0.5;
+        } else {
+            // interpolate if somehow we have weight > 0 and max != min
+            return (val - h->min) / width;
+        }
+    }
+    const int n = h->merged_nodes;
+    // check for the left tail
+    const double left_centroid_mean = h->nodes_mean[0];
+    const double left_centroid_weight = (double)h->nodes_weight[0];
+    const double merged_weight_d = (double)h->merged_weight;
+    if (val < left_centroid_mean) {
+        // note that this is different than h->nodes_mean[0] > min
+        // ... this guarantees we divide by non-zero number and interpolation works
+        const double width = left_centroid_mean - h->min;
+        if (width > 0) {
+            // must be a sample exactly at min
+            if (val == h->min) {
+                return 0.5 / merged_weight_d;
+            } else {
+                return (1 + (val - h->min) / width * (left_centroid_weight / 2 - 1)) /
+                       merged_weight_d;
+            }
+        } else {
+            // this should be redundant with the check val < h->min
+            return 0;
+        }
+    }
+    // and the right tail
+    const double right_centroid_mean = h->nodes_mean[n - 1];
+    const double right_centroid_weight = (double)h->nodes_weight[n - 1];
+    if (val > right_centroid_mean) {
+        const double width = h->max - right_centroid_mean;
+        if (width > 0) {
+            if (val == h->max) {
+                return 1 - 0.5 / merged_weight_d;
+            } else {
+                // there has to be a single sample exactly at max
+                const double dq = (1 + (h->max - val) / width * (right_centroid_weight / 2 - 1)) /
+                                  merged_weight_d;
+                return 1 - dq;
+            }
+        } else {
+            return 1;
+        }
+    }
+    // we know that there are at least two centroids and mean[0] < x < mean[n-1]
+    // that means that there are either one or more consecutive centroids all at exactly x
+    // or there are consecutive centroids, c0 < x < c1
+    double weightSoFar = 0;
+    for (int it = 0; it < n - 1; it++) {
+        // weightSoFar does not include weight[it] yet
+        if (h->nodes_mean[it] == val) {
+            // we have one or more centroids == x, treat them as one
+            // dw will accumulate the weight of all of the centroids at x
+            double dw = 0;
+            while (it < n && h->nodes_mean[it] == val) {
+                dw += (double)h->nodes_weight[it];
+                it++;
+            }
+            return (weightSoFar + dw / 2) / (double)h->merged_weight;
+        } else if (h->nodes_mean[it] <= val && val < h->nodes_mean[it + 1]) {
+            const double node_weight = (double)h->nodes_weight[it];
+            const double node_weight_next = (double)h->nodes_weight[it + 1];
+            const double node_mean = h->nodes_mean[it];
+            const double node_mean_next = h->nodes_mean[it + 1];
+            // landed between centroids ... check for floating point madness
+            if (node_mean_next - node_mean > 0) {
+                // note how we handle singleton centroids here
+                // the point is that for singleton centroids, we know that their entire
+                // weight is exactly at the centroid and thus shouldn't be involved in
+                // interpolation
+                double leftExcludedW = 0;
+                double rightExcludedW = 0;
+                if (node_weight == 1) {
+                    if (node_weight_next == 1) {
+                        // two singletons means no interpolation
+                        // left singleton is in, right is out
+                        return (weightSoFar + 1) / merged_weight_d;
+                    } else {
+                        leftExcludedW = 0.5;
+                    }
+                } else if (node_weight_next == 1) {
+                    rightExcludedW = 0.5;
+                }
+                double dw = (node_weight + node_weight_next) / 2;
+
+                // adjust endpoints for any singleton
+                double dwNoSingleton = dw - leftExcludedW - rightExcludedW;
+
+                double base = weightSoFar + node_weight / 2 + leftExcludedW;
+                return (base + dwNoSingleton * (val - node_mean) / (node_mean_next - node_mean)) /
+                       merged_weight_d;
+            } else {
+                // this is simply caution against floating point madness
+                // it is conceivable that the centroids will be different
+                // but too near to allow safe interpolation
+                double dw = (node_weight + node_weight_next) / 2;
+                return (weightSoFar + dw) / merged_weight_d;
+            }
+        } else {
+            weightSoFar += (double)h->nodes_weight[it];
+        }
+    }
+    return 1 - 0.5 / merged_weight_d;
+}
+
+static double td_internal_iterate_centroids_to_index(const td_histogram_t *h, const double index,
+                                                     const double left_centroid_weight,
+                                                     const int total_centroids, double *weightSoFar,
+                                                     int *node_pos) {
+    if (left_centroid_weight > 1 && index < left_centroid_weight / 2) {
+        // there is a single sample at min so we interpolate with less weight
+        return h->min + (index - 1) / (left_centroid_weight / 2 - 1) * (h->nodes_mean[0] - h->min);
+    }
+
+    // usually the last centroid will have unit weight so this test will make it moot
+    if (index > h->merged_weight - 1) {
+        return h->max;
+    }
+
+    // if the right-most centroid has more than one sample, we still know
+    // that one sample occurred at max so we can do some interpolation
+    const double right_centroid_weight = (double)h->nodes_weight[total_centroids - 1];
+    const double right_centroid_mean = h->nodes_mean[total_centroids - 1];
+    if (right_centroid_weight > 1 &&
+        (double)h->merged_weight - index <= right_centroid_weight / 2) {
+        return h->max - ((double)h->merged_weight - index - 1) / (right_centroid_weight / 2 - 1) *
+                            (h->max - right_centroid_mean);
+    }
+
+    for (; *node_pos < total_centroids - 1; (*node_pos)++) {
+        const int i = *node_pos;
+        const double node_weight = (double)h->nodes_weight[i];
+        const double node_weight_next = (double)h->nodes_weight[i + 1];
+        const double node_mean = h->nodes_mean[i];
+        const double node_mean_next = h->nodes_mean[i + 1];
+        const double dw = (node_weight + node_weight_next) / 2;
+        if (*weightSoFar + dw > index) {
+            // centroids i and i+1 bracket our current point
+            // check for unit weight
+            double leftUnit = 0;
+            if (node_weight == 1) {
+                if (index - *weightSoFar < 0.5) {
+                    // within the singleton's sphere
+                    return node_mean;
+                } else {
+                    leftUnit = 0.5;
+                }
+            }
+            double rightUnit = 0;
+            if (node_weight_next == 1) {
+                if (*weightSoFar + dw - index <= 0.5) {
+                    // no interpolation needed near singleton
+                    return node_mean_next;
+                }
+                rightUnit = 0.5;
+            }
+            const double z1 = index - *weightSoFar - leftUnit;
+            const double z2 = *weightSoFar + dw - index - rightUnit;
+            return weighted_average(node_mean, z2, node_mean_next, z1);
+        }
+        *weightSoFar += dw;
+    }
+
+    // weightSoFar = totalWeight - weight[total_centroids-1]/2 (very nearly)
+    // so we interpolate out to max value ever seen
+    const double z1 = index - h->merged_weight - right_centroid_weight / 2.0;
+    const double z2 = right_centroid_weight / 2 - z1;
+    return weighted_average(right_centroid_mean, z1, h->max, z2);
+}
+
+double td_quantile(td_histogram_t *h, double q) {
+    td_compress(h);
+    // q should be in [0,1]
+    if (q < 0.0 || q > 1.0 || h->merged_nodes == 0) {
+        return NAN;
+    }
+    // with one data point, all quantiles lead to Rome
+    if (h->merged_nodes == 1) {
+        return h->nodes_mean[0];
+    }
+
+    // if values were stored in a sorted array, index would be the offset we are interested in
+    const double index = q * (double)h->merged_weight;
+
+    // beyond the boundaries, we return min or max
+    // usually, the first centroid will have unit weight so this will make it moot
+    if (index < 1) {
+        return h->min;
+    }
+
+    // we know that there are at least two centroids now
+    const int n = h->merged_nodes;
+
+    // if the left centroid has more than one sample, we still know
+    // that one sample occurred at min so we can do some interpolation
+    const double left_centroid_weight = (double)h->nodes_weight[0];
+
+    // in between extremes we interpolate between centroids
+    double weightSoFar = left_centroid_weight / 2;
+    int i = 0;
+    return td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n, &weightSoFar,
+                                                  &i);
+}
+
+int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length) {
+    td_compress(h);
+
+    if (NULL == quantiles || NULL == values) {
+        return EINVAL;
+    }
+
+    const int n = h->merged_nodes;
+    if (n == 0) {
+        for (size_t i = 0; i < length; i++) {
+            values[i] = NAN;
+        }
+        return 0;
+    }
+    if (n == 1) {
+        for (size_t i = 0; i < length; i++) {
+            const double requested_quantile = quantiles[i];
+
+            // q should be in [0,1]
+            if (requested_quantile < 0.0 || requested_quantile > 1.0) {
+                values[i] = NAN;
+            } else {
+                // with one data point, all quantiles lead to Rome
+                values[i] = h->nodes_mean[0];
+            }
+        }
+        return 0;
+    }
+
+    // we know that there are at least two centroids now
+    // if the left centroid has more than one sample, we still know
+    // that one sample occurred at min so we can do some interpolation
+    const double left_centroid_weight = (double)h->nodes_weight[0];
+
+    // in between extremes we interpolate between centroids
+    double weightSoFar = left_centroid_weight / 2;
+    int node_pos = 0;
+
+    // to avoid allocations we use the values array for intermediate computation
+    // i.e. to store the expected cumulative count at each percentile
+    for (size_t qpos = 0; qpos < length; qpos++) {
+        const double index = quantiles[qpos] * (double)h->merged_weight;
+        values[qpos] = td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n,
+                                                              &weightSoFar, &node_pos);
+    }
+    return 0;
+}
+
+static double td_internal_trimmed_mean(const td_histogram_t *h, const double leftmost_weight,
+                                       const double rightmost_weight) {
+    double count_done = 0;
+    double trimmed_sum = 0;
+    double trimmed_count = 0;
+    for (int i = 0; i < h->merged_nodes; i++) {
+
+        const double n_weight = (double)h->nodes_weight[i];
+        // Assume the whole centroid falls into the range
+        double count_add = n_weight;
+
+        // If we haven't reached the low threshold yet, skip appropriate part of the centroid.
+        count_add -= __td_min(__td_max(0, leftmost_weight - count_done), count_add);
+
+        // If we have reached the upper threshold, ignore the overflowing part of the centroid.
+
+        count_add = __td_min(__td_max(0, rightmost_weight - count_done), count_add);
+
+        // consider the whole centroid processed
+        count_done += n_weight;
+
+        // increment the sum / count
+        trimmed_sum += h->nodes_mean[i] * count_add;
+        trimmed_count += count_add;
+
+        // break once we cross the high threshold
+        if (count_done >= rightmost_weight)
+            break;
+    }
+
+    return trimmed_sum / trimmed_count;
+}
+
+double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut) {
+    td_compress(h);
+    // proportion_to_cut should be in [0,1]
+    if (h->merged_nodes == 0 || proportion_to_cut < 0.0 || proportion_to_cut > 1.0) {
+        return NAN;
+    }
+    // with one data point, all values lead to Rome
+    if (h->merged_nodes == 1) {
+        return h->nodes_mean[0];
+    }
+
+    /* translate the percentiles to counts */
+    const double leftmost_weight = floor((double)h->merged_weight * proportion_to_cut);
+    const double rightmost_weight = ceil((double)h->merged_weight * (1.0 - proportion_to_cut));
+
+    return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight);
+}
+
+double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut) {
+    td_compress(h);
+    // leftmost_cut and rightmost_cut should be in [0,1]
+    if (h->merged_nodes == 0 || leftmost_cut < 0.0 || leftmost_cut > 1.0 || rightmost_cut < 0.0 ||
+        rightmost_cut > 1.0) {
+        return NAN;
+    }
+    // with one data point, all values lead to Rome
+    if (h->merged_nodes == 1) {
+        return h->nodes_mean[0];
+    }
+
+    /* translate the percentiles to counts */
+    const double leftmost_weight = floor((double)h->merged_weight * leftmost_cut);
+    const double rightmost_weight = ceil((double)h->merged_weight * rightmost_cut);
+
+    return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight);
+}
+
+int td_add(td_histogram_t *h, double mean, long long weight) {
+    if (should_td_compress(h)) {
+        const int overflow_res = td_compress(h);
+        if (overflow_res != 0)
+            return overflow_res;
+    }
+    const int pos = next_node(h);
+    if (pos >= h->cap)
+        return EDOM;
+    if (_tdigest_long_long_add_safe(h->unmerged_weight, weight) == false)
+        return EDOM;
+    const long long new_unmerged_weight = h->unmerged_weight + weight;
+    if (_tdigest_long_long_add_safe(new_unmerged_weight, h->merged_weight) == false)
+        return EDOM;
+    const long long new_total_weight = new_unmerged_weight + h->merged_weight;
+    // double-precision overflow detected
+    const int overflow_res =
+        _check_td_overflow((double)new_unmerged_weight, (double)new_total_weight);
+    if (overflow_res != 0)
+        return overflow_res;
+
+    if (mean < h->min) {
+        h->min = mean;
+    }
+    if (mean > h->max) {
+        h->max = mean;
+    }
+    h->nodes_mean[pos] = mean;
+    h->nodes_weight[pos] = weight;
+    h->unmerged_nodes++;
+    h->unmerged_weight = new_unmerged_weight;
+    return 0;
+}
+
+int td_compress(td_histogram_t *h) {
+    if (h->unmerged_nodes == 0) {
+        return 0;
+    }
+    int N = h->merged_nodes + h->unmerged_nodes;
+    td_qsort(h->nodes_mean, h->nodes_weight, 0, N - 1);
+    const double total_weight = (double)h->merged_weight + (double)h->unmerged_weight;
+    // double-precision overflow detected
+    const int overflow_res = _check_td_overflow((double)h->unmerged_weight, (double)total_weight);
+    if (overflow_res != 0)
+        return overflow_res;
+    if (total_weight <= 1)
+        return 0;
+    const double denom = 2 * MM_PI * total_weight * log(total_weight);
+    if (_check_overflow(denom) != 0)
+        return EDOM;
+
+    // Compute the normalizer given compression and number of points.
+    const double normalizer = h->compression / denom;
+    if (_check_overflow(normalizer) != 0)
+        return EDOM;
+    int cur = 0;
+    double weight_so_far = 0;
+
+    for (int i = 1; i < N; i++) {
+        const double proposed_weight = (double)h->nodes_weight[cur] + (double)h->nodes_weight[i];
+        const double z = proposed_weight * normalizer;
+        // quantile up to cur
+        const double q0 = weight_so_far / total_weight;
+        // quantile up to cur + i
+        const double q2 = (weight_so_far + proposed_weight) / total_weight;
+        // Convert  a quantile to the k-scale
+        const bool should_add = (z <= (q0 * (1 - q0))) && (z <= (q2 * (1 - q2)));
+        // next point will fit
+        // so merge into existing centroid
+        if (should_add) {
+            h->nodes_weight[cur] += h->nodes_weight[i];
+            const double delta = h->nodes_mean[i] - h->nodes_mean[cur];
+            const double weighted_delta = (delta * h->nodes_weight[i]) / h->nodes_weight[cur];
+            h->nodes_mean[cur] += weighted_delta;
+        } else {
+            weight_so_far += h->nodes_weight[cur];
+            cur++;
+            h->nodes_weight[cur] = h->nodes_weight[i];
+            h->nodes_mean[cur] = h->nodes_mean[i];
+        }
+        if (cur != i) {
+            h->nodes_weight[i] = 0;
+            h->nodes_mean[i] = 0.0;
+        }
+    }
+    h->merged_nodes = cur + 1;
+    h->merged_weight = total_weight;
+    h->unmerged_nodes = 0;
+    h->unmerged_weight = 0;
+    h->total_compressions++;
+    return 0;
+}
+
+double td_min(td_histogram_t *h) { return h->min; }
+
+double td_max(td_histogram_t *h) { return h->max; }
+
+int td_compression(td_histogram_t *h) { return h->compression; }
+
+const long long *td_centroids_weight(td_histogram_t *h) { return h->nodes_weight; }
+
+const double *td_centroids_mean(td_histogram_t *h) { return h->nodes_mean; }
+
+long long td_centroids_weight_at(td_histogram_t *h, int pos) { return h->nodes_weight[pos]; }
+
+double td_centroids_mean_at(td_histogram_t *h, int pos) {
+    if (pos < 0 || pos > h->merged_nodes) {
+        return NAN;
+    }
+    return h->nodes_mean[pos];
+}
--- a/tests/tdigest.h
+++ b/tests/tdigest.h
@ -0,0 +1,258 @@
+#pragma once
+#include <stdlib.h>
+
+/**
+ * Adaptive histogram based on something like streaming k-means crossed with Q-digest.
+ * The implementation is a direct descendent of MergingDigest
+ * https://github.com/tdunning/t-digest/
+ *
+ * Copyright (c) 2021 Redis, All rights reserved.
+ * Copyright (c) 2018 Andrew Werner, All rights reserved.
+ *
+ * The special characteristics of this algorithm are:
+ *
+ * - smaller summaries than Q-digest
+ *
+ * - provides part per million accuracy for extreme quantiles and typically &lt;1000 ppm accuracy
+ * for middle quantiles
+ *
+ * - fast
+ *
+ * - simple
+ *
+ * - easy to adapt for use with map-reduce
+ */
+
+#define MM_PI 3.14159265358979323846
+
+struct td_histogram {
+    // compression is a setting used to configure the size of centroids when merged.
+    double compression;
+
+    double min;
+    double max;
+
+    // cap is the total size of nodes
+    int cap;
+    // merged_nodes is the number of merged nodes at the front of nodes.
+    int merged_nodes;
+    // unmerged_nodes is the number of buffered nodes.
+    int unmerged_nodes;
+
+    // we run the merge in reverse every other merge to avoid left-to-right bias in merging
+    long long total_compressions;
+
+    long long merged_weight;
+    long long unmerged_weight;
+
+    double *nodes_mean;
+    long long *nodes_weight;
+};
+
+typedef struct td_histogram td_histogram_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Allocate the memory, initialise the t-digest, and return the histogram as output parameter.
+ * @param compression The compression parameter.
+ * 100 is a common value for normal uses.
+ * 1000 is extremely large.
+ * The number of centroids retained will be a smallish (usually less than 10) multiple of this
+ * number.
+ * @return the histogram on success, NULL if allocation failed.
+ */
+td_histogram_t *td_new(double compression);
+
+/**
+ * Allocate the memory and initialise the t-digest.
+ *
+ * @param compression The compression parameter.
+ * 100 is a common value for normal uses.
+ * 1000 is extremely large.
+ * The number of centroids retained will be a smallish (usually less than 10) multiple of this
+ * number.
+ * @param result Output parameter to capture allocated histogram.
+ * @return 0 on success, 1 if allocation failed.
+ */
+int td_init(double compression, td_histogram_t **result);
+
+/**
+ * Frees the memory associated with the t-digest.
+ *
+ * @param h The histogram you want to free.
+ */
+void td_free(td_histogram_t *h);
+
+/**
+ * Reset a histogram to zero - empty out a histogram and re-initialise it
+ *
+ * If you want to re-use an existing histogram, but reset everything back to zero, this
+ * is the routine to use.
+ *
+ * @param h The histogram you want to reset to empty.
+ *
+ */
+void td_reset(td_histogram_t *h);
+
+/**
+ * Adds a sample to a histogram.
+ *
+ * @param val The value to add.
+ * @param weight The weight of this point.
+ * @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided
+ * weight.
+ *
+ */
+int td_add(td_histogram_t *h, double val, long long weight);
+
+/**
+ * Re-examines a t-digest to determine whether some centroids are redundant.  If your data are
+ * perversely ordered, this may be a good idea.  Even if not, this may save 20% or so in space.
+ *
+ * The cost is roughly the same as adding as many data points as there are centroids.  This
+ * is typically &lt; 10 * compression, but could be as high as 100 * compression.
+ * This is a destructive operation that is not thread-safe.
+ *
+ * @param h The histogram you want to compress.
+ * @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided
+ * weight. If overflow is detected the histogram is not changed.
+ *
+ */
+int td_compress(td_histogram_t *h);
+
+/**
+ * Merges all of the values from 'from' to 'this' histogram.
+ *
+ * @param h "This" pointer
+ * @param from Histogram to copy values from.
+ * * @return 0 on success, EDOM if overflow was detected as a consequence of merging the the
+ * provided histogram. If overflow is detected the original histogram is not detected.
+ */
+int td_merge(td_histogram_t *h, td_histogram_t *from);
+
+/**
+ * Returns the fraction of all points added which are &le; x.
+ *
+ * @param x The cutoff for the cdf.
+ * @return The fraction of all data which is less or equal to x.
+ */
+double td_cdf(td_histogram_t *h, double x);
+
+/**
+ * Returns an estimate of the cutoff such that a specified fraction of the data
+ * added to this TDigest would be less than or equal to the cutoff.
+ *
+ * @param q The desired fraction
+ * @return The value x such that cdf(x) == q;
+ */
+double td_quantile(td_histogram_t *h, double q);
+
+/**
+ * Returns an estimate of the cutoff such that a specified fraction of the data
+ * added to this TDigest would be less than or equal to the cutoffs.
+ *
+ * @param quantiles The ordered percentiles array to get the values for.
+ * @param values Destination array containing the values at the given quantiles.
+ * The values array should be allocated by the caller.
+ * @return 0 on success, ENOMEM if the provided destination array is null.
+ */
+int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length);
+
+/**
+ * Returns the trimmed mean ignoring values outside given cutoff upper and lower limits.
+ *
+ * @param leftmost_cut Fraction to cut off of the left tail of the distribution.
+ * @param rightmost_cut Fraction to cut off of the right tail of the distribution.
+ * @return The trimmed mean ignoring values outside given cutoff upper and lower limits;
+ */
+double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut);
+
+/**
+ * Returns the trimmed mean ignoring values outside given a symmetric cutoff limits.
+ *
+ * @param proportion_to_cut Fraction to cut off of the left and right tails of the distribution.
+ * @return The trimmed mean ignoring values outside given cutoff upper and lower limits;
+ */
+double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut);
+
+/**
+ * Returns the current compression factor.
+ *
+ * @return The compression factor originally used to set up the TDigest.
+ */
+int td_compression(td_histogram_t *h);
+
+/**
+ * Returns the number of points that have been added to this TDigest.
+ *
+ * @return The sum of the weights on all centroids.
+ */
+long long td_size(td_histogram_t *h);
+
+/**
+ * Returns the number of centroids being used by this TDigest.
+ *
+ * @return The number of centroids being used.
+ */
+int td_centroid_count(td_histogram_t *h);
+
+/**
+ * Get minimum value from the histogram.  Will return __DBL_MAX__ if the histogram
+ * is empty.
+ *
+ * @param h "This" pointer
+ */
+double td_min(td_histogram_t *h);
+
+/**
+ * Get maximum value from the histogram.  Will return - __DBL_MAX__ if the histogram
+ * is empty.
+ *
+ * @param h "This" pointer
+ */
+double td_max(td_histogram_t *h);
+
+/**
+ * Get the full centroids weight array for 'this' histogram.
+ *
+ * @param h "This" pointer
+ *
+ * @return The full centroids weight array.
+ */
+const long long *td_centroids_weight(td_histogram_t *h);
+
+/**
+ * Get the full centroids mean array for 'this' histogram.
+ *
+ * @param h "This" pointer
+ *
+ * @return The full centroids mean array.
+ */
+const double *td_centroids_mean(td_histogram_t *h);
+
+/**
+ * Get the centroid weight for 'this' histogram and 'pos'.
+ *
+ * @param h "This" pointer
+ * @param pos centroid position.
+ *
+ * @return The centroid weight.
+ */
+long long td_centroids_weight_at(td_histogram_t *h, int pos);
+
+/**
+ * Get the centroid mean for 'this' histogram and 'pos'.
+ *
+ * @param h "This" pointer
+ * @param pos centroid position.
+ *
+ * @return The centroid mean.
+ */
+double td_centroids_mean_at(td_histogram_t *h, int pos);
+
+#ifdef __cplusplus
+}
+#endif
--- a/tests/test.c
+++ b/tests/test.c