Compare commits
61 commits
gburd/resi
...
main
Author | SHA1 | Date | |
---|---|---|---|
6c8ad3b25f | |||
2f6e9bdf7b | |||
7335c8e1d8 | |||
f7fd8eed9a | |||
7ecd2e5dc2 | |||
eae0743b56 | |||
68c8cd0858 | |||
e398d014e8 | |||
a5c62cfe1e | |||
7bb26dbe88 | |||
69dd960558 | |||
b028408150 | |||
f5b500087d | |||
604d48617e | |||
7a572453c9 | |||
6990f94278 | |||
984a1a920d | |||
e641e6cc63 | |||
32721da645 | |||
cea31ab184 | |||
f525a097c7 | |||
|
a24a4e1cc5 | ||
08e8c0a5d1 | |||
7494a9a3a2 | |||
0e348efaf6 | |||
367d15a160 | |||
3b8e083806 | |||
c53bd53f9a | |||
a2b1a22a79 | |||
1909954b42 | |||
cbd53976f9 | |||
40de89bd8a | |||
756476561f | |||
4e2b4bde26 | |||
9dccdcbf76 | |||
ad910f0adf | |||
7c9ecc6d79 | |||
|
ea32274524 | ||
4bf1a5a00c | |||
2afbfa946e | |||
ffc762a796 | |||
fcab709f62 | |||
b9612f12cc | |||
|
57a8f99a32 | ||
a7754b05ba | |||
86798b32bd | |||
c9ae042b40 | |||
|
6b6ed5290f | ||
897e1af9df | |||
7192c7dcff | |||
|
c65fcebaad | ||
ee695b7243 | |||
b37d390e12 | |||
94fbf768c0 | |||
2dac3ed385 | |||
4ebe555fac | |||
ef7ac48e32 | |||
faea4510dc | |||
c742185b73 | |||
0297757856 | |||
b3dfd745e7 |
29 changed files with 34769 additions and 993 deletions
|
@ -3,8 +3,11 @@ Checks: >
|
||||||
bugprone-*,
|
bugprone-*,
|
||||||
clang-analyzer-*,
|
clang-analyzer-*,
|
||||||
google-*,
|
google-*,
|
||||||
misc-*,
|
-google-objectivec-*,
|
||||||
modernize-*,
|
modernize-*,
|
||||||
|
-modernize-deprecated-headers,
|
||||||
|
-modernize-use-using,
|
||||||
|
misc-*,
|
||||||
performance-*,
|
performance-*,
|
||||||
portability-*,
|
portability-*,
|
||||||
-bugprone-branch-clone,
|
-bugprone-branch-clone,
|
||||||
|
|
2
.envrc
2
.envrc
|
@ -1,5 +1,5 @@
|
||||||
if ! has nix_direnv_version || ! nix_direnv_version 3.0.4; then
|
if ! has nix_direnv_version || ! nix_direnv_version 3.0.4; then
|
||||||
source_url "https://raw.githubusercontent.com/nix-community/nix-direnv/3.0.4/direnvrc" "sha256-DzlYZ33mWF/Gs8DDeyjr8mnVmQGx7ASYqA5WlxwvBG4="
|
source_url "https://raw.githubusercontent.com/nix-community/nix-direnv/3.0.4/direnvrc" "sha256-DzlYZ33mWF/Gs8DDeyjr8mnVmQGx7ASYqA5WlxwvBG4="
|
||||||
fi
|
fi
|
||||||
watch_file devShell.nix shell.nix flake.nix
|
watch_file shell.nix flake.nix
|
||||||
use flake || use nix
|
use flake || use nix
|
||||||
|
|
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -3,6 +3,7 @@
|
||||||
**/*.o
|
**/*.o
|
||||||
tests/test
|
tests/test
|
||||||
examples/ex_?
|
examples/ex_?
|
||||||
|
tests/soak
|
||||||
.cache
|
.cache
|
||||||
hints.txt
|
hints.txt
|
||||||
tmp/
|
tmp/
|
||||||
|
@ -27,6 +28,7 @@ compile_commands.json
|
||||||
*.dat
|
*.dat
|
||||||
*.fsm
|
*.fsm
|
||||||
*.db
|
*.db
|
||||||
|
.vscode/
|
||||||
|
|
||||||
# Created by https://www.gitignore.io/api/jetbrains
|
# Created by https://www.gitignore.io/api/jetbrains
|
||||||
# Edit at https://www.gitignore.io/?templates=jetbrains
|
# Edit at https://www.gitignore.io/?templates=jetbrains
|
||||||
|
|
15
.idea/customTargets.xml
Normal file
15
.idea/customTargets.xml
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="CLionExternalBuildManager">
|
||||||
|
<target id="db0ccaeb-4851-470b-83d0-afa663f6ceb9" name="tests/soak" defaultType="MAKE">
|
||||||
|
<configuration id="98973a90-a9d0-431b-9071-9ce6960b0b01" name="tests/soak">
|
||||||
|
<build type="MAKE">
|
||||||
|
<make targetName="tests/soak" />
|
||||||
|
</build>
|
||||||
|
<clean type="MAKE">
|
||||||
|
<make targetName="clean" />
|
||||||
|
</clean>
|
||||||
|
</configuration>
|
||||||
|
</target>
|
||||||
|
</component>
|
||||||
|
</project>
|
25
.idea/makefile.xml
Normal file
25
.idea/makefile.xml
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="MakefileBuildTargetsManager">
|
||||||
|
<user-build-targets>
|
||||||
|
<build-target name="tests/soak">
|
||||||
|
<build-configurations>
|
||||||
|
<build-configuration>
|
||||||
|
<make-targets>
|
||||||
|
<make-target>tests/soak</make-target>
|
||||||
|
</make-targets>
|
||||||
|
</build-configuration>
|
||||||
|
</build-configurations>
|
||||||
|
</build-target>
|
||||||
|
<build-target name="clean">
|
||||||
|
<build-configurations>
|
||||||
|
<build-configuration>
|
||||||
|
<make-targets>
|
||||||
|
<make-target>clean</make-target>
|
||||||
|
</make-targets>
|
||||||
|
</build-configuration>
|
||||||
|
</build-configurations>
|
||||||
|
</build-target>
|
||||||
|
</user-build-targets>
|
||||||
|
</component>
|
||||||
|
</project>
|
|
@ -8,6 +8,7 @@
|
||||||
<sourceRoots>
|
<sourceRoots>
|
||||||
<file path="$PROJECT_DIR$/examples" />
|
<file path="$PROJECT_DIR$/examples" />
|
||||||
<file path="$PROJECT_DIR$/include" />
|
<file path="$PROJECT_DIR$/include" />
|
||||||
|
<file path="$PROJECT_DIR$/lib" />
|
||||||
<file path="$PROJECT_DIR$/src" />
|
<file path="$PROJECT_DIR$/src" />
|
||||||
<file path="$PROJECT_DIR$/tests" />
|
<file path="$PROJECT_DIR$/tests" />
|
||||||
</sourceRoots>
|
</sourceRoots>
|
||||||
|
|
105
.idx/dev.nix
Normal file
105
.idx/dev.nix
Normal file
|
@ -0,0 +1,105 @@
|
||||||
|
# To learn more about how to use Nix to configure your environment
|
||||||
|
# see: https://developers.google.com/idx/guides/customize-idx-env
|
||||||
|
{ pkgs, ... }: {
|
||||||
|
# Which nixpkgs channel to use.
|
||||||
|
channel = "stable-23.11"; # or "unstable"
|
||||||
|
# Use https://search.nixos.org/packages to find packages
|
||||||
|
packages = with pkgs; [
|
||||||
|
act
|
||||||
|
autoconf
|
||||||
|
clang
|
||||||
|
clang-tools
|
||||||
|
cmake
|
||||||
|
ed
|
||||||
|
fira-code-nerdfont
|
||||||
|
gcc
|
||||||
|
gdb
|
||||||
|
gettext
|
||||||
|
glibc.out
|
||||||
|
glibc.static
|
||||||
|
gnumake
|
||||||
|
graphviz-nox
|
||||||
|
libbacktrace
|
||||||
|
libtool
|
||||||
|
lldb
|
||||||
|
m4
|
||||||
|
neovim
|
||||||
|
ninja
|
||||||
|
openssh
|
||||||
|
perl
|
||||||
|
pkg-config
|
||||||
|
python3
|
||||||
|
ripgrep
|
||||||
|
# pkgs.python311
|
||||||
|
# pkgs.python311Packages.pip
|
||||||
|
];
|
||||||
|
# Sets environment variables in the workspace
|
||||||
|
env = {
|
||||||
|
GIT_SSH_COMMAND="ssh -i ~/.ssh/id_ed25519 -F /dev/null";
|
||||||
|
};
|
||||||
|
idx = {
|
||||||
|
# Search for the extensions you want on https://open-vsx.org/ and use "publisher.id"
|
||||||
|
extensions = [
|
||||||
|
"coolbear.systemd-unit-file"
|
||||||
|
"dotjoshjohnson.xml"
|
||||||
|
"editorconfig.editorconfig"
|
||||||
|
"esbenp.prettier-vscode"
|
||||||
|
"mads-hartmann.bash-ide-vscode"
|
||||||
|
"ms-python.python"
|
||||||
|
"ms-vscode.clangd"
|
||||||
|
"ms-vscode.cmake-tools"
|
||||||
|
"ms-vscode.cpptools"
|
||||||
|
"ms-vscode.cpptools-extension-pack"
|
||||||
|
"ms-vscode.makefile-tools"
|
||||||
|
"ms-vsliveshare.vsliveshare"
|
||||||
|
"redhat.vscode-yaml"
|
||||||
|
"rogalmic.bash-debug"
|
||||||
|
"ryu1kn.partial-diff"
|
||||||
|
"streetsidesoftware.code-spell-checker"
|
||||||
|
"timonwong.shellcheck"
|
||||||
|
"twxs.cmake"
|
||||||
|
"vadimcn.vscode-lldb"
|
||||||
|
#"vscode-icons-team.vscode-icons"
|
||||||
|
"yzhang.markdown-all-in-one"
|
||||||
|
"znck.grammarly"
|
||||||
|
#"llvm-vs-code-extensions.vscode-clangd"
|
||||||
|
#"eamodio.gitlens"
|
||||||
|
"asvetliakov.vscode-neovim"
|
||||||
|
#"golang.go"
|
||||||
|
#"jnoortheen.nix-ide"
|
||||||
|
#"ms-python.vscode-pylance"
|
||||||
|
#"mspython.debugpy"
|
||||||
|
#"scala-lang.scala"
|
||||||
|
#"scalameta.metals"
|
||||||
|
#"vscodevim.vim"
|
||||||
|
];
|
||||||
|
# Enable previews
|
||||||
|
previews = {
|
||||||
|
enable = true;
|
||||||
|
previews = {
|
||||||
|
# web = {
|
||||||
|
# # Example: run "npm run dev" with PORT set to IDX's defined port for previews,
|
||||||
|
# # and show it in IDX's web preview panel
|
||||||
|
# command = ["npm" "run" "dev"];
|
||||||
|
# manager = "web";
|
||||||
|
# env = {
|
||||||
|
# # Environment variables to set for your server
|
||||||
|
# PORT = "$PORT";
|
||||||
|
# };
|
||||||
|
# };
|
||||||
|
};
|
||||||
|
};
|
||||||
|
# Workspace lifecycle hooks
|
||||||
|
workspace = {
|
||||||
|
# Runs when a workspace is first created
|
||||||
|
onCreate = {
|
||||||
|
# Example: install JS dependencies from NPM
|
||||||
|
# npm-install = 'npm install';
|
||||||
|
};
|
||||||
|
onStart = {
|
||||||
|
# Example: start a background task to watch and re-build backend code
|
||||||
|
# watch-backend = "npm run watch-backend";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
94
CMakeLists.txt
Normal file
94
CMakeLists.txt
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
cmake_minimum_required(VERSION 3.27)
|
||||||
|
|
||||||
|
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR)
|
||||||
|
message(FATAL_ERROR "Do not build in-source. Please remove CMakeCache.txt and the CMakeFiles/ directory. Then build out-of-source.")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
project(sparsemap LANGUAGES C)
|
||||||
|
|
||||||
|
set(CMAKE_C_STANDARD 11)
|
||||||
|
set(CMAKE_C_STANDARD_REQUIRED ON)
|
||||||
|
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||||
|
set(CMAKE_C_OUTPUT_EXTENSION .o)
|
||||||
|
|
||||||
|
# Set source and header file locations
|
||||||
|
set(SOURCE_DIR src)
|
||||||
|
set(HEADER_DIR include)
|
||||||
|
|
||||||
|
set(COMMON_CMAKE_C_FLAGS "-Wall -Wextra -Wpedantic")
|
||||||
|
set(CMAKE_C_FLAGS_DEBUG "-DSPARSEMAP_DIAGNOSTIC -DDEBUG -g -O0")
|
||||||
|
set(CMAKE_C_FLAGS_PROFILE "-DSPARSEMAP_DIAGNOSTIC -DDEBUG -g -Og -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope")
|
||||||
|
set(CMAKE_C_FLAGS_RELEASE "-Ofast")
|
||||||
|
|
||||||
|
# Include all header files from the header directory
|
||||||
|
file(GLOB_RECURSE HEADERS CONFIGURE_FILES ${HEADER_DIR}/*.h)
|
||||||
|
|
||||||
|
# Configure library sources
|
||||||
|
set(LIB_SRC
|
||||||
|
${SOURCE_DIR}/sparsemap.c
|
||||||
|
)
|
||||||
|
|
||||||
|
# Option to control building shared/static libraries
|
||||||
|
option(BUILD_SHARED_LIBS "Build shared libraries" ON)
|
||||||
|
|
||||||
|
# Add shared library
|
||||||
|
add_library(sparsemap_SHARED SHARED ${LIB_SRC} ${HEADERS})
|
||||||
|
# Set target properties for the shared library (adjust if needed)
|
||||||
|
set_target_properties(sparsemap_SHARED PROPERTIES
|
||||||
|
VERSION 1.0.0 # Set library version
|
||||||
|
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" # Set output directory
|
||||||
|
COMPILE_FLAGS "${CMAKE_C_FLAGS_${CMAKE_CURRENT_LIST_MODE}}"
|
||||||
|
)
|
||||||
|
target_include_directories(sparsemap_SHARED PRIVATE ${HEADER_DIR})
|
||||||
|
|
||||||
|
# Add static library
|
||||||
|
add_library(sparsemap STATIC ${LIB_SRC} ${HEADERS})
|
||||||
|
# Set target properties for static library (adjust if needed)
|
||||||
|
set_target_properties(sparsemap PROPERTIES
|
||||||
|
OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" # Set output directory
|
||||||
|
)
|
||||||
|
target_include_directories(sparsemap PRIVATE ${HEADER_DIR})
|
||||||
|
|
||||||
|
# Add ex_1 program
|
||||||
|
add_executable(ex_1 examples/ex_1.c tests/munit.c lib/common.c)
|
||||||
|
target_link_libraries(ex_1 PRIVATE sparsemap)
|
||||||
|
target_include_directories(ex_1 PRIVATE ${HEADER_DIR})
|
||||||
|
add_custom_target(run_ex_1 COMMAND ex_1 WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
|
||||||
|
|
||||||
|
# Add ex_2 program
|
||||||
|
add_executable(ex_2 examples/ex_2.c tests/munit.c lib/common.c)
|
||||||
|
target_link_libraries(ex_2 PRIVATE sparsemap)
|
||||||
|
target_include_directories(ex_2 PRIVATE ${HEADER_DIR})
|
||||||
|
add_custom_target(run_ex_2 COMMAND ex_2 WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
|
||||||
|
|
||||||
|
# Add ex_3 program
|
||||||
|
add_executable(ex_3 examples/ex_3.c tests/munit.c lib/common.c)
|
||||||
|
target_link_libraries(ex_3 PRIVATE sparsemap)
|
||||||
|
target_include_directories(ex_3 PRIVATE ${HEADER_DIR})
|
||||||
|
add_custom_target(run_ex_3 COMMAND ex_3 WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
|
||||||
|
|
||||||
|
# Add ex_4 program
|
||||||
|
add_executable(ex_4 examples/ex_4.c tests/munit.c lib/common.c)
|
||||||
|
target_link_libraries(ex_4 PRIVATE sparsemap)
|
||||||
|
target_include_directories(ex_4 PRIVATE ${HEADER_DIR})
|
||||||
|
add_custom_target(run_ex_4 COMMAND ex_4 WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
|
||||||
|
|
||||||
|
# Add test program
|
||||||
|
add_executable(test tests/test.c tests/munit.c lib/common.c)
|
||||||
|
target_link_libraries(test PRIVATE sparsemap)
|
||||||
|
target_include_directories(test PRIVATE ${HEADER_DIR})
|
||||||
|
add_custom_target(run_test COMMAND test WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
|
||||||
|
|
||||||
|
# Add soak program
|
||||||
|
add_executable(soak tests/soak.c lib/common.c lib/tdigest.c lib/roaring.c)
|
||||||
|
target_link_libraries(soak PRIVATE sparsemap)
|
||||||
|
target_include_directories(soak PRIVATE ${HEADER_DIR} lib)
|
||||||
|
target_link_libraries(soak PUBLIC m)
|
||||||
|
add_custom_target(run_soak COMMAND soak WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
|
||||||
|
|
||||||
|
# Add fuzzer program
|
||||||
|
# add_executable(fuzzer tests/fuzzer.c)
|
||||||
|
# target_link_libraries(fuzzer PRIVATE sparsemap)
|
||||||
|
# target_include_directories(fuzzer PRIVATE ${HEADER_DIR} lib)
|
||||||
|
# target_link_libraries(fuzzer PUBLIC m)
|
||||||
|
# add_custom_target(run_fuzzer COMMAND fuzzer WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
|
69
Makefile
69
Makefile
|
@ -3,16 +3,24 @@ OBJS = sparsemap.o
|
||||||
STATIC_LIB = libsparsemap.a
|
STATIC_LIB = libsparsemap.a
|
||||||
SHARED_LIB = libsparsemap.so
|
SHARED_LIB = libsparsemap.so
|
||||||
|
|
||||||
|
LIBS = -lm
|
||||||
#CFLAGS = -Wall -Wextra -Wpedantic -Of -std=c11 -Iinclude/ -fPIC
|
#CFLAGS = -Wall -Wextra -Wpedantic -Of -std=c11 -Iinclude/ -fPIC
|
||||||
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
|
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
|
||||||
CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DSPARSEMAP_ASSERT -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
|
#CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -O0 -g -std=c11 -Iinclude/ -fPIC
|
||||||
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC
|
CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Ofast -g -std=c11 -Iinclude/ -fPIC
|
||||||
|
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
|
||||||
|
#CFLAGS = -Wall -Wextra -Wpedantic -Ofast -g -std=c11 -Iinclude/ -fPIC
|
||||||
|
#CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC
|
||||||
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -fsanitize=all -fhardened -std=c11 -Iinclude/ -fPIC
|
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -fsanitize=all -fhardened -std=c11 -Iinclude/ -fPIC
|
||||||
|
|
||||||
TEST_FLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -Itests/ -fPIC
|
#TEST_FLAGS = -DDEBUG -Wall -Wextra -Wpedantic -O0 -g -std=c11 -Iinclude/ -Itests/ -fPIC
|
||||||
|
TEST_FLAGS = -Wall -Wextra -Wpedantic -Ofast -g -std=c11 -Iinclude/ -Itests/ -fPIC
|
||||||
|
#TEST_FLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -Itests/ -fPIC
|
||||||
|
#TEST_FLAGS = -DDEBUG -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC
|
||||||
|
|
||||||
TESTS = tests/test
|
TESTS = tests/test tests/soak
|
||||||
TEST_OBJS = tests/test.o tests/munit.o tests/common.o
|
TEST_OBJS = tests/test.o lib/munit.o lib/tdigest.o lib/common.o
|
||||||
|
LIB_OBJS = lib/munit.o lib/tdigest.o lib/common.o lib/roaring.o
|
||||||
EXAMPLES = examples/ex_1 examples/ex_2 examples/ex_3 examples/ex_4
|
EXAMPLES = examples/ex_1 examples/ex_2 examples/ex_3 examples/ex_4
|
||||||
|
|
||||||
.PHONY: all shared static clean test examples mls
|
.PHONY: all shared static clean test examples mls
|
||||||
|
@ -29,17 +37,23 @@ $(STATIC_LIB): $(OBJS)
|
||||||
$(SHARED_LIB): $(OBJS)
|
$(SHARED_LIB): $(OBJS)
|
||||||
$(CC) $(CFLAGS) -o $@ $? -shared
|
$(CC) $(CFLAGS) -o $@ $? -shared
|
||||||
|
|
||||||
examples: $(STATIC_LIB) $(EXAMPLES) examples/common.o
|
examples: $(STATIC_LIB) $(EXAMPLES) $(TEST_OBJS)
|
||||||
|
|
||||||
mls: examples/mls
|
mls: examples/mls
|
||||||
|
|
||||||
test: $(TESTS)
|
tests: $(TESTS)
|
||||||
|
|
||||||
check: test
|
test: tests
|
||||||
env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/test
|
env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/test
|
||||||
|
|
||||||
tests/test: $(TEST_OBJS) $(STATIC_LIB)
|
soak: tests
|
||||||
$(CC) $^ -o $@ $(TEST_FLAGS)
|
env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/soak
|
||||||
|
|
||||||
|
fuzzer: tests
|
||||||
|
env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/fuzzer ./crash.case
|
||||||
|
|
||||||
|
tests/test: $(TEST_OBJS) $(LIB_OBJS) $(STATIC_LIB)
|
||||||
|
$(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f $(OBJS)
|
rm -f $(OBJS)
|
||||||
|
@ -49,31 +63,44 @@ clean:
|
||||||
rm -f $(EXAMPLES) examples/*.o
|
rm -f $(EXAMPLES) examples/*.o
|
||||||
|
|
||||||
format:
|
format:
|
||||||
clang-format -i src/sparsemap.c include/sparsemap.h examples/ex_*.c tests/test.c tests/common.c tests/common.h
|
clang-format -i src/sparsemap.c include/sparsemap.h examples/ex_*.c tests/soak.c tests/test.c tests/midl.c lib/common.c include/common.h
|
||||||
# clang-format -i include/*.h src/*.c tests/*.c tests/*.h examples/*.c
|
# clang-format -i include/*.h src/*.c tests/*.c tests/*.h examples/*.c
|
||||||
|
|
||||||
%.o: src/%.c
|
%.o: src/%.c
|
||||||
$(CC) $(CFLAGS) -c -o $@ $^
|
$(CC) $(CFLAGS) -c -o $@ $^
|
||||||
|
|
||||||
|
lib/%.o: tests/%.c
|
||||||
|
$(CC) $(CFLAGS) -c -o $@ $^
|
||||||
|
|
||||||
tests/%.o: tests/%.c
|
tests/%.o: tests/%.c
|
||||||
$(CC) $(CFLAGS) -c -o $@ $^
|
$(CC) $(CFLAGS) -c -o $@ $^
|
||||||
|
|
||||||
examples/%.o: examples/%.c
|
examples/%.o: examples/%.c
|
||||||
$(CC) $(CFLAGS) -c -o $@ $^
|
$(CC) $(CFLAGS) -c -o $@ $^
|
||||||
|
|
||||||
examples/common.o: tests/common.c
|
examples/ex_1: $(LIB_OBJS) examples/ex_1.o $(STATIC_LIB)
|
||||||
$(CC) $(CFLAGS) -c -o $@ $^
|
$(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS)
|
||||||
|
|
||||||
examples/ex_1: examples/common.o examples/ex_1.o $(STATIC_LIB)
|
examples/ex_2: $(LIB_OBJS) examples/ex_2.o $(STATIC_LIB)
|
||||||
$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS)
|
$(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS)
|
||||||
|
|
||||||
examples/ex_2: examples/common.o examples/ex_2.o $(STATIC_LIB)
|
examples/ex_3: $(LIB_OBJS) examples/ex_3.o $(STATIC_LIB)
|
||||||
$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS)
|
$(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS)
|
||||||
|
|
||||||
examples/ex_3: examples/common.o examples/ex_3.o $(STATIC_LIB)
|
examples/ex_4: $(LIB_OBJS) examples/ex_4.o $(STATIC_LIB)
|
||||||
$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS)
|
$(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS)
|
||||||
|
|
||||||
examples/ex_4: examples/common.o examples/ex_4.o $(STATIC_LIB)
|
tests/soak: $(LIB_OBJS) tests/soak.o $(STATIC_LIB)
|
||||||
$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS)
|
$(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS)
|
||||||
|
|
||||||
|
tests/fuzzer: $(LIB_OBJS) tests/fuzzer.o $(STATIC_LIB)
|
||||||
|
$(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS) -DFUZZ_DEBUG
|
||||||
|
|
||||||
|
todo:
|
||||||
|
rg -i 'todo|gsb|abort'
|
||||||
|
|
||||||
# cp src/sparsemap.c /tmp && clang-tidy src/sparsemap.c -fix -fix-errors -checks="readability-braces-around-statements" -- -DDEBUG -DSPARSEMAP_DIAGNOSTIC -DSPARSEMAP_ASSERT -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
|
# cp src/sparsemap.c /tmp && clang-tidy src/sparsemap.c -fix -fix-errors -checks="readability-braces-around-statements" -- -DDEBUG -DSPARSEMAP_DIAGNOSTIC -DSPARSEMAP_ASSERT -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
|
||||||
|
|
||||||
|
# clear; make clean examples test && env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/test
|
||||||
|
|
||||||
|
# clear; make clean examples test && env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./examples/soak
|
||||||
|
|
37
README.md
37
README.md
|
@ -1,8 +1,12 @@
|
||||||
# Sparsemap
|
# Sparsemap
|
||||||
|
|
||||||
`sparsemap` is a sparse, compressed bitmap. In best case, it can store 2048
|
Bitsets, also called bitmaps, are commonly used as fast data structures.
|
||||||
bits in just 8 bytes. In worst case, it stores the 2048 bits uncompressed and
|
Unfortunately, they can use too much memory. To compensate, we often use
|
||||||
requires additional 8 bytes of overhead.
|
compressed bitmaps.
|
||||||
|
|
||||||
|
`sparsemap` is a sparse, compressed bitmap. In the best case, it can store 2048
|
||||||
|
bits in just 8 bytes. In the worst case, it stores the 2048 bits uncompressed and
|
||||||
|
requires an additional 8 bytes of overhead.
|
||||||
|
|
||||||
The "best" case happens when large consecutive sequences of the bits are
|
The "best" case happens when large consecutive sequences of the bits are
|
||||||
either set ("1") or not set ("0"). If your numbers are consecutive 64bit
|
either set ("1") or not set ("0"). If your numbers are consecutive 64bit
|
||||||
|
@ -14,9 +18,9 @@ On the lowest level stores bits in sm_bitvec_t's (a uint32_t or uint64_t).
|
||||||
|
|
||||||
Each sm_bitvec_t has an additional descriptor (2 bits). A single word prepended
|
Each sm_bitvec_t has an additional descriptor (2 bits). A single word prepended
|
||||||
to each sm_bitvec_t describes its condition. The descriptor word and the
|
to each sm_bitvec_t describes its condition. The descriptor word and the
|
||||||
sm_bitvec_t's have the same size.) The descriptor of a sm_bitvec_t
|
sm_bitvec_t's have the same size. The descriptor of a sm_bitvec_t
|
||||||
specifies whether the sm_bitvec_t consists only of set bits ("1"), unset
|
specifies whether the sm_bitvec_t consists only of set bits ("1"), unset
|
||||||
bits ("0") or has a mixed payload. In the first and second case the
|
bits ("0") or has a mixed payload. In the first and second cases, the
|
||||||
sm_bitvec_t is not stored.
|
sm_bitvec_t is not stored.
|
||||||
|
|
||||||
An example shows a sequence of 4 x 16 bits (here, each sm_bitvec_t and the
|
An example shows a sequence of 4 x 16 bits (here, each sm_bitvec_t and the
|
||||||
|
@ -27,7 +31,7 @@ Descriptor word has 16 bits):
|
||||||
^^ ^^ ^^ ^^-- sm_bitvec_t #0 - #3 are "0000000000000000"
|
^^ ^^ ^^ ^^-- sm_bitvec_t #0 - #3 are "0000000000000000"
|
||||||
^^-- sm_bitvec_t #4 is "1111111111111111"
|
^^-- sm_bitvec_t #4 is "1111111111111111"
|
||||||
^^-- sm_bitvec_t #5 is "0000000000000000"
|
^^-- sm_bitvec_t #5 is "0000000000000000"
|
||||||
^^-- sm_bitvec_t #7 is "1111111111111111"
|
^^-- sm_bitvec_t #6 is "1111111111111111"
|
||||||
^^-- sm_bitvec_t #7 is "0110010101111001"
|
^^-- sm_bitvec_t #7 is "0110010101111001"
|
||||||
|
|
||||||
Since the first 7 sm_bitvec_t's are either all "1" or "0" they are not stored.
|
Since the first 7 sm_bitvec_t's are either all "1" or "0" they are not stored.
|
||||||
|
@ -36,29 +40,34 @@ The actual memory sequence looks like this:
|
||||||
0000000011001110 0110010101111001
|
0000000011001110 0110010101111001
|
||||||
|
|
||||||
Instead of storing 8 Words (16 bytes), we only store 2 Words (2 bytes): one
|
Instead of storing 8 Words (16 bytes), we only store 2 Words (2 bytes): one
|
||||||
for the descriptor, one for last sm_bitvec_t #7.
|
for the descriptor, and one for the last sm_bitvec_t #7.
|
||||||
|
|
||||||
The sparsemap stores a list of chunk maps, and for each chunk map it stores the
|
The sparsemap stores a list of chunk maps, and for each chunk map, it stores the
|
||||||
absolute address (i.e. if the user sets bit 0 and bit 10000, and the chunk map
|
absolute address (i.e. if the user sets bit 0 and bit 10000, and the chunk map
|
||||||
capacity is 2048, the sparsemap creates two chunk maps; the first starts at
|
capacity is 2048, the sparsemap creates two chunk maps; the first starts at
|
||||||
offset 0, the second starts at offset 8192).
|
offset 0, the second starts at offset 8192).
|
||||||
|
|
||||||
## Usage instructions
|
## Usage instructions
|
||||||
|
|
||||||
The file `examples/ex_1.c` has example code.
|
Copy the files `src/sparsemap.c` and `include/sparsemap.h` into your project.
|
||||||
|
Review the `examples/*` and `tests/*` code.
|
||||||
|
|
||||||
## Final words
|
## Final words
|
||||||
|
|
||||||
This bitmap has efficient compression when used on long sequences of set (or
|
This bitmap has efficient compression when used on long sequences of set (or
|
||||||
unset) bits (i.e. with a word size of 64bit, and a payload of consecutive
|
unset) bits (i.e. with a word size of 64 bit and a payload of consecutive
|
||||||
numbers without gaps, the payload of 2048 x sizeof(uint64_t) = 16kb will occupy
|
numbers without gaps, the payload of 2048 x sizeof(uint64_t) = 16kb will occupy
|
||||||
only 8 bytes!
|
only 8 bytes!).
|
||||||
|
|
||||||
However, if the sequence is not consecutive and has gaps, it's possible that
|
However, if the sequence is not consecutive and has gaps, it's possible that
|
||||||
the compression is inefficient, and the size (in the worst case) is identical
|
the compression is inefficient, and the size (in the worst case) is identical
|
||||||
to an uncompressed bit vector (sometimes higher due to the bytes required for
|
to an uncompressed bit vector (sometimes higher due to the bytes required for
|
||||||
metadata). In such cases, other compression schemes are more efficient (i.e.
|
metadata). In such cases, other compression schemes are more efficient (i.e.
|
||||||
http://lemire.me/blog/archives/2008/08/20/the-mythical-bitmap-index/).
|
http://lemire.me/blog/archives/2008/08/20/the-mythical-bitmap-index/). We
|
||||||
|
include in `lib` the amalgamated (git `2dc8070`) and well-known
|
||||||
|
[Roaring Bitmaps](https://github.com/RoaringBitmap/CRoaring/tree/master) and
|
||||||
|
use it in the soak test to ensure our results are as accurate as theirs.
|
||||||
|
|
||||||
This library was originally created for hamsterdb [http://hamsterdb.com] in
|
This library was originally created by [Christoph Rupp](https://crupp.de) in
|
||||||
C++ and then translated to C99 code by Greg Burd <greg@burd.me>.
|
C++ and then translated to C and further improved by Greg Burd <greg@burd.me>
|
||||||
|
for use in LMDB and OpenLDAP.
|
||||||
|
|
14
cmake-it.sh
Executable file
14
cmake-it.sh
Executable file
|
@ -0,0 +1,14 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
target=${1:-Debug}
|
||||||
|
set targets="Debug Profile Release"
|
||||||
|
case "$target" in
|
||||||
|
$targets*) echo "Building ${target}..." ;;
|
||||||
|
*) echo "Unknown target ${target}, exiting." ;;
|
||||||
|
esac
|
||||||
|
name=${target,,}
|
||||||
|
echo $name
|
||||||
|
rm -rf "./cmake-build-${name}-system" && \
|
||||||
|
cmake -DCMAKE_BUILD_TYPE=${target} -DCMAKE_MAKE_PROGRAM=ninja -DCMAKE_C_COMPILER=clang -G Ninja -S "${PWD}" -B "${PWD}/cmake-build-${name}-system" && \
|
||||||
|
(cd "${PWD}/cmake-build-${name}-system" && ninja)
|
||||||
|
|
|
@ -1,4 +1,7 @@
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "../include/sparsemap.h"
|
#include "../include/sparsemap.h"
|
||||||
|
@ -14,16 +17,18 @@
|
||||||
|
|
||||||
/* !!! Duplicated here for testing purposes. Keep in sync, or suffer. !!! */
|
/* !!! Duplicated here for testing purposes. Keep in sync, or suffer. !!! */
|
||||||
struct sparsemap {
|
struct sparsemap {
|
||||||
uint8_t *m_data;
|
|
||||||
size_t m_capacity;
|
size_t m_capacity;
|
||||||
size_t m_data_used;
|
size_t m_data_used;
|
||||||
|
uint8_t *m_data;
|
||||||
};
|
};
|
||||||
|
|
||||||
int
|
int
|
||||||
main()
|
main()
|
||||||
{
|
{
|
||||||
size_t size = 4;
|
size_t size = 4;
|
||||||
setbuf(stderr, 0); // disable buffering
|
setvbuf(stdout, NULL, _IONBF, 0); // Disable buffering for stdout
|
||||||
|
setvbuf(stderr, NULL, _IONBF, 0); // Disable buffering for stdout
|
||||||
|
|
||||||
__diag("Please wait a moment...");
|
__diag("Please wait a moment...");
|
||||||
sparsemap_t mmap, *map = &mmap;
|
sparsemap_t mmap, *map = &mmap;
|
||||||
uint8_t buffer[1024];
|
uint8_t buffer[1024];
|
||||||
|
@ -135,7 +140,7 @@ main()
|
||||||
sparsemap_set(map, i, true);
|
sparsemap_set(map, i, true);
|
||||||
}
|
}
|
||||||
for (int i = 0; i < 100000; i++) {
|
for (int i = 0; i < 100000; i++) {
|
||||||
assert(sparsemap_select(map, i) == (unsigned)i);
|
assert(sparsemap_select(map, i, true) == (unsigned)i);
|
||||||
}
|
}
|
||||||
|
|
||||||
sparsemap_clear(map);
|
sparsemap_clear(map);
|
||||||
|
@ -145,7 +150,7 @@ main()
|
||||||
sparsemap_set(map, i, true);
|
sparsemap_set(map, i, true);
|
||||||
}
|
}
|
||||||
for (int i = 1; i < 513; i++) {
|
for (int i = 1; i < 513; i++) {
|
||||||
assert(sparsemap_select(map, i - 1) == (unsigned)i);
|
assert(sparsemap_select(map, i - 1, true) == (unsigned)i);
|
||||||
}
|
}
|
||||||
|
|
||||||
sparsemap_clear(map);
|
sparsemap_clear(map);
|
||||||
|
@ -155,7 +160,7 @@ main()
|
||||||
sparsemap_set(map, i * 10, true);
|
sparsemap_set(map, i * 10, true);
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < 8; i++) {
|
for (size_t i = 0; i < 8; i++) {
|
||||||
assert(sparsemap_select(map, i) == i * 10);
|
assert(sparsemap_select(map, i, true) == (sparsemap_idx_t)i * 10);
|
||||||
}
|
}
|
||||||
|
|
||||||
// split and move, aligned to MiniMap capacity
|
// split and move, aligned to MiniMap capacity
|
||||||
|
@ -181,16 +186,18 @@ main()
|
||||||
sparsemap_clear(map);
|
sparsemap_clear(map);
|
||||||
for (int i = 0; i < 2048 * 3; i++) {
|
for (int i = 0; i < 2048 * 3; i++) {
|
||||||
sparsemap_set(map, i, true);
|
sparsemap_set(map, i, true);
|
||||||
|
assert(sparsemap_is_set(map, i) == true);
|
||||||
}
|
}
|
||||||
sparsemap_split(map, 64, sm2);
|
sparsemap_split(map, 64, sm2);
|
||||||
for (int i = 0; i < 64; i++) {
|
for (int i = 0; i < 2048 * 3; i++) {
|
||||||
|
if (i < 64) {
|
||||||
assert(sparsemap_is_set(map, i) == true);
|
assert(sparsemap_is_set(map, i) == true);
|
||||||
assert(sparsemap_is_set(sm2, i) == false);
|
assert(sparsemap_is_set(sm2, i) == false);
|
||||||
}
|
} else {
|
||||||
for (int i = 64; i < 2048 * 3; i++) {
|
|
||||||
assert(sparsemap_is_set(map, i) == false);
|
assert(sparsemap_is_set(map, i) == false);
|
||||||
assert(sparsemap_is_set(sm2, i) == true);
|
assert(sparsemap_is_set(sm2, i) == true);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fprintf(stderr, " ok\n");
|
fprintf(stderr, " ok\n");
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,9 +1,7 @@
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <stdarg.h>
|
#include <stdbool.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <time.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
|
|
||||||
#include "../include/sparsemap.h"
|
#include "../include/sparsemap.h"
|
||||||
|
|
||||||
|
@ -16,32 +14,31 @@
|
||||||
} while (0)
|
} while (0)
|
||||||
#pragma GCC diagnostic pop
|
#pragma GCC diagnostic pop
|
||||||
|
|
||||||
#define SEED
|
|
||||||
|
|
||||||
int
|
int
|
||||||
main(void)
|
main(void)
|
||||||
{
|
{
|
||||||
int i = 0;
|
int i;
|
||||||
|
|
||||||
// disable buffering
|
// disable buffering
|
||||||
setbuf(stderr, 0);
|
setvbuf(stdout, NULL, _IONBF, 0); // Disable buffering for stdout
|
||||||
|
setvbuf(stderr, NULL, _IONBF, 0); // Disable buffering for stdout
|
||||||
|
|
||||||
// start with a 1KiB buffer, 1024 bits
|
// start with a 1KiB buffer, 1024 bits
|
||||||
uint8_t *buf = calloc(1024, sizeof(uint8_t));
|
uint8_t *buf = calloc(1024, sizeof(uint8_t));
|
||||||
|
|
||||||
// create the sparse bitmap
|
// create the sparse bitmap
|
||||||
sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 1024);
|
sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 1024);
|
||||||
|
|
||||||
// Set every other bit (pathologically worst case) to see what happens
|
// Set every other bit (pathologically worst case) to see what happens
|
||||||
// when the map is full.
|
// when the map is full.
|
||||||
for (i = 0; i < 7744; i++) {
|
for (i = 0; i < 7744; i++) {
|
||||||
if (i % 2)
|
if (!i % 2) {
|
||||||
continue;
|
|
||||||
sparsemap_set(map, i, true);
|
sparsemap_set(map, i, true);
|
||||||
assert(sparsemap_is_set(map, i) == true);
|
assert(sparsemap_is_set(map, i) == true);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
// On 1024 KiB of buffer with every other bit set the map holds 7744 bits
|
// On 1024 KiB of buffer with every other bit set the map holds 7744 bits
|
||||||
// and then runs out of space. This next _set() call will fail/abort.
|
// and then runs out of space. This next _set() call will fail.
|
||||||
sparsemap_set(map, ++i, true);
|
sparsemap_set(map, ++i, true);
|
||||||
assert(sparsemap_is_set(map, i) == true);
|
assert(sparsemap_is_set(map, i) == true);
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -1,17 +1,14 @@
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <stdarg.h>
|
#include <common.h>
|
||||||
|
#include <sparsemap.h>
|
||||||
|
#include <stdbool.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <time.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
|
|
||||||
#include "../include/sparsemap.h"
|
|
||||||
#include "../tests/common.h"
|
|
||||||
|
|
||||||
int
|
int
|
||||||
main(void)
|
main(void)
|
||||||
{
|
{
|
||||||
int i = 0;
|
int i;
|
||||||
int array[1024] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
|
int array[1024] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
|
||||||
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
|
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
|
||||||
77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
|
77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
|
||||||
|
@ -60,7 +57,7 @@ main(void)
|
||||||
uint8_t *buf = calloc(1024, sizeof(uint8_t));
|
uint8_t *buf = calloc(1024, sizeof(uint8_t));
|
||||||
|
|
||||||
// create the sparse bitmap
|
// create the sparse bitmap
|
||||||
sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 1024);
|
sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 1024);
|
||||||
|
|
||||||
// set all the bits on in a random order
|
// set all the bits on in a random order
|
||||||
for (i = 0; i < 1024; i++) {
|
for (i = 0; i < 1024; i++) {
|
||||||
|
|
|
@ -1,11 +1,10 @@
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include <common.h>
|
||||||
|
#include <sparsemap.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
#include "../include/sparsemap.h"
|
|
||||||
#include "../tests/common.h"
|
|
||||||
|
|
||||||
#define TEST_ARRAY_SIZE 1024
|
#define TEST_ARRAY_SIZE 1024
|
||||||
|
|
||||||
int
|
int
|
||||||
|
@ -24,7 +23,7 @@ main(void)
|
||||||
uint8_t *buf = calloc((size_t)3 * 1024, sizeof(uint8_t));
|
uint8_t *buf = calloc((size_t)3 * 1024, sizeof(uint8_t));
|
||||||
|
|
||||||
// create the sparse bitmap
|
// create the sparse bitmap
|
||||||
sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 3 * 1024);
|
sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 3 * 1024);
|
||||||
|
|
||||||
// create an array of ints
|
// create an array of ints
|
||||||
setup_test_array(array, TEST_ARRAY_SIZE, 1024 * 3);
|
setup_test_array(array, TEST_ARRAY_SIZE, 1024 * 3);
|
||||||
|
@ -60,7 +59,7 @@ main(void)
|
||||||
assert(sparsemap_is_set(map, array[i]) == true);
|
assert(sparsemap_is_set(map, array[i]) == true);
|
||||||
}
|
}
|
||||||
has_span(map, array, TEST_ARRAY_SIZE, (int)len);
|
has_span(map, array, TEST_ARRAY_SIZE, (int)len);
|
||||||
size_t l = sparsemap_span(map, 0, len);
|
size_t l = sparsemap_span(map, 0, len, true);
|
||||||
if (l != (size_t)-1) {
|
if (l != (size_t)-1) {
|
||||||
__diag("Found span in map starting at %lu of length %lu\n", l, len);
|
__diag("Found span in map starting at %lu of length %lu\n", l, len);
|
||||||
__diag("is_span(%lu, %lu) == %s\n", l, len, is_span(array, TEST_ARRAY_SIZE, l, len) ? "yes" : "no");
|
__diag("is_span(%lu, %lu) == %s\n", l, len, is_span(array, TEST_ARRAY_SIZE, l, len) ? "yes" : "no");
|
||||||
|
|
48
flake.lock
48
flake.lock
|
@ -1,43 +1,25 @@
|
||||||
{
|
{
|
||||||
"nodes": {
|
"nodes": {
|
||||||
"flake-utils": {
|
|
||||||
"inputs": {
|
|
||||||
"systems": "systems"
|
|
||||||
},
|
|
||||||
"locked": {
|
|
||||||
"lastModified": 1710146030,
|
|
||||||
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
|
|
||||||
"owner": "numtide",
|
|
||||||
"repo": "flake-utils",
|
|
||||||
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
|
|
||||||
"type": "github"
|
|
||||||
},
|
|
||||||
"original": {
|
|
||||||
"owner": "numtide",
|
|
||||||
"repo": "flake-utils",
|
|
||||||
"type": "github"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1712192574,
|
"lastModified": 1701282334,
|
||||||
"narHash": "sha256-LbbVOliJKTF4Zl2b9salumvdMXuQBr2kuKP5+ZwbYq4=",
|
"narHash": "sha256-MxCVrXY6v4QmfTwIysjjaX0XUhqBbxTWWB4HXtDYsdk=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "f480f9d09e4b4cf87ee6151eba068197125714de",
|
"rev": "057f9aecfb71c4437d2b27d3323df7f93c010b7e",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"ref": "nixpkgs-unstable",
|
"ref": "23.11",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"root": {
|
"root": {
|
||||||
"inputs": {
|
"inputs": {
|
||||||
"flake-utils": "flake-utils",
|
"nixpkgs": "nixpkgs",
|
||||||
"nixpkgs": "nixpkgs"
|
"utils": "utils"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"systems": {
|
"systems": {
|
||||||
|
@ -54,6 +36,24 @@
|
||||||
"repo": "default",
|
"repo": "default",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
"utils": {
|
||||||
|
"inputs": {
|
||||||
|
"systems": "systems"
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1710146030,
|
||||||
|
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"root": "root",
|
"root": "root",
|
||||||
|
|
50
flake.nix
50
flake.nix
|
@ -1,38 +1,30 @@
|
||||||
{
|
{
|
||||||
description = "A Concurrent Skip List library for key/value pairs.";
|
description = "A sparse bitmapped index library in C.";
|
||||||
|
|
||||||
inputs = {
|
inputs = {
|
||||||
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
|
# nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
|
||||||
flake-utils.url = "github:numtide/flake-utils";
|
nixpkgs.url = "github:NixOS/nixpkgs/23.11";
|
||||||
|
utils.url = "github:numtide/flake-utils";
|
||||||
};
|
};
|
||||||
|
|
||||||
outputs =
|
outputs = { self, nixpkgs, ... }
|
||||||
{ self
|
@inputs: inputs.utils.lib.eachSystem [
|
||||||
, nixpkgs
|
"x86_64-linux" "i686-linux" "aarch64-linux" "x86_64-darwin"
|
||||||
, flake-utils
|
] (system:
|
||||||
, ...
|
let pkgs = import nixpkgs {
|
||||||
}:
|
|
||||||
flake-utils.lib.eachDefaultSystem (system:
|
|
||||||
let
|
|
||||||
pkgs = import nixpkgs {
|
|
||||||
inherit system;
|
inherit system;
|
||||||
config = { allowUnfree = true; };
|
overlays = [];
|
||||||
|
config.allowUnfree = true;
|
||||||
};
|
};
|
||||||
supportedSystems = [ "x86_64-linux" ];
|
|
||||||
forAllSystems = nixpkgs.lib.genAttrs supportedSystems;
|
|
||||||
nixpkgsFor = forAllSystems (system: import nixpkgs {
|
|
||||||
inherit system;
|
|
||||||
overlays = [ self.overlay ];
|
|
||||||
});
|
|
||||||
in {
|
in {
|
||||||
pkgs = import nixpkgs {
|
flake-utils.inputs.systems.follows = "system";
|
||||||
inherit system;
|
devShell = pkgs.mkShell rec {
|
||||||
devShell = nixpkgs.legacyPackages.${system} {
|
name = "sparsemap";
|
||||||
pkgs.mkShell = {
|
packages = with pkgs; [
|
||||||
nativeBuildInputs = with pkgs.buildPackages; [
|
|
||||||
act
|
act
|
||||||
autoconf
|
autoconf
|
||||||
clang
|
clang
|
||||||
|
cmake
|
||||||
ed
|
ed
|
||||||
gcc
|
gcc
|
||||||
gdb
|
gdb
|
||||||
|
@ -40,20 +32,26 @@
|
||||||
graphviz-nox
|
graphviz-nox
|
||||||
libtool
|
libtool
|
||||||
m4
|
m4
|
||||||
|
ninja
|
||||||
perl
|
perl
|
||||||
pkg-config
|
pkg-config
|
||||||
python3
|
python3
|
||||||
ripgrep
|
ripgrep
|
||||||
valgrind
|
valgrind
|
||||||
];
|
];
|
||||||
|
|
||||||
buildInputs = with pkgs; [
|
buildInputs = with pkgs; [
|
||||||
libbacktrace
|
libbacktrace
|
||||||
glibc.out
|
glibc.out
|
||||||
glibc.static
|
glibc.static
|
||||||
];
|
];
|
||||||
|
|
||||||
|
shellHook = let
|
||||||
|
icon = "f121";
|
||||||
|
in ''
|
||||||
|
export PS1="$(echo -e '\u${icon}') {\[$(tput sgr0)\]\[\033[38;5;228m\]\w\[$(tput sgr0)\]\[\033[38;5;15m\]} (${name}) \\$ \[$(tput sgr0)\]"
|
||||||
|
'';
|
||||||
};
|
};
|
||||||
DOCKER_BUILDKIT = 1;
|
DOCKER_BUILDKIT = 1;
|
||||||
};
|
|
||||||
};
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
|
|
||||||
|
#include "../include/sparsemap.h"
|
||||||
|
|
||||||
#pragma GCC diagnostic push
|
#pragma GCC diagnostic push
|
||||||
#pragma GCC diagnostic ignored "-Wvariadic-macros"
|
#pragma GCC diagnostic ignored "-Wvariadic-macros"
|
||||||
#define __diag(...) \
|
#define __diag(...) \
|
||||||
|
@ -23,23 +25,12 @@
|
||||||
#define XORSHIFT_SEED_VALUE ((unsigned int)time(NULL) ^ getpid())
|
#define XORSHIFT_SEED_VALUE ((unsigned int)time(NULL) ^ getpid())
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define EST_MEDIAN_DECL(decl, size) \
|
|
||||||
uint64_t heap_##decl[size] = { 0 }; \
|
|
||||||
int heap_##decl##_max_size = size; \
|
|
||||||
int heap_##decl##_size = 0;
|
|
||||||
|
|
||||||
#define EST_MEDIAN_ADD(decl, value) est_insert_value(heap_##decl, heap_##decl##_max_size, &heap_##decl##_size, (value));
|
|
||||||
|
|
||||||
#define EST_MEDIAN_GET(decl) heap_##decl[0]
|
|
||||||
|
|
||||||
uint64_t tsc(void);
|
uint64_t tsc(void);
|
||||||
double tsc_ticks_to_ns(uint64_t tsc_ticks);
|
double tsc_ticks_to_ns(uint64_t tsc_ticks);
|
||||||
void est_sift_up(uint64_t *heap, int child_index);
|
double nsts(void);
|
||||||
void est_sift_down(uint64_t *heap, int heap_size, int parent_index);
|
|
||||||
void est_insert_value(uint64_t *heap, int heap_max_size, int *heap_size, uint64_t value);
|
|
||||||
|
|
||||||
void xorshift32_seed();
|
void xorshift32_seed(void);
|
||||||
uint32_t xorshift32();
|
uint32_t xorshift32(void);
|
||||||
|
|
||||||
void print_array(int *array, int l);
|
void print_array(int *array, int l);
|
||||||
void print_spans(int *array, int n);
|
void print_spans(int *array, int n);
|
||||||
|
@ -52,11 +43,18 @@ int is_unique(int a[], int l, int value);
|
||||||
void setup_test_array(int a[], int l, int max_value);
|
void setup_test_array(int a[], int l, int max_value);
|
||||||
void shuffle(int *array, size_t n);
|
void shuffle(int *array, size_t n);
|
||||||
int ensure_sequential_set(int a[], int l, int r);
|
int ensure_sequential_set(int a[], int l, int r);
|
||||||
int create_sequential_set_in_empty_map(sparsemap_t *map, int s, int r);
|
sparsemap_idx_t sm_add_span(sparsemap_t *map, int map_size, int span_length);
|
||||||
|
|
||||||
|
void print_bits(char *name, uint64_t value);
|
||||||
|
|
||||||
void bitmap_from_uint32(sparsemap_t *map, uint32_t number);
|
void bitmap_from_uint32(sparsemap_t *map, uint32_t number);
|
||||||
void bitmap_from_uint64(sparsemap_t *map, uint64_t number);
|
void sm_bitmap_from_uint64(sparsemap_t *map, int offset, uint64_t number);
|
||||||
uint32_t rank_uint64(uint64_t number, int n, int p);
|
uint32_t rank_uint64(uint64_t number, int n, int p);
|
||||||
int whats_set_uint64(uint64_t number, int bitPositions[64]);
|
int whats_set_uint64(uint64_t number, int bitPositions[64]);
|
||||||
|
|
||||||
void whats_set(sparsemap_t *map, int m);
|
void sm_whats_set(sparsemap_t *map, int off, int len);
|
||||||
|
|
||||||
|
bool sm_is_span(sparsemap_t *map, sparsemap_idx_t m, int len, bool value);
|
||||||
|
bool sm_occupied(sparsemap_t *map, sparsemap_idx_t m, int len, bool value);
|
||||||
|
|
||||||
|
char *bytes_as(double bytes, char *s, size_t size);
|
2908
include/roaring.h
Normal file
2908
include/roaring.h
Normal file
File diff suppressed because it is too large
Load diff
|
@ -69,14 +69,14 @@
|
||||||
#ifndef SPARSEMAP_H
|
#ifndef SPARSEMAP_H
|
||||||
#define SPARSEMAP_H
|
#define SPARSEMAP_H
|
||||||
|
|
||||||
#include <sys/types.h>
|
|
||||||
|
|
||||||
#include <assert.h>
|
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
#include <stddef.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
#if defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The public interface for a sparse bit-mapped index, a "sparse map".
|
* The public interface for a sparse bit-mapped index, a "sparse map".
|
||||||
|
@ -88,55 +88,299 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
typedef struct sparsemap sparsemap_t;
|
typedef struct sparsemap sparsemap_t;
|
||||||
|
typedef size_t sparsemap_idx_t;
|
||||||
|
#define SPARSEMAP_IDX_MAX SIZE_MAX
|
||||||
|
#define SPARSEMAP_FOUND(x) ((x) != SPARSEMAP_IDX_MAX)
|
||||||
|
#define SPARSEMAP_NOT_FOUND(x) ((x) == SPARSEMAP_IDX_MAX)
|
||||||
typedef uint32_t sm_idx_t;
|
typedef uint32_t sm_idx_t;
|
||||||
typedef uint64_t sm_bitvec_t;
|
typedef uint64_t sm_bitvec_t;
|
||||||
|
|
||||||
/* Allocate on a sparsemap_t on the heap and initialize it. */
|
/** @brief Allocate a new, empty sparsemap_t with a buffer of \b size on the
|
||||||
sparsemap_t *sparsemap(uint8_t *data, size_t size);
|
* heap to use for storage of bitmap data.
|
||||||
|
*
|
||||||
|
* The buffer used for the bitmap is allocated in the same heap allocation as
|
||||||
|
* the structure, this means that you only need to call free() on the returned
|
||||||
|
* object to free all resources. Using this method allows you to grow the
|
||||||
|
* buffer size by calling #sparsemap_set_data_size(). This function calls
|
||||||
|
* #sparsemap_init().
|
||||||
|
*
|
||||||
|
* @param[in] size The starting size of the buffer used for the bitmap, default
|
||||||
|
* is 1024 bytes.
|
||||||
|
* @returns The newly allocated sparsemap reference.
|
||||||
|
*/
|
||||||
|
sparsemap_t *sparsemap(size_t size);
|
||||||
|
|
||||||
/* Initialize sparsemap_t with data. */
|
/** @brief Allocate a new, copy of the \b other sparsemap_t.
|
||||||
|
*
|
||||||
|
* @param[in] other The sparsemap to copy.
|
||||||
|
*/
|
||||||
|
sparsemap_t *sparsemap_copy(sparsemap_t *other);
|
||||||
|
|
||||||
|
/** @brief Allocate a new, empty sparsemap_t that references (wraps) the buffer
|
||||||
|
* \b data of \b size bytes to use for storage of bitmap data.
|
||||||
|
*
|
||||||
|
* This function allocates a new sparsemap_t but not the buffer which is
|
||||||
|
* provided by the caller as \b data which can be allocated on the stack or
|
||||||
|
* heap. Caller is responsible for calling free() on the returned heap object
|
||||||
|
* and releasing the memory used for \b data. Resizing the buffer is only
|
||||||
|
* supported when the heap object for the map includes the buffer and the
|
||||||
|
* \b data offset supplied is relative to the object (see #sparsemap()).
|
||||||
|
*
|
||||||
|
* @param[in] data A heap or stack memory buffer of \b size for use storing
|
||||||
|
* bitmap data.
|
||||||
|
* @param[in] size The size of the buffer \b data used for the bitmap.
|
||||||
|
* @returns The newly allocated sparsemap reference.
|
||||||
|
*/
|
||||||
|
sparsemap_t *sparsemap_wrap(uint8_t *data, size_t size);
|
||||||
|
|
||||||
|
/** @brief Initialize an existing sparsemap_t by assigning \b data of \b size
|
||||||
|
* bytes for storage of bitmap data.
|
||||||
|
*
|
||||||
|
* Given the address of an existing \b map allocated on the stack or heap this
|
||||||
|
* function will initialize the data structure and use the provided \b data of
|
||||||
|
* \b size for bitmap data. Caller is responsible for all memory management.
|
||||||
|
* Resizing the buffer is not directly supported, you
|
||||||
|
* may resize it and call #sparsemap_set_data_size() and then ensure that should
|
||||||
|
* the address of the object changed you need to update it by calling #sparsemap_
|
||||||
|
* m_data field.
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @param[in] data A heap or stack memory buffer of \b size for use storing
|
||||||
|
* bitmap data.
|
||||||
|
* @param[in] size The size of the buffer \b data used for the bitmap.
|
||||||
|
*/
|
||||||
void sparsemap_init(sparsemap_t *map, uint8_t *data, size_t size);
|
void sparsemap_init(sparsemap_t *map, uint8_t *data, size_t size);
|
||||||
|
|
||||||
/* Clears the whole buffer. */
|
/** @brief Opens, without initializing, an existing sparsemap contained within
|
||||||
|
* the specified buffer.
|
||||||
|
*
|
||||||
|
* Given the address of an existing \b map this function will assign to the
|
||||||
|
* provided data structure \b data of \b size for bitmap data. Caller is
|
||||||
|
* responsible for all memory management. Use this when as a way to
|
||||||
|
* "deserialize" bytes and make them ready for use as a bitmap.
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @param[in] data A heap or stack memory buffer of \b size for use storing
|
||||||
|
* bitmap data.
|
||||||
|
* @param[in] size The size of the buffer \b data used for the bitmap.
|
||||||
|
*/
|
||||||
|
void sparsemap_open(sparsemap_t *map, uint8_t *data, size_t size);
|
||||||
|
|
||||||
|
/** @brief Resets values and empties the buffer making it ready to accept new
|
||||||
|
* data but does not free the memory.
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
*/
|
||||||
void sparsemap_clear(sparsemap_t *map);
|
void sparsemap_clear(sparsemap_t *map);
|
||||||
|
|
||||||
/* Opens an existing sparsemap at the specified buffer. */
|
/** @brief Update the size of the buffer \b data used for storing the bitmap.
|
||||||
void sparsemap_open(sparsemap_t *, uint8_t *data, size_t data_size);
|
*
|
||||||
|
* When called with \b data NULL on a \b map that was created with #sparsemap()
|
||||||
|
* this function will reallocate the storage for both the map and data possibly
|
||||||
|
* changing the address of the map itself so it is important for the caller to
|
||||||
|
* update all references to this map to the address returned in this scenario.
|
||||||
|
* Access to stale references will result in memory violations and program
|
||||||
|
* termination. Caller is not required to free() the old address, only the new
|
||||||
|
* one should it have changed. This uses #realloc() under the covers, all
|
||||||
|
* caveats apply here as well.
|
||||||
|
*
|
||||||
|
* When called referencing a \b map that was allocate by the caller this
|
||||||
|
* function will only update the values within the data structure.
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @param[in] size The desired size of the buffer \b data used for the bitmap.
|
||||||
|
* @returns The -- potentially changed -- sparsemap reference, or NULL should a
|
||||||
|
* #realloc() fail (\b ENOMEM)
|
||||||
|
* @note The resizing of caller supplied allocated objects is not yet fully
|
||||||
|
* supported.
|
||||||
|
*/
|
||||||
|
sparsemap_t *sparsemap_set_data_size(sparsemap_t *map, uint8_t *data, size_t size);
|
||||||
|
|
||||||
/* Resizes the data range. */
|
/** @brief Calculate remaining capacity, approaches 0 when full.
|
||||||
void sparsemap_set_data_size(sparsemap_t *map, size_t data_size);
|
*
|
||||||
|
* Provides an estimate in the range [0.0, 100.0] of the remaining capacity of
|
||||||
/* Calculate remaining capacity, full when 0. */
|
* the buffer storing bitmap data. This can change up or down as more data
|
||||||
|
* is added/removed due to the method for compressed representation, do not
|
||||||
|
* expect a smooth progression either direction. This is a rough estimate only
|
||||||
|
* and may also jump in value after seemingly indiscriminate changes to the map.
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @returns an estimate for remaining capacity that approaches 0.0 when full or
|
||||||
|
* 100.0 when empty
|
||||||
|
*/
|
||||||
double sparsemap_capacity_remaining(sparsemap_t *map);
|
double sparsemap_capacity_remaining(sparsemap_t *map);
|
||||||
|
|
||||||
/* Returns the size of the underlying byte array. */
|
/** @brief Returns the capacity of the underlying byte array in bytes.
|
||||||
|
*
|
||||||
|
* Specifically, this returns the byte \b size provided for the underlying
|
||||||
|
* buffer used to store bitmap data.
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @returns byte size of the buffer used for storing bitmap data
|
||||||
|
*/
|
||||||
size_t sparsemap_get_capacity(sparsemap_t *map);
|
size_t sparsemap_get_capacity(sparsemap_t *map);
|
||||||
|
|
||||||
/* Returns the value of a bit at index |idx|. */
|
/** @brief Returns the value of a bit at index \b idx, either true for "set" (1)
|
||||||
bool sparsemap_is_set(sparsemap_t *map, size_t idx);
|
* or \b false for "unset" (0).
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @param[in] idx The 0-based offset into the bitmap index to examine.
|
||||||
|
* @returns either true or false
|
||||||
|
*/
|
||||||
|
bool sparsemap_is_set(sparsemap_t *map, sparsemap_idx_t idx);
|
||||||
|
|
||||||
/* Sets the bit at index |idx| to true or false, depending on |value|. */
|
/** @brief Sets the bit at index \b idx to \b value.
|
||||||
void sparsemap_set(sparsemap_t *map, size_t idx, bool value);
|
*
|
||||||
|
* A sparsemap has a fixed size buffer with a capacity that can be exhausted by
|
||||||
|
* when calling this function. In such cases the return value is not equal to
|
||||||
|
* the provided \b idx and errno is set to ENOSPC. In such situations it is
|
||||||
|
* possible to grow the data size and retry the set() operation under certain
|
||||||
|
* circumstances (see #sparsemap() and #sparsemap_set_data_size()).
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @param[in] idx The 0-based offset into the bitmap index to modify.
|
||||||
|
* @returns the \b idx supplied on success or SPARSEMAP_IDX_MAX on error
|
||||||
|
* with \b errno set to ENOSPC when the map is full.
|
||||||
|
*/
|
||||||
|
sparsemap_idx_t sparsemap_set(sparsemap_t *map, sparsemap_idx_t idx, bool value);
|
||||||
|
|
||||||
/* Returns the offset of the very first bit. */
|
/** @brief Returns the byte size of the data buffer that has been used thus far.
|
||||||
sm_idx_t sparsemap_get_start_offset(sparsemap_t *map);
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
/* Returns the used size in the data buffer. */
|
* @returns the byte size of the data buffer that has been used thus far
|
||||||
|
*/
|
||||||
size_t sparsemap_get_size(sparsemap_t *map);
|
size_t sparsemap_get_size(sparsemap_t *map);
|
||||||
|
|
||||||
/* Decompresses the whole bitmap; calls scanner for all bits. */
|
/** @brief Returns a pointer to the data buffer used for the map.
|
||||||
void sparsemap_scan(sparsemap_t *map, void (*scanner)(sm_idx_t[], size_t), size_t skip);
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @returns a pointer to the data buffer used for the map
|
||||||
|
*/
|
||||||
|
void *sparsemap_get_data(sparsemap_t *map);
|
||||||
|
|
||||||
/* Appends all chunk maps from |map| starting at |sstart| to |other|, then
|
/** @brief Returns the number of elements in the map.
|
||||||
reduces the chunk map-count appropriately. */
|
*
|
||||||
void sparsemap_split(sparsemap_t *map, size_t sstart, sparsemap_t *other);
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @returns the number of elements in the map
|
||||||
|
*/
|
||||||
|
size_t sparsemap_count(sparsemap_t *map);
|
||||||
|
|
||||||
/* Returns the index of the n'th set bit; uses a 0-based index. */
|
/** @brief Returns the offset of the first bit set in the map.
|
||||||
size_t sparsemap_select(sparsemap_t *map, size_t n);
|
*
|
||||||
|
* This is the same as the value of the first set bit in the
|
||||||
|
* map.
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @returns the offset of the first bit set in the map
|
||||||
|
*/
|
||||||
|
sparsemap_idx_t sparsemap_get_starting_offset(sparsemap_t *map);
|
||||||
|
|
||||||
/* Counts the set bits in the range [offset, idx]. */
|
/** @brief Returns the offset of the last bit set in the map.
|
||||||
size_t sparsemap_rank(sparsemap_t *map, size_t offset, size_t idx);
|
*
|
||||||
|
* This is the same as the value of the last bit set in the
|
||||||
|
* map.
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @returns the offset of the index bit set in the map
|
||||||
|
*/
|
||||||
|
sparsemap_idx_t sparsemap_get_ending_offset(sparsemap_t *map);
|
||||||
|
|
||||||
size_t sparsemap_span(sparsemap_t *map, size_t loc, size_t len);
|
/** @brief Returns the percent of bits set in the map.
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @returns the percent of bits set.
|
||||||
|
*/
|
||||||
|
double sparsemap_fill_factor(sparsemap_t *map);
|
||||||
|
|
||||||
|
/** @brief Provides a method for a callback function to examine every bit set in
|
||||||
|
* the index.
|
||||||
|
*
|
||||||
|
* This decompresses the whole bitmap and invokes #scanner() passing an array
|
||||||
|
* of the positions of set bits in order from 0 index to the end of the map.
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @param[in] skip Start the scan after \b skip position in the map.
|
||||||
|
* @param[in] aux Auxiliary information passed to the scanner.
|
||||||
|
*/
|
||||||
|
void sparsemap_scan(sparsemap_t *map, void (*scanner)(sm_idx_t vec[], size_t n, void *aux), size_t skip, void *aux);
|
||||||
|
|
||||||
|
/** @brief Merges the values from \b source into \b destination, \b source is unchanged.
|
||||||
|
*
|
||||||
|
* Efficiently adds all set bits from \b source into \b destination.
|
||||||
|
*
|
||||||
|
* @param[in] destination The sparsemap reference into which we will merge \b source.
|
||||||
|
* @param[in] source The bitmap to merge into \b destination.
|
||||||
|
* @returns 0 on success, or sets errno to ENOSPC and returns the amount of
|
||||||
|
* additional space required to successfully merge the maps.
|
||||||
|
*/
|
||||||
|
int sparsemap_merge(sparsemap_t *destination, sparsemap_t *source);
|
||||||
|
|
||||||
|
/** @brief Splits the bitmap by assigning all bits starting at \b offset to the
|
||||||
|
* \b other bitmap while removing them from \b map.
|
||||||
|
*
|
||||||
|
* The \b other bitmap is expected to be empty.
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @param[in] offset The 0-based offset into the bitmap at which to split, if
|
||||||
|
* set to SPARSEMAP_IDX_MAX then the bits will be evenly split.
|
||||||
|
* @param[in] other The bitmap into which we place the split.
|
||||||
|
* @returns the offset at which the map was split
|
||||||
|
*/
|
||||||
|
sparsemap_idx_t sparsemap_split(sparsemap_t *map, sparsemap_idx_t offset, sparsemap_t *other);
|
||||||
|
|
||||||
|
/** @brief Finds the index of the \b n'th bit set to \b value.
|
||||||
|
*
|
||||||
|
* Locates the \b n'th bit either set, \b value is true, or unset, \b value is
|
||||||
|
* false, from the start of the bitmap.
|
||||||
|
* So, if your bit pattern is: ```1101 1110 1010 1101 1011 1110 1110 1111``` and
|
||||||
|
* you request the first set bit the result is `0` (meaning the 1st bit in the
|
||||||
|
* map which is index 0 because this is 0-based indexing). The first unset bit
|
||||||
|
* is `2` (or the third bit in the pattern). When n is 3 and value is true the
|
||||||
|
* result would be `3` (the fourth bit, or the third set bit which is at index
|
||||||
|
* 3 when 0-based).
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @param[in] n Specifies how many bits to ignore (when n=2 return the position
|
||||||
|
* of the third matching bit).
|
||||||
|
* @param[in] value Determines if the search is to examine set (true) or unset
|
||||||
|
* (false) bits in the bitmap index.
|
||||||
|
* @returns the 0-based index of the located bit position within the map; when
|
||||||
|
* not found either SPARSEMAP_IDX_MAX.
|
||||||
|
*/
|
||||||
|
sparsemap_idx_t sparsemap_select(sparsemap_t *map, sparsemap_idx_t n, bool value);
|
||||||
|
|
||||||
|
/** @brief Counts the bits matching \b value in the provided range, [\b x, \b
|
||||||
|
* y].
|
||||||
|
*
|
||||||
|
* Counts the set, \b value is true, or unset, \b value is false, bits starting
|
||||||
|
* at the \b idx'th bit (0-based) in the range [\b x, \b y] (inclusive on either
|
||||||
|
* end). If range is [0, 0] this examines 1 bit, the first one in the map, and
|
||||||
|
* returns 1 if value is true and the bit was set.
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @param[in] x 0-based start of the inclusive range to examine.
|
||||||
|
* @param[in] y 0-based end of the inclusive range to examine.
|
||||||
|
* @param[in] value Determines if the scan is to count the set (true) or unset
|
||||||
|
* (false) bits in the range.
|
||||||
|
* @returns the count of bits found within the range that match the \b value
|
||||||
|
*/
|
||||||
|
size_t sparsemap_rank(sparsemap_t *map, size_t x, size_t y, bool value);
|
||||||
|
|
||||||
|
/** @brief Locates the first contiguous set of bits of \b len starting at \b idx
|
||||||
|
* matching \b value in the bitmap.
|
||||||
|
*
|
||||||
|
* @param[in] map The sparsemap reference.
|
||||||
|
* @param[in] start 0-based start of search within the bitmap.
|
||||||
|
* @param[in] len The length of contiguous bits we're seeking.
|
||||||
|
* @param[in] value Determines if the scan is to find all set (true) or unset
|
||||||
|
* (false) bits of \b len.
|
||||||
|
* @returns the index of the first bit matching the criteria; when not found
|
||||||
|
* found SPARSEMAP_IDX_MAX
|
||||||
|
*/
|
||||||
|
size_t sparsemap_span(sparsemap_t *map, sparsemap_idx_t start, size_t len, bool value);
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif /* !defined(SPARSEMAP_H) */
|
||||||
|
|
258
include/tdigest.h
Normal file
258
include/tdigest.h
Normal file
|
@ -0,0 +1,258 @@
|
||||||
|
#pragma once
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adaptive histogram based on something like streaming k-means crossed with Q-digest.
|
||||||
|
* The implementation is a direct descendent of MergingDigest
|
||||||
|
* https://github.com/tdunning/t-digest/
|
||||||
|
*
|
||||||
|
* Copyright (c) 2021 Redis, All rights reserved.
|
||||||
|
* Copyright (c) 2018 Andrew Werner, All rights reserved.
|
||||||
|
*
|
||||||
|
* The special characteristics of this algorithm are:
|
||||||
|
*
|
||||||
|
* - smaller summaries than Q-digest
|
||||||
|
*
|
||||||
|
* - provides part per million accuracy for extreme quantiles and typically <1000 ppm accuracy
|
||||||
|
* for middle quantiles
|
||||||
|
*
|
||||||
|
* - fast
|
||||||
|
*
|
||||||
|
* - simple
|
||||||
|
*
|
||||||
|
* - easy to adapt for use with map-reduce
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define MM_PI 3.14159265358979323846
|
||||||
|
|
||||||
|
struct td_histogram {
|
||||||
|
// compression is a setting used to configure the size of centroids when merged.
|
||||||
|
double compression;
|
||||||
|
|
||||||
|
double min;
|
||||||
|
double max;
|
||||||
|
|
||||||
|
// cap is the total size of nodes
|
||||||
|
int cap;
|
||||||
|
// merged_nodes is the number of merged nodes at the front of nodes.
|
||||||
|
int merged_nodes;
|
||||||
|
// unmerged_nodes is the number of buffered nodes.
|
||||||
|
int unmerged_nodes;
|
||||||
|
|
||||||
|
// we run the merge in reverse every other merge to avoid left-to-right bias in merging
|
||||||
|
long long total_compressions;
|
||||||
|
|
||||||
|
long long merged_weight;
|
||||||
|
long long unmerged_weight;
|
||||||
|
|
||||||
|
double *nodes_mean;
|
||||||
|
long long *nodes_weight;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef struct td_histogram td_histogram_t;
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allocate the memory, initialise the t-digest, and return the histogram as output parameter.
|
||||||
|
* @param compression The compression parameter.
|
||||||
|
* 100 is a common value for normal uses.
|
||||||
|
* 1000 is extremely large.
|
||||||
|
* The number of centroids retained will be a smallish (usually less than 10) multiple of this
|
||||||
|
* number.
|
||||||
|
* @return the histogram on success, NULL if allocation failed.
|
||||||
|
*/
|
||||||
|
td_histogram_t *td_new(double compression);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allocate the memory and initialise the t-digest.
|
||||||
|
*
|
||||||
|
* @param compression The compression parameter.
|
||||||
|
* 100 is a common value for normal uses.
|
||||||
|
* 1000 is extremely large.
|
||||||
|
* The number of centroids retained will be a smallish (usually less than 10) multiple of this
|
||||||
|
* number.
|
||||||
|
* @param result Output parameter to capture allocated histogram.
|
||||||
|
* @return 0 on success, 1 if allocation failed.
|
||||||
|
*/
|
||||||
|
int td_init(double compression, td_histogram_t **result);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Frees the memory associated with the t-digest.
|
||||||
|
*
|
||||||
|
* @param h The histogram you want to free.
|
||||||
|
*/
|
||||||
|
void td_free(td_histogram_t *h);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reset a histogram to zero - empty out a histogram and re-initialise it
|
||||||
|
*
|
||||||
|
* If you want to re-use an existing histogram, but reset everything back to zero, this
|
||||||
|
* is the routine to use.
|
||||||
|
*
|
||||||
|
* @param h The histogram you want to reset to empty.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
void td_reset(td_histogram_t *h);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds a sample to a histogram.
|
||||||
|
*
|
||||||
|
* @param val The value to add.
|
||||||
|
* @param weight The weight of this point.
|
||||||
|
* @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided
|
||||||
|
* weight.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
int td_add(td_histogram_t *h, double val, long long weight);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Re-examines a t-digest to determine whether some centroids are redundant. If your data are
|
||||||
|
* perversely ordered, this may be a good idea. Even if not, this may save 20% or so in space.
|
||||||
|
*
|
||||||
|
* The cost is roughly the same as adding as many data points as there are centroids. This
|
||||||
|
* is typically < 10 * compression, but could be as high as 100 * compression.
|
||||||
|
* This is a destructive operation that is not thread-safe.
|
||||||
|
*
|
||||||
|
* @param h The histogram you want to compress.
|
||||||
|
* @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided
|
||||||
|
* weight. If overflow is detected the histogram is not changed.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
int td_compress(td_histogram_t *h);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Merges all of the values from 'from' to 'this' histogram.
|
||||||
|
*
|
||||||
|
* @param h "This" pointer
|
||||||
|
* @param from Histogram to copy values from.
|
||||||
|
* * @return 0 on success, EDOM if overflow was detected as a consequence of merging the the
|
||||||
|
* provided histogram. If overflow is detected the original histogram is not detected.
|
||||||
|
*/
|
||||||
|
int td_merge(td_histogram_t *h, td_histogram_t *from);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the fraction of all points added which are ≤ x.
|
||||||
|
*
|
||||||
|
* @param x The cutoff for the cdf.
|
||||||
|
* @return The fraction of all data which is less or equal to x.
|
||||||
|
*/
|
||||||
|
double td_cdf(td_histogram_t *h, double x);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an estimate of the cutoff such that a specified fraction of the data
|
||||||
|
* added to this TDigest would be less than or equal to the cutoff.
|
||||||
|
*
|
||||||
|
* @param q The desired fraction
|
||||||
|
* @return The value x such that cdf(x) == q;
|
||||||
|
*/
|
||||||
|
double td_quantile(td_histogram_t *h, double q);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an estimate of the cutoff such that a specified fraction of the data
|
||||||
|
* added to this TDigest would be less than or equal to the cutoffs.
|
||||||
|
*
|
||||||
|
* @param quantiles The ordered percentiles array to get the values for.
|
||||||
|
* @param values Destination array containing the values at the given quantiles.
|
||||||
|
* The values array should be allocated by the caller.
|
||||||
|
* @return 0 on success, ENOMEM if the provided destination array is null.
|
||||||
|
*/
|
||||||
|
int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the trimmed mean ignoring values outside given cutoff upper and lower limits.
|
||||||
|
*
|
||||||
|
* @param leftmost_cut Fraction to cut off of the left tail of the distribution.
|
||||||
|
* @param rightmost_cut Fraction to cut off of the right tail of the distribution.
|
||||||
|
* @return The trimmed mean ignoring values outside given cutoff upper and lower limits;
|
||||||
|
*/
|
||||||
|
double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the trimmed mean ignoring values outside given a symmetric cutoff limits.
|
||||||
|
*
|
||||||
|
* @param proportion_to_cut Fraction to cut off of the left and right tails of the distribution.
|
||||||
|
* @return The trimmed mean ignoring values outside given cutoff upper and lower limits;
|
||||||
|
*/
|
||||||
|
double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the current compression factor.
|
||||||
|
*
|
||||||
|
* @return The compression factor originally used to set up the TDigest.
|
||||||
|
*/
|
||||||
|
int td_compression(td_histogram_t *h);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of points that have been added to this TDigest.
|
||||||
|
*
|
||||||
|
* @return The sum of the weights on all centroids.
|
||||||
|
*/
|
||||||
|
long long td_size(td_histogram_t *h);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of centroids being used by this TDigest.
|
||||||
|
*
|
||||||
|
* @return The number of centroids being used.
|
||||||
|
*/
|
||||||
|
int td_centroid_count(td_histogram_t *h);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get minimum value from the histogram. Will return __DBL_MAX__ if the histogram
|
||||||
|
* is empty.
|
||||||
|
*
|
||||||
|
* @param h "This" pointer
|
||||||
|
*/
|
||||||
|
double td_min(td_histogram_t *h);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get maximum value from the histogram. Will return - __DBL_MAX__ if the histogram
|
||||||
|
* is empty.
|
||||||
|
*
|
||||||
|
* @param h "This" pointer
|
||||||
|
*/
|
||||||
|
double td_max(td_histogram_t *h);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the full centroids weight array for 'this' histogram.
|
||||||
|
*
|
||||||
|
* @param h "This" pointer
|
||||||
|
*
|
||||||
|
* @return The full centroids weight array.
|
||||||
|
*/
|
||||||
|
const long long *td_centroids_weight(td_histogram_t *h);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the full centroids mean array for 'this' histogram.
|
||||||
|
*
|
||||||
|
* @param h "This" pointer
|
||||||
|
*
|
||||||
|
* @return The full centroids mean array.
|
||||||
|
*/
|
||||||
|
const double *td_centroids_mean(td_histogram_t *h);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the centroid weight for 'this' histogram and 'pos'.
|
||||||
|
*
|
||||||
|
* @param h "This" pointer
|
||||||
|
* @param pos centroid position.
|
||||||
|
*
|
||||||
|
* @return The centroid weight.
|
||||||
|
*/
|
||||||
|
long long td_centroids_weight_at(td_histogram_t *h, int pos);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the centroid mean for 'this' histogram and 'pos'.
|
||||||
|
*
|
||||||
|
* @param h "This" pointer
|
||||||
|
* @param pos centroid position.
|
||||||
|
*
|
||||||
|
* @return The centroid mean.
|
||||||
|
*/
|
||||||
|
double td_centroids_mean_at(td_histogram_t *h, int pos);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
|
@ -1,14 +1,25 @@
|
||||||
#include <sys/types.h>
|
#define _POSIX_C_SOURCE 199309L
|
||||||
|
#define X86_INTRIN
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <pthread.h>
|
#include <pthread.h> // If using threads
|
||||||
#include <sparsemap.h>
|
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <time.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
#ifdef __x86_64__ // Check if running on x86_64 architecture
|
||||||
|
#ifdef X86_INTRIN
|
||||||
|
#include <errno.h>
|
||||||
|
#include <x86intrin.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "common.h"
|
#include "../include/common.h"
|
||||||
|
#include "../include/sparsemap.h"
|
||||||
|
|
||||||
#pragma GCC diagnostic push
|
#pragma GCC diagnostic push
|
||||||
#pragma GCC diagnostic ignored "-Wvariadic-macros"
|
#pragma GCC diagnostic ignored "-Wvariadic-macros"
|
||||||
|
@ -22,95 +33,69 @@
|
||||||
uint64_t
|
uint64_t
|
||||||
tsc(void)
|
tsc(void)
|
||||||
{
|
{
|
||||||
|
#ifdef __x86_64__ // Check if running on x86_64 architecture
|
||||||
|
#ifdef X86_INTRIN
|
||||||
|
return __rdtsc();
|
||||||
|
#else
|
||||||
uint32_t low, high;
|
uint32_t low, high;
|
||||||
__asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
|
__asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
|
||||||
return ((uint64_t)high << 32) | low;
|
return ((uint64_t)high << 32) | low;
|
||||||
|
#endif
|
||||||
|
#ifdef __arm__ // Check if compiling for ARM architecture
|
||||||
|
uint64_t result;
|
||||||
|
__asm__ volatile("mrs %0, pmccntr_el0" : "=r"(result));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint64_t
|
// get microsecond timestamp
|
||||||
get_tsc_frequency()
|
uint64_t
|
||||||
|
msts()
|
||||||
{
|
{
|
||||||
uint32_t high, low;
|
#ifdef _SC_MONOTONIC_CLOCK
|
||||||
__asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
|
struct timespec ts;
|
||||||
__asm__ volatile("rdtsc");
|
if (sysconf(_SC_MONOTONIC_CLOCK) > 0) {
|
||||||
return ((uint64_t)high << 32) | low;
|
/* A monotonic clock presents */
|
||||||
|
if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
|
||||||
|
return (uint64_t)(ts.tv_sec * 1000000 + ts.tv_nsec / 1000);
|
||||||
|
else
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
#else
|
||||||
|
struct timeval tv;
|
||||||
|
if (gettimeofday(&tv, NULL) == 0)
|
||||||
|
return (uint64_t)(tv.tv_sec * 1000000 + tv.tv_usec);
|
||||||
|
else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
double
|
double
|
||||||
tsc_ticks_to_ns(uint64_t tsc_ticks)
|
nsts(void)
|
||||||
{
|
{
|
||||||
static uint64_t tsc_freq = 0;
|
struct timespec ts;
|
||||||
if (tsc_freq == 0) {
|
|
||||||
tsc_freq = get_tsc_frequency();
|
|
||||||
}
|
|
||||||
return (double)tsc_ticks / (double)tsc_freq * 1e9;
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
if (clock_gettime(CLOCK_REALTIME, &ts) == -1) {
|
||||||
est_sift_up(uint64_t *heap, int child_index)
|
perror("clock_gettime");
|
||||||
{
|
return -1.0; // Return -1.0 on error
|
||||||
while (child_index > 0) {
|
|
||||||
int parent_index = (child_index - 1) / 2;
|
|
||||||
if (heap[parent_index] > heap[child_index]) {
|
|
||||||
// Swap parent and child
|
|
||||||
uint64_t temp = heap[parent_index];
|
|
||||||
heap[parent_index] = heap[child_index];
|
|
||||||
heap[child_index] = temp;
|
|
||||||
child_index = parent_index;
|
|
||||||
} else {
|
|
||||||
break; // Heap property satisfied
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
est_sift_down(uint64_t *heap, int heap_size, int parent_index)
|
|
||||||
{
|
|
||||||
int child_index = 2 * parent_index + 1; // Left child
|
|
||||||
while (child_index < heap_size) {
|
|
||||||
// Right child exists and is smaller than left child
|
|
||||||
if (child_index + 1 < heap_size && heap[child_index + 1] < heap[child_index]) {
|
|
||||||
child_index++;
|
|
||||||
}
|
|
||||||
// If the smallest child is smaller than the parent, swap them
|
|
||||||
if (heap[child_index] < heap[parent_index]) {
|
|
||||||
uint64_t temp = heap[child_index];
|
|
||||||
heap[child_index] = heap[parent_index];
|
|
||||||
heap[parent_index] = temp;
|
|
||||||
parent_index = child_index;
|
|
||||||
child_index = 2 * parent_index + 1;
|
|
||||||
} else {
|
|
||||||
break; // Heap property satisfied
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
est_insert_value(uint64_t *heap, int heap_max_size, int *heap_size, uint64_t value)
|
|
||||||
{
|
|
||||||
if (*heap_size < heap_max_size) { // Heap not full, insert value
|
|
||||||
heap[*heap_size] = value;
|
|
||||||
est_sift_up(heap, *heap_size);
|
|
||||||
(*heap_size)++;
|
|
||||||
} else {
|
|
||||||
// Heap is full, replace root with new value with a certain probability
|
|
||||||
// This is a very naive approach to maintain a sample of the input
|
|
||||||
if (rand() % 2) {
|
|
||||||
heap[0] = value;
|
|
||||||
est_sift_down(heap, heap_max_size, 0);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
return ts.tv_sec + ts.tv_nsec / 1e9;
|
||||||
}
|
}
|
||||||
|
|
||||||
int __xorshift32_state = 0;
|
int __xorshift32_state = 0;
|
||||||
|
|
||||||
// Xorshift algorithm for PRNG
|
// Xorshift algorithm for PRNG
|
||||||
uint32_t
|
uint32_t
|
||||||
xorshift32()
|
xorshift32(void)
|
||||||
{
|
{
|
||||||
uint32_t x = __xorshift32_state;
|
uint32_t x = __xorshift32_state;
|
||||||
if (x == 0)
|
if (x == 0) {
|
||||||
x = 123456789;
|
x = 123456789;
|
||||||
|
}
|
||||||
x ^= x << 13;
|
x ^= x << 13;
|
||||||
x ^= x >> 17;
|
x ^= x >> 17;
|
||||||
x ^= x << 5;
|
x ^= x << 5;
|
||||||
|
@ -119,13 +104,13 @@ xorshift32()
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
xorshift32_seed()
|
xorshift32_seed(void)
|
||||||
{
|
{
|
||||||
__xorshift32_state = XORSHIFT_SEED_VALUE;
|
__xorshift32_state = XORSHIFT_SEED_VALUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
shuffle(int *array, size_t n) // TODO working?
|
shuffle(int *array, size_t n)
|
||||||
{
|
{
|
||||||
for (size_t i = n - 1; i > 0; --i) {
|
for (size_t i = n - 1; i > 0; --i) {
|
||||||
size_t j = xorshift32() % (i + 1);
|
size_t j = xorshift32() % (i + 1);
|
||||||
|
@ -170,7 +155,7 @@ has_sequential_set(int a[], int l, int r)
|
||||||
int
|
int
|
||||||
ensure_sequential_set(int a[], int l, int r)
|
ensure_sequential_set(int a[], int l, int r)
|
||||||
{
|
{
|
||||||
if (!a || l == 0 || r < 1 || r > l) {
|
if (!a || l == 0 || r < 1 || r > l - 1) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -190,7 +175,8 @@ ensure_sequential_set(int a[], int l, int r)
|
||||||
// Generate a random value between min_value and max_value
|
// Generate a random value between min_value and max_value
|
||||||
int value = random_uint32() % (max_value - min_value - r + 1);
|
int value = random_uint32() % (max_value - min_value - r + 1);
|
||||||
// Generate a random location between 0 and l - r
|
// Generate a random location between 0 and l - r
|
||||||
offset = random_uint32() % (l - r - 1);
|
int d = l - r - 1;
|
||||||
|
offset = d == 0 ? 0 : random_uint32() % d;
|
||||||
|
|
||||||
// Adjust the array to include a sequential set of 'r' integers at the random offset
|
// Adjust the array to include a sequential set of 'r' integers at the random offset
|
||||||
for (int i = 0; i < r; ++i) {
|
for (int i = 0; i < r; ++i) {
|
||||||
|
@ -199,21 +185,6 @@ ensure_sequential_set(int a[], int l, int r)
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
|
||||||
create_sequential_set_in_empty_map(sparsemap_t *map, int s, int r)
|
|
||||||
{
|
|
||||||
int placed_at;
|
|
||||||
if (s >= r + 1) {
|
|
||||||
placed_at = 0;
|
|
||||||
} else {
|
|
||||||
placed_at = random_uint32() % (s - r - 1);
|
|
||||||
}
|
|
||||||
for (int i = placed_at; i < placed_at + r; i++) {
|
|
||||||
sparsemap_set(map, i, true);
|
|
||||||
}
|
|
||||||
return placed_at;
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
void
|
||||||
print_array(int *array, int l)
|
print_array(int *array, int l)
|
||||||
{
|
{
|
||||||
|
@ -322,7 +293,7 @@ bool
|
||||||
is_set(const int array[], int bit)
|
is_set(const int array[], int bit)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < 1024; i++) {
|
for (int i = 0; i < 1024; i++) {
|
||||||
if (array[i] == (int)bit) {
|
if (array[i] == bit) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -330,28 +301,44 @@ is_set(const int array[], int bit)
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
is_unique(int a[], int l, int value)
|
whats_set_uint64(uint64_t number, int pos[64])
|
||||||
{
|
{
|
||||||
for (int i = 0; i < l; ++i) {
|
int length = 0;
|
||||||
if (a[i] == value) {
|
|
||||||
return 0; // Not unique
|
for (int i = 0; i < 64; i++) {
|
||||||
|
if (number & ((uint64_t)1 << i)) {
|
||||||
|
pos[length++] = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 1; // Unique
|
|
||||||
}
|
|
||||||
|
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @brief Fills an array with unique random values between 0 and max_value.
|
||||||
|
*
|
||||||
|
* @param[in] a The array to fill.
|
||||||
|
* @param[in] l The length of the array to fill.
|
||||||
|
* @param[in] max_value The maximum value for the random numbers.
|
||||||
|
*/
|
||||||
void
|
void
|
||||||
setup_test_array(int a[], int l, int max_value)
|
setup_test_array(int a[], int l, int max_value)
|
||||||
{
|
{
|
||||||
if (a == NULL || max_value < 0)
|
|
||||||
return; // Basic error handling and validation
|
|
||||||
|
|
||||||
for (int i = 0; i < l; ++i) {
|
// Create a set to store the unique values.
|
||||||
int candidate;
|
int unique_values[max_value + 1];
|
||||||
do {
|
for (int i = 0; i <= max_value; ++i) {
|
||||||
candidate = random_uint32() % (max_value + 1); // Generate a new value within the specified range
|
unique_values[i] = 0;
|
||||||
} while (!is_unique(a, i, candidate)); // Repeat until a unique value is found
|
}
|
||||||
a[i] = candidate; // Assign the unique value to the array
|
|
||||||
|
// Keep generating random numbers until we have l unique values.
|
||||||
|
int count = 0;
|
||||||
|
while (count < l) {
|
||||||
|
int random_number = random_uint32() % (max_value + 1);
|
||||||
|
if (unique_values[random_number] == 0) {
|
||||||
|
unique_values[random_number] = 1;
|
||||||
|
a[count] = random_number;
|
||||||
|
count++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -364,15 +351,6 @@ bitmap_from_uint32(sparsemap_t *map, uint32_t number)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
|
||||||
bitmap_from_uint64(sparsemap_t *map, uint64_t number)
|
|
||||||
{
|
|
||||||
for (int i = 0; i < 64; i++) {
|
|
||||||
bool bit = number & (1 << i);
|
|
||||||
sparsemap_set(map, i, bit);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t
|
uint32_t
|
||||||
rank_uint64(uint64_t number, int n, int p)
|
rank_uint64(uint64_t number, int n, int p)
|
||||||
{
|
{
|
||||||
|
@ -400,28 +378,96 @@ rank_uint64(uint64_t number, int n, int p)
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
void
|
||||||
whats_set_uint64(uint64_t number, int pos[64])
|
print_bits(char *name, uint64_t value)
|
||||||
{
|
{
|
||||||
int length = 0;
|
if (name) {
|
||||||
|
printf("%s\t", name);
|
||||||
for (int i = 0; i < 64; i++) {
|
}
|
||||||
if (number & ((uint64_t)1 << i)) {
|
for (int i = 63; i >= 0; i--) {
|
||||||
pos[length++] = i;
|
printf("%lu", (value >> i) & 1);
|
||||||
|
if (i % 8 == 0) {
|
||||||
|
printf(" "); // Add space for better readability
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
printf("\n");
|
||||||
return length;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
whats_set(sparsemap_t *map, int m)
|
sm_bitmap_from_uint64(sparsemap_t *map, int offset, uint64_t number)
|
||||||
{
|
{
|
||||||
logf("what's set in the range [0, %d): ", m);
|
for (int i = offset; i < 64; i++) {
|
||||||
for (int i = 0; i < m; i++) {
|
bool bit = number & ((uint64_t)1 << i);
|
||||||
|
sparsemap_set(map, i, bit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sparsemap_idx_t
|
||||||
|
sm_add_span(sparsemap_t *map, int map_size, int span_length)
|
||||||
|
{
|
||||||
|
int attempts = map_size / span_length;
|
||||||
|
sparsemap_idx_t placed_at;
|
||||||
|
do {
|
||||||
|
placed_at = random_uint32() % (map_size - span_length - 1);
|
||||||
|
if (sm_occupied(map, placed_at, span_length, true)) {
|
||||||
|
attempts--;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} while (attempts);
|
||||||
|
for (sparsemap_idx_t i = placed_at; i < placed_at + span_length; i++) {
|
||||||
|
if (sparsemap_set(map, i, true) != i) {
|
||||||
|
return placed_at; // TODO error?
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return placed_at;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
sm_whats_set(sparsemap_t *map, int off, int len)
|
||||||
|
{
|
||||||
|
printf("what's set in the range [%d, %d): ", off, off + len);
|
||||||
|
for (int i = off; i < off + len; i++) {
|
||||||
if (sparsemap_is_set(map, i)) {
|
if (sparsemap_is_set(map, i)) {
|
||||||
logf("%d ", i);
|
printf("%d ", i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
logf("\n");
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
sm_is_span(sparsemap_t *map, sparsemap_idx_t m, int len, bool value)
|
||||||
|
{
|
||||||
|
for (sparsemap_idx_t i = m; i < m + len; i++) {
|
||||||
|
if (sparsemap_is_set(map, i) != value) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
sm_occupied(sparsemap_t *map, sparsemap_idx_t m, int len, bool value)
|
||||||
|
{
|
||||||
|
for (sparsemap_idx_t i = m; i < (sparsemap_idx_t)len; i++) {
|
||||||
|
if (sparsemap_is_set(map, i) == value) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *
|
||||||
|
bytes_as(double bytes, char *s, size_t size)
|
||||||
|
{
|
||||||
|
const char *units[] = { "b", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" };
|
||||||
|
size_t i = 0;
|
||||||
|
|
||||||
|
while (bytes >= 1024 && i < sizeof(units) / sizeof(units[0]) - 1) {
|
||||||
|
bytes /= 1024;
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
snprintf(s, size, "%.2f %s", bytes, units[i]);
|
||||||
|
return s;
|
||||||
}
|
}
|
25883
lib/roaring.c
Normal file
25883
lib/roaring.c
Normal file
File diff suppressed because it is too large
Load diff
680
lib/tdigest.c
Normal file
680
lib/tdigest.c
Normal file
|
@ -0,0 +1,680 @@
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include "tdigest.h"
|
||||||
|
#include <errno.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#ifndef TD_MALLOC_INCLUDE
|
||||||
|
#define TD_MALLOC_INCLUDE "td_malloc.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef TD_ALLOC_H
|
||||||
|
#define TD_ALLOC_H
|
||||||
|
#define __td_malloc malloc
|
||||||
|
#define __td_calloc calloc
|
||||||
|
#define __td_realloc realloc
|
||||||
|
#define __td_free free
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define __td_max(x, y) (((x) > (y)) ? (x) : (y))
|
||||||
|
#define __td_min(x, y) (((x) < (y)) ? (x) : (y))
|
||||||
|
|
||||||
|
static inline double weighted_average_sorted(double x1, double w1, double x2, double w2) {
|
||||||
|
const double x = (x1 * w1 + x2 * w2) / (w1 + w2);
|
||||||
|
return __td_max(x1, __td_min(x, x2));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool _tdigest_long_long_add_safe(long long a, long long b) {
|
||||||
|
if (b < 0) {
|
||||||
|
return (a >= __LONG_LONG_MAX__ - b);
|
||||||
|
} else {
|
||||||
|
return (a <= __LONG_LONG_MAX__ - b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline double weighted_average(double x1, double w1, double x2, double w2) {
|
||||||
|
if (x1 <= x2) {
|
||||||
|
return weighted_average_sorted(x1, w1, x2, w2);
|
||||||
|
} else {
|
||||||
|
return weighted_average_sorted(x2, w2, x1, w1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void swap(double *arr, int i, int j) {
|
||||||
|
const double temp = arr[i];
|
||||||
|
arr[i] = arr[j];
|
||||||
|
arr[j] = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void swap_l(long long *arr, int i, int j) {
|
||||||
|
const long long temp = arr[i];
|
||||||
|
arr[i] = arr[j];
|
||||||
|
arr[j] = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
static unsigned int partition(double *means, long long *weights, unsigned int start,
|
||||||
|
unsigned int end, unsigned int pivot_idx) {
|
||||||
|
const double pivotMean = means[pivot_idx];
|
||||||
|
swap(means, pivot_idx, end);
|
||||||
|
swap_l(weights, pivot_idx, end);
|
||||||
|
|
||||||
|
int i = start - 1;
|
||||||
|
|
||||||
|
for (unsigned int j = start; j < end; j++) {
|
||||||
|
// If current element is smaller than the pivot
|
||||||
|
if (means[j] < pivotMean) {
|
||||||
|
// increment index of smaller element
|
||||||
|
i++;
|
||||||
|
swap(means, i, j);
|
||||||
|
swap_l(weights, i, j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
swap(means, i + 1, end);
|
||||||
|
swap_l(weights, i + 1, end);
|
||||||
|
return i + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Standard quick sort except that sorting rearranges parallel arrays
|
||||||
|
*
|
||||||
|
* @param means Values to sort on
|
||||||
|
* @param weights The auxillary values to sort.
|
||||||
|
* @param start The beginning of the values to sort
|
||||||
|
* @param end The value after the last value to sort
|
||||||
|
*/
|
||||||
|
static void td_qsort(double *means, long long *weights, unsigned int start, unsigned int end) {
|
||||||
|
if (start < end) {
|
||||||
|
// two elements can be directly compared
|
||||||
|
if ((end - start) == 1) {
|
||||||
|
if (means[start] > means[end]) {
|
||||||
|
swap(means, start, end);
|
||||||
|
swap_l(weights, start, end);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// generating a random number as a pivot was very expensive vs the array size
|
||||||
|
// const unsigned int pivot_idx = start + rand()%(end - start + 1);
|
||||||
|
const unsigned int pivot_idx = (end + start) / 2; // central pivot
|
||||||
|
const unsigned int new_pivot_idx = partition(means, weights, start, end, pivot_idx);
|
||||||
|
if (new_pivot_idx > start) {
|
||||||
|
td_qsort(means, weights, start, new_pivot_idx - 1);
|
||||||
|
}
|
||||||
|
td_qsort(means, weights, new_pivot_idx + 1, end);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline size_t cap_from_compression(double compression) {
|
||||||
|
if ((size_t)compression > ((SIZE_MAX / sizeof(double) / 6) - 10)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (6 * (size_t)(compression)) + 10;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool should_td_compress(td_histogram_t *h) {
|
||||||
|
return ((h->merged_nodes + h->unmerged_nodes) >= (h->cap - 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int next_node(td_histogram_t *h) { return h->merged_nodes + h->unmerged_nodes; }
|
||||||
|
|
||||||
|
int td_compress(td_histogram_t *h);
|
||||||
|
|
||||||
|
static inline int _check_overflow(const double v) {
|
||||||
|
// double-precision overflow detected on h->unmerged_weight
|
||||||
|
if (v == INFINITY) {
|
||||||
|
return EDOM;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int _check_td_overflow(const double new_unmerged_weight,
|
||||||
|
const double new_total_weight) {
|
||||||
|
// double-precision overflow detected on h->unmerged_weight
|
||||||
|
if (new_unmerged_weight == INFINITY) {
|
||||||
|
return EDOM;
|
||||||
|
}
|
||||||
|
if (new_total_weight == INFINITY) {
|
||||||
|
return EDOM;
|
||||||
|
}
|
||||||
|
const double denom = 2 * MM_PI * new_total_weight * log(new_total_weight);
|
||||||
|
if (denom == INFINITY) {
|
||||||
|
return EDOM;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int td_centroid_count(td_histogram_t *h) { return next_node(h); }
|
||||||
|
|
||||||
|
void td_reset(td_histogram_t *h) {
|
||||||
|
if (!h) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
h->min = __DBL_MAX__;
|
||||||
|
h->max = -h->min;
|
||||||
|
h->merged_nodes = 0;
|
||||||
|
h->merged_weight = 0;
|
||||||
|
h->unmerged_nodes = 0;
|
||||||
|
h->unmerged_weight = 0;
|
||||||
|
h->total_compressions = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int td_init(double compression, td_histogram_t **result) {
|
||||||
|
|
||||||
|
const size_t capacity = cap_from_compression(compression);
|
||||||
|
if (capacity < 1) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
td_histogram_t *histogram;
|
||||||
|
histogram = (td_histogram_t *)__td_malloc(sizeof(td_histogram_t));
|
||||||
|
if (!histogram) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
histogram->cap = capacity;
|
||||||
|
histogram->compression = (double)compression;
|
||||||
|
td_reset(histogram);
|
||||||
|
histogram->nodes_mean = (double *)__td_calloc(capacity, sizeof(double));
|
||||||
|
if (!histogram->nodes_mean) {
|
||||||
|
td_free(histogram);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
histogram->nodes_weight = (long long *)__td_calloc(capacity, sizeof(long long));
|
||||||
|
if (!histogram->nodes_weight) {
|
||||||
|
td_free(histogram);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
*result = histogram;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
td_histogram_t *td_new(double compression) {
|
||||||
|
td_histogram_t *mdigest = NULL;
|
||||||
|
td_init(compression, &mdigest);
|
||||||
|
return mdigest;
|
||||||
|
}
|
||||||
|
|
||||||
|
void td_free(td_histogram_t *histogram) {
|
||||||
|
if (histogram->nodes_mean) {
|
||||||
|
__td_free((void *)(histogram->nodes_mean));
|
||||||
|
}
|
||||||
|
if (histogram->nodes_weight) {
|
||||||
|
__td_free((void *)(histogram->nodes_weight));
|
||||||
|
}
|
||||||
|
__td_free((void *)(histogram));
|
||||||
|
}
|
||||||
|
|
||||||
|
int td_merge(td_histogram_t *into, td_histogram_t *from) {
|
||||||
|
if (td_compress(into) != 0)
|
||||||
|
return EDOM;
|
||||||
|
if (td_compress(from) != 0)
|
||||||
|
return EDOM;
|
||||||
|
const int pos = from->merged_nodes + from->unmerged_nodes;
|
||||||
|
for (int i = 0; i < pos; i++) {
|
||||||
|
const double mean = from->nodes_mean[i];
|
||||||
|
const long long weight = from->nodes_weight[i];
|
||||||
|
if (td_add(into, mean, weight) != 0) {
|
||||||
|
return EDOM;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
long long td_size(td_histogram_t *h) { return h->merged_weight + h->unmerged_weight; }
|
||||||
|
|
||||||
|
double td_cdf(td_histogram_t *h, double val) {
|
||||||
|
td_compress(h);
|
||||||
|
// no data to examine
|
||||||
|
if (h->merged_nodes == 0) {
|
||||||
|
return NAN;
|
||||||
|
}
|
||||||
|
// bellow lower bound
|
||||||
|
if (val < h->min) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
// above upper bound
|
||||||
|
if (val > h->max) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (h->merged_nodes == 1) {
|
||||||
|
// exactly one centroid, should have max==min
|
||||||
|
const double width = h->max - h->min;
|
||||||
|
if (val - h->min <= width) {
|
||||||
|
// min and max are too close together to do any viable interpolation
|
||||||
|
return 0.5;
|
||||||
|
} else {
|
||||||
|
// interpolate if somehow we have weight > 0 and max != min
|
||||||
|
return (val - h->min) / width;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const int n = h->merged_nodes;
|
||||||
|
// check for the left tail
|
||||||
|
const double left_centroid_mean = h->nodes_mean[0];
|
||||||
|
const double left_centroid_weight = (double)h->nodes_weight[0];
|
||||||
|
const double merged_weight_d = (double)h->merged_weight;
|
||||||
|
if (val < left_centroid_mean) {
|
||||||
|
// note that this is different than h->nodes_mean[0] > min
|
||||||
|
// ... this guarantees we divide by non-zero number and interpolation works
|
||||||
|
const double width = left_centroid_mean - h->min;
|
||||||
|
if (width > 0) {
|
||||||
|
// must be a sample exactly at min
|
||||||
|
if (val == h->min) {
|
||||||
|
return 0.5 / merged_weight_d;
|
||||||
|
} else {
|
||||||
|
return (1 + (val - h->min) / width * (left_centroid_weight / 2 - 1)) /
|
||||||
|
merged_weight_d;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// this should be redundant with the check val < h->min
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// and the right tail
|
||||||
|
const double right_centroid_mean = h->nodes_mean[n - 1];
|
||||||
|
const double right_centroid_weight = (double)h->nodes_weight[n - 1];
|
||||||
|
if (val > right_centroid_mean) {
|
||||||
|
const double width = h->max - right_centroid_mean;
|
||||||
|
if (width > 0) {
|
||||||
|
if (val == h->max) {
|
||||||
|
return 1 - 0.5 / merged_weight_d;
|
||||||
|
} else {
|
||||||
|
// there has to be a single sample exactly at max
|
||||||
|
const double dq = (1 + (h->max - val) / width * (right_centroid_weight / 2 - 1)) /
|
||||||
|
merged_weight_d;
|
||||||
|
return 1 - dq;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// we know that there are at least two centroids and mean[0] < x < mean[n-1]
|
||||||
|
// that means that there are either one or more consecutive centroids all at exactly x
|
||||||
|
// or there are consecutive centroids, c0 < x < c1
|
||||||
|
double weightSoFar = 0;
|
||||||
|
for (int it = 0; it < n - 1; it++) {
|
||||||
|
// weightSoFar does not include weight[it] yet
|
||||||
|
if (h->nodes_mean[it] == val) {
|
||||||
|
// we have one or more centroids == x, treat them as one
|
||||||
|
// dw will accumulate the weight of all of the centroids at x
|
||||||
|
double dw = 0;
|
||||||
|
while (it < n && h->nodes_mean[it] == val) {
|
||||||
|
dw += (double)h->nodes_weight[it];
|
||||||
|
it++;
|
||||||
|
}
|
||||||
|
return (weightSoFar + dw / 2) / (double)h->merged_weight;
|
||||||
|
} else if (h->nodes_mean[it] <= val && val < h->nodes_mean[it + 1]) {
|
||||||
|
const double node_weight = (double)h->nodes_weight[it];
|
||||||
|
const double node_weight_next = (double)h->nodes_weight[it + 1];
|
||||||
|
const double node_mean = h->nodes_mean[it];
|
||||||
|
const double node_mean_next = h->nodes_mean[it + 1];
|
||||||
|
// landed between centroids ... check for floating point madness
|
||||||
|
if (node_mean_next - node_mean > 0) {
|
||||||
|
// note how we handle singleton centroids here
|
||||||
|
// the point is that for singleton centroids, we know that their entire
|
||||||
|
// weight is exactly at the centroid and thus shouldn't be involved in
|
||||||
|
// interpolation
|
||||||
|
double leftExcludedW = 0;
|
||||||
|
double rightExcludedW = 0;
|
||||||
|
if (node_weight == 1) {
|
||||||
|
if (node_weight_next == 1) {
|
||||||
|
// two singletons means no interpolation
|
||||||
|
// left singleton is in, right is out
|
||||||
|
return (weightSoFar + 1) / merged_weight_d;
|
||||||
|
} else {
|
||||||
|
leftExcludedW = 0.5;
|
||||||
|
}
|
||||||
|
} else if (node_weight_next == 1) {
|
||||||
|
rightExcludedW = 0.5;
|
||||||
|
}
|
||||||
|
double dw = (node_weight + node_weight_next) / 2;
|
||||||
|
|
||||||
|
// adjust endpoints for any singleton
|
||||||
|
double dwNoSingleton = dw - leftExcludedW - rightExcludedW;
|
||||||
|
|
||||||
|
double base = weightSoFar + node_weight / 2 + leftExcludedW;
|
||||||
|
return (base + dwNoSingleton * (val - node_mean) / (node_mean_next - node_mean)) /
|
||||||
|
merged_weight_d;
|
||||||
|
} else {
|
||||||
|
// this is simply caution against floating point madness
|
||||||
|
// it is conceivable that the centroids will be different
|
||||||
|
// but too near to allow safe interpolation
|
||||||
|
double dw = (node_weight + node_weight_next) / 2;
|
||||||
|
return (weightSoFar + dw) / merged_weight_d;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
weightSoFar += (double)h->nodes_weight[it];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 1 - 0.5 / merged_weight_d;
|
||||||
|
}
|
||||||
|
|
||||||
|
static double td_internal_iterate_centroids_to_index(const td_histogram_t *h, const double index,
|
||||||
|
const double left_centroid_weight,
|
||||||
|
const int total_centroids, double *weightSoFar,
|
||||||
|
int *node_pos) {
|
||||||
|
if (left_centroid_weight > 1 && index < left_centroid_weight / 2) {
|
||||||
|
// there is a single sample at min so we interpolate with less weight
|
||||||
|
return h->min + (index - 1) / (left_centroid_weight / 2 - 1) * (h->nodes_mean[0] - h->min);
|
||||||
|
}
|
||||||
|
|
||||||
|
// usually the last centroid will have unit weight so this test will make it moot
|
||||||
|
if (index > h->merged_weight - 1) {
|
||||||
|
return h->max;
|
||||||
|
}
|
||||||
|
|
||||||
|
// if the right-most centroid has more than one sample, we still know
|
||||||
|
// that one sample occurred at max so we can do some interpolation
|
||||||
|
const double right_centroid_weight = (double)h->nodes_weight[total_centroids - 1];
|
||||||
|
const double right_centroid_mean = h->nodes_mean[total_centroids - 1];
|
||||||
|
if (right_centroid_weight > 1 &&
|
||||||
|
(double)h->merged_weight - index <= right_centroid_weight / 2) {
|
||||||
|
return h->max - ((double)h->merged_weight - index - 1) / (right_centroid_weight / 2 - 1) *
|
||||||
|
(h->max - right_centroid_mean);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; *node_pos < total_centroids - 1; (*node_pos)++) {
|
||||||
|
const int i = *node_pos;
|
||||||
|
const double node_weight = (double)h->nodes_weight[i];
|
||||||
|
const double node_weight_next = (double)h->nodes_weight[i + 1];
|
||||||
|
const double node_mean = h->nodes_mean[i];
|
||||||
|
const double node_mean_next = h->nodes_mean[i + 1];
|
||||||
|
const double dw = (node_weight + node_weight_next) / 2;
|
||||||
|
if (*weightSoFar + dw > index) {
|
||||||
|
// centroids i and i+1 bracket our current point
|
||||||
|
// check for unit weight
|
||||||
|
double leftUnit = 0;
|
||||||
|
if (node_weight == 1) {
|
||||||
|
if (index - *weightSoFar < 0.5) {
|
||||||
|
// within the singleton's sphere
|
||||||
|
return node_mean;
|
||||||
|
} else {
|
||||||
|
leftUnit = 0.5;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
double rightUnit = 0;
|
||||||
|
if (node_weight_next == 1) {
|
||||||
|
if (*weightSoFar + dw - index <= 0.5) {
|
||||||
|
// no interpolation needed near singleton
|
||||||
|
return node_mean_next;
|
||||||
|
}
|
||||||
|
rightUnit = 0.5;
|
||||||
|
}
|
||||||
|
const double z1 = index - *weightSoFar - leftUnit;
|
||||||
|
const double z2 = *weightSoFar + dw - index - rightUnit;
|
||||||
|
return weighted_average(node_mean, z2, node_mean_next, z1);
|
||||||
|
}
|
||||||
|
*weightSoFar += dw;
|
||||||
|
}
|
||||||
|
|
||||||
|
// weightSoFar = totalWeight - weight[total_centroids-1]/2 (very nearly)
|
||||||
|
// so we interpolate out to max value ever seen
|
||||||
|
const double z1 = index - h->merged_weight - right_centroid_weight / 2.0;
|
||||||
|
const double z2 = right_centroid_weight / 2 - z1;
|
||||||
|
return weighted_average(right_centroid_mean, z1, h->max, z2);
|
||||||
|
}
|
||||||
|
|
||||||
|
double td_quantile(td_histogram_t *h, double q) {
|
||||||
|
td_compress(h);
|
||||||
|
// q should be in [0,1]
|
||||||
|
if (q < 0.0 || q > 1.0 || h->merged_nodes == 0) {
|
||||||
|
return NAN;
|
||||||
|
}
|
||||||
|
// with one data point, all quantiles lead to Rome
|
||||||
|
if (h->merged_nodes == 1) {
|
||||||
|
return h->nodes_mean[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
// if values were stored in a sorted array, index would be the offset we are interested in
|
||||||
|
const double index = q * (double)h->merged_weight;
|
||||||
|
|
||||||
|
// beyond the boundaries, we return min or max
|
||||||
|
// usually, the first centroid will have unit weight so this will make it moot
|
||||||
|
if (index < 1) {
|
||||||
|
return h->min;
|
||||||
|
}
|
||||||
|
|
||||||
|
// we know that there are at least two centroids now
|
||||||
|
const int n = h->merged_nodes;
|
||||||
|
|
||||||
|
// if the left centroid has more than one sample, we still know
|
||||||
|
// that one sample occurred at min so we can do some interpolation
|
||||||
|
const double left_centroid_weight = (double)h->nodes_weight[0];
|
||||||
|
|
||||||
|
// in between extremes we interpolate between centroids
|
||||||
|
double weightSoFar = left_centroid_weight / 2;
|
||||||
|
int i = 0;
|
||||||
|
return td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n, &weightSoFar,
|
||||||
|
&i);
|
||||||
|
}
|
||||||
|
|
||||||
|
int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length) {
|
||||||
|
td_compress(h);
|
||||||
|
|
||||||
|
if (NULL == quantiles || NULL == values) {
|
||||||
|
return EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int n = h->merged_nodes;
|
||||||
|
if (n == 0) {
|
||||||
|
for (size_t i = 0; i < length; i++) {
|
||||||
|
values[i] = NAN;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (n == 1) {
|
||||||
|
for (size_t i = 0; i < length; i++) {
|
||||||
|
const double requested_quantile = quantiles[i];
|
||||||
|
|
||||||
|
// q should be in [0,1]
|
||||||
|
if (requested_quantile < 0.0 || requested_quantile > 1.0) {
|
||||||
|
values[i] = NAN;
|
||||||
|
} else {
|
||||||
|
// with one data point, all quantiles lead to Rome
|
||||||
|
values[i] = h->nodes_mean[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// we know that there are at least two centroids now
|
||||||
|
// if the left centroid has more than one sample, we still know
|
||||||
|
// that one sample occurred at min so we can do some interpolation
|
||||||
|
const double left_centroid_weight = (double)h->nodes_weight[0];
|
||||||
|
|
||||||
|
// in between extremes we interpolate between centroids
|
||||||
|
double weightSoFar = left_centroid_weight / 2;
|
||||||
|
int node_pos = 0;
|
||||||
|
|
||||||
|
// to avoid allocations we use the values array for intermediate computation
|
||||||
|
// i.e. to store the expected cumulative count at each percentile
|
||||||
|
for (size_t qpos = 0; qpos < length; qpos++) {
|
||||||
|
const double index = quantiles[qpos] * (double)h->merged_weight;
|
||||||
|
values[qpos] = td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n,
|
||||||
|
&weightSoFar, &node_pos);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static double td_internal_trimmed_mean(const td_histogram_t *h, const double leftmost_weight,
|
||||||
|
const double rightmost_weight) {
|
||||||
|
double count_done = 0;
|
||||||
|
double trimmed_sum = 0;
|
||||||
|
double trimmed_count = 0;
|
||||||
|
for (int i = 0; i < h->merged_nodes; i++) {
|
||||||
|
|
||||||
|
const double n_weight = (double)h->nodes_weight[i];
|
||||||
|
// Assume the whole centroid falls into the range
|
||||||
|
double count_add = n_weight;
|
||||||
|
|
||||||
|
// If we haven't reached the low threshold yet, skip appropriate part of the centroid.
|
||||||
|
count_add -= __td_min(__td_max(0, leftmost_weight - count_done), count_add);
|
||||||
|
|
||||||
|
// If we have reached the upper threshold, ignore the overflowing part of the centroid.
|
||||||
|
|
||||||
|
count_add = __td_min(__td_max(0, rightmost_weight - count_done), count_add);
|
||||||
|
|
||||||
|
// consider the whole centroid processed
|
||||||
|
count_done += n_weight;
|
||||||
|
|
||||||
|
// increment the sum / count
|
||||||
|
trimmed_sum += h->nodes_mean[i] * count_add;
|
||||||
|
trimmed_count += count_add;
|
||||||
|
|
||||||
|
// break once we cross the high threshold
|
||||||
|
if (count_done >= rightmost_weight)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return trimmed_sum / trimmed_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut) {
|
||||||
|
td_compress(h);
|
||||||
|
// proportion_to_cut should be in [0,1]
|
||||||
|
if (h->merged_nodes == 0 || proportion_to_cut < 0.0 || proportion_to_cut > 1.0) {
|
||||||
|
return NAN;
|
||||||
|
}
|
||||||
|
// with one data point, all values lead to Rome
|
||||||
|
if (h->merged_nodes == 1) {
|
||||||
|
return h->nodes_mean[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* translate the percentiles to counts */
|
||||||
|
const double leftmost_weight = floor((double)h->merged_weight * proportion_to_cut);
|
||||||
|
const double rightmost_weight = ceil((double)h->merged_weight * (1.0 - proportion_to_cut));
|
||||||
|
|
||||||
|
return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight);
|
||||||
|
}
|
||||||
|
|
||||||
|
double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut) {
|
||||||
|
td_compress(h);
|
||||||
|
// leftmost_cut and rightmost_cut should be in [0,1]
|
||||||
|
if (h->merged_nodes == 0 || leftmost_cut < 0.0 || leftmost_cut > 1.0 || rightmost_cut < 0.0 ||
|
||||||
|
rightmost_cut > 1.0) {
|
||||||
|
return NAN;
|
||||||
|
}
|
||||||
|
// with one data point, all values lead to Rome
|
||||||
|
if (h->merged_nodes == 1) {
|
||||||
|
return h->nodes_mean[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* translate the percentiles to counts */
|
||||||
|
const double leftmost_weight = floor((double)h->merged_weight * leftmost_cut);
|
||||||
|
const double rightmost_weight = ceil((double)h->merged_weight * rightmost_cut);
|
||||||
|
|
||||||
|
return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight);
|
||||||
|
}
|
||||||
|
|
||||||
|
int td_add(td_histogram_t *h, double mean, long long weight) {
|
||||||
|
if (should_td_compress(h)) {
|
||||||
|
const int overflow_res = td_compress(h);
|
||||||
|
if (overflow_res != 0)
|
||||||
|
return overflow_res;
|
||||||
|
}
|
||||||
|
const int pos = next_node(h);
|
||||||
|
if (pos >= h->cap)
|
||||||
|
return EDOM;
|
||||||
|
if (_tdigest_long_long_add_safe(h->unmerged_weight, weight) == false)
|
||||||
|
return EDOM;
|
||||||
|
const long long new_unmerged_weight = h->unmerged_weight + weight;
|
||||||
|
if (_tdigest_long_long_add_safe(new_unmerged_weight, h->merged_weight) == false)
|
||||||
|
return EDOM;
|
||||||
|
const long long new_total_weight = new_unmerged_weight + h->merged_weight;
|
||||||
|
// double-precision overflow detected
|
||||||
|
const int overflow_res =
|
||||||
|
_check_td_overflow((double)new_unmerged_weight, (double)new_total_weight);
|
||||||
|
if (overflow_res != 0)
|
||||||
|
return overflow_res;
|
||||||
|
|
||||||
|
if (mean < h->min) {
|
||||||
|
h->min = mean;
|
||||||
|
}
|
||||||
|
if (mean > h->max) {
|
||||||
|
h->max = mean;
|
||||||
|
}
|
||||||
|
h->nodes_mean[pos] = mean;
|
||||||
|
h->nodes_weight[pos] = weight;
|
||||||
|
h->unmerged_nodes++;
|
||||||
|
h->unmerged_weight = new_unmerged_weight;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int td_compress(td_histogram_t *h) {
|
||||||
|
if (h->unmerged_nodes == 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
int N = h->merged_nodes + h->unmerged_nodes;
|
||||||
|
td_qsort(h->nodes_mean, h->nodes_weight, 0, N - 1);
|
||||||
|
const double total_weight = (double)h->merged_weight + (double)h->unmerged_weight;
|
||||||
|
// double-precision overflow detected
|
||||||
|
const int overflow_res = _check_td_overflow((double)h->unmerged_weight, (double)total_weight);
|
||||||
|
if (overflow_res != 0)
|
||||||
|
return overflow_res;
|
||||||
|
if (total_weight <= 1)
|
||||||
|
return 0;
|
||||||
|
const double denom = 2 * MM_PI * total_weight * log(total_weight);
|
||||||
|
if (_check_overflow(denom) != 0)
|
||||||
|
return EDOM;
|
||||||
|
|
||||||
|
// Compute the normalizer given compression and number of points.
|
||||||
|
const double normalizer = h->compression / denom;
|
||||||
|
if (_check_overflow(normalizer) != 0)
|
||||||
|
return EDOM;
|
||||||
|
int cur = 0;
|
||||||
|
double weight_so_far = 0;
|
||||||
|
|
||||||
|
for (int i = 1; i < N; i++) {
|
||||||
|
const double proposed_weight = (double)h->nodes_weight[cur] + (double)h->nodes_weight[i];
|
||||||
|
const double z = proposed_weight * normalizer;
|
||||||
|
// quantile up to cur
|
||||||
|
const double q0 = weight_so_far / total_weight;
|
||||||
|
// quantile up to cur + i
|
||||||
|
const double q2 = (weight_so_far + proposed_weight) / total_weight;
|
||||||
|
// Convert a quantile to the k-scale
|
||||||
|
const bool should_add = (z <= (q0 * (1 - q0))) && (z <= (q2 * (1 - q2)));
|
||||||
|
// next point will fit
|
||||||
|
// so merge into existing centroid
|
||||||
|
if (should_add) {
|
||||||
|
h->nodes_weight[cur] += h->nodes_weight[i];
|
||||||
|
const double delta = h->nodes_mean[i] - h->nodes_mean[cur];
|
||||||
|
const double weighted_delta = (delta * h->nodes_weight[i]) / h->nodes_weight[cur];
|
||||||
|
h->nodes_mean[cur] += weighted_delta;
|
||||||
|
} else {
|
||||||
|
weight_so_far += h->nodes_weight[cur];
|
||||||
|
cur++;
|
||||||
|
h->nodes_weight[cur] = h->nodes_weight[i];
|
||||||
|
h->nodes_mean[cur] = h->nodes_mean[i];
|
||||||
|
}
|
||||||
|
if (cur != i) {
|
||||||
|
h->nodes_weight[i] = 0;
|
||||||
|
h->nodes_mean[i] = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
h->merged_nodes = cur + 1;
|
||||||
|
h->merged_weight = total_weight;
|
||||||
|
h->unmerged_nodes = 0;
|
||||||
|
h->unmerged_weight = 0;
|
||||||
|
h->total_compressions++;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double td_min(td_histogram_t *h) { return h->min; }
|
||||||
|
|
||||||
|
double td_max(td_histogram_t *h) { return h->max; }
|
||||||
|
|
||||||
|
int td_compression(td_histogram_t *h) { return h->compression; }
|
||||||
|
|
||||||
|
const long long *td_centroids_weight(td_histogram_t *h) { return h->nodes_weight; }
|
||||||
|
|
||||||
|
const double *td_centroids_mean(td_histogram_t *h) { return h->nodes_mean; }
|
||||||
|
|
||||||
|
long long td_centroids_weight_at(td_histogram_t *h, int pos) { return h->nodes_weight[pos]; }
|
||||||
|
|
||||||
|
double td_centroids_mean_at(td_histogram_t *h, int pos) {
|
||||||
|
if (pos < 0 || pos > h->merged_nodes) {
|
||||||
|
return NAN;
|
||||||
|
}
|
||||||
|
return h->nodes_mean[pos];
|
||||||
|
}
|
1542
src/sparsemap.c
1542
src/sparsemap.c
File diff suppressed because it is too large
Load diff
417
tests/midl.c
Normal file
417
tests/midl.c
Normal file
|
@ -0,0 +1,417 @@
|
||||||
|
/** @defgroup idls ID List Management
|
||||||
|
* @{
|
||||||
|
*/
|
||||||
|
/** A generic unsigned ID number. These were entryIDs in back-bdb.
|
||||||
|
* Preferably it should have the same size as a pointer.
|
||||||
|
*/
|
||||||
|
typedef size_t MDB_ID;
|
||||||
|
|
||||||
|
/** An IDL is an ID List, a sorted array of IDs. The first
|
||||||
|
* element of the array is a counter for how many actual
|
||||||
|
* IDs are in the list. In the original back-bdb code, IDLs are
|
||||||
|
* sorted in ascending order. For libmdb IDLs are sorted in
|
||||||
|
* descending order.
|
||||||
|
*/
|
||||||
|
typedef MDB_ID *MDB_IDL;
|
||||||
|
|
||||||
|
/* IDL sizes - likely should be even bigger
|
||||||
|
* limiting factors: sizeof(ID), thread stack size
|
||||||
|
*/
|
||||||
|
#define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */
|
||||||
|
#define MDB_IDL_DB_SIZE (1 << MDB_IDL_LOGN)
|
||||||
|
#define MDB_IDL_UM_SIZE (1 << (MDB_IDL_LOGN + 1))
|
||||||
|
|
||||||
|
#define MDB_IDL_DB_MAX (MDB_IDL_DB_SIZE - 1)
|
||||||
|
#define MDB_IDL_UM_MAX (MDB_IDL_UM_SIZE - 1)
|
||||||
|
|
||||||
|
#define MDB_IDL_SIZEOF(ids) (((ids)[0] + 1) * sizeof(MDB_ID))
|
||||||
|
#define MDB_IDL_IS_ZERO(ids) ((ids)[0] == 0)
|
||||||
|
#define MDB_IDL_CPY(dst, src) (memcpy(dst, src, MDB_IDL_SIZEOF(src)))
|
||||||
|
#define MDB_IDL_FIRST(ids) ((ids)[1])
|
||||||
|
#define MDB_IDL_LAST(ids) ((ids)[(ids)[0]])
|
||||||
|
|
||||||
|
/** Current max length of an #mdb_midl_alloc()ed IDL */
|
||||||
|
#define MDB_IDL_ALLOCLEN(ids) ((ids)[-1])
|
||||||
|
|
||||||
|
/** Append ID to IDL. The IDL must be big enough. */
|
||||||
|
#define mdb_midl_xappend(idl, id) \
|
||||||
|
do { \
|
||||||
|
MDB_ID *xidl = (idl), xlen = ++(xidl[0]); \
|
||||||
|
xidl[xlen] = (id); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
/** Search for an ID in an IDL.
|
||||||
|
* @param[in] ids The IDL to search.
|
||||||
|
* @param[in] id The ID to search for.
|
||||||
|
* @return The index of the first ID greater than or equal to \b id.
|
||||||
|
*/
|
||||||
|
unsigned mdb_midl_search(MDB_IDL ids, MDB_ID id);
|
||||||
|
|
||||||
|
/** Allocate an IDL.
|
||||||
|
* Allocates memory for an IDL of the given size.
|
||||||
|
* @return IDL on success, NULL on failure.
|
||||||
|
*/
|
||||||
|
MDB_IDL mdb_midl_alloc(int num);
|
||||||
|
|
||||||
|
/** Free an IDL.
|
||||||
|
* @param[in] ids The IDL to free.
|
||||||
|
*/
|
||||||
|
void mdb_midl_free(MDB_IDL ids);
|
||||||
|
|
||||||
|
/** Shrink an IDL.
|
||||||
|
* Return the IDL to the default size if it has grown larger.
|
||||||
|
* @param[in,out] idp Address of the IDL to shrink.
|
||||||
|
*/
|
||||||
|
void mdb_midl_shrink(MDB_IDL *idp);
|
||||||
|
|
||||||
|
/** Shrink an IDL to a specific size.
|
||||||
|
* Resize the IDL to \b size if it is larger.
|
||||||
|
* @param[in,out] idp Address of the IDL to shrink.
|
||||||
|
* @param[in] size Capacity to have once resized.
|
||||||
|
*/
|
||||||
|
void mdb_midl_shrink(MDB_IDL *idp);
|
||||||
|
|
||||||
|
/** Make room for num additional elements in an IDL.
|
||||||
|
* @param[in,out] idp Address of the IDL.
|
||||||
|
* @param[in] num Number of elements to make room for.
|
||||||
|
* @return 0 on success, ENOMEM on failure.
|
||||||
|
*/
|
||||||
|
int mdb_midl_need(MDB_IDL *idp, unsigned num);
|
||||||
|
|
||||||
|
/** Append an ID onto an IDL.
|
||||||
|
* @param[in,out] idp Address of the IDL to append to.
|
||||||
|
* @param[in] id The ID to append.
|
||||||
|
* @return 0 on success, ENOMEM if the IDL is too large.
|
||||||
|
*/
|
||||||
|
int mdb_midl_append(MDB_IDL *idp, MDB_ID id);
|
||||||
|
|
||||||
|
/** Append an IDL onto an IDL.
|
||||||
|
* @param[in,out] idp Address of the IDL to append to.
|
||||||
|
* @param[in] app The IDL to append.
|
||||||
|
* @return 0 on success, ENOMEM if the IDL is too large.
|
||||||
|
*/
|
||||||
|
int mdb_midl_append_list(MDB_IDL *idp, MDB_IDL app);
|
||||||
|
|
||||||
|
/** Append an ID range onto an IDL.
|
||||||
|
* @param[in,out] idp Address of the IDL to append to.
|
||||||
|
* @param[in] id The lowest ID to append.
|
||||||
|
* @param[in] n Number of IDs to append.
|
||||||
|
* @return 0 on success, ENOMEM if the IDL is too large.
|
||||||
|
*/
|
||||||
|
int mdb_midl_append_range(MDB_IDL *idp, MDB_ID id, unsigned n);
|
||||||
|
|
||||||
|
/** Merge an IDL onto an IDL. The destination IDL must be big enough.
|
||||||
|
* @param[in] idl The IDL to merge into.
|
||||||
|
* @param[in] merge The IDL to merge.
|
||||||
|
*/
|
||||||
|
void mdb_midl_xmerge(MDB_IDL idl, MDB_IDL merge);
|
||||||
|
|
||||||
|
/** Sort an IDL.
|
||||||
|
* @param[in,out] ids The IDL to sort.
|
||||||
|
*/
|
||||||
|
void mdb_midl_sort(MDB_IDL ids);
|
||||||
|
|
||||||
|
/* midl.c ------------------------------------------------------------------ */
|
||||||
|
/** @defgroup idls ID List Management
|
||||||
|
* @{
|
||||||
|
*/
|
||||||
|
#define CMP(x, y) ((x) < (y) ? -1 : (x) > (y))
|
||||||
|
|
||||||
|
unsigned
|
||||||
|
mdb_midl_search(MDB_IDL ids, MDB_ID id)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* binary search of id in ids
|
||||||
|
* if found, returns position of id
|
||||||
|
* if not found, returns first position greater than id
|
||||||
|
*/
|
||||||
|
unsigned base = 0;
|
||||||
|
unsigned cursor = 1;
|
||||||
|
int val = 0;
|
||||||
|
unsigned n = ids[0];
|
||||||
|
|
||||||
|
while (0 < n) {
|
||||||
|
unsigned pivot = n >> 1;
|
||||||
|
cursor = base + pivot + 1;
|
||||||
|
val = CMP(ids[cursor], id);
|
||||||
|
|
||||||
|
if (val < 0) {
|
||||||
|
n = pivot;
|
||||||
|
|
||||||
|
} else if (val > 0) {
|
||||||
|
base = cursor;
|
||||||
|
n -= pivot + 1;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
return cursor;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (val > 0) {
|
||||||
|
++cursor;
|
||||||
|
}
|
||||||
|
return cursor;
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
mdb_midl_insert(MDB_IDL ids, MDB_ID id)
|
||||||
|
{
|
||||||
|
unsigned x, i;
|
||||||
|
|
||||||
|
x = mdb_midl_search(ids, id);
|
||||||
|
assert(x > 0);
|
||||||
|
|
||||||
|
if (x < 1) {
|
||||||
|
/* internal error */
|
||||||
|
return -2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (x <= ids[0] && ids[x] == id) {
|
||||||
|
/* duplicate */
|
||||||
|
assert(0);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (++ids[0] >= MDB_IDL_DB_MAX) {
|
||||||
|
/* no room */
|
||||||
|
--ids[0];
|
||||||
|
return -2;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
/* insert id */
|
||||||
|
for (i = ids[0]; i > x; i--)
|
||||||
|
ids[i] = ids[i - 1];
|
||||||
|
ids[x] = id;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void
|
||||||
|
mdb_midl_pop_n(MDB_IDL ids, unsigned n)
|
||||||
|
{
|
||||||
|
ids[0] = ids[0] - n;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
mdb_midl_remove_at(MDB_IDL ids, unsigned idx)
|
||||||
|
{
|
||||||
|
for (int i = idx - 1; idx < ids[0] - 1;)
|
||||||
|
ids[++i] = ids[++idx];
|
||||||
|
ids[0] = ids[0] - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
mdb_midl_remove(MDB_IDL ids, MDB_ID id)
|
||||||
|
{
|
||||||
|
unsigned idx = mdb_midl_search(ids, id);
|
||||||
|
if (idx <= ids[0] && ids[idx] == id)
|
||||||
|
mdb_midl_remove_at(ids, idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
MDB_IDL
|
||||||
|
mdb_midl_alloc(int num)
|
||||||
|
{
|
||||||
|
MDB_IDL ids = malloc((num + 2) * sizeof(MDB_ID));
|
||||||
|
if (ids) {
|
||||||
|
*ids++ = num;
|
||||||
|
*ids = 0;
|
||||||
|
}
|
||||||
|
return ids;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
mdb_midl_free(MDB_IDL ids)
|
||||||
|
{
|
||||||
|
if (ids)
|
||||||
|
free(ids - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
mdb_midl_shrink(MDB_IDL *idp)
|
||||||
|
{
|
||||||
|
MDB_IDL ids = *idp;
|
||||||
|
if (*(--ids) > MDB_IDL_UM_MAX && (ids = realloc(ids, (MDB_IDL_UM_MAX + 2) * sizeof(MDB_ID)))) {
|
||||||
|
*ids++ = MDB_IDL_UM_MAX;
|
||||||
|
*idp = ids;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
mdb_midl_shrink_to(MDB_IDL *idp, size_t size)
|
||||||
|
{
|
||||||
|
MDB_IDL ids = *idp;
|
||||||
|
if (*(--ids) > size && (ids = realloc(ids, (size + 2) * sizeof(MDB_ID)))) {
|
||||||
|
*ids++ = size;
|
||||||
|
*idp = ids;
|
||||||
|
*idp[0] = *idp[0] > size ? size : *idp[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
mdb_midl_grow(MDB_IDL *idp, int num)
|
||||||
|
{
|
||||||
|
MDB_IDL idn = *idp - 1;
|
||||||
|
/* grow it */
|
||||||
|
idn = realloc(idn, (*idn + num + 2) * sizeof(MDB_ID));
|
||||||
|
if (!idn)
|
||||||
|
return ENOMEM;
|
||||||
|
*idn++ += num;
|
||||||
|
*idp = idn;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
mdb_midl_need(MDB_IDL *idp, unsigned num)
|
||||||
|
{
|
||||||
|
MDB_IDL ids = *idp;
|
||||||
|
num += ids[0];
|
||||||
|
if (num > ids[-1]) {
|
||||||
|
num = (num + num / 4 + (256 + 2)) & -256;
|
||||||
|
if (!(ids = realloc(ids - 1, num * sizeof(MDB_ID))))
|
||||||
|
return ENOMEM;
|
||||||
|
*ids++ = num - 2;
|
||||||
|
*idp = ids;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
mdb_midl_append(MDB_IDL *idp, MDB_ID id)
|
||||||
|
{
|
||||||
|
MDB_IDL ids = *idp;
|
||||||
|
/* Too big? */
|
||||||
|
if (ids[0] >= ids[-1]) {
|
||||||
|
if (mdb_midl_grow(idp, MDB_IDL_UM_MAX))
|
||||||
|
return ENOMEM;
|
||||||
|
ids = *idp;
|
||||||
|
}
|
||||||
|
ids[0]++;
|
||||||
|
ids[ids[0]] = id;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
mdb_midl_append_list(MDB_IDL *idp, MDB_IDL app)
|
||||||
|
{
|
||||||
|
MDB_IDL ids = *idp;
|
||||||
|
/* Too big? */
|
||||||
|
if (ids[0] + app[0] >= ids[-1]) {
|
||||||
|
if (mdb_midl_grow(idp, app[0]))
|
||||||
|
return ENOMEM;
|
||||||
|
ids = *idp;
|
||||||
|
}
|
||||||
|
memcpy(&ids[ids[0] + 1], &app[1], app[0] * sizeof(MDB_ID));
|
||||||
|
ids[0] += app[0];
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
mdb_midl_append_range(MDB_IDL *idp, MDB_ID id, unsigned n)
|
||||||
|
{
|
||||||
|
MDB_ID *ids = *idp, len = ids[0];
|
||||||
|
/* Too big? */
|
||||||
|
if (len + n > ids[-1]) {
|
||||||
|
if (mdb_midl_grow(idp, n | MDB_IDL_UM_MAX))
|
||||||
|
return ENOMEM;
|
||||||
|
ids = *idp;
|
||||||
|
}
|
||||||
|
ids[0] = len + n;
|
||||||
|
ids += len;
|
||||||
|
while (n)
|
||||||
|
ids[n--] = id++;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
mdb_midl_xmerge(MDB_IDL idl, MDB_IDL merge)
|
||||||
|
{
|
||||||
|
MDB_ID old_id, merge_id, i = merge[0], j = idl[0], k = i + j, total = k;
|
||||||
|
idl[0] = (MDB_ID)-1; /* delimiter for idl scan below */
|
||||||
|
old_id = idl[j];
|
||||||
|
while (i) {
|
||||||
|
merge_id = merge[i--];
|
||||||
|
for (; old_id < merge_id; old_id = idl[--j])
|
||||||
|
idl[k--] = old_id;
|
||||||
|
idl[k--] = merge_id;
|
||||||
|
}
|
||||||
|
idl[0] = total;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Quicksort + Insertion sort for small arrays */
|
||||||
|
|
||||||
|
#define SMALL 8
|
||||||
|
#define MIDL_SWAP(a, b) \
|
||||||
|
{ \
|
||||||
|
itmp = (a); \
|
||||||
|
(a) = (b); \
|
||||||
|
(b) = itmp; \
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
mdb_midl_sort(MDB_IDL ids)
|
||||||
|
{
|
||||||
|
/* Max possible depth of int-indexed tree * 2 items/level */
|
||||||
|
int istack[sizeof(int) * CHAR_BIT * 2];
|
||||||
|
int i, j, k, l, ir, jstack;
|
||||||
|
MDB_ID a, itmp;
|
||||||
|
|
||||||
|
ir = (int)ids[0];
|
||||||
|
l = 1;
|
||||||
|
jstack = 0;
|
||||||
|
for (;;) {
|
||||||
|
if (ir - l < SMALL) { /* Insertion sort */
|
||||||
|
for (j = l + 1; j <= ir; j++) {
|
||||||
|
a = ids[j];
|
||||||
|
for (i = j - 1; i >= 1; i--) {
|
||||||
|
if (ids[i] >= a)
|
||||||
|
break;
|
||||||
|
ids[i + 1] = ids[i];
|
||||||
|
}
|
||||||
|
ids[i + 1] = a;
|
||||||
|
}
|
||||||
|
if (jstack == 0)
|
||||||
|
break;
|
||||||
|
ir = istack[jstack--];
|
||||||
|
l = istack[jstack--];
|
||||||
|
} else {
|
||||||
|
k = (l + ir) >> 1; /* Choose median of left, center, right */
|
||||||
|
MIDL_SWAP(ids[k], ids[l + 1]);
|
||||||
|
if (ids[l] < ids[ir]) {
|
||||||
|
MIDL_SWAP(ids[l], ids[ir]);
|
||||||
|
}
|
||||||
|
if (ids[l + 1] < ids[ir]) {
|
||||||
|
MIDL_SWAP(ids[l + 1], ids[ir]);
|
||||||
|
}
|
||||||
|
if (ids[l] < ids[l + 1]) {
|
||||||
|
MIDL_SWAP(ids[l], ids[l + 1]);
|
||||||
|
}
|
||||||
|
i = l + 1;
|
||||||
|
j = ir;
|
||||||
|
a = ids[l + 1];
|
||||||
|
for (;;) {
|
||||||
|
do
|
||||||
|
i++;
|
||||||
|
while (ids[i] > a);
|
||||||
|
do
|
||||||
|
j--;
|
||||||
|
while (ids[j] < a);
|
||||||
|
if (j < i)
|
||||||
|
break;
|
||||||
|
MIDL_SWAP(ids[i], ids[j]);
|
||||||
|
}
|
||||||
|
ids[l + 1] = ids[j];
|
||||||
|
ids[j] = a;
|
||||||
|
jstack += 2;
|
||||||
|
if (ir - i + 1 >= j - l) {
|
||||||
|
istack[jstack] = ir;
|
||||||
|
istack[jstack - 1] = i;
|
||||||
|
ir = j - 1;
|
||||||
|
} else {
|
||||||
|
istack[jstack] = j - 1;
|
||||||
|
istack[jstack - 1] = l;
|
||||||
|
l = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
1485
tests/soak.c
Normal file
1485
tests/soak.c
Normal file
File diff suppressed because it is too large
Load diff
1252
tests/test.c
1252
tests/test.c
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue