Compare commits

...

61 commits

Author SHA1 Message Date
6c8ad3b25f shrink the buffer as chunks empty (#12)
Reviewed-on: #12
Co-authored-by: Greg Burd <greg@burd.me>
Co-committed-by: Greg Burd <greg@burd.me>
2024-07-02 13:07:25 +00:00
2f6e9bdf7b Merge pull request 'fuzz testing' (#11) from gburd/fixing-the-fuzz into main
Reviewed-on: #11
2024-06-27 08:13:15 +00:00
7335c8e1d8 find a span of unset bits in an empty map 2024-06-27 04:10:08 -04:00
f7fd8eed9a spelling 2024-05-23 19:59:24 -04:00
7ecd2e5dc2 cleanup 2024-05-21 11:59:07 -04:00
eae0743b56 rewrite soak with more flexibility and ability to record/playback events to reproduce bugs (#10)
Reviewed-on: #10
2024-05-21 00:29:26 +00:00
68c8cd0858 cmake 2024-05-16 22:13:27 +00:00
e398d014e8 fixes 2024-05-16 12:00:09 -04:00
a5c62cfe1e Add CMake build 2024-05-16 10:38:33 -04:00
7bb26dbe88 vector offset, fix span 2024-05-15 22:03:36 -04:00
69dd960558 use chunk alignment; new fill factor API 2024-05-15 15:50:15 -04:00
b028408150 split/merge (#9)
Reviewed-on: #9
Co-authored-by: Greg Burd <greg@burd.me>
Co-committed-by: Greg Burd <greg@burd.me>
2024-05-15 17:57:40 +00:00
f5b500087d split 2024-05-13 12:46:25 -04:00
604d48617e revert split and merge for now 2024-05-13 02:07:45 +00:00
7a572453c9 Merge pull request 'gburd/full-merge' (#8) from gburd/full-merge into main
Reviewed-on: #8
2024-05-11 01:26:44 +00:00
6990f94278 Merge branch 'main' into gburd/full-merge 2024-05-11 01:26:19 +00:00
984a1a920d working 2024-05-11 01:25:15 +00:00
e641e6cc63 WIP 2024-05-10 20:26:40 +00:00
32721da645 disable neovim for a bit 2024-05-10 20:26:40 +00:00
cea31ab184 disable neovim for a bit 2024-05-10 15:32:50 +00:00
f525a097c7 WIP 2024-05-10 11:27:43 -04:00
Greg Burd
a24a4e1cc5
fixes 2024-05-10 09:40:05 -04:00
08e8c0a5d1 chunk align 2024-05-09 16:01:28 -04:00
7494a9a3a2 many new API; fixes 2024-05-09 15:50:56 -04:00
0e348efaf6 a few more fixes 2024-05-07 08:47:56 -04:00
367d15a160 integrate crupp review suggestions 2024-05-06 15:43:47 -04:00
3b8e083806 add copy() 2024-05-05 16:46:07 -04:00
c53bd53f9a try using env to force ssh key 2024-05-04 20:23:52 +00:00
a2b1a22a79 fix 2024-05-04 09:45:43 -04:00
1909954b42 merge amt 2024-05-04 09:44:46 -04:00
cbd53976f9 merge when out of space 2024-05-04 09:39:19 -04:00
40de89bd8a fixes 2024-05-03 22:15:36 +00:00
756476561f updates 2024-05-03 22:14:11 +00:00
4e2b4bde26 spelling 2024-05-03 21:12:57 +00:00
9dccdcbf76 spelling 2024-05-03 21:00:30 +00:00
ad910f0adf add clang-format, etc. 2024-05-03 20:56:14 +00:00
7c9ecc6d79 fixes to ide 2024-05-03 20:47:41 +00:00
gregburd
ea32274524 add Google IDX config 2024-05-03 20:32:14 +00:00
4bf1a5a00c grow map when merge requires it 2024-05-03 16:07:46 -04:00
2afbfa946e fixups 2024-05-03 16:03:09 -04:00
ffc762a796 cleanup 2024-05-03 15:24:57 -04:00
fcab709f62 roaring and sparsemap should identify the same spans 2024-05-03 15:20:35 -04:00
b9612f12cc compare against roaring bitmaps 2024-05-03 15:15:39 -04:00
Greg Burd
57a8f99a32
fixes 2024-05-02 21:35:37 -04:00
a7754b05ba cleanup 2024-05-02 21:13:17 -04:00
86798b32bd formatting 2024-05-02 14:57:56 -04:00
c9ae042b40 fix scan 2024-05-02 14:55:04 -04:00
Greg Burd
6b6ed5290f
fixes 2024-05-02 08:55:38 -04:00
897e1af9df disable shrinking in soak 2024-05-02 08:46:35 -04:00
7192c7dcff fixups 2024-05-01 09:16:50 -04:00
Greg Burd
c65fcebaad
macOS/M2 fixes 2024-05-01 09:15:22 -04:00
ee695b7243 cleanup 2024-04-30 14:40:23 -04:00
b37d390e12 fix merge/soak 2024-04-30 13:58:35 -04:00
94fbf768c0 fixes 2024-04-30 11:25:03 -04:00
2dac3ed385 adding a merge map function (#6)
Reviewed-on: #6
Co-authored-by: Greg Burd <greg@burd.me>
Co-committed-by: Greg Burd <greg@burd.me>
2024-04-30 15:20:48 +00:00
4ebe555fac docs 2024-04-29 12:10:21 -04:00
ef7ac48e32 fix header 2024-04-28 16:40:58 -04:00
faea4510dc buffer when not debugging 2024-04-28 12:28:58 -04:00
c742185b73 fixes 2024-04-28 12:26:31 -04:00
0297757856 adding soak test (#5)
Reviewed-on: #5
Co-authored-by: Greg Burd <greg@burd.me>
Co-committed-by: Greg Burd <greg@burd.me>
2024-04-26 20:25:17 +00:00
b3dfd745e7 select/rank for unset as well as set bits (#4)
Reviewed-on: #4
Co-authored-by: Greg Burd <greg@burd.me>
Co-committed-by: Greg Burd <greg@burd.me>
2024-04-24 20:32:09 +00:00
29 changed files with 34769 additions and 993 deletions

View file

@ -3,8 +3,11 @@ Checks: >
bugprone-*, bugprone-*,
clang-analyzer-*, clang-analyzer-*,
google-*, google-*,
misc-*, -google-objectivec-*,
modernize-*, modernize-*,
-modernize-deprecated-headers,
-modernize-use-using,
misc-*,
performance-*, performance-*,
portability-*, portability-*,
-bugprone-branch-clone, -bugprone-branch-clone,

2
.envrc
View file

@ -1,5 +1,5 @@
if ! has nix_direnv_version || ! nix_direnv_version 3.0.4; then if ! has nix_direnv_version || ! nix_direnv_version 3.0.4; then
source_url "https://raw.githubusercontent.com/nix-community/nix-direnv/3.0.4/direnvrc" "sha256-DzlYZ33mWF/Gs8DDeyjr8mnVmQGx7ASYqA5WlxwvBG4=" source_url "https://raw.githubusercontent.com/nix-community/nix-direnv/3.0.4/direnvrc" "sha256-DzlYZ33mWF/Gs8DDeyjr8mnVmQGx7ASYqA5WlxwvBG4="
fi fi
watch_file devShell.nix shell.nix flake.nix watch_file shell.nix flake.nix
use flake || use nix use flake || use nix

2
.gitignore vendored
View file

@ -3,6 +3,7 @@
**/*.o **/*.o
tests/test tests/test
examples/ex_? examples/ex_?
tests/soak
.cache .cache
hints.txt hints.txt
tmp/ tmp/
@ -27,6 +28,7 @@ compile_commands.json
*.dat *.dat
*.fsm *.fsm
*.db *.db
.vscode/
# Created by https://www.gitignore.io/api/jetbrains # Created by https://www.gitignore.io/api/jetbrains
# Edit at https://www.gitignore.io/?templates=jetbrains # Edit at https://www.gitignore.io/?templates=jetbrains

15
.idea/customTargets.xml Normal file
View file

@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CLionExternalBuildManager">
<target id="db0ccaeb-4851-470b-83d0-afa663f6ceb9" name="tests/soak" defaultType="MAKE">
<configuration id="98973a90-a9d0-431b-9071-9ce6960b0b01" name="tests/soak">
<build type="MAKE">
<make targetName="tests/soak" />
</build>
<clean type="MAKE">
<make targetName="clean" />
</clean>
</configuration>
</target>
</component>
</project>

25
.idea/makefile.xml Normal file
View file

@ -0,0 +1,25 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="MakefileBuildTargetsManager">
<user-build-targets>
<build-target name="tests/soak">
<build-configurations>
<build-configuration>
<make-targets>
<make-target>tests/soak</make-target>
</make-targets>
</build-configuration>
</build-configurations>
</build-target>
<build-target name="clean">
<build-configurations>
<build-configuration>
<make-targets>
<make-target>clean</make-target>
</make-targets>
</build-configuration>
</build-configurations>
</build-target>
</user-build-targets>
</component>
</project>

View file

@ -8,6 +8,7 @@
<sourceRoots> <sourceRoots>
<file path="$PROJECT_DIR$/examples" /> <file path="$PROJECT_DIR$/examples" />
<file path="$PROJECT_DIR$/include" /> <file path="$PROJECT_DIR$/include" />
<file path="$PROJECT_DIR$/lib" />
<file path="$PROJECT_DIR$/src" /> <file path="$PROJECT_DIR$/src" />
<file path="$PROJECT_DIR$/tests" /> <file path="$PROJECT_DIR$/tests" />
</sourceRoots> </sourceRoots>

105
.idx/dev.nix Normal file
View file

@ -0,0 +1,105 @@
# To learn more about how to use Nix to configure your environment
# see: https://developers.google.com/idx/guides/customize-idx-env
{ pkgs, ... }: {
# Which nixpkgs channel to use.
channel = "stable-23.11"; # or "unstable"
# Use https://search.nixos.org/packages to find packages
packages = with pkgs; [
act
autoconf
clang
clang-tools
cmake
ed
fira-code-nerdfont
gcc
gdb
gettext
glibc.out
glibc.static
gnumake
graphviz-nox
libbacktrace
libtool
lldb
m4
neovim
ninja
openssh
perl
pkg-config
python3
ripgrep
# pkgs.python311
# pkgs.python311Packages.pip
];
# Sets environment variables in the workspace
env = {
GIT_SSH_COMMAND="ssh -i ~/.ssh/id_ed25519 -F /dev/null";
};
idx = {
# Search for the extensions you want on https://open-vsx.org/ and use "publisher.id"
extensions = [
"coolbear.systemd-unit-file"
"dotjoshjohnson.xml"
"editorconfig.editorconfig"
"esbenp.prettier-vscode"
"mads-hartmann.bash-ide-vscode"
"ms-python.python"
"ms-vscode.clangd"
"ms-vscode.cmake-tools"
"ms-vscode.cpptools"
"ms-vscode.cpptools-extension-pack"
"ms-vscode.makefile-tools"
"ms-vsliveshare.vsliveshare"
"redhat.vscode-yaml"
"rogalmic.bash-debug"
"ryu1kn.partial-diff"
"streetsidesoftware.code-spell-checker"
"timonwong.shellcheck"
"twxs.cmake"
"vadimcn.vscode-lldb"
#"vscode-icons-team.vscode-icons"
"yzhang.markdown-all-in-one"
"znck.grammarly"
#"llvm-vs-code-extensions.vscode-clangd"
#"eamodio.gitlens"
"asvetliakov.vscode-neovim"
#"golang.go"
#"jnoortheen.nix-ide"
#"ms-python.vscode-pylance"
#"mspython.debugpy"
#"scala-lang.scala"
#"scalameta.metals"
#"vscodevim.vim"
];
# Enable previews
previews = {
enable = true;
previews = {
# web = {
# # Example: run "npm run dev" with PORT set to IDX's defined port for previews,
# # and show it in IDX's web preview panel
# command = ["npm" "run" "dev"];
# manager = "web";
# env = {
# # Environment variables to set for your server
# PORT = "$PORT";
# };
# };
};
};
# Workspace lifecycle hooks
workspace = {
# Runs when a workspace is first created
onCreate = {
# Example: install JS dependencies from NPM
# npm-install = 'npm install';
};
onStart = {
# Example: start a background task to watch and re-build backend code
# watch-backend = "npm run watch-backend";
};
};
};
}

94
CMakeLists.txt Normal file
View file

@ -0,0 +1,94 @@
cmake_minimum_required(VERSION 3.27)
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR)
message(FATAL_ERROR "Do not build in-source. Please remove CMakeCache.txt and the CMakeFiles/ directory. Then build out-of-source.")
endif()
project(sparsemap LANGUAGES C)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_C_OUTPUT_EXTENSION .o)
# Set source and header file locations
set(SOURCE_DIR src)
set(HEADER_DIR include)
set(COMMON_CMAKE_C_FLAGS "-Wall -Wextra -Wpedantic")
set(CMAKE_C_FLAGS_DEBUG "-DSPARSEMAP_DIAGNOSTIC -DDEBUG -g -O0")
set(CMAKE_C_FLAGS_PROFILE "-DSPARSEMAP_DIAGNOSTIC -DDEBUG -g -Og -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope")
set(CMAKE_C_FLAGS_RELEASE "-Ofast")
# Include all header files from the header directory
file(GLOB_RECURSE HEADERS CONFIGURE_FILES ${HEADER_DIR}/*.h)
# Configure library sources
set(LIB_SRC
${SOURCE_DIR}/sparsemap.c
)
# Option to control building shared/static libraries
option(BUILD_SHARED_LIBS "Build shared libraries" ON)
# Add shared library
add_library(sparsemap_SHARED SHARED ${LIB_SRC} ${HEADERS})
# Set target properties for the shared library (adjust if needed)
set_target_properties(sparsemap_SHARED PROPERTIES
VERSION 1.0.0 # Set library version
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" # Set output directory
COMPILE_FLAGS "${CMAKE_C_FLAGS_${CMAKE_CURRENT_LIST_MODE}}"
)
target_include_directories(sparsemap_SHARED PRIVATE ${HEADER_DIR})
# Add static library
add_library(sparsemap STATIC ${LIB_SRC} ${HEADERS})
# Set target properties for static library (adjust if needed)
set_target_properties(sparsemap PROPERTIES
OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" # Set output directory
)
target_include_directories(sparsemap PRIVATE ${HEADER_DIR})
# Add ex_1 program
add_executable(ex_1 examples/ex_1.c tests/munit.c lib/common.c)
target_link_libraries(ex_1 PRIVATE sparsemap)
target_include_directories(ex_1 PRIVATE ${HEADER_DIR})
add_custom_target(run_ex_1 COMMAND ex_1 WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
# Add ex_2 program
add_executable(ex_2 examples/ex_2.c tests/munit.c lib/common.c)
target_link_libraries(ex_2 PRIVATE sparsemap)
target_include_directories(ex_2 PRIVATE ${HEADER_DIR})
add_custom_target(run_ex_2 COMMAND ex_2 WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
# Add ex_3 program
add_executable(ex_3 examples/ex_3.c tests/munit.c lib/common.c)
target_link_libraries(ex_3 PRIVATE sparsemap)
target_include_directories(ex_3 PRIVATE ${HEADER_DIR})
add_custom_target(run_ex_3 COMMAND ex_3 WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
# Add ex_4 program
add_executable(ex_4 examples/ex_4.c tests/munit.c lib/common.c)
target_link_libraries(ex_4 PRIVATE sparsemap)
target_include_directories(ex_4 PRIVATE ${HEADER_DIR})
add_custom_target(run_ex_4 COMMAND ex_4 WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
# Add test program
add_executable(test tests/test.c tests/munit.c lib/common.c)
target_link_libraries(test PRIVATE sparsemap)
target_include_directories(test PRIVATE ${HEADER_DIR})
add_custom_target(run_test COMMAND test WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
# Add soak program
add_executable(soak tests/soak.c lib/common.c lib/tdigest.c lib/roaring.c)
target_link_libraries(soak PRIVATE sparsemap)
target_include_directories(soak PRIVATE ${HEADER_DIR} lib)
target_link_libraries(soak PUBLIC m)
add_custom_target(run_soak COMMAND soak WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
# Add fuzzer program
# add_executable(fuzzer tests/fuzzer.c)
# target_link_libraries(fuzzer PRIVATE sparsemap)
# target_include_directories(fuzzer PRIVATE ${HEADER_DIR} lib)
# target_link_libraries(fuzzer PUBLIC m)
# add_custom_target(run_fuzzer COMMAND fuzzer WORKING_DIRECTORY ${CMAKE_BINARY_DIR})

View file

@ -3,16 +3,24 @@ OBJS = sparsemap.o
STATIC_LIB = libsparsemap.a STATIC_LIB = libsparsemap.a
SHARED_LIB = libsparsemap.so SHARED_LIB = libsparsemap.so
LIBS = -lm
#CFLAGS = -Wall -Wextra -Wpedantic -Of -std=c11 -Iinclude/ -fPIC #CFLAGS = -Wall -Wextra -Wpedantic -Of -std=c11 -Iinclude/ -fPIC
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC #CFLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DSPARSEMAP_ASSERT -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC #CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -O0 -g -std=c11 -Iinclude/ -fPIC
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Ofast -g -std=c11 -Iinclude/ -fPIC
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
#CFLAGS = -Wall -Wextra -Wpedantic -Ofast -g -std=c11 -Iinclude/ -fPIC
#CFLAGS = -DSPARSEMAP_DIAGNOSTIC -DDEBUG -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC
#CFLAGS = -Wall -Wextra -Wpedantic -Og -g -fsanitize=all -fhardened -std=c11 -Iinclude/ -fPIC #CFLAGS = -Wall -Wextra -Wpedantic -Og -g -fsanitize=all -fhardened -std=c11 -Iinclude/ -fPIC
TEST_FLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -Itests/ -fPIC #TEST_FLAGS = -DDEBUG -Wall -Wextra -Wpedantic -O0 -g -std=c11 -Iinclude/ -Itests/ -fPIC
TEST_FLAGS = -Wall -Wextra -Wpedantic -Ofast -g -std=c11 -Iinclude/ -Itests/ -fPIC
#TEST_FLAGS = -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -Itests/ -fPIC
#TEST_FLAGS = -DDEBUG -Wall -Wextra -Wpedantic -Og -g -fsanitize=address,leak,object-size,pointer-compare,pointer-subtract,null,return,bounds,pointer-overflow,undefined -fsanitize-address-use-after-scope -std=c11 -Iinclude/ -fPIC
TESTS = tests/test TESTS = tests/test tests/soak
TEST_OBJS = tests/test.o tests/munit.o tests/common.o TEST_OBJS = tests/test.o lib/munit.o lib/tdigest.o lib/common.o
LIB_OBJS = lib/munit.o lib/tdigest.o lib/common.o lib/roaring.o
EXAMPLES = examples/ex_1 examples/ex_2 examples/ex_3 examples/ex_4 EXAMPLES = examples/ex_1 examples/ex_2 examples/ex_3 examples/ex_4
.PHONY: all shared static clean test examples mls .PHONY: all shared static clean test examples mls
@ -29,17 +37,23 @@ $(STATIC_LIB): $(OBJS)
$(SHARED_LIB): $(OBJS) $(SHARED_LIB): $(OBJS)
$(CC) $(CFLAGS) -o $@ $? -shared $(CC) $(CFLAGS) -o $@ $? -shared
examples: $(STATIC_LIB) $(EXAMPLES) examples/common.o examples: $(STATIC_LIB) $(EXAMPLES) $(TEST_OBJS)
mls: examples/mls mls: examples/mls
test: $(TESTS) tests: $(TESTS)
check: test test: tests
env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/test env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/test
tests/test: $(TEST_OBJS) $(STATIC_LIB) soak: tests
$(CC) $^ -o $@ $(TEST_FLAGS) env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/soak
fuzzer: tests
env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/fuzzer ./crash.case
tests/test: $(TEST_OBJS) $(LIB_OBJS) $(STATIC_LIB)
$(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS)
clean: clean:
rm -f $(OBJS) rm -f $(OBJS)
@ -49,31 +63,44 @@ clean:
rm -f $(EXAMPLES) examples/*.o rm -f $(EXAMPLES) examples/*.o
format: format:
clang-format -i src/sparsemap.c include/sparsemap.h examples/ex_*.c tests/test.c tests/common.c tests/common.h clang-format -i src/sparsemap.c include/sparsemap.h examples/ex_*.c tests/soak.c tests/test.c tests/midl.c lib/common.c include/common.h
# clang-format -i include/*.h src/*.c tests/*.c tests/*.h examples/*.c # clang-format -i include/*.h src/*.c tests/*.c tests/*.h examples/*.c
%.o: src/%.c %.o: src/%.c
$(CC) $(CFLAGS) -c -o $@ $^ $(CC) $(CFLAGS) -c -o $@ $^
lib/%.o: tests/%.c
$(CC) $(CFLAGS) -c -o $@ $^
tests/%.o: tests/%.c tests/%.o: tests/%.c
$(CC) $(CFLAGS) -c -o $@ $^ $(CC) $(CFLAGS) -c -o $@ $^
examples/%.o: examples/%.c examples/%.o: examples/%.c
$(CC) $(CFLAGS) -c -o $@ $^ $(CC) $(CFLAGS) -c -o $@ $^
examples/common.o: tests/common.c examples/ex_1: $(LIB_OBJS) examples/ex_1.o $(STATIC_LIB)
$(CC) $(CFLAGS) -c -o $@ $^ $(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS)
examples/ex_1: examples/common.o examples/ex_1.o $(STATIC_LIB) examples/ex_2: $(LIB_OBJS) examples/ex_2.o $(STATIC_LIB)
$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS) $(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS)
examples/ex_2: examples/common.o examples/ex_2.o $(STATIC_LIB) examples/ex_3: $(LIB_OBJS) examples/ex_3.o $(STATIC_LIB)
$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS) $(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS)
examples/ex_3: examples/common.o examples/ex_3.o $(STATIC_LIB) examples/ex_4: $(LIB_OBJS) examples/ex_4.o $(STATIC_LIB)
$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS) $(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS)
examples/ex_4: examples/common.o examples/ex_4.o $(STATIC_LIB) tests/soak: $(LIB_OBJS) tests/soak.o $(STATIC_LIB)
$(CC) $^ -o $@ $(CFLAGS) $(TEST_FLAGS) $(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS)
tests/fuzzer: $(LIB_OBJS) tests/fuzzer.o $(STATIC_LIB)
$(CC) $^ $(LIBS) -o $@ $(TEST_FLAGS) -DFUZZ_DEBUG
todo:
rg -i 'todo|gsb|abort'
# cp src/sparsemap.c /tmp && clang-tidy src/sparsemap.c -fix -fix-errors -checks="readability-braces-around-statements" -- -DDEBUG -DSPARSEMAP_DIAGNOSTIC -DSPARSEMAP_ASSERT -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC # cp src/sparsemap.c /tmp && clang-tidy src/sparsemap.c -fix -fix-errors -checks="readability-braces-around-statements" -- -DDEBUG -DSPARSEMAP_DIAGNOSTIC -DSPARSEMAP_ASSERT -Wall -Wextra -Wpedantic -Og -g -std=c11 -Iinclude/ -fPIC
# clear; make clean examples test && env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./tests/test
# clear; make clean examples test && env ASAN_OPTIONS=detect_leaks=1 LSAN_OPTIONS=verbosity=1:log_threads=1 ./examples/soak

View file

@ -1,8 +1,12 @@
# Sparsemap # Sparsemap
`sparsemap` is a sparse, compressed bitmap. In best case, it can store 2048 Bitsets, also called bitmaps, are commonly used as fast data structures.
bits in just 8 bytes. In worst case, it stores the 2048 bits uncompressed and Unfortunately, they can use too much memory. To compensate, we often use
requires additional 8 bytes of overhead. compressed bitmaps.
`sparsemap` is a sparse, compressed bitmap. In the best case, it can store 2048
bits in just 8 bytes. In the worst case, it stores the 2048 bits uncompressed and
requires an additional 8 bytes of overhead.
The "best" case happens when large consecutive sequences of the bits are The "best" case happens when large consecutive sequences of the bits are
either set ("1") or not set ("0"). If your numbers are consecutive 64bit either set ("1") or not set ("0"). If your numbers are consecutive 64bit
@ -14,9 +18,9 @@ On the lowest level stores bits in sm_bitvec_t's (a uint32_t or uint64_t).
Each sm_bitvec_t has an additional descriptor (2 bits). A single word prepended Each sm_bitvec_t has an additional descriptor (2 bits). A single word prepended
to each sm_bitvec_t describes its condition. The descriptor word and the to each sm_bitvec_t describes its condition. The descriptor word and the
sm_bitvec_t's have the same size.) The descriptor of a sm_bitvec_t sm_bitvec_t's have the same size. The descriptor of a sm_bitvec_t
specifies whether the sm_bitvec_t consists only of set bits ("1"), unset specifies whether the sm_bitvec_t consists only of set bits ("1"), unset
bits ("0") or has a mixed payload. In the first and second case the bits ("0") or has a mixed payload. In the first and second cases, the
sm_bitvec_t is not stored. sm_bitvec_t is not stored.
An example shows a sequence of 4 x 16 bits (here, each sm_bitvec_t and the An example shows a sequence of 4 x 16 bits (here, each sm_bitvec_t and the
@ -27,7 +31,7 @@ Descriptor word has 16 bits):
^^ ^^ ^^ ^^-- sm_bitvec_t #0 - #3 are "0000000000000000" ^^ ^^ ^^ ^^-- sm_bitvec_t #0 - #3 are "0000000000000000"
^^-- sm_bitvec_t #4 is "1111111111111111" ^^-- sm_bitvec_t #4 is "1111111111111111"
^^-- sm_bitvec_t #5 is "0000000000000000" ^^-- sm_bitvec_t #5 is "0000000000000000"
^^-- sm_bitvec_t #7 is "1111111111111111" ^^-- sm_bitvec_t #6 is "1111111111111111"
^^-- sm_bitvec_t #7 is "0110010101111001" ^^-- sm_bitvec_t #7 is "0110010101111001"
Since the first 7 sm_bitvec_t's are either all "1" or "0" they are not stored. Since the first 7 sm_bitvec_t's are either all "1" or "0" they are not stored.
@ -36,29 +40,34 @@ The actual memory sequence looks like this:
0000000011001110 0110010101111001 0000000011001110 0110010101111001
Instead of storing 8 Words (16 bytes), we only store 2 Words (2 bytes): one Instead of storing 8 Words (16 bytes), we only store 2 Words (2 bytes): one
for the descriptor, one for last sm_bitvec_t #7. for the descriptor, and one for the last sm_bitvec_t #7.
The sparsemap stores a list of chunk maps, and for each chunk map it stores the The sparsemap stores a list of chunk maps, and for each chunk map, it stores the
absolute address (i.e. if the user sets bit 0 and bit 10000, and the chunk map absolute address (i.e. if the user sets bit 0 and bit 10000, and the chunk map
capacity is 2048, the sparsemap creates two chunk maps; the first starts at capacity is 2048, the sparsemap creates two chunk maps; the first starts at
offset 0, the second starts at offset 8192). offset 0, the second starts at offset 8192).
## Usage instructions ## Usage instructions
The file `examples/ex_1.c` has example code. Copy the files `src/sparsemap.c` and `include/sparsemap.h` into your project.
Review the `examples/*` and `tests/*` code.
## Final words ## Final words
This bitmap has efficient compression when used on long sequences of set (or This bitmap has efficient compression when used on long sequences of set (or
unset) bits (i.e. with a word size of 64bit, and a payload of consecutive unset) bits (i.e. with a word size of 64 bit and a payload of consecutive
numbers without gaps, the payload of 2048 x sizeof(uint64_t) = 16kb will occupy numbers without gaps, the payload of 2048 x sizeof(uint64_t) = 16kb will occupy
only 8 bytes! only 8 bytes!).
However, if the sequence is not consecutive and has gaps, it's possible that However, if the sequence is not consecutive and has gaps, it's possible that
the compression is inefficient, and the size (in the worst case) is identical the compression is inefficient, and the size (in the worst case) is identical
to an uncompressed bit vector (sometimes higher due to the bytes required for to an uncompressed bit vector (sometimes higher due to the bytes required for
metadata). In such cases, other compression schemes are more efficient (i.e. metadata). In such cases, other compression schemes are more efficient (i.e.
http://lemire.me/blog/archives/2008/08/20/the-mythical-bitmap-index/). http://lemire.me/blog/archives/2008/08/20/the-mythical-bitmap-index/). We
include in `lib` the amalgamated (git `2dc8070`) and well-known
[Roaring Bitmaps](https://github.com/RoaringBitmap/CRoaring/tree/master) and
use it in the soak test to ensure our results are as accurate as theirs.
This library was originally created for hamsterdb [http://hamsterdb.com] in This library was originally created by [Christoph Rupp](https://crupp.de) in
C++ and then translated to C99 code by Greg Burd <greg@burd.me>. C++ and then translated to C and further improved by Greg Burd <greg@burd.me>
for use in LMDB and OpenLDAP.

14
cmake-it.sh Executable file
View file

@ -0,0 +1,14 @@
#!/usr/bin/env bash
target=${1:-Debug}
set targets="Debug Profile Release"
case "$target" in
$targets*) echo "Building ${target}..." ;;
*) echo "Unknown target ${target}, exiting." ;;
esac
name=${target,,}
echo $name
rm -rf "./cmake-build-${name}-system" && \
cmake -DCMAKE_BUILD_TYPE=${target} -DCMAKE_MAKE_PROGRAM=ninja -DCMAKE_C_COMPILER=clang -G Ninja -S "${PWD}" -B "${PWD}/cmake-build-${name}-system" && \
(cd "${PWD}/cmake-build-${name}-system" && ninja)

View file

@ -1,4 +1,7 @@
#include <assert.h> #include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include "../include/sparsemap.h" #include "../include/sparsemap.h"
@ -14,16 +17,18 @@
/* !!! Duplicated here for testing purposes. Keep in sync, or suffer. !!! */ /* !!! Duplicated here for testing purposes. Keep in sync, or suffer. !!! */
struct sparsemap { struct sparsemap {
uint8_t *m_data;
size_t m_capacity; size_t m_capacity;
size_t m_data_used; size_t m_data_used;
uint8_t *m_data;
}; };
int int
main() main()
{ {
size_t size = 4; size_t size = 4;
setbuf(stderr, 0); // disable buffering setvbuf(stdout, NULL, _IONBF, 0); // Disable buffering for stdout
setvbuf(stderr, NULL, _IONBF, 0); // Disable buffering for stdout
__diag("Please wait a moment..."); __diag("Please wait a moment...");
sparsemap_t mmap, *map = &mmap; sparsemap_t mmap, *map = &mmap;
uint8_t buffer[1024]; uint8_t buffer[1024];
@ -135,7 +140,7 @@ main()
sparsemap_set(map, i, true); sparsemap_set(map, i, true);
} }
for (int i = 0; i < 100000; i++) { for (int i = 0; i < 100000; i++) {
assert(sparsemap_select(map, i) == (unsigned)i); assert(sparsemap_select(map, i, true) == (unsigned)i);
} }
sparsemap_clear(map); sparsemap_clear(map);
@ -145,7 +150,7 @@ main()
sparsemap_set(map, i, true); sparsemap_set(map, i, true);
} }
for (int i = 1; i < 513; i++) { for (int i = 1; i < 513; i++) {
assert(sparsemap_select(map, i - 1) == (unsigned)i); assert(sparsemap_select(map, i - 1, true) == (unsigned)i);
} }
sparsemap_clear(map); sparsemap_clear(map);
@ -155,7 +160,7 @@ main()
sparsemap_set(map, i * 10, true); sparsemap_set(map, i * 10, true);
} }
for (size_t i = 0; i < 8; i++) { for (size_t i = 0; i < 8; i++) {
assert(sparsemap_select(map, i) == i * 10); assert(sparsemap_select(map, i, true) == (sparsemap_idx_t)i * 10);
} }
// split and move, aligned to MiniMap capacity // split and move, aligned to MiniMap capacity
@ -181,16 +186,18 @@ main()
sparsemap_clear(map); sparsemap_clear(map);
for (int i = 0; i < 2048 * 3; i++) { for (int i = 0; i < 2048 * 3; i++) {
sparsemap_set(map, i, true); sparsemap_set(map, i, true);
assert(sparsemap_is_set(map, i) == true);
} }
sparsemap_split(map, 64, sm2); sparsemap_split(map, 64, sm2);
for (int i = 0; i < 64; i++) { for (int i = 0; i < 2048 * 3; i++) {
if (i < 64) {
assert(sparsemap_is_set(map, i) == true); assert(sparsemap_is_set(map, i) == true);
assert(sparsemap_is_set(sm2, i) == false); assert(sparsemap_is_set(sm2, i) == false);
} } else {
for (int i = 64; i < 2048 * 3; i++) {
assert(sparsemap_is_set(map, i) == false); assert(sparsemap_is_set(map, i) == false);
assert(sparsemap_is_set(sm2, i) == true); assert(sparsemap_is_set(sm2, i) == true);
} }
}
fprintf(stderr, " ok\n"); fprintf(stderr, " ok\n");
} }

View file

@ -1,9 +1,7 @@
#include <assert.h> #include <assert.h>
#include <stdarg.h> #include <stdbool.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include "../include/sparsemap.h" #include "../include/sparsemap.h"
@ -16,32 +14,31 @@
} while (0) } while (0)
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
#define SEED
int int
main(void) main(void)
{ {
int i = 0; int i;
// disable buffering // disable buffering
setbuf(stderr, 0); setvbuf(stdout, NULL, _IONBF, 0); // Disable buffering for stdout
setvbuf(stderr, NULL, _IONBF, 0); // Disable buffering for stdout
// start with a 1KiB buffer, 1024 bits // start with a 1KiB buffer, 1024 bits
uint8_t *buf = calloc(1024, sizeof(uint8_t)); uint8_t *buf = calloc(1024, sizeof(uint8_t));
// create the sparse bitmap // create the sparse bitmap
sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 1024); sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 1024);
// Set every other bit (pathologically worst case) to see what happens // Set every other bit (pathologically worst case) to see what happens
// when the map is full. // when the map is full.
for (i = 0; i < 7744; i++) { for (i = 0; i < 7744; i++) {
if (i % 2) if (!i % 2) {
continue;
sparsemap_set(map, i, true); sparsemap_set(map, i, true);
assert(sparsemap_is_set(map, i) == true); assert(sparsemap_is_set(map, i) == true);
} }
}
// On 1024 KiB of buffer with every other bit set the map holds 7744 bits // On 1024 KiB of buffer with every other bit set the map holds 7744 bits
// and then runs out of space. This next _set() call will fail/abort. // and then runs out of space. This next _set() call will fail.
sparsemap_set(map, ++i, true); sparsemap_set(map, ++i, true);
assert(sparsemap_is_set(map, i) == true); assert(sparsemap_is_set(map, i) == true);
return 0; return 0;

View file

@ -1,17 +1,14 @@
#include <assert.h> #include <assert.h>
#include <stdarg.h> #include <common.h>
#include <sparsemap.h>
#include <stdbool.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include "../include/sparsemap.h"
#include "../tests/common.h"
int int
main(void) main(void)
{ {
int i = 0; int i;
int array[1024] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, int array[1024] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
@ -60,7 +57,7 @@ main(void)
uint8_t *buf = calloc(1024, sizeof(uint8_t)); uint8_t *buf = calloc(1024, sizeof(uint8_t));
// create the sparse bitmap // create the sparse bitmap
sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 1024); sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 1024);
// set all the bits on in a random order // set all the bits on in a random order
for (i = 0; i < 1024; i++) { for (i = 0; i < 1024; i++) {

View file

@ -1,11 +1,10 @@
#include <assert.h> #include <assert.h>
#include <common.h>
#include <sparsemap.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <unistd.h> #include <unistd.h>
#include "../include/sparsemap.h"
#include "../tests/common.h"
#define TEST_ARRAY_SIZE 1024 #define TEST_ARRAY_SIZE 1024
int int
@ -24,7 +23,7 @@ main(void)
uint8_t *buf = calloc((size_t)3 * 1024, sizeof(uint8_t)); uint8_t *buf = calloc((size_t)3 * 1024, sizeof(uint8_t));
// create the sparse bitmap // create the sparse bitmap
sparsemap_t *map = sparsemap(buf, sizeof(uint8_t) * 3 * 1024); sparsemap_t *map = sparsemap_wrap(buf, sizeof(uint8_t) * 3 * 1024);
// create an array of ints // create an array of ints
setup_test_array(array, TEST_ARRAY_SIZE, 1024 * 3); setup_test_array(array, TEST_ARRAY_SIZE, 1024 * 3);
@ -60,7 +59,7 @@ main(void)
assert(sparsemap_is_set(map, array[i]) == true); assert(sparsemap_is_set(map, array[i]) == true);
} }
has_span(map, array, TEST_ARRAY_SIZE, (int)len); has_span(map, array, TEST_ARRAY_SIZE, (int)len);
size_t l = sparsemap_span(map, 0, len); size_t l = sparsemap_span(map, 0, len, true);
if (l != (size_t)-1) { if (l != (size_t)-1) {
__diag("Found span in map starting at %lu of length %lu\n", l, len); __diag("Found span in map starting at %lu of length %lu\n", l, len);
__diag("is_span(%lu, %lu) == %s\n", l, len, is_span(array, TEST_ARRAY_SIZE, l, len) ? "yes" : "no"); __diag("is_span(%lu, %lu) == %s\n", l, len, is_span(array, TEST_ARRAY_SIZE, l, len) ? "yes" : "no");

View file

@ -1,43 +1,25 @@
{ {
"nodes": { "nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1710146030,
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1712192574, "lastModified": 1701282334,
"narHash": "sha256-LbbVOliJKTF4Zl2b9salumvdMXuQBr2kuKP5+ZwbYq4=", "narHash": "sha256-MxCVrXY6v4QmfTwIysjjaX0XUhqBbxTWWB4HXtDYsdk=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "f480f9d09e4b4cf87ee6151eba068197125714de", "rev": "057f9aecfb71c4437d2b27d3323df7f93c010b7e",
"type": "github" "type": "github"
}, },
"original": { "original": {
"owner": "NixOS", "owner": "NixOS",
"ref": "nixpkgs-unstable", "ref": "23.11",
"repo": "nixpkgs", "repo": "nixpkgs",
"type": "github" "type": "github"
} }
}, },
"root": { "root": {
"inputs": { "inputs": {
"flake-utils": "flake-utils", "nixpkgs": "nixpkgs",
"nixpkgs": "nixpkgs" "utils": "utils"
} }
}, },
"systems": { "systems": {
@ -54,6 +36,24 @@
"repo": "default", "repo": "default",
"type": "github" "type": "github"
} }
},
"utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1710146030,
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
} }
}, },
"root": "root", "root": "root",

View file

@ -1,38 +1,30 @@
{ {
description = "A Concurrent Skip List library for key/value pairs."; description = "A sparse bitmapped index library in C.";
inputs = { inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; # nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
flake-utils.url = "github:numtide/flake-utils"; nixpkgs.url = "github:NixOS/nixpkgs/23.11";
utils.url = "github:numtide/flake-utils";
}; };
outputs = outputs = { self, nixpkgs, ... }
{ self @inputs: inputs.utils.lib.eachSystem [
, nixpkgs "x86_64-linux" "i686-linux" "aarch64-linux" "x86_64-darwin"
, flake-utils ] (system:
, ... let pkgs = import nixpkgs {
}:
flake-utils.lib.eachDefaultSystem (system:
let
pkgs = import nixpkgs {
inherit system; inherit system;
config = { allowUnfree = true; }; overlays = [];
config.allowUnfree = true;
}; };
supportedSystems = [ "x86_64-linux" ];
forAllSystems = nixpkgs.lib.genAttrs supportedSystems;
nixpkgsFor = forAllSystems (system: import nixpkgs {
inherit system;
overlays = [ self.overlay ];
});
in { in {
pkgs = import nixpkgs { flake-utils.inputs.systems.follows = "system";
inherit system; devShell = pkgs.mkShell rec {
devShell = nixpkgs.legacyPackages.${system} { name = "sparsemap";
pkgs.mkShell = { packages = with pkgs; [
nativeBuildInputs = with pkgs.buildPackages; [
act act
autoconf autoconf
clang clang
cmake
ed ed
gcc gcc
gdb gdb
@ -40,20 +32,26 @@
graphviz-nox graphviz-nox
libtool libtool
m4 m4
ninja
perl perl
pkg-config pkg-config
python3 python3
ripgrep ripgrep
valgrind valgrind
]; ];
buildInputs = with pkgs; [ buildInputs = with pkgs; [
libbacktrace libbacktrace
glibc.out glibc.out
glibc.static glibc.static
]; ];
shellHook = let
icon = "f121";
in ''
export PS1="$(echo -e '\u${icon}') {\[$(tput sgr0)\]\[\033[38;5;228m\]\w\[$(tput sgr0)\]\[\033[38;5;15m\]} (${name}) \\$ \[$(tput sgr0)\]"
'';
}; };
DOCKER_BUILDKIT = 1; DOCKER_BUILDKIT = 1;
};
};
}); });
} }

View file

@ -1,4 +1,6 @@
#include "../include/sparsemap.h"
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wvariadic-macros" #pragma GCC diagnostic ignored "-Wvariadic-macros"
#define __diag(...) \ #define __diag(...) \
@ -23,23 +25,12 @@
#define XORSHIFT_SEED_VALUE ((unsigned int)time(NULL) ^ getpid()) #define XORSHIFT_SEED_VALUE ((unsigned int)time(NULL) ^ getpid())
#endif #endif
#define EST_MEDIAN_DECL(decl, size) \
uint64_t heap_##decl[size] = { 0 }; \
int heap_##decl##_max_size = size; \
int heap_##decl##_size = 0;
#define EST_MEDIAN_ADD(decl, value) est_insert_value(heap_##decl, heap_##decl##_max_size, &heap_##decl##_size, (value));
#define EST_MEDIAN_GET(decl) heap_##decl[0]
uint64_t tsc(void); uint64_t tsc(void);
double tsc_ticks_to_ns(uint64_t tsc_ticks); double tsc_ticks_to_ns(uint64_t tsc_ticks);
void est_sift_up(uint64_t *heap, int child_index); double nsts(void);
void est_sift_down(uint64_t *heap, int heap_size, int parent_index);
void est_insert_value(uint64_t *heap, int heap_max_size, int *heap_size, uint64_t value);
void xorshift32_seed(); void xorshift32_seed(void);
uint32_t xorshift32(); uint32_t xorshift32(void);
void print_array(int *array, int l); void print_array(int *array, int l);
void print_spans(int *array, int n); void print_spans(int *array, int n);
@ -52,11 +43,18 @@ int is_unique(int a[], int l, int value);
void setup_test_array(int a[], int l, int max_value); void setup_test_array(int a[], int l, int max_value);
void shuffle(int *array, size_t n); void shuffle(int *array, size_t n);
int ensure_sequential_set(int a[], int l, int r); int ensure_sequential_set(int a[], int l, int r);
int create_sequential_set_in_empty_map(sparsemap_t *map, int s, int r); sparsemap_idx_t sm_add_span(sparsemap_t *map, int map_size, int span_length);
void print_bits(char *name, uint64_t value);
void bitmap_from_uint32(sparsemap_t *map, uint32_t number); void bitmap_from_uint32(sparsemap_t *map, uint32_t number);
void bitmap_from_uint64(sparsemap_t *map, uint64_t number); void sm_bitmap_from_uint64(sparsemap_t *map, int offset, uint64_t number);
uint32_t rank_uint64(uint64_t number, int n, int p); uint32_t rank_uint64(uint64_t number, int n, int p);
int whats_set_uint64(uint64_t number, int bitPositions[64]); int whats_set_uint64(uint64_t number, int bitPositions[64]);
void whats_set(sparsemap_t *map, int m); void sm_whats_set(sparsemap_t *map, int off, int len);
bool sm_is_span(sparsemap_t *map, sparsemap_idx_t m, int len, bool value);
bool sm_occupied(sparsemap_t *map, sparsemap_idx_t m, int len, bool value);
char *bytes_as(double bytes, char *s, size_t size);

2908
include/roaring.h Normal file

File diff suppressed because it is too large Load diff

View file

@ -69,14 +69,14 @@
#ifndef SPARSEMAP_H #ifndef SPARSEMAP_H
#define SPARSEMAP_H #define SPARSEMAP_H
#include <sys/types.h>
#include <assert.h>
#include <limits.h> #include <limits.h>
#include <stdbool.h> #include <stdbool.h>
#include <stddef.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h>
#include <string.h> #if defined(__cplusplus)
extern "C" {
#endif
/* /*
* The public interface for a sparse bit-mapped index, a "sparse map". * The public interface for a sparse bit-mapped index, a "sparse map".
@ -88,55 +88,299 @@
*/ */
typedef struct sparsemap sparsemap_t; typedef struct sparsemap sparsemap_t;
typedef size_t sparsemap_idx_t;
#define SPARSEMAP_IDX_MAX SIZE_MAX
#define SPARSEMAP_FOUND(x) ((x) != SPARSEMAP_IDX_MAX)
#define SPARSEMAP_NOT_FOUND(x) ((x) == SPARSEMAP_IDX_MAX)
typedef uint32_t sm_idx_t; typedef uint32_t sm_idx_t;
typedef uint64_t sm_bitvec_t; typedef uint64_t sm_bitvec_t;
/* Allocate on a sparsemap_t on the heap and initialize it. */ /** @brief Allocate a new, empty sparsemap_t with a buffer of \b size on the
sparsemap_t *sparsemap(uint8_t *data, size_t size); * heap to use for storage of bitmap data.
*
* The buffer used for the bitmap is allocated in the same heap allocation as
* the structure, this means that you only need to call free() on the returned
* object to free all resources. Using this method allows you to grow the
* buffer size by calling #sparsemap_set_data_size(). This function calls
* #sparsemap_init().
*
* @param[in] size The starting size of the buffer used for the bitmap, default
* is 1024 bytes.
* @returns The newly allocated sparsemap reference.
*/
sparsemap_t *sparsemap(size_t size);
/* Initialize sparsemap_t with data. */ /** @brief Allocate a new, copy of the \b other sparsemap_t.
*
* @param[in] other The sparsemap to copy.
*/
sparsemap_t *sparsemap_copy(sparsemap_t *other);
/** @brief Allocate a new, empty sparsemap_t that references (wraps) the buffer
* \b data of \b size bytes to use for storage of bitmap data.
*
* This function allocates a new sparsemap_t but not the buffer which is
* provided by the caller as \b data which can be allocated on the stack or
* heap. Caller is responsible for calling free() on the returned heap object
* and releasing the memory used for \b data. Resizing the buffer is only
* supported when the heap object for the map includes the buffer and the
* \b data offset supplied is relative to the object (see #sparsemap()).
*
* @param[in] data A heap or stack memory buffer of \b size for use storing
* bitmap data.
* @param[in] size The size of the buffer \b data used for the bitmap.
* @returns The newly allocated sparsemap reference.
*/
sparsemap_t *sparsemap_wrap(uint8_t *data, size_t size);
/** @brief Initialize an existing sparsemap_t by assigning \b data of \b size
* bytes for storage of bitmap data.
*
* Given the address of an existing \b map allocated on the stack or heap this
* function will initialize the data structure and use the provided \b data of
* \b size for bitmap data. Caller is responsible for all memory management.
* Resizing the buffer is not directly supported, you
* may resize it and call #sparsemap_set_data_size() and then ensure that should
* the address of the object changed you need to update it by calling #sparsemap_
* m_data field.
*
* @param[in] map The sparsemap reference.
* @param[in] data A heap or stack memory buffer of \b size for use storing
* bitmap data.
* @param[in] size The size of the buffer \b data used for the bitmap.
*/
void sparsemap_init(sparsemap_t *map, uint8_t *data, size_t size); void sparsemap_init(sparsemap_t *map, uint8_t *data, size_t size);
/* Clears the whole buffer. */ /** @brief Opens, without initializing, an existing sparsemap contained within
* the specified buffer.
*
* Given the address of an existing \b map this function will assign to the
* provided data structure \b data of \b size for bitmap data. Caller is
* responsible for all memory management. Use this when as a way to
* "deserialize" bytes and make them ready for use as a bitmap.
*
* @param[in] map The sparsemap reference.
* @param[in] data A heap or stack memory buffer of \b size for use storing
* bitmap data.
* @param[in] size The size of the buffer \b data used for the bitmap.
*/
void sparsemap_open(sparsemap_t *map, uint8_t *data, size_t size);
/** @brief Resets values and empties the buffer making it ready to accept new
* data but does not free the memory.
*
* @param[in] map The sparsemap reference.
*/
void sparsemap_clear(sparsemap_t *map); void sparsemap_clear(sparsemap_t *map);
/* Opens an existing sparsemap at the specified buffer. */ /** @brief Update the size of the buffer \b data used for storing the bitmap.
void sparsemap_open(sparsemap_t *, uint8_t *data, size_t data_size); *
* When called with \b data NULL on a \b map that was created with #sparsemap()
* this function will reallocate the storage for both the map and data possibly
* changing the address of the map itself so it is important for the caller to
* update all references to this map to the address returned in this scenario.
* Access to stale references will result in memory violations and program
* termination. Caller is not required to free() the old address, only the new
* one should it have changed. This uses #realloc() under the covers, all
* caveats apply here as well.
*
* When called referencing a \b map that was allocate by the caller this
* function will only update the values within the data structure.
*
* @param[in] map The sparsemap reference.
* @param[in] size The desired size of the buffer \b data used for the bitmap.
* @returns The -- potentially changed -- sparsemap reference, or NULL should a
* #realloc() fail (\b ENOMEM)
* @note The resizing of caller supplied allocated objects is not yet fully
* supported.
*/
sparsemap_t *sparsemap_set_data_size(sparsemap_t *map, uint8_t *data, size_t size);
/* Resizes the data range. */ /** @brief Calculate remaining capacity, approaches 0 when full.
void sparsemap_set_data_size(sparsemap_t *map, size_t data_size); *
* Provides an estimate in the range [0.0, 100.0] of the remaining capacity of
/* Calculate remaining capacity, full when 0. */ * the buffer storing bitmap data. This can change up or down as more data
* is added/removed due to the method for compressed representation, do not
* expect a smooth progression either direction. This is a rough estimate only
* and may also jump in value after seemingly indiscriminate changes to the map.
*
* @param[in] map The sparsemap reference.
* @returns an estimate for remaining capacity that approaches 0.0 when full or
* 100.0 when empty
*/
double sparsemap_capacity_remaining(sparsemap_t *map); double sparsemap_capacity_remaining(sparsemap_t *map);
/* Returns the size of the underlying byte array. */ /** @brief Returns the capacity of the underlying byte array in bytes.
*
* Specifically, this returns the byte \b size provided for the underlying
* buffer used to store bitmap data.
*
* @param[in] map The sparsemap reference.
* @returns byte size of the buffer used for storing bitmap data
*/
size_t sparsemap_get_capacity(sparsemap_t *map); size_t sparsemap_get_capacity(sparsemap_t *map);
/* Returns the value of a bit at index |idx|. */ /** @brief Returns the value of a bit at index \b idx, either true for "set" (1)
bool sparsemap_is_set(sparsemap_t *map, size_t idx); * or \b false for "unset" (0).
*
* @param[in] map The sparsemap reference.
* @param[in] idx The 0-based offset into the bitmap index to examine.
* @returns either true or false
*/
bool sparsemap_is_set(sparsemap_t *map, sparsemap_idx_t idx);
/* Sets the bit at index |idx| to true or false, depending on |value|. */ /** @brief Sets the bit at index \b idx to \b value.
void sparsemap_set(sparsemap_t *map, size_t idx, bool value); *
* A sparsemap has a fixed size buffer with a capacity that can be exhausted by
* when calling this function. In such cases the return value is not equal to
* the provided \b idx and errno is set to ENOSPC. In such situations it is
* possible to grow the data size and retry the set() operation under certain
* circumstances (see #sparsemap() and #sparsemap_set_data_size()).
*
* @param[in] map The sparsemap reference.
* @param[in] idx The 0-based offset into the bitmap index to modify.
* @returns the \b idx supplied on success or SPARSEMAP_IDX_MAX on error
* with \b errno set to ENOSPC when the map is full.
*/
sparsemap_idx_t sparsemap_set(sparsemap_t *map, sparsemap_idx_t idx, bool value);
/* Returns the offset of the very first bit. */ /** @brief Returns the byte size of the data buffer that has been used thus far.
sm_idx_t sparsemap_get_start_offset(sparsemap_t *map); *
* @param[in] map The sparsemap reference.
/* Returns the used size in the data buffer. */ * @returns the byte size of the data buffer that has been used thus far
*/
size_t sparsemap_get_size(sparsemap_t *map); size_t sparsemap_get_size(sparsemap_t *map);
/* Decompresses the whole bitmap; calls scanner for all bits. */ /** @brief Returns a pointer to the data buffer used for the map.
void sparsemap_scan(sparsemap_t *map, void (*scanner)(sm_idx_t[], size_t), size_t skip); *
* @param[in] map The sparsemap reference.
* @returns a pointer to the data buffer used for the map
*/
void *sparsemap_get_data(sparsemap_t *map);
/* Appends all chunk maps from |map| starting at |sstart| to |other|, then /** @brief Returns the number of elements in the map.
reduces the chunk map-count appropriately. */ *
void sparsemap_split(sparsemap_t *map, size_t sstart, sparsemap_t *other); * @param[in] map The sparsemap reference.
* @returns the number of elements in the map
*/
size_t sparsemap_count(sparsemap_t *map);
/* Returns the index of the n'th set bit; uses a 0-based index. */ /** @brief Returns the offset of the first bit set in the map.
size_t sparsemap_select(sparsemap_t *map, size_t n); *
* This is the same as the value of the first set bit in the
* map.
*
* @param[in] map The sparsemap reference.
* @returns the offset of the first bit set in the map
*/
sparsemap_idx_t sparsemap_get_starting_offset(sparsemap_t *map);
/* Counts the set bits in the range [offset, idx]. */ /** @brief Returns the offset of the last bit set in the map.
size_t sparsemap_rank(sparsemap_t *map, size_t offset, size_t idx); *
* This is the same as the value of the last bit set in the
* map.
*
* @param[in] map The sparsemap reference.
* @returns the offset of the index bit set in the map
*/
sparsemap_idx_t sparsemap_get_ending_offset(sparsemap_t *map);
size_t sparsemap_span(sparsemap_t *map, size_t loc, size_t len); /** @brief Returns the percent of bits set in the map.
*
* @param[in] map The sparsemap reference.
* @returns the percent of bits set.
*/
double sparsemap_fill_factor(sparsemap_t *map);
/** @brief Provides a method for a callback function to examine every bit set in
* the index.
*
* This decompresses the whole bitmap and invokes #scanner() passing an array
* of the positions of set bits in order from 0 index to the end of the map.
*
* @param[in] map The sparsemap reference.
* @param[in] skip Start the scan after \b skip position in the map.
* @param[in] aux Auxiliary information passed to the scanner.
*/
void sparsemap_scan(sparsemap_t *map, void (*scanner)(sm_idx_t vec[], size_t n, void *aux), size_t skip, void *aux);
/** @brief Merges the values from \b source into \b destination, \b source is unchanged.
*
* Efficiently adds all set bits from \b source into \b destination.
*
* @param[in] destination The sparsemap reference into which we will merge \b source.
* @param[in] source The bitmap to merge into \b destination.
* @returns 0 on success, or sets errno to ENOSPC and returns the amount of
* additional space required to successfully merge the maps.
*/
int sparsemap_merge(sparsemap_t *destination, sparsemap_t *source);
/** @brief Splits the bitmap by assigning all bits starting at \b offset to the
* \b other bitmap while removing them from \b map.
*
* The \b other bitmap is expected to be empty.
*
* @param[in] map The sparsemap reference.
* @param[in] offset The 0-based offset into the bitmap at which to split, if
* set to SPARSEMAP_IDX_MAX then the bits will be evenly split.
* @param[in] other The bitmap into which we place the split.
* @returns the offset at which the map was split
*/
sparsemap_idx_t sparsemap_split(sparsemap_t *map, sparsemap_idx_t offset, sparsemap_t *other);
/** @brief Finds the index of the \b n'th bit set to \b value.
*
* Locates the \b n'th bit either set, \b value is true, or unset, \b value is
* false, from the start of the bitmap.
* So, if your bit pattern is: ```1101 1110 1010 1101 1011 1110 1110 1111``` and
* you request the first set bit the result is `0` (meaning the 1st bit in the
* map which is index 0 because this is 0-based indexing). The first unset bit
* is `2` (or the third bit in the pattern). When n is 3 and value is true the
* result would be `3` (the fourth bit, or the third set bit which is at index
* 3 when 0-based).
*
* @param[in] map The sparsemap reference.
* @param[in] n Specifies how many bits to ignore (when n=2 return the position
* of the third matching bit).
* @param[in] value Determines if the search is to examine set (true) or unset
* (false) bits in the bitmap index.
* @returns the 0-based index of the located bit position within the map; when
* not found either SPARSEMAP_IDX_MAX.
*/
sparsemap_idx_t sparsemap_select(sparsemap_t *map, sparsemap_idx_t n, bool value);
/** @brief Counts the bits matching \b value in the provided range, [\b x, \b
* y].
*
* Counts the set, \b value is true, or unset, \b value is false, bits starting
* at the \b idx'th bit (0-based) in the range [\b x, \b y] (inclusive on either
* end). If range is [0, 0] this examines 1 bit, the first one in the map, and
* returns 1 if value is true and the bit was set.
*
* @param[in] map The sparsemap reference.
* @param[in] x 0-based start of the inclusive range to examine.
* @param[in] y 0-based end of the inclusive range to examine.
* @param[in] value Determines if the scan is to count the set (true) or unset
* (false) bits in the range.
* @returns the count of bits found within the range that match the \b value
*/
size_t sparsemap_rank(sparsemap_t *map, size_t x, size_t y, bool value);
/** @brief Locates the first contiguous set of bits of \b len starting at \b idx
* matching \b value in the bitmap.
*
* @param[in] map The sparsemap reference.
* @param[in] start 0-based start of search within the bitmap.
* @param[in] len The length of contiguous bits we're seeking.
* @param[in] value Determines if the scan is to find all set (true) or unset
* (false) bits of \b len.
* @returns the index of the first bit matching the criteria; when not found
* found SPARSEMAP_IDX_MAX
*/
size_t sparsemap_span(sparsemap_t *map, sparsemap_idx_t start, size_t len, bool value);
#if defined(__cplusplus)
}
#endif #endif
#endif /* !defined(SPARSEMAP_H) */

258
include/tdigest.h Normal file
View file

@ -0,0 +1,258 @@
#pragma once
#include <stdlib.h>
/**
* Adaptive histogram based on something like streaming k-means crossed with Q-digest.
* The implementation is a direct descendent of MergingDigest
* https://github.com/tdunning/t-digest/
*
* Copyright (c) 2021 Redis, All rights reserved.
* Copyright (c) 2018 Andrew Werner, All rights reserved.
*
* The special characteristics of this algorithm are:
*
* - smaller summaries than Q-digest
*
* - provides part per million accuracy for extreme quantiles and typically &lt;1000 ppm accuracy
* for middle quantiles
*
* - fast
*
* - simple
*
* - easy to adapt for use with map-reduce
*/
#define MM_PI 3.14159265358979323846
struct td_histogram {
// compression is a setting used to configure the size of centroids when merged.
double compression;
double min;
double max;
// cap is the total size of nodes
int cap;
// merged_nodes is the number of merged nodes at the front of nodes.
int merged_nodes;
// unmerged_nodes is the number of buffered nodes.
int unmerged_nodes;
// we run the merge in reverse every other merge to avoid left-to-right bias in merging
long long total_compressions;
long long merged_weight;
long long unmerged_weight;
double *nodes_mean;
long long *nodes_weight;
};
typedef struct td_histogram td_histogram_t;
#ifdef __cplusplus
extern "C" {
#endif
/**
* Allocate the memory, initialise the t-digest, and return the histogram as output parameter.
* @param compression The compression parameter.
* 100 is a common value for normal uses.
* 1000 is extremely large.
* The number of centroids retained will be a smallish (usually less than 10) multiple of this
* number.
* @return the histogram on success, NULL if allocation failed.
*/
td_histogram_t *td_new(double compression);
/**
* Allocate the memory and initialise the t-digest.
*
* @param compression The compression parameter.
* 100 is a common value for normal uses.
* 1000 is extremely large.
* The number of centroids retained will be a smallish (usually less than 10) multiple of this
* number.
* @param result Output parameter to capture allocated histogram.
* @return 0 on success, 1 if allocation failed.
*/
int td_init(double compression, td_histogram_t **result);
/**
* Frees the memory associated with the t-digest.
*
* @param h The histogram you want to free.
*/
void td_free(td_histogram_t *h);
/**
* Reset a histogram to zero - empty out a histogram and re-initialise it
*
* If you want to re-use an existing histogram, but reset everything back to zero, this
* is the routine to use.
*
* @param h The histogram you want to reset to empty.
*
*/
void td_reset(td_histogram_t *h);
/**
* Adds a sample to a histogram.
*
* @param val The value to add.
* @param weight The weight of this point.
* @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided
* weight.
*
*/
int td_add(td_histogram_t *h, double val, long long weight);
/**
* Re-examines a t-digest to determine whether some centroids are redundant. If your data are
* perversely ordered, this may be a good idea. Even if not, this may save 20% or so in space.
*
* The cost is roughly the same as adding as many data points as there are centroids. This
* is typically &lt; 10 * compression, but could be as high as 100 * compression.
* This is a destructive operation that is not thread-safe.
*
* @param h The histogram you want to compress.
* @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided
* weight. If overflow is detected the histogram is not changed.
*
*/
int td_compress(td_histogram_t *h);
/**
* Merges all of the values from 'from' to 'this' histogram.
*
* @param h "This" pointer
* @param from Histogram to copy values from.
* * @return 0 on success, EDOM if overflow was detected as a consequence of merging the the
* provided histogram. If overflow is detected the original histogram is not detected.
*/
int td_merge(td_histogram_t *h, td_histogram_t *from);
/**
* Returns the fraction of all points added which are &le; x.
*
* @param x The cutoff for the cdf.
* @return The fraction of all data which is less or equal to x.
*/
double td_cdf(td_histogram_t *h, double x);
/**
* Returns an estimate of the cutoff such that a specified fraction of the data
* added to this TDigest would be less than or equal to the cutoff.
*
* @param q The desired fraction
* @return The value x such that cdf(x) == q;
*/
double td_quantile(td_histogram_t *h, double q);
/**
* Returns an estimate of the cutoff such that a specified fraction of the data
* added to this TDigest would be less than or equal to the cutoffs.
*
* @param quantiles The ordered percentiles array to get the values for.
* @param values Destination array containing the values at the given quantiles.
* The values array should be allocated by the caller.
* @return 0 on success, ENOMEM if the provided destination array is null.
*/
int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length);
/**
* Returns the trimmed mean ignoring values outside given cutoff upper and lower limits.
*
* @param leftmost_cut Fraction to cut off of the left tail of the distribution.
* @param rightmost_cut Fraction to cut off of the right tail of the distribution.
* @return The trimmed mean ignoring values outside given cutoff upper and lower limits;
*/
double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut);
/**
* Returns the trimmed mean ignoring values outside given a symmetric cutoff limits.
*
* @param proportion_to_cut Fraction to cut off of the left and right tails of the distribution.
* @return The trimmed mean ignoring values outside given cutoff upper and lower limits;
*/
double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut);
/**
* Returns the current compression factor.
*
* @return The compression factor originally used to set up the TDigest.
*/
int td_compression(td_histogram_t *h);
/**
* Returns the number of points that have been added to this TDigest.
*
* @return The sum of the weights on all centroids.
*/
long long td_size(td_histogram_t *h);
/**
* Returns the number of centroids being used by this TDigest.
*
* @return The number of centroids being used.
*/
int td_centroid_count(td_histogram_t *h);
/**
* Get minimum value from the histogram. Will return __DBL_MAX__ if the histogram
* is empty.
*
* @param h "This" pointer
*/
double td_min(td_histogram_t *h);
/**
* Get maximum value from the histogram. Will return - __DBL_MAX__ if the histogram
* is empty.
*
* @param h "This" pointer
*/
double td_max(td_histogram_t *h);
/**
* Get the full centroids weight array for 'this' histogram.
*
* @param h "This" pointer
*
* @return The full centroids weight array.
*/
const long long *td_centroids_weight(td_histogram_t *h);
/**
* Get the full centroids mean array for 'this' histogram.
*
* @param h "This" pointer
*
* @return The full centroids mean array.
*/
const double *td_centroids_mean(td_histogram_t *h);
/**
* Get the centroid weight for 'this' histogram and 'pos'.
*
* @param h "This" pointer
* @param pos centroid position.
*
* @return The centroid weight.
*/
long long td_centroids_weight_at(td_histogram_t *h, int pos);
/**
* Get the centroid mean for 'this' histogram and 'pos'.
*
* @param h "This" pointer
* @param pos centroid position.
*
* @return The centroid mean.
*/
double td_centroids_mean_at(td_histogram_t *h, int pos);
#ifdef __cplusplus
}
#endif

View file

@ -1,14 +1,25 @@
#include <sys/types.h> #define _POSIX_C_SOURCE 199309L
#define X86_INTRIN
#include <assert.h> #include <assert.h>
#include <pthread.h> #include <pthread.h> // If using threads
#include <sparsemap.h>
#include <stdbool.h> #include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h> #include <unistd.h>
#ifdef __x86_64__ // Check if running on x86_64 architecture
#ifdef X86_INTRIN
#include <errno.h>
#include <x86intrin.h>
#endif
#endif
#include "common.h" #include "../include/common.h"
#include "../include/sparsemap.h"
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wvariadic-macros" #pragma GCC diagnostic ignored "-Wvariadic-macros"
@ -22,95 +33,69 @@
uint64_t uint64_t
tsc(void) tsc(void)
{ {
#ifdef __x86_64__ // Check if running on x86_64 architecture
#ifdef X86_INTRIN
return __rdtsc();
#else
uint32_t low, high; uint32_t low, high;
__asm__ volatile("rdtsc" : "=a"(low), "=d"(high)); __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
return ((uint64_t)high << 32) | low; return ((uint64_t)high << 32) | low;
#endif
#ifdef __arm__ // Check if compiling for ARM architecture
uint64_t result;
__asm__ volatile("mrs %0, pmccntr_el0" : "=r"(result));
return result;
}
#endif
#endif
return 0;
} }
static uint64_t // get microsecond timestamp
get_tsc_frequency() uint64_t
msts()
{ {
uint32_t high, low; #ifdef _SC_MONOTONIC_CLOCK
__asm__ volatile("rdtsc" : "=a"(low), "=d"(high)); struct timespec ts;
__asm__ volatile("rdtsc"); if (sysconf(_SC_MONOTONIC_CLOCK) > 0) {
return ((uint64_t)high << 32) | low; /* A monotonic clock presents */
if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
return (uint64_t)(ts.tv_sec * 1000000 + ts.tv_nsec / 1000);
else
return 0;
}
return 0;
#else
struct timeval tv;
if (gettimeofday(&tv, NULL) == 0)
return (uint64_t)(tv.tv_sec * 1000000 + tv.tv_usec);
else
return 0;
#endif
} }
double double
tsc_ticks_to_ns(uint64_t tsc_ticks) nsts(void)
{ {
static uint64_t tsc_freq = 0; struct timespec ts;
if (tsc_freq == 0) {
tsc_freq = get_tsc_frequency();
}
return (double)tsc_ticks / (double)tsc_freq * 1e9;
}
void if (clock_gettime(CLOCK_REALTIME, &ts) == -1) {
est_sift_up(uint64_t *heap, int child_index) perror("clock_gettime");
{ return -1.0; // Return -1.0 on error
while (child_index > 0) {
int parent_index = (child_index - 1) / 2;
if (heap[parent_index] > heap[child_index]) {
// Swap parent and child
uint64_t temp = heap[parent_index];
heap[parent_index] = heap[child_index];
heap[child_index] = temp;
child_index = parent_index;
} else {
break; // Heap property satisfied
}
}
}
void
est_sift_down(uint64_t *heap, int heap_size, int parent_index)
{
int child_index = 2 * parent_index + 1; // Left child
while (child_index < heap_size) {
// Right child exists and is smaller than left child
if (child_index + 1 < heap_size && heap[child_index + 1] < heap[child_index]) {
child_index++;
}
// If the smallest child is smaller than the parent, swap them
if (heap[child_index] < heap[parent_index]) {
uint64_t temp = heap[child_index];
heap[child_index] = heap[parent_index];
heap[parent_index] = temp;
parent_index = child_index;
child_index = 2 * parent_index + 1;
} else {
break; // Heap property satisfied
}
}
}
void
est_insert_value(uint64_t *heap, int heap_max_size, int *heap_size, uint64_t value)
{
if (*heap_size < heap_max_size) { // Heap not full, insert value
heap[*heap_size] = value;
est_sift_up(heap, *heap_size);
(*heap_size)++;
} else {
// Heap is full, replace root with new value with a certain probability
// This is a very naive approach to maintain a sample of the input
if (rand() % 2) {
heap[0] = value;
est_sift_down(heap, heap_max_size, 0);
}
} }
return ts.tv_sec + ts.tv_nsec / 1e9;
} }
int __xorshift32_state = 0; int __xorshift32_state = 0;
// Xorshift algorithm for PRNG // Xorshift algorithm for PRNG
uint32_t uint32_t
xorshift32() xorshift32(void)
{ {
uint32_t x = __xorshift32_state; uint32_t x = __xorshift32_state;
if (x == 0) if (x == 0) {
x = 123456789; x = 123456789;
}
x ^= x << 13; x ^= x << 13;
x ^= x >> 17; x ^= x >> 17;
x ^= x << 5; x ^= x << 5;
@ -119,13 +104,13 @@ xorshift32()
} }
void void
xorshift32_seed() xorshift32_seed(void)
{ {
__xorshift32_state = XORSHIFT_SEED_VALUE; __xorshift32_state = XORSHIFT_SEED_VALUE;
} }
void void
shuffle(int *array, size_t n) // TODO working? shuffle(int *array, size_t n)
{ {
for (size_t i = n - 1; i > 0; --i) { for (size_t i = n - 1; i > 0; --i) {
size_t j = xorshift32() % (i + 1); size_t j = xorshift32() % (i + 1);
@ -170,7 +155,7 @@ has_sequential_set(int a[], int l, int r)
int int
ensure_sequential_set(int a[], int l, int r) ensure_sequential_set(int a[], int l, int r)
{ {
if (!a || l == 0 || r < 1 || r > l) { if (!a || l == 0 || r < 1 || r > l - 1) {
return 0; return 0;
} }
@ -190,7 +175,8 @@ ensure_sequential_set(int a[], int l, int r)
// Generate a random value between min_value and max_value // Generate a random value between min_value and max_value
int value = random_uint32() % (max_value - min_value - r + 1); int value = random_uint32() % (max_value - min_value - r + 1);
// Generate a random location between 0 and l - r // Generate a random location between 0 and l - r
offset = random_uint32() % (l - r - 1); int d = l - r - 1;
offset = d == 0 ? 0 : random_uint32() % d;
// Adjust the array to include a sequential set of 'r' integers at the random offset // Adjust the array to include a sequential set of 'r' integers at the random offset
for (int i = 0; i < r; ++i) { for (int i = 0; i < r; ++i) {
@ -199,21 +185,6 @@ ensure_sequential_set(int a[], int l, int r)
return value; return value;
} }
int
create_sequential_set_in_empty_map(sparsemap_t *map, int s, int r)
{
int placed_at;
if (s >= r + 1) {
placed_at = 0;
} else {
placed_at = random_uint32() % (s - r - 1);
}
for (int i = placed_at; i < placed_at + r; i++) {
sparsemap_set(map, i, true);
}
return placed_at;
}
void void
print_array(int *array, int l) print_array(int *array, int l)
{ {
@ -322,7 +293,7 @@ bool
is_set(const int array[], int bit) is_set(const int array[], int bit)
{ {
for (int i = 0; i < 1024; i++) { for (int i = 0; i < 1024; i++) {
if (array[i] == (int)bit) { if (array[i] == bit) {
return true; return true;
} }
} }
@ -330,28 +301,44 @@ is_set(const int array[], int bit)
} }
int int
is_unique(int a[], int l, int value) whats_set_uint64(uint64_t number, int pos[64])
{ {
for (int i = 0; i < l; ++i) { int length = 0;
if (a[i] == value) {
return 0; // Not unique for (int i = 0; i < 64; i++) {
if (number & ((uint64_t)1 << i)) {
pos[length++] = i;
} }
} }
return 1; // Unique
return length;
} }
/** @brief Fills an array with unique random values between 0 and max_value.
*
* @param[in] a The array to fill.
* @param[in] l The length of the array to fill.
* @param[in] max_value The maximum value for the random numbers.
*/
void void
setup_test_array(int a[], int l, int max_value) setup_test_array(int a[], int l, int max_value)
{ {
if (a == NULL || max_value < 0)
return; // Basic error handling and validation
for (int i = 0; i < l; ++i) { // Create a set to store the unique values.
int candidate; int unique_values[max_value + 1];
do { for (int i = 0; i <= max_value; ++i) {
candidate = random_uint32() % (max_value + 1); // Generate a new value within the specified range unique_values[i] = 0;
} while (!is_unique(a, i, candidate)); // Repeat until a unique value is found }
a[i] = candidate; // Assign the unique value to the array
// Keep generating random numbers until we have l unique values.
int count = 0;
while (count < l) {
int random_number = random_uint32() % (max_value + 1);
if (unique_values[random_number] == 0) {
unique_values[random_number] = 1;
a[count] = random_number;
count++;
}
} }
} }
@ -364,15 +351,6 @@ bitmap_from_uint32(sparsemap_t *map, uint32_t number)
} }
} }
void
bitmap_from_uint64(sparsemap_t *map, uint64_t number)
{
for (int i = 0; i < 64; i++) {
bool bit = number & (1 << i);
sparsemap_set(map, i, bit);
}
}
uint32_t uint32_t
rank_uint64(uint64_t number, int n, int p) rank_uint64(uint64_t number, int n, int p)
{ {
@ -400,28 +378,96 @@ rank_uint64(uint64_t number, int n, int p)
return count; return count;
} }
int void
whats_set_uint64(uint64_t number, int pos[64]) print_bits(char *name, uint64_t value)
{ {
int length = 0; if (name) {
printf("%s\t", name);
for (int i = 0; i < 64; i++) { }
if (number & ((uint64_t)1 << i)) { for (int i = 63; i >= 0; i--) {
pos[length++] = i; printf("%lu", (value >> i) & 1);
if (i % 8 == 0) {
printf(" "); // Add space for better readability
} }
} }
printf("\n");
return length;
} }
void void
whats_set(sparsemap_t *map, int m) sm_bitmap_from_uint64(sparsemap_t *map, int offset, uint64_t number)
{ {
logf("what's set in the range [0, %d): ", m); for (int i = offset; i < 64; i++) {
for (int i = 0; i < m; i++) { bool bit = number & ((uint64_t)1 << i);
if (sparsemap_is_set(map, i)) { sparsemap_set(map, i, bit);
logf("%d ", i);
} }
} }
logf("\n");
sparsemap_idx_t
sm_add_span(sparsemap_t *map, int map_size, int span_length)
{
int attempts = map_size / span_length;
sparsemap_idx_t placed_at;
do {
placed_at = random_uint32() % (map_size - span_length - 1);
if (sm_occupied(map, placed_at, span_length, true)) {
attempts--;
} else {
break;
}
} while (attempts);
for (sparsemap_idx_t i = placed_at; i < placed_at + span_length; i++) {
if (sparsemap_set(map, i, true) != i) {
return placed_at; // TODO error?
}
}
return placed_at;
}
void
sm_whats_set(sparsemap_t *map, int off, int len)
{
printf("what's set in the range [%d, %d): ", off, off + len);
for (int i = off; i < off + len; i++) {
if (sparsemap_is_set(map, i)) {
printf("%d ", i);
}
}
printf("\n");
}
bool
sm_is_span(sparsemap_t *map, sparsemap_idx_t m, int len, bool value)
{
for (sparsemap_idx_t i = m; i < m + len; i++) {
if (sparsemap_is_set(map, i) != value) {
return false;
}
}
return true;
}
bool
sm_occupied(sparsemap_t *map, sparsemap_idx_t m, int len, bool value)
{
for (sparsemap_idx_t i = m; i < (sparsemap_idx_t)len; i++) {
if (sparsemap_is_set(map, i) == value) {
return true;
}
}
return false;
}
char *
bytes_as(double bytes, char *s, size_t size)
{
const char *units[] = { "b", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" };
size_t i = 0;
while (bytes >= 1024 && i < sizeof(units) / sizeof(units[0]) - 1) {
bytes /= 1024;
i++;
}
snprintf(s, size, "%.2f %s", bytes, units[i]);
return s;
} }

25883
lib/roaring.c Normal file

File diff suppressed because it is too large Load diff

680
lib/tdigest.c Normal file
View file

@ -0,0 +1,680 @@
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <math.h>
#include "tdigest.h"
#include <errno.h>
#include <stdint.h>
#ifndef TD_MALLOC_INCLUDE
#define TD_MALLOC_INCLUDE "td_malloc.h"
#endif
#ifndef TD_ALLOC_H
#define TD_ALLOC_H
#define __td_malloc malloc
#define __td_calloc calloc
#define __td_realloc realloc
#define __td_free free
#endif
#define __td_max(x, y) (((x) > (y)) ? (x) : (y))
#define __td_min(x, y) (((x) < (y)) ? (x) : (y))
/* Weighted mean of two points already known to satisfy x1 <= x2,
 * clamped into [x1, x2] exactly as the __td_min/__td_max macros would. */
static inline double weighted_average_sorted(double x1, double w1, double x2, double w2) {
    const double mean = (x1 * w1 + x2 * w2) / (w1 + w2);
    const double capped = (mean < x2) ? mean : x2; /* __td_min(mean, x2) */
    return (x1 > capped) ? x1 : capped;            /* __td_max(x1, capped) */
}
/* True iff a + b neither overflows nor underflows a long long.
 *
 * BUG FIX: the b < 0 branch previously computed __LONG_LONG_MAX__ - b,
 * which itself overflows (undefined behavior) for any negative b, and
 * tested against the wrong bound. Underflow must be checked against
 * LLONG_MIN: a + b < MIN  <=>  a < MIN - b (safe to compute when b < 0). */
static inline bool _tdigest_long_long_add_safe(long long a, long long b) {
    if (b < 0) {
        return (a >= (-__LONG_LONG_MAX__ - 1LL) - b);
    } else {
        return (a <= __LONG_LONG_MAX__ - b);
    }
}
/* Order-insensitive weighted mean: sorts the two points, then delegates
 * to the sorted variant so the result is clamped into [min, max]. */
static inline double weighted_average(double x1, double w1, double x2, double w2) {
    return (x1 <= x2) ? weighted_average_sorted(x1, w1, x2, w2)
                      : weighted_average_sorted(x2, w2, x1, w1);
}
/* Exchange the elements at indices i and j of a double array. */
static inline void swap(double *arr, int i, int j) {
    const double a = arr[i];
    const double b = arr[j];
    arr[i] = b;
    arr[j] = a;
}
/* Exchange the elements at indices i and j of a long long array. */
static inline void swap_l(long long *arr, int i, int j) {
    const long long a = arr[i];
    const long long b = arr[j];
    arr[i] = b;
    arr[j] = a;
}
/* Lomuto partition over the parallel arrays (means, weights): moves the
 * pivot to `end`, sweeps [start, end), and swaps both arrays in lockstep
 * so each weight stays attached to its mean.
 * Returns the pivot's final index.
 * `i` is a signed int because it starts at start - 1, which is -1 when
 * start == 0. */
static unsigned int partition(double *means, long long *weights, unsigned int start,
                              unsigned int end, unsigned int pivot_idx) {
    const double pivotMean = means[pivot_idx];
    swap(means, pivot_idx, end);
    swap_l(weights, pivot_idx, end);
    int i = start - 1;
    for (unsigned int j = start; j < end; j++) {
        // If current element is smaller than the pivot
        if (means[j] < pivotMean) {
            // increment index of smaller element
            i++;
            swap(means, i, j);
            swap_l(weights, i, j);
        }
    }
    // Restore the pivot between the two partitions.
    swap(means, i + 1, end);
    swap_l(weights, i + 1, end);
    return i + 1;
}
/**
 * Standard quick sort except that sorting rearranges parallel arrays
 *
 * @param means Values to sort on
 * @param weights The auxiliary values to sort.
 * @param start The beginning of the values to sort
 * @param end The value after the last value to sort
 */
static void td_qsort(double *means, long long *weights, unsigned int start, unsigned int end) {
    if (start < end) {
        // two elements can be directly compared
        if ((end - start) == 1) {
            if (means[start] > means[end]) {
                swap(means, start, end);
                swap_l(weights, start, end);
            }
            return;
        }
        // generating a random number as a pivot was very expensive vs the array size
        // const unsigned int pivot_idx = start + rand()%(end - start + 1);
        const unsigned int pivot_idx = (end + start) / 2; // central pivot
        const unsigned int new_pivot_idx = partition(means, weights, start, end, pivot_idx);
        // Recurse left only when the pivot did not land at `start`
        // (new_pivot_idx - 1 would wrap, since indices are unsigned).
        if (new_pivot_idx > start) {
            td_qsort(means, weights, start, new_pivot_idx - 1);
        }
        td_qsort(means, weights, new_pivot_idx + 1, end);
    }
}
/* Node capacity implied by a compression setting: 6 * compression + 10.
 * Returns 0 when the request would overflow a size_t-sized allocation. */
static inline size_t cap_from_compression(double compression) {
    const size_t c = (size_t)compression;
    const size_t limit = SIZE_MAX / sizeof(double) / 6 - 10;
    return (c > limit) ? 0 : 6 * c + 10;
}
// True when the node buffer is nearly full and a merge pass is required
// before another point can be buffered.
static inline bool should_td_compress(td_histogram_t *h) {
    return ((h->merged_nodes + h->unmerged_nodes) >= (h->cap - 1));
}

// Index of the next free node slot (merged nodes are stored first,
// followed by the unmerged buffer).
static inline int next_node(td_histogram_t *h) { return h->merged_nodes + h->unmerged_nodes; }

// Forward declaration: several accessors below compress lazily.
int td_compress(td_histogram_t *h);

// Returns EDOM when v has overflowed to +infinity, 0 otherwise.
static inline int _check_overflow(const double v) {
    // double-precision overflow detected on h->unmerged_weight
    if (v == INFINITY) {
        return EDOM;
    }
    return 0;
}
/* Guard against double-precision overflow in the quantities td_compress()
 * will derive from the new weights. Returns EDOM on overflow, 0 otherwise.
 * NOTE(review): only +INFINITY is tested; weights are assumed non-negative
 * here -- confirm callers never pass negative weights. */
static inline int _check_td_overflow(const double new_unmerged_weight,
                                     const double new_total_weight) {
    // double-precision overflow detected on h->unmerged_weight
    if (new_unmerged_weight == INFINITY) {
        return EDOM;
    }
    if (new_total_weight == INFINITY) {
        return EDOM;
    }
    // Same denominator td_compress() computes for the normalizer.
    const double denom = 2 * MM_PI * new_total_weight * log(new_total_weight);
    if (denom == INFINITY) {
        return EDOM;
    }
    return 0;
}
// Number of centroids currently stored (merged + not-yet-merged).
int td_centroid_count(td_histogram_t *h) { return next_node(h); }

/* Reset a histogram to the empty state. The node buffers are left
 * allocated and untouched; only the bookkeeping fields are cleared.
 * NULL is a safe no-op. */
void td_reset(td_histogram_t *h) {
    if (!h) {
        return;
    }
    // min starts at the largest finite double so the first sample replaces it;
    // max starts at its negation for the symmetric reason.
    h->min = __DBL_MAX__;
    h->max = -h->min;
    h->merged_nodes = 0;
    h->merged_weight = 0;
    h->unmerged_nodes = 0;
    h->unmerged_weight = 0;
    h->total_compressions = 0;
}
/* Allocate and initialize a t-digest histogram sized for `compression`.
 *
 * @param compression Accuracy/size trade-off; capacity is 6*compression + 10.
 * @param result Receives the new histogram on success; untouched on failure.
 * @return 0 on success, 1 on invalid compression or allocation failure.
 *
 * BUG FIX: the buffer pointers are now NULLed before the first allocation,
 * so the td_free() calls on the partial-failure paths no longer read
 * uninitialized pointer fields (previously undefined behavior when the
 * first __td_calloc failed). */
int td_init(double compression, td_histogram_t **result) {
    const size_t capacity = cap_from_compression(compression);
    if (capacity < 1) {
        return 1;
    }
    td_histogram_t *histogram;
    histogram = (td_histogram_t *)__td_malloc(sizeof(td_histogram_t));
    if (!histogram) {
        return 1;
    }
    histogram->cap = capacity;
    histogram->compression = (double)compression;
    histogram->nodes_mean = NULL;
    histogram->nodes_weight = NULL;
    td_reset(histogram);
    histogram->nodes_mean = (double *)__td_calloc(capacity, sizeof(double));
    if (!histogram->nodes_mean) {
        td_free(histogram);
        return 1;
    }
    histogram->nodes_weight = (long long *)__td_calloc(capacity, sizeof(long long));
    if (!histogram->nodes_weight) {
        td_free(histogram);
        return 1;
    }
    *result = histogram;
    return 0;
}
/* Convenience wrapper around td_init(): returns the new histogram,
 * or NULL when initialization fails. */
td_histogram_t *td_new(double compression) {
    td_histogram_t *h = NULL;
    (void)td_init(compression, &h);
    return h;
}
/* Release a histogram and both node buffers.
 *
 * BUG FIX: guard against a NULL histogram (td_new() can return NULL, and
 * td_reset() already tolerates NULL). The per-buffer NULL checks are
 * dropped: __td_free is free(), and free(NULL) is a defined no-op. */
void td_free(td_histogram_t *histogram) {
    if (!histogram) {
        return;
    }
    __td_free((void *)(histogram->nodes_mean));
    __td_free((void *)(histogram->nodes_weight));
    __td_free((void *)(histogram));
}
/* Merge every centroid of `from` into `into`. Both histograms are
 * compressed first; `from` itself is not modified beyond that compression.
 * Returns 0 on success, EDOM on overflow during compression or insertion. */
int td_merge(td_histogram_t *into, td_histogram_t *from) {
    if (td_compress(into) != 0)
        return EDOM;
    if (td_compress(from) != 0)
        return EDOM;
    // After compression unmerged_nodes is 0, so this is just the merged count.
    const int pos = from->merged_nodes + from->unmerged_nodes;
    for (int i = 0; i < pos; i++) {
        const double mean = from->nodes_mean[i];
        const long long weight = from->nodes_weight[i];
        if (td_add(into, mean, weight) != 0) {
            return EDOM;
        }
    }
    return 0;
}
long long td_size(td_histogram_t *h) { return h->merged_weight + h->unmerged_weight; }
/* Cumulative distribution: estimated fraction of all samples <= val,
 * interpolating between centroids and treating unit-weight (singleton)
 * centroids as exact samples. Compresses the histogram on entry.
 * Returns NAN when the histogram is empty. */
double td_cdf(td_histogram_t *h, double val) {
    td_compress(h);
    // no data to examine
    if (h->merged_nodes == 0) {
        return NAN;
    }
    // below lower bound
    if (val < h->min) {
        return 0;
    }
    // above upper bound
    if (val > h->max) {
        return 1;
    }
    if (h->merged_nodes == 1) {
        // exactly one centroid, should have max==min
        const double width = h->max - h->min;
        if (val - h->min <= width) {
            // min and max are too close together to do any viable interpolation
            return 0.5;
        } else {
            // interpolate if somehow we have weight > 0 and max != min
            return (val - h->min) / width;
        }
    }
    const int n = h->merged_nodes;
    // check for the left tail
    const double left_centroid_mean = h->nodes_mean[0];
    const double left_centroid_weight = (double)h->nodes_weight[0];
    const double merged_weight_d = (double)h->merged_weight;
    if (val < left_centroid_mean) {
        // note that this is different than h->nodes_mean[0] > min
        // ... this guarantees we divide by non-zero number and interpolation works
        const double width = left_centroid_mean - h->min;
        if (width > 0) {
            // must be a sample exactly at min
            if (val == h->min) {
                return 0.5 / merged_weight_d;
            } else {
                return (1 + (val - h->min) / width * (left_centroid_weight / 2 - 1)) /
                       merged_weight_d;
            }
        } else {
            // this should be redundant with the check val < h->min
            return 0;
        }
    }
    // and the right tail
    const double right_centroid_mean = h->nodes_mean[n - 1];
    const double right_centroid_weight = (double)h->nodes_weight[n - 1];
    if (val > right_centroid_mean) {
        const double width = h->max - right_centroid_mean;
        if (width > 0) {
            if (val == h->max) {
                return 1 - 0.5 / merged_weight_d;
            } else {
                // there has to be a single sample exactly at max
                const double dq = (1 + (h->max - val) / width * (right_centroid_weight / 2 - 1)) /
                                  merged_weight_d;
                return 1 - dq;
            }
        } else {
            return 1;
        }
    }
    // we know that there are at least two centroids and mean[0] < x < mean[n-1]
    // that means that there are either one or more consecutive centroids all at exactly x
    // or there are consecutive centroids, c0 < x < c1
    double weightSoFar = 0;
    for (int it = 0; it < n - 1; it++) {
        // weightSoFar does not include weight[it] yet
        if (h->nodes_mean[it] == val) {
            // we have one or more centroids == x, treat them as one
            // dw will accumulate the weight of all of the centroids at x
            double dw = 0;
            while (it < n && h->nodes_mean[it] == val) {
                dw += (double)h->nodes_weight[it];
                it++;
            }
            return (weightSoFar + dw / 2) / (double)h->merged_weight;
        } else if (h->nodes_mean[it] <= val && val < h->nodes_mean[it + 1]) {
            const double node_weight = (double)h->nodes_weight[it];
            const double node_weight_next = (double)h->nodes_weight[it + 1];
            const double node_mean = h->nodes_mean[it];
            const double node_mean_next = h->nodes_mean[it + 1];
            // landed between centroids ... check for floating point madness
            if (node_mean_next - node_mean > 0) {
                // note how we handle singleton centroids here
                // the point is that for singleton centroids, we know that their entire
                // weight is exactly at the centroid and thus shouldn't be involved in
                // interpolation
                double leftExcludedW = 0;
                double rightExcludedW = 0;
                if (node_weight == 1) {
                    if (node_weight_next == 1) {
                        // two singletons means no interpolation
                        // left singleton is in, right is out
                        return (weightSoFar + 1) / merged_weight_d;
                    } else {
                        leftExcludedW = 0.5;
                    }
                } else if (node_weight_next == 1) {
                    rightExcludedW = 0.5;
                }
                double dw = (node_weight + node_weight_next) / 2;
                // adjust endpoints for any singleton
                double dwNoSingleton = dw - leftExcludedW - rightExcludedW;
                double base = weightSoFar + node_weight / 2 + leftExcludedW;
                return (base + dwNoSingleton * (val - node_mean) / (node_mean_next - node_mean)) /
                       merged_weight_d;
            } else {
                // this is simply caution against floating point madness
                // it is conceivable that the centroids will be different
                // but too near to allow safe interpolation
                double dw = (node_weight + node_weight_next) / 2;
                return (weightSoFar + dw) / merged_weight_d;
            }
        } else {
            weightSoFar += (double)h->nodes_weight[it];
        }
    }
    // val matched the last centroid's mean exactly: everything but the
    // half-sample at max is below it.
    return 1 - 0.5 / merged_weight_d;
}
/* Shared worker for td_quantile()/td_quantiles(): maps a cumulative-count
 * `index` (in [0, merged_weight]) to an interpolated value.
 * `weightSoFar` and `node_pos` are cursors owned by the caller so repeated
 * calls with non-decreasing indices can resume the scan where it stopped. */
static double td_internal_iterate_centroids_to_index(const td_histogram_t *h, const double index,
                                                     const double left_centroid_weight,
                                                     const int total_centroids, double *weightSoFar,
                                                     int *node_pos) {
    if (left_centroid_weight > 1 && index < left_centroid_weight / 2) {
        // there is a single sample at min so we interpolate with less weight
        return h->min + (index - 1) / (left_centroid_weight / 2 - 1) * (h->nodes_mean[0] - h->min);
    }
    // usually the last centroid will have unit weight so this test will make it moot
    if (index > h->merged_weight - 1) {
        return h->max;
    }
    // if the right-most centroid has more than one sample, we still know
    // that one sample occurred at max so we can do some interpolation
    const double right_centroid_weight = (double)h->nodes_weight[total_centroids - 1];
    const double right_centroid_mean = h->nodes_mean[total_centroids - 1];
    if (right_centroid_weight > 1 &&
        (double)h->merged_weight - index <= right_centroid_weight / 2) {
        return h->max - ((double)h->merged_weight - index - 1) / (right_centroid_weight / 2 - 1) *
                            (h->max - right_centroid_mean);
    }
    for (; *node_pos < total_centroids - 1; (*node_pos)++) {
        const int i = *node_pos;
        const double node_weight = (double)h->nodes_weight[i];
        const double node_weight_next = (double)h->nodes_weight[i + 1];
        const double node_mean = h->nodes_mean[i];
        const double node_mean_next = h->nodes_mean[i + 1];
        const double dw = (node_weight + node_weight_next) / 2;
        if (*weightSoFar + dw > index) {
            // centroids i and i+1 bracket our current point
            // check for unit weight
            double leftUnit = 0;
            if (node_weight == 1) {
                if (index - *weightSoFar < 0.5) {
                    // within the singleton's sphere
                    return node_mean;
                } else {
                    leftUnit = 0.5;
                }
            }
            double rightUnit = 0;
            if (node_weight_next == 1) {
                if (*weightSoFar + dw - index <= 0.5) {
                    // no interpolation needed near singleton
                    return node_mean_next;
                }
                rightUnit = 0.5;
            }
            const double z1 = index - *weightSoFar - leftUnit;
            const double z2 = *weightSoFar + dw - index - rightUnit;
            return weighted_average(node_mean, z2, node_mean_next, z1);
        }
        *weightSoFar += dw;
    }
    // weightSoFar = totalWeight - weight[total_centroids-1]/2 (very nearly)
    // so we interpolate out to max value ever seen
    const double z1 = index - h->merged_weight - right_centroid_weight / 2.0;
    const double z2 = right_centroid_weight / 2 - z1;
    return weighted_average(right_centroid_mean, z1, h->max, z2);
}
/* Estimated value at quantile q (q in [0,1]). Compresses on entry.
 * Returns NAN for an out-of-range q or an empty histogram. */
double td_quantile(td_histogram_t *h, double q) {
    td_compress(h);
    // q should be in [0,1]
    if (q < 0.0 || q > 1.0 || h->merged_nodes == 0) {
        return NAN;
    }
    // with one data point, all quantiles lead to Rome
    if (h->merged_nodes == 1) {
        return h->nodes_mean[0];
    }
    // if values were stored in a sorted array, index would be the offset we are interested in
    const double index = q * (double)h->merged_weight;
    // beyond the boundaries, we return min or max
    // usually, the first centroid will have unit weight so this will make it moot
    if (index < 1) {
        return h->min;
    }
    // we know that there are at least two centroids now
    const int n = h->merged_nodes;
    // if the left centroid has more than one sample, we still know
    // that one sample occurred at min so we can do some interpolation
    const double left_centroid_weight = (double)h->nodes_weight[0];
    // in between extremes we interpolate between centroids
    double weightSoFar = left_centroid_weight / 2;
    int i = 0;
    return td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n, &weightSoFar,
                                                  &i);
}
/* Batch form of td_quantile(): fills values[i] with the estimate for
 * quantiles[i]. Returns 0 on success, EINVAL on NULL arrays.
 * NOTE(review): weightSoFar/node_pos persist across iterations, so the
 * scan only moves forward -- `quantiles` appears to require ascending
 * order for correct results; confirm against the header's contract. */
int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length) {
    td_compress(h);
    if (NULL == quantiles || NULL == values) {
        return EINVAL;
    }
    const int n = h->merged_nodes;
    if (n == 0) {
        for (size_t i = 0; i < length; i++) {
            values[i] = NAN;
        }
        return 0;
    }
    if (n == 1) {
        for (size_t i = 0; i < length; i++) {
            const double requested_quantile = quantiles[i];
            // q should be in [0,1]
            if (requested_quantile < 0.0 || requested_quantile > 1.0) {
                values[i] = NAN;
            } else {
                // with one data point, all quantiles lead to Rome
                values[i] = h->nodes_mean[0];
            }
        }
        return 0;
    }
    // we know that there are at least two centroids now
    // if the left centroid has more than one sample, we still know
    // that one sample occurred at min so we can do some interpolation
    const double left_centroid_weight = (double)h->nodes_weight[0];
    // in between extremes we interpolate between centroids
    double weightSoFar = left_centroid_weight / 2;
    int node_pos = 0;
    // to avoid allocations we use the values array for intermediate computation
    // i.e. to store the expected cumulative count at each percentile
    for (size_t qpos = 0; qpos < length; qpos++) {
        const double index = quantiles[qpos] * (double)h->merged_weight;
        values[qpos] = td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n,
                                                              &weightSoFar, &node_pos);
    }
    return 0;
}
/* Mean of the samples whose cumulative count falls inside
 * [leftmost_weight, rightmost_weight], clipping partial centroids at the
 * window edges.
 * NOTE(review): trimmed_count can be 0 when the window excludes everything,
 * yielding NaN via 0/0 -- confirm callers accept that. */
static double td_internal_trimmed_mean(const td_histogram_t *h, const double leftmost_weight,
                                       const double rightmost_weight) {
    double count_done = 0;
    double trimmed_sum = 0;
    double trimmed_count = 0;
    for (int i = 0; i < h->merged_nodes; i++) {
        const double n_weight = (double)h->nodes_weight[i];
        // Assume the whole centroid falls into the range
        double count_add = n_weight;
        // If we haven't reached the low threshold yet, skip appropriate part of the centroid.
        count_add -= __td_min(__td_max(0, leftmost_weight - count_done), count_add);
        // If we have reached the upper threshold, ignore the overflowing part of the centroid.
        count_add = __td_min(__td_max(0, rightmost_weight - count_done), count_add);
        // consider the whole centroid processed
        count_done += n_weight;
        // increment the sum / count
        trimmed_sum += h->nodes_mean[i] * count_add;
        trimmed_count += count_add;
        // break once we cross the high threshold
        if (count_done >= rightmost_weight)
            break;
    }
    return trimmed_sum / trimmed_count;
}
/* Mean with the same proportion cut from both tails (e.g. 0.1 drops the
 * lowest and highest 10%). Returns NAN for an empty histogram or a
 * proportion outside [0,1]. Compresses on entry. */
double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut) {
    td_compress(h);
    // proportion_to_cut should be in [0,1]
    if (h->merged_nodes == 0 || proportion_to_cut < 0.0 || proportion_to_cut > 1.0) {
        return NAN;
    }
    // with one data point, all values lead to Rome
    if (h->merged_nodes == 1) {
        return h->nodes_mean[0];
    }
    /* translate the percentiles to counts */
    const double leftmost_weight = floor((double)h->merged_weight * proportion_to_cut);
    const double rightmost_weight = ceil((double)h->merged_weight * (1.0 - proportion_to_cut));
    return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight);
}
/* Mean of the samples between the leftmost_cut and rightmost_cut
 * quantiles (each in [0,1]). Returns NAN on empty input or out-of-range
 * cuts. Compresses on entry. */
double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut) {
    td_compress(h);
    // leftmost_cut and rightmost_cut should be in [0,1]
    if (h->merged_nodes == 0 || leftmost_cut < 0.0 || leftmost_cut > 1.0 || rightmost_cut < 0.0 ||
        rightmost_cut > 1.0) {
        return NAN;
    }
    // with one data point, all values lead to Rome
    if (h->merged_nodes == 1) {
        return h->nodes_mean[0];
    }
    /* translate the percentiles to counts */
    const double leftmost_weight = floor((double)h->merged_weight * leftmost_cut);
    const double rightmost_weight = ceil((double)h->merged_weight * rightmost_cut);
    return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight);
}
/* Buffer one (mean, weight) sample, compressing first when the node buffer
 * is nearly full. Returns 0 on success, EDOM on integer or double overflow
 * (or any error propagated from td_compress). */
int td_add(td_histogram_t *h, double mean, long long weight) {
    if (should_td_compress(h)) {
        const int overflow_res = td_compress(h);
        if (overflow_res != 0)
            return overflow_res;
    }
    const int pos = next_node(h);
    if (pos >= h->cap)
        return EDOM;
    // Reject the sample if adding its weight would overflow long long.
    if (_tdigest_long_long_add_safe(h->unmerged_weight, weight) == false)
        return EDOM;
    const long long new_unmerged_weight = h->unmerged_weight + weight;
    if (_tdigest_long_long_add_safe(new_unmerged_weight, h->merged_weight) == false)
        return EDOM;
    const long long new_total_weight = new_unmerged_weight + h->merged_weight;
    // double-precision overflow detected
    const int overflow_res =
        _check_td_overflow((double)new_unmerged_weight, (double)new_total_weight);
    if (overflow_res != 0)
        return overflow_res;
    if (mean < h->min) {
        h->min = mean;
    }
    if (mean > h->max) {
        h->max = mean;
    }
    h->nodes_mean[pos] = mean;
    h->nodes_weight[pos] = weight;
    h->unmerged_nodes++;
    h->unmerged_weight = new_unmerged_weight;
    return 0;
}
/* Merge the unmerged buffer into the sorted centroid list: sort all nodes
 * by mean, then greedily fuse adjacent nodes whose combined weight stays
 * within the k-scale size bound. Returns 0 on success (including the
 * nothing-to-do case) or EDOM on double overflow. */
int td_compress(td_histogram_t *h) {
    if (h->unmerged_nodes == 0) {
        return 0;
    }
    int N = h->merged_nodes + h->unmerged_nodes;
    td_qsort(h->nodes_mean, h->nodes_weight, 0, N - 1);
    const double total_weight = (double)h->merged_weight + (double)h->unmerged_weight;
    // double-precision overflow detected
    const int overflow_res = _check_td_overflow((double)h->unmerged_weight, (double)total_weight);
    if (overflow_res != 0)
        return overflow_res;
    if (total_weight <= 1)
        return 0;
    const double denom = 2 * MM_PI * total_weight * log(total_weight);
    if (_check_overflow(denom) != 0)
        return EDOM;
    // Compute the normalizer given compression and number of points.
    const double normalizer = h->compression / denom;
    if (_check_overflow(normalizer) != 0)
        return EDOM;
    int cur = 0;
    double weight_so_far = 0;
    for (int i = 1; i < N; i++) {
        const double proposed_weight = (double)h->nodes_weight[cur] + (double)h->nodes_weight[i];
        const double z = proposed_weight * normalizer;
        // quantile up to cur
        const double q0 = weight_so_far / total_weight;
        // quantile up to cur + i
        const double q2 = (weight_so_far + proposed_weight) / total_weight;
        // Convert a quantile to the k-scale
        const bool should_add = (z <= (q0 * (1 - q0))) && (z <= (q2 * (1 - q2)));
        // next point will fit
        // so merge into existing centroid
        if (should_add) {
            h->nodes_weight[cur] += h->nodes_weight[i];
            const double delta = h->nodes_mean[i] - h->nodes_mean[cur];
            // Weighted incremental mean update keeps the fused centroid exact.
            const double weighted_delta = (delta * h->nodes_weight[i]) / h->nodes_weight[cur];
            h->nodes_mean[cur] += weighted_delta;
        } else {
            weight_so_far += h->nodes_weight[cur];
            cur++;
            h->nodes_weight[cur] = h->nodes_weight[i];
            h->nodes_mean[cur] = h->nodes_mean[i];
        }
        // Zero out slots the compaction has moved past.
        if (cur != i) {
            h->nodes_weight[i] = 0;
            h->nodes_mean[i] = 0.0;
        }
    }
    h->merged_nodes = cur + 1;
    h->merged_weight = total_weight;
    h->unmerged_nodes = 0;
    h->unmerged_weight = 0;
    h->total_compressions++;
    return 0;
}
// Smallest / largest sample values observed so far.
double td_min(td_histogram_t *h) { return h->min; }
double td_max(td_histogram_t *h) { return h->max; }
// NOTE(review): compression is stored as a double; the int return truncates
// any fractional setting -- confirm this matches the header's contract.
int td_compression(td_histogram_t *h) { return h->compression; }
// Raw access to the centroid arrays; valid entries are [0, td_centroid_count).
const long long *td_centroids_weight(td_histogram_t *h) { return h->nodes_weight; }
const double *td_centroids_mean(td_histogram_t *h) { return h->nodes_mean; }
// NOTE(review): unlike td_centroids_mean_at there is no bounds check here;
// callers must pass a valid pos.
long long td_centroids_weight_at(td_histogram_t *h, int pos) { return h->nodes_weight[pos]; }
/* Mean of the merged centroid at `pos`.
 *
 * Valid positions are [0, merged_nodes); anything else returns NAN.
 * BUG FIX: the check was `pos > h->merged_nodes`, which admitted
 * pos == merged_nodes and read one slot past the merged region. */
double td_centroids_mean_at(td_histogram_t *h, int pos) {
    if (pos < 0 || pos >= h->merged_nodes) {
        return NAN;
    }
    return h->nodes_mean[pos];
}

File diff suppressed because it is too large Load diff

417
tests/midl.c Normal file
View file

@ -0,0 +1,417 @@
/** @defgroup idls ID List Management
* @{
*/
/** A generic unsigned ID number. These were entryIDs in back-bdb.
* Preferably it should have the same size as a pointer.
*/
typedef size_t MDB_ID;
/** An IDL is an ID List, a sorted array of IDs. The first
* element of the array is a counter for how many actual
* IDs are in the list. In the original back-bdb code, IDLs are
* sorted in ascending order. For libmdb IDLs are sorted in
* descending order.
*/
typedef MDB_ID *MDB_IDL;
/* IDL sizes - likely should be even bigger
* limiting factors: sizeof(ID), thread stack size
*/
#define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */
#define MDB_IDL_DB_SIZE (1 << MDB_IDL_LOGN)
#define MDB_IDL_UM_SIZE (1 << (MDB_IDL_LOGN + 1))
#define MDB_IDL_DB_MAX (MDB_IDL_DB_SIZE - 1)
#define MDB_IDL_UM_MAX (MDB_IDL_UM_SIZE - 1)
#define MDB_IDL_SIZEOF(ids) (((ids)[0] + 1) * sizeof(MDB_ID))
#define MDB_IDL_IS_ZERO(ids) ((ids)[0] == 0)
#define MDB_IDL_CPY(dst, src) (memcpy(dst, src, MDB_IDL_SIZEOF(src)))
#define MDB_IDL_FIRST(ids) ((ids)[1])
#define MDB_IDL_LAST(ids) ((ids)[(ids)[0]])
/** Current max length of an #mdb_midl_alloc()ed IDL */
#define MDB_IDL_ALLOCLEN(ids) ((ids)[-1])
/** Append ID to IDL. The IDL must be big enough. */
#define mdb_midl_xappend(idl, id) \
do { \
MDB_ID *xidl = (idl), xlen = ++(xidl[0]); \
xidl[xlen] = (id); \
} while (0)
/** Search for an ID in an IDL.
* @param[in] ids The IDL to search.
* @param[in] id The ID to search for.
* @return The index of the first ID greater than or equal to \b id.
*/
unsigned mdb_midl_search(MDB_IDL ids, MDB_ID id);
/** Allocate an IDL.
* Allocates memory for an IDL of the given size.
* @return IDL on success, NULL on failure.
*/
MDB_IDL mdb_midl_alloc(int num);
/** Free an IDL.
* @param[in] ids The IDL to free.
*/
void mdb_midl_free(MDB_IDL ids);
/** Shrink an IDL.
* Return the IDL to the default size if it has grown larger.
* @param[in,out] idp Address of the IDL to shrink.
*/
void mdb_midl_shrink(MDB_IDL *idp);
/** Shrink an IDL to a specific size.
 * Resize the IDL to \b size if it is larger.
 * @param[in,out] idp Address of the IDL to shrink.
 * @param[in] size Capacity to have once resized.
 * NOTE(review): this declaration duplicates mdb_midl_shrink() above and
 * takes no \b size parameter, so the documented contract cannot be met --
 * the intended prototype is likely a distinct function such as
 * mdb_midl_shrink_to(MDB_IDL *idp, size_t size); confirm against midl.c.
 */
void mdb_midl_shrink(MDB_IDL *idp);
/** Make room for num additional elements in an IDL.
* @param[in,out] idp Address of the IDL.
* @param[in] num Number of elements to make room for.
* @return 0 on success, ENOMEM on failure.
*/
int mdb_midl_need(MDB_IDL *idp, unsigned num);
/** Append an ID onto an IDL.
* @param[in,out] idp Address of the IDL to append to.
* @param[in] id The ID to append.
* @return 0 on success, ENOMEM if the IDL is too large.
*/
int mdb_midl_append(MDB_IDL *idp, MDB_ID id);
/** Append an IDL onto an IDL.
* @param[in,out] idp Address of the IDL to append to.
* @param[in] app The IDL to append.
* @return 0 on success, ENOMEM if the IDL is too large.
*/
int mdb_midl_append_list(MDB_IDL *idp, MDB_IDL app);
/** Append an ID range onto an IDL.
* @param[in,out] idp Address of the IDL to append to.
* @param[in] id The lowest ID to append.
* @param[in] n Number of IDs to append.
* @return 0 on success, ENOMEM if the IDL is too large.
*/
int mdb_midl_append_range(MDB_IDL *idp, MDB_ID id, unsigned n);
/** Merge an IDL onto an IDL. The destination IDL must be big enough.
* @param[in] idl The IDL to merge into.
* @param[in] merge The IDL to merge.
*/
void mdb_midl_xmerge(MDB_IDL idl, MDB_IDL merge);
/** Sort an IDL.
* @param[in,out] ids The IDL to sort.
*/
void mdb_midl_sort(MDB_IDL ids);
/* midl.c ------------------------------------------------------------------ */
/** @defgroup idls ID List Management
* @{
*/
/* Three-way compare: -1 if x < y, +1 if x > y, 0 if equal. */
#define CMP(x, y) ((x) < (y) ? -1 : (x) > (y))
unsigned
mdb_midl_search(MDB_IDL ids, MDB_ID id)
{
	/*
	 * binary search of id in ids
	 * if found, returns position of id
	 * if not found, returns first position greater than id
	 * NOTE(review): the branch directions below assume entries are
	 * kept sorted in descending order (largest id at index 1), as
	 * produced by mdb_midl_sort — confirm against callers.
	 */
	unsigned base = 0;   /* index just below the current search window */
	unsigned cursor = 1; /* candidate position (IDLs are 1-based) */
	int val = 0;         /* result of the last comparison */
	unsigned n = ids[0]; /* number of elements left in the window */
	while (0 < n) {
		unsigned pivot = n >> 1;
		cursor = base + pivot + 1;
		val = CMP(ids[cursor], id);
		if (val < 0) {
			/* ids[cursor] < id: continue in the lower-index half */
			n = pivot;
		} else if (val > 0) {
			/* ids[cursor] > id: continue in the higher-index half */
			base = cursor;
			n -= pivot + 1;
		} else {
			return cursor; /* exact match */
		}
	}
	if (val > 0) {
		++cursor; /* insertion point is just past the last probe */
	}
	return cursor;
}
/* Insert id into the sorted IDL ids, keeping it sorted.
 * Returns 0 on success, -1 on duplicate, -2 on internal error or no room.
 */
int
mdb_midl_insert(MDB_IDL ids, MDB_ID id)
{
	unsigned pos = mdb_midl_search(ids, id);
	assert(pos > 0);
	if (pos < 1) {
		/* internal error */
		return -2;
	}
	if (pos <= ids[0] && ids[pos] == id) {
		/* duplicate */
		assert(0);
		return -1;
	}
	if (++ids[0] >= MDB_IDL_DB_MAX) {
		/* no room */
		--ids[0];
		return -2;
	}
	/* open a hole at pos and drop the id in */
	memmove(&ids[pos + 1], &ids[pos], (ids[0] - pos) * sizeof(MDB_ID));
	ids[pos] = id;
	return 0;
}
/* Discard the last n entries of ids by shrinking its count. */
inline void
mdb_midl_pop_n(MDB_IDL ids, unsigned n)
{
	ids[0] -= n;
}
/* Remove the element at index idx (1-based) from ids, shifting the
 * tail down one slot and decrementing the count.
 *
 * BUG FIX: the previous loop condition (idx < ids[0] - 1) terminated
 * one element early, so removing from the middle of the list never
 * shifted the final element down — it was silently dropped and the
 * second-to-last slot duplicated. The shift must cover every element
 * from idx+1 through ids[0] inclusive.
 */
void
mdb_midl_remove_at(MDB_IDL ids, unsigned idx)
{
	for (unsigned i = idx; i < ids[0]; i++)
		ids[i] = ids[i + 1];
	ids[0] = ids[0] - 1;
}
/* Remove id from ids if present; no-op when absent. */
void
mdb_midl_remove(MDB_IDL ids, MDB_ID id)
{
	unsigned pos = mdb_midl_search(ids, id);
	if (pos > ids[0] || ids[pos] != id)
		return; /* not in the list */
	mdb_midl_remove_at(ids, pos);
}
MDB_IDL
mdb_midl_alloc(int num)
{
MDB_IDL ids = malloc((num + 2) * sizeof(MDB_ID));
if (ids) {
*ids++ = num;
*ids = 0;
}
return ids;
}
/* Free an IDL allocated by mdb_midl_alloc. Safe to pass NULL. */
void
mdb_midl_free(MDB_IDL ids)
{
	if (!ids)
		return;
	free(ids - 1); /* the allocation begins at the capacity word */
}
/* Shrink an over-grown IDL back to the default MDB_IDL_UM_MAX
 * capacity. On realloc failure the original IDL is left intact.
 */
void
mdb_midl_shrink(MDB_IDL *idp)
{
	MDB_IDL ids = *idp - 1; /* step back to the capacity word */
	if (*ids > MDB_IDL_UM_MAX) {
		MDB_IDL tmp = realloc(ids, (MDB_IDL_UM_MAX + 2) * sizeof(MDB_ID));
		if (tmp) {
			tmp[0] = MDB_IDL_UM_MAX;
			*idp = tmp + 1;
		}
	}
}
/* Shrink an IDL to capacity size if it is currently larger, clamping
 * the element count to the new capacity. On realloc failure the
 * original IDL is left intact.
 */
void
mdb_midl_shrink_to(MDB_IDL *idp, size_t size)
{
	MDB_IDL ids = *idp - 1; /* step back to the capacity word */
	if (*ids > size) {
		MDB_IDL tmp = realloc(ids, (size + 2) * sizeof(MDB_ID));
		if (tmp) {
			tmp[0] = size; /* new capacity */
			if (tmp[1] > size)
				tmp[1] = size; /* truncate the count to fit */
			*idp = tmp + 1;
		}
	}
}
/* Grow the capacity of an IDL by num elements.
 * Returns 0 on success, ENOMEM on failure (IDL left intact).
 */
static int
mdb_midl_grow(MDB_IDL *idp, int num)
{
	MDB_IDL ids = *idp - 1; /* allocation starts at the capacity word */
	MDB_IDL tmp = realloc(ids, (*ids + num + 2) * sizeof(MDB_ID));
	if (!tmp)
		return ENOMEM;
	tmp[0] += num; /* record the enlarged capacity */
	*idp = tmp + 1;
	return 0;
}
/* Ensure the IDL has room for num more elements, reallocating with
 * ~25% headroom rounded up to a multiple of 256 when it does not.
 * Returns 0 on success, ENOMEM on failure.
 */
int
mdb_midl_need(MDB_IDL *idp, unsigned num)
{
	MDB_IDL ids = *idp;
	unsigned want = num + ids[0]; /* total elements after the append */
	if (want > ids[-1]) {
		/* grow by a quarter plus slack, aligned to 256 words */
		want = (want + want / 4 + (256 + 2)) & -256;
		MDB_IDL tmp = realloc(ids - 1, want * sizeof(MDB_ID));
		if (!tmp)
			return ENOMEM;
		tmp[0] = want - 2; /* capacity excludes the two header words */
		*idp = tmp + 1;
	}
	return 0;
}
/* Append a single id to the IDL, growing it if full.
 * Returns 0 on success, ENOMEM on failure.
 */
int
mdb_midl_append(MDB_IDL *idp, MDB_ID id)
{
	MDB_IDL ids = *idp;
	if (ids[0] >= ids[-1]) {
		/* at capacity: enlarge before storing */
		if (mdb_midl_grow(idp, MDB_IDL_UM_MAX))
			return ENOMEM;
		ids = *idp;
	}
	MDB_ID n = ids[0] + 1;
	ids[0] = n;
	ids[n] = id;
	return 0;
}
/* Append the contents of IDL app onto the IDL at idp, growing the
 * destination if needed. Returns 0 on success, ENOMEM on failure.
 */
int
mdb_midl_append_list(MDB_IDL *idp, MDB_IDL app)
{
	MDB_IDL ids = *idp;
	if (ids[0] + app[0] >= ids[-1]) {
		/* not enough room for the combined contents */
		if (mdb_midl_grow(idp, app[0]))
			return ENOMEM;
		ids = *idp;
	}
	memcpy(ids + ids[0] + 1, app + 1, app[0] * sizeof(MDB_ID));
	ids[0] += app[0];
	return 0;
}
/* Append the n consecutive ids id, id+1, ..., id+n-1 onto the IDL,
 * stored so the highest value lands in the lowest new slot.
 * Returns 0 on success, ENOMEM on failure.
 */
int
mdb_midl_append_range(MDB_IDL *idp, MDB_ID id, unsigned n)
{
	MDB_ID *ids = *idp;
	MDB_ID len = ids[0];
	if (len + n > ids[-1]) {
		/* grow by at least MDB_IDL_UM_MAX (n | mask, as before) */
		if (mdb_midl_grow(idp, n | MDB_IDL_UM_MAX))
			return ENOMEM;
		ids = *idp;
	}
	ids[0] = len + n;
	/* fill slots len+n down to len+1 with ascending ids */
	for (unsigned k = n; k > 0; k--)
		ids[len + k] = id++;
	return 0;
}
/* Merge the sorted IDL merge into the sorted IDL idl, in place.
 * idl must already have capacity for idl[0] + merge[0] elements.
 * Works back-to-front so no element is overwritten before it is read;
 * idl[0] is temporarily set to the maximum MDB_ID as a sentinel so the
 * inner scan cannot run past slot 1.
 * NOTE(review): assumes both lists use the descending order produced
 * by mdb_midl_sort — confirm against callers.
 */
void
mdb_midl_xmerge(MDB_IDL idl, MDB_IDL merge)
{
	MDB_ID old_id, merge_id, i = merge[0], j = idl[0], k = i + j, total = k;
	idl[0] = (MDB_ID)-1; /* delimiter for idl scan below */
	old_id = idl[j];
	while (i) {
		merge_id = merge[i--];
		/* slide existing entries smaller than merge_id toward the end */
		for (; old_id < merge_id; old_id = idl[--j])
			idl[k--] = old_id;
		idl[k--] = merge_id;
	}
	idl[0] = total; /* restore the real count */
}
/* Quicksort + Insertion sort for small arrays */
#define SMALL 8
/* Exchange two MDB_IDs via the caller-scoped temporary 'itmp'. */
#define MIDL_SWAP(a, b) \
	{ \
		itmp = (a); \
		(a) = (b); \
		(b) = itmp; \
	}
/** Sort an IDL into descending order (largest id first), as the
 * comparison directions below show. Non-recursive quicksort with a
 * median-of-three pivot, using an explicit partition stack and
 * falling back to insertion sort for spans shorter than SMALL.
 */
void
mdb_midl_sort(MDB_IDL ids)
{
	/* Max possible depth of int-indexed tree * 2 items/level */
	int istack[sizeof(int) * CHAR_BIT * 2];
	int i, j, k, l, ir, jstack;
	MDB_ID a, itmp;
	ir = (int)ids[0]; /* right edge of the current partition (1-based) */
	l = 1;            /* left edge */
	jstack = 0;       /* top of the pending-partition stack */
	for (;;) {
		if (ir - l < SMALL) { /* Insertion sort */
			for (j = l + 1; j <= ir; j++) {
				a = ids[j];
				/* shift smaller elements right until a's slot is found */
				for (i = j - 1; i >= 1; i--) {
					if (ids[i] >= a)
						break;
					ids[i + 1] = ids[i];
				}
				ids[i + 1] = a;
			}
			if (jstack == 0)
				break; /* no partitions left: done */
			/* pop the next pending partition */
			ir = istack[jstack--];
			l = istack[jstack--];
		} else {
			k = (l + ir) >> 1; /* Choose median of left, center, right */
			MIDL_SWAP(ids[k], ids[l + 1]);
			if (ids[l] < ids[ir]) {
				MIDL_SWAP(ids[l], ids[ir]);
			}
			if (ids[l + 1] < ids[ir]) {
				MIDL_SWAP(ids[l + 1], ids[ir]);
			}
			if (ids[l] < ids[l + 1]) {
				MIDL_SWAP(ids[l], ids[l + 1]);
			}
			i = l + 1;
			j = ir;
			a = ids[l + 1]; /* partitioning element */
			for (;;) {
				/* scan inward from both ends until pointers cross */
				do
					i++;
				while (ids[i] > a);
				do
					j--;
				while (ids[j] < a);
				if (j < i)
					break;
				MIDL_SWAP(ids[i], ids[j]);
			}
			ids[l + 1] = ids[j]; /* put the pivot in its final slot */
			ids[j] = a;
			jstack += 2;
			/* push the larger side, iterate into the smaller side */
			if (ir - i + 1 >= j - l) {
				istack[jstack] = ir;
				istack[jstack - 1] = i;
				ir = j - 1;
			} else {
				istack[jstack] = j - 1;
				istack[jstack - 1] = l;
				l = i;
			}
		}
	}
}

1485
tests/soak.c Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff