select/rank for unset as well as set bits #4

Merged
greg merged 13 commits from gburd/select-neg-bool into main 2024-04-24 20:32:10 +00:00
6 changed files with 94 additions and 77 deletions
Showing only changes of commit 6e65bda211 - Show all commits

2
.envrc
View file

@ -1,5 +1,5 @@
if ! has nix_direnv_version || ! nix_direnv_version 3.0.4; then
source_url "https://raw.githubusercontent.com/nix-community/nix-direnv/3.0.4/direnvrc" "sha256-DzlYZ33mWF/Gs8DDeyjr8mnVmQGx7ASYqA5WlxwvBG4="
fi
watch_file devShell.nix shell.nix flake.nix
watch_file shell.nix flake.nix
use flake || use nix

View file

@ -1,5 +1,5 @@
{
description = "A Concurrent Skip List library for key/value pairs.";
description = "A sparse bitmapped index library in C.";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";

View file

@ -475,18 +475,7 @@ __sm_chunk_map_select(__sm_chunk_t *map, size_t n, ssize_t *pnew_n, bool value)
return ret;
}
/**
 * Debug helper: writes the 64 bits of 'value' to stdout, most significant bit
 * first, with a space after every group of eight bits and a trailing newline.
 *
 * @param name   optional label printed first, followed by a tab; may be NULL,
 *               in which case no label is printed
 * @param value  the 64-bit value whose bits are displayed
 */
void printBits(char *name, uint64_t value) {
  if (name) {
    printf("%s\t", name);
  }
  for (int i = 63; i >= 0; i--) {
    /* (value >> i) & 1 has type uint64_t; narrow the single bit to int so the
       argument matches "%d". The previous "%ld" did not match an unsigned
       64-bit argument, which is undefined behavior where long is 32 bits. */
    printf("%d", (int)((value >> i) & 1));
    if (i % 8 == 0) {
      printf(" "); // Add space for better readability
    }
  }
  printf("\n");
}
extern void print_bits(char *name, uint64_t value); // GSB
/**
* Counts the set bits in the range [0, 'idx'] inclusive ignoring the first
@ -507,47 +496,51 @@ __sm_chunk_map_rank(__sm_chunk_t *map, size_t *offset, size_t idx, sm_bitvec_t *
continue;
}
if (flags == SM_PAYLOAD_ZEROS) {
if (idx > SM_BITS_PER_VECTOR) {
*vec = 0;
if (idx >= SM_BITS_PER_VECTOR) {
idx -= SM_BITS_PER_VECTOR;
if (*offset > SM_BITS_PER_VECTOR) {
*offset = *offset - SM_BITS_PER_VECTOR;
} else {
idx -= SM_BITS_PER_VECTOR - *offset;
if (*offset == 0) {
if (value == false) {
ret += SM_BITS_PER_VECTOR;
}
if (value == false) {
ret += SM_BITS_PER_VECTOR - *offset;
}
*offset = 0;
}
} else {
*vec = 0;
if (value == false) {
if (*offset > idx) {
*offset = *offset - idx;
} else {
return ret + idx - *offset;
ret += idx + 1 - *offset;
*offset = 0;
return ret;
}
} else {
return ret;
}
}
} else if (flags == SM_PAYLOAD_ONES) {
if (idx > SM_BITS_PER_VECTOR) {
*vec = UINT64_MAX;
if (idx >= SM_BITS_PER_VECTOR) {
idx -= SM_BITS_PER_VECTOR;
if (*offset > SM_BITS_PER_VECTOR) {
*offset = *offset - SM_BITS_PER_VECTOR;
} else {
idx -= SM_BITS_PER_VECTOR - *offset;
if (*offset == 0) {
if (value == true) {
ret += SM_BITS_PER_VECTOR;
}
if (value == true) {
ret += SM_BITS_PER_VECTOR - *offset;
}
*offset = 0;
}
} else {
*vec = UINT64_MAX;
if (value == true) {
return ret + idx;
if (*offset > idx) {
*offset = *offset - idx;
} else {
ret += idx + 1 - *offset;
*offset = 0;
return ret;
}
} else {
return ret;
}
@ -555,8 +548,8 @@ __sm_chunk_map_rank(__sm_chunk_t *map, size_t *offset, size_t idx, sm_bitvec_t *
} else if (flags == SM_PAYLOAD_MIXED) {
sm_bitvec_t w = map->m_data[1 + __sm_chunk_map_get_position(map, i * SM_FLAGS_PER_INDEX_BYTE + j)];
if (idx >= SM_BITS_PER_VECTOR) {
uint64_t mask = *offset > 0 ? ~(UINT64_MAX >> (SM_BITS_PER_VECTOR - *offset)) : UINT64_MAX;
idx -= SM_BITS_PER_VECTOR;
uint64_t mask = *offset == 0 ? UINT64_MAX : ~(UINT64_MAX >> (SM_BITS_PER_VECTOR - (*offset >= 64 ? 64 : *offset)));
size_t pc = popcountll(w & mask);
if (value == true) {
ret += pc;
@ -581,20 +574,20 @@ __sm_chunk_map_rank(__sm_chunk_t *map, size_t *offset, size_t idx, sm_bitvec_t *
that the offset bits are also set. Then popcount(). Then we subtract the count of set
bits found after masking from the possible number of bits that we examined. This should
have inverted the popcount() and counted the unset bits in the range [*offset, idx]. */
mask = idx_mask | offset_mask;
mask = idx_mask | (offset_mask > idx_mask ? idx_mask : offset_mask);
mw = w & mask;
size_t pc = popcountll(mw);
#if 0
#if 0 // GSB
printf("---------------------\n");
printBits("om", offset_mask);
printBits("im", idx_mask);
printBits("m", mask);
printBits("mw", mw);
print_bits("om", offset_mask);
print_bits("im", idx_mask);
print_bits("m", mask);
print_bits("mw", mw);
printf("pc: %lu\tidx:%lu\t*o:%lu\n", pc, idx, *offset);
#endif
ret += idx + 1 - pc; /* We accounted for offset in our masking above. */
}
*offset = *offset > idx ? *offset - idx : 0;
*offset = *offset > idx ? *offset - idx + 1 : 0;
*vec = mw;
(*vec) <<= *offset;
return ret;
@ -876,7 +869,7 @@ sparsemap(size_t size)
sparsemap_t *map = (sparsemap_t *)calloc(1, total_size);
if (map) {
uint8_t *data = (uint8_t *)(((uintptr_t)map + sizeof(sparsemap_t)) & ~ (uintptr_t)7);
uint8_t *data = (uint8_t *)(((uintptr_t)map + sizeof(sparsemap_t)) & ~(uintptr_t)7);
sparsemap_init(map, data, size);
__sm_when_diag({ __sm_assert(IS_8_BYTE_ALIGNED(map->m_data)); });
}
@ -934,7 +927,7 @@ sparsemap_set_data_size(sparsemap_t *map, size_t size)
}
memset(((uint8_t *)m) + sizeof(sparsemap_t) + (m->m_capacity * sizeof(uint8_t)), 0, size - m->m_capacity + padding);
m->m_capacity = data_size;
m->m_data = (uint8_t *)(((uintptr_t)m + sizeof(sparsemap_t)) & ~ (uintptr_t)7);
m->m_data = (uint8_t *)(((uintptr_t)m + sizeof(sparsemap_t)) & ~(uintptr_t)7);
__sm_when_diag({ __sm_assert(IS_8_BYTE_ALIGNED(m->m_data)); }) return m;
} else {
map->m_capacity = size;
@ -1108,9 +1101,9 @@ sparsemap_set(sparsemap_t *map, sparsemap_idx_t idx, bool value)
__sm_insert_data(map, offset, (uint8_t *)&fill, sizeof(sm_bitvec_t));
}
__sm_when_diag({
code = __sm_chunk_map_set(&chunk, idx - start, value, &position, &fill, true);
__sm_assert(code == SM_OK);
});
code = __sm_chunk_map_set(&chunk, idx - start, value, &position, &fill, true);
__sm_assert(code == SM_OK);
});
break;
case SM_NEEDS_TO_SHRINK:
/* If the __sm_chunk_t is empty then remove it. */
@ -1155,7 +1148,7 @@ sparsemap_get_size(sparsemap_t *map)
__sm_when_diag({
size_t used = __sm_get_size_impl(map);
__sm_assert(map->m_data_used == used);
});
});
return map->m_data_used;
}
return map->m_data_used = __sm_get_size_impl(map);
@ -1337,47 +1330,44 @@ size_t
sparsemap_rank_vec(sparsemap_t *map, size_t x, size_t y, bool value, sm_bitvec_t *vec)
{
assert(sparsemap_get_size(map) >= SM_SIZEOF_OVERHEAD);
size_t gap, amt = 0, result = 0, prev = 0, count = __sm_get_chunk_map_count(map);
size_t amt = 0, result = 0, prev = 0, count = __sm_get_chunk_map_count(map);
uint8_t *p = __sm_get_chunk_map_data(map, 0);
/* The count/rank of unset bits in an empty map is inf, so what you requested is the answer. */
if (count == 0 && value == false) {
return y - x + 1;
if (count == 0) {
if (value == false) {
/* The count/rank of unset bits in an empty map is inf, so what you requested is the answer. */
return y - x + 1;
}
}
for (size_t i = 0; i < count; i++) {
sm_idx_t start = *(sm_idx_t *)p;
gap = start - (prev == 0 ? start : prev);
/* Start of this chunk is greater than the end of the desired range. */
if (start > y) {
if (value == true) {
return result;
} else {
/* This chunk starts after our range [x, y]. */
return result + gap + (y - x) + 1;
return result + (y - x) + 1;
}
} else {
/* The range and this chunk overlap. */
if (value == false) {
result += start - x;
if (x > start) {
x -= start;
} else {
x = 0;
}
}
}
x -= gap;
if (value == false) {
result += gap;
}
prev = start;
p += sizeof(sm_idx_t);
__sm_chunk_t chunk;
__sm_chunk_map_init(&chunk, p);
/* Ensure that x, the offset, isn't beyond the start of this chunk. */
//if (x > y - start) {
// amt = value ? 0 : y - start + 1;
//} else {
/* Count all the set/unset inside this chunk. */
amt = __sm_chunk_map_rank(&chunk, &x, y - start, vec, value);
//}
/* Count all the set/unset inside this chunk. */
amt = __sm_chunk_map_rank(&chunk, &x, y - start, vec, value);
result += amt;
p += __sm_chunk_map_get_size(&chunk);
}
@ -1419,8 +1409,9 @@ sparsemap_span(sparsemap_t *map, sparsemap_idx_t idx, size_t len, bool value)
nth++;
}
}
if (count)
if (count) {
nth++;
}
/* Use select to potentially jump very far forward in the map. */
offset = sparsemap_select(map, nth, value);
} while (offset != SPARSEMAP_IDX_MAX);

View file

@ -338,6 +338,21 @@ rank_uint64(uint64_t number, int n, int p)
return count;
}
/**
 * Debug helper: writes the 64 bits of 'value' to stdout, most significant bit
 * first, with a space after every group of eight bits and a trailing newline.
 *
 * @param name   optional label printed first, followed by a tab; may be NULL,
 *               in which case no label is printed
 * @param value  the 64-bit value whose bits are displayed
 */
void
print_bits(char *name, uint64_t value)
{
  if (name) {
    printf("%s\t", name);
  }
  for (int i = 63; i >= 0; i--) {
    /* (value >> i) & 1 has type uint64_t; narrow the single bit to int so the
       argument matches "%d". The previous "%ld" did not match an unsigned
       64-bit argument, which is undefined behavior where long is 32 bits. */
    printf("%d", (int)((value >> i) & 1));
    if (i % 8 == 0) {
      printf(" "); // Add space for better readability
    }
  }
  printf("\n");
}
void
sm_bitmap_from_uint64(sparsemap_t *map, uint64_t number)
{

View file

@ -43,6 +43,8 @@ void shuffle(int *array, size_t n);
int ensure_sequential_set(int a[], int l, int r);
sparsemap_idx_t sm_add_span(sparsemap_t *map, int map_size, int span_length);
void print_bits(char *name, uint64_t value);
void bitmap_from_uint32(sparsemap_t *map, uint32_t number);
void sm_bitmap_from_uint64(sparsemap_t *map, uint64_t number);
uint32_t rank_uint64(uint64_t number, int n, int p);

View file

@ -668,7 +668,7 @@ test_api_select_false(const MunitParameter params[], void *data)
assert_ptr_not_null(map);
/* First few 0/off/unset-bits in ((uint64_t)0xfeedface << 32) | 0xbadc0ffee) expressed as an array of offsets. */
int off[] = { 0, 4, 16, 17, 18, 19, 20, 21, 25, 28, 30, 36, 37, 40, 42, 49, 52, 56, 64, 65};
int off[] = { 0, 4, 16, 17, 18, 19, 20, 21, 25, 28, 30, 36, 37, 40, 42, 49, 52, 56, 64, 65 };
for (int i = 0; i < 20; i++) {
sparsemap_idx_t f = sparsemap_select(map, i, false);
assert_true(f == off[i]);
@ -757,12 +757,6 @@ test_api_rank_true(const MunitParameter params[], void *data)
for (int i = 0; i < 10; i++) {
sparsemap_set(map, i, true);
}
for (int i = 0; i < 10; i++) {
assert_true(sparsemap_is_set(map, i));
}
for (int i = 10; i < 1000; i++) {
assert_true(!sparsemap_is_set(map, i));
}
/* rank() is also 0-based, for consistency (and confusion's sake); the
range [start, end] of [0, 9] counts the bits set in the first 10
positions (starting from the LSB) in the index. */
@ -772,11 +766,24 @@ test_api_rank_true(const MunitParameter params[], void *data)
assert_true(sparsemap_rank(map, 0, 9, true) == 10);
assert_true(sparsemap_rank(map, 1000, 1050, true) == 0);
for (int i = 0; i < 10; i++) {
for (int j = i; j < 10; j++) {
r1 = rank_uint64((uint64_t)-1, i, j);
r2 = sparsemap_rank(map, i, j, true);
assert_true(r1 == r2);
sparsemap_clear(map);
for (int i = 0; i < 10000; i++) {
sparsemap_set(map, i, true);
}
sparsemap_idx_t hole = 4999;
sparsemap_set(map, hole, false);
for (int i = 0; i < 10000; i++) {
for (int j = i; j < 10000; j++) {
int amt = j - i + 1 - ((hole >= i && j >= hole) ? 1 : 0);
int r = sparsemap_rank(map, i, j, true);
#ifdef DEBUG
if (r != amt) {
printf("\033[2K\r");
printf("%d\t%d\t--\t%d\t%d", i, j, amt, r);
}
#endif
assert_true(r == amt);
}
}
@ -822,11 +829,13 @@ test_api_rank_false(const MunitParameter params[], void *data)
// one chunk means not so empty now!
sparsemap_idx_t hole = 4999;
sparsemap_set(map, hole, true);
for (int i = 0; i < 10000; i++) {
for (int j = i; j < 10000; j++) {
// for (int i = 0; i < 10000; i++) {
// for (int j = i; j < 10000; j++) {
for (int i = 5000; i < 10000; i++) {
for (int j = 5000; j < 10000; j++) {
int amt = j - i + 1 - ((hole >= i && j >= hole) ? 1 : 0);
r = sparsemap_rank(map, i, j, false);
#if 1
#ifdef DEBUG
if (r != amt) {
printf("\033[2K\r");
printf("%d\t%d\t--\t%d\t%d", i, j, amt, r);