Implement K-min-values Sketch for Similarity detection.
This commit is contained in:
parent
117382c141
commit
f3f472b860
4 changed files with 257 additions and 101 deletions
|
@ -24,8 +24,8 @@
|
||||||
PROG= pcompress
|
PROG= pcompress
|
||||||
MAINSRCS = main.c utils/utils.c allocator.c zlib_compress.c bzip2_compress.c \
|
MAINSRCS = main.c utils/utils.c allocator.c zlib_compress.c bzip2_compress.c \
|
||||||
lzma_compress.c ppmd_compress.c adaptive_compress.c lzfx_compress.c \
|
lzma_compress.c ppmd_compress.c adaptive_compress.c lzfx_compress.c \
|
||||||
lz4_compress.c none_compress.c utils/xxhash.c
|
lz4_compress.c none_compress.c utils/xxhash.c utils/heapq.c
|
||||||
MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h
|
MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heapq.h
|
||||||
MAINOBJS = $(MAINSRCS:.c=.o)
|
MAINOBJS = $(MAINSRCS:.c=.o)
|
||||||
|
|
||||||
RABINSRCS = rabin/rabin_dedup.c
|
RABINSRCS = rabin/rabin_dedup.c
|
||||||
|
|
|
@ -63,9 +63,12 @@
|
||||||
#include <allocator.h>
|
#include <allocator.h>
|
||||||
#include <utils.h>
|
#include <utils.h>
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
|
#include <heapq.h>
|
||||||
|
|
||||||
#include "rabin_dedup.h"
|
#include "rabin_dedup.h"
|
||||||
|
|
||||||
|
#define FORTY_PCNT(x) (((x)/5 << 1))
|
||||||
|
|
||||||
extern int lzma_init(void **data, int *level, ssize_t chunksize);
|
extern int lzma_init(void **data, int *level, ssize_t chunksize);
|
||||||
extern int lzma_compress(void *src, size_t srclen, void *dst,
|
extern int lzma_compress(void *src, size_t srclen, void *dst,
|
||||||
size_t *destlen, int level, uchar_t chdr, void *data);
|
size_t *destlen, int level, uchar_t chdr, void *data);
|
||||||
|
@ -87,19 +90,7 @@ rabin_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_
|
||||||
{
|
{
|
||||||
uint32_t min_blk;
|
uint32_t min_blk;
|
||||||
|
|
||||||
min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
|
min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS - 1);
|
||||||
if (rab_blk_sz > 1)
|
|
||||||
return (min_blk);
|
|
||||||
|
|
||||||
if (((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) &&
|
|
||||||
chunksize <= LZMA_WINDOW_MAX) || delta_flag) {
|
|
||||||
if (memcmp(algo, "lzfx", 4) == 0 || memcmp(algo, "lz4", 3) == 0 ||
|
|
||||||
memcmp(algo, "zlib", 4) == 0 || memcmp(algo, "none", 4) == 0) {
|
|
||||||
min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS - 1);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS - 1);
|
|
||||||
}
|
|
||||||
return (min_blk);
|
return (min_blk);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -298,32 +289,25 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
char *buf1 = (char *)buf;
|
char *buf1 = (char *)buf;
|
||||||
uint32_t length;
|
uint32_t length;
|
||||||
uint64_t cur_roll_checksum, cur_pos_checksum, cur_sketch;
|
uint64_t cur_roll_checksum, cur_pos_checksum, cur_sketch;
|
||||||
uint64_t *fplist;
|
uint32_t *fplist;
|
||||||
uint32_t len1, fpos[2], cur_sketch2;
|
heap_t heap;
|
||||||
uint32_t *charcounts, byts;
|
|
||||||
|
|
||||||
if (rabin_pos == NULL) {
|
if (rabin_pos == NULL) {
|
||||||
/*
|
/*
|
||||||
* Initialize arrays for sketch computation. We re-use memory allocated
|
* Initialize arrays for sketch computation. We re-use memory allocated
|
||||||
* for the compressed chunk temporarily.
|
* for the compressed chunk temporarily.
|
||||||
*/
|
*/
|
||||||
fplist_sz = 8 * ctx->rabin_poly_avg_block_size;
|
fplist_sz = 4 * ctx->rabin_poly_max_block_size;
|
||||||
fplist = (uint64_t *)(ctx->cbuf + ctx->real_chunksize - fplist_sz - 256 * 4);
|
fplist = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - fplist_sz);
|
||||||
charcounts = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - 256 * 4);
|
|
||||||
memset(fplist, 0, fplist_sz);
|
memset(fplist, 0, fplist_sz);
|
||||||
memset(charcounts, 0, 256 * 4);
|
reset_heap(&heap, fplist_sz/2);
|
||||||
fpos[0] = 0;
|
|
||||||
fpos[1] = 0;
|
|
||||||
len1 = 0;
|
|
||||||
}
|
}
|
||||||
length = offset;
|
length = offset;
|
||||||
last_offset = 0;
|
last_offset = 0;
|
||||||
blknum = 0;
|
blknum = 0;
|
||||||
ctx->valid = 0;
|
ctx->valid = 0;
|
||||||
cur_roll_checksum = 0;
|
cur_roll_checksum = 0;
|
||||||
j = 0;
|
|
||||||
cur_sketch = 0;
|
cur_sketch = 0;
|
||||||
cur_sketch2 = 0;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
|
* If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
|
||||||
|
@ -362,6 +346,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
}
|
}
|
||||||
|
|
||||||
if (*size < ctx->rabin_poly_avg_block_size) return;
|
if (*size < ctx->rabin_poly_avg_block_size) return;
|
||||||
|
j = 0;
|
||||||
|
|
||||||
for (i=offset; i<*size; i++) {
|
for (i=offset; i<*size; i++) {
|
||||||
uint32_t *splits;
|
uint32_t *splits;
|
||||||
uchar_t cur_byte = buf1[i];
|
uchar_t cur_byte = buf1[i];
|
||||||
|
@ -379,68 +365,19 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
|
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Compute a super sketch value of the block. We store a sum of relative
|
* Retain a list of all fingerprints in the block. We then compute
|
||||||
* maximal rabin hash values per 1K(SKETCH_BASIC_BLOCK_SZ) of data. So we
|
* the K min values sketch from that list and generate a super sketch
|
||||||
* get upto 128 sums for a max block size of 128K. The bottom blocksize bits
|
* by hashing over the K min values sketch. We only store the least
|
||||||
* of the hash are only used which are then biased with the occurrence count.
|
* significant 32 bits of the fingerprint. This uses less memory,
|
||||||
* This is a representative fingerprint sketch of the block. Storing and
|
* requires smaller memset() calls and generates a sufficiently large
|
||||||
* comparing upto 128 fingerprints per block is very expensive (compute & RAM)
|
* number of similarity matches without false positives - determined
|
||||||
* so we eventually sum all the fingerprints for the block to create a single
|
* by experimentation.
|
||||||
* super sketch value representing maximal features of the block. In addition
|
|
||||||
* the top 2 commonly occuring byte values are used to compute a second sketch
|
|
||||||
* to refine the earlier one.
|
|
||||||
*
|
*
|
||||||
* This value can be used for similarity detection for delta encoding. Exact
|
* This is called minhashing and is used widely, for example in various
|
||||||
* match for deduplication is additionally detected via a memcmp(). This is a
|
* search engines to detect similar documents.
|
||||||
* variant of some approaches detailed in:
|
|
||||||
* http://www.armedia.com/wp/SimilarityIndex.pdf
|
|
||||||
*/
|
*/
|
||||||
len1++;
|
fplist[j] = cur_pos_checksum & 0xFFFFFFFFUL;
|
||||||
fpos[1] = cur_pos_checksum & ctx->rabin_avg_block_mask;
|
j++;
|
||||||
splits = (uint32_t *)(&fplist[fpos[1]]);
|
|
||||||
#if BYTE_ORDER == BIG_ENDIAN
|
|
||||||
splits[0]++;
|
|
||||||
splits[1] += cur_pos_checksum & ctx->fp_mask;
|
|
||||||
#else
|
|
||||||
splits[1]++;
|
|
||||||
splits[0] += cur_pos_checksum & ctx->fp_mask;
|
|
||||||
#endif
|
|
||||||
charcounts[cur_byte]++;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Perform the following statement without branching:
|
|
||||||
* if (fplist[fpos[1]] > fplist[fpos[0]]) fpos[0] = fpos[1];
|
|
||||||
*/
|
|
||||||
fpos[0] = fpos[(fplist[fpos[1]] > fplist[fpos[0]])];
|
|
||||||
if (len1 == SKETCH_BASIC_BLOCK_SZ && ctx->delta_flag) {
|
|
||||||
uint32_t p1, p2, p3;
|
|
||||||
/*
|
|
||||||
* Compute the super sketch value by summing all the representative
|
|
||||||
* fingerprints of the block.
|
|
||||||
*/
|
|
||||||
cur_sketch += fplist[fpos[0]];
|
|
||||||
memset(fplist, 0, fplist_sz);
|
|
||||||
fpos[0] = 0;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Find out the top 2 occurring byte values and compute
|
|
||||||
* a secondary sketch from them.
|
|
||||||
*/
|
|
||||||
p1 = 0;
|
|
||||||
p2 = 0;
|
|
||||||
p3 = 0;
|
|
||||||
for (len1=0; len1<256; len1++) {
|
|
||||||
if (charcounts[len1] > p1) {
|
|
||||||
p3 = p2;
|
|
||||||
p2 = p1;
|
|
||||||
p1 = len1;
|
|
||||||
}
|
|
||||||
charcounts[len1] = 0;
|
|
||||||
}
|
|
||||||
cur_sketch2 += ((p1 << 16) | (p2 << 8) | p3);
|
|
||||||
len1 = 0;
|
|
||||||
j++;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
|
* Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
|
||||||
|
@ -463,23 +400,26 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
ctx->blocks[blknum]->similar = 0;
|
ctx->blocks[blknum]->similar = 0;
|
||||||
ctx->blocks[blknum]->crc = XXH_strong32(buf1+last_offset, length, 0);
|
ctx->blocks[blknum]->crc = XXH_strong32(buf1+last_offset, length, 0);
|
||||||
|
|
||||||
// Accumulate the 2 sketch values into a combined similarity checksum
|
|
||||||
if (ctx->delta_flag) {
|
if (ctx->delta_flag) {
|
||||||
ctx->blocks[blknum]->cksum_n_offset = (cur_sketch + cur_sketch2) / 2;
|
/*
|
||||||
ctx->blocks[blknum]->mean_n_length = cur_sketch / j;
|
* Reset the heap structure and find the K min values. We use a
|
||||||
|
* min heap mechanism taken from the heap based priority queue
|
||||||
|
* implementation in Python.
|
||||||
|
* Here K = 40%. We are aiming to detect 40% similarity on average.
|
||||||
|
*/
|
||||||
|
reset_heap(&heap, FORTY_PCNT(j));
|
||||||
|
ksmallest(fplist, j, &heap);
|
||||||
|
cur_sketch = XXH_fast32((const uchar_t *)fplist, FORTY_PCNT(j)*4, 0);
|
||||||
memset(fplist, 0, fplist_sz);
|
memset(fplist, 0, fplist_sz);
|
||||||
} else {
|
} else {
|
||||||
ctx->blocks[blknum]->cksum_n_offset = 0;
|
cur_sketch = ctx->blocks[blknum]->crc;
|
||||||
ctx->blocks[blknum]->mean_n_length = 0;
|
|
||||||
}
|
}
|
||||||
fpos[0] = 0;
|
ctx->blocks[blknum]->cksum_n_offset = cur_sketch;
|
||||||
len1 = 0;
|
|
||||||
cur_sketch = 0;
|
cur_sketch = 0;
|
||||||
blknum++;
|
blknum++;
|
||||||
last_offset = i+1;
|
last_offset = i+1;
|
||||||
length = 0;
|
length = 0;
|
||||||
j = 0;
|
j = 0;
|
||||||
cur_sketch2 = 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -500,22 +440,31 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
// Insert the last left-over trailing bytes, if any, into a block.
|
// Insert the last left-over trailing bytes, if any, into a block.
|
||||||
if (last_offset < *size) {
|
if (last_offset < *size) {
|
||||||
if (ctx->blocks[blknum] == 0)
|
if (ctx->blocks[blknum] == 0)
|
||||||
ctx->blocks[blknum] = (rabin_blockentry_t *)slab_alloc(NULL, sizeof (rabin_blockentry_t));
|
ctx->blocks[blknum] = (rabin_blockentry_t *)slab_alloc(NULL,
|
||||||
|
sizeof (rabin_blockentry_t));
|
||||||
ctx->blocks[blknum]->offset = last_offset;
|
ctx->blocks[blknum]->offset = last_offset;
|
||||||
ctx->blocks[blknum]->index = blknum;
|
ctx->blocks[blknum]->index = blknum;
|
||||||
ctx->blocks[blknum]->length = *size - last_offset;
|
ctx->blocks[blknum]->length = *size - last_offset;
|
||||||
ctx->blocks[blknum]->ref = 0;
|
ctx->blocks[blknum]->ref = 0;
|
||||||
ctx->blocks[blknum]->similar = 0;
|
ctx->blocks[blknum]->similar = 0;
|
||||||
|
ctx->blocks[blknum]->crc = XXH_strong32(buf1+last_offset, ctx->blocks[blknum]->length, 0);
|
||||||
|
|
||||||
if (ctx->delta_flag) {
|
if (ctx->delta_flag) {
|
||||||
j = (j > 0 ? j:1);
|
j = (j > 0 ? j:1);
|
||||||
ctx->blocks[blknum]->cksum_n_offset = (cur_sketch + cur_sketch2) / 2;
|
if (j > 1) {
|
||||||
ctx->blocks[blknum]->mean_n_length = cur_sketch / j;
|
reset_heap(&heap, FORTY_PCNT(j));
|
||||||
|
ksmallest(fplist, j, &heap);
|
||||||
|
cur_sketch =
|
||||||
|
XXH_fast32((const uchar_t *)fplist, FORTY_PCNT(j)*4, 0);
|
||||||
|
} else {
|
||||||
|
cur_sketch =
|
||||||
|
XXH_fast32((const uchar_t *)fplist, (j*4)/2, 0);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
ctx->blocks[blknum]->cksum_n_offset = 0;
|
cur_sketch = ctx->blocks[blknum]->crc;
|
||||||
ctx->blocks[blknum]->mean_n_length = 0;
|
|
||||||
}
|
}
|
||||||
ctx->blocks[blknum]->crc = XXH_strong32(buf1+last_offset, ctx->blocks[blknum]->length, 0);
|
|
||||||
|
ctx->blocks[blknum]->cksum_n_offset = cur_sketch;
|
||||||
blknum++;
|
blknum++;
|
||||||
last_offset = *size;
|
last_offset = *size;
|
||||||
}
|
}
|
||||||
|
|
193
utils/heapq.c
Normal file
193
utils/heapq.c
Normal file
|
@ -0,0 +1,193 @@
|
||||||
|
/*
|
||||||
|
* Functions for a rudimentary fast min-heap implementation.
|
||||||
|
* Derived from Python's _heapqmodule.c by way of drastic simplification
|
||||||
|
* and a few optimizations.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Original Python _heapqmodule.c implementation was derived directly
|
||||||
|
* from heapq.py in Py2.3 which was written by Kevin O'Connor, augmented
|
||||||
|
* by Tim Peters, annotated by François Pinard, and converted to C by
|
||||||
|
* Raymond Hettinger.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <limits.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <heapq.h>
|
||||||
|
|
||||||
|
#ifndef NDEBUG
|
||||||
|
#define ERROR_CHK
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void
|
||||||
|
reset_heap(heap_t *heap, __TYPE tot)
|
||||||
|
{
|
||||||
|
if (heap) {
|
||||||
|
heap->len = 0;
|
||||||
|
heap->tot = tot;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
_siftdownmax(heap_t *h, __TYPE startpos, __TYPE pos)
|
||||||
|
{
|
||||||
|
__TYPE newitem, parent;
|
||||||
|
__TYPE parentpos, *heap;
|
||||||
|
|
||||||
|
#ifdef ERROR_CHK
|
||||||
|
if (pos >= h->len) {
|
||||||
|
fprintf(stderr, "_siftdownmax: index out of range\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
heap = h->ary;
|
||||||
|
newitem = heap[pos];
|
||||||
|
/* Follow the path to the root, moving parents down until finding
|
||||||
|
a place newitem fits. */
|
||||||
|
while (pos > startpos){
|
||||||
|
parentpos = (pos - 1) >> 1;
|
||||||
|
parent = heap[parentpos];
|
||||||
|
if (parent < newitem)
|
||||||
|
break;
|
||||||
|
heap[pos] = parent;
|
||||||
|
pos = parentpos;
|
||||||
|
}
|
||||||
|
heap[pos] = newitem;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
_siftupmax(heap_t *h, __TYPE spos, __TYPE epos)
|
||||||
|
{
|
||||||
|
__TYPE endpos, childpos, rightpos;
|
||||||
|
__TYPE newitem, *heap, pos;
|
||||||
|
|
||||||
|
endpos = h->len;
|
||||||
|
heap = h->ary;
|
||||||
|
#ifdef ERROR_CHK
|
||||||
|
if (pos >= endpos) {
|
||||||
|
fprintf(stderr, "_siftupmax: index out of range: %u, len: %u\n", pos, endpos);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
do {
|
||||||
|
pos = spos;
|
||||||
|
/* Bubble up the smaller child until hitting a leaf. */
|
||||||
|
newitem = heap[pos];
|
||||||
|
childpos = (pos << 1) + 1; /* leftmost child position */
|
||||||
|
while (childpos < endpos) {
|
||||||
|
/* Set childpos to index of smaller child. */
|
||||||
|
rightpos = childpos + 1;
|
||||||
|
if (rightpos < endpos) {
|
||||||
|
if (heap[rightpos] < heap[childpos])
|
||||||
|
childpos = rightpos;
|
||||||
|
}
|
||||||
|
/* Move the smaller child up. */
|
||||||
|
heap[pos] = heap[childpos];
|
||||||
|
pos = childpos;
|
||||||
|
childpos = (pos << 1) + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The leaf at pos is empty now. Put newitem there, and and bubble
|
||||||
|
it up to its final resting place (by sifting its parents down). */
|
||||||
|
heap[pos] = newitem;
|
||||||
|
#ifdef ERROR_CHK
|
||||||
|
if (_siftdownmax(h, spos, pos) == -1)
|
||||||
|
return (-1);
|
||||||
|
#else
|
||||||
|
_siftdownmax(h, spos, pos);
|
||||||
|
#endif
|
||||||
|
spos--;
|
||||||
|
} while (spos >= epos);
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
_siftupmax_s(heap_t *h, __TYPE spos)
|
||||||
|
{
|
||||||
|
__TYPE endpos, childpos, rightpos;
|
||||||
|
__TYPE newitem, *heap, pos;
|
||||||
|
|
||||||
|
endpos = h->len;
|
||||||
|
heap = h->ary;
|
||||||
|
#ifdef ERROR_CHK
|
||||||
|
if (pos >= endpos) {
|
||||||
|
fprintf(stderr, "_siftupmax: index out of range: %u, len: %u\n", pos, endpos);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
pos = spos;
|
||||||
|
/* Bubble up the smaller child until hitting a leaf. */
|
||||||
|
newitem = heap[pos];
|
||||||
|
childpos = (pos << 1) + 1; /* leftmost child position */
|
||||||
|
while (childpos < endpos) {
|
||||||
|
/* Set childpos to index of smaller child. */
|
||||||
|
rightpos = childpos + 1;
|
||||||
|
if (rightpos < endpos) {
|
||||||
|
if (! (heap[rightpos] < heap[childpos]))
|
||||||
|
childpos = rightpos;
|
||||||
|
}
|
||||||
|
/* Move the smaller child up. */
|
||||||
|
heap[pos] = heap[childpos];
|
||||||
|
pos = childpos;
|
||||||
|
childpos = (pos << 1) + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The leaf at pos is empty now. Put newitem there, and and bubble
|
||||||
|
it up to its final resting place (by sifting its parents down). */
|
||||||
|
heap[pos] = newitem;
|
||||||
|
return (_siftdownmax(h, spos, pos));
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
ksmallest(__TYPE *ary, __TYPE len, heap_t *heap)
|
||||||
|
{
|
||||||
|
__TYPE elem, los;
|
||||||
|
__TYPE i, *hp, n;
|
||||||
|
|
||||||
|
#ifdef ERROR_CHK
|
||||||
|
if (len >= heap->tot) {
|
||||||
|
fprintf(stderr, "nsmallest: array size > heap size\n");
|
||||||
|
return (-1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
n = heap->tot;
|
||||||
|
heap->ary = ary;
|
||||||
|
hp = ary;
|
||||||
|
heap->len = n;
|
||||||
|
|
||||||
|
#ifdef ERROR_CHK
|
||||||
|
if(_siftupmax(heap, n/2-1, 0) == -1)
|
||||||
|
return (-1);
|
||||||
|
#else
|
||||||
|
_siftupmax(heap, n/2-1, 0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
los = hp[0];
|
||||||
|
for (i = n; i < len; i++) {
|
||||||
|
elem = ary[i];
|
||||||
|
if (elem >= los) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
hp[0] = elem;
|
||||||
|
#ifdef ERROR_CHK
|
||||||
|
if (_siftupmax_s(heap, 0) == -1)
|
||||||
|
return (-1);
|
||||||
|
#else
|
||||||
|
_siftupmax_s(heap, 0);
|
||||||
|
#endif
|
||||||
|
los = hp[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
14
utils/heapq.h
Normal file
14
utils/heapq.h
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
#ifndef __HEAPQ_H_
|
||||||
|
|
||||||
|
#define __TYPE int32_t
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
__TYPE *ary;
|
||||||
|
__TYPE len;
|
||||||
|
__TYPE tot;
|
||||||
|
} heap_t;
|
||||||
|
|
||||||
|
extern int ksmallest(__TYPE *ary, __TYPE len, heap_t *heap);
|
||||||
|
extern void reset_heap(heap_t *h, __TYPE tot);
|
||||||
|
|
||||||
|
#endif
|
Loading…
Reference in a new issue