Fix polynomial table computation.
Change hashing and length bias to reduce hashtable bucket collisions. Add support for user-selectable 60% or 40% similarity for Delta Compression. Overall slight speedup.
parent 8386e72566
commit 3544a8c708
4 changed files with 64 additions and 33 deletions
@@ -80,8 +80,13 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
    pcompress -D -r ... - Do NOT split chunks at a rabin boundary. Default
                          is to split.
 
-Perform Delta Encoding in addition to Exact Dedup:
-   pcompress -E ... - This also implies '-D'.
+Perform Delta Encoding in addition to Identical Dedup:
+   pcompress -E ... - This also implies '-D'. This performs Delta Compression
+                      between 2 blocks if they are at least 60% similar.
+   pcompress -EE .. - This causes Delta Compression to happen if 2 blocks are
+                      at least 40% similar. This can effect a greater final
+                      compression ratio at the cost of higher processing
+                      overhead.
 
 Number of threads can optionally be specified: -t <1 - 256 count>
 Other flags:
main.c (12 changed lines)
@@ -135,9 +135,10 @@ usage(void)
 	"	%s -p ...\n"
 	"4) Attempt Rabin fingerprinting based deduplication on chunks:\n"
 	"	%s -D ...\n"
-	"	%s -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split.\n"
-	"5) Perform Delta Encoding in addition to Exact Dedup:\n"
-	"	%s -E ... - This also implies '-D'.\n"
+	"	%s -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split.\n\n"
+	"5) Perform Delta Encoding in addition to Identical Dedup:\n"
+	"	%s -E ... - This also implies '-D'. This checks for at least 60%% similarity.\n"
+	"	   The flag can be repeated as in '-EE' to indicate at least 40%% similarity.\n\n"
 	"6) Number of threads can optionally be specified: -t <1 - 256 count>\n"
 	"7) Other flags:\n"
 	"	'-L'	- Enable LZP pre-compression. This improves compression ratio of all\n"
@@ -1594,7 +1595,10 @@ main(int argc, char *argv[])
 
 		case 'E':
 			enable_rabin_scan = 1;
-			enable_delta_encode = 1;
+			if (!enable_delta_encode)
+				enable_delta_encode = DELTA_NORMAL;
+			else
+				enable_delta_encode = DELTA_EXTRA;
 			break;
 
 		case 'F':
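The 'E' case now escalates the delta mode when the flag repeats. A minimal standalone sketch of the same escalation logic — only the DELTA_* values come from this commit; the getopt(3) scaffolding is omitted and simulated here:

    #include <stdio.h>

    /* Values mirror the new defines added to the header in this commit. */
    #define DELTA_NORMAL 1	/* check for at least 60% similarity */
    #define DELTA_EXTRA  2	/* check for at least 40% similarity */

    int main(void)
    {
    	int times;

    	/* Simulate the 'E' option case firing once ('-E') or twice ('-EE'). */
    	for (times = 1; times <= 2; times++) {
    		int enable_delta_encode = 0;
    		int i;

    		for (i = 0; i < times; i++) {
    			if (!enable_delta_encode)
    				enable_delta_encode = DELTA_NORMAL;
    			else
    				enable_delta_encode = DELTA_EXTRA;
    		}
    		printf("-%.*s => enable_delta_encode = %d\n",
    		    times, "EE", enable_delta_encode);
    	}
    	return 0;
    }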
@@ -68,6 +68,7 @@
 #include "rabin_dedup.h"
 
 #define FORTY_PCNT(x) (((x)/5 << 1))
+#define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3))
 
 extern int lzma_init(void **data, int *level, ssize_t chunksize);
 extern int lzma_compress(void *src, size_t srclen, void *dst,
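SIXTY_PCNT is a shift-only approximation: (x >> 1) + (x >> 3) = x/2 + x/8 = 0.625x, slightly above 60%, while FORTY_PCNT(x) = (x/5)*2 is exactly 40% for multiples of 5 (rounding down otherwise). A quick self-contained check of both macros, assuming nothing beyond the two defines:

    #include <stdio.h>

    #define FORTY_PCNT(x) (((x)/5 << 1))
    #define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3))

    int main(void)
    {
    	int x;

    	/* e.g. SIXTY_PCNT(40) = 20 + 5 = 25 (62.5%); FORTY_PCNT(40) = 16 (40%). */
    	for (x = 10; x <= 40; x += 10)
    		printf("x=%2d  ~60%%=%2d  40%%=%2d\n", x, SIXTY_PCNT(x), FORTY_PCNT(x));
    	return 0;
    }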
@@ -135,7 +136,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
 	val = 0;
 	for (i=0; i<RAB_POLYNOMIAL_WIN_SIZE; i++) {
 		if (term & FP_POLY) {
-			val = (val << 1) + j;
+			val += term * j;
 		}
 		term <<= 1;
 	}
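The corrected table is what lets the rolling Rabin hash drop the outgoing byte in O(1): each entry holds a byte's contribution at the oldest window position. A generic Rabin-Karp-style sketch of that idea follows; the window size, multiplier, and names are illustrative, not pcompress's actual values:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define WIN_SIZE   16		/* illustrative window size */
    #define POLY_CONST 153191	/* illustrative multiplier */

    int main(void)
    {
    	uint64_t out_table[256], pow_w = 1, hash = 0;
    	const char *data = "an example buffer to roll a fingerprint window across";
    	size_t i, n = strlen(data);

    	/*
    	 * pow_w = POLY_CONST^(WIN_SIZE-1). out_table[b] = b * pow_w is the
    	 * contribution of byte b at the oldest window position, so sliding
    	 * the window forward removes it with a single subtraction.
    	 */
    	for (i = 1; i < WIN_SIZE; i++)
    		pow_w *= POLY_CONST;	/* wraps mod 2^64, which is fine here */
    	for (i = 0; i < 256; i++)
    		out_table[i] = (uint64_t)i * pow_w;

    	for (i = 0; i < n; i++) {
    		if (i >= WIN_SIZE)	/* drop the byte sliding out */
    			hash -= out_table[(unsigned char)data[i - WIN_SIZE]];
    		hash = hash * POLY_CONST + (unsigned char)data[i];
    		if (i == WIN_SIZE - 1 || i == n - 1)
    			printf("pos %zu hash %016llx\n", i, (unsigned long long)hash);
    	}
    	return 0;
    }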
@@ -297,7 +298,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
 		ctx->blocks[i]->index = i; // Need to store for sorting
 		ctx->blocks[i]->length = length;
 		ctx->blocks[i]->similar = 0;
-		ctx->blocks[i]->hash = XXH_strong32(buf1+last_offset, length, 0);
+		ctx->blocks[i]->hash = XXH_fast32(buf1+last_offset, length, 0);
 		ctx->blocks[i]->similarity_hash = ctx->blocks[i]->hash;
 		last_offset += length;
 	}
@@ -355,7 +356,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
 	j = 0;
 
 	for (i=offset; i<*size; i++) {
-		uint32_t *splits;
+		ssize_t pc[3];
 		uchar_t cur_byte = buf1[i];
 		uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
 		ctx->current_window_data[ctx->window_pos] = cur_byte;
@@ -403,21 +404,22 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
 			ctx->blocks[blknum]->offset = last_offset;
 			ctx->blocks[blknum]->index = blknum; // Need to store for sorting
 			ctx->blocks[blknum]->length = length;
-			ctx->blocks[blknum]->other = 0;
-			ctx->blocks[blknum]->next = 0;
-			ctx->blocks[blknum]->similar = 0;
 
+			/*
+			 * Reset the heap structure and find the K min values if Delta Compression
+			 * is enabled. We use a min heap mechanism taken from the heap based priority
+			 * queue implementation in Python.
+			 * Here K = 60% or 40%. We are aiming to detect either 60% (default) or 40%
+			 * similarity on average.
+			 */
 			if (ctx->delta_flag) {
-				/*
-				 * Reset the heap structure and find the K min values. We use a
-				 * min heap mechanism taken from the heap based priority queue
-				 * implementation in Python.
-				 * Here K = 40%. We are aiming to detect 40% similarity on average.
-				 */
-				reset_heap(&heap, FORTY_PCNT(j));
+				pc[1] = SIXTY_PCNT(j);
+				pc[2] = FORTY_PCNT(j);
+
+				reset_heap(&heap, pc[ctx->delta_flag]);
 				ksmallest(fplist, j, &heap);
 				ctx->blocks[blknum]->similarity_hash =
-				    XXH_fast32((const uchar_t *)fplist, FORTY_PCNT(j)*4, 0);
+				    XXH_fast32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0);
 				memset(fplist, 0, ary_sz);
 			}
 			blknum++;
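Hashing the K smallest of a block's j fingerprints is a min-wise sketch: two blocks whose K smallest fingerprints coincide get identical similarity_hash values, so roughly K/j similarity lands them in the same bucket. A simplified sketch of the idea, substituting a plain qsort(3) for the heap-based reset_heap()/ksmallest() pair and a toy mix for XXH_fast32():

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int cmp_u32(const void *a, const void *b)
    {
    	uint32_t x = *(const uint32_t *)a, y = *(const uint32_t *)b;
    	return (x > y) - (x < y);
    }

    /* Sketch over the k smallest of n fingerprints. */
    static uint64_t similarity_sketch(uint32_t *fplist, int n, int k)
    {
    	uint64_t h = 0;
    	int i;

    	qsort(fplist, n, sizeof (fplist[0]), cmp_u32);
    	for (i = 0; i < k && i < n; i++)	/* toy mix, not xxHash */
    		h = h * 31 + fplist[i];
    	return h;
    }

    int main(void)
    {
    	/* Two blocks whose 6 smallest fingerprints coincide. */
    	uint32_t a[] = { 90, 12, 55, 7, 33, 61, 28, 44, 19, 73 };
    	uint32_t b[] = { 12, 7, 99, 33, 28, 19, 80, 44, 55, 66 };
    	int n = 10, k = 6;	/* k = SIXTY_PCNT(10) = 6 */

    	printf("sketch(a)=%llu\nsketch(b)=%llu\n",
    	    (unsigned long long)similarity_sketch(a, n, k),
    	    (unsigned long long)similarity_sketch(b, n, k));
    	return 0;
    }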
@@ -435,20 +437,20 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
 		ctx->blocks[blknum]->offset = last_offset;
 		ctx->blocks[blknum]->index = blknum;
 		ctx->blocks[blknum]->length = *size - last_offset;
-		ctx->blocks[blknum]->other = 0;
-		ctx->blocks[blknum]->next = 0;
-		ctx->blocks[blknum]->similar = 0;
-		ctx->blocks[blknum]->hash = XXH_strong32(buf1+last_offset, ctx->blocks[blknum]->length, 0);
 
 		if (ctx->delta_flag) {
 			uint64_t cur_sketch;
-			j = (j > 0 ? j:1);
+			ssize_t pc[3];
 
 			if (j > 1) {
-				reset_heap(&heap, FORTY_PCNT(j));
+				pc[1] = SIXTY_PCNT(j);
+				pc[2] = FORTY_PCNT(j);
+				reset_heap(&heap, pc[ctx->delta_flag]);
 				ksmallest(fplist, j, &heap);
 				cur_sketch =
-				    XXH_fast32((const uchar_t *)fplist, FORTY_PCNT(j)*4, 0);
+				    XXH_fast32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0);
 			} else {
+				if (j == 0) j = 1;
 				cur_sketch =
 				    XXH_fast32((const uchar_t *)fplist, (j*4)/2, 0);
 			}
@@ -483,12 +485,12 @@ process_blocks:
 	 */
 	if (ctx->delta_flag) {
 		for (i=0; i<blknum; i++) {
-			ctx->blocks[i]->hash = XXH_strong32(buf1+ctx->blocks[i]->offset,
+			ctx->blocks[i]->hash = XXH_fast32(buf1+ctx->blocks[i]->offset,
 			    ctx->blocks[i]->length, 0);
 		}
 	} else {
 		for (i=0; i<blknum; i++) {
-			ctx->blocks[i]->hash = XXH_strong32(buf1+ctx->blocks[i]->offset,
+			ctx->blocks[i]->hash = XXH_fast32(buf1+ctx->blocks[i]->offset,
 			    ctx->blocks[i]->length, 0);
 			ctx->blocks[i]->similarity_hash = ctx->blocks[i]->hash;
 		}
@@ -507,15 +509,21 @@ process_blocks:
 		uint64_t ck;
 
 		/*
-		 * Add length to hash for fewer collisions. If Delta Compression is
+		 * Bias hash with length for fewer collisions. If Delta Compression is
 		 * not enabled then value of similarity_hash == hash.
 		 */
 		ck = ctx->blocks[i]->similarity_hash;
-		ck += ctx->blocks[i]->length;
+		ck += (ck / ctx->blocks[i]->length);
 		j = ck % blknum;
 
 		if (htab[j] == 0) {
-			htab[j] = ctx->blocks[i];
+			/*
+			 * Hash bucket empty. So add block into table.
+			 */
+			htab[j] = ctx->blocks[i];
+			ctx->blocks[i]->other = 0;
+			ctx->blocks[i]->next = 0;
+			ctx->blocks[i]->similar = 0;
 		} else {
 			be = htab[j];
 			length = 0;
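The new bias perturbs the bucket index by a hash-and-length dependent amount instead of adding the raw length, which spreads equal-hash blocks of different lengths across buckets. A toy illustration of the computation — the hash value, lengths, and bucket count are made up:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	uint64_t hash = 0x9e3779b97f4a7c15ULL;	/* made-up similarity hash */
    	uint64_t lengths[] = { 4096, 4100, 8192 };
    	uint64_t blknum = 97;			/* made-up bucket count */
    	int i;

    	for (i = 0; i < 3; i++) {
    		uint64_t ck = hash;

    		ck += ck / lengths[i];		/* new length bias from the diff */
    		printf("len=%llu bucket=%llu\n",
    		    (unsigned long long)lengths[i],
    		    (unsigned long long)(ck % blknum));
    	}
    	return 0;
    }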
@@ -562,8 +570,14 @@ process_blocks:
 					break;
 				}
 			}
-			// This is an unique block so add it to hashtable.
+			/*
+			 * No duplicate in table for this block. So add it to
+			 * the bucket chain.
+			 */
 			if (!length) {
+				ctx->blocks[i]->other = 0;
+				ctx->blocks[i]->next = 0;
+				ctx->blocks[i]->similar = 0;
 				be->next = ctx->blocks[i];
 				DEBUG_STAT_EN(hash_collisions++);
 			}
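Taken together, the empty-bucket path and the chain append under if (!length) implement a separately chained hashtable keyed by the biased hash. A minimal generic sketch of the same structure, omitting the duplicate-detection compare; the two-field struct is a stand-in for the real block entry, which per the diff also carries offset, length, index, and similarity fields:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct blockentry {
    	uint64_t hash;
    	struct blockentry *next;	/* bucket chain, as in the diff */
    } blockentry_t;

    int main(void)
    {
    	blockentry_t blocks[4] = {
    		{ 101, 0 }, { 202, 0 }, { 303, 0 }, { 404, 0 }
    	};
    	blockentry_t *htab[3] = { 0 };	/* tiny table to force collisions */
    	int i;

    	for (i = 0; i < 4; i++) {
    		uint64_t j = blocks[i].hash % 3;

    		if (htab[j] == 0) {
    			htab[j] = &blocks[i];	/* empty bucket */
    		} else {
    			blockentry_t *be = htab[j];

    			while (be->next)	/* walk to the chain tail */
    				be = be->next;
    			be->next = &blocks[i];	/* append, as be->next = ... */
    		}
    	}
    	for (i = 0; i < 3; i++) {
    		blockentry_t *be;

    		printf("bucket %d:", i);
    		for (be = htab[i]; be; be = be->next)
    			printf(" %llu", (unsigned long long)be->hash);
    		printf("\n");
    	}
    	return 0;
    }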
@@ -116,6 +116,14 @@
 #define SIMILAR_PARTIAL 2
 #define SIMILAR_REF 3
 
+/*
+ * Types of delta operations.
+ * DELTA_NORMAL = Check for at least 60% similarity
+ * DELTA_EXTRA = Check for at least 40% similarity
+ */
+#define DELTA_NORMAL 1
+#define DELTA_EXTRA 2
+
 /*
  * Irreducible polynomial for Rabin modulus. This value is from the
  * Low Bandwidth Filesystem.
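DELTA_NORMAL and DELTA_EXTRA double as indices into the pc[] threshold array in dedupe_compress(), which is why they are 1 and 2 rather than 0 and 1. A minimal sketch of that selection; j is a made-up fingerprint count:

    #include <stdio.h>
    #include <sys/types.h>

    #define FORTY_PCNT(x) (((x)/5 << 1))
    #define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3))
    #define DELTA_NORMAL 1
    #define DELTA_EXTRA  2

    int main(void)
    {
    	ssize_t pc[3], j = 20;		/* j: made-up fingerprint count */
    	int delta_flag = DELTA_EXTRA;	/* as set by '-EE' */

    	pc[DELTA_NORMAL] = SIXTY_PCNT(j);	/* 12 */
    	pc[DELTA_EXTRA] = FORTY_PCNT(j);	/* 8 */
    	printf("K = %zd of %zd fingerprints\n", pc[delta_flag], j);
    	return 0;
    }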