Fix polynomial table computation.

Change hashing and length bias to reduce hashtable bucket collisions.
Add support for user-selectable 60% or 40% similarity for Delta Compression.
Overall slight speedup.
Moinak Ghosh 2012-09-24 22:20:27 +05:30
parent 8386e72566
commit 3544a8c708
4 changed files with 64 additions and 33 deletions

README

@@ -80,8 +80,13 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
    pcompress -D -r ... - Do NOT split chunks at a rabin boundary. Default
                          is to split.
 
-Perform Delta Encoding in addition to Exact Dedup:
-   pcompress -E ... - This also implies '-D'.
+Perform Delta Encoding in addition to Identical Dedup:
+   pcompress -E ... - This also implies '-D'. This performs Delta Compression
+                      between 2 blocks if they are at least 60% similar.
+   pcompress -EE .. - This causes Delta Compression to happen if 2 blocks are
+                      at least 40% similar. This can effect greater final
+                      compression ratio at the cost of higher processing
+                      overhead.
 
 Number of threads can optionally be specified: -t <1 - 256 count>
 Other flags:
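To make the two thresholds concrete, here are hedged example invocations (other required options are elided as "...", and the file name is purely illustrative):

    pcompress -E ... file.tar         # delta-compress blocks at least 60% similar
    pcompress -EE -t 4 ... file.tar   # 40% threshold; better ratio, more CPU time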

main.c

@@ -135,9 +135,10 @@ usage(void)
 	    "	%s -p ...\n"
 	    "4) Attempt Rabin fingerprinting based deduplication on chunks:\n"
 	    "	%s -D ...\n"
-	    "	%s -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split.\n"
-	    "5) Perform Delta Encoding in addition to Exact Dedup:\n"
-	    "	%s -E ... - This also implies '-D'.\n"
+	    "	%s -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split.\n\n"
+	    "5) Perform Delta Encoding in addition to Identical Dedup:\n"
+	    "	%s -E ... - This also implies '-D'. This checks for at least 60%% similarity.\n"
+	    "	    The flag can be repeated as in '-EE' to indicate at least 40%% similarity.\n\n"
 	    "6) Number of threads can optionally be specified: -t <1 - 256 count>\n"
 	    "7) Other flags:\n"
 	    "	'-L' - Enable LZP pre-compression. This improves compression ratio of all\n"
@@ -1594,7 +1595,10 @@ main(int argc, char *argv[])
 		case 'E':
 			enable_rabin_scan = 1;
-			enable_delta_encode = 1;
+			if (!enable_delta_encode)
+				enable_delta_encode = DELTA_NORMAL;
+			else
+				enable_delta_encode = DELTA_EXTRA;
 			break;
 
 		case 'F':
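The case 'E' logic above escalates the mode each time the flag is seen, so '-E -E' and '-EE' behave the same. A standalone sketch of that escalation pattern (this is not the actual pcompress option parser):

    /* Standalone sketch of repeated-flag escalation; not the pcompress parser. */
    #include <stdio.h>

    #define DELTA_NORMAL 1  /* -E  : check for at least 60% similarity */
    #define DELTA_EXTRA  2  /* -EE : check for at least 40% similarity */

    int
    main(int argc, char *argv[])
    {
        int enable_delta_encode = 0;
        const char *p;
        int i;

        for (i = 1; i < argc; i++) {
            if (argv[i][0] != '-') continue;
            /* Every 'E' seen bumps the mode once, capping at DELTA_EXTRA. */
            for (p = argv[i] + 1; *p; p++) {
                if (*p != 'E') continue;
                if (!enable_delta_encode)
                    enable_delta_encode = DELTA_NORMAL;
                else
                    enable_delta_encode = DELTA_EXTRA;
            }
        }
        printf("delta mode: %d\n", enable_delta_encode);
        return (0);
    }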

rabin_dedup.c

@@ -68,6 +68,7 @@
 #include "rabin_dedup.h"
 
 #define	FORTY_PCNT(x)	(((x)/5 << 1))
+#define	SIXTY_PCNT(x)	(((x) >> 1) + ((x) >> 3))
 
 extern int lzma_init(void **data, int *level, ssize_t chunksize);
 extern int lzma_compress(void *src, size_t srclen, void *dst,
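The new macro stays division-free: (x >> 1) + (x >> 3) is x/2 + x/8 = 0.625x, a shift-only stand-in for 60%, just as ((x)/5 << 1) is 2*(x/5), roughly 0.4x. A quick standalone check of both:

    /* Standalone check of the percentage macros used above. */
    #include <stdio.h>

    #define FORTY_PCNT(x) (((x)/5 << 1))
    #define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3))

    int
    main(void)
    {
        int x;

        /* For x = 100: FORTY_PCNT -> 40, SIXTY_PCNT -> 62 (i.e. 0.625x). */
        for (x = 10; x <= 100; x += 30)
            printf("x=%3d  40%%=%3d  ~60%%=%3d\n", x, FORTY_PCNT(x), SIXTY_PCNT(x));
        return (0);
    }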
@@ -135,7 +136,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
 		val = 0;
 		for (i=0; i<RAB_POLYNOMIAL_WIN_SIZE; i++) {
 			if (term & FP_POLY) {
-				val = (val << 1) + j;
+				val += term * j;
 			}
 			term <<= 1;
 		}
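For context on what this table is for: in sliding-window Rabin fingerprinting, a per-byte-value table holds each byte's contribution after it has passed through the whole window, so the outgoing byte can be subtracted out in O(1) per step. A generic sketch of the idea (constants and names are hypothetical, not pcompress's actual ones):

    /* Generic sliding-window table precomputation; illustrative only. */
    #include <stdint.h>
    #include <stdio.h>

    #define WIN_SIZE 16
    #define POLY_MOD 0xffffffefULL   /* example modulus, not pcompress's */

    static uint64_t out_table[256];

    static void
    init_out_table(void)
    {
        int b, i;

        for (b = 0; b < 256; b++) {
            uint64_t v = (uint64_t)b;

            /* Shift byte b through WIN_SIZE - 1 positions, reducing mod
             * POLY_MOD each step so the value stays small enough to shift. */
            for (i = 0; i < WIN_SIZE - 1; i++)
                v = (v << 1) % POLY_MOD;
            out_table[b] = v;
        }
    }

    int
    main(void)
    {
        init_out_table();
        printf("out_table['a'] = 0x%llx\n", (unsigned long long)out_table['a']);
        return (0);
    }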
@@ -297,7 +298,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
 			ctx->blocks[i]->index = i; // Need to store for sorting
 			ctx->blocks[i]->length = length;
 			ctx->blocks[i]->similar = 0;
-			ctx->blocks[i]->hash = XXH_strong32(buf1+last_offset, length, 0);
+			ctx->blocks[i]->hash = XXH_fast32(buf1+last_offset, length, 0);
 			ctx->blocks[i]->similarity_hash = ctx->blocks[i]->hash;
 			last_offset += length;
 		}
@@ -355,7 +356,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
 
 	j = 0;
 	for (i=offset; i<*size; i++) {
-		uint32_t *splits;
+		ssize_t pc[3];
 		uchar_t cur_byte = buf1[i];
 		uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
 		ctx->current_window_data[ctx->window_pos] = cur_byte;
@@ -403,21 +404,22 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
 				ctx->blocks[blknum]->offset = last_offset;
 				ctx->blocks[blknum]->index = blknum; // Need to store for sorting
 				ctx->blocks[blknum]->length = length;
-				ctx->blocks[blknum]->other = 0;
-				ctx->blocks[blknum]->next = 0;
-				ctx->blocks[blknum]->similar = 0;
+
+				/*
+				 * Reset the heap structure and find the K min values if Delta Compression
+				 * is enabled. We use a min heap mechanism taken from the heap based priority
+				 * queue implementation in Python.
+				 * Here K = 60% or 40%. We are aiming to detect either 60% (default) or 40%
+				 * similarity on average.
+				 */
 				if (ctx->delta_flag) {
-					/*
-					 * Reset the heap structure and find the K min values. We use a
-					 * min heap mechanism taken from the heap based priority queue
-					 * implementation in Python.
-					 * Here K = 40%. We are aiming to detect 40% similarity on average.
-					 */
-					reset_heap(&heap, FORTY_PCNT(j));
+					pc[1] = SIXTY_PCNT(j);
+					pc[2] = FORTY_PCNT(j);
+					reset_heap(&heap, pc[ctx->delta_flag]);
 					ksmallest(fplist, j, &heap);
 					ctx->blocks[blknum]->similarity_hash =
-					    XXH_fast32((const uchar_t *)fplist, FORTY_PCNT(j)*4, 0);
+					    XXH_fast32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0);
 					memset(fplist, 0, ary_sz);
 				}
 				blknum++;
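The scheme above hashes only the K smallest fingerprints of a block, with K set to 60% or 40% of the fingerprint count, so two blocks that share most of their small fingerprints collide on similarity_hash. A compact standalone illustration, where qsort() stands in for the min-heap ksmallest() and a toy FNV-1a stands in for XXH_fast32():

    /* Standalone K-min similarity sketch; stand-ins, not pcompress internals. */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int
    cmp_u32(const void *a, const void *b)
    {
        uint32_t x = *(const uint32_t *)a, y = *(const uint32_t *)b;
        return ((x > y) - (x < y));
    }

    static uint32_t
    sketch(uint32_t *fplist, int j, int k)
    {
        uint32_t h = 2166136261u;
        int i;

        qsort(fplist, j, sizeof (uint32_t), cmp_u32);  /* heap replaced by sort */
        for (i = 0; i < k && i < j; i++)               /* FNV-1a over K minima */
            h = (h ^ fplist[i]) * 16777619u;
        return (h);
    }

    int
    main(void)
    {
        uint32_t a[8] = {90, 7, 55, 3, 42, 19, 71, 11};
        uint32_t b[8] = {90, 7, 55, 3, 42, 19, 88, 11};  /* one fp differs */

        /* K = FORTY_PCNT(8) = 2; both blocks share their two smallest
         * fingerprints (3 and 7), so the two sketches come out equal. */
        printf("a: %x  b: %x\n", sketch(a, 8, (8/5) << 1), sketch(b, 8, (8/5) << 1));
        return (0);
    }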
@@ -435,20 +437,20 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
 		ctx->blocks[blknum]->offset = last_offset;
 		ctx->blocks[blknum]->index = blknum;
 		ctx->blocks[blknum]->length = *size - last_offset;
-		ctx->blocks[blknum]->other = 0;
-		ctx->blocks[blknum]->next = 0;
-		ctx->blocks[blknum]->similar = 0;
-		ctx->blocks[blknum]->hash = XXH_strong32(buf1+last_offset, ctx->blocks[blknum]->length, 0);
 		if (ctx->delta_flag) {
 			uint64_t cur_sketch;
-			j = (j > 0 ? j:1);
+			ssize_t pc[3];
+
 			if (j > 1) {
-				reset_heap(&heap, FORTY_PCNT(j));
+				pc[1] = SIXTY_PCNT(j);
+				pc[2] = FORTY_PCNT(j);
+				reset_heap(&heap, pc[ctx->delta_flag]);
 				ksmallest(fplist, j, &heap);
 				cur_sketch =
-				    XXH_fast32((const uchar_t *)fplist, FORTY_PCNT(j)*4, 0);
+				    XXH_fast32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0);
 			} else {
+				if (j == 0) j = 1;
 				cur_sketch =
 				    XXH_fast32((const uchar_t *)fplist, (j*4)/2, 0);
 			}
@@ -483,12 +485,12 @@ process_blocks:
 	 */
 	if (ctx->delta_flag) {
 		for (i=0; i<blknum; i++) {
-			ctx->blocks[i]->hash = XXH_strong32(buf1+ctx->blocks[i]->offset,
+			ctx->blocks[i]->hash = XXH_fast32(buf1+ctx->blocks[i]->offset,
 			    ctx->blocks[i]->length, 0);
 		}
 	} else {
 		for (i=0; i<blknum; i++) {
-			ctx->blocks[i]->hash = XXH_strong32(buf1+ctx->blocks[i]->offset,
+			ctx->blocks[i]->hash = XXH_fast32(buf1+ctx->blocks[i]->offset,
 			    ctx->blocks[i]->length, 0);
 			ctx->blocks[i]->similarity_hash = ctx->blocks[i]->hash;
 		}
@@ -507,15 +509,21 @@ process_blocks:
 			uint64_t ck;
 
 			/*
-			 * Add length to hash for fewer collisions. If Delta Compression is
+			 * Bias hash with length for fewer collisions. If Delta Compression is
 			 * not enabled then value of similarity_hash == hash.
 			 */
 			ck = ctx->blocks[i]->similarity_hash;
-			ck += ctx->blocks[i]->length;
+			ck += (ck / ctx->blocks[i]->length);
 			j = ck % blknum;
 			if (htab[j] == 0) {
-				htab[j] = ctx->blocks[i];
+				/*
+				 * Hash bucket empty. So add block into table.
+				 */
+				htab[j] = ctx->blocks[i];
+				ctx->blocks[i]->other = 0;
+				ctx->blocks[i]->next = 0;
+				ctx->blocks[i]->similar = 0;
 			} else {
 				be = htab[j];
 				length = 0;
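Why the new bias helps: adding the raw length only shifts nearby hash values by nearby amounts, so blocks with similar hashes and similar lengths still cluster in neighboring buckets, while adding hash/length mixes the two quantities non-linearly. A toy comparison with made-up numbers:

    /* Toy comparison of the old and new bucket bias; numbers are made up. */
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t hash[2]   = {0x9e3779b1, 0x9e3779b2};  /* near-identical hashes */
        uint64_t length[2] = {4096, 4100};              /* near-identical lengths */
        uint64_t blknum = 1024;
        int i;

        /* The old scheme lands the two blocks a few buckets apart;
         * the new scheme spreads them across the table. */
        for (i = 0; i < 2; i++) {
            uint64_t old_ck = hash[i] + length[i];            /* old scheme */
            uint64_t new_ck = hash[i] + hash[i] / length[i];  /* new scheme */
            printf("old bucket %4llu   new bucket %4llu\n",
                (unsigned long long)(old_ck % blknum),
                (unsigned long long)(new_ck % blknum));
        }
        return (0);
    }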
@@ -562,8 +570,14 @@ process_blocks:
 						break;
 					}
 				}
-				// This is an unique block so add it to hashtable.
+				/*
+				 * No duplicate in table for this block. So add it to
+				 * the bucket chain.
+				 */
 				if (!length) {
+					ctx->blocks[i]->other = 0;
+					ctx->blocks[i]->next = 0;
+					ctx->blocks[i]->similar = 0;
 					be->next = ctx->blocks[i];
 					DEBUG_STAT_EN(hash_collisions++);
 				}

rabin_dedup.h

@@ -116,6 +116,14 @@
 #define	SIMILAR_PARTIAL	2
 #define	SIMILAR_REF	3
 
+/*
+ * Types of delta operations.
+ * DELTA_NORMAL = Check for at least 60% similarity
+ * DELTA_EXTRA  = Check for at least 40% similarity
+ */
+#define	DELTA_NORMAL	1
+#define	DELTA_EXTRA	2
+
 /*
  * Irreducible polynomial for Rabin modulus. This value is from the
  * Low Bandwidth Filesystem.
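One detail worth noting across these hunks: the DELTA_NORMAL and DELTA_EXTRA values are load-bearing, because the dedupe_compress() hunks above index the per-mode cutoff array with the flag directly:

    /* From the dedupe_compress() hunks above: delta_flag doubles as an index. */
    ssize_t pc[3];

    pc[1] = SIXTY_PCNT(j);   /* pc[DELTA_NORMAL] */
    pc[2] = FORTY_PCNT(j);   /* pc[DELTA_EXTRA]  */
    reset_heap(&heap, pc[ctx->delta_flag]);

So pc[0] is never used, and the flag value selects the similarity cutoff without a branch.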