Allow user-specified minimum Dedupe block size.

Compute the similarity sketch only if Delta Compression is enabled.
This commit is contained in:
Moinak Ghosh 2012-09-05 22:43:54 +05:30
parent 560fa85aab
commit e6f042aaf8
2 changed files with 41 additions and 17 deletions

15
main.c
View file

@ -88,6 +88,7 @@ static int do_compress = 0;
static int do_uncompress = 0; static int do_uncompress = 0;
static int cksum_bytes; static int cksum_bytes;
static int cksum = 0; static int cksum = 0;
static int rab_blk_size = 0;
static rabin_context_t *rctx; static rabin_context_t *rctx;
static void static void
@ -143,6 +144,8 @@ usage(void)
" '-S' <cksum>\n" " '-S' <cksum>\n"
" - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512\n" " - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512\n"
" Default one is SKEIN256.\n" " Default one is SKEIN256.\n"
" '-B' <1..5>\n"
" - Specify a minimum Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n"
" '-M' - Display memory allocator statistics\n" " '-M' - Display memory allocator statistics\n"
" '-C' - Display compression statistics\n\n", " '-C' - Display compression statistics\n\n",
UTILITY_VERSION, exec_name, exec_name, exec_name, exec_name, exec_name, exec_name); UTILITY_VERSION, exec_name, exec_name, exec_name, exec_name, exec_name, exec_name);
@ -578,7 +581,7 @@ start_decompress(const char *filename, const char *to_filename)
} }
} }
if (enable_rabin_scan) { if (enable_rabin_scan) {
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, 0, tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, rab_blk_size,
algo, enable_delta_encode); algo, enable_delta_encode);
if (tdat->rctx == NULL) { if (tdat->rctx == NULL) {
UNCOMP_BAIL; UNCOMP_BAIL;
@ -1130,7 +1133,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
} }
} }
if (enable_rabin_scan) { if (enable_rabin_scan) {
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, 0, tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, rab_blk_size,
algo, enable_delta_encode); algo, enable_delta_encode);
if (tdat->rctx == NULL) { if (tdat->rctx == NULL) {
COMP_BAIL; COMP_BAIL;
@ -1521,7 +1524,7 @@ main(int argc, char *argv[])
level = 6; level = 6;
slab_init(); slab_init();
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDErLS:")) != -1) { while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDErLS:B:")) != -1) {
int ovr; int ovr;
switch (opt) { switch (opt) {
@ -1555,6 +1558,12 @@ main(int argc, char *argv[])
err_exit(0, "Compression level should be in range 0 - 14\n"); err_exit(0, "Compression level should be in range 0 - 14\n");
break; break;
case 'B':
rab_blk_size = atoi(optarg);
if (rab_blk_size < 1 || rab_blk_size > 5)
err_exit(0, "Minimum Dedupe block size must be in range 1 (4k) - 5 (64k)\n");
break;
case 'p': case 'p':
pipe_mode = 1; pipe_mode = 1;
break; break;

View file

@ -88,6 +88,9 @@ rabin_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_
uint32_t min_blk; uint32_t min_blk;
min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS); min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
if (rab_blk_sz > 1)
return (min_blk);
if (((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) && if (((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) &&
chunksize <= LZMA_WINDOW_MAX) || delta_flag) { chunksize <= LZMA_WINDOW_MAX) || delta_flag) {
if (memcmp(algo, "lzfx", 4) == 0 || memcmp(algo, "lz4", 3) == 0 || if (memcmp(algo, "lzfx", 4) == 0 || memcmp(algo, "lz4", 3) == 0 ||
@ -220,6 +223,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
* x * polynomial_pow can be written as x << RAB_POLYNOMIAL_WIN_SIZE * x * polynomial_pow can be written as x << RAB_POLYNOMIAL_WIN_SIZE
*/ */
slab_cache_add(sizeof (rabin_blockentry_t));
ctx->current_window_data = current_window_data; ctx->current_window_data = current_window_data;
ctx->real_chunksize = real_chunksize; ctx->real_chunksize = real_chunksize;
reset_rabin_context(ctx); reset_rabin_context(ctx);
@ -408,7 +412,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
* if (fplist[fpos[1]] > fplist[fpos[0]]) fpos[0] = fpos[1]; * if (fplist[fpos[1]] > fplist[fpos[0]]) fpos[0] = fpos[1];
*/ */
fpos[0] = fpos[(fplist[fpos[1]] > fplist[fpos[0]])]; fpos[0] = fpos[(fplist[fpos[1]] > fplist[fpos[0]])];
if (len1 == SKETCH_BASIC_BLOCK_SZ) { if (len1 == SKETCH_BASIC_BLOCK_SZ && ctx->delta_flag) {
uint32_t p1, p2, p3; uint32_t p1, p2, p3;
/* /*
* Compute the super sketch value by summing all the representative * Compute the super sketch value by summing all the representative
@ -460,9 +464,14 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
ctx->blocks[blknum]->crc = XXH_strong32(buf1+last_offset, length, 0); ctx->blocks[blknum]->crc = XXH_strong32(buf1+last_offset, length, 0);
// Accumulate the 2 sketch values into a combined similarity checksum // Accumulate the 2 sketch values into a combined similarity checksum
if (ctx->delta_flag) {
ctx->blocks[blknum]->cksum_n_offset = (cur_sketch + cur_sketch2) / 2; ctx->blocks[blknum]->cksum_n_offset = (cur_sketch + cur_sketch2) / 2;
ctx->blocks[blknum]->mean_n_length = cur_sketch / j; ctx->blocks[blknum]->mean_n_length = cur_sketch / j;
memset(fplist, 0, fplist_sz); memset(fplist, 0, fplist_sz);
} else {
ctx->blocks[blknum]->cksum_n_offset = 0;
ctx->blocks[blknum]->mean_n_length = 0;
}
fpos[0] = 0; fpos[0] = 0;
len1 = 0; len1 = 0;
cur_sketch = 0; cur_sketch = 0;
@ -498,9 +507,14 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
ctx->blocks[blknum]->ref = 0; ctx->blocks[blknum]->ref = 0;
ctx->blocks[blknum]->similar = 0; ctx->blocks[blknum]->similar = 0;
if (ctx->delta_flag) {
j = (j > 0 ? j:1); j = (j > 0 ? j:1);
ctx->blocks[blknum]->cksum_n_offset = (cur_sketch + cur_sketch2) / 2; ctx->blocks[blknum]->cksum_n_offset = (cur_sketch + cur_sketch2) / 2;
ctx->blocks[blknum]->mean_n_length = cur_sketch / j; ctx->blocks[blknum]->mean_n_length = cur_sketch / j;
} else {
ctx->blocks[blknum]->cksum_n_offset = 0;
ctx->blocks[blknum]->mean_n_length = 0;
}
ctx->blocks[blknum]->crc = XXH_strong32(buf1+last_offset, ctx->blocks[blknum]->length, 0); ctx->blocks[blknum]->crc = XXH_strong32(buf1+last_offset, ctx->blocks[blknum]->length, 0);
blknum++; blknum++;
last_offset = *size; last_offset = *size;
@ -771,6 +785,7 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
sz = 0; sz = 0;
ctx->valid = 1; ctx->valid = 1;
slab_cache_add(sizeof (rabin_blockentry_t));
for (blk = 0; blk < blknum; blk++) { for (blk = 0; blk < blknum; blk++) {
if (ctx->blocks[blk] == 0) if (ctx->blocks[blk] == 0)
ctx->blocks[blk] = (rabin_blockentry_t *)slab_alloc(NULL, sizeof (rabin_blockentry_t)); ctx->blocks[blk] = (rabin_blockentry_t *)slab_alloc(NULL, sizeof (rabin_blockentry_t));