Add support for Fixed-Block deduplication.

More refactoring of symbol names.
This commit is contained in:
Moinak Ghosh 2012-09-16 11:12:58 +05:30
parent b9355a5dcc
commit e3befd9e16
6 changed files with 116 additions and 76 deletions

View file

@ -97,6 +97,12 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
the fastest in the group, especially on x86 platforms. BLAKE is faster the fastest in the group, especially on x86 platforms. BLAKE is faster
than SKEIN on a few platforms. than SKEIN on a few platforms.
SKEIN 512-256 is about 60% faster than SHA 512-256 on x64 platforms. SKEIN 512-256 is about 60% faster than SHA 512-256 on x64 platforms.
'-F' - Perform Fixed Block Deduplication. This is faster than fingerprinting
based content-aware deduplication in some cases. However, it is mostly
useful for disk dumps, especially virtual machine images. It generally
gives a lower dedupe ratio than content-aware dedupe (-D) and does not
support delta compression.
'-M' - Display memory allocator statistics '-M' - Display memory allocator statistics
'-C' - Display compression statistics '-C' - Display compression statistics

75
main.c
View file

@ -90,7 +90,7 @@ static int do_uncompress = 0;
static int cksum_bytes; static int cksum_bytes;
static int cksum = 0; static int cksum = 0;
static int rab_blk_size = 0; static int rab_blk_size = 0;
static rabin_context_t *rctx; static dedupe_context_t *rctx;
static void static void
usage(void) usage(void)
@ -145,6 +145,8 @@ usage(void)
" '-S' <cksum>\n" " '-S' <cksum>\n"
" - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512\n" " - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512\n"
" Default one is SKEIN256.\n" " Default one is SKEIN256.\n"
" '-F' - Perform Fixed-Block Deduplication. Faster than '-D' in some cases\n"
" but with lower deduplication ratio.\n"
" '-B' <1..5>\n" " '-B' <1..5>\n"
" - Specify a minimum Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n" " - Specify a minimum Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n"
" '-M' - Display memory allocator statistics\n" " '-M' - Display memory allocator statistics\n"
@ -299,11 +301,11 @@ redo:
_chunksize = ntohll(*((ssize_t *)rseg)); _chunksize = ntohll(*((ssize_t *)rseg));
} }
if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) { if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) {
uchar_t *cmpbuf, *ubuf; uchar_t *cmpbuf, *ubuf;
/* Extract various sizes from rabin header. */ /* Extract various sizes from rabin header. */
rabin_parse_hdr(cseg, &blknum, &dedupe_index_sz, &rabin_data_sz, parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &rabin_data_sz,
&dedupe_index_sz_cmp, &rabin_data_sz_cmp, &_chunksize); &dedupe_index_sz_cmp, &rabin_data_sz_cmp, &_chunksize);
memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE); memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE);
@ -363,14 +365,14 @@ redo:
goto cont; goto cont;
} }
/* Rebuild chunk from dedup blocks. */ /* Rebuild chunk from dedup blocks. */
if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) { if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) {
rabin_context_t *rctx; dedupe_context_t *rctx;
uchar_t *tmp; uchar_t *tmp;
rctx = tdat->rctx; rctx = tdat->rctx;
reset_rabin_context(tdat->rctx); reset_dedupe_context(tdat->rctx);
rctx->cbuf = tdat->compressed_chunk; rctx->cbuf = tdat->compressed_chunk;
rabin_inverse_dedup(rctx, tdat->uncompressed_chunk, &(tdat->len_cmp)); dedupe_decompress(rctx, tdat->uncompressed_chunk, &(tdat->len_cmp));
if (!rctx->valid) { if (!rctx->valid) {
fprintf(stderr, "ERROR: Chunk %d, dedup recovery failed.\n", tdat->id); fprintf(stderr, "ERROR: Chunk %d, dedup recovery failed.\n", tdat->id);
rv = -1; rv = -1;
@ -582,8 +584,8 @@ start_decompress(const char *filename, const char *to_filename)
UNCOMP_BAIL; UNCOMP_BAIL;
} }
} }
if (enable_rabin_scan) { if (enable_rabin_scan || enable_fixed_scan) {
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, rab_blk_size, tdat->rctx = create_dedupe_context(chunksize, compressed_chunksize, rab_blk_size,
algo, enable_delta_encode, enable_fixed_scan); algo, enable_delta_encode, enable_fixed_scan);
if (tdat->rctx == NULL) { if (tdat->rctx == NULL) {
UNCOMP_BAIL; UNCOMP_BAIL;
@ -659,7 +661,7 @@ start_decompress(const char *filename, const char *to_filename)
if (!tdat->compressed_chunk) { if (!tdat->compressed_chunk) {
tdat->compressed_chunk = (uchar_t *)slab_alloc(NULL, tdat->compressed_chunk = (uchar_t *)slab_alloc(NULL,
compressed_chunksize); compressed_chunksize);
if (enable_rabin_scan) if ((enable_rabin_scan || enable_fixed_scan))
tdat->uncompressed_chunk = (uchar_t *)slab_alloc(NULL, tdat->uncompressed_chunk = (uchar_t *)slab_alloc(NULL,
compressed_chunksize); compressed_chunksize);
else else
@ -735,8 +737,8 @@ uncomp_done:
slab_free(NULL, dary[i]->compressed_chunk); slab_free(NULL, dary[i]->compressed_chunk);
if (_deinit_func) if (_deinit_func)
_deinit_func(&(dary[i]->data)); _deinit_func(&(dary[i]->data));
if (enable_rabin_scan) { if ((enable_rabin_scan || enable_fixed_scan)) {
destroy_rabin_context(dary[i]->rctx); destroy_dedupe_context(dary[i]->rctx);
} }
slab_free(NULL, dary[i]); slab_free(NULL, dary[i]);
} }
@ -770,8 +772,8 @@ redo:
compressed_chunk = tdat->compressed_chunk + CHUNK_FLAG_SZ; compressed_chunk = tdat->compressed_chunk + CHUNK_FLAG_SZ;
rbytes = tdat->rbytes; rbytes = tdat->rbytes;
/* Perform Dedup if enabled. */ /* Perform Dedup if enabled. */
if (enable_rabin_scan) { if ((enable_rabin_scan || enable_fixed_scan)) {
rabin_context_t *rctx; dedupe_context_t *rctx;
/* /*
* Compute checksum of original uncompressed chunk. When doing dedup * Compute checksum of original uncompressed chunk. When doing dedup
@ -782,9 +784,9 @@ redo:
compute_checksum(tdat->checksum, cksum, tdat->cmp_seg, tdat->rbytes); compute_checksum(tdat->checksum, cksum, tdat->cmp_seg, tdat->rbytes);
rctx = tdat->rctx; rctx = tdat->rctx;
reset_rabin_context(tdat->rctx); reset_dedupe_context(tdat->rctx);
rctx->cbuf = tdat->uncompressed_chunk; rctx->cbuf = tdat->uncompressed_chunk;
dedupe_index_sz = rabin_dedup(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0, NULL); dedupe_index_sz = dedupe_compress(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0, NULL);
if (!rctx->valid) { if (!rctx->valid) {
memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes); memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes);
tdat->rbytes = rbytes; tdat->rbytes = rbytes;
@ -801,7 +803,7 @@ redo:
* The rabin index array values can pollute the compressor's dictionary thereby * The rabin index array values can pollute the compressor's dictionary thereby
* reducing compression effectiveness of the data chunk. So we separate them. * reducing compression effectiveness of the data chunk. So we separate them.
*/ */
if (enable_rabin_scan && tdat->rctx->valid) { if ((enable_rabin_scan || enable_fixed_scan) && tdat->rctx->valid) {
_chunksize = tdat->rbytes - dedupe_index_sz - RABIN_HDR_SIZE; _chunksize = tdat->rbytes - dedupe_index_sz - RABIN_HDR_SIZE;
index_size_cmp = dedupe_index_sz; index_size_cmp = dedupe_index_sz;
@ -837,7 +839,7 @@ redo:
memcpy(compressed_chunk + index_size_cmp, memcpy(compressed_chunk + index_size_cmp,
tdat->uncompressed_chunk + dedupe_index_sz, _chunksize); tdat->uncompressed_chunk + dedupe_index_sz, _chunksize);
/* Now update rabin header with the compressed sizes. */ /* Now update rabin header with the compressed sizes. */
rabin_update_hdr(compressed_chunk, index_size_cmp - RABIN_HDR_SIZE, update_dedupe_hdr(compressed_chunk, index_size_cmp - RABIN_HDR_SIZE,
_chunksize); _chunksize);
} else { } else {
/* If rabin index compression fails, we just drop down to plain /* If rabin index compression fails, we just drop down to plain
@ -869,7 +871,7 @@ plain_compress:
*/ */
tdat->len_cmp = _chunksize; tdat->len_cmp = _chunksize;
if (_chunksize >= rbytes || rv < 0) { if (_chunksize >= rbytes || rv < 0) {
if (!enable_rabin_scan || !tdat->rctx->valid) if (!(enable_rabin_scan || enable_fixed_scan) || !tdat->rctx->valid)
memcpy(compressed_chunk, tdat->uncompressed_chunk, tdat->rbytes); memcpy(compressed_chunk, tdat->uncompressed_chunk, tdat->rbytes);
type = UNCOMPRESSED; type = UNCOMPRESSED;
tdat->len_cmp = tdat->rbytes; tdat->len_cmp = tdat->rbytes;
@ -877,7 +879,7 @@ plain_compress:
type = COMPRESSED; type = COMPRESSED;
} }
if (enable_rabin_scan && tdat->rctx->valid) { if ((enable_rabin_scan || enable_fixed_scan) && tdat->rctx->valid) {
type |= CHUNK_FLAG_DEDUP; type |= CHUNK_FLAG_DEDUP;
} }
if (lzp_preprocess) { if (lzp_preprocess) {
@ -982,7 +984,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
struct cmp_data **dary = NULL, *tdat; struct cmp_data **dary = NULL, *tdat;
pthread_t writer_thr; pthread_t writer_thr;
uchar_t *cread_buf, *pos; uchar_t *cread_buf, *pos;
rabin_context_t *rctx; dedupe_context_t *rctx;
algo_props_t props; algo_props_t props;
/* /*
@ -1015,7 +1017,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
else else
flags |= FLAG_DEDUP_FIXED; flags |= FLAG_DEDUP_FIXED;
/* Additional scratch space for dedup arrays. */ /* Additional scratch space for dedup arrays. */
compressed_chunksize += (rabin_buf_extra(chunksize, 0, algo, compressed_chunksize += (dedupe_buf_extra(chunksize, 0, algo,
enable_delta_encode) - (compressed_chunksize - chunksize)); enable_delta_encode) - (compressed_chunksize - chunksize));
} }
@ -1107,7 +1109,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
fprintf(stderr, "\n"); fprintf(stderr, "\n");
dary = (struct cmp_data **)slab_calloc(NULL, nprocs, sizeof (struct cmp_data *)); dary = (struct cmp_data **)slab_calloc(NULL, nprocs, sizeof (struct cmp_data *));
if (enable_rabin_scan) if ((enable_rabin_scan || enable_fixed_scan))
cread_buf = (uchar_t *)slab_alloc(NULL, compressed_chunksize); cread_buf = (uchar_t *)slab_alloc(NULL, compressed_chunksize);
else else
cread_buf = (uchar_t *)slab_alloc(NULL, chunksize); cread_buf = (uchar_t *)slab_alloc(NULL, chunksize);
@ -1137,8 +1139,8 @@ start_compress(const char *filename, uint64_t chunksize, int level)
COMP_BAIL; COMP_BAIL;
} }
} }
if (enable_rabin_scan) { if (enable_rabin_scan || enable_fixed_scan) {
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, rab_blk_size, tdat->rctx = create_dedupe_context(chunksize, compressed_chunksize, rab_blk_size,
algo, enable_delta_encode, enable_fixed_scan); algo, enable_delta_encode, enable_fixed_scan);
if (tdat->rctx == NULL) { if (tdat->rctx == NULL) {
COMP_BAIL; COMP_BAIL;
@ -1204,7 +1206,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
* Read the first chunk into a spare buffer (a simple double-buffering). * Read the first chunk into a spare buffer (a simple double-buffering).
*/ */
if (enable_rabin_split) { if (enable_rabin_split) {
rctx = create_rabin_context(chunksize, 0, 0, algo, enable_delta_encode, rctx = create_dedupe_context(chunksize, 0, 0, algo, enable_delta_encode,
enable_fixed_scan); enable_fixed_scan);
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx); rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
} else { } else {
@ -1231,7 +1233,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
* Delayed allocation. Allocate chunks if not already done. * Delayed allocation. Allocate chunks if not already done.
*/ */
if (!tdat->cmp_seg) { if (!tdat->cmp_seg) {
if (enable_rabin_scan) { if ((enable_rabin_scan || enable_fixed_scan)) {
if (single_chunk) if (single_chunk)
tdat->cmp_seg = (uchar_t *)1; tdat->cmp_seg = (uchar_t *)1;
else else
@ -1266,7 +1268,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
*/ */
tdat->id = chunk_num; tdat->id = chunk_num;
tdat->rbytes = rbytes; tdat->rbytes = rbytes;
if (enable_rabin_scan) { if ((enable_rabin_scan || enable_fixed_scan)) {
tmp = tdat->cmp_seg; tmp = tdat->cmp_seg;
tdat->cmp_seg = cread_buf; tdat->cmp_seg = cread_buf;
cread_buf = tmp; cread_buf = tmp;
@ -1383,8 +1385,8 @@ comp_done:
slab_free(NULL, dary[i]->uncompressed_chunk); slab_free(NULL, dary[i]->uncompressed_chunk);
if (dary[i]->cmp_seg != (uchar_t *)1) if (dary[i]->cmp_seg != (uchar_t *)1)
slab_free(NULL, dary[i]->cmp_seg); slab_free(NULL, dary[i]->cmp_seg);
if (enable_rabin_scan) { if ((enable_rabin_scan || enable_fixed_scan)) {
destroy_rabin_context(dary[i]->rctx); destroy_dedupe_context(dary[i]->rctx);
} }
if (_deinit_func) if (_deinit_func)
_deinit_func(&(dary[i]->data)); _deinit_func(&(dary[i]->data));
@ -1392,7 +1394,7 @@ comp_done:
} }
slab_free(NULL, dary); slab_free(NULL, dary);
} }
if (enable_rabin_split) destroy_rabin_context(rctx); if (enable_rabin_split) destroy_dedupe_context(rctx);
if (cread_buf != (uchar_t *)1) if (cread_buf != (uchar_t *)1)
slab_free(NULL, cread_buf); slab_free(NULL, cread_buf);
if (!pipe_mode) { if (!pipe_mode) {
@ -1530,7 +1532,7 @@ main(int argc, char *argv[])
level = 6; level = 6;
slab_init(); slab_init();
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDErLS:B:")) != -1) { while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDErLS:B:F")) != -1) {
int ovr; int ovr;
switch (opt) { switch (opt) {
@ -1597,8 +1599,9 @@ main(int argc, char *argv[])
enable_delta_encode = 1; enable_delta_encode = 1;
break; break;
case 'f': case 'F':
enable_fixed_scan = 1; enable_fixed_scan = 1;
enable_rabin_split = 0;
break; break;
case 'L': case 'L':
@ -1638,15 +1641,15 @@ main(int argc, char *argv[])
exit(1); exit(1);
} }
if (enable_rabin_scan && !do_compress) { if ((enable_rabin_scan || enable_fixed_scan) && !do_compress) {
fprintf(stderr, "Rabin Deduplication is only used during compression.\n"); fprintf(stderr, "Deduplication is only used during compression.\n");
usage(); usage();
exit(1); exit(1);
} }
if (!enable_rabin_scan) if (!enable_rabin_scan)
enable_rabin_split = 0; enable_rabin_split = 0;
if (enable_fixed_scan && (enable_rabin_scan || enable_delta_encode)) { if (enable_fixed_scan && (enable_rabin_scan || enable_delta_encode || enable_rabin_split)) {
fprintf(stderr, "Rabin Deduplication and Fixed block Deduplication are mutually exclusive\n"); fprintf(stderr, "Rabin Deduplication and Fixed block Deduplication are mutually exclusive\n");
exit(1); exit(1);
} }

View file

@ -157,7 +157,7 @@ struct cmp_data {
uchar_t *cmp_seg; uchar_t *cmp_seg;
uchar_t *compressed_chunk; uchar_t *compressed_chunk;
uchar_t *uncompressed_chunk; uchar_t *uncompressed_chunk;
rabin_context_t *rctx; dedupe_context_t *rctx;
ssize_t rbytes; ssize_t rbytes;
ssize_t chunksize; ssize_t chunksize;
ssize_t len_cmp; ssize_t len_cmp;

View file

@ -86,7 +86,7 @@ uint64_t ir[256];
static int inited = 0; static int inited = 0;
static uint32_t static uint32_t
rabin_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag) dedupe_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
{ {
uint32_t min_blk; uint32_t min_blk;
@ -95,22 +95,22 @@ rabin_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_
} }
uint32_t uint32_t
rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag) dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
{ {
if (rab_blk_sz < 1 || rab_blk_sz > 5) if (rab_blk_sz < 1 || rab_blk_sz > 5)
rab_blk_sz = RAB_BLK_DEFAULT; rab_blk_sz = RAB_BLK_DEFAULT;
return ((chunksize / rabin_min_blksz(chunksize, rab_blk_sz, algo, delta_flag)) return ((chunksize / dedupe_min_blksz(chunksize, rab_blk_sz, algo, delta_flag))
* sizeof (uint32_t)); * sizeof (uint32_t));
} }
/* /*
* Initialize the algorithm with the default params. * Initialize the algorithm with the default params.
*/ */
rabin_context_t * dedupe_context_t *
create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz, create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz,
const char *algo, int delta_flag, int fixed_flag) { const char *algo, int delta_flag, int fixed_flag) {
rabin_context_t *ctx; dedupe_context_t *ctx;
unsigned char *current_window_data; unsigned char *current_window_data;
uint32_t i; uint32_t i;
@ -165,7 +165,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
* use 4K minimum Rabin block size. For everything else it is 2K based * use 4K minimum Rabin block size. For everything else it is 2K based
* on experimentation. * on experimentation.
*/ */
ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t)); ctx = (dedupe_context_t *)slab_alloc(NULL, sizeof (dedupe_context_t));
ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE; ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
ctx->fixed_flag = fixed_flag; ctx->fixed_flag = fixed_flag;
@ -173,7 +173,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
ctx->delta_flag = delta_flag; ctx->delta_flag = delta_flag;
ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS); ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1; ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1;
ctx->rabin_poly_min_block_size = rabin_min_blksz(chunksize, rab_blk_sz, algo, delta_flag); ctx->rabin_poly_min_block_size = dedupe_min_blksz(chunksize, rab_blk_sz, algo, delta_flag);
ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size; ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size;
if (!fixed_flag) if (!fixed_flag)
@ -186,7 +186,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
if (ctx->blknum > RABIN_MAX_BLOCKS) { if (ctx->blknum > RABIN_MAX_BLOCKS) {
fprintf(stderr, "Chunk size too large for dedup.\n"); fprintf(stderr, "Chunk size too large for dedup.\n");
destroy_rabin_context(ctx); destroy_dedupe_context(ctx);
return (NULL); return (NULL);
} }
current_window_data = slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE); current_window_data = slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE);
@ -198,7 +198,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
if(ctx == NULL || current_window_data == NULL || (ctx->blocks == NULL && real_chunksize > 0)) { if(ctx == NULL || current_window_data == NULL || (ctx->blocks == NULL && real_chunksize > 0)) {
fprintf(stderr, fprintf(stderr,
"Could not allocate rabin polynomial context, out of memory\n"); "Could not allocate rabin polynomial context, out of memory\n");
destroy_rabin_context(ctx); destroy_dedupe_context(ctx);
return (NULL); return (NULL);
} }
@ -209,7 +209,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
if (!(ctx->lzma_data)) { if (!(ctx->lzma_data)) {
fprintf(stderr, fprintf(stderr,
"Could not initialize LZMA data for dedupe index, out of memory\n"); "Could not initialize LZMA data for dedupe index, out of memory\n");
destroy_rabin_context(ctx); destroy_dedupe_context(ctx);
return (NULL); return (NULL);
} }
} }
@ -227,19 +227,19 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
slab_cache_add(sizeof (rabin_blockentry_t)); slab_cache_add(sizeof (rabin_blockentry_t));
ctx->current_window_data = current_window_data; ctx->current_window_data = current_window_data;
ctx->real_chunksize = real_chunksize; ctx->real_chunksize = real_chunksize;
reset_rabin_context(ctx); reset_dedupe_context(ctx);
return (ctx); return (ctx);
} }
void void
reset_rabin_context(rabin_context_t *ctx) reset_dedupe_context(dedupe_context_t *ctx)
{ {
memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE); memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
ctx->window_pos = 0; ctx->window_pos = 0;
} }
void void
destroy_rabin_context(rabin_context_t *ctx) destroy_dedupe_context(dedupe_context_t *ctx)
{ {
if (ctx) { if (ctx) {
uint32_t i; uint32_t i;
@ -288,11 +288,13 @@ cmpblks(const void *a, const void *b)
} }
/** /**
* Perform Deduplication based on Rabin Fingerprinting. A 31-byte window is used for * Perform Deduplication.
* the rolling checksum and dedup blocks vary in size from 4K-128K. * Both Semi-Rabin fingerprinting based and Fixed Block Deduplication are supported.
* A 16-byte window is used for the rolling checksum and dedup blocks can vary in size
* from 4K-128K.
*/ */
uint32_t uint32_t
rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos) dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos)
{ {
ssize_t i, last_offset, j, fplist_sz; ssize_t i, last_offset, j, fplist_sz;
uint32_t blknum; uint32_t blknum;
@ -302,6 +304,40 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
uint32_t *fplist; uint32_t *fplist;
heap_t heap; heap_t heap;
length = offset;
last_offset = 0;
blknum = 0;
ctx->valid = 0;
cur_roll_checksum = 0;
cur_sketch = 0;
/*
 * Fixed-block mode: skip the rolling-checksum scan entirely. The buffer is
 * cut into consecutive blocks of rabin_poly_avg_block_size bytes, each block
 * is hashed, and control jumps straight to the shared process_blocks code.
 */
if (ctx->fixed_flag) {
	blknum = *size / ctx->rabin_poly_avg_block_size;
	j = *size % ctx->rabin_poly_avg_block_size;
	if (j) {
		/* A trailing partial block of j bytes follows the full-sized ones. */
		blknum++;
	} else {
		/*
		 * *size is an exact multiple of the block size, so the final block
		 * is a full one. Without this the last block would be recorded
		 * with a length of 0.
		 */
		j = ctx->rabin_poly_avg_block_size;
	}

	last_offset = 0;
	length = ctx->rabin_poly_avg_block_size;
	for (i=0; i<blknum; i++) {
		/* Only the final block may differ in length from the others. */
		if (i == blknum-1) {
			length = j;
		}
		/* Block entries are allocated on first use and then re-used. */
		if (ctx->blocks[i] == 0) {
			ctx->blocks[i] = (rabin_blockentry_t *)slab_alloc(NULL,
			    sizeof (rabin_blockentry_t));
		}
		ctx->blocks[i]->offset = last_offset;
		ctx->blocks[i]->index = i; // Need to store for sorting
		ctx->blocks[i]->length = length;
		ctx->blocks[i]->ref = 0;
		ctx->blocks[i]->similar = 0;
		ctx->blocks[i]->crc = XXH_strong32(buf1+last_offset, length, 0);
		ctx->blocks[i]->cksum_n_offset = ctx->blocks[i]->crc;
		last_offset += length;
	}
	goto process_blocks;
}
if (rabin_pos == NULL) { if (rabin_pos == NULL) {
/* /*
* Initialize arrays for sketch computation. We re-use memory allocated * Initialize arrays for sketch computation. We re-use memory allocated
@ -312,12 +348,6 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
memset(fplist, 0, fplist_sz); memset(fplist, 0, fplist_sz);
reset_heap(&heap, fplist_sz/2); reset_heap(&heap, fplist_sz/2);
} }
length = offset;
last_offset = 0;
blknum = 0;
ctx->valid = 0;
cur_roll_checksum = 0;
cur_sketch = 0;
/* /*
* If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary * If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
@ -434,6 +464,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
} }
} }
process_blocks:
DEBUG_STAT_EN(printf("Original size: %lld, blknum: %u\n", *size, blknum)); DEBUG_STAT_EN(printf("Original size: %lld, blknum: %u\n", *size, blknum));
// If we found at least a few chunks, perform dedup. // If we found at least a few chunks, perform dedup.
if (blknum > 2) { if (blknum > 2) {
@ -701,7 +732,7 @@ cont:
} }
void void
rabin_update_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_sz_cmp) update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_sz_cmp)
{ {
ssize_t *entries; ssize_t *entries;
@ -712,7 +743,7 @@ rabin_update_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_s
} }
void void
rabin_parse_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz, parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
ssize_t *rabin_data_sz, ssize_t *dedupe_index_sz_cmp, ssize_t *rabin_data_sz, ssize_t *dedupe_index_sz_cmp,
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size) ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size)
{ {
@ -730,7 +761,7 @@ rabin_parse_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
} }
void void
rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size) dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size)
{ {
uint32_t blknum, blk, oblk, len; uint32_t blknum, blk, oblk, len;
uint32_t *dedupe_index; uint32_t *dedupe_index;
@ -738,7 +769,7 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
ssize_t dedupe_index_sz, pos1, i; ssize_t dedupe_index_sz, pos1, i;
uchar_t *pos2; uchar_t *pos2;
rabin_parse_hdr(buf, &blknum, &dedupe_index_sz, &data_sz, &indx_cmp, &data_sz_cmp, &deduped_sz); parse_dedupe_hdr(buf, &blknum, &dedupe_index_sz, &data_sz, &indx_cmp, &data_sz_cmp, &deduped_sz);
dedupe_index = (uint32_t *)(buf + RABIN_HDR_SIZE); dedupe_index = (uint32_t *)(buf + RABIN_HDR_SIZE);
pos1 = dedupe_index_sz + RABIN_HDR_SIZE; pos1 = dedupe_index_sz + RABIN_HDR_SIZE;
pos2 = ctx->cbuf; pos2 = ctx->cbuf;
@ -828,7 +859,7 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
* TODO: Consolidate rabin dedup and compression/decompression in functions here rather than * TODO: Consolidate rabin dedup and compression/decompression in functions here rather than
* messy code in main program. * messy code in main program.
int int
rabin_compress(rabin_context_t *ctx, uchar_t *from, ssize_t fromlen, uchar_t *to, ssize_t *tolen, rabin_compress(dedupe_context_t *ctx, uchar_t *from, ssize_t fromlen, uchar_t *to, ssize_t *tolen,
int level, char chdr, void *data, compress_func_ptr cmp) int level, char chdr, void *data, compress_func_ptr cmp)
{ {
} }

View file

@ -150,21 +150,21 @@ typedef struct {
short valid; short valid;
void *lzma_data; void *lzma_data;
int level, delta_flag, fixed_flag; int level, delta_flag, fixed_flag;
} rabin_context_t; } dedupe_context_t;
extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, extern dedupe_context_t *create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize,
int rab_blk_sz, const char *algo, int delta_flag, int fixed_flag); int rab_blk_sz, const char *algo, int delta_flag, int fixed_flag);
extern void destroy_rabin_context(rabin_context_t *ctx); extern void destroy_dedupe_context(dedupe_context_t *ctx);
extern unsigned int rabin_dedup(rabin_context_t *ctx, unsigned char *buf, extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf,
ssize_t *size, ssize_t offset, ssize_t *rabin_pos); ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
extern void rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size); extern void dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size);
extern void rabin_parse_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_index_sz, extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_index_sz,
ssize_t *rabin_data_sz, ssize_t *rabin_index_sz_cmp, ssize_t *rabin_data_sz, ssize_t *rabin_index_sz_cmp,
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size); ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size);
extern void rabin_update_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp, extern void update_dedupe_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp,
ssize_t rabin_data_sz_cmp); ssize_t rabin_data_sz_cmp);
extern void reset_rabin_context(rabin_context_t *ctx); extern void reset_dedupe_context(dedupe_context_t *ctx);
extern uint32_t rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
int delta_flag); int delta_flag);
#endif /* _RABIN_POLY_H_ */ #endif /* _RABIN_POLY_H_ */

View file

@ -223,7 +223,7 @@ Read_Adjusted(int fd, uchar_t *buf, size_t count, ssize_t *rabin_count, void *ct
{ {
char *buf2; char *buf2;
ssize_t rcount; ssize_t rcount;
rabin_context_t *rctx = (rabin_context_t *)ctx; dedupe_context_t *rctx = (dedupe_context_t *)ctx;
if (!ctx) return (Read(fd, buf, count)); if (!ctx) return (Read(fd, buf, count));
buf2 = buf; buf2 = buf;
@ -235,7 +235,7 @@ Read_Adjusted(int fd, uchar_t *buf, size_t count, ssize_t *rabin_count, void *ct
if (rcount > 0) { if (rcount > 0) {
rcount += *rabin_count; rcount += *rabin_count;
if (rcount == count) if (rcount == count)
rabin_dedup(rctx, buf, &rcount, 0, rabin_count); dedupe_compress(rctx, buf, &rcount, 0, rabin_count);
else else
*rabin_count = 0; *rabin_count = 0;
} else { } else {