Add support for Fixed-Block deduplication.
More refactoring of symbol names.
This commit is contained in:
parent
b9355a5dcc
commit
e3befd9e16
6 changed files with 116 additions and 76 deletions
|
@ -97,6 +97,12 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
|
||||||
the fastest in the group, especially on x86 platforms. BLAKE is faster
|
the fastest in the group, especially on x86 platforms. BLAKE is faster
|
||||||
than SKEIN on a few platforms.
|
than SKEIN on a few platforms.
|
||||||
SKEIN 512-256 is about 60% faster than SHA 512-256 on x64 platforms.
|
SKEIN 512-256 is about 60% faster than SHA 512-256 on x64 platforms.
|
||||||
|
|
||||||
|
'-F' - Perform Fixed Block Deduplication. This is faster than fingerprinting
|
||||||
|
based content-aware deduplication in some cases. However, this is mostly
|
||||||
|
useful for disk dumps, especially virtual machine images. This generally
|
||||||
|
gives a lower dedupe ratio than content-aware dedupe (-D) and does not
|
||||||
|
support delta compression.
|
||||||
'-M' - Display memory allocator statistics
|
'-M' - Display memory allocator statistics
|
||||||
'-C' - Display compression statistics
|
'-C' - Display compression statistics
|
||||||
|
|
||||||
|
|
75
main.c
75
main.c
|
@ -90,7 +90,7 @@ static int do_uncompress = 0;
|
||||||
static int cksum_bytes;
|
static int cksum_bytes;
|
||||||
static int cksum = 0;
|
static int cksum = 0;
|
||||||
static int rab_blk_size = 0;
|
static int rab_blk_size = 0;
|
||||||
static rabin_context_t *rctx;
|
static dedupe_context_t *rctx;
|
||||||
|
|
||||||
static void
|
static void
|
||||||
usage(void)
|
usage(void)
|
||||||
|
@ -145,6 +145,8 @@ usage(void)
|
||||||
" '-S' <cksum>\n"
|
" '-S' <cksum>\n"
|
||||||
" - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512\n"
|
" - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512\n"
|
||||||
" Default one is SKEIN256.\n"
|
" Default one is SKEIN256.\n"
|
||||||
|
" '-F' - Perform Fixed-Block Deduplication. Faster than '-D' in some cases\n"
|
||||||
|
" but with lower deduplication ratio.\n"
|
||||||
" '-B' <1..5>\n"
|
" '-B' <1..5>\n"
|
||||||
" - Specify a minimum Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n"
|
" - Specify a minimum Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n"
|
||||||
" '-M' - Display memory allocator statistics\n"
|
" '-M' - Display memory allocator statistics\n"
|
||||||
|
@ -299,11 +301,11 @@ redo:
|
||||||
_chunksize = ntohll(*((ssize_t *)rseg));
|
_chunksize = ntohll(*((ssize_t *)rseg));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) {
|
if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) {
|
||||||
uchar_t *cmpbuf, *ubuf;
|
uchar_t *cmpbuf, *ubuf;
|
||||||
|
|
||||||
/* Extract various sizes from rabin header. */
|
/* Extract various sizes from rabin header. */
|
||||||
rabin_parse_hdr(cseg, &blknum, &dedupe_index_sz, &rabin_data_sz,
|
parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &rabin_data_sz,
|
||||||
&dedupe_index_sz_cmp, &rabin_data_sz_cmp, &_chunksize);
|
&dedupe_index_sz_cmp, &rabin_data_sz_cmp, &_chunksize);
|
||||||
memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE);
|
memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE);
|
||||||
|
|
||||||
|
@ -363,14 +365,14 @@ redo:
|
||||||
goto cont;
|
goto cont;
|
||||||
}
|
}
|
||||||
/* Rebuild chunk from dedup blocks. */
|
/* Rebuild chunk from dedup blocks. */
|
||||||
if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) {
|
if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) {
|
||||||
rabin_context_t *rctx;
|
dedupe_context_t *rctx;
|
||||||
uchar_t *tmp;
|
uchar_t *tmp;
|
||||||
|
|
||||||
rctx = tdat->rctx;
|
rctx = tdat->rctx;
|
||||||
reset_rabin_context(tdat->rctx);
|
reset_dedupe_context(tdat->rctx);
|
||||||
rctx->cbuf = tdat->compressed_chunk;
|
rctx->cbuf = tdat->compressed_chunk;
|
||||||
rabin_inverse_dedup(rctx, tdat->uncompressed_chunk, &(tdat->len_cmp));
|
dedupe_decompress(rctx, tdat->uncompressed_chunk, &(tdat->len_cmp));
|
||||||
if (!rctx->valid) {
|
if (!rctx->valid) {
|
||||||
fprintf(stderr, "ERROR: Chunk %d, dedup recovery failed.\n", tdat->id);
|
fprintf(stderr, "ERROR: Chunk %d, dedup recovery failed.\n", tdat->id);
|
||||||
rv = -1;
|
rv = -1;
|
||||||
|
@ -582,8 +584,8 @@ start_decompress(const char *filename, const char *to_filename)
|
||||||
UNCOMP_BAIL;
|
UNCOMP_BAIL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (enable_rabin_scan) {
|
if (enable_rabin_scan || enable_fixed_scan) {
|
||||||
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, rab_blk_size,
|
tdat->rctx = create_dedupe_context(chunksize, compressed_chunksize, rab_blk_size,
|
||||||
algo, enable_delta_encode, enable_fixed_scan);
|
algo, enable_delta_encode, enable_fixed_scan);
|
||||||
if (tdat->rctx == NULL) {
|
if (tdat->rctx == NULL) {
|
||||||
UNCOMP_BAIL;
|
UNCOMP_BAIL;
|
||||||
|
@ -659,7 +661,7 @@ start_decompress(const char *filename, const char *to_filename)
|
||||||
if (!tdat->compressed_chunk) {
|
if (!tdat->compressed_chunk) {
|
||||||
tdat->compressed_chunk = (uchar_t *)slab_alloc(NULL,
|
tdat->compressed_chunk = (uchar_t *)slab_alloc(NULL,
|
||||||
compressed_chunksize);
|
compressed_chunksize);
|
||||||
if (enable_rabin_scan)
|
if ((enable_rabin_scan || enable_fixed_scan))
|
||||||
tdat->uncompressed_chunk = (uchar_t *)slab_alloc(NULL,
|
tdat->uncompressed_chunk = (uchar_t *)slab_alloc(NULL,
|
||||||
compressed_chunksize);
|
compressed_chunksize);
|
||||||
else
|
else
|
||||||
|
@ -735,8 +737,8 @@ uncomp_done:
|
||||||
slab_free(NULL, dary[i]->compressed_chunk);
|
slab_free(NULL, dary[i]->compressed_chunk);
|
||||||
if (_deinit_func)
|
if (_deinit_func)
|
||||||
_deinit_func(&(dary[i]->data));
|
_deinit_func(&(dary[i]->data));
|
||||||
if (enable_rabin_scan) {
|
if ((enable_rabin_scan || enable_fixed_scan)) {
|
||||||
destroy_rabin_context(dary[i]->rctx);
|
destroy_dedupe_context(dary[i]->rctx);
|
||||||
}
|
}
|
||||||
slab_free(NULL, dary[i]);
|
slab_free(NULL, dary[i]);
|
||||||
}
|
}
|
||||||
|
@ -770,8 +772,8 @@ redo:
|
||||||
compressed_chunk = tdat->compressed_chunk + CHUNK_FLAG_SZ;
|
compressed_chunk = tdat->compressed_chunk + CHUNK_FLAG_SZ;
|
||||||
rbytes = tdat->rbytes;
|
rbytes = tdat->rbytes;
|
||||||
/* Perform Dedup if enabled. */
|
/* Perform Dedup if enabled. */
|
||||||
if (enable_rabin_scan) {
|
if ((enable_rabin_scan || enable_fixed_scan)) {
|
||||||
rabin_context_t *rctx;
|
dedupe_context_t *rctx;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Compute checksum of original uncompressed chunk. When doing dedup
|
* Compute checksum of original uncompressed chunk. When doing dedup
|
||||||
|
@ -782,9 +784,9 @@ redo:
|
||||||
compute_checksum(tdat->checksum, cksum, tdat->cmp_seg, tdat->rbytes);
|
compute_checksum(tdat->checksum, cksum, tdat->cmp_seg, tdat->rbytes);
|
||||||
|
|
||||||
rctx = tdat->rctx;
|
rctx = tdat->rctx;
|
||||||
reset_rabin_context(tdat->rctx);
|
reset_dedupe_context(tdat->rctx);
|
||||||
rctx->cbuf = tdat->uncompressed_chunk;
|
rctx->cbuf = tdat->uncompressed_chunk;
|
||||||
dedupe_index_sz = rabin_dedup(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0, NULL);
|
dedupe_index_sz = dedupe_compress(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0, NULL);
|
||||||
if (!rctx->valid) {
|
if (!rctx->valid) {
|
||||||
memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes);
|
memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes);
|
||||||
tdat->rbytes = rbytes;
|
tdat->rbytes = rbytes;
|
||||||
|
@ -801,7 +803,7 @@ redo:
|
||||||
* The rabin index array values can pollute the compressor's dictionary thereby
|
* The rabin index array values can pollute the compressor's dictionary thereby
|
||||||
* reducing compression effectiveness of the data chunk. So we separate them.
|
* reducing compression effectiveness of the data chunk. So we separate them.
|
||||||
*/
|
*/
|
||||||
if (enable_rabin_scan && tdat->rctx->valid) {
|
if ((enable_rabin_scan || enable_fixed_scan) && tdat->rctx->valid) {
|
||||||
_chunksize = tdat->rbytes - dedupe_index_sz - RABIN_HDR_SIZE;
|
_chunksize = tdat->rbytes - dedupe_index_sz - RABIN_HDR_SIZE;
|
||||||
index_size_cmp = dedupe_index_sz;
|
index_size_cmp = dedupe_index_sz;
|
||||||
|
|
||||||
|
@ -837,7 +839,7 @@ redo:
|
||||||
memcpy(compressed_chunk + index_size_cmp,
|
memcpy(compressed_chunk + index_size_cmp,
|
||||||
tdat->uncompressed_chunk + dedupe_index_sz, _chunksize);
|
tdat->uncompressed_chunk + dedupe_index_sz, _chunksize);
|
||||||
/* Now update rabin header with the compressed sizes. */
|
/* Now update rabin header with the compressed sizes. */
|
||||||
rabin_update_hdr(compressed_chunk, index_size_cmp - RABIN_HDR_SIZE,
|
update_dedupe_hdr(compressed_chunk, index_size_cmp - RABIN_HDR_SIZE,
|
||||||
_chunksize);
|
_chunksize);
|
||||||
} else {
|
} else {
|
||||||
/* If rabin index compression fails, we just drop down to plain
|
/* If rabin index compression fails, we just drop down to plain
|
||||||
|
@ -869,7 +871,7 @@ plain_compress:
|
||||||
*/
|
*/
|
||||||
tdat->len_cmp = _chunksize;
|
tdat->len_cmp = _chunksize;
|
||||||
if (_chunksize >= rbytes || rv < 0) {
|
if (_chunksize >= rbytes || rv < 0) {
|
||||||
if (!enable_rabin_scan || !tdat->rctx->valid)
|
if (!(enable_rabin_scan || enable_fixed_scan) || !tdat->rctx->valid)
|
||||||
memcpy(compressed_chunk, tdat->uncompressed_chunk, tdat->rbytes);
|
memcpy(compressed_chunk, tdat->uncompressed_chunk, tdat->rbytes);
|
||||||
type = UNCOMPRESSED;
|
type = UNCOMPRESSED;
|
||||||
tdat->len_cmp = tdat->rbytes;
|
tdat->len_cmp = tdat->rbytes;
|
||||||
|
@ -877,7 +879,7 @@ plain_compress:
|
||||||
type = COMPRESSED;
|
type = COMPRESSED;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (enable_rabin_scan && tdat->rctx->valid) {
|
if ((enable_rabin_scan || enable_fixed_scan) && tdat->rctx->valid) {
|
||||||
type |= CHUNK_FLAG_DEDUP;
|
type |= CHUNK_FLAG_DEDUP;
|
||||||
}
|
}
|
||||||
if (lzp_preprocess) {
|
if (lzp_preprocess) {
|
||||||
|
@ -982,7 +984,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
struct cmp_data **dary = NULL, *tdat;
|
struct cmp_data **dary = NULL, *tdat;
|
||||||
pthread_t writer_thr;
|
pthread_t writer_thr;
|
||||||
uchar_t *cread_buf, *pos;
|
uchar_t *cread_buf, *pos;
|
||||||
rabin_context_t *rctx;
|
dedupe_context_t *rctx;
|
||||||
algo_props_t props;
|
algo_props_t props;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1015,7 +1017,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
else
|
else
|
||||||
flags |= FLAG_DEDUP_FIXED;
|
flags |= FLAG_DEDUP_FIXED;
|
||||||
/* Additional scratch space for dedup arrays. */
|
/* Additional scratch space for dedup arrays. */
|
||||||
compressed_chunksize += (rabin_buf_extra(chunksize, 0, algo,
|
compressed_chunksize += (dedupe_buf_extra(chunksize, 0, algo,
|
||||||
enable_delta_encode) - (compressed_chunksize - chunksize));
|
enable_delta_encode) - (compressed_chunksize - chunksize));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1107,7 +1109,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
dary = (struct cmp_data **)slab_calloc(NULL, nprocs, sizeof (struct cmp_data *));
|
dary = (struct cmp_data **)slab_calloc(NULL, nprocs, sizeof (struct cmp_data *));
|
||||||
if (enable_rabin_scan)
|
if ((enable_rabin_scan || enable_fixed_scan))
|
||||||
cread_buf = (uchar_t *)slab_alloc(NULL, compressed_chunksize);
|
cread_buf = (uchar_t *)slab_alloc(NULL, compressed_chunksize);
|
||||||
else
|
else
|
||||||
cread_buf = (uchar_t *)slab_alloc(NULL, chunksize);
|
cread_buf = (uchar_t *)slab_alloc(NULL, chunksize);
|
||||||
|
@ -1137,8 +1139,8 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
COMP_BAIL;
|
COMP_BAIL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (enable_rabin_scan) {
|
if (enable_rabin_scan || enable_fixed_scan) {
|
||||||
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, rab_blk_size,
|
tdat->rctx = create_dedupe_context(chunksize, compressed_chunksize, rab_blk_size,
|
||||||
algo, enable_delta_encode, enable_fixed_scan);
|
algo, enable_delta_encode, enable_fixed_scan);
|
||||||
if (tdat->rctx == NULL) {
|
if (tdat->rctx == NULL) {
|
||||||
COMP_BAIL;
|
COMP_BAIL;
|
||||||
|
@ -1204,7 +1206,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
* Read the first chunk into a spare buffer (a simple double-buffering).
|
* Read the first chunk into a spare buffer (a simple double-buffering).
|
||||||
*/
|
*/
|
||||||
if (enable_rabin_split) {
|
if (enable_rabin_split) {
|
||||||
rctx = create_rabin_context(chunksize, 0, 0, algo, enable_delta_encode,
|
rctx = create_dedupe_context(chunksize, 0, 0, algo, enable_delta_encode,
|
||||||
enable_fixed_scan);
|
enable_fixed_scan);
|
||||||
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
|
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
|
||||||
} else {
|
} else {
|
||||||
|
@ -1231,7 +1233,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
* Delayed allocation. Allocate chunks if not already done.
|
* Delayed allocation. Allocate chunks if not already done.
|
||||||
*/
|
*/
|
||||||
if (!tdat->cmp_seg) {
|
if (!tdat->cmp_seg) {
|
||||||
if (enable_rabin_scan) {
|
if ((enable_rabin_scan || enable_fixed_scan)) {
|
||||||
if (single_chunk)
|
if (single_chunk)
|
||||||
tdat->cmp_seg = (uchar_t *)1;
|
tdat->cmp_seg = (uchar_t *)1;
|
||||||
else
|
else
|
||||||
|
@ -1266,7 +1268,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
*/
|
*/
|
||||||
tdat->id = chunk_num;
|
tdat->id = chunk_num;
|
||||||
tdat->rbytes = rbytes;
|
tdat->rbytes = rbytes;
|
||||||
if (enable_rabin_scan) {
|
if ((enable_rabin_scan || enable_fixed_scan)) {
|
||||||
tmp = tdat->cmp_seg;
|
tmp = tdat->cmp_seg;
|
||||||
tdat->cmp_seg = cread_buf;
|
tdat->cmp_seg = cread_buf;
|
||||||
cread_buf = tmp;
|
cread_buf = tmp;
|
||||||
|
@ -1383,8 +1385,8 @@ comp_done:
|
||||||
slab_free(NULL, dary[i]->uncompressed_chunk);
|
slab_free(NULL, dary[i]->uncompressed_chunk);
|
||||||
if (dary[i]->cmp_seg != (uchar_t *)1)
|
if (dary[i]->cmp_seg != (uchar_t *)1)
|
||||||
slab_free(NULL, dary[i]->cmp_seg);
|
slab_free(NULL, dary[i]->cmp_seg);
|
||||||
if (enable_rabin_scan) {
|
if ((enable_rabin_scan || enable_fixed_scan)) {
|
||||||
destroy_rabin_context(dary[i]->rctx);
|
destroy_dedupe_context(dary[i]->rctx);
|
||||||
}
|
}
|
||||||
if (_deinit_func)
|
if (_deinit_func)
|
||||||
_deinit_func(&(dary[i]->data));
|
_deinit_func(&(dary[i]->data));
|
||||||
|
@ -1392,7 +1394,7 @@ comp_done:
|
||||||
}
|
}
|
||||||
slab_free(NULL, dary);
|
slab_free(NULL, dary);
|
||||||
}
|
}
|
||||||
if (enable_rabin_split) destroy_rabin_context(rctx);
|
if (enable_rabin_split) destroy_dedupe_context(rctx);
|
||||||
if (cread_buf != (uchar_t *)1)
|
if (cread_buf != (uchar_t *)1)
|
||||||
slab_free(NULL, cread_buf);
|
slab_free(NULL, cread_buf);
|
||||||
if (!pipe_mode) {
|
if (!pipe_mode) {
|
||||||
|
@ -1530,7 +1532,7 @@ main(int argc, char *argv[])
|
||||||
level = 6;
|
level = 6;
|
||||||
slab_init();
|
slab_init();
|
||||||
|
|
||||||
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDErLS:B:")) != -1) {
|
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDErLS:B:F")) != -1) {
|
||||||
int ovr;
|
int ovr;
|
||||||
|
|
||||||
switch (opt) {
|
switch (opt) {
|
||||||
|
@ -1597,8 +1599,9 @@ main(int argc, char *argv[])
|
||||||
enable_delta_encode = 1;
|
enable_delta_encode = 1;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'f':
|
case 'F':
|
||||||
enable_fixed_scan = 1;
|
enable_fixed_scan = 1;
|
||||||
|
enable_rabin_split = 0;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'L':
|
case 'L':
|
||||||
|
@ -1638,15 +1641,15 @@ main(int argc, char *argv[])
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (enable_rabin_scan && !do_compress) {
|
if ((enable_rabin_scan || enable_fixed_scan) && !do_compress) {
|
||||||
fprintf(stderr, "Rabin Deduplication is only used during compression.\n");
|
fprintf(stderr, "Deduplication is only used during compression.\n");
|
||||||
usage();
|
usage();
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
if (!enable_rabin_scan)
|
if (!enable_rabin_scan)
|
||||||
enable_rabin_split = 0;
|
enable_rabin_split = 0;
|
||||||
|
|
||||||
if (enable_fixed_scan && (enable_rabin_scan || enable_delta_encode)) {
|
if (enable_fixed_scan && (enable_rabin_scan || enable_delta_encode || enable_rabin_split)) {
|
||||||
fprintf(stderr, "Rabin Deduplication and Fixed block Deduplication are mutually exclusive\n");
|
fprintf(stderr, "Rabin Deduplication and Fixed block Deduplication are mutually exclusive\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
|
@ -157,7 +157,7 @@ struct cmp_data {
|
||||||
uchar_t *cmp_seg;
|
uchar_t *cmp_seg;
|
||||||
uchar_t *compressed_chunk;
|
uchar_t *compressed_chunk;
|
||||||
uchar_t *uncompressed_chunk;
|
uchar_t *uncompressed_chunk;
|
||||||
rabin_context_t *rctx;
|
dedupe_context_t *rctx;
|
||||||
ssize_t rbytes;
|
ssize_t rbytes;
|
||||||
ssize_t chunksize;
|
ssize_t chunksize;
|
||||||
ssize_t len_cmp;
|
ssize_t len_cmp;
|
||||||
|
|
|
@ -86,7 +86,7 @@ uint64_t ir[256];
|
||||||
static int inited = 0;
|
static int inited = 0;
|
||||||
|
|
||||||
static uint32_t
|
static uint32_t
|
||||||
rabin_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
|
dedupe_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
|
||||||
{
|
{
|
||||||
uint32_t min_blk;
|
uint32_t min_blk;
|
||||||
|
|
||||||
|
@ -95,22 +95,22 @@ rabin_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t
|
uint32_t
|
||||||
rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
|
dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
|
||||||
{
|
{
|
||||||
if (rab_blk_sz < 1 || rab_blk_sz > 5)
|
if (rab_blk_sz < 1 || rab_blk_sz > 5)
|
||||||
rab_blk_sz = RAB_BLK_DEFAULT;
|
rab_blk_sz = RAB_BLK_DEFAULT;
|
||||||
|
|
||||||
return ((chunksize / rabin_min_blksz(chunksize, rab_blk_sz, algo, delta_flag))
|
return ((chunksize / dedupe_min_blksz(chunksize, rab_blk_sz, algo, delta_flag))
|
||||||
* sizeof (uint32_t));
|
* sizeof (uint32_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Initialize the algorithm with the default params.
|
* Initialize the algorithm with the default params.
|
||||||
*/
|
*/
|
||||||
rabin_context_t *
|
dedupe_context_t *
|
||||||
create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz,
|
create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz,
|
||||||
const char *algo, int delta_flag, int fixed_flag) {
|
const char *algo, int delta_flag, int fixed_flag) {
|
||||||
rabin_context_t *ctx;
|
dedupe_context_t *ctx;
|
||||||
unsigned char *current_window_data;
|
unsigned char *current_window_data;
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
|
|
||||||
|
@ -165,7 +165,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
||||||
* use 4K minimum Rabin block size. For everything else it is 2K based
|
* use 4K minimum Rabin block size. For everything else it is 2K based
|
||||||
* on experimentation.
|
* on experimentation.
|
||||||
*/
|
*/
|
||||||
ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t));
|
ctx = (dedupe_context_t *)slab_alloc(NULL, sizeof (dedupe_context_t));
|
||||||
ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
|
ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
|
||||||
|
|
||||||
ctx->fixed_flag = fixed_flag;
|
ctx->fixed_flag = fixed_flag;
|
||||||
|
@ -173,7 +173,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
||||||
ctx->delta_flag = delta_flag;
|
ctx->delta_flag = delta_flag;
|
||||||
ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
|
ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
|
||||||
ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1;
|
ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1;
|
||||||
ctx->rabin_poly_min_block_size = rabin_min_blksz(chunksize, rab_blk_sz, algo, delta_flag);
|
ctx->rabin_poly_min_block_size = dedupe_min_blksz(chunksize, rab_blk_sz, algo, delta_flag);
|
||||||
ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size;
|
ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size;
|
||||||
|
|
||||||
if (!fixed_flag)
|
if (!fixed_flag)
|
||||||
|
@ -186,7 +186,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
||||||
|
|
||||||
if (ctx->blknum > RABIN_MAX_BLOCKS) {
|
if (ctx->blknum > RABIN_MAX_BLOCKS) {
|
||||||
fprintf(stderr, "Chunk size too large for dedup.\n");
|
fprintf(stderr, "Chunk size too large for dedup.\n");
|
||||||
destroy_rabin_context(ctx);
|
destroy_dedupe_context(ctx);
|
||||||
return (NULL);
|
return (NULL);
|
||||||
}
|
}
|
||||||
current_window_data = slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE);
|
current_window_data = slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE);
|
||||||
|
@ -198,7 +198,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
||||||
if(ctx == NULL || current_window_data == NULL || (ctx->blocks == NULL && real_chunksize > 0)) {
|
if(ctx == NULL || current_window_data == NULL || (ctx->blocks == NULL && real_chunksize > 0)) {
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
"Could not allocate rabin polynomial context, out of memory\n");
|
"Could not allocate rabin polynomial context, out of memory\n");
|
||||||
destroy_rabin_context(ctx);
|
destroy_dedupe_context(ctx);
|
||||||
return (NULL);
|
return (NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -209,7 +209,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
||||||
if (!(ctx->lzma_data)) {
|
if (!(ctx->lzma_data)) {
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
"Could not initialize LZMA data for dedupe index, out of memory\n");
|
"Could not initialize LZMA data for dedupe index, out of memory\n");
|
||||||
destroy_rabin_context(ctx);
|
destroy_dedupe_context(ctx);
|
||||||
return (NULL);
|
return (NULL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -227,19 +227,19 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
||||||
slab_cache_add(sizeof (rabin_blockentry_t));
|
slab_cache_add(sizeof (rabin_blockentry_t));
|
||||||
ctx->current_window_data = current_window_data;
|
ctx->current_window_data = current_window_data;
|
||||||
ctx->real_chunksize = real_chunksize;
|
ctx->real_chunksize = real_chunksize;
|
||||||
reset_rabin_context(ctx);
|
reset_dedupe_context(ctx);
|
||||||
return (ctx);
|
return (ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
reset_rabin_context(rabin_context_t *ctx)
|
reset_dedupe_context(dedupe_context_t *ctx)
|
||||||
{
|
{
|
||||||
memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
|
memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
|
||||||
ctx->window_pos = 0;
|
ctx->window_pos = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
destroy_rabin_context(rabin_context_t *ctx)
|
destroy_dedupe_context(dedupe_context_t *ctx)
|
||||||
{
|
{
|
||||||
if (ctx) {
|
if (ctx) {
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
|
@ -288,11 +288,13 @@ cmpblks(const void *a, const void *b)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Perform Deduplication based on Rabin Fingerprinting. A 31-byte window is used for
|
* Perform Deduplication.
|
||||||
* the rolling checksum and dedup blocks vary in size from 4K-128K.
|
* Both Semi-Rabin fingerprinting based and Fixed Block Deduplication are supported.
|
||||||
|
* A 16-byte window is used for the rolling checksum and dedup blocks can vary in size
|
||||||
|
* from 4K-128K.
|
||||||
*/
|
*/
|
||||||
uint32_t
|
uint32_t
|
||||||
rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos)
|
dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos)
|
||||||
{
|
{
|
||||||
ssize_t i, last_offset, j, fplist_sz;
|
ssize_t i, last_offset, j, fplist_sz;
|
||||||
uint32_t blknum;
|
uint32_t blknum;
|
||||||
|
@ -302,6 +304,40 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
uint32_t *fplist;
|
uint32_t *fplist;
|
||||||
heap_t heap;
|
heap_t heap;
|
||||||
|
|
||||||
|
length = offset;
|
||||||
|
last_offset = 0;
|
||||||
|
blknum = 0;
|
||||||
|
ctx->valid = 0;
|
||||||
|
cur_roll_checksum = 0;
|
||||||
|
cur_sketch = 0;
|
||||||
|
|
||||||
|
if (ctx->fixed_flag) {
|
||||||
|
blknum = *size / ctx->rabin_poly_avg_block_size;
|
||||||
|
j = *size % ctx->rabin_poly_avg_block_size;
|
||||||
|
if (j) blknum++;
|
||||||
|
|
||||||
|
last_offset = 0;
|
||||||
|
length = ctx->rabin_poly_avg_block_size;
|
||||||
|
for (i=0; i<blknum; i++) {
|
||||||
|
if (i == blknum-1) {
|
||||||
|
length = j;
|
||||||
|
}
|
||||||
|
if (ctx->blocks[i] == 0) {
|
||||||
|
ctx->blocks[i] = (rabin_blockentry_t *)slab_alloc(NULL,
|
||||||
|
sizeof (rabin_blockentry_t));
|
||||||
|
}
|
||||||
|
ctx->blocks[i]->offset = last_offset;
|
||||||
|
ctx->blocks[i]->index = i; // Need to store for sorting
|
||||||
|
ctx->blocks[i]->length = length;
|
||||||
|
ctx->blocks[i]->ref = 0;
|
||||||
|
ctx->blocks[i]->similar = 0;
|
||||||
|
ctx->blocks[i]->crc = XXH_strong32(buf1+last_offset, length, 0);
|
||||||
|
ctx->blocks[i]->cksum_n_offset = ctx->blocks[i]->crc;
|
||||||
|
last_offset += length;
|
||||||
|
}
|
||||||
|
goto process_blocks;
|
||||||
|
}
|
||||||
|
|
||||||
if (rabin_pos == NULL) {
|
if (rabin_pos == NULL) {
|
||||||
/*
|
/*
|
||||||
* Initialize arrays for sketch computation. We re-use memory allocated
|
* Initialize arrays for sketch computation. We re-use memory allocated
|
||||||
|
@ -312,12 +348,6 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
memset(fplist, 0, fplist_sz);
|
memset(fplist, 0, fplist_sz);
|
||||||
reset_heap(&heap, fplist_sz/2);
|
reset_heap(&heap, fplist_sz/2);
|
||||||
}
|
}
|
||||||
length = offset;
|
|
||||||
last_offset = 0;
|
|
||||||
blknum = 0;
|
|
||||||
ctx->valid = 0;
|
|
||||||
cur_roll_checksum = 0;
|
|
||||||
cur_sketch = 0;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
|
* If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
|
||||||
|
@ -434,6 +464,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
process_blocks:
|
||||||
DEBUG_STAT_EN(printf("Original size: %lld, blknum: %u\n", *size, blknum));
|
DEBUG_STAT_EN(printf("Original size: %lld, blknum: %u\n", *size, blknum));
|
||||||
// If we found at least a few chunks, perform dedup.
|
// If we found at least a few chunks, perform dedup.
|
||||||
if (blknum > 2) {
|
if (blknum > 2) {
|
||||||
|
@ -701,7 +732,7 @@ cont:
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
rabin_update_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_sz_cmp)
|
update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_sz_cmp)
|
||||||
{
|
{
|
||||||
ssize_t *entries;
|
ssize_t *entries;
|
||||||
|
|
||||||
|
@ -712,7 +743,7 @@ rabin_update_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_s
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
rabin_parse_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
|
parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
|
||||||
ssize_t *rabin_data_sz, ssize_t *dedupe_index_sz_cmp,
|
ssize_t *rabin_data_sz, ssize_t *dedupe_index_sz_cmp,
|
||||||
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size)
|
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size)
|
||||||
{
|
{
|
||||||
|
@ -730,7 +761,7 @@ rabin_parse_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
|
dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size)
|
||||||
{
|
{
|
||||||
uint32_t blknum, blk, oblk, len;
|
uint32_t blknum, blk, oblk, len;
|
||||||
uint32_t *dedupe_index;
|
uint32_t *dedupe_index;
|
||||||
|
@ -738,7 +769,7 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
|
||||||
ssize_t dedupe_index_sz, pos1, i;
|
ssize_t dedupe_index_sz, pos1, i;
|
||||||
uchar_t *pos2;
|
uchar_t *pos2;
|
||||||
|
|
||||||
rabin_parse_hdr(buf, &blknum, &dedupe_index_sz, &data_sz, &indx_cmp, &data_sz_cmp, &deduped_sz);
|
parse_dedupe_hdr(buf, &blknum, &dedupe_index_sz, &data_sz, &indx_cmp, &data_sz_cmp, &deduped_sz);
|
||||||
dedupe_index = (uint32_t *)(buf + RABIN_HDR_SIZE);
|
dedupe_index = (uint32_t *)(buf + RABIN_HDR_SIZE);
|
||||||
pos1 = dedupe_index_sz + RABIN_HDR_SIZE;
|
pos1 = dedupe_index_sz + RABIN_HDR_SIZE;
|
||||||
pos2 = ctx->cbuf;
|
pos2 = ctx->cbuf;
|
||||||
|
@ -828,7 +859,7 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
|
||||||
* TODO: Consolidate rabin dedup and compression/decompression in functions here rather than
|
* TODO: Consolidate rabin dedup and compression/decompression in functions here rather than
|
||||||
* messy code in main program.
|
* messy code in main program.
|
||||||
int
|
int
|
||||||
rabin_compress(rabin_context_t *ctx, uchar_t *from, ssize_t fromlen, uchar_t *to, ssize_t *tolen,
|
rabin_compress(dedupe_context_t *ctx, uchar_t *from, ssize_t fromlen, uchar_t *to, ssize_t *tolen,
|
||||||
int level, char chdr, void *data, compress_func_ptr cmp)
|
int level, char chdr, void *data, compress_func_ptr cmp)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
|
@ -150,21 +150,21 @@ typedef struct {
|
||||||
short valid;
|
short valid;
|
||||||
void *lzma_data;
|
void *lzma_data;
|
||||||
int level, delta_flag, fixed_flag;
|
int level, delta_flag, fixed_flag;
|
||||||
} rabin_context_t;
|
} dedupe_context_t;
|
||||||
|
|
||||||
extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize,
|
extern dedupe_context_t *create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize,
|
||||||
int rab_blk_sz, const char *algo, int delta_flag, int fixed_flag);
|
int rab_blk_sz, const char *algo, int delta_flag, int fixed_flag);
|
||||||
extern void destroy_rabin_context(rabin_context_t *ctx);
|
extern void destroy_dedupe_context(dedupe_context_t *ctx);
|
||||||
extern unsigned int rabin_dedup(rabin_context_t *ctx, unsigned char *buf,
|
extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf,
|
||||||
ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
|
ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
|
||||||
extern void rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size);
|
extern void dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size);
|
||||||
extern void rabin_parse_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_index_sz,
|
extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_index_sz,
|
||||||
ssize_t *rabin_data_sz, ssize_t *rabin_index_sz_cmp,
|
ssize_t *rabin_data_sz, ssize_t *rabin_index_sz_cmp,
|
||||||
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size);
|
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size);
|
||||||
extern void rabin_update_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp,
|
extern void update_dedupe_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp,
|
||||||
ssize_t rabin_data_sz_cmp);
|
ssize_t rabin_data_sz_cmp);
|
||||||
extern void reset_rabin_context(rabin_context_t *ctx);
|
extern void reset_dedupe_context(dedupe_context_t *ctx);
|
||||||
extern uint32_t rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
|
extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
|
||||||
int delta_flag);
|
int delta_flag);
|
||||||
|
|
||||||
#endif /* _RABIN_POLY_H_ */
|
#endif /* _RABIN_POLY_H_ */
|
||||||
|
|
|
@ -223,7 +223,7 @@ Read_Adjusted(int fd, uchar_t *buf, size_t count, ssize_t *rabin_count, void *ct
|
||||||
{
|
{
|
||||||
char *buf2;
|
char *buf2;
|
||||||
ssize_t rcount;
|
ssize_t rcount;
|
||||||
rabin_context_t *rctx = (rabin_context_t *)ctx;
|
dedupe_context_t *rctx = (dedupe_context_t *)ctx;
|
||||||
|
|
||||||
if (!ctx) return (Read(fd, buf, count));
|
if (!ctx) return (Read(fd, buf, count));
|
||||||
buf2 = buf;
|
buf2 = buf;
|
||||||
|
@ -235,7 +235,7 @@ Read_Adjusted(int fd, uchar_t *buf, size_t count, ssize_t *rabin_count, void *ct
|
||||||
if (rcount > 0) {
|
if (rcount > 0) {
|
||||||
rcount += *rabin_count;
|
rcount += *rabin_count;
|
||||||
if (rcount == count)
|
if (rcount == count)
|
||||||
rabin_dedup(rctx, buf, &rcount, 0, rabin_count);
|
dedupe_compress(rctx, buf, &rcount, 0, rabin_count);
|
||||||
else
|
else
|
||||||
*rabin_count = 0;
|
*rabin_count = 0;
|
||||||
} else {
|
} else {
|
||||||
|
|
Loading…
Reference in a new issue