Add support for Fixed-Block deduplication.
More refactoring of symbol names.
This commit is contained in:
parent
b9355a5dcc
commit
e3befd9e16
6 changed files with 116 additions and 76 deletions
|
@ -97,6 +97,12 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
|
|||
the fastest in the group, especially on x86 platforms. BLAKE is faster
|
||||
than SKEIN on a few platforms.
|
||||
SKEIN 512-256 is about 60% faster than SHA 512-256 on x64 platforms.
|
||||
|
||||
'-F' - Perform Fixed Block Deduplication. This is faster than fingerprinting
|
||||
based content-aware deduplication in some cases. However, this is mostly
|
||||
usable for disk dumps, especially virtual machine images. This generally
|
||||
gives a lower dedupe ratio than content-aware dedupe (-D) and does not
|
||||
support delta compression.
|
||||
'-M' - Display memory allocator statistics
|
||||
'-C' - Display compression statistics
|
||||
|
||||
|
|
75
main.c
75
main.c
|
@ -90,7 +90,7 @@ static int do_uncompress = 0;
|
|||
static int cksum_bytes;
|
||||
static int cksum = 0;
|
||||
static int rab_blk_size = 0;
|
||||
static rabin_context_t *rctx;
|
||||
static dedupe_context_t *rctx;
|
||||
|
||||
static void
|
||||
usage(void)
|
||||
|
@ -145,6 +145,8 @@ usage(void)
|
|||
" '-S' <cksum>\n"
|
||||
" - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512\n"
|
||||
" Default one is SKEIN256.\n"
|
||||
" '-F' - Perform Fixed-Block Deduplication. Faster than '-D' in some cases\n"
|
||||
" but with lower deduplication ratio.\n"
|
||||
" '-B' <1..5>\n"
|
||||
" - Specify a minimum Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n"
|
||||
" '-M' - Display memory allocator statistics\n"
|
||||
|
@ -299,11 +301,11 @@ redo:
|
|||
_chunksize = ntohll(*((ssize_t *)rseg));
|
||||
}
|
||||
|
||||
if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) {
|
||||
if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) {
|
||||
uchar_t *cmpbuf, *ubuf;
|
||||
|
||||
/* Extract various sizes from rabin header. */
|
||||
rabin_parse_hdr(cseg, &blknum, &dedupe_index_sz, &rabin_data_sz,
|
||||
parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &rabin_data_sz,
|
||||
&dedupe_index_sz_cmp, &rabin_data_sz_cmp, &_chunksize);
|
||||
memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE);
|
||||
|
||||
|
@ -363,14 +365,14 @@ redo:
|
|||
goto cont;
|
||||
}
|
||||
/* Rebuild chunk from dedup blocks. */
|
||||
if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) {
|
||||
rabin_context_t *rctx;
|
||||
if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) {
|
||||
dedupe_context_t *rctx;
|
||||
uchar_t *tmp;
|
||||
|
||||
rctx = tdat->rctx;
|
||||
reset_rabin_context(tdat->rctx);
|
||||
reset_dedupe_context(tdat->rctx);
|
||||
rctx->cbuf = tdat->compressed_chunk;
|
||||
rabin_inverse_dedup(rctx, tdat->uncompressed_chunk, &(tdat->len_cmp));
|
||||
dedupe_decompress(rctx, tdat->uncompressed_chunk, &(tdat->len_cmp));
|
||||
if (!rctx->valid) {
|
||||
fprintf(stderr, "ERROR: Chunk %d, dedup recovery failed.\n", tdat->id);
|
||||
rv = -1;
|
||||
|
@ -582,8 +584,8 @@ start_decompress(const char *filename, const char *to_filename)
|
|||
UNCOMP_BAIL;
|
||||
}
|
||||
}
|
||||
if (enable_rabin_scan) {
|
||||
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, rab_blk_size,
|
||||
if (enable_rabin_scan || enable_fixed_scan) {
|
||||
tdat->rctx = create_dedupe_context(chunksize, compressed_chunksize, rab_blk_size,
|
||||
algo, enable_delta_encode, enable_fixed_scan);
|
||||
if (tdat->rctx == NULL) {
|
||||
UNCOMP_BAIL;
|
||||
|
@ -659,7 +661,7 @@ start_decompress(const char *filename, const char *to_filename)
|
|||
if (!tdat->compressed_chunk) {
|
||||
tdat->compressed_chunk = (uchar_t *)slab_alloc(NULL,
|
||||
compressed_chunksize);
|
||||
if (enable_rabin_scan)
|
||||
if ((enable_rabin_scan || enable_fixed_scan))
|
||||
tdat->uncompressed_chunk = (uchar_t *)slab_alloc(NULL,
|
||||
compressed_chunksize);
|
||||
else
|
||||
|
@ -735,8 +737,8 @@ uncomp_done:
|
|||
slab_free(NULL, dary[i]->compressed_chunk);
|
||||
if (_deinit_func)
|
||||
_deinit_func(&(dary[i]->data));
|
||||
if (enable_rabin_scan) {
|
||||
destroy_rabin_context(dary[i]->rctx);
|
||||
if ((enable_rabin_scan || enable_fixed_scan)) {
|
||||
destroy_dedupe_context(dary[i]->rctx);
|
||||
}
|
||||
slab_free(NULL, dary[i]);
|
||||
}
|
||||
|
@ -770,8 +772,8 @@ redo:
|
|||
compressed_chunk = tdat->compressed_chunk + CHUNK_FLAG_SZ;
|
||||
rbytes = tdat->rbytes;
|
||||
/* Perform Dedup if enabled. */
|
||||
if (enable_rabin_scan) {
|
||||
rabin_context_t *rctx;
|
||||
if ((enable_rabin_scan || enable_fixed_scan)) {
|
||||
dedupe_context_t *rctx;
|
||||
|
||||
/*
|
||||
* Compute checksum of original uncompressed chunk. When doing dedup
|
||||
|
@ -782,9 +784,9 @@ redo:
|
|||
compute_checksum(tdat->checksum, cksum, tdat->cmp_seg, tdat->rbytes);
|
||||
|
||||
rctx = tdat->rctx;
|
||||
reset_rabin_context(tdat->rctx);
|
||||
reset_dedupe_context(tdat->rctx);
|
||||
rctx->cbuf = tdat->uncompressed_chunk;
|
||||
dedupe_index_sz = rabin_dedup(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0, NULL);
|
||||
dedupe_index_sz = dedupe_compress(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0, NULL);
|
||||
if (!rctx->valid) {
|
||||
memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes);
|
||||
tdat->rbytes = rbytes;
|
||||
|
@ -801,7 +803,7 @@ redo:
|
|||
* The rabin index array values can pollute the compressor's dictionary thereby
|
||||
* reducing compression effectiveness of the data chunk. So we separate them.
|
||||
*/
|
||||
if (enable_rabin_scan && tdat->rctx->valid) {
|
||||
if ((enable_rabin_scan || enable_fixed_scan) && tdat->rctx->valid) {
|
||||
_chunksize = tdat->rbytes - dedupe_index_sz - RABIN_HDR_SIZE;
|
||||
index_size_cmp = dedupe_index_sz;
|
||||
|
||||
|
@ -837,7 +839,7 @@ redo:
|
|||
memcpy(compressed_chunk + index_size_cmp,
|
||||
tdat->uncompressed_chunk + dedupe_index_sz, _chunksize);
|
||||
/* Now update rabin header with the compressed sizes. */
|
||||
rabin_update_hdr(compressed_chunk, index_size_cmp - RABIN_HDR_SIZE,
|
||||
update_dedupe_hdr(compressed_chunk, index_size_cmp - RABIN_HDR_SIZE,
|
||||
_chunksize);
|
||||
} else {
|
||||
/* If rabin index compression fails, we just drop down to plain
|
||||
|
@ -869,7 +871,7 @@ plain_compress:
|
|||
*/
|
||||
tdat->len_cmp = _chunksize;
|
||||
if (_chunksize >= rbytes || rv < 0) {
|
||||
if (!enable_rabin_scan || !tdat->rctx->valid)
|
||||
if (!(enable_rabin_scan || enable_fixed_scan) || !tdat->rctx->valid)
|
||||
memcpy(compressed_chunk, tdat->uncompressed_chunk, tdat->rbytes);
|
||||
type = UNCOMPRESSED;
|
||||
tdat->len_cmp = tdat->rbytes;
|
||||
|
@ -877,7 +879,7 @@ plain_compress:
|
|||
type = COMPRESSED;
|
||||
}
|
||||
|
||||
if (enable_rabin_scan && tdat->rctx->valid) {
|
||||
if ((enable_rabin_scan || enable_fixed_scan) && tdat->rctx->valid) {
|
||||
type |= CHUNK_FLAG_DEDUP;
|
||||
}
|
||||
if (lzp_preprocess) {
|
||||
|
@ -982,7 +984,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
struct cmp_data **dary = NULL, *tdat;
|
||||
pthread_t writer_thr;
|
||||
uchar_t *cread_buf, *pos;
|
||||
rabin_context_t *rctx;
|
||||
dedupe_context_t *rctx;
|
||||
algo_props_t props;
|
||||
|
||||
/*
|
||||
|
@ -1015,7 +1017,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
else
|
||||
flags |= FLAG_DEDUP_FIXED;
|
||||
/* Additional scratch space for dedup arrays. */
|
||||
compressed_chunksize += (rabin_buf_extra(chunksize, 0, algo,
|
||||
compressed_chunksize += (dedupe_buf_extra(chunksize, 0, algo,
|
||||
enable_delta_encode) - (compressed_chunksize - chunksize));
|
||||
}
|
||||
|
||||
|
@ -1107,7 +1109,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
fprintf(stderr, "\n");
|
||||
|
||||
dary = (struct cmp_data **)slab_calloc(NULL, nprocs, sizeof (struct cmp_data *));
|
||||
if (enable_rabin_scan)
|
||||
if ((enable_rabin_scan || enable_fixed_scan))
|
||||
cread_buf = (uchar_t *)slab_alloc(NULL, compressed_chunksize);
|
||||
else
|
||||
cread_buf = (uchar_t *)slab_alloc(NULL, chunksize);
|
||||
|
@ -1137,8 +1139,8 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
COMP_BAIL;
|
||||
}
|
||||
}
|
||||
if (enable_rabin_scan) {
|
||||
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, rab_blk_size,
|
||||
if (enable_rabin_scan || enable_fixed_scan) {
|
||||
tdat->rctx = create_dedupe_context(chunksize, compressed_chunksize, rab_blk_size,
|
||||
algo, enable_delta_encode, enable_fixed_scan);
|
||||
if (tdat->rctx == NULL) {
|
||||
COMP_BAIL;
|
||||
|
@ -1204,7 +1206,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
* Read the first chunk into a spare buffer (a simple double-buffering).
|
||||
*/
|
||||
if (enable_rabin_split) {
|
||||
rctx = create_rabin_context(chunksize, 0, 0, algo, enable_delta_encode,
|
||||
rctx = create_dedupe_context(chunksize, 0, 0, algo, enable_delta_encode,
|
||||
enable_fixed_scan);
|
||||
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
|
||||
} else {
|
||||
|
@ -1231,7 +1233,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
* Delayed allocation. Allocate chunks if not already done.
|
||||
*/
|
||||
if (!tdat->cmp_seg) {
|
||||
if (enable_rabin_scan) {
|
||||
if ((enable_rabin_scan || enable_fixed_scan)) {
|
||||
if (single_chunk)
|
||||
tdat->cmp_seg = (uchar_t *)1;
|
||||
else
|
||||
|
@ -1266,7 +1268,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
*/
|
||||
tdat->id = chunk_num;
|
||||
tdat->rbytes = rbytes;
|
||||
if (enable_rabin_scan) {
|
||||
if ((enable_rabin_scan || enable_fixed_scan)) {
|
||||
tmp = tdat->cmp_seg;
|
||||
tdat->cmp_seg = cread_buf;
|
||||
cread_buf = tmp;
|
||||
|
@ -1383,8 +1385,8 @@ comp_done:
|
|||
slab_free(NULL, dary[i]->uncompressed_chunk);
|
||||
if (dary[i]->cmp_seg != (uchar_t *)1)
|
||||
slab_free(NULL, dary[i]->cmp_seg);
|
||||
if (enable_rabin_scan) {
|
||||
destroy_rabin_context(dary[i]->rctx);
|
||||
if ((enable_rabin_scan || enable_fixed_scan)) {
|
||||
destroy_dedupe_context(dary[i]->rctx);
|
||||
}
|
||||
if (_deinit_func)
|
||||
_deinit_func(&(dary[i]->data));
|
||||
|
@ -1392,7 +1394,7 @@ comp_done:
|
|||
}
|
||||
slab_free(NULL, dary);
|
||||
}
|
||||
if (enable_rabin_split) destroy_rabin_context(rctx);
|
||||
if (enable_rabin_split) destroy_dedupe_context(rctx);
|
||||
if (cread_buf != (uchar_t *)1)
|
||||
slab_free(NULL, cread_buf);
|
||||
if (!pipe_mode) {
|
||||
|
@ -1530,7 +1532,7 @@ main(int argc, char *argv[])
|
|||
level = 6;
|
||||
slab_init();
|
||||
|
||||
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDErLS:B:")) != -1) {
|
||||
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDErLS:B:F")) != -1) {
|
||||
int ovr;
|
||||
|
||||
switch (opt) {
|
||||
|
@ -1597,8 +1599,9 @@ main(int argc, char *argv[])
|
|||
enable_delta_encode = 1;
|
||||
break;
|
||||
|
||||
case 'f':
|
||||
case 'F':
|
||||
enable_fixed_scan = 1;
|
||||
enable_rabin_split = 0;
|
||||
break;
|
||||
|
||||
case 'L':
|
||||
|
@ -1638,15 +1641,15 @@ main(int argc, char *argv[])
|
|||
exit(1);
|
||||
}
|
||||
|
||||
if (enable_rabin_scan && !do_compress) {
|
||||
fprintf(stderr, "Rabin Deduplication is only used during compression.\n");
|
||||
if ((enable_rabin_scan || enable_fixed_scan) && !do_compress) {
|
||||
fprintf(stderr, "Deduplication is only used during compression.\n");
|
||||
usage();
|
||||
exit(1);
|
||||
}
|
||||
if (!enable_rabin_scan)
|
||||
enable_rabin_split = 0;
|
||||
|
||||
if (enable_fixed_scan && (enable_rabin_scan || enable_delta_encode)) {
|
||||
if (enable_fixed_scan && (enable_rabin_scan || enable_delta_encode || enable_rabin_split)) {
|
||||
fprintf(stderr, "Rabin Deduplication and Fixed block Deduplication are mutually exclusive\n");
|
||||
exit(1);
|
||||
}
|
||||
|
|
|
@ -157,7 +157,7 @@ struct cmp_data {
|
|||
uchar_t *cmp_seg;
|
||||
uchar_t *compressed_chunk;
|
||||
uchar_t *uncompressed_chunk;
|
||||
rabin_context_t *rctx;
|
||||
dedupe_context_t *rctx;
|
||||
ssize_t rbytes;
|
||||
ssize_t chunksize;
|
||||
ssize_t len_cmp;
|
||||
|
|
|
@ -86,7 +86,7 @@ uint64_t ir[256];
|
|||
static int inited = 0;
|
||||
|
||||
static uint32_t
|
||||
rabin_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
|
||||
dedupe_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
|
||||
{
|
||||
uint32_t min_blk;
|
||||
|
||||
|
@ -95,22 +95,22 @@ rabin_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_
|
|||
}
|
||||
|
||||
uint32_t
|
||||
rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
|
||||
dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
|
||||
{
|
||||
if (rab_blk_sz < 1 || rab_blk_sz > 5)
|
||||
rab_blk_sz = RAB_BLK_DEFAULT;
|
||||
|
||||
return ((chunksize / rabin_min_blksz(chunksize, rab_blk_sz, algo, delta_flag))
|
||||
return ((chunksize / dedupe_min_blksz(chunksize, rab_blk_sz, algo, delta_flag))
|
||||
* sizeof (uint32_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the algorithm with the default params.
|
||||
*/
|
||||
rabin_context_t *
|
||||
create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz,
|
||||
dedupe_context_t *
|
||||
create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz,
|
||||
const char *algo, int delta_flag, int fixed_flag) {
|
||||
rabin_context_t *ctx;
|
||||
dedupe_context_t *ctx;
|
||||
unsigned char *current_window_data;
|
||||
uint32_t i;
|
||||
|
||||
|
@ -165,7 +165,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
|||
* use 4K minimum Rabin block size. For everything else it is 2K based
|
||||
* on experimentation.
|
||||
*/
|
||||
ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t));
|
||||
ctx = (dedupe_context_t *)slab_alloc(NULL, sizeof (dedupe_context_t));
|
||||
ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
|
||||
|
||||
ctx->fixed_flag = fixed_flag;
|
||||
|
@ -173,7 +173,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
|||
ctx->delta_flag = delta_flag;
|
||||
ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
|
||||
ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1;
|
||||
ctx->rabin_poly_min_block_size = rabin_min_blksz(chunksize, rab_blk_sz, algo, delta_flag);
|
||||
ctx->rabin_poly_min_block_size = dedupe_min_blksz(chunksize, rab_blk_sz, algo, delta_flag);
|
||||
ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size;
|
||||
|
||||
if (!fixed_flag)
|
||||
|
@ -186,7 +186,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
|||
|
||||
if (ctx->blknum > RABIN_MAX_BLOCKS) {
|
||||
fprintf(stderr, "Chunk size too large for dedup.\n");
|
||||
destroy_rabin_context(ctx);
|
||||
destroy_dedupe_context(ctx);
|
||||
return (NULL);
|
||||
}
|
||||
current_window_data = slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE);
|
||||
|
@ -198,7 +198,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
|||
if(ctx == NULL || current_window_data == NULL || (ctx->blocks == NULL && real_chunksize > 0)) {
|
||||
fprintf(stderr,
|
||||
"Could not allocate rabin polynomial context, out of memory\n");
|
||||
destroy_rabin_context(ctx);
|
||||
destroy_dedupe_context(ctx);
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
|
@ -209,7 +209,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
|||
if (!(ctx->lzma_data)) {
|
||||
fprintf(stderr,
|
||||
"Could not initialize LZMA data for dedupe index, out of memory\n");
|
||||
destroy_rabin_context(ctx);
|
||||
destroy_dedupe_context(ctx);
|
||||
return (NULL);
|
||||
}
|
||||
}
|
||||
|
@ -227,19 +227,19 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
|||
slab_cache_add(sizeof (rabin_blockentry_t));
|
||||
ctx->current_window_data = current_window_data;
|
||||
ctx->real_chunksize = real_chunksize;
|
||||
reset_rabin_context(ctx);
|
||||
reset_dedupe_context(ctx);
|
||||
return (ctx);
|
||||
}
|
||||
|
||||
void
|
||||
reset_rabin_context(rabin_context_t *ctx)
|
||||
reset_dedupe_context(dedupe_context_t *ctx)
|
||||
{
|
||||
memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
|
||||
ctx->window_pos = 0;
|
||||
}
|
||||
|
||||
void
|
||||
destroy_rabin_context(rabin_context_t *ctx)
|
||||
destroy_dedupe_context(dedupe_context_t *ctx)
|
||||
{
|
||||
if (ctx) {
|
||||
uint32_t i;
|
||||
|
@ -288,11 +288,13 @@ cmpblks(const void *a, const void *b)
|
|||
}
|
||||
|
||||
/**
|
||||
* Perform Deduplication based on Rabin Fingerprinting. A 31-byte window is used for
|
||||
* the rolling checksum and dedup blocks vary in size from 4K-128K.
|
||||
* Perform Deduplication.
|
||||
* Both Semi-Rabin fingerprinting based and Fixed Block Deduplication are supported.
|
||||
* A 16-byte window is used for the rolling checksum and dedup blocks can vary in size
|
||||
* from 4K-128K.
|
||||
*/
|
||||
uint32_t
|
||||
rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos)
|
||||
dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos)
|
||||
{
|
||||
ssize_t i, last_offset, j, fplist_sz;
|
||||
uint32_t blknum;
|
||||
|
@ -302,6 +304,40 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
uint32_t *fplist;
|
||||
heap_t heap;
|
||||
|
||||
length = offset;
|
||||
last_offset = 0;
|
||||
blknum = 0;
|
||||
ctx->valid = 0;
|
||||
cur_roll_checksum = 0;
|
||||
cur_sketch = 0;
|
||||
|
||||
if (ctx->fixed_flag) {
|
||||
blknum = *size / ctx->rabin_poly_avg_block_size;
|
||||
j = *size % ctx->rabin_poly_avg_block_size;
|
||||
if (j) blknum++;
|
||||
|
||||
last_offset = 0;
|
||||
length = ctx->rabin_poly_avg_block_size;
|
||||
for (i=0; i<blknum; i++) {
|
||||
if (i == blknum-1) {
|
||||
length = j;
|
||||
}
|
||||
if (ctx->blocks[i] == 0) {
|
||||
ctx->blocks[i] = (rabin_blockentry_t *)slab_alloc(NULL,
|
||||
sizeof (rabin_blockentry_t));
|
||||
}
|
||||
ctx->blocks[i]->offset = last_offset;
|
||||
ctx->blocks[i]->index = i; // Need to store for sorting
|
||||
ctx->blocks[i]->length = length;
|
||||
ctx->blocks[i]->ref = 0;
|
||||
ctx->blocks[i]->similar = 0;
|
||||
ctx->blocks[i]->crc = XXH_strong32(buf1+last_offset, length, 0);
|
||||
ctx->blocks[i]->cksum_n_offset = ctx->blocks[i]->crc;
|
||||
last_offset += length;
|
||||
}
|
||||
goto process_blocks;
|
||||
}
|
||||
|
||||
if (rabin_pos == NULL) {
|
||||
/*
|
||||
* Initialize arrays for sketch computation. We re-use memory allocated
|
||||
|
@ -312,12 +348,6 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
memset(fplist, 0, fplist_sz);
|
||||
reset_heap(&heap, fplist_sz/2);
|
||||
}
|
||||
length = offset;
|
||||
last_offset = 0;
|
||||
blknum = 0;
|
||||
ctx->valid = 0;
|
||||
cur_roll_checksum = 0;
|
||||
cur_sketch = 0;
|
||||
|
||||
/*
|
||||
* If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
|
||||
|
@ -434,6 +464,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
}
|
||||
}
|
||||
|
||||
process_blocks:
|
||||
DEBUG_STAT_EN(printf("Original size: %lld, blknum: %u\n", *size, blknum));
|
||||
// If we found at least a few chunks, perform dedup.
|
||||
if (blknum > 2) {
|
||||
|
@ -701,7 +732,7 @@ cont:
|
|||
}
|
||||
|
||||
void
|
||||
rabin_update_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_sz_cmp)
|
||||
update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_sz_cmp)
|
||||
{
|
||||
ssize_t *entries;
|
||||
|
||||
|
@ -712,7 +743,7 @@ rabin_update_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_s
|
|||
}
|
||||
|
||||
void
|
||||
rabin_parse_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
|
||||
parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
|
||||
ssize_t *rabin_data_sz, ssize_t *dedupe_index_sz_cmp,
|
||||
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size)
|
||||
{
|
||||
|
@ -730,7 +761,7 @@ rabin_parse_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
|
|||
}
|
||||
|
||||
void
|
||||
rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
|
||||
dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size)
|
||||
{
|
||||
uint32_t blknum, blk, oblk, len;
|
||||
uint32_t *dedupe_index;
|
||||
|
@ -738,7 +769,7 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
|
|||
ssize_t dedupe_index_sz, pos1, i;
|
||||
uchar_t *pos2;
|
||||
|
||||
rabin_parse_hdr(buf, &blknum, &dedupe_index_sz, &data_sz, &indx_cmp, &data_sz_cmp, &deduped_sz);
|
||||
parse_dedupe_hdr(buf, &blknum, &dedupe_index_sz, &data_sz, &indx_cmp, &data_sz_cmp, &deduped_sz);
|
||||
dedupe_index = (uint32_t *)(buf + RABIN_HDR_SIZE);
|
||||
pos1 = dedupe_index_sz + RABIN_HDR_SIZE;
|
||||
pos2 = ctx->cbuf;
|
||||
|
@ -828,7 +859,7 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
|
|||
* TODO: Consolidate rabin dedup and compression/decompression in functions here rather than
|
||||
* messy code in main program.
|
||||
int
|
||||
rabin_compress(rabin_context_t *ctx, uchar_t *from, ssize_t fromlen, uchar_t *to, ssize_t *tolen,
|
||||
rabin_compress(dedupe_context_t *ctx, uchar_t *from, ssize_t fromlen, uchar_t *to, ssize_t *tolen,
|
||||
int level, char chdr, void *data, compress_func_ptr cmp)
|
||||
{
|
||||
}
|
||||
|
|
|
@ -150,21 +150,21 @@ typedef struct {
|
|||
short valid;
|
||||
void *lzma_data;
|
||||
int level, delta_flag, fixed_flag;
|
||||
} rabin_context_t;
|
||||
} dedupe_context_t;
|
||||
|
||||
extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize,
|
||||
extern dedupe_context_t *create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize,
|
||||
int rab_blk_sz, const char *algo, int delta_flag, int fixed_flag);
|
||||
extern void destroy_rabin_context(rabin_context_t *ctx);
|
||||
extern unsigned int rabin_dedup(rabin_context_t *ctx, unsigned char *buf,
|
||||
extern void destroy_dedupe_context(dedupe_context_t *ctx);
|
||||
extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf,
|
||||
ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
|
||||
extern void rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size);
|
||||
extern void rabin_parse_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_index_sz,
|
||||
extern void dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size);
|
||||
extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_index_sz,
|
||||
ssize_t *rabin_data_sz, ssize_t *rabin_index_sz_cmp,
|
||||
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size);
|
||||
extern void rabin_update_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp,
|
||||
extern void update_dedupe_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp,
|
||||
ssize_t rabin_data_sz_cmp);
|
||||
extern void reset_rabin_context(rabin_context_t *ctx);
|
||||
extern uint32_t rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
|
||||
extern void reset_dedupe_context(dedupe_context_t *ctx);
|
||||
extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
|
||||
int delta_flag);
|
||||
|
||||
#endif /* _RABIN_POLY_H_ */
|
||||
|
|
|
@ -223,7 +223,7 @@ Read_Adjusted(int fd, uchar_t *buf, size_t count, ssize_t *rabin_count, void *ct
|
|||
{
|
||||
char *buf2;
|
||||
ssize_t rcount;
|
||||
rabin_context_t *rctx = (rabin_context_t *)ctx;
|
||||
dedupe_context_t *rctx = (dedupe_context_t *)ctx;
|
||||
|
||||
if (!ctx) return (Read(fd, buf, count));
|
||||
buf2 = buf;
|
||||
|
@ -235,7 +235,7 @@ Read_Adjusted(int fd, uchar_t *buf, size_t count, ssize_t *rabin_count, void *ct
|
|||
if (rcount > 0) {
|
||||
rcount += *rabin_count;
|
||||
if (rcount == count)
|
||||
rabin_dedup(rctx, buf, &rcount, 0, rabin_count);
|
||||
dedupe_compress(rctx, buf, &rcount, 0, rabin_count);
|
||||
else
|
||||
*rabin_count = 0;
|
||||
} else {
|
||||
|
|
Loading…
Reference in a new issue