Reduce dedupe loop checks for slight speed edge.
Beginnings of Fixed-block dedupe. Update variable names for clarity.
This commit is contained in:
parent
a6b3719d89
commit
b9355a5dcc
5 changed files with 109 additions and 83 deletions
|
@ -115,7 +115,7 @@ Examples
|
|||
========
|
||||
|
||||
Compress "file.tar" using bzip2 level 6, 64MB chunk size and use 4 threads. In
|
||||
addition perform exact deduplication and delta compression prior to compression.
|
||||
addition perform identity deduplication and delta compression prior to compression.
|
||||
|
||||
pcompress -D -E -c bzip2 -l6 -s64m -t4 file.tar
|
||||
|
||||
|
@ -177,6 +177,3 @@ Normally this utility requires lots of RAM depending on compression algorithm,
|
|||
compression level, and dedupe being enabled. Larger chunk sizes can give
|
||||
better compression ratio but at the same time use more RAM.
|
||||
|
||||
In some cases, for files smaller than a gigabyte, using Delta Compression in addition
|
||||
to exact Dedupe can have a slight negative impact on LZMA compression ratio
|
||||
especially when using the large-window ultra compression levels above 10.
|
||||
|
|
65
main.c
65
main.c
|
@ -79,6 +79,7 @@ static int hide_cmp_stats = 1;
|
|||
static int enable_rabin_scan = 0;
|
||||
static int enable_delta_encode = 0;
|
||||
static int enable_rabin_split = 1;
|
||||
static int enable_fixed_scan = 0;
|
||||
static int lzp_preprocess = 0;
|
||||
static unsigned int chunk_num;
|
||||
static uint64_t largest_chunk, smallest_chunk, avg_chunk;
|
||||
|
@ -261,7 +262,7 @@ perform_decompress(void *dat)
|
|||
{
|
||||
struct cmp_data *tdat = (struct cmp_data *)dat;
|
||||
ssize_t _chunksize;
|
||||
ssize_t rabin_index_sz, rabin_data_sz, rabin_index_sz_cmp, rabin_data_sz_cmp;
|
||||
ssize_t dedupe_index_sz, rabin_data_sz, dedupe_index_sz_cmp, rabin_data_sz_cmp;
|
||||
int type, rv;
|
||||
unsigned int blknum;
|
||||
uchar_t checksum[CKSUM_MAX_BYTES];
|
||||
|
@ -302,8 +303,8 @@ redo:
|
|||
uchar_t *cmpbuf, *ubuf;
|
||||
|
||||
/* Extract various sizes from rabin header. */
|
||||
rabin_parse_hdr(cseg, &blknum, &rabin_index_sz, &rabin_data_sz,
|
||||
&rabin_index_sz_cmp, &rabin_data_sz_cmp, &_chunksize);
|
||||
rabin_parse_hdr(cseg, &blknum, &dedupe_index_sz, &rabin_data_sz,
|
||||
&dedupe_index_sz_cmp, &rabin_data_sz_cmp, &_chunksize);
|
||||
memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE);
|
||||
|
||||
/*
|
||||
|
@ -312,8 +313,8 @@ redo:
|
|||
* state/dictionary info. Since data chunk directly follows index
|
||||
* uncompressing index first corrupts the data.
|
||||
*/
|
||||
cmpbuf = cseg + RABIN_HDR_SIZE + rabin_index_sz_cmp;
|
||||
ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + rabin_index_sz;
|
||||
cmpbuf = cseg + RABIN_HDR_SIZE + dedupe_index_sz_cmp;
|
||||
ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + dedupe_index_sz;
|
||||
if (HDR & COMPRESSED) {
|
||||
if (HDR & CHUNK_FLAG_PREPROC) {
|
||||
rv = preproc_decompress(tdat->decompress, cmpbuf, rabin_data_sz_cmp,
|
||||
|
@ -334,12 +335,12 @@ redo:
|
|||
rv = 0;
|
||||
cmpbuf = cseg + RABIN_HDR_SIZE;
|
||||
ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE;
|
||||
if (rabin_index_sz >= 90) {
|
||||
if (dedupe_index_sz >= 90) {
|
||||
/* Index should be at least 90 bytes to have been compressed. */
|
||||
rv = lzma_decompress(cmpbuf, rabin_index_sz_cmp, ubuf,
|
||||
&rabin_index_sz, tdat->rctx->level, 0, tdat->rctx->lzma_data);
|
||||
rv = lzma_decompress(cmpbuf, dedupe_index_sz_cmp, ubuf,
|
||||
&dedupe_index_sz, tdat->rctx->level, 0, tdat->rctx->lzma_data);
|
||||
} else {
|
||||
memcpy(ubuf, cmpbuf, rabin_index_sz);
|
||||
memcpy(ubuf, cmpbuf, dedupe_index_sz);
|
||||
}
|
||||
} else {
|
||||
if (HDR & COMPRESSED) {
|
||||
|
@ -529,6 +530,9 @@ start_decompress(const char *filename, const char *to_filename)
|
|||
|
||||
if (flags & FLAG_DEDUP) {
|
||||
enable_rabin_scan = 1;
|
||||
|
||||
} else if (flags & FLAG_DEDUP_FIXED) {
|
||||
enable_fixed_scan = 1;
|
||||
}
|
||||
|
||||
if (flags & FLAG_SINGLE_CHUNK) {
|
||||
|
@ -580,7 +584,7 @@ start_decompress(const char *filename, const char *to_filename)
|
|||
}
|
||||
if (enable_rabin_scan) {
|
||||
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, rab_blk_size,
|
||||
algo, enable_delta_encode);
|
||||
algo, enable_delta_encode, enable_fixed_scan);
|
||||
if (tdat->rctx == NULL) {
|
||||
UNCOMP_BAIL;
|
||||
}
|
||||
|
@ -750,7 +754,7 @@ uncomp_done:
|
|||
static void *
|
||||
perform_compress(void *dat) {
|
||||
struct cmp_data *tdat = (struct cmp_data *)dat;
|
||||
typeof (tdat->chunksize) _chunksize, len_cmp, rabin_index_sz, index_size_cmp;
|
||||
typeof (tdat->chunksize) _chunksize, len_cmp, dedupe_index_sz, index_size_cmp;
|
||||
int type, rv;
|
||||
uchar_t *compressed_chunk;
|
||||
ssize_t rbytes;
|
||||
|
@ -780,7 +784,7 @@ redo:
|
|||
rctx = tdat->rctx;
|
||||
reset_rabin_context(tdat->rctx);
|
||||
rctx->cbuf = tdat->uncompressed_chunk;
|
||||
rabin_index_sz = rabin_dedup(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0, NULL);
|
||||
dedupe_index_sz = rabin_dedup(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0, NULL);
|
||||
if (!rctx->valid) {
|
||||
memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes);
|
||||
tdat->rbytes = rbytes;
|
||||
|
@ -798,32 +802,32 @@ redo:
|
|||
* reducing compression effectiveness of the data chunk. So we separate them.
|
||||
*/
|
||||
if (enable_rabin_scan && tdat->rctx->valid) {
|
||||
_chunksize = tdat->rbytes - rabin_index_sz - RABIN_HDR_SIZE;
|
||||
index_size_cmp = rabin_index_sz;
|
||||
_chunksize = tdat->rbytes - dedupe_index_sz - RABIN_HDR_SIZE;
|
||||
index_size_cmp = dedupe_index_sz;
|
||||
|
||||
rv = 0;
|
||||
if (rabin_index_sz >= 90) {
|
||||
if (dedupe_index_sz >= 90) {
|
||||
/* Compress index if it is at least 90 bytes. */
|
||||
rv = lzma_compress(tdat->uncompressed_chunk + RABIN_HDR_SIZE,
|
||||
rabin_index_sz, compressed_chunk + RABIN_HDR_SIZE,
|
||||
dedupe_index_sz, compressed_chunk + RABIN_HDR_SIZE,
|
||||
&index_size_cmp, tdat->rctx->level, 255, tdat->rctx->lzma_data);
|
||||
} else {
|
||||
memcpy(compressed_chunk + RABIN_HDR_SIZE,
|
||||
tdat->uncompressed_chunk + RABIN_HDR_SIZE, rabin_index_sz);
|
||||
tdat->uncompressed_chunk + RABIN_HDR_SIZE, dedupe_index_sz);
|
||||
}
|
||||
|
||||
index_size_cmp += RABIN_HDR_SIZE;
|
||||
rabin_index_sz += RABIN_HDR_SIZE;
|
||||
dedupe_index_sz += RABIN_HDR_SIZE;
|
||||
if (rv == 0) {
|
||||
memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE);
|
||||
/* Compress data chunk. */
|
||||
if (lzp_preprocess) {
|
||||
rv = preproc_compress(tdat->compress,
|
||||
tdat->uncompressed_chunk + rabin_index_sz,
|
||||
tdat->uncompressed_chunk + dedupe_index_sz,
|
||||
_chunksize, compressed_chunk + index_size_cmp, &_chunksize,
|
||||
tdat->level, 0, tdat->data);
|
||||
} else {
|
||||
rv = tdat->compress(tdat->uncompressed_chunk + rabin_index_sz,
|
||||
rv = tdat->compress(tdat->uncompressed_chunk + dedupe_index_sz,
|
||||
_chunksize, compressed_chunk + index_size_cmp, &_chunksize,
|
||||
tdat->level, 0, tdat->data);
|
||||
}
|
||||
|
@ -831,7 +835,7 @@ redo:
|
|||
/* Can't compress data just retain as-is. */
|
||||
if (rv < 0)
|
||||
memcpy(compressed_chunk + index_size_cmp,
|
||||
tdat->uncompressed_chunk + rabin_index_sz, _chunksize);
|
||||
tdat->uncompressed_chunk + dedupe_index_sz, _chunksize);
|
||||
/* Now update rabin header with the compressed sizes. */
|
||||
rabin_update_hdr(compressed_chunk, index_size_cmp - RABIN_HDR_SIZE,
|
||||
_chunksize);
|
||||
|
@ -1005,8 +1009,11 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
}
|
||||
|
||||
flags = 0;
|
||||
if (enable_rabin_scan) {
|
||||
if (enable_rabin_scan || enable_fixed_scan) {
|
||||
if (enable_rabin_scan)
|
||||
flags |= FLAG_DEDUP;
|
||||
else
|
||||
flags |= FLAG_DEDUP_FIXED;
|
||||
/* Additional scratch space for dedup arrays. */
|
||||
compressed_chunksize += (rabin_buf_extra(chunksize, 0, algo,
|
||||
enable_delta_encode) - (compressed_chunksize - chunksize));
|
||||
|
@ -1132,7 +1139,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
}
|
||||
if (enable_rabin_scan) {
|
||||
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, rab_blk_size,
|
||||
algo, enable_delta_encode);
|
||||
algo, enable_delta_encode, enable_fixed_scan);
|
||||
if (tdat->rctx == NULL) {
|
||||
COMP_BAIL;
|
||||
}
|
||||
|
@ -1197,7 +1204,8 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
* Read the first chunk into a spare buffer (a simple double-buffering).
|
||||
*/
|
||||
if (enable_rabin_split) {
|
||||
rctx = create_rabin_context(chunksize, 0, 0, algo, enable_delta_encode);
|
||||
rctx = create_rabin_context(chunksize, 0, 0, algo, enable_delta_encode,
|
||||
enable_fixed_scan);
|
||||
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
|
||||
} else {
|
||||
rbytes = Read(uncompfd, cread_buf, chunksize);
|
||||
|
@ -1589,6 +1597,10 @@ main(int argc, char *argv[])
|
|||
enable_delta_encode = 1;
|
||||
break;
|
||||
|
||||
case 'f':
|
||||
enable_fixed_scan = 1;
|
||||
break;
|
||||
|
||||
case 'L':
|
||||
lzp_preprocess = 1;
|
||||
break;
|
||||
|
@ -1634,6 +1646,11 @@ main(int argc, char *argv[])
|
|||
if (!enable_rabin_scan)
|
||||
enable_rabin_split = 0;
|
||||
|
||||
if (enable_fixed_scan && (enable_rabin_scan || enable_delta_encode)) {
|
||||
fprintf(stderr, "Rabin Deduplication and Fixed block Deduplication are mutually exclusive\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (num_rem == 0 && !pipe_mode) {
|
||||
usage(); /* At least 1 filename needed. */
|
||||
exit(1);
|
||||
|
|
|
@ -39,6 +39,7 @@ extern "C" {
|
|||
#define MIN_CHUNK 2048
|
||||
#define VERSION 3
|
||||
#define FLAG_DEDUP 1
|
||||
/*
 * Archive header flag for fixed-block dedupe. It needs its own bit: when it
 * shares the value of FLAG_DEDUP (1), a test of "flags & FLAG_DEDUP" also
 * matches fixed-block archives and the two modes cannot be told apart.
 * Bit value 2 is taken by FLAG_SINGLE_CHUNK, so the next free bit is used.
 * NOTE(review): confirm no flag defined outside this view already uses 4.
 */
#define FLAG_DEDUP_FIXED 4
|
||||
#define FLAG_SINGLE_CHUNK 2
|
||||
#define UTILITY_VERSION "0.8.1"
|
||||
|
||||
|
|
|
@ -109,7 +109,7 @@ rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_
|
|||
*/
|
||||
rabin_context_t *
|
||||
create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz,
|
||||
const char *algo, int delta_flag) {
|
||||
const char *algo, int delta_flag, int fixed_flag) {
|
||||
rabin_context_t *ctx;
|
||||
unsigned char *current_window_data;
|
||||
uint32_t i;
|
||||
|
@ -117,6 +117,11 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
|||
if (rab_blk_sz < 1 || rab_blk_sz > 5)
|
||||
rab_blk_sz = RAB_BLK_DEFAULT;
|
||||
|
||||
if (fixed_flag) {
|
||||
delta_flag = 0;
|
||||
inited = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Pre-compute a table of irreducible polynomial evaluations for each
|
||||
* possible byte value.
|
||||
|
@ -163,13 +168,18 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
|||
ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t));
|
||||
ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
|
||||
|
||||
ctx->fixed_flag = fixed_flag;
|
||||
ctx->rabin_break_patt = 0;
|
||||
ctx->delta_flag = delta_flag;
|
||||
ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
|
||||
ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1;
|
||||
ctx->rabin_poly_min_block_size = rabin_min_blksz(chunksize, rab_blk_sz, algo, delta_flag);
|
||||
ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size;
|
||||
|
||||
if (!fixed_flag)
|
||||
ctx->blknum = chunksize / ctx->rabin_poly_min_block_size;
|
||||
else
|
||||
ctx->blknum = chunksize / ctx->rabin_poly_avg_block_size;
|
||||
|
||||
if (chunksize % ctx->rabin_poly_min_block_size)
|
||||
ctx->blknum++;
|
||||
|
@ -198,7 +208,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz
|
|||
lzma_init(&(ctx->lzma_data), &(ctx->level), chunksize);
|
||||
if (!(ctx->lzma_data)) {
|
||||
fprintf(stderr,
|
||||
"Could not initialize LZMA data for rabin index, out of memory\n");
|
||||
"Could not initialize LZMA data for dedupe index, out of memory\n");
|
||||
destroy_rabin_context(ctx);
|
||||
return (NULL);
|
||||
}
|
||||
|
@ -392,7 +402,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
if ((cur_pos_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
|
||||
length >= ctx->rabin_poly_max_block_size) {
|
||||
if (ctx->blocks[blknum] == 0)
|
||||
ctx->blocks[blknum] = (rabin_blockentry_t *)slab_alloc(NULL, sizeof (rabin_blockentry_t));
|
||||
ctx->blocks[blknum] = (rabin_blockentry_t *)slab_alloc(NULL,
|
||||
sizeof (rabin_blockentry_t));
|
||||
ctx->blocks[blknum]->offset = last_offset;
|
||||
ctx->blocks[blknum]->index = blknum; // Need to store for sorting
|
||||
ctx->blocks[blknum]->length = length;
|
||||
|
@ -430,8 +441,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
ssize_t pos, matchlen, pos1;
|
||||
int valid = 1;
|
||||
char *tmp;
|
||||
uint32_t *blkarr, *trans, *rabin_index;
|
||||
ssize_t rabin_index_sz;
|
||||
uint32_t *blkarr, *trans, *dedupe_index;
|
||||
ssize_t dedupe_index_sz;
|
||||
rabin_blockentry_t *prev;
|
||||
DEBUG_STAT_EN(uint32_t delta_calls, delta_fails);
|
||||
DEBUG_STAT_EN(delta_calls = 0);
|
||||
|
@ -469,7 +480,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
last_offset = *size;
|
||||
}
|
||||
|
||||
rabin_index_sz = (ssize_t)blknum * RABIN_ENTRY_SIZE;
|
||||
dedupe_index_sz = (ssize_t)blknum * RABIN_ENTRY_SIZE;
|
||||
|
||||
/*
|
||||
* Now sort the block array based on checksums. This will bring virtually
|
||||
|
@ -477,14 +488,14 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
* our checksum is. We are using a maximal super-sketch value.
|
||||
*/
|
||||
qsort(ctx->blocks, blknum, sizeof (rabin_blockentry_t *), cmpblks);
|
||||
rabin_index = (uint32_t *)(ctx->cbuf + RABIN_HDR_SIZE);
|
||||
dedupe_index = (uint32_t *)(ctx->cbuf + RABIN_HDR_SIZE);
|
||||
|
||||
/*
|
||||
* We need 2 temporary arrays. We just use available space in the last
|
||||
* portion of the buffer that will hold the deduped segment.
|
||||
*/
|
||||
blkarr = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - (rabin_index_sz * 2 + 1));
|
||||
trans = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - (rabin_index_sz + 1));
|
||||
blkarr = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - (dedupe_index_sz * 2 + 1));
|
||||
trans = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - (dedupe_index_sz + 1));
|
||||
matchlen = 0;
|
||||
|
||||
/*
|
||||
|
@ -497,12 +508,13 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
* A reference count is maintained for blocks that are similar with other
|
||||
* blocks. This helps in non-duplicate block merging later.
|
||||
*/
|
||||
for (blk = 0; blk < blknum; blk++) {
|
||||
blkarr[ctx->blocks[0]->index] = 0;
|
||||
prev = ctx->blocks[0];
|
||||
for (blk = 1; blk < blknum; blk++) {
|
||||
blkarr[ctx->blocks[blk]->index] = blk;
|
||||
|
||||
if (blk > 0 && ctx->blocks[blk]->cksum_n_offset == prev->cksum_n_offset &&
|
||||
if (ctx->blocks[blk]->crc == prev->crc &&
|
||||
ctx->blocks[blk]->length == prev->length &&
|
||||
ctx->blocks[blk]->crc == prev->crc &&
|
||||
memcmp(buf1 + prev->offset, buf1 + ctx->blocks[blk]->offset,
|
||||
prev->length) == 0)
|
||||
{
|
||||
|
@ -526,8 +538,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
*/
|
||||
if (prev != NULL && ctx->blocks[blk]->ref == 0 &&
|
||||
ctx->blocks[blk]->cksum_n_offset == prev->cksum_n_offset &&
|
||||
ctx->blocks[blk]->length - prev->length < 512 &&
|
||||
ctx->blocks[blk]->mean_n_length == prev->mean_n_length
|
||||
ctx->blocks[blk]->length - prev->length < 512
|
||||
) {
|
||||
ctx->blocks[blk]->index = prev->index;
|
||||
ctx->blocks[blk]->similar = SIMILAR_PARTIAL;
|
||||
|
@ -538,7 +549,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
prev = ctx->blocks[blk];
|
||||
}
|
||||
}
|
||||
if (matchlen < rabin_index_sz) {
|
||||
if (matchlen < dedupe_index_sz) {
|
||||
ctx->valid = 0;
|
||||
return;
|
||||
}
|
||||
|
@ -569,7 +580,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
prev_index = pos;
|
||||
prev_length = be->length;
|
||||
}
|
||||
rabin_index[pos] = be->length;
|
||||
dedupe_index[pos] = be->length;
|
||||
ctx->blocks[pos]->cksum_n_offset = be->offset;
|
||||
trans[blk] = pos;
|
||||
pos++;
|
||||
|
@ -577,18 +588,18 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
if (be->ref > 0) {
|
||||
prev_index = 0;
|
||||
prev_length = 0;
|
||||
rabin_index[pos] = be->length;
|
||||
dedupe_index[pos] = be->length;
|
||||
ctx->blocks[pos]->cksum_n_offset = be->offset;
|
||||
trans[blk] = pos;
|
||||
pos++;
|
||||
} else {
|
||||
if (prev_length + be->length <= RABIN_MAX_BLOCK_SIZE) {
|
||||
prev_length += be->length;
|
||||
rabin_index[prev_index] = prev_length;
|
||||
dedupe_index[prev_index] = prev_length;
|
||||
} else {
|
||||
prev_index = 0;
|
||||
prev_length = 0;
|
||||
rabin_index[pos] = be->length;
|
||||
dedupe_index[pos] = be->length;
|
||||
ctx->blocks[pos]->cksum_n_offset = be->offset;
|
||||
trans[blk] = pos;
|
||||
pos++;
|
||||
|
@ -599,14 +610,14 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
prev_index = 0;
|
||||
prev_length = 0;
|
||||
ctx->blocks[pos]->cksum_n_offset = be->offset;
|
||||
ctx->blocks[pos]->mean_n_length = be->length;
|
||||
ctx->blocks[pos]->alt_length = be->length;
|
||||
trans[blk] = pos;
|
||||
|
||||
if (be->similar == SIMILAR_EXACT) {
|
||||
rabin_index[pos] = (blkarr[be->index] | RABIN_INDEX_FLAG) &
|
||||
dedupe_index[pos] = (blkarr[be->index] | RABIN_INDEX_FLAG) &
|
||||
CLEAR_SIMILARITY_FLAG;
|
||||
} else {
|
||||
rabin_index[pos] = blkarr[be->index] | RABIN_INDEX_FLAG |
|
||||
dedupe_index[pos] = blkarr[be->index] | RABIN_INDEX_FLAG |
|
||||
SET_SIMILARITY_FLAG;
|
||||
}
|
||||
pos++;
|
||||
|
@ -617,8 +628,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
* Final pass, copy the data and perform delta encoding.
|
||||
*/
|
||||
blknum = pos;
|
||||
rabin_index_sz = (ssize_t)pos * RABIN_ENTRY_SIZE;
|
||||
pos1 = rabin_index_sz + RABIN_HDR_SIZE;
|
||||
dedupe_index_sz = (ssize_t)pos * RABIN_ENTRY_SIZE;
|
||||
pos1 = dedupe_index_sz + RABIN_HDR_SIZE;
|
||||
for (blk = 0; blk < blknum; blk++) {
|
||||
uchar_t *old, *new;
|
||||
int32_t bsz;
|
||||
|
@ -631,37 +642,37 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
valid = 0;
|
||||
break;
|
||||
}
|
||||
if (rabin_index[blk] & RABIN_INDEX_FLAG) {
|
||||
j = rabin_index[blk] & RABIN_INDEX_VALUE;
|
||||
if (dedupe_index[blk] & RABIN_INDEX_FLAG) {
|
||||
j = dedupe_index[blk] & RABIN_INDEX_VALUE;
|
||||
i = ctx->blocks[j]->index;
|
||||
|
||||
if (rabin_index[blk] & GET_SIMILARITY_FLAG) {
|
||||
if (dedupe_index[blk] & GET_SIMILARITY_FLAG) {
|
||||
old = buf1 + ctx->blocks[j]->offset;
|
||||
new = buf1 + ctx->blocks[blk]->cksum_n_offset;
|
||||
matchlen = ctx->real_chunksize - *size;
|
||||
DEBUG_STAT_EN(delta_calls++);
|
||||
|
||||
bsz = bsdiff(old, ctx->blocks[j]->length, new,
|
||||
ctx->blocks[blk]->mean_n_length, ctx->cbuf + pos1,
|
||||
ctx->blocks[blk]->alt_length, ctx->cbuf + pos1,
|
||||
buf1 + *size, matchlen);
|
||||
if (bsz == 0) {
|
||||
DEBUG_STAT_EN(delta_fails++);
|
||||
memcpy(ctx->cbuf + pos1, new, ctx->blocks[blk]->mean_n_length);
|
||||
rabin_index[blk] = htonl(ctx->blocks[blk]->mean_n_length);
|
||||
pos1 += ctx->blocks[blk]->mean_n_length;
|
||||
memcpy(ctx->cbuf + pos1, new, ctx->blocks[blk]->alt_length);
|
||||
dedupe_index[blk] = htonl(ctx->blocks[blk]->alt_length);
|
||||
pos1 += ctx->blocks[blk]->alt_length;
|
||||
} else {
|
||||
rabin_index[blk] = htonl(trans[i] |
|
||||
dedupe_index[blk] = htonl(trans[i] |
|
||||
RABIN_INDEX_FLAG | SET_SIMILARITY_FLAG);
|
||||
pos1 += bsz;
|
||||
}
|
||||
} else {
|
||||
rabin_index[blk] = htonl(trans[i] | RABIN_INDEX_FLAG);
|
||||
dedupe_index[blk] = htonl(trans[i] | RABIN_INDEX_FLAG);
|
||||
}
|
||||
} else {
|
||||
memcpy(ctx->cbuf + pos1, buf1 + ctx->blocks[blk]->cksum_n_offset,
|
||||
rabin_index[blk]);
|
||||
pos1 += rabin_index[blk];
|
||||
rabin_index[blk] = htonl(rabin_index[blk]);
|
||||
dedupe_index[blk]);
|
||||
pos1 += dedupe_index[blk];
|
||||
dedupe_index[blk] = htonl(dedupe_index[blk]);
|
||||
}
|
||||
}
|
||||
cont:
|
||||
|
@ -674,7 +685,7 @@ cont:
|
|||
entries = (ssize_t *)cbuf;
|
||||
entries[0] = htonll(*size);
|
||||
entries[1] = 0;
|
||||
entries[2] = htonll(pos1 - rabin_index_sz - RABIN_HDR_SIZE);
|
||||
entries[2] = htonll(pos1 - dedupe_index_sz - RABIN_HDR_SIZE);
|
||||
*size = pos1;
|
||||
ctx->valid = 1;
|
||||
DEBUG_STAT_EN(printf("Deduped size: %lld, blknum: %u, delta_calls: %u, delta_fails: %u\n",
|
||||
|
@ -683,26 +694,26 @@ cont:
|
|||
* Remaining header entries: size of compressed index and size of
|
||||
* compressed data are inserted later via rabin_update_hdr, after actual compression!
|
||||
*/
|
||||
return (rabin_index_sz);
|
||||
return (dedupe_index_sz);
|
||||
}
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
void
|
||||
rabin_update_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp, ssize_t rabin_data_sz_cmp)
|
||||
rabin_update_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_sz_cmp)
|
||||
{
|
||||
ssize_t *entries;
|
||||
|
||||
buf += sizeof (uint32_t);
|
||||
entries = (ssize_t *)buf;
|
||||
entries[1] = htonll(rabin_index_sz_cmp);
|
||||
entries[1] = htonll(dedupe_index_sz_cmp);
|
||||
entries[3] = htonll(rabin_data_sz_cmp);
|
||||
}
|
||||
|
||||
void
|
||||
rabin_parse_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *rabin_index_sz,
|
||||
ssize_t *rabin_data_sz, ssize_t *rabin_index_sz_cmp,
|
||||
rabin_parse_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
|
||||
ssize_t *rabin_data_sz, ssize_t *dedupe_index_sz_cmp,
|
||||
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size)
|
||||
{
|
||||
ssize_t *entries;
|
||||
|
@ -712,8 +723,8 @@ rabin_parse_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *rabin_index_sz,
|
|||
|
||||
entries = (ssize_t *)buf;
|
||||
*rabin_data_sz = ntohll(entries[0]);
|
||||
*rabin_index_sz = (ssize_t)(*blknum) * RABIN_ENTRY_SIZE;
|
||||
*rabin_index_sz_cmp = ntohll(entries[1]);
|
||||
*dedupe_index_sz = (ssize_t)(*blknum) * RABIN_ENTRY_SIZE;
|
||||
*dedupe_index_sz_cmp = ntohll(entries[1]);
|
||||
*rabin_deduped_size = ntohll(entries[2]);
|
||||
*rabin_data_sz_cmp = ntohll(entries[3]);
|
||||
}
|
||||
|
@ -722,14 +733,14 @@ void
|
|||
rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
|
||||
{
|
||||
uint32_t blknum, blk, oblk, len;
|
||||
uint32_t *rabin_index;
|
||||
uint32_t *dedupe_index;
|
||||
ssize_t data_sz, sz, indx_cmp, data_sz_cmp, deduped_sz;
|
||||
ssize_t rabin_index_sz, pos1, i;
|
||||
ssize_t dedupe_index_sz, pos1, i;
|
||||
uchar_t *pos2;
|
||||
|
||||
rabin_parse_hdr(buf, &blknum, &rabin_index_sz, &data_sz, &indx_cmp, &data_sz_cmp, &deduped_sz);
|
||||
rabin_index = (uint32_t *)(buf + RABIN_HDR_SIZE);
|
||||
pos1 = rabin_index_sz + RABIN_HDR_SIZE;
|
||||
rabin_parse_hdr(buf, &blknum, &dedupe_index_sz, &data_sz, &indx_cmp, &data_sz_cmp, &deduped_sz);
|
||||
dedupe_index = (uint32_t *)(buf + RABIN_HDR_SIZE);
|
||||
pos1 = dedupe_index_sz + RABIN_HDR_SIZE;
|
||||
pos2 = ctx->cbuf;
|
||||
sz = 0;
|
||||
ctx->valid = 1;
|
||||
|
@ -738,7 +749,7 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
|
|||
for (blk = 0; blk < blknum; blk++) {
|
||||
if (ctx->blocks[blk] == 0)
|
||||
ctx->blocks[blk] = (rabin_blockentry_t *)slab_alloc(NULL, sizeof (rabin_blockentry_t));
|
||||
len = ntohl(rabin_index[blk]);
|
||||
len = ntohl(dedupe_index[blk]);
|
||||
if (len == 0) {
|
||||
ctx->blocks[blk]->length = 0;
|
||||
ctx->blocks[blk]->index = 0;
|
||||
|
|
|
@ -127,7 +127,7 @@
|
|||
typedef struct {
|
||||
ssize_t offset;
|
||||
uint64_t cksum_n_offset; // Dual purpose variable
|
||||
uint64_t mean_n_length; // Dual purpose variable
|
||||
uint64_t alt_length;
|
||||
uint64_t crc;
|
||||
unsigned int index;
|
||||
unsigned int length;
|
||||
|
@ -149,11 +149,11 @@ typedef struct {
|
|||
uint64_t real_chunksize;
|
||||
short valid;
|
||||
void *lzma_data;
|
||||
int level, delta_flag;
|
||||
int level, delta_flag, fixed_flag;
|
||||
} rabin_context_t;
|
||||
|
||||
extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize,
|
||||
int rab_blk_sz, const char *algo, int delta_flag);
|
||||
int rab_blk_sz, const char *algo, int delta_flag, int fixed_flag);
|
||||
extern void destroy_rabin_context(rabin_context_t *ctx);
|
||||
extern unsigned int rabin_dedup(rabin_context_t *ctx, unsigned char *buf,
|
||||
ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
|
||||
|
|
Loading…
Reference in a new issue