Updates to Rabin-based Dedup.
Change command line option.
This commit is contained in:
parent
cbf9728278
commit
f9c3644459
3 changed files with 54 additions and 13 deletions
11
main.c
11
main.c
|
@ -114,7 +114,8 @@ usage(void)
|
|||
" %s -d <compressed file> <target file>\n"
|
||||
"3) To operate as a pipe, read from stdin and write to stdout:\n"
|
||||
" %s -p ...\n"
|
||||
"4) Rabin Deduplication: Work in progress.\n"
|
||||
"4) Attempt Rabin fingerprinting based deduplication on chunks:\n"
|
||||
" %s -D ...\n"
|
||||
"5) Number of threads can optionally be specified: -t <1 - 256 count>\n"
|
||||
"6) Pass '-M' to display memory allocator statistics\n"
|
||||
"7) Pass '-C' to display compression statistics\n\n",
|
||||
|
@ -1103,7 +1104,7 @@ main(int argc, char *argv[])
|
|||
level = 6;
|
||||
slab_init();
|
||||
|
||||
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCr")) != -1) {
|
||||
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCD")) != -1) {
|
||||
int ovr;
|
||||
|
||||
switch (opt) {
|
||||
|
@ -1155,9 +1156,9 @@ main(int argc, char *argv[])
|
|||
hide_cmp_stats = 0;
|
||||
break;
|
||||
|
||||
//case 'r':
|
||||
//enable_rabin_scan = 1;
|
||||
//break;
|
||||
case 'D':
|
||||
enable_rabin_scan = 1;
|
||||
break;
|
||||
|
||||
case '?':
|
||||
default:
|
||||
|
|
|
@ -204,6 +204,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
|||
ctx->blocks[blknum].index = blknum; // Need to store for sorting
|
||||
ctx->blocks[blknum].checksum = ctx->cur_checksum;
|
||||
ctx->blocks[blknum].length = length;
|
||||
ctx->blocks[blknum].refcount = 0;
|
||||
|
||||
blknum++;
|
||||
ctx->cur_checksum = 0;
|
||||
|
@ -219,7 +220,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
|||
ssize_t pos, matches;
|
||||
int valid = 1;
|
||||
char *tmp, *prev_offset;
|
||||
unsigned int *blkarr, prev_blk;
|
||||
unsigned int *blkarr, prev_index, prev_blk;
|
||||
|
||||
// Insert the last left-over trailing bytes, if any, into a block.
|
||||
if (last_offset < *size) {
|
||||
|
@ -255,6 +256,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
|||
* to be considered identical.
|
||||
* The block index in the chunk is initialized with pointers into the
|
||||
* sorted block array.
|
||||
* A reference count is maintained for blocks that are similar to other
|
||||
* blocks. This helps in non-duplicate block merging later.
|
||||
*/
|
||||
for (blk = 0; blk < blknum; blk++) {
|
||||
blkarr[ctx->blocks[blk].index] = blk;
|
||||
|
@ -263,7 +266,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
|||
ctx->blocks[blk].length == prev_length &&
|
||||
memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) {
|
||||
ctx->blocks[blk].length = 0;
|
||||
ctx->blocks[blk].index = prev_blk;
|
||||
ctx->blocks[blk].index = prev_index;
|
||||
(ctx->blocks[prev_blk].refcount)++;
|
||||
matches += prev_length;
|
||||
continue;
|
||||
}
|
||||
|
@ -271,7 +275,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
|||
prev_offset = buf1 + ctx->blocks[blk].offset;
|
||||
prev_cksum = ctx->blocks[blk].checksum;
|
||||
prev_length = ctx->blocks[blk].length;
|
||||
prev_blk = ctx->blocks[blk].index;
|
||||
prev_index = ctx->blocks[blk].index;
|
||||
prev_blk = blk;
|
||||
}
|
||||
|
||||
if (matches < overhead) {
|
||||
|
@ -285,6 +290,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
|||
* This way we can differentiate between a unique block length entry and a
|
||||
* pointer to another block without needing a separate flag.
|
||||
*/
|
||||
prev_index = 0;
|
||||
prev_length = 0;
|
||||
for (blk = 0; blk < blknum; blk++) {
|
||||
rabin_blockentry_t *be;
|
||||
|
||||
|
@ -301,8 +308,37 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
|||
prev_offset = buf1 + be->offset;
|
||||
memcpy(ctx->cbuf + pos, prev_offset, be->length);
|
||||
pos += be->length;
|
||||
blkarr[blk] = htonl(be->length);
|
||||
/*
|
||||
* Update Index entry with the length. Also try to merge runs
|
||||
* of unique (non-duplicate) blocks into a single block entry
|
||||
* as long as the total length does not exceed max block size.
|
||||
*/
|
||||
if (prev_index == 0) {
|
||||
if (be->refcount == 0) {
|
||||
prev_index = blk;
|
||||
prev_length = be->length;
|
||||
}
|
||||
blkarr[blk] = htonl(be->length);
|
||||
} else {
|
||||
if (be->refcount > 0) {
|
||||
prev_index = 0;
|
||||
prev_length = 0;
|
||||
blkarr[blk] = htonl(be->length);
|
||||
} else {
|
||||
if (prev_length + be->length <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) {
|
||||
prev_length += be->length;
|
||||
blkarr[prev_index] = htonl(prev_length);
|
||||
blkarr[blk] = 0;
|
||||
} else {
|
||||
prev_index = 0;
|
||||
prev_length = 0;
|
||||
blkarr[blk] = htonl(be->length);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
prev_index = 0;
|
||||
prev_length = 0;
|
||||
blkarr[blk] = htonl(RAB_POLYNOMIAL_MAX_BLOCK_SIZE + be->index + 1);
|
||||
}
|
||||
}
|
||||
|
@ -337,7 +373,11 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
|
|||
|
||||
for (blk = 0; blk < blknum; blk++) {
|
||||
len = ntohl(blkarr[blk]);
|
||||
if (len <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) {
|
||||
if (len == 0) {
|
||||
ctx->blocks[blk].length = 0;
|
||||
ctx->blocks[blk].index = 0;
|
||||
|
||||
} else if (len <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) {
|
||||
ctx->blocks[blk].length = len;
|
||||
ctx->blocks[blk].offset = pos1;
|
||||
pos1 += len;
|
||||
|
@ -347,6 +387,7 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
|
|||
}
|
||||
}
|
||||
for (blk = 0; blk < blknum; blk++) {
|
||||
if (ctx->blocks[blk].length == 0 && ctx->blocks[blk].index == 0) continue;
|
||||
if (ctx->blocks[blk].length > 0) {
|
||||
len = ctx->blocks[blk].length;
|
||||
pos1 = ctx->blocks[blk].offset;
|
||||
|
|
|
@ -68,14 +68,12 @@
|
|||
// 1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT = Average Rabin Chunk Size
|
||||
// So we are always looking at power of 2 chunk sizes to avoid doing a modulus
|
||||
//
|
||||
// A value of 12 below gives avg block size of 4096 bytes
|
||||
//
|
||||
#define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT 12
|
||||
#define RAB_POLYNOMIAL_AVG_BLOCK_SIZE (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT)
|
||||
#define RAB_POLYNOMIAL_AVG_BLOCK_MASK (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1)
|
||||
#define RAB_POLYNOMIAL_MIN_BLOCK_SIZE (4096)
|
||||
#define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024)
|
||||
#define RAB_POLYNOMIAL_WIN_SIZE 32
|
||||
#define RAB_POLYNOMIAL_WIN_SIZE 31
|
||||
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 17
|
||||
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 63
|
||||
|
||||
|
@ -84,6 +82,7 @@ typedef struct {
|
|||
uint64_t checksum;
|
||||
unsigned int index;
|
||||
unsigned int length;
|
||||
unsigned short refcount;
|
||||
} rabin_blockentry_t;
|
||||
|
||||
// An entry in the Rabin block array in the chunk.
|
||||
|
|
Loading…
Reference in a new issue