Updates to Rabin-based Dedup.

Change the command-line option.
Moinak Ghosh 2012-06-29 23:45:06 +05:30
parent cbf9728278
commit f9c3644459
3 changed files with 54 additions and 13 deletions

main.c

@@ -114,7 +114,8 @@ usage(void)
" %s -d <compressed file> <target file>\n"
"3) To operate as a pipe, read from stdin and write to stdout:\n"
" %s -p ...\n"
"4) Rabin Deduplication: Work in progress.\n"
"4) Attempt Rabin fingerprinting based deduplication on chunks:\n"
" %s -D ...\n"
"5) Number of threads can optionally be specified: -t <1 - 256 count>\n"
"6) Pass '-M' to display memory allocator statistics\n"
"7) Pass '-C' to display compression statistics\n\n",
@@ -1103,7 +1104,7 @@ main(int argc, char *argv[])
level = 6;
slab_init();
-while ((opt = getopt(argc, argv, "dc:s:l:pt:MCr")) != -1) {
+while ((opt = getopt(argc, argv, "dc:s:l:pt:MCD")) != -1) {
int ovr;
switch (opt) {
@@ -1155,9 +1156,9 @@ main(int argc, char *argv[])
hide_cmp_stats = 0;
break;
-//case 'r':
-//enable_rabin_scan = 1;
-//break;
+case 'D':
+enable_rabin_scan = 1;
+break;
case '?':
default:
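
With this change the dedup pass is selected explicitly with -D instead of the previously commented-out -r. A hypothetical invocation, assuming the tool is built as pcompress and using a placeholder for the compression algorithm (the -c, -l and -t options are taken from the existing getopt string above):

pcompress -D -c <algorithm> -l 6 -t 4 somefile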


@@ -204,6 +204,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
ctx->blocks[blknum].index = blknum; // Need to store for sorting
ctx->blocks[blknum].checksum = ctx->cur_checksum;
ctx->blocks[blknum].length = length;
+ctx->blocks[blknum].refcount = 0;
blknum++;
ctx->cur_checksum = 0;
@@ -219,7 +220,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
ssize_t pos, matches;
int valid = 1;
char *tmp, *prev_offset;
-unsigned int *blkarr, prev_blk;
+unsigned int *blkarr, prev_index, prev_blk;
// Insert the last left-over trailing bytes, if any, into a block.
if (last_offset < *size) {
@@ -255,6 +256,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
* to be considered identical.
* The block index in the chunk is initialized with pointers into the
* sorted block array.
+* A reference count is maintained for blocks that are similar to other
+* blocks. This helps in non-duplicate block merging later.
*/
for (blk = 0; blk < blknum; blk++) {
blkarr[ctx->blocks[blk].index] = blk;
@@ -263,7 +266,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
ctx->blocks[blk].length == prev_length &&
memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) {
ctx->blocks[blk].length = 0;
-ctx->blocks[blk].index = prev_blk;
+ctx->blocks[blk].index = prev_index;
+(ctx->blocks[prev_blk].refcount)++;
matches += prev_length;
continue;
}
@@ -271,7 +275,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
prev_offset = buf1 + ctx->blocks[blk].offset;
prev_cksum = ctx->blocks[blk].checksum;
prev_length = ctx->blocks[blk].length;
-prev_blk = ctx->blocks[blk].index;
+prev_index = ctx->blocks[blk].index;
+prev_blk = blk;
}
if (matches < overhead) {
@@ -285,6 +290,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
* This way we can differentiate between a unique block length entry and a
* pointer to another block without needing a separate flag.
*/
+prev_index = 0;
+prev_length = 0;
for (blk = 0; blk < blknum; blk++) {
rabin_blockentry_t *be;
@@ -301,8 +308,37 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
prev_offset = buf1 + be->offset;
memcpy(ctx->cbuf + pos, prev_offset, be->length);
pos += be->length;
-blkarr[blk] = htonl(be->length);
+/*
+* Update Index entry with the length. Also try to merge runs
+* of unique (non-duplicate) blocks into a single block entry
+* as long as the total length does not exceed max block size.
+*/
+if (prev_index == 0) {
+if (be->refcount == 0) {
+prev_index = blk;
+prev_length = be->length;
+}
+blkarr[blk] = htonl(be->length);
+} else {
+if (be->refcount > 0) {
+prev_index = 0;
+prev_length = 0;
+blkarr[blk] = htonl(be->length);
+} else {
+if (prev_length + be->length <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) {
+prev_length += be->length;
+blkarr[prev_index] = htonl(prev_length);
+blkarr[blk] = 0;
+} else {
+prev_index = 0;
+prev_length = 0;
+blkarr[blk] = htonl(be->length);
+}
+}
+}
} else {
+prev_index = 0;
+prev_length = 0;
blkarr[blk] = htonl(RAB_POLYNOMIAL_MAX_BLOCK_SIZE + be->index + 1);
}
}
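
A standalone sketch (not part of the commit) of the unique-run merging rule introduced in the hunk above, applied to a toy list of five 4 KB blocks where only the third is referenced by a later duplicate. The toy_block struct, the sizes and all names here are illustrative assumptions; duplicate (pointer) entries are left out for brevity, and only the rule itself (coalesce adjacent refcount == 0 blocks up to the maximum block size, marking merged entries with 0) follows the diff.

#include <stdio.h>
#include <stdint.h>

#define MAX_BLOCK_SIZE (128 * 1024)

struct toy_block {
	uint32_t length;	/* rabin block length */
	uint32_t refcount;	/* how many later blocks dedup against this one */
};

int
main(void)
{
	struct toy_block blocks[] = {
		{ 4096, 0 }, { 4096, 0 }, { 4096, 1 }, { 4096, 0 }, { 4096, 0 },
	};
	uint32_t index[5];
	int nblocks = 5, blk;
	int prev = -1;		/* first block of the current unique run, -1 = no run */
	uint32_t run_len = 0;

	for (blk = 0; blk < nblocks; blk++) {
		index[blk] = blocks[blk].length;
		if (blocks[blk].refcount > 0) {
			/* Block is referenced by a duplicate; never merge it. */
			prev = -1;
			run_len = 0;
		} else if (prev == -1) {
			/* Start a new run of unique blocks. */
			prev = blk;
			run_len = blocks[blk].length;
		} else if (run_len + blocks[blk].length <= MAX_BLOCK_SIZE) {
			/* Fold this block into the run's first index entry. */
			run_len += blocks[blk].length;
			index[prev] = run_len;
			index[blk] = 0;
		} else {
			/* Run would exceed the max block size; reset, as the diff does. */
			prev = -1;
			run_len = 0;
		}
	}
	for (blk = 0; blk < nblocks; blk++)
		printf("index[%d] = %u\n", blk, (unsigned)index[blk]);
	/* Prints 8192, 0, 4096, 8192, 0: two merged runs around the referenced block. */
	return (0);
}
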
@@ -337,7 +373,11 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
for (blk = 0; blk < blknum; blk++) {
len = ntohl(blkarr[blk]);
-if (len <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) {
+if (len == 0) {
+ctx->blocks[blk].length = 0;
+ctx->blocks[blk].index = 0;
+} else if (len <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) {
ctx->blocks[blk].length = len;
ctx->blocks[blk].offset = pos1;
pos1 += len;
@@ -347,6 +387,7 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
}
}
for (blk = 0; blk < blknum; blk++) {
+if (ctx->blocks[blk].length == 0 && ctx->blocks[blk].index == 0) continue;
if (ctx->blocks[blk].length > 0) {
len = ctx->blocks[blk].length;
pos1 = ctx->blocks[blk].offset;
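
For reference, a minimal sketch (not part of the commit) of how a single blkarr entry can be interpreted after this change. The helper name and enum are illustrative assumptions; the pointer decoding simply inverts the htonl(RAB_POLYNOMIAL_MAX_BLOCK_SIZE + be->index + 1) encoding visible above, and rabin_inverse_dedup itself may differ in detail.

#include <stdint.h>
#include <arpa/inet.h>

#define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024)

enum entry_kind { ENTRY_MERGED, ENTRY_LENGTH, ENTRY_POINTER };

static enum entry_kind
classify_entry(uint32_t raw, uint32_t *out)
{
	uint32_t val = ntohl(raw);

	if (val == 0)		/* block was folded into a preceding unique run */
		return (ENTRY_MERGED);
	if (val <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) {	/* literal block length */
		*out = val;
		return (ENTRY_LENGTH);
	}
	/* Otherwise a duplicate: a pointer to an earlier block, stored as MAX + index + 1. */
	*out = val - RAB_POLYNOMIAL_MAX_BLOCK_SIZE - 1;
	return (ENTRY_POINTER);
}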


@@ -68,14 +68,12 @@
// 1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT = Average Rabin Chunk Size
// So we are always looking at power of 2 chunk sizes to avoid doing a modulus
//
-// A value of 12 below gives avg block size of 4096 bytes
-//
#define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT 12
#define RAB_POLYNOMIAL_AVG_BLOCK_SIZE (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT)
#define RAB_POLYNOMIAL_AVG_BLOCK_MASK (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1)
#define RAB_POLYNOMIAL_MIN_BLOCK_SIZE (4096)
#define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024)
-#define RAB_POLYNOMIAL_WIN_SIZE 32
+#define RAB_POLYNOMIAL_WIN_SIZE 31
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 17
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 63
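
A minimal sketch (not part of the commit) of why the average block size is kept a power of two: the rolling Rabin fingerprint can be tested for a chunk boundary with a bitwise AND instead of a modulus. The fingerprint update itself is omitted, and the exact comparison value used by the dedup code may differ; only the mask trick is the point here.

#include <stdint.h>

#define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT	12
#define RAB_POLYNOMIAL_AVG_BLOCK_SIZE	(1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT)
#define RAB_POLYNOMIAL_AVG_BLOCK_MASK	(RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1)

static inline int
is_block_boundary(uint64_t fp)
{
	/*
	 * Equivalent to (fp % RAB_POLYNOMIAL_AVG_BLOCK_SIZE == 0) but without
	 * a division. On random input this fires about once every 4096 bytes,
	 * which is what "average block size" means above.
	 */
	return ((fp & RAB_POLYNOMIAL_AVG_BLOCK_MASK) == 0);
}
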
@@ -84,6 +82,7 @@ typedef struct {
uint64_t checksum;
unsigned int index;
unsigned int length;
+unsigned short refcount;
} rabin_blockentry_t;
// An entry in the Rabin block array in the chunk.