Updates to Rabin-based dedup.

Change the command-line option for Rabin deduplication from -r to -D.
This commit is contained in:
Moinak Ghosh 2012-06-29 23:45:06 +05:30
parent cbf9728278
commit f9c3644459
3 changed files with 54 additions and 13 deletions

11
main.c
View file

@ -114,7 +114,8 @@ usage(void)
" %s -d <compressed file> <target file>\n" " %s -d <compressed file> <target file>\n"
"3) To operate as a pipe, read from stdin and write to stdout:\n" "3) To operate as a pipe, read from stdin and write to stdout:\n"
" %s -p ...\n" " %s -p ...\n"
"4) Rabin Deduplication: Work in progress.\n" "4) Attempt Rabin fingerprinting based deduplication on chunks:\n"
" %s -D ...\n"
"5) Number of threads can optionally be specified: -t <1 - 256 count>\n" "5) Number of threads can optionally be specified: -t <1 - 256 count>\n"
"6) Pass '-M' to display memory allocator statistics\n" "6) Pass '-M' to display memory allocator statistics\n"
"7) Pass '-C' to display compression statistics\n\n", "7) Pass '-C' to display compression statistics\n\n",
@ -1103,7 +1104,7 @@ main(int argc, char *argv[])
level = 6; level = 6;
slab_init(); slab_init();
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCr")) != -1) { while ((opt = getopt(argc, argv, "dc:s:l:pt:MCD")) != -1) {
int ovr; int ovr;
switch (opt) { switch (opt) {
@ -1155,9 +1156,9 @@ main(int argc, char *argv[])
hide_cmp_stats = 0; hide_cmp_stats = 0;
break; break;
//case 'r': case 'D':
//enable_rabin_scan = 1; enable_rabin_scan = 1;
//break; break;
case '?': case '?':
default: default:

View file

@ -204,6 +204,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
ctx->blocks[blknum].index = blknum; // Need to store for sorting ctx->blocks[blknum].index = blknum; // Need to store for sorting
ctx->blocks[blknum].checksum = ctx->cur_checksum; ctx->blocks[blknum].checksum = ctx->cur_checksum;
ctx->blocks[blknum].length = length; ctx->blocks[blknum].length = length;
ctx->blocks[blknum].refcount = 0;
blknum++; blknum++;
ctx->cur_checksum = 0; ctx->cur_checksum = 0;
@ -219,7 +220,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
ssize_t pos, matches; ssize_t pos, matches;
int valid = 1; int valid = 1;
char *tmp, *prev_offset; char *tmp, *prev_offset;
unsigned int *blkarr, prev_blk; unsigned int *blkarr, prev_index, prev_blk;
// Insert the last left-over trailing bytes, if any, into a block. // Insert the last left-over trailing bytes, if any, into a block.
if (last_offset < *size) { if (last_offset < *size) {
@ -255,6 +256,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
* to be considered identical. * to be considered identical.
* The block index in the chunk is initialized with pointers into the * The block index in the chunk is initialized with pointers into the
* sorted block array. * sorted block array.
* A reference count is maintained for each block that other, identical
* blocks point to. This helps when merging non-duplicate blocks later.
*/ */
for (blk = 0; blk < blknum; blk++) { for (blk = 0; blk < blknum; blk++) {
blkarr[ctx->blocks[blk].index] = blk; blkarr[ctx->blocks[blk].index] = blk;
@ -263,7 +266,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
ctx->blocks[blk].length == prev_length && ctx->blocks[blk].length == prev_length &&
memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) { memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) {
ctx->blocks[blk].length = 0; ctx->blocks[blk].length = 0;
ctx->blocks[blk].index = prev_blk; ctx->blocks[blk].index = prev_index;
(ctx->blocks[prev_blk].refcount)++;
matches += prev_length; matches += prev_length;
continue; continue;
} }
@ -271,7 +275,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
prev_offset = buf1 + ctx->blocks[blk].offset; prev_offset = buf1 + ctx->blocks[blk].offset;
prev_cksum = ctx->blocks[blk].checksum; prev_cksum = ctx->blocks[blk].checksum;
prev_length = ctx->blocks[blk].length; prev_length = ctx->blocks[blk].length;
prev_blk = ctx->blocks[blk].index; prev_index = ctx->blocks[blk].index;
prev_blk = blk;
} }
if (matches < overhead) { if (matches < overhead) {
@ -285,6 +290,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
* This way we can differentiate between a unique block length entry and a * This way we can differentiate between a unique block length entry and a
* pointer to another block without needing a separate flag. * pointer to another block without needing a separate flag.
*/ */
prev_index = 0;
prev_length = 0;
for (blk = 0; blk < blknum; blk++) { for (blk = 0; blk < blknum; blk++) {
rabin_blockentry_t *be; rabin_blockentry_t *be;
@ -301,8 +308,37 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
prev_offset = buf1 + be->offset; prev_offset = buf1 + be->offset;
memcpy(ctx->cbuf + pos, prev_offset, be->length); memcpy(ctx->cbuf + pos, prev_offset, be->length);
pos += be->length; pos += be->length;
/*
* Update Index entry with the length. Also try to merge runs
* of unique (non-duplicate) blocks into a single block entry
* as long as the total length does not exceed max block size.
*/
if (prev_index == 0) {
if (be->refcount == 0) {
prev_index = blk;
prev_length = be->length;
}
blkarr[blk] = htonl(be->length); blkarr[blk] = htonl(be->length);
} else { } else {
if (be->refcount > 0) {
prev_index = 0;
prev_length = 0;
blkarr[blk] = htonl(be->length);
} else {
if (prev_length + be->length <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) {
prev_length += be->length;
blkarr[prev_index] = htonl(prev_length);
blkarr[blk] = 0;
} else {
prev_index = 0;
prev_length = 0;
blkarr[blk] = htonl(be->length);
}
}
}
} else {
prev_index = 0;
prev_length = 0;
blkarr[blk] = htonl(RAB_POLYNOMIAL_MAX_BLOCK_SIZE + be->index + 1); blkarr[blk] = htonl(RAB_POLYNOMIAL_MAX_BLOCK_SIZE + be->index + 1);
} }
} }
@ -337,7 +373,11 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
for (blk = 0; blk < blknum; blk++) { for (blk = 0; blk < blknum; blk++) {
len = ntohl(blkarr[blk]); len = ntohl(blkarr[blk]);
if (len <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) { if (len == 0) {
ctx->blocks[blk].length = 0;
ctx->blocks[blk].index = 0;
} else if (len <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) {
ctx->blocks[blk].length = len; ctx->blocks[blk].length = len;
ctx->blocks[blk].offset = pos1; ctx->blocks[blk].offset = pos1;
pos1 += len; pos1 += len;
@ -347,6 +387,7 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
} }
} }
for (blk = 0; blk < blknum; blk++) { for (blk = 0; blk < blknum; blk++) {
if (ctx->blocks[blk].length == 0 && ctx->blocks[blk].index == 0) continue;
if (ctx->blocks[blk].length > 0) { if (ctx->blocks[blk].length > 0) {
len = ctx->blocks[blk].length; len = ctx->blocks[blk].length;
pos1 = ctx->blocks[blk].offset; pos1 = ctx->blocks[blk].offset;

View file

@ -68,14 +68,12 @@
// 1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT = Average Rabin Chunk Size // 1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT = Average Rabin Chunk Size
// So we are always looking at power of 2 chunk sizes to avoid doing a modulus // So we are always looking at power of 2 chunk sizes to avoid doing a modulus
// //
// A value of 12 below gives avg block size of 4096 bytes
//
#define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT 12 #define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT 12
#define RAB_POLYNOMIAL_AVG_BLOCK_SIZE (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT) #define RAB_POLYNOMIAL_AVG_BLOCK_SIZE (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT)
#define RAB_POLYNOMIAL_AVG_BLOCK_MASK (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1) #define RAB_POLYNOMIAL_AVG_BLOCK_MASK (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1)
#define RAB_POLYNOMIAL_MIN_BLOCK_SIZE (4096) #define RAB_POLYNOMIAL_MIN_BLOCK_SIZE (4096)
#define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024) #define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024)
#define RAB_POLYNOMIAL_WIN_SIZE 32 #define RAB_POLYNOMIAL_WIN_SIZE 31
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 17 #define RAB_POLYNOMIAL_MIN_WIN_SIZE 17
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 63 #define RAB_POLYNOMIAL_MAX_WIN_SIZE 63
@ -84,6 +82,7 @@ typedef struct {
uint64_t checksum; uint64_t checksum;
unsigned int index; unsigned int index;
unsigned int length; unsigned int length;
unsigned short refcount;
} rabin_blockentry_t; } rabin_blockentry_t;
// An entry in the Rabin block array in the chunk. // An entry in the Rabin block array in the chunk.