Change rabin index encoding scheme for better metadata compression.

This commit is contained in:
Moinak Ghosh 2012-07-02 22:08:03 +05:30
parent a1825a2305
commit a13c61e926
2 changed files with 24 additions and 4 deletions

View file

@ -101,6 +101,11 @@ create_rabin_context(uint64_t chunksize) {
if (chunksize % rabin_polynomial_min_block_size) if (chunksize % rabin_polynomial_min_block_size)
blknum++; blknum++;
if (blknum > RABIN_MAX_BLOCKS) {
	// Reject chunk sizes that would need more blocks than RABIN_MAX_BLOCKS.
	// The context is only allocated further down, so at this point there is
	// nothing to tear down: passing the not-yet-assigned ctx to
	// destroy_rabin_context() would hand it a pointer this function never set.
	fprintf(stderr, "Chunk size too large for dedup.\n");
	return (NULL);
}
ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t)); ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t));
current_window_data = slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE); current_window_data = slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE);
ctx->blocks = (rabin_blockentry_t *)slab_alloc(NULL, ctx->blocks = (rabin_blockentry_t *)slab_alloc(NULL,
@ -345,7 +350,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
prev_length = 0; prev_length = 0;
rabin_index[blk] = htonl(be->length); rabin_index[blk] = htonl(be->length);
} else { } else {
if (prev_length + be->length <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) { if (prev_length + be->length <= RABIN_MAX_BLOCK_SIZE) {
prev_length += be->length; prev_length += be->length;
rabin_index[prev_index] = htonl(prev_length); rabin_index[prev_index] = htonl(prev_length);
rabin_index[blk] = 0; rabin_index[blk] = 0;
@ -359,7 +364,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
} else { } else {
prev_index = 0; prev_index = 0;
prev_length = 0; prev_length = 0;
rabin_index[blk] = htonl(RAB_POLYNOMIAL_MAX_BLOCK_SIZE + be->index + 1); rabin_index[blk] = htonl(be->index | RABIN_INDEX_FLAG);
} }
} }
@ -437,13 +442,13 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
ctx->blocks[blk].length = 0; ctx->blocks[blk].length = 0;
ctx->blocks[blk].index = 0; ctx->blocks[blk].index = 0;
} else if (len <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) { } else if (!(len & RABIN_INDEX_FLAG)) {
ctx->blocks[blk].length = len; ctx->blocks[blk].length = len;
ctx->blocks[blk].offset = pos1; ctx->blocks[blk].offset = pos1;
pos1 += len; pos1 += len;
} else { } else {
ctx->blocks[blk].length = 0; ctx->blocks[blk].length = 0;
ctx->blocks[blk].index = len - RAB_POLYNOMIAL_MAX_BLOCK_SIZE - 1; ctx->blocks[blk].index = len & RABIN_INDEX_VALUE;
} }
} }
for (blk = 0; blk < blknum; blk++) { for (blk = 0; blk < blknum; blk++) {

View file

@ -99,6 +99,21 @@ typedef struct {
// size of deduped data, size of compressed data // size of deduped data, size of compressed data
#define RABIN_HDR_SIZE (sizeof (unsigned int) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t)) #define RABIN_HDR_SIZE (sizeof (unsigned int) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t))
// A rabin index entry is either a length or an index; the most significant
// bit says which one it is:
//   MSB set   -> entry carries an index
//   MSB clear -> entry carries a length
#define	RABIN_INDEX_FLAG	(0x80000000)
// Mask that strips the flag bit and leaves the value stored in an entry.
#define	RABIN_INDEX_VALUE	(0x7FFFFFFF)

// Upper bound on the number of dedup blocks: 2^31 - 1.
#define	RABIN_MAX_BLOCKS	(0x7FFFFFFF)
// Hard ceiling on the size of a single rabin block, also 2^31 - 1. It is far
// larger than RAB_POLYNOMIAL_MAX_BLOCK_SIZE and is the limit that applies
// when non-duplicate blocks are merged together.
#define	RABIN_MAX_BLOCK_SIZE	(RABIN_MAX_BLOCKS)
typedef struct { typedef struct {
unsigned char *current_window_data; unsigned char *current_window_data;
rabin_blockentry_t *blocks; rabin_blockentry_t *blocks;