From a13c61e92620cdf67744b6f203b9e717ed72ab67 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Mon, 2 Jul 2012 22:08:03 +0530 Subject: [PATCH] Change rabin index encoding scheme for better metadata compression. --- rabin/rabin_polynomial.c | 13 +++++++++---- rabin/rabin_polynomial.h | 15 +++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/rabin/rabin_polynomial.c b/rabin/rabin_polynomial.c index 22a0b65..3fbd9e5 100755 --- a/rabin/rabin_polynomial.c +++ b/rabin/rabin_polynomial.c @@ -101,6 +101,11 @@ create_rabin_context(uint64_t chunksize) { if (chunksize % rabin_polynomial_min_block_size) blknum++; + if (blknum > RABIN_MAX_BLOCKS) { + fprintf(stderr, "Chunk size too large for dedup.\n"); + destroy_rabin_context(ctx); + return (NULL); + } ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t)); current_window_data = slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE); ctx->blocks = (rabin_blockentry_t *)slab_alloc(NULL, @@ -345,7 +350,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset) prev_length = 0; rabin_index[blk] = htonl(be->length); } else { - if (prev_length + be->length <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) { + if (prev_length + be->length <= RABIN_MAX_BLOCK_SIZE) { prev_length += be->length; rabin_index[prev_index] = htonl(prev_length); rabin_index[blk] = 0; @@ -359,7 +364,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset) } else { prev_index = 0; prev_length = 0; - rabin_index[blk] = htonl(RAB_POLYNOMIAL_MAX_BLOCK_SIZE + be->index + 1); + rabin_index[blk] = htonl(be->index | RABIN_INDEX_FLAG); } } @@ -437,13 +442,13 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size) ctx->blocks[blk].length = 0; ctx->blocks[blk].index = 0; - } else if (len <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) { + } else if (!(len & RABIN_INDEX_FLAG)) { ctx->blocks[blk].length = len; ctx->blocks[blk].offset = pos1; pos1 += len; } else { ctx->blocks[blk].length = 0; - ctx->blocks[blk].index = len - RAB_POLYNOMIAL_MAX_BLOCK_SIZE - 1; + ctx->blocks[blk].index = len & RABIN_INDEX_VALUE; } } for (blk = 0; blk < blknum; blk++) { diff --git a/rabin/rabin_polynomial.h b/rabin/rabin_polynomial.h index 71ac0d3..9248e2d 100644 --- a/rabin/rabin_polynomial.h +++ b/rabin/rabin_polynomial.h @@ -99,6 +99,21 @@ typedef struct { // size of deduped data, size of compressed data #define RABIN_HDR_SIZE (sizeof (unsigned int) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t)) +// Maximum number of dedup blocks supported (2^31 - 1) +#define RABIN_MAX_BLOCKS (0x7fffffff) + +// Maximum possible block size for a single rabin block. This is a hard limit much +// larger than RAB_POLYNOMIAL_MAX_BLOCK_SIZE. Useful when merging non-duplicate blocks. +// This is also 2^31 - 1. +#define RABIN_MAX_BLOCK_SIZE (RABIN_MAX_BLOCKS) + +// Mask to determine whether Rabin index entry is a length value or index value. +// MSB = 1 : Index +// MSB = 0 : Length +#define RABIN_INDEX_FLAG (0x80000000) +// Mask to extract value from a rabin index entry +#define RABIN_INDEX_VALUE (0x7fffffff) + typedef struct { unsigned char *current_window_data; rabin_blockentry_t *blocks;