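Updates to Rabin-based dedup: track per-block reference counts and merge runs of unique blocks.
Enable the Rabin dedup command-line option and rename it from -r to -D.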
This commit is contained in:
parent
cbf9728278
commit
f9c3644459
3 changed files with 54 additions and 13 deletions
11
main.c
11
main.c
|
@ -114,7 +114,8 @@ usage(void)
|
||||||
" %s -d <compressed file> <target file>\n"
|
" %s -d <compressed file> <target file>\n"
|
||||||
"3) To operate as a pipe, read from stdin and write to stdout:\n"
|
"3) To operate as a pipe, read from stdin and write to stdout:\n"
|
||||||
" %s -p ...\n"
|
" %s -p ...\n"
|
||||||
"4) Rabin Deduplication: Work in progress.\n"
|
"4) Attempt Rabin fingerprinting based deduplication on chunks:\n"
|
||||||
|
" %s -D ...\n"
|
||||||
"5) Number of threads can optionally be specified: -t <1 - 256 count>\n"
|
"5) Number of threads can optionally be specified: -t <1 - 256 count>\n"
|
||||||
"6) Pass '-M' to display memory allocator statistics\n"
|
"6) Pass '-M' to display memory allocator statistics\n"
|
||||||
"7) Pass '-C' to display compression statistics\n\n",
|
"7) Pass '-C' to display compression statistics\n\n",
|
||||||
|
@ -1103,7 +1104,7 @@ main(int argc, char *argv[])
|
||||||
level = 6;
|
level = 6;
|
||||||
slab_init();
|
slab_init();
|
||||||
|
|
||||||
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCr")) != -1) {
|
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCD")) != -1) {
|
||||||
int ovr;
|
int ovr;
|
||||||
|
|
||||||
switch (opt) {
|
switch (opt) {
|
||||||
|
@ -1155,9 +1156,9 @@ main(int argc, char *argv[])
|
||||||
hide_cmp_stats = 0;
|
hide_cmp_stats = 0;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
//case 'r':
|
case 'D':
|
||||||
//enable_rabin_scan = 1;
|
enable_rabin_scan = 1;
|
||||||
//break;
|
break;
|
||||||
|
|
||||||
case '?':
|
case '?':
|
||||||
default:
|
default:
|
||||||
|
|
|
@ -204,6 +204,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
||||||
ctx->blocks[blknum].index = blknum; // Need to store for sorting
|
ctx->blocks[blknum].index = blknum; // Need to store for sorting
|
||||||
ctx->blocks[blknum].checksum = ctx->cur_checksum;
|
ctx->blocks[blknum].checksum = ctx->cur_checksum;
|
||||||
ctx->blocks[blknum].length = length;
|
ctx->blocks[blknum].length = length;
|
||||||
|
ctx->blocks[blknum].refcount = 0;
|
||||||
|
|
||||||
blknum++;
|
blknum++;
|
||||||
ctx->cur_checksum = 0;
|
ctx->cur_checksum = 0;
|
||||||
|
@ -219,7 +220,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
||||||
ssize_t pos, matches;
|
ssize_t pos, matches;
|
||||||
int valid = 1;
|
int valid = 1;
|
||||||
char *tmp, *prev_offset;
|
char *tmp, *prev_offset;
|
||||||
unsigned int *blkarr, prev_blk;
|
unsigned int *blkarr, prev_index, prev_blk;
|
||||||
|
|
||||||
// Insert the last left-over trailing bytes, if any, into a block.
|
// Insert the last left-over trailing bytes, if any, into a block.
|
||||||
if (last_offset < *size) {
|
if (last_offset < *size) {
|
||||||
|
@ -255,6 +256,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
||||||
* to be considered identical.
|
* to be considered identical.
|
||||||
* The block index in the chunk is initialized with pointers into the
|
* The block index in the chunk is initialized with pointers into the
|
||||||
* sorted block array.
|
* sorted block array.
|
||||||
|
* A reference count is maintained for blocks that are identical to other
|
||||||
|
* blocks. This helps in non-duplicate block merging later.
|
||||||
*/
|
*/
|
||||||
for (blk = 0; blk < blknum; blk++) {
|
for (blk = 0; blk < blknum; blk++) {
|
||||||
blkarr[ctx->blocks[blk].index] = blk;
|
blkarr[ctx->blocks[blk].index] = blk;
|
||||||
|
@ -263,7 +266,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
||||||
ctx->blocks[blk].length == prev_length &&
|
ctx->blocks[blk].length == prev_length &&
|
||||||
memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) {
|
memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) {
|
||||||
ctx->blocks[blk].length = 0;
|
ctx->blocks[blk].length = 0;
|
||||||
ctx->blocks[blk].index = prev_blk;
|
ctx->blocks[blk].index = prev_index;
|
||||||
|
(ctx->blocks[prev_blk].refcount)++;
|
||||||
matches += prev_length;
|
matches += prev_length;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -271,7 +275,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
||||||
prev_offset = buf1 + ctx->blocks[blk].offset;
|
prev_offset = buf1 + ctx->blocks[blk].offset;
|
||||||
prev_cksum = ctx->blocks[blk].checksum;
|
prev_cksum = ctx->blocks[blk].checksum;
|
||||||
prev_length = ctx->blocks[blk].length;
|
prev_length = ctx->blocks[blk].length;
|
||||||
prev_blk = ctx->blocks[blk].index;
|
prev_index = ctx->blocks[blk].index;
|
||||||
|
prev_blk = blk;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (matches < overhead) {
|
if (matches < overhead) {
|
||||||
|
@ -285,6 +290,8 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
||||||
* This way we can differentiate between a unique block length entry and a
|
* This way we can differentiate between a unique block length entry and a
|
||||||
* pointer to another block without needing a separate flag.
|
* pointer to another block without needing a separate flag.
|
||||||
*/
|
*/
|
||||||
|
prev_index = 0;
|
||||||
|
prev_length = 0;
|
||||||
for (blk = 0; blk < blknum; blk++) {
|
for (blk = 0; blk < blknum; blk++) {
|
||||||
rabin_blockentry_t *be;
|
rabin_blockentry_t *be;
|
||||||
|
|
||||||
|
@ -301,8 +308,37 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
|
||||||
prev_offset = buf1 + be->offset;
|
prev_offset = buf1 + be->offset;
|
||||||
memcpy(ctx->cbuf + pos, prev_offset, be->length);
|
memcpy(ctx->cbuf + pos, prev_offset, be->length);
|
||||||
pos += be->length;
|
pos += be->length;
|
||||||
|
/*
|
||||||
|
* Update Index entry with the length. Also try to merge runs
|
||||||
|
* of unique (non-duplicate) blocks into a single block entry
|
||||||
|
* as long as the total length does not exceed max block size.
|
||||||
|
*/
|
||||||
|
if (prev_index == 0) {
|
||||||
|
if (be->refcount == 0) {
|
||||||
|
prev_index = blk;
|
||||||
|
prev_length = be->length;
|
||||||
|
}
|
||||||
blkarr[blk] = htonl(be->length);
|
blkarr[blk] = htonl(be->length);
|
||||||
} else {
|
} else {
|
||||||
|
if (be->refcount > 0) {
|
||||||
|
prev_index = 0;
|
||||||
|
prev_length = 0;
|
||||||
|
blkarr[blk] = htonl(be->length);
|
||||||
|
} else {
|
||||||
|
if (prev_length + be->length <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) {
|
||||||
|
prev_length += be->length;
|
||||||
|
blkarr[prev_index] = htonl(prev_length);
|
||||||
|
blkarr[blk] = 0;
|
||||||
|
} else {
|
||||||
|
prev_index = 0;
|
||||||
|
prev_length = 0;
|
||||||
|
blkarr[blk] = htonl(be->length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
prev_index = 0;
|
||||||
|
prev_length = 0;
|
||||||
blkarr[blk] = htonl(RAB_POLYNOMIAL_MAX_BLOCK_SIZE + be->index + 1);
|
blkarr[blk] = htonl(RAB_POLYNOMIAL_MAX_BLOCK_SIZE + be->index + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -337,7 +373,11 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
|
||||||
|
|
||||||
for (blk = 0; blk < blknum; blk++) {
|
for (blk = 0; blk < blknum; blk++) {
|
||||||
len = ntohl(blkarr[blk]);
|
len = ntohl(blkarr[blk]);
|
||||||
if (len <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) {
|
if (len == 0) {
|
||||||
|
ctx->blocks[blk].length = 0;
|
||||||
|
ctx->blocks[blk].index = 0;
|
||||||
|
|
||||||
|
} else if (len <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE) {
|
||||||
ctx->blocks[blk].length = len;
|
ctx->blocks[blk].length = len;
|
||||||
ctx->blocks[blk].offset = pos1;
|
ctx->blocks[blk].offset = pos1;
|
||||||
pos1 += len;
|
pos1 += len;
|
||||||
|
@ -347,6 +387,7 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (blk = 0; blk < blknum; blk++) {
|
for (blk = 0; blk < blknum; blk++) {
|
||||||
|
if (ctx->blocks[blk].length == 0 && ctx->blocks[blk].index == 0) continue;
|
||||||
if (ctx->blocks[blk].length > 0) {
|
if (ctx->blocks[blk].length > 0) {
|
||||||
len = ctx->blocks[blk].length;
|
len = ctx->blocks[blk].length;
|
||||||
pos1 = ctx->blocks[blk].offset;
|
pos1 = ctx->blocks[blk].offset;
|
||||||
|
|
|
@ -68,14 +68,12 @@
|
||||||
// 1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT = Average Rabin Chunk Size
|
// 1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT = Average Rabin Chunk Size
|
||||||
// So we are always looking at power of 2 chunk sizes to avoid doing a modulus
|
// So we are always looking at power of 2 chunk sizes to avoid doing a modulus
|
||||||
//
|
//
|
||||||
// A value of 12 below gives avg block size of 4096 bytes
|
|
||||||
//
|
|
||||||
#define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT 12
|
#define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT 12
|
||||||
#define RAB_POLYNOMIAL_AVG_BLOCK_SIZE (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT)
|
#define RAB_POLYNOMIAL_AVG_BLOCK_SIZE (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT)
|
||||||
#define RAB_POLYNOMIAL_AVG_BLOCK_MASK (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1)
|
#define RAB_POLYNOMIAL_AVG_BLOCK_MASK (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1)
|
||||||
#define RAB_POLYNOMIAL_MIN_BLOCK_SIZE (4096)
|
#define RAB_POLYNOMIAL_MIN_BLOCK_SIZE (4096)
|
||||||
#define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024)
|
#define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024)
|
||||||
#define RAB_POLYNOMIAL_WIN_SIZE 32
|
#define RAB_POLYNOMIAL_WIN_SIZE 31
|
||||||
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 17
|
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 17
|
||||||
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 63
|
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 63
|
||||||
|
|
||||||
|
@ -84,6 +82,7 @@ typedef struct {
|
||||||
uint64_t checksum;
|
uint64_t checksum;
|
||||||
unsigned int index;
|
unsigned int index;
|
||||||
unsigned int length;
|
unsigned int length;
|
||||||
|
unsigned short refcount;
|
||||||
} rabin_blockentry_t;
|
} rabin_blockentry_t;
|
||||||
|
|
||||||
// An entry in the Rabin block array in the chunk.
|
// An entry in the Rabin block array in the chunk.
|
||||||
|
|
Loading…
Reference in a new issue