Implement the ability to partition chunks at the last rabin boundary instead of at a fixed size.
parent d3f5287ee5
commit 010f49f412
4 changed files with 105 additions and 22 deletions
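In short: instead of always cutting the input at the fixed chunk size, each read is now trimmed back to the last rabin boundary found in it, and the unaligned tail is carried over to the front of the next chunk (enabled by default, disabled with -r). The sketch below is only an illustration of that driver loop, not code from this commit; find_last_boundary() and process_chunk() are hypothetical stand-ins for the rolling-hash scan in rabin_dedup() and the compression stage.

/*
 * Illustrative sketch only -- not code from this commit. Splits each read at
 * the last content-defined boundary and carries the tail into the next buffer
 * (simple double-buffering, as start_compress() does).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical stand-ins for the boundary scan and the compression stage. */
extern ssize_t find_last_boundary(const unsigned char *buf, ssize_t len);
extern void    process_chunk(const unsigned char *buf, ssize_t len);

void
compress_stream(int fd, unsigned char *bufa, unsigned char *bufb, size_t bufsize)
{
    unsigned char *cur = bufa, *next = bufb, *tmp;
    ssize_t n, cut, carry = 0;

    for (;;) {
        /* Bytes past the previous boundary were copied to the front of cur. */
        n = read(fd, cur + carry, bufsize - carry);
        if (n < 0) {
            perror("read");
            return;
        }
        n += carry;
        if (n == 0)
            return;                     /* EOF and nothing carried over */

        cut = find_last_boundary(cur, n);
        if (cut > 0 && cut < n) {
            carry = n - cut;            /* tail length after the last boundary */
            memcpy(next, cur + cut, carry);  /* tail seeds the next chunk */
            n = cut;                    /* emit only up to the boundary */
        } else {
            carry = 0;                  /* short read or no boundary: emit as-is */
        }
        process_chunk(cur, n);
        tmp = cur; cur = next; next = tmp;   /* swap the two buffers */
    }
}

In the commit itself, Read_Adjusted() plays the reader role above and rabin_dedup() plays find_last_boundary(), reporting the boundary offset through its new rabin_pos argument.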
main.c | 40

@@ -78,6 +78,7 @@ static int nthreads = 0;
 static int hide_mem_stats = 1;
 static int hide_cmp_stats = 1;
 static int enable_rabin_scan = 0;
+static int enable_rabin_split = 1;
 static unsigned int chunk_num;
 static uint64_t largest_chunk, smallest_chunk, avg_chunk;
 static const char *exec_name;
@@ -116,6 +117,7 @@ usage(void)
            " %s -p ...\n"
            "4) Attempt Rabin fingerprinting based deduplication on chunks:\n"
            " %s -D ...\n"
+           " %s -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split.\n"
            "5) Number of threads can optionally be specified: -t <1 - 256 count>\n"
            "6) Pass '-M' to display memory allocator statistics\n"
            "7) Pass '-C' to display compression statistics\n\n",
@@ -602,7 +604,7 @@ redo:
        rbytes = tdat->rbytes;
        reset_rabin_context(tdat->rctx);
        rctx->cbuf = tdat->uncompressed_chunk;
-       rabin_index_sz = rabin_dedup(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0);
+       rabin_index_sz = rabin_dedup(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0, NULL);
        if (!rctx->valid) {
            memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes);
            tdat->rbytes = rbytes;
@@ -753,7 +755,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
    char tmpfile1[MAXPATHLEN];
    char to_filename[MAXPATHLEN];
    ssize_t compressed_chunksize;
-   ssize_t n_chunksize, rbytes;
+   ssize_t n_chunksize, rbytes, rabin_count;
    short version, flags;
    struct stat sbuf;
    int compfd = -1, uncompfd = -1, err;
@@ -762,6 +764,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
    struct cmp_data **dary = NULL, *tdat;
    pthread_t writer_thr;
    uchar_t *cread_buf, *pos;
+   rabin_context_t *rctx;

    /*
     * Compressed buffer size must include zlib scratch space and
@@ -956,11 +959,17 @@ start_compress(const char *filename, uint64_t chunksize, int level)
    largest_chunk = 0;
    smallest_chunk = chunksize;
    avg_chunk = 0;
+   rabin_count = 0;

    /*
     * Read the first chunk into a spare buffer (a simple double-buffering).
     */
-   rbytes = Read(uncompfd, cread_buf, chunksize);
+   if (enable_rabin_split) {
+       rctx = create_rabin_context(chunksize, 0, algo);
+       rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
+   } else {
+       rbytes = Read(uncompfd, cread_buf, chunksize);
+   }

    while (!bail) {
        uchar_t *tmp;
@@ -990,6 +999,18 @@ start_compress(const char *filename, uint64_t chunksize, int level)
            tdat->cmp_seg = cread_buf;
            cread_buf = tmp;
            tdat->compressed_chunk = tdat->cmp_seg + sizeof (chunksize) + sizeof (uint64_t);
+
+           /*
+            * If there is data after the last rabin boundary in the chunk, then
+            * rabin_count will be non-zero. We carry over the data to the beginning
+            * of the next chunk.
+            */
+           if (rabin_count) {
+               memcpy(cread_buf,
+                   tdat->cmp_seg + rabin_count, rbytes - rabin_count);
+               tdat->rbytes = rabin_count;
+               rabin_count = rbytes - rabin_count;
+           }
        } else {
            tmp = tdat->uncompressed_chunk;
            tdat->uncompressed_chunk = cread_buf;
@@ -1014,7 +1035,11 @@ start_compress(const char *filename, uint64_t chunksize, int level)
         * Read the next buffer we want to process while previous
         * buffer is in progress.
         */
-       rbytes = Read(uncompfd, cread_buf, chunksize);
+       if (enable_rabin_split) {
+           rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
+       } else {
+           rbytes = Read(uncompfd, cread_buf, chunksize);
+       }
        }
    }

@@ -1088,6 +1113,7 @@ comp_done:
        }
        slab_free(NULL, dary);
    }
+   if (enable_rabin_split) destroy_rabin_context(rctx);
    slab_free(NULL, cread_buf);
    if (!pipe_mode) {
        if (compfd != -1) close(compfd);
@@ -1177,7 +1203,7 @@ main(int argc, char *argv[])
    level = 6;
    slab_init();

-   while ((opt = getopt(argc, argv, "dc:s:l:pt:MCD")) != -1) {
+   while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDr")) != -1) {
        int ovr;

        switch (opt) {
@@ -1233,6 +1259,10 @@ main(int argc, char *argv[])
            enable_rabin_scan = 1;
            break;

+       case 'r':
+           enable_rabin_split = 0;
+           break;
+
        case '?':
        default:
            usage();
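A concrete check of the carry-over arithmetic in the @@ -990,6 +999,18 @@ hunk above: on entry rabin_count holds the offset of the last boundary reported through Read_Adjusted(), and after the memcpy it is reused as the length of the tail now sitting at the front of cread_buf, which is what the next Read_Adjusted() call expects. A small illustrative self-check with made-up numbers (not part of the commit):

/* Illustrative self-check of the carry-over bookkeeping -- not code from the commit. */
#include <assert.h>
#include <string.h>
#include <sys/types.h>

int
main(void)
{
    unsigned char seg[10] = "ABCDEFGHIJ";   /* chunk just read: rbytes = 10 */
    unsigned char next[10] = { 0 };         /* buffer the next read will fill */
    ssize_t rbytes = 10;
    ssize_t rabin_count = 7;                /* last rabin boundary reported at offset 7 */

    /* Same steps as the hunk above: move the tail, shrink the emitted chunk. */
    memcpy(next, seg + rabin_count, rbytes - rabin_count);
    ssize_t emitted = rabin_count;          /* tdat->rbytes = rabin_count */
    rabin_count = rbytes - rabin_count;     /* carried-over tail length */

    assert(emitted == 7);                   /* "ABCDEFG" goes to compression */
    assert(rabin_count == 3);               /* "HIJ" seeds the next chunk */
    assert(memcmp(next, "HIJ", 3) == 0);
    return (0);
}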
@@ -123,21 +123,27 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *algo)
        return (NULL);
    }
    current_window_data = slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE);
-   ctx->blocks = (rabin_blockentry_t *)slab_alloc(NULL,
-       blknum * ctx->rabin_poly_min_block_size);
-   if(ctx == NULL || current_window_data == NULL || ctx->blocks == NULL) {
+   ctx->blocks = NULL;
+   if (real_chunksize > 0) {
+       ctx->blocks = (rabin_blockentry_t *)slab_alloc(NULL,
+           blknum * ctx->rabin_poly_min_block_size);
+   }
+   if(ctx == NULL || current_window_data == NULL || (ctx->blocks == NULL && real_chunksize > 0)) {
        fprintf(stderr,
            "Could not allocate rabin polynomial context, out of memory\n");
        destroy_rabin_context(ctx);
        return (NULL);
    }

-   lzma_init(&(ctx->lzma_data), &(ctx->level), chunksize);
-   if (!(ctx->lzma_data)) {
-       fprintf(stderr,
-           "Could not allocate rabin polynomial context, out of memory\n");
-       destroy_rabin_context(ctx);
-       return (NULL);
+   ctx->lzma_data = NULL;
+   if (real_chunksize > 0) {
+       lzma_init(&(ctx->lzma_data), &(ctx->level), chunksize);
+       if (!(ctx->lzma_data)) {
+           fprintf(stderr,
+               "Could not allocate rabin polynomial context, out of memory\n");
+           destroy_rabin_context(ctx);
+           return (NULL);
+       }
    }
    /*
     * We should compute the power for the window size.
@@ -198,7 +204,7 @@ cmpblks(const void *a, const void *b)
  * the rolling checksum and dedup blocks vary in size from 4K-128K.
  */
 uint32_t
-rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
+rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos)
 {
    ssize_t i, last_offset,j;
    uint32_t blknum;
@@ -211,6 +217,14 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
    ctx->valid = 0;
    ctx->cur_checksum = 0;

+   /*
+    * If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
+    * in the chunk. We start scanning at chunk end - max rabin block size. We avoid doing
+    * a full chunk scan.
+    */
+   if (rabin_pos) {
+       offset = *size - RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
+   }
    if (*size < ctx->rabin_poly_avg_block_size) return;
    for (i=offset; i<*size; i++) {
        char cur_byte = buf1[i];
@@ -241,18 +255,24 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset)
        // If we hit our special value or reached the max block size update block offset
        if ((ctx->cur_roll_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
            length >= rabin_polynomial_max_block_size) {
-           ctx->blocks[blknum].offset = last_offset;
-           ctx->blocks[blknum].index = blknum; // Need to store for sorting
-           ctx->blocks[blknum].cksum_n_offset = ctx->cur_checksum;
-           ctx->blocks[blknum].length = length;
-           ctx->blocks[blknum].refcount = 0;
-           blknum++;
+           if (rabin_pos == NULL) {
+               ctx->blocks[blknum].offset = last_offset;
+               ctx->blocks[blknum].index = blknum; // Need to store for sorting
+               ctx->blocks[blknum].cksum_n_offset = ctx->cur_checksum;
+               ctx->blocks[blknum].length = length;
+               ctx->blocks[blknum].refcount = 0;
+               blknum++;
+           }
            ctx->cur_checksum = 0;
            last_offset = i+1;
            length = 0;
        }
    }

+   if (rabin_pos && last_offset < *size) {
+       *rabin_pos = last_offset;
+       return (0);
+   }
    // If we found at least a few chunks, perform dedup.
    if (blknum > 2) {
        uint64_t prev_cksum;
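Why the bounded scan above is enough: the rolling checksum at a position depends only on the preceding window bytes, and a block is force-terminated once it reaches the maximum block size, so a boundary is normally found within the last RAB_POLYNOMIAL_MAX_BLOCK_SIZE bytes and only that tail needs to be scanned. A hedged sketch of that tail-only scan (not the commit's code); is_break() is a hypothetical stand-in for the (cur_roll_checksum & mask) == break_pattern test in rabin_dedup():

/*
 * Illustrative sketch -- not the commit's code. Finds the last content-defined
 * boundary by scanning only the final max_block bytes of the buffer, the same
 * bounded-scan idea used when rabin_dedup() is called with a non-NULL rabin_pos.
 */
#include <sys/types.h>

extern int is_break(const unsigned char *buf, ssize_t pos);   /* hypothetical */

ssize_t
last_boundary(const unsigned char *buf, ssize_t size, ssize_t max_block)
{
    ssize_t i, start, last = 0, length = 0;

    start = (size > max_block) ? size - max_block : 0;   /* clamp for small buffers */
    for (i = start; i < size; i++) {
        length++;
        if (is_break(buf, i) || length >= max_block) {
            last = i + 1;     /* a block ends just after this byte */
            length = 0;
        }
    }
    return (last);            /* 0 if no boundary was seen before the end */
}

Keeping this inside rabin_dedup() rather than a separate helper lets the split path reuse exactly the same rolling-checksum and break-pattern logic as the dedup path.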
@@ -144,7 +144,7 @@ extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize,
        const char *algo);
 extern void destroy_rabin_context(rabin_context_t *ctx);
 extern unsigned int rabin_dedup(rabin_context_t *ctx, unsigned char *buf,
-       ssize_t *size, ssize_t offset);
+       ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
 extern void rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size);
 extern void rabin_parse_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_index_sz,
        ssize_t *rabin_data_sz, ssize_t *rabin_index_sz_cmp,
utils.c | 33

@@ -194,6 +194,39 @@ Read(int fd, void *buf, size_t count)
    return (count - rem);
 }

+/*
+ * Read the requested chunk and return the last rabin boundary in the chunk.
+ * This helps in splitting chunks at rabin boundaries rather than fixed points.
+ * The request buffer may have some data at the beginning carried over from
+ * after the previous rabin boundary.
+ */
+ssize_t
+Read_Adjusted(int fd, uchar_t *buf, size_t count, ssize_t *rabin_count, void *ctx)
+{
+   char *buf2;
+   ssize_t rcount;
+   rabin_context_t *rctx = (rabin_context_t *)ctx;
+
+   if (!ctx) return (Read(fd, buf, count));
+   buf2 = buf;
+   if (*rabin_count) {
+       buf2 = (char *)buf + *rabin_count;
+       count -= *rabin_count;
+   }
+   rcount = Read(fd, buf2, count);
+   if (rcount > 0) {
+       rcount += *rabin_count;
+       if (!rcount < count)
+           rabin_dedup(rctx, buf, &rcount, *rabin_count, rabin_count);
+       else
+           *rabin_count = 0;
+   } else {
+       if (rcount == 0) rcount = *rabin_count;
+       *rabin_count = 0;
+   }
+   return (rcount);
+}
+
 ssize_t
 Write(int fd, const void *buf, size_t count)
 {