Speed up adaptive modes by using heuristics to select compression algorithm.

Select similarity percentage based on dedupe block size for effectiveness.
This commit is contained in:
Moinak Ghosh 2012-09-26 19:47:32 +05:30
parent 333b7b011e
commit 449dc35675
3 changed files with 59 additions and 50 deletions

View file

@ -82,11 +82,13 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
Perform Delta Encoding in addition to Identical Dedup:
pcompress -E ... - This also implies '-D'. This performs Delta Compression
between 2 blocks if they are at least 60% similar.
between 2 blocks if they are sufficiently similar. The minimum
similarity percentage (40% to 60%) is selected based on the
dedupe block size to balance performance and effectiveness.
pcompress -EE .. - This causes Delta Compression to happen if 2 blocks are
at least 40% similar. This can effect greater final
compression ratio at the cost of higher processing
overhead.
at least 40% similar regardless of block size. This can
yield a greater final compression ratio at the cost of
higher processing overhead.
Number of threads can optionally be specified: -t <1 - 256 count>
Other flags:

View file

@ -140,55 +140,43 @@ adapt_compress(void *src, size_t srclen, void *dst,
size_t *dstlen, int level, uchar_t chdr, void *data)
{
struct adapt_data *adat = (struct adapt_data *)(data);
int rv, rv1, rv2;
unsigned int *inc;
size_t dst2len, dst3len, smaller_dstlen;
uchar_t *dst2, *smaller_dst;
void *tmp;
uchar_t *src1 = (uchar_t *)src;
size_t i, bincount;
int rv;
dst2 = slab_alloc(NULL, *dstlen);
if (!dst2) {
fprintf(stderr, "Adapt: Out of memory\n");
return (-1);
}
/*
* Count the number of bytes in the source that have the high bit set (i.e. are not 7-bit text).
*/
bincount = 0;
for (i = 0; i < srclen; i++)
bincount += (src1[i] >> 7);
rv = COMPRESS_PPMD;
inc = &ppmd_count;
dst2len = *dstlen;
dst3len = *dstlen;
rv1 = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
if (rv1 < 0) *dstlen = dst3len;
if (adat->adapt_mode == 2) {
rv2 = lzma_compress(src, srclen, dst2, &dst2len, level, chdr, adat->lzma_data);
if (rv2 < 0) dst2len = dst3len;
if (dst2len < *dstlen) {
inc = &lzma_count;
/*
* Use PPMd if at least 70% of source is 7-bit textual bytes, otherwise
* use Bzip2 or LZMA.
*/
if (bincount > (srclen / 10 * 3)) {
if (adat->adapt_mode == 2) {
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
if (rv < 0)
return (rv);
rv = COMPRESS_LZMA;
}
} else {
rv2 = bzip2_compress(src, srclen, dst2, &dst2len, level, chdr, NULL);
if (rv2 < 0) dst2len = dst3len;
if (dst2len < *dstlen) {
inc = &bzip2_count;
lzma_count++;
} else {
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
if (rv < 0)
return (rv);
rv = COMPRESS_BZIP2;
bzip2_count++;
}
}
if (dst2len < *dstlen) {
smaller_dstlen = dst2len;
smaller_dst = dst2;
} else {
smaller_dstlen = *dstlen;
smaller_dst = dst;
rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
if (rv < 0)
return (rv);
rv = COMPRESS_PPMD;
ppmd_count++;
}
*inc += 1;
if (smaller_dst != dst) {
memcpy(dst, smaller_dst, smaller_dstlen);
*dstlen = smaller_dstlen;
}
slab_free(NULL, dst2);
return (rv);
}

View file

@ -67,7 +67,8 @@
#include "rabin_dedup.h"
#define FORTY_PCNT(x) (((x)/5 << 1))
#define FORTY_PCNT(x) ((x)/5 << 1)
#define FIFTY_PCNT(x) ((x) >> 1)
#define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3))
extern int lzma_init(void **data, int *level, ssize_t chunksize);
@ -170,11 +171,27 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
ctx->fixed_flag = fixed_flag;
ctx->rabin_break_patt = 0;
ctx->delta_flag = delta_flag;
ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1;
ctx->rabin_poly_min_block_size = dedupe_min_blksz(rab_blk_sz);
ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size;
ctx->delta_flag = 0;
/*
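* Scale down the similarity percentage based on the average block size, unless the user
* specified '-EE', in which case a fixed 40% match is used for Delta compression.
* NOTE(review): the DELTA_EXTRA branch sets delta_flag to 1, which indexes pc[1]
* (SIXTY_PCNT) in dedupe_compress rather than the 40% entry; confirm the intended index.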
*/
if (delta_flag == DELTA_NORMAL) {
if (ctx->rabin_poly_avg_block_size < (1 << 14)) {
ctx->delta_flag = 1;
} else if (ctx->rabin_poly_avg_block_size < (1 << 16)) {
ctx->delta_flag = 2;
} else {
ctx->delta_flag = 3;
}
} else if (delta_flag == DELTA_EXTRA) {
ctx->delta_flag = 1;
}
if (!fixed_flag)
ctx->blknum = chunksize / ctx->rabin_poly_min_block_size;
@ -356,7 +373,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
j = 0;
for (i=offset; i<*size; i++) {
ssize_t pc[3];
ssize_t pc[4];
uchar_t cur_byte = buf1[i];
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
ctx->current_window_data[ctx->window_pos] = cur_byte;
@ -414,7 +431,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
*/
if (ctx->delta_flag) {
pc[1] = SIXTY_PCNT(j);
pc[2] = FORTY_PCNT(j);
pc[2] = FIFTY_PCNT(j);
pc[3] = FORTY_PCNT(j);
reset_heap(&heap, pc[ctx->delta_flag]);
ksmallest(fplist, j, &heap);
@ -444,7 +462,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
if (j > 1) {
pc[1] = SIXTY_PCNT(j);
pc[2] = FORTY_PCNT(j);
pc[2] = FIFTY_PCNT(j);
pc[3] = FORTY_PCNT(j);
reset_heap(&heap, pc[ctx->delta_flag]);
ksmallest(fplist, j, &heap);
cur_sketch =