Speed up adaptive modes by using heuristics to select compression algorithm.

Select similarity percentage based on dedupe block size for effectiveness.
This commit is contained in:
Moinak Ghosh 2012-09-26 19:47:32 +05:30
parent 333b7b011e
commit 449dc35675
3 changed files with 59 additions and 50 deletions

View file

@ -82,11 +82,13 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
Perform Delta Encoding in addition to Identical Dedup: Perform Delta Encoding in addition to Identical Dedup:
pcompress -E ... - This also implies '-D'. This performs Delta Compression pcompress -E ... - This also implies '-D'. This performs Delta Compression
between 2 blocks if they are at least 60% similar. between 2 blocks if they are at least 40% to 60% similar. The
minimum similarity percentage is selected based on the dedupe block
size to balance performance and effectiveness.
pcompress -EE .. - This causes Delta Compression to happen if 2 blocks are pcompress -EE .. - This causes Delta Compression to happen if 2 blocks are
at least 40% similar. This can effect greater final at least 40% similar regardless of block size. This can
compression ratio at the cost of higher processing effect greater final compression ratio at the cost of
overhead. higher processing overhead.
Number of threads can optionally be specified: -t <1 - 256 count> Number of threads can optionally be specified: -t <1 - 256 count>
Other flags: Other flags:

View file

@ -140,55 +140,43 @@ adapt_compress(void *src, size_t srclen, void *dst,
size_t *dstlen, int level, uchar_t chdr, void *data) size_t *dstlen, int level, uchar_t chdr, void *data)
{ {
struct adapt_data *adat = (struct adapt_data *)(data); struct adapt_data *adat = (struct adapt_data *)(data);
int rv, rv1, rv2; uchar_t *src1 = (uchar_t *)src;
unsigned int *inc; size_t i, bincount;
size_t dst2len, dst3len, smaller_dstlen; int rv;
uchar_t *dst2, *smaller_dst;
void *tmp;
dst2 = slab_alloc(NULL, *dstlen); /*
if (!dst2) { * Count number of 8-bit binary bytes in source.
fprintf(stderr, "Adapt: Out of memory\n"); */
return (-1); bincount = 0;
} for (i = 0; i < srclen; i++)
bincount += (src1[i] >> 7);
rv = COMPRESS_PPMD; /*
inc = &ppmd_count; * Use PPMd if at least 70% of source is 7-bit textual bytes, otherwise
dst2len = *dstlen; * use Bzip2 or LZMA.
dst3len = *dstlen; */
rv1 = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data); if (bincount > (srclen / 10 * 3)) {
if (rv1 < 0) *dstlen = dst3len; if (adat->adapt_mode == 2) {
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
if (adat->adapt_mode == 2) { if (rv < 0)
rv2 = lzma_compress(src, srclen, dst2, &dst2len, level, chdr, adat->lzma_data); return (rv);
if (rv2 < 0) dst2len = dst3len;
if (dst2len < *dstlen) {
inc = &lzma_count;
rv = COMPRESS_LZMA; rv = COMPRESS_LZMA;
} lzma_count++;
} else { } else {
rv2 = bzip2_compress(src, srclen, dst2, &dst2len, level, chdr, NULL); rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
if (rv2 < 0) dst2len = dst3len; if (rv < 0)
if (dst2len < *dstlen) { return (rv);
inc = &bzip2_count;
rv = COMPRESS_BZIP2; rv = COMPRESS_BZIP2;
bzip2_count++;
} }
}
if (dst2len < *dstlen) {
smaller_dstlen = dst2len;
smaller_dst = dst2;
} else { } else {
smaller_dstlen = *dstlen; rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
smaller_dst = dst; if (rv < 0)
return (rv);
rv = COMPRESS_PPMD;
ppmd_count++;
} }
*inc += 1;
if (smaller_dst != dst) {
memcpy(dst, smaller_dst, smaller_dstlen);
*dstlen = smaller_dstlen;
}
slab_free(NULL, dst2);
return (rv); return (rv);
} }

View file

@ -67,7 +67,8 @@
#include "rabin_dedup.h" #include "rabin_dedup.h"
#define FORTY_PCNT(x) (((x)/5 << 1)) #define FORTY_PCNT(x) ((x)/5 << 1)
#define FIFTY_PCNT(x) ((x) >> 1)
#define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3)) #define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3))
extern int lzma_init(void **data, int *level, ssize_t chunksize); extern int lzma_init(void **data, int *level, ssize_t chunksize);
@ -170,11 +171,27 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
ctx->fixed_flag = fixed_flag; ctx->fixed_flag = fixed_flag;
ctx->rabin_break_patt = 0; ctx->rabin_break_patt = 0;
ctx->delta_flag = delta_flag;
ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS); ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1; ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1;
ctx->rabin_poly_min_block_size = dedupe_min_blksz(rab_blk_sz); ctx->rabin_poly_min_block_size = dedupe_min_blksz(rab_blk_sz);
ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size; ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size;
ctx->delta_flag = 0;
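/*
* Scale down the similarity percentage as the average block size grows
* (delta_flag 1 = SIXTY_PCNT below 16KB, 2 = FIFTY_PCNT below 64KB,
* 3 = FORTY_PCNT otherwise), unless the user specified argument '-EE',
* in which case a fixed 40% match is meant to be used for Delta compression.
* NOTE(review): the DELTA_EXTRA branch below sets delta_flag to 1, which
* indexes pc[1] (SIXTY_PCNT) rather than pc[3] (FORTY_PCNT) -- confirm.
*/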
if (delta_flag == DELTA_NORMAL) {
if (ctx->rabin_poly_avg_block_size < (1 << 14)) {
ctx->delta_flag = 1;
} else if (ctx->rabin_poly_avg_block_size < (1 << 16)) {
ctx->delta_flag = 2;
} else {
ctx->delta_flag = 3;
}
} else if (delta_flag == DELTA_EXTRA) {
ctx->delta_flag = 1;
}
if (!fixed_flag) if (!fixed_flag)
ctx->blknum = chunksize / ctx->rabin_poly_min_block_size; ctx->blknum = chunksize / ctx->rabin_poly_min_block_size;
@ -356,7 +373,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
j = 0; j = 0;
for (i=offset; i<*size; i++) { for (i=offset; i<*size; i++) {
ssize_t pc[3]; ssize_t pc[4];
uchar_t cur_byte = buf1[i]; uchar_t cur_byte = buf1[i];
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos]; uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
ctx->current_window_data[ctx->window_pos] = cur_byte; ctx->current_window_data[ctx->window_pos] = cur_byte;
@ -414,7 +431,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
*/ */
if (ctx->delta_flag) { if (ctx->delta_flag) {
pc[1] = SIXTY_PCNT(j); pc[1] = SIXTY_PCNT(j);
pc[2] = FORTY_PCNT(j); pc[2] = FIFTY_PCNT(j);
pc[3] = FORTY_PCNT(j);
reset_heap(&heap, pc[ctx->delta_flag]); reset_heap(&heap, pc[ctx->delta_flag]);
ksmallest(fplist, j, &heap); ksmallest(fplist, j, &heap);
@ -444,7 +462,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
if (j > 1) { if (j > 1) {
pc[1] = SIXTY_PCNT(j); pc[1] = SIXTY_PCNT(j);
pc[2] = FORTY_PCNT(j); pc[2] = FIFTY_PCNT(j);
pc[3] = FORTY_PCNT(j);
reset_heap(&heap, pc[ctx->delta_flag]); reset_heap(&heap, pc[ctx->delta_flag]);
ksmallest(fplist, j, &heap); ksmallest(fplist, j, &heap);
cur_sketch = cur_sketch =