Speed up adaptive modes by using heuristics to select compression algorithm.
Select similarity percentage based on dedupe block size for effectiveness.
This commit is contained in:
parent
333b7b011e
commit
449dc35675
3 changed files with 59 additions and 50 deletions
10
README.md
10
README.md
|
@ -82,11 +82,13 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
|
|||
|
||||
Perform Delta Encoding in addition to Identical Dedup:
|
||||
pcompress -E ... - This also implies '-D'. This performs Delta Compression
|
||||
between 2 blocks if they are at least 60% similar.
|
||||
between 2 blocks if they are at least 40% to 60% similar. The
|
||||
similarity percentage is selected based on the dedupe block
|
||||
size to balance performance and effectiveness.
|
||||
pcompress -EE .. - This causes Delta Compression to happen if 2 blocks are
|
||||
at least 40% similar. This can effect greater final
|
||||
compression ratio at the cost of higher processing
|
||||
overhead.
|
||||
at least 40% similar regardless of block size. This can
|
||||
yield a greater final compression ratio at the cost of
|
||||
higher processing overhead.
|
||||
|
||||
Number of threads can optionally be specified: -t <1 - 256 count>
|
||||
Other flags:
|
||||
|
|
|
@ -140,55 +140,43 @@ adapt_compress(void *src, size_t srclen, void *dst,
|
|||
size_t *dstlen, int level, uchar_t chdr, void *data)
|
||||
{
|
||||
struct adapt_data *adat = (struct adapt_data *)(data);
|
||||
int rv, rv1, rv2;
|
||||
unsigned int *inc;
|
||||
size_t dst2len, dst3len, smaller_dstlen;
|
||||
uchar_t *dst2, *smaller_dst;
|
||||
void *tmp;
|
||||
uchar_t *src1 = (uchar_t *)src;
|
||||
size_t i, bincount;
|
||||
int rv;
|
||||
|
||||
dst2 = slab_alloc(NULL, *dstlen);
|
||||
if (!dst2) {
|
||||
fprintf(stderr, "Adapt: Out of memory\n");
|
||||
return (-1);
|
||||
}
|
||||
|
||||
rv = COMPRESS_PPMD;
|
||||
inc = &ppmd_count;
|
||||
dst2len = *dstlen;
|
||||
dst3len = *dstlen;
|
||||
rv1 = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
|
||||
if (rv1 < 0) *dstlen = dst3len;
|
||||
/*
|
||||
* Count number of 8-bit binary bytes in source.
|
||||
*/
|
||||
bincount = 0;
|
||||
for (i = 0; i < srclen; i++)
|
||||
bincount += (src1[i] >> 7);
|
||||
|
||||
/*
|
||||
* Use PPMd if at least 70% of source is 7-bit textual bytes, otherwise
|
||||
* use Bzip2 or LZMA.
|
||||
*/
|
||||
if (bincount > (srclen / 10 * 3)) {
|
||||
if (adat->adapt_mode == 2) {
|
||||
rv2 = lzma_compress(src, srclen, dst2, &dst2len, level, chdr, adat->lzma_data);
|
||||
if (rv2 < 0) dst2len = dst3len;
|
||||
if (dst2len < *dstlen) {
|
||||
inc = &lzma_count;
|
||||
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
|
||||
if (rv < 0)
|
||||
return (rv);
|
||||
rv = COMPRESS_LZMA;
|
||||
}
|
||||
lzma_count++;
|
||||
} else {
|
||||
rv2 = bzip2_compress(src, srclen, dst2, &dst2len, level, chdr, NULL);
|
||||
if (rv2 < 0) dst2len = dst3len;
|
||||
if (dst2len < *dstlen) {
|
||||
inc = &bzip2_count;
|
||||
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
|
||||
if (rv < 0)
|
||||
return (rv);
|
||||
rv = COMPRESS_BZIP2;
|
||||
bzip2_count++;
|
||||
}
|
||||
}
|
||||
|
||||
if (dst2len < *dstlen) {
|
||||
smaller_dstlen = dst2len;
|
||||
smaller_dst = dst2;
|
||||
} else {
|
||||
smaller_dstlen = *dstlen;
|
||||
smaller_dst = dst;
|
||||
rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
|
||||
if (rv < 0)
|
||||
return (rv);
|
||||
rv = COMPRESS_PPMD;
|
||||
ppmd_count++;
|
||||
}
|
||||
|
||||
*inc += 1;
|
||||
if (smaller_dst != dst) {
|
||||
memcpy(dst, smaller_dst, smaller_dstlen);
|
||||
*dstlen = smaller_dstlen;
|
||||
}
|
||||
slab_free(NULL, dst2);
|
||||
return (rv);
|
||||
}
|
||||
|
||||
|
|
|
@ -67,7 +67,8 @@
|
|||
|
||||
#include "rabin_dedup.h"
|
||||
|
||||
#define FORTY_PCNT(x) (((x)/5 << 1))
|
||||
#define FORTY_PCNT(x) ((x)/5 << 1)
|
||||
#define FIFTY_PCNT(x) ((x) >> 1)
|
||||
#define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3))
|
||||
|
||||
extern int lzma_init(void **data, int *level, ssize_t chunksize);
|
||||
|
@ -170,11 +171,27 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
|
|||
|
||||
ctx->fixed_flag = fixed_flag;
|
||||
ctx->rabin_break_patt = 0;
|
||||
ctx->delta_flag = delta_flag;
|
||||
ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
|
||||
ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1;
|
||||
ctx->rabin_poly_min_block_size = dedupe_min_blksz(rab_blk_sz);
|
||||
ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size;
|
||||
ctx->delta_flag = 0;
|
||||
|
||||
/*
|
||||
* Scale down similarity percentage based on avg block size unless user specified
|
||||
* argument '-EE' in which case fixed 40% match is used for Delta compression.
|
||||
*/
|
||||
if (delta_flag == DELTA_NORMAL) {
|
||||
if (ctx->rabin_poly_avg_block_size < (1 << 14)) {
|
||||
ctx->delta_flag = 1;
|
||||
} else if (ctx->rabin_poly_avg_block_size < (1 << 16)) {
|
||||
ctx->delta_flag = 2;
|
||||
} else {
|
||||
ctx->delta_flag = 3;
|
||||
}
|
||||
} else if (delta_flag == DELTA_EXTRA) {
|
||||
ctx->delta_flag = 1;
|
||||
}
|
||||
|
||||
if (!fixed_flag)
|
||||
ctx->blknum = chunksize / ctx->rabin_poly_min_block_size;
|
||||
|
@ -356,7 +373,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
|
|||
j = 0;
|
||||
|
||||
for (i=offset; i<*size; i++) {
|
||||
ssize_t pc[3];
|
||||
ssize_t pc[4];
|
||||
uchar_t cur_byte = buf1[i];
|
||||
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
|
||||
ctx->current_window_data[ctx->window_pos] = cur_byte;
|
||||
|
@ -414,7 +431,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
|
|||
*/
|
||||
if (ctx->delta_flag) {
|
||||
pc[1] = SIXTY_PCNT(j);
|
||||
pc[2] = FORTY_PCNT(j);
|
||||
pc[2] = FIFTY_PCNT(j);
|
||||
pc[3] = FORTY_PCNT(j);
|
||||
|
||||
reset_heap(&heap, pc[ctx->delta_flag]);
|
||||
ksmallest(fplist, j, &heap);
|
||||
|
@ -444,7 +462,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
|
|||
|
||||
if (j > 1) {
|
||||
pc[1] = SIXTY_PCNT(j);
|
||||
pc[2] = FORTY_PCNT(j);
|
||||
pc[2] = FIFTY_PCNT(j);
|
||||
pc[3] = FORTY_PCNT(j);
|
||||
reset_heap(&heap, pc[ctx->delta_flag]);
|
||||
ksmallest(fplist, j, &heap);
|
||||
cur_sketch =
|
||||
|
|
Loading…
Reference in a new issue