Speed up adaptive modes by using heuristics to select compression algorithm.
Select similarity percentage based on dedupe block size for effectiveness.
This commit is contained in:
parent
333b7b011e
commit
449dc35675
3 changed files with 59 additions and 50 deletions
10
README.md
10
README.md
|
@ -82,11 +82,13 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
|
||||||
|
|
||||||
Perform Delta Encoding in addition to Identical Dedup:
|
Perform Delta Encoding in addition to Identical Dedup:
|
||||||
pcompress -E ... - This also implies '-D'. This performs Delta Compression
|
pcompress -E ... - This also implies '-D'. This performs Delta Compression
|
||||||
between 2 blocks if they are at least 60% similar.
|
between 2 blocks if they are at least 40% to 60% similar. The
|
||||||
|
similarity percentage is selected based on the dedupe block
|
||||||
|
size to balance performance and effectiveness.
|
||||||
pcompress -EE .. - This causes Delta Compression to happen if 2 blocks are
|
pcompress -EE .. - This causes Delta Compression to happen if 2 blocks are
|
||||||
at least 40% similar. This can effect greater final
|
at least 40% similar regardless of block size. This can
|
||||||
compression ratio at the cost of higher processing
|
effect greater final compression ratio at the cost of
|
||||||
overhead.
|
higher processing overhead.
|
||||||
|
|
||||||
Number of threads can optionally be specified: -t <1 - 256 count>
|
Number of threads can optionally be specified: -t <1 - 256 count>
|
||||||
Other flags:
|
Other flags:
|
||||||
|
|
|
@ -140,55 +140,43 @@ adapt_compress(void *src, size_t srclen, void *dst,
|
||||||
size_t *dstlen, int level, uchar_t chdr, void *data)
|
size_t *dstlen, int level, uchar_t chdr, void *data)
|
||||||
{
|
{
|
||||||
struct adapt_data *adat = (struct adapt_data *)(data);
|
struct adapt_data *adat = (struct adapt_data *)(data);
|
||||||
int rv, rv1, rv2;
|
uchar_t *src1 = (uchar_t *)src;
|
||||||
unsigned int *inc;
|
size_t i, bincount;
|
||||||
size_t dst2len, dst3len, smaller_dstlen;
|
int rv;
|
||||||
uchar_t *dst2, *smaller_dst;
|
|
||||||
void *tmp;
|
|
||||||
|
|
||||||
dst2 = slab_alloc(NULL, *dstlen);
|
/*
|
||||||
if (!dst2) {
|
* Count number of 8-bit binary bytes in source.
|
||||||
fprintf(stderr, "Adapt: Out of memory\n");
|
*/
|
||||||
return (-1);
|
bincount = 0;
|
||||||
}
|
for (i = 0; i < srclen; i++)
|
||||||
|
bincount += (src1[i] >> 7);
|
||||||
|
|
||||||
rv = COMPRESS_PPMD;
|
/*
|
||||||
inc = &ppmd_count;
|
* Use PPMd if at least 70% of source is 7-bit textual bytes, otherwise
|
||||||
dst2len = *dstlen;
|
* use Bzip2 or LZMA.
|
||||||
dst3len = *dstlen;
|
*/
|
||||||
rv1 = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
|
if (bincount > (srclen / 10 * 3)) {
|
||||||
if (rv1 < 0) *dstlen = dst3len;
|
if (adat->adapt_mode == 2) {
|
||||||
|
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
|
||||||
if (adat->adapt_mode == 2) {
|
if (rv < 0)
|
||||||
rv2 = lzma_compress(src, srclen, dst2, &dst2len, level, chdr, adat->lzma_data);
|
return (rv);
|
||||||
if (rv2 < 0) dst2len = dst3len;
|
|
||||||
if (dst2len < *dstlen) {
|
|
||||||
inc = &lzma_count;
|
|
||||||
rv = COMPRESS_LZMA;
|
rv = COMPRESS_LZMA;
|
||||||
}
|
lzma_count++;
|
||||||
} else {
|
} else {
|
||||||
rv2 = bzip2_compress(src, srclen, dst2, &dst2len, level, chdr, NULL);
|
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
|
||||||
if (rv2 < 0) dst2len = dst3len;
|
if (rv < 0)
|
||||||
if (dst2len < *dstlen) {
|
return (rv);
|
||||||
inc = &bzip2_count;
|
|
||||||
rv = COMPRESS_BZIP2;
|
rv = COMPRESS_BZIP2;
|
||||||
|
bzip2_count++;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if (dst2len < *dstlen) {
|
|
||||||
smaller_dstlen = dst2len;
|
|
||||||
smaller_dst = dst2;
|
|
||||||
} else {
|
} else {
|
||||||
smaller_dstlen = *dstlen;
|
rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
|
||||||
smaller_dst = dst;
|
if (rv < 0)
|
||||||
|
return (rv);
|
||||||
|
rv = COMPRESS_PPMD;
|
||||||
|
ppmd_count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
*inc += 1;
|
|
||||||
if (smaller_dst != dst) {
|
|
||||||
memcpy(dst, smaller_dst, smaller_dstlen);
|
|
||||||
*dstlen = smaller_dstlen;
|
|
||||||
}
|
|
||||||
slab_free(NULL, dst2);
|
|
||||||
return (rv);
|
return (rv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -67,7 +67,8 @@
|
||||||
|
|
||||||
#include "rabin_dedup.h"
|
#include "rabin_dedup.h"
|
||||||
|
|
||||||
#define FORTY_PCNT(x) (((x)/5 << 1))
|
#define FORTY_PCNT(x) ((x)/5 << 1)
|
||||||
|
#define FIFTY_PCNT(x) ((x) >> 1)
|
||||||
#define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3))
|
#define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3))
|
||||||
|
|
||||||
extern int lzma_init(void **data, int *level, ssize_t chunksize);
|
extern int lzma_init(void **data, int *level, ssize_t chunksize);
|
||||||
|
@ -170,11 +171,27 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
|
||||||
|
|
||||||
ctx->fixed_flag = fixed_flag;
|
ctx->fixed_flag = fixed_flag;
|
||||||
ctx->rabin_break_patt = 0;
|
ctx->rabin_break_patt = 0;
|
||||||
ctx->delta_flag = delta_flag;
|
|
||||||
ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
|
ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
|
||||||
ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1;
|
ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1;
|
||||||
ctx->rabin_poly_min_block_size = dedupe_min_blksz(rab_blk_sz);
|
ctx->rabin_poly_min_block_size = dedupe_min_blksz(rab_blk_sz);
|
||||||
ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size;
|
ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size;
|
||||||
|
ctx->delta_flag = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Scale down similarity percentage based on avg block size unless user specified
|
||||||
|
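* argument '-EE' in which case fixed 40% match is used for Delta compression. NOTE(review): the DELTA_EXTRA branch below sets delta_flag = 1, which selects pc[1] (SIXTY_PCNT) rather than pc[3] (FORTY_PCNT) -- confirm intended index.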
|
||||||
|
*/
|
||||||
|
if (delta_flag == DELTA_NORMAL) {
|
||||||
|
if (ctx->rabin_poly_avg_block_size < (1 << 14)) {
|
||||||
|
ctx->delta_flag = 1;
|
||||||
|
} else if (ctx->rabin_poly_avg_block_size < (1 << 16)) {
|
||||||
|
ctx->delta_flag = 2;
|
||||||
|
} else {
|
||||||
|
ctx->delta_flag = 3;
|
||||||
|
}
|
||||||
|
} else if (delta_flag == DELTA_EXTRA) {
|
||||||
|
ctx->delta_flag = 1;
|
||||||
|
}
|
||||||
|
|
||||||
if (!fixed_flag)
|
if (!fixed_flag)
|
||||||
ctx->blknum = chunksize / ctx->rabin_poly_min_block_size;
|
ctx->blknum = chunksize / ctx->rabin_poly_min_block_size;
|
||||||
|
@ -356,7 +373,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
|
||||||
j = 0;
|
j = 0;
|
||||||
|
|
||||||
for (i=offset; i<*size; i++) {
|
for (i=offset; i<*size; i++) {
|
||||||
ssize_t pc[3];
|
ssize_t pc[4];
|
||||||
uchar_t cur_byte = buf1[i];
|
uchar_t cur_byte = buf1[i];
|
||||||
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
|
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
|
||||||
ctx->current_window_data[ctx->window_pos] = cur_byte;
|
ctx->current_window_data[ctx->window_pos] = cur_byte;
|
||||||
|
@ -414,7 +431,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
|
||||||
*/
|
*/
|
||||||
if (ctx->delta_flag) {
|
if (ctx->delta_flag) {
|
||||||
pc[1] = SIXTY_PCNT(j);
|
pc[1] = SIXTY_PCNT(j);
|
||||||
pc[2] = FORTY_PCNT(j);
|
pc[2] = FIFTY_PCNT(j);
|
||||||
|
pc[3] = FORTY_PCNT(j);
|
||||||
|
|
||||||
reset_heap(&heap, pc[ctx->delta_flag]);
|
reset_heap(&heap, pc[ctx->delta_flag]);
|
||||||
ksmallest(fplist, j, &heap);
|
ksmallest(fplist, j, &heap);
|
||||||
|
@ -444,7 +462,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
|
||||||
|
|
||||||
if (j > 1) {
|
if (j > 1) {
|
||||||
pc[1] = SIXTY_PCNT(j);
|
pc[1] = SIXTY_PCNT(j);
|
||||||
pc[2] = FORTY_PCNT(j);
|
pc[2] = FIFTY_PCNT(j);
|
||||||
|
pc[3] = FORTY_PCNT(j);
|
||||||
reset_heap(&heap, pc[ctx->delta_flag]);
|
reset_heap(&heap, pc[ctx->delta_flag]);
|
||||||
ksmallest(fplist, j, &heap);
|
ksmallest(fplist, j, &heap);
|
||||||
cur_sketch =
|
cur_sketch =
|
||||||
|
|
Loading…
Reference in a new issue