diff --git a/README.md b/README.md index a65e087..2ce5bbf 100644 --- a/README.md +++ b/README.md @@ -82,11 +82,13 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library Perform Delta Encoding in addition to Identical Dedup: pcompress -E ... - This also implies '-D'. This performs Delta Compression - between 2 blocks if they are at least 60% similar. + between 2 blocks if they are 40% to 60% similar. The + similarity percentage is selected based on the dedupe block + size to balance performance and effectiveness. pcompress -EE .. - This causes Delta Compression to happen if 2 blocks are - at least 40% similar. This can effect greater final - compression ratio at the cost of higher processing - overhead. + at least 40% similar regardless of block size. This can + effect greater final compression ratio at the cost of + higher processing overhead. Number of threads can optionally be specified: -t <1 - 256 count> Other flags: diff --git a/adaptive_compress.c b/adaptive_compress.c index 00d8b96..c25848c 100644 --- a/adaptive_compress.c +++ b/adaptive_compress.c @@ -140,55 +140,43 @@ adapt_compress(void *src, size_t srclen, void *dst, size_t *dstlen, int level, uchar_t chdr, void *data) { struct adapt_data *adat = (struct adapt_data *)(data); - int rv, rv1, rv2; - unsigned int *inc; - size_t dst2len, dst3len, smaller_dstlen; - uchar_t *dst2, *smaller_dst; - void *tmp; + uchar_t *src1 = (uchar_t *)src; + size_t i, bincount; + int rv; - dst2 = slab_alloc(NULL, *dstlen); - if (!dst2) { - fprintf(stderr, "Adapt: Out of memory\n"); - return (-1); - } + /* + * Count number of 8-bit binary bytes in source. 
+ */ + bincount = 0; + for (i = 0; i < srclen; i++) + bincount += (src1[i] >> 7); - rv = COMPRESS_PPMD; - inc = &ppmd_count; - dst2len = *dstlen; - dst3len = *dstlen; - rv1 = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data); - if (rv1 < 0) *dstlen = dst3len; - - if (adat->adapt_mode == 2) { - rv2 = lzma_compress(src, srclen, dst2, &dst2len, level, chdr, adat->lzma_data); - if (rv2 < 0) dst2len = dst3len; - if (dst2len < *dstlen) { - inc = &lzma_count; + /* + * Use PPMd if at least 70% of source is 7-bit textual bytes, otherwise + * use Bzip2 or LZMA. + */ + if (bincount > (srclen / 10 * 3)) { + if (adat->adapt_mode == 2) { + rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data); + if (rv < 0) + return (rv); rv = COMPRESS_LZMA; - } - } else { - rv2 = bzip2_compress(src, srclen, dst2, &dst2len, level, chdr, NULL); - if (rv2 < 0) dst2len = dst3len; - if (dst2len < *dstlen) { - inc = &bzip2_count; + lzma_count++; + } else { + rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL); + if (rv < 0) + return (rv); rv = COMPRESS_BZIP2; + bzip2_count++; } - } - - if (dst2len < *dstlen) { - smaller_dstlen = dst2len; - smaller_dst = dst2; } else { - smaller_dstlen = *dstlen; - smaller_dst = dst; + rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data); + if (rv < 0) + return (rv); + rv = COMPRESS_PPMD; + ppmd_count++; } - *inc += 1; - if (smaller_dst != dst) { - memcpy(dst, smaller_dst, smaller_dstlen); - *dstlen = smaller_dstlen; - } - slab_free(NULL, dst2); return (rv); } diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index 104381d..3661387 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -67,7 +67,8 @@ #include "rabin_dedup.h" -#define FORTY_PCNT(x) (((x)/5 << 1)) +#define FORTY_PCNT(x) ((x)/5 << 1) +#define FIFTY_PCNT(x) ((x) >> 1) #define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3)) extern int lzma_init(void **data, int *level, ssize_t chunksize); @@ -170,11 +171,27 @@ 
create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s ctx->fixed_flag = fixed_flag; ctx->rabin_break_patt = 0; - ctx->delta_flag = delta_flag; ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS); ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1; ctx->rabin_poly_min_block_size = dedupe_min_blksz(rab_blk_sz); ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size; + ctx->delta_flag = 0; + + /* + * Scale down similarity percentage based on avg block size unless user specified + * argument '-EE' in which case fixed 40% match is used for Delta compression. + */ + if (delta_flag == DELTA_NORMAL) { + if (ctx->rabin_poly_avg_block_size < (1 << 14)) { + ctx->delta_flag = 1; + } else if (ctx->rabin_poly_avg_block_size < (1 << 16)) { + ctx->delta_flag = 2; + } else { + ctx->delta_flag = 3; + } + } else if (delta_flag == DELTA_EXTRA) { + ctx->delta_flag = 1; + } if (!fixed_flag) ctx->blknum = chunksize / ctx->rabin_poly_min_block_size; @@ -356,7 +373,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs j = 0; for (i=offset; i<*size; i++) { - ssize_t pc[3]; + ssize_t pc[4]; uchar_t cur_byte = buf1[i]; uint64_t pushed_out = ctx->current_window_data[ctx->window_pos]; ctx->current_window_data[ctx->window_pos] = cur_byte; @@ -414,7 +431,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs */ if (ctx->delta_flag) { pc[1] = SIXTY_PCNT(j); - pc[2] = FORTY_PCNT(j); + pc[2] = FIFTY_PCNT(j); + pc[3] = FORTY_PCNT(j); reset_heap(&heap, pc[ctx->delta_flag]); ksmallest(fplist, j, &heap); @@ -444,7 +462,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs if (j > 1) { pc[1] = SIXTY_PCNT(j); - pc[2] = FORTY_PCNT(j); + pc[2] = FIFTY_PCNT(j); + pc[3] = FORTY_PCNT(j); reset_heap(&heap, pc[ctx->delta_flag]); ksmallest(fplist, j, &heap); cur_sketch =