From 8f8af7ed6b9cb84d2d6894834c67b283fdd0ea12 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Thu, 27 Sep 2012 22:29:08 +0530 Subject: [PATCH] Update adaptive mode heuristic based on algorithms. Remove incorrect check in PPMd decompression code. More refactoring of variable names. --- README.md | 10 +++++----- adaptive_compress.c | 39 +++++++++++++++++++++------------------ main.c | 12 ++++++------ ppmd_compress.c | 3 --- rabin/rabin_dedup.c | 14 +++++++------- rabin/rabin_dedup.h | 10 +++++----- 6 files changed, 44 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 2ce5bbf..7164d27 100644 --- a/README.md +++ b/README.md @@ -52,11 +52,11 @@ Usage Bzip2 (See: libbsc.com). adapt - Adaptive mode where ppmd or bzip2 will be used per chunk, - depending on which one produces better compression. This mode - is obviously fairly slow and requires lots of memory. - adapt2 - Adaptive mode which includes ppmd and lzma. This requires - more memory than adapt mode, is slower and potentially gives - the best compression. + depending on heuristics. If at least 50% of the input data is + 7-bit text then PPMd will be used, otherwise Bzip2. + adapt2 - Adaptive mode which includes ppmd and lzma. If at least 80% of + the input data is 7-bit text then PPMd will be used, otherwise + LZMA. It uses significantly more memory than adapt. none - No compression. This is only meaningful with -D and -E so Dedupe can be done for post-processing with an external utility. 
- This can be in bytes or can use the following suffixes: diff --git a/adaptive_compress.c b/adaptive_compress.c index c25848c..a952851 100644 --- a/adaptive_compress.c +++ b/adaptive_compress.c @@ -35,6 +35,9 @@ #include #include +#define FIFTY_PCT(x) (((x)/10) * 5) +#define TWENTY_PCT(x) (((x)/10) * 2) + static unsigned int lzma_count = 0; static unsigned int bzip2_count = 0; static unsigned int ppmd_count = 0; @@ -141,34 +144,34 @@ adapt_compress(void *src, size_t srclen, void *dst, { struct adapt_data *adat = (struct adapt_data *)(data); uchar_t *src1 = (uchar_t *)src; - size_t i, bincount; + size_t i, tot8b; int rv; /* * Count number of 8-bit binary bytes in source. */ - bincount = 0; + tot8b = 0; for (i = 0; i < srclen; i++) - bincount += (src1[i] >> 7); + tot8b += (src1[i] >> 7); /* - * Use PPMd if at least 70% of source is 7-bit textual bytes, otherwise + * Use PPMd if some percentage of source is 7-bit textual bytes, otherwise * use Bzip2 or LZMA. */ - if (bincount > (srclen / 10 * 3)) { - if (adat->adapt_mode == 2) { - rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data); - if (rv < 0) - return (rv); - rv = COMPRESS_LZMA; - lzma_count++; - } else { - rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL); - if (rv < 0) - return (rv); - rv = COMPRESS_BZIP2; - bzip2_count++; - } + if (adat->adapt_mode == 2 && tot8b > TWENTY_PCT(srclen)) { + rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data); + if (rv < 0) + return (rv); + rv = COMPRESS_LZMA; + lzma_count++; + + } else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) { + rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL); + if (rv < 0) + return (rv); + rv = COMPRESS_BZIP2; + bzip2_count++; + } else { rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data); if (rv < 0) diff --git a/main.c b/main.c index 036f8a6..2749c50 100644 --- a/main.c +++ b/main.c @@ -265,7 +265,7 @@ perform_decompress(void *dat) { struct 
cmp_data *tdat = (struct cmp_data *)dat; ssize_t _chunksize; - ssize_t dedupe_index_sz, rabin_data_sz, dedupe_index_sz_cmp, rabin_data_sz_cmp; + ssize_t dedupe_index_sz, dedupe_data_sz, dedupe_index_sz_cmp, dedupe_data_sz_cmp; int type, rv; unsigned int blknum; uchar_t checksum[CKSUM_MAX_BYTES]; @@ -305,9 +305,9 @@ redo: if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) { uchar_t *cmpbuf, *ubuf; - /* Extract various sizes from rabin header. */ - parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &rabin_data_sz, - &dedupe_index_sz_cmp, &rabin_data_sz_cmp, &_chunksize); + /* Extract various sizes from dedupe header. */ + parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &dedupe_data_sz, + &dedupe_index_sz_cmp, &dedupe_data_sz_cmp, &_chunksize); memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE); /* @@ -320,10 +320,10 @@ redo: ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + dedupe_index_sz; if (HDR & COMPRESSED) { if (HDR & CHUNK_FLAG_PREPROC) { - rv = preproc_decompress(tdat->decompress, cmpbuf, rabin_data_sz_cmp, + rv = preproc_decompress(tdat->decompress, cmpbuf, dedupe_data_sz_cmp, ubuf, &_chunksize, tdat->level, HDR, tdat->data); } else { - rv = tdat->decompress(cmpbuf, rabin_data_sz_cmp, ubuf, &_chunksize, + rv = tdat->decompress(cmpbuf, dedupe_data_sz_cmp, ubuf, &_chunksize, tdat->level, HDR, tdat->data); } if (rv == -1) { diff --git a/ppmd_compress.c b/ppmd_compress.c index 35e377b..1940b83 100644 --- a/ppmd_compress.c +++ b/ppmd_compress.c @@ -130,9 +130,6 @@ ppmd_decompress(void *src, size_t srclen, void *dst, size_t i; int res; - if (*((char *)_src) < 2) - return (-1); - _ppmd->buf = (Byte *)_src; _ppmd->bufLen = srclen; _ppmd->bufUsed = 0; diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index 3661387..976c831 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -707,20 +707,20 @@ cont: } void -update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_sz_cmp) +update_dedupe_hdr(uchar_t 
*buf, ssize_t dedupe_index_sz_cmp, ssize_t dedupe_data_sz_cmp) { ssize_t *entries; buf += sizeof (uint32_t); entries = (ssize_t *)buf; entries[1] = htonll(dedupe_index_sz_cmp); - entries[3] = htonll(rabin_data_sz_cmp); + entries[3] = htonll(dedupe_data_sz_cmp); } void parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz, - ssize_t *rabin_data_sz, ssize_t *dedupe_index_sz_cmp, - ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size) + ssize_t *dedupe_data_sz, ssize_t *dedupe_index_sz_cmp, + ssize_t *dedupe_data_sz_cmp, ssize_t *deduped_size) { ssize_t *entries; @@ -728,11 +728,11 @@ parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz, buf += sizeof (uint32_t); entries = (ssize_t *)buf; - *rabin_data_sz = ntohll(entries[0]); + *dedupe_data_sz = ntohll(entries[0]); *dedupe_index_sz = (ssize_t)(*blknum) * RABIN_ENTRY_SIZE; *dedupe_index_sz_cmp = ntohll(entries[1]); - *rabin_deduped_size = ntohll(entries[2]); - *rabin_data_sz_cmp = ntohll(entries[3]); + *deduped_size = ntohll(entries[2]); + *dedupe_data_sz_cmp = ntohll(entries[3]); } void diff --git a/rabin/rabin_dedup.h b/rabin/rabin_dedup.h index 938b388..ad84f23 100644 --- a/rabin/rabin_dedup.h +++ b/rabin/rabin_dedup.h @@ -165,11 +165,11 @@ extern void destroy_dedupe_context(dedupe_context_t *ctx); extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos); extern void dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size); -extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_index_sz, - ssize_t *rabin_data_sz, ssize_t *rabin_index_sz_cmp, - ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size); -extern void update_dedupe_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp, - ssize_t rabin_data_sz_cmp); +extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *dedupe_index_sz, + ssize_t *dedupe_data_sz, ssize_t *rabin_index_sz_cmp, + ssize_t 
*dedupe_data_sz_cmp, ssize_t *deduped_size); +extern void update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, + ssize_t dedupe_data_sz_cmp); extern void reset_dedupe_context(dedupe_context_t *ctx); extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag);