From 8f8af7ed6b9cb84d2d6894834c67b283fdd0ea12 Mon Sep 17 00:00:00 2001
From: Moinak Ghosh <moinakg@gmail.com>
Date: Thu, 27 Sep 2012 22:29:08 +0530
Subject: [PATCH] Update adaptive mode heuristic based on algorithms. Remove
 incorrect check in PPMd decompression code. More refactoring of variable
 names.

---
 README.md           | 10 +++++-----
 adaptive_compress.c | 39 +++++++++++++++++++++------------------
 main.c              | 12 ++++++------
 ppmd_compress.c     |  3 ---
 rabin/rabin_dedup.c | 14 +++++++-------
 rabin/rabin_dedup.h | 10 +++++-----
 6 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/README.md b/README.md
index 2ce5bbf..7164d27 100644
--- a/README.md
+++ b/README.md
@@ -52,11 +52,11 @@ Usage
                 Bzip2 (See: libbsc.com).
 
        adapt  - Adaptive mode where ppmd or bzip2 will be used per chunk,
-                depending on which one produces better compression. This mode
-                is obviously fairly slow and requires lots of memory.
-       adapt2 - Adaptive mode which includes ppmd and lzma. This requires
-                more memory than adapt mode, is slower and potentially gives
-                the best compression.
+                depending on heuristics. If at least 50% of the input data is
+                7-bit text then PPMd will be used, otherwise Bzip2.
+       adapt2 - Adaptive mode which includes ppmd and lzma. If at least 80% of
+                the input data is 7-bit text then PPMd will be used, otherwise
+                LZMA. It uses significantly more memory than adapt.
        none   - No compression. This is only meaningful with -D and -E so Dedupe
                 can be done for post-processing with an external utility.
        <chunk_size> - This can be in bytes or can use the following suffixes:
diff --git a/adaptive_compress.c b/adaptive_compress.c
index c25848c..a952851 100644
--- a/adaptive_compress.c
+++ b/adaptive_compress.c
@@ -35,6 +35,9 @@
 #include <pcompress.h>
 #include <allocator.h>
 
+#define	FIFTY_PCT(x)	(((x)/10) * 5)
+#define	TWENTY_PCT(x)	(((x)/10) * 2)
+
 static unsigned int lzma_count = 0;
 static unsigned int bzip2_count = 0;
 static unsigned int ppmd_count = 0;
@@ -141,34 +144,34 @@ adapt_compress(void *src, size_t srclen, void *dst,
 {
 	struct adapt_data *adat = (struct adapt_data *)(data);
 	uchar_t *src1 = (uchar_t *)src;
-	size_t i, bincount;
+	size_t i, tot8b;
 	int rv;
 
 	/*
 	 * Count number of 8-bit binary bytes in source.
 	 */
-	bincount = 0;
+	tot8b = 0;
 	for (i = 0; i < srclen; i++)
-		bincount += (src1[i] >> 7);
+		tot8b += (src1[i] >> 7);
 
 	/*
-	 * Use PPMd if at least 70% of source is 7-bit textual bytes, otherwise
+	 * Use PPMd if at least 80% (adapt2) or 50% (adapt) of source is 7-bit text, otherwise
 	 * use Bzip2 or LZMA.
 	 */
-	if (bincount > (srclen / 10 * 3)) {
-		if (adat->adapt_mode == 2) {
-			rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
-			if (rv < 0)
-				return (rv);
-			rv = COMPRESS_LZMA;
-			lzma_count++;
-		} else {
-			rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
-			if (rv < 0)
-				return (rv);
-			rv = COMPRESS_BZIP2;
-			bzip2_count++;
-		}
+	if (adat->adapt_mode == 2 && tot8b > TWENTY_PCT(srclen)) {
+		rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
+		if (rv < 0)
+			return (rv);
+		rv = COMPRESS_LZMA;
+		lzma_count++;
+
+	} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
+		rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
+		if (rv < 0)
+			return (rv);
+		rv = COMPRESS_BZIP2;
+		bzip2_count++;
+
 	} else {
 		rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
 		if (rv < 0)
diff --git a/main.c b/main.c
index 036f8a6..2749c50 100644
--- a/main.c
+++ b/main.c
@@ -265,7 +265,7 @@ perform_decompress(void *dat)
 {
 	struct cmp_data *tdat = (struct cmp_data *)dat;
 	ssize_t _chunksize;
-	ssize_t dedupe_index_sz, rabin_data_sz, dedupe_index_sz_cmp, rabin_data_sz_cmp;
+	ssize_t dedupe_index_sz, dedupe_data_sz, dedupe_index_sz_cmp, dedupe_data_sz_cmp;
 	int type, rv;
 	unsigned int blknum;
 	uchar_t checksum[CKSUM_MAX_BYTES];
@@ -305,9 +305,9 @@ redo:
 	if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) {
 		uchar_t *cmpbuf, *ubuf;
 
-		/* Extract various sizes from rabin header. */
-		parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &rabin_data_sz,
-				&dedupe_index_sz_cmp, &rabin_data_sz_cmp, &_chunksize);
+		/* Extract various sizes from dedupe header. */
+		parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &dedupe_data_sz,
+				&dedupe_index_sz_cmp, &dedupe_data_sz_cmp, &_chunksize);
 		memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE);
 
 		/*
@@ -320,10 +320,10 @@ redo:
 		ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + dedupe_index_sz;
 		if (HDR & COMPRESSED) {
 			if (HDR & CHUNK_FLAG_PREPROC) {
-				rv = preproc_decompress(tdat->decompress, cmpbuf, rabin_data_sz_cmp,
+				rv = preproc_decompress(tdat->decompress, cmpbuf, dedupe_data_sz_cmp,
 				    ubuf, &_chunksize, tdat->level, HDR, tdat->data);
 			} else {
-				rv = tdat->decompress(cmpbuf, rabin_data_sz_cmp, ubuf, &_chunksize,
+				rv = tdat->decompress(cmpbuf, dedupe_data_sz_cmp, ubuf, &_chunksize,
 				    tdat->level, HDR, tdat->data);
 			}
 			if (rv == -1) {
diff --git a/ppmd_compress.c b/ppmd_compress.c
index 35e377b..1940b83 100644
--- a/ppmd_compress.c
+++ b/ppmd_compress.c
@@ -130,9 +130,6 @@ ppmd_decompress(void *src, size_t srclen, void *dst,
 	size_t i;
 	int res;
 
-	if (*((char *)_src) < 2)
-		return (-1);
-
 	_ppmd->buf = (Byte *)_src;
 	_ppmd->bufLen = srclen;
 	_ppmd->bufUsed = 0;
diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c
index 3661387..976c831 100755
--- a/rabin/rabin_dedup.c
+++ b/rabin/rabin_dedup.c
@@ -707,20 +707,20 @@ cont:
 }
 
 void
-update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_sz_cmp)
+update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t dedupe_data_sz_cmp)
 {
 	ssize_t *entries;
 
 	buf += sizeof (uint32_t);
 	entries = (ssize_t *)buf;
 	entries[1] = htonll(dedupe_index_sz_cmp);
-	entries[3] = htonll(rabin_data_sz_cmp);
+	entries[3] = htonll(dedupe_data_sz_cmp);
 }
 
 void
 parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
-		ssize_t *rabin_data_sz, ssize_t *dedupe_index_sz_cmp,
-		ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size)
+		ssize_t *dedupe_data_sz, ssize_t *dedupe_index_sz_cmp,
+		ssize_t *dedupe_data_sz_cmp, ssize_t *deduped_size)
 {
 	ssize_t *entries;
 
@@ -728,11 +728,11 @@ parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
 	buf += sizeof (uint32_t);
 
 	entries = (ssize_t *)buf;
-	*rabin_data_sz = ntohll(entries[0]);
+	*dedupe_data_sz = ntohll(entries[0]);
 	*dedupe_index_sz = (ssize_t)(*blknum) * RABIN_ENTRY_SIZE;
 	*dedupe_index_sz_cmp =  ntohll(entries[1]);
-	*rabin_deduped_size = ntohll(entries[2]);
-	*rabin_data_sz_cmp = ntohll(entries[3]);
+	*deduped_size = ntohll(entries[2]);
+	*dedupe_data_sz_cmp = ntohll(entries[3]);
 }
 
 void
diff --git a/rabin/rabin_dedup.h b/rabin/rabin_dedup.h
index 938b388..ad84f23 100644
--- a/rabin/rabin_dedup.h
+++ b/rabin/rabin_dedup.h
@@ -165,11 +165,11 @@ extern void destroy_dedupe_context(dedupe_context_t *ctx);
 extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf, 
 	ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
 extern void dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size);
-extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_index_sz,
-		ssize_t *rabin_data_sz, ssize_t *rabin_index_sz_cmp,
-		ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size);
-extern void update_dedupe_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp,
-			     ssize_t rabin_data_sz_cmp);
+extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *dedupe_index_sz,
+		ssize_t *dedupe_data_sz, ssize_t *dedupe_index_sz_cmp,
+		ssize_t *dedupe_data_sz_cmp, ssize_t *deduped_size);
+extern void update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp,
+			     ssize_t dedupe_data_sz_cmp);
 extern void reset_dedupe_context(dedupe_context_t *ctx);
 extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
 	int delta_flag);