From c7cc7b469ce02b41d8af0831e72d0695d58aee0d Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Fri, 27 Jul 2012 22:03:24 +0530 Subject: [PATCH] Update chunk size computation to reduce memory usage. Implement runtime bypass of custom allocator. Update README. --- README.md | 82 +++++++++++++++++++++++++++++++++++++++- allocator.c | 13 ++++++- main.c | 21 ++++++---- pcompress.h | 1 + rabin/rabin_polynomial.c | 6 +++ rabin/rabin_polynomial.h | 3 +- zlib_compress.c | 8 ++++ 7 files changed, 122 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index b447833..f3bd059 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,82 @@ -pcompress +Pcompress ========= -A Parallel Compression/Decompression utility \ No newline at end of file +Copyright (C) 2012 Moinak Ghosh. All rights reserved. +Use is subject to license terms. + +Pcompress is a utility to do compression and decompression in parallel by +splitting input data into chunks. It has a modular structure and includes +support for multiple algorithms like LZMA, Bzip2, PPMD, etc., with CRC64 +chunk checksums. SSE optimizations for the bundled LZMA are included. It +also implements chunk-level Content-Aware Deduplication and Delta +Compression features based on a Semi-Rabin Fingerprinting scheme. Delta +Compression is implemented via the widely popular bsdiff algorithm. +Similarity is detected using a custom hashing of maximal features of a +block. When doing chunk-level dedupe it attempts to merge adjacent +non-duplicate blocks index entries into a single larger entry to reduce +metadata. In addition to all these it can internally split chunks at +rabin boundaries to help dedupe and compression. + +It has low metadata overhead and overlaps I/O and compression to achieve +maximum parallelism. It also bundles a simple slab allocator to speed +repeated allocation of similar chunks. It can work in pipe mode, reading +from stdin and writing to stdout. 
It also provides some adaptive compression +modes in which multiple algorithms are tried per chunk to determine the best +one for the given chunk. Finally it supports 14 compression levels to allow +for ultra compression modes in some algorithms. + +Usage +===== + + To compress a file: + pcompress -c [-l ] [-s ] + Where can be the following: + lzfx - Very fast and small algorithm based on LZF. + lz4 - Ultra fast, high-throughput algorithm reaching RAM B/W at level1. + zlib - The base Zlib format compression (not Gzip). + lzma - The LZMA (Lempel-Ziv Markov) algorithm from 7Zip. + bzip2 - Bzip2 Algorithm from libbzip2. + ppmd - The PPMd algorithm excellent for textual data. PPMd requires + at least 64MB X CPUs more memory than the other modes. + adapt - Adaptive mode where ppmd or bzip2 will be used per chunk, + depending on which one produces better compression. This mode + is obviously fairly slow and requires lots of memory. + adapt2 - Adaptive mode which includes ppmd and lzma. This requires + more memory than adapt mode, is slower and potentially gives + the best compression. + - This can be in bytes or can use the following suffixes: + g - Gigabyte, m - Megabyte, k - Kilobyte. + Larger chunks produce better compression at the cost of memory. + - Can be a number from 0 meaning minimum and 14 meaning + maximum compression. + + To decompress a file compressed using above command: + pcompress -d + + To operate as a pipe, read from stdin and write to stdout: + pcompress -p ... + + Attempt Rabin fingerprinting based deduplication on chunks: + pcompress -D ... + pcompress -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split. + + Perform Delta Encoding in addition to Exact Dedup: + pcompress -E ... - This also implies '-D'. 
+ + Number of threads can optionally be specified: -t <1 - 256 count> + Pass '-M' to display memory allocator statistics + Pass '-C' to display compression statistics + +Examples +======== + +Compress "file.tar" using bzip2 level 6, 64MB chunk size and use 4 threads. In +addition perform exact deduplication and delta compression prior to compression. + + pcompress -D -E -c bzip2 -l6 -s64m -t4 file.tar + +Compress "file.tar" using extreme compression mode of LZMA and a chunk size +of 1GB. Allow pcompress to detect the number of CPU cores and use as many threads. + + pcompress -c lzma -l14 -s1g file.tar + diff --git a/allocator.c b/allocator.c index b0ac4cf..8562c15 100644 --- a/allocator.c +++ b/allocator.c @@ -96,7 +96,7 @@ static struct bufentry **htable; static pthread_mutex_t *hbucket_locks; static pthread_mutex_t htable_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t slab_table_lock = PTHREAD_MUTEX_INITIALIZER; -static int inited = 0; +static int inited = 0, bypass = 0; static uint64_t total_allocs, oversize_allocs, hash_collisions, hash_entries; @@ -124,6 +124,12 @@ slab_init() size_t slab_sz; int nprocs; + /* Check bypass env variable. */ + if (getenv("ALLOCATOR_BYPASS") != NULL) { + bypass = 1; + return; + } + /* Initialize first NUM_POW2 power of 2 slots. */ slab_sz = SLAB_START_SZ; for (i = 0; i < NUM_POW2; i++) { @@ -177,6 +183,7 @@ slab_cleanup(int quiet) uint64_t nonfreed_oversize; if (!inited) return; + if (bypass) return; if (!quiet) { fprintf(stderr, "Slab Allocation Stats\n"); @@ -276,6 +283,7 @@ void * slab_calloc(void *p, size_t items, size_t size) { void *ptr; + if (bypass) return(calloc(items, size)); ptr = slab_alloc(p, items * size); memset(ptr, 0, items * size); return (ptr); @@ -338,6 +346,7 @@ slab_cache_add(size_t size) { uint32_t sindx; struct slabentry *slab; + if (bypass) return (0); if (try_dynamic_slab(size)) return (0); /* Already added. */ /* Locate the hash slot for the size. 
*/ @@ -375,6 +384,7 @@ slab_alloc(void *p, size_t size) void *ptr; struct slabentry *slab; + if (bypass) return (malloc(size)); ATOMIC_ADD(total_allocs, 1); slab = NULL; @@ -444,6 +454,7 @@ slab_free(void *p, void *address) uint32_t hindx; if (!address) return; + if (bypass) { free(address); return; } hindx = hash6432shift((uint64_t)(address)) & (HTABLE_SZ - 1); pthread_mutex_lock(&hbucket_locks[hindx]); diff --git a/main.c b/main.c index 5ff3599..6fa7101 100644 --- a/main.c +++ b/main.c @@ -45,6 +45,7 @@ #include #include #include +#include /* Needed for CLzmaEncprops. */ #include @@ -788,7 +789,7 @@ start_compress(const char *filename, uint64_t chunksize, int level) rabin_context_t *rctx; /* - * Compressed buffer size must include zlib scratch space and + * Compressed buffer size must include zlib/dedup scratch space and * chunk header space. * See http://www.zlib.net/manual.html#compress2 * @@ -799,19 +800,23 @@ start_compress(const char *filename, uint64_t chunksize, int level) * See start_decompress() routine for details of chunk header. * We also keep extra 8-byte space for the last chunk's size. */ - compressed_chunksize = chunksize + (chunksize >> 6) + - sizeof (chunksize) + sizeof (uint64_t) + sizeof (chunksize); - err = 0; + compressed_chunksize = chunksize + sizeof (chunksize) + + sizeof (uint64_t) + sizeof (chunksize) + zlib_buf_extra(chunksize); + flags = 0; + if (enable_rabin_scan) { + flags |= FLAG_DEDUP; + /* Additional scratch space for dedup arrays. */ + compressed_chunksize += (rabin_buf_extra(chunksize) - + (compressed_chunksize - chunksize)); + } + + err = 0; thread = 0; slab_cache_add(chunksize); slab_cache_add(compressed_chunksize + CHDR_SZ); slab_cache_add(sizeof (struct cmp_data)); - if (enable_rabin_scan) { - flags |= FLAG_DEDUP; - } - /* A host of sanity checks. 
*/ if (!pipe_mode) { if ((uncompfd = open(filename, O_RDWR, 0)) == -1) diff --git a/pcompress.h b/pcompress.h index 1eaa7f2..05cdfc7 100644 --- a/pcompress.h +++ b/pcompress.h @@ -54,6 +54,7 @@ extern "C" { extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc); extern uint64_t lzma_crc64_8bchk(const uint8_t *buf, size_t size, uint64_t crc, uint64_t *cnt); +extern uint32_t zlib_buf_extra(ssize_t buflen); extern int zlib_compress(void *src, size_t srclen, void *dst, size_t *destlen, int level, uchar_t chdr, void *data); diff --git a/rabin/rabin_polynomial.c b/rabin/rabin_polynomial.c index 7dc0789..b655a34 100755 --- a/rabin/rabin_polynomial.c +++ b/rabin/rabin_polynomial.c @@ -81,6 +81,12 @@ extern int bspatch(u_char *pbuf, u_char *old, bsize_t oldsize, u_char *new, uint32_t rabin_polynomial_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE; +uint32_t +rabin_buf_extra(uint64_t chunksize) +{ + return ((chunksize / RAB_POLYNOMIAL_MIN_BLOCK_SIZE2) * sizeof (uint32_t)); +} + /* * Initialize the algorithm with the default params. 
*/ diff --git a/rabin/rabin_polynomial.h b/rabin/rabin_polynomial.h index 91eb54e..2365c00 100644 --- a/rabin/rabin_polynomial.h +++ b/rabin/rabin_polynomial.h @@ -168,5 +168,6 @@ extern void rabin_parse_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_i extern void rabin_update_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp, ssize_t rabin_data_sz_cmp); extern void reset_rabin_context(rabin_context_t *ctx); +extern uint32_t rabin_buf_extra(uint64_t chunksize); -#endif /* _RABIN_POLY_H_ */ \ No newline at end of file +#endif /* _RABIN_POLY_H_ */ \ No newline at end of file diff --git a/zlib_compress.c b/zlib_compress.c index c2a3723..382b612 100644 --- a/zlib_compress.c +++ b/zlib_compress.c @@ -43,6 +43,14 @@ slab_alloc_ui(void *p, unsigned int items, unsigned int size) { return (ptr); } +uint32_t +zlib_buf_extra(ssize_t buflen) +{ + if (buflen > SINGLE_CALL_MAX) + buflen = SINGLE_CALL_MAX; + return (compressBound(buflen) - buflen); +} + int zlib_init(void **data, int *level, ssize_t chunksize) {