diff --git a/README.md b/README.md index 2f72614..1145bc8 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ maximum parallelism. It also bundles a simple slab allocator to speed repeated allocation of similar chunks. It can work in pipe mode, reading from stdin and writing to stdout. It also provides some adaptive compression modes in which multiple algorithms are tried per chunk to determine the best -one for the given chunk. Finally it support 14 compression levels to allow +one for the given chunk. Finally it supports 14 compression levels to allow for ultra compression modes in some algorithms. Usage @@ -58,7 +58,8 @@ Usage Attempt Rabin fingerprinting based deduplication on chunks: pcompress -D ... - pcompress -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split. + pcompress -D -r ... - Do NOT split chunks at a rabin boundary. Default + is to split. Perform Delta Encoding in addition to Exact Dedup: pcompress -E ... - This also implies '-D'. @@ -67,6 +68,13 @@ Usage Pass '-M' to display memory allocator statistics Pass '-C' to display compression statistics +Environment Variables +===================== + +Set ALLOCATOR_BYPASS=1 in the environment to avoid using the built-in +allocator. Due to the way it rounds up an allocation request to the nearest +slab, the built-in allocator can allocate extra unused memory. + Examples ======== @@ -80,4 +88,41 @@ of 1GB. Allow pcompress to detect the number of CPU cores and use as many thread pcompress -c lzma -l14 -s1g file.tar +Compression Algorithms +====================== +LZFX - Ultra Fast, average compression. This algorithm is the fastest overall. + Levels: 1 - 5 +LZ4 - Very Fast, better compression than LZFX. + Levels: 1 - 3 +Zlib - Fast, better compression. + Levels: 1 - 9 +Bzip2 - Slow, much better compression than Zlib. + Levels: 1 - 9 +LZMA - Very slow. Extreme compression. + Levels: 1 - 14 +PPMD - Slow. Extreme compression for Text, average compression for binary. 
+ Levels: 1 - 14. + +Adapt - Very slow synthetic mode. Both Bzip2 and PPMD are tried per chunk and + better result selected. + Levels: 1 - 14 +Adapt2 - Ultra slow synthetic mode. Both LZMA and PPMD are tried per chunk and + better result selected. Can give best compression ratio when splitting + file into multiple chunks. + Levels: 1 - 14 + +It is possible for a single chunk to span the entire file if enough RAM is +available. However, for adaptive modes to be effective for large files, especially +multi-file archives, splitting into chunks is required so that best compression +algorithm can be selected for textual and binary portions. + +Caveats +======= +This utility can gobble up RAM depending on compression algorithm, +compression level, and dedupe being enabled. Larger chunk sizes can give +better compression ratio but at the same time use more RAM. + +In some cases, for files less than a gigabyte, using Delta Compression in addition +to exact Dedupe can have a slight negative impact on LZMA compression ratio +especially when using the large-window ultra compression levels above 12. diff --git a/allocator.c b/allocator.c index 8562c15..b5bfd5a 100644 --- a/allocator.c +++ b/allocator.c @@ -100,23 +100,6 @@ static int inited = 0, bypass = 0; static uint64_t total_allocs, oversize_allocs, hash_collisions, hash_entries; -/* - * Hash function for 64Bit pointers that generates a 32Bit hash value. 
- * Taken from Thomas Wang's Integer hashing paper: - * http://www.cris.com/~Ttwang/tech/inthash.htm - */ -uint32_t -hash6432shift(uint64_t key) -{ - key = (~key) + (key << 18); // key = (key << 18) - key - 1; - key = key ^ (key >> 31); - key = key * 21; // key = (key + (key << 2)) + (key << 4); - key = key ^ (key >> 11); - key = key + (key << 6); - key = key ^ (key >> 22); - return (uint32_t) key; -} - void slab_init() { diff --git a/bsdiff/bsdiff.c b/bsdiff/bsdiff.c index 6575870..f81ee8f 100644 --- a/bsdiff/bsdiff.c +++ b/bsdiff/bsdiff.c @@ -61,6 +61,11 @@ __FBSDID("$FreeBSD: src/usr.bin/bsdiff/bsdiff/bsdiff.c,v 1.1 2005/08/06 01:59:05 #include #include #include + +#ifdef __USE_SSE_INTRIN__ +#include +#endif + #include "bscommon.h" #define MIN(x,y) (((x)<(y)) ? (x) : (y)) @@ -131,7 +136,28 @@ static void qsufsort(bsize_t *I,bsize_t *V,u_char *old,bsize_t oldsize) bsize_t buckets[256]; bsize_t i,h,len; +#ifdef __USE_SSE_INTRIN__ + if (((bsize_t)buckets & (16 - 1)) == 0) { // 16-byte aligned ? 
+ int iters; + uchar_t *pos; + + iters = (256 * sizeof (bsize_t)) / (16 * 4); + __m128i zero = _mm_setzero_si128 (); + pos = (uchar_t *)buckets; + + for (i=0; i0;i--) buckets[i]=buckets[i-1]; diff --git a/rabin/rabin_polynomial.c b/rabin/rabin_polynomial.c index b655a34..202df14 100755 --- a/rabin/rabin_polynomial.c +++ b/rabin/rabin_polynomial.c @@ -151,7 +151,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al ctx->blocks = NULL; if (real_chunksize > 0) { ctx->blocks = (rabin_blockentry_t *)slab_alloc(NULL, - blknum * ctx->rabin_poly_min_block_size); + blknum * sizeof (rabin_blockentry_t)); } if(ctx == NULL || current_window_data == NULL || (ctx->blocks == NULL && real_chunksize > 0)) { fprintf(stderr, @@ -370,7 +370,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s ctx->blocks[blknum].offset = last_offset; ctx->blocks[blknum].index = blknum; // Need to store for sorting ctx->blocks[blknum].length = length; - ctx->blocks[blknum].refcount = 0; + ctx->blocks[blknum].ref = 0; ctx->blocks[blknum].similar = 0; ctx->blocks[blknum].cksum_n_offset = cur_sketch; memset(fplist, 0, fplist_sz); @@ -399,7 +399,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s ctx->blocks[blknum].offset = last_offset; ctx->blocks[blknum].index = blknum; ctx->blocks[blknum].length = *size - last_offset; - ctx->blocks[blknum].refcount = 0; + ctx->blocks[blknum].ref = 0; ctx->blocks[blknum].similar = 0; ctx->blocks[blknum].cksum_n_offset = cur_sketch; blknum++; @@ -445,7 +445,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) { ctx->blocks[blk].similar = SIMILAR_EXACT; ctx->blocks[blk].index = prev_index; - (ctx->blocks[prev_blk].refcount)++; + ctx->blocks[prev_blk].ref = 1; matchlen += prev_length; continue; } @@ -460,11 +460,11 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t 
*size, ssize_t offset, s for (blk = 0; blk < blknum; blk++) { if (ctx->blocks[blk].similar) continue; - if (blk > 0 && ctx->blocks[blk].refcount == 0 && + if (blk > 0 && ctx->blocks[blk].ref == 0 && ctx->blocks[blk].cksum_n_offset == prev_cksum) { ctx->blocks[blk].index = prev_index; ctx->blocks[blk].similar = SIMILAR_PARTIAL; - (ctx->blocks[prev_blk].refcount)++; + ctx->blocks[prev_blk].ref = 1; matchlen += prev_length/2; continue; } @@ -502,7 +502,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s * size. */ if (prev_index == 0) { - if (be->refcount == 0) { + if (be->ref == 0) { prev_index = pos; prev_length = be->length; } @@ -511,7 +511,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s trans[blk] = pos; pos++; } else { - if (be->refcount > 0) { + if (be->ref > 0) { prev_index = 0; prev_length = 0; rabin_index[pos] = be->length; diff --git a/rabin/rabin_polynomial.h b/rabin/rabin_polynomial.h index 2365c00..19d792c 100644 --- a/rabin/rabin_polynomial.h +++ b/rabin/rabin_polynomial.h @@ -136,8 +136,7 @@ typedef struct { unsigned int index; unsigned int length; unsigned int new_length; - unsigned short refcount; - short similar; + unsigned char ref, similar; } rabin_blockentry_t; typedef struct { diff --git a/utils.c b/utils.c index c335bb4..2cb492e 100644 --- a/utils.c +++ b/utils.c @@ -169,6 +169,23 @@ bytes_to_size(uint64_t bytes) return (num); } +/* + * Hash function for 64Bit pointers that generates a 32Bit hash value. 
+ * Taken from Thomas Wang's Integer hashing paper: + * http://www.cris.com/~Ttwang/tech/inthash.htm + */ +uint32_t +hash6432shift(uint64_t key) +{ + key = (~key) + (key << 18); // key = (key << 18) - key - 1; + key = key ^ (key >> 31); + key = key * 21; // key = (key + (key << 2)) + (key << 4); + key = key ^ (key >> 11); + key = key + (key << 6); + key = key ^ (key >> 22); + return (uint32_t) key; +} + /* * Read/Write helpers to ensure a full chunk is read or written * unless there is an error. diff --git a/utils.h b/utils.h index 3f3f11f..f12c1b3 100644 --- a/utils.h +++ b/utils.h @@ -102,6 +102,7 @@ extern void err_exit(int show_errno, const char *format, ...); extern const char *get_execname(const char *); extern int parse_numeric(ssize_t *val, const char *str); extern char *bytes_to_size(uint64_t bytes); +extern uint32_t hash6432shift(uint64_t key); extern ssize_t Read(int fd, void *buf, size_t count); extern ssize_t Read_Adjusted(int fd, uchar_t *buf, size_t count, ssize_t *rabin_count, void *ctx);