diff --git a/bsdiff/bsdiff.c b/bsdiff/bsdiff.c
index 56827e3..af6ff8b 100644
--- a/bsdiff/bsdiff.c
+++ b/bsdiff/bsdiff.c
@@ -134,11 +134,12 @@ static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h)
 
 static void qsufsort(bsize_t *I,bsize_t *V,u_char *oldbuf,bsize_t oldsize)
 {
-	bsize_t buckets[256];
+	bsize_t buckets[257];
+	bsize_t *bkts;
 	bsize_t i,h,len;
 
 #ifdef __USE_SSE_INTRIN__
-	if (((bsize_t)buckets & (16 - 1)) == 0) { // 16-byte aligned ?
+	if (((size_t)buckets & (16 - 1)) == 0) { // 16-byte aligned ?
 		int iters;
 		uchar_t *pos;
 
@@ -159,9 +160,18 @@ static void qsufsort(bsize_t *I,bsize_t *V,u_char *oldbuf,bsize_t oldsize)
 #ifdef __USE_SSE_INTRIN__
 	}
 #endif
-	for(i=0;i<oldsize;i++) buckets[oldbuf[i]]++;
-	for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
-	for(i=255;i>0;i--) buckets[i]=buckets[i-1];
+	/* We want to do this:
+	 * for(i=0;i<oldsize;i++) buckets[oldbuf[i]]++;
+	 * for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
+	 * for(i=255;i>0;i--) buckets[i]=buckets[i-1];
+	 * buckets[0]=0;
+	 * 
+	 * However the code below uses an array larger by 1 element and is able to
+	 * avoid the 3rd loop.
+	 */
+	bkts = &buckets[1];
+	for(i=0;i<oldsize;i++) bkts[oldbuf[i]]++;
+	for(i=1;i<256;i++) bkts[i]+=bkts[i-1];
 	buckets[0]=0;
 
 	for(i=0;i<oldsize;i++) I[++buckets[oldbuf[i]]]=i;
@@ -311,9 +321,8 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
 					0,oldsize,&pos);
 
 			for(;scsc<scan+len;scsc++)
-			if((scsc+lastoffset<oldsize) &&
-				(oldbuf[scsc+lastoffset] == newbuf[scsc]))
-				oldscore++;
+				oldscore += ((scsc+lastoffset<oldsize) &&
+					(oldbuf[scsc+lastoffset] == newbuf[scsc]));
 
 			if(((len==oldscore) && (len!=0)) || 
 				(len>oldscore+sz)) break;
@@ -326,7 +335,7 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
 		if((len!=oldscore) || (scan==newsize)) {
 			s=0;Sf=0;lenf=0;
 			for(i=0;(lastscan+i<scan)&&(lastpos+i<oldsize);) {
-				if(oldbuf[lastpos+i]==newbuf[lastscan+i]) s++;
+				s += (oldbuf[lastpos+i]==newbuf[lastscan+i]);
 				i++;
 				if(s*2-i>Sf*2-lenf) { Sf=s; lenf=i; };
 			};
@@ -335,7 +344,7 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
 			if(scan<newsize) {
 				s=0;Sb=0;
 				for(i=1;(scan>=lastscan+i)&&(pos>=i);i++) {
-					if(oldbuf[pos-i]==newbuf[scan-i]) s++;
+					s += (oldbuf[pos-i]==newbuf[scan-i]);
 					if(s*2-i>Sb*2-lenb) { Sb=s; lenb=i; };
 				};
 			};
@@ -344,10 +353,9 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
 				overlap=(lastscan+lenf)-(scan-lenb);
 				s=0;Ss=0;lens=0;
 				for(i=0;i<overlap;i++) {
-					if(newbuf[lastscan+lenf-overlap+i]==
-					   oldbuf[lastpos+lenf-overlap+i]) s++;
-					if(newbuf[scan-lenb+i]==
-					   oldbuf[pos-lenb+i]) s--;
+					s += (newbuf[lastscan+lenf-overlap+i]==
+					   oldbuf[lastpos+lenf-overlap+i]);
+					s -= (newbuf[scan-lenb+i]==oldbuf[pos-lenb+i]);
 					if(s>Ss) { Ss=s; lens=i+1; };
 				};
 
diff --git a/bsdiff/rle_encoder.c b/bsdiff/rle_encoder.c
index 34b8505..1665b51 100644
--- a/bsdiff/rle_encoder.c
+++ b/bsdiff/rle_encoder.c
@@ -24,6 +24,7 @@
 
 #include <utils.h>
 #include <stdio.h>
+#include <string.h>
 
 #define ZERO_MASK (32768)
 #define DATA_MASK (32767)
@@ -33,15 +34,25 @@ int
 zero_rle_encode(const void *ibuf, const unsigned int ilen,
 	void *obuf, unsigned int *olen)
 {
-	unsigned int pos1, pos2;
+	unsigned int pos1, pos2, sz;
 	unsigned short count;
 	const uchar_t *ib = (const uchar_t *)ibuf;
 	uchar_t *ob = (uchar_t *)obuf;
+	uint64_t val;
 
+	sz = sizeof (val) - 1;
 	pos2 = 0;
 	for (pos1=0; pos1<ilen && pos2<*olen;) {
 		count = 0;
 		if (ib[pos1] == 0) {
+			/*
+			 * Zero run: count 8 bytes at a time. ilen-pos1 cannot wrap; ilen-sz can.
+			 */
+			while ((ilen - pos1) > sz && count < (COUNT_MAX - sz)) {
+				memcpy(&val, ib + pos1, sizeof (val)); /* alignment-safe load */
+				if (val) break;
+				pos1 += sizeof (val); count += sizeof (val);
+			}
 			for (;pos1<ilen && ib[pos1]==0 && count<COUNT_MAX; pos1++) count++;
 			count |= ZERO_MASK;
 			*((unsigned short *)(ob + pos2)) = htons(count);
@@ -93,11 +104,24 @@ zero_rle_decode(const void* ibuf, unsigned int ilen,
 		pos1 += 2;
 		if (count & ZERO_MASK) {
 			count &= DATA_MASK;
-			for (i=0; i<count && pos2<*olen; i++)
-				ob[pos2++] = 0;
+			if (pos2 + count > *olen) {
+				fprintf(stderr, "Output buffer overflow in Zero RLE decode.\n");
+				return (-1);
+			}
+			memset(ob+pos2, 0, count);
+			pos2 += count;
 		} else {
-			for (i=0; i<count && pos1<ilen && pos2<*olen; i++)
-				ob[pos2++] = ib[pos1++];
+			if (pos1 + count > ilen) {
+				fprintf(stderr, "Input underflow in Zero RLE decode.\n");
+				return (-1);
+			}
+			if (pos2 + count > *olen) {
+				fprintf(stderr, "Output buffer overflow in Zero RLE decode.\n");
+				return (-1);
+			}
+			memcpy(ob+pos2, ib+pos1, count);
+			pos2 += count;
+			pos1 += count;
 		}
 	}
 	i = *olen;
diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c
index ca99ecf..14aa3a6 100755
--- a/rabin/rabin_dedup.c
+++ b/rabin/rabin_dedup.c
@@ -72,9 +72,9 @@
 
 #include "rabin_dedup.h"
 
-#define	FORTY_PCNT(x) ((x)/5 << 1)
-#define	FIFTY_PCNT(x) ((x) >> 1)
-#define	SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3))
+#define	DELTA_EXTRA2_PCT(x) ((x) >> 1)
+#define	DELTA_EXTRA_PCT(x) (((x) >> 1) + ((x) >> 3))
+#define	DELTA_NORMAL_PCT(x) (((x) >> 1) + ((x) >> 2) + ((x) >> 3))
 
 extern int lzma_init(void **data, int *level, int nthreads, int64_t chunksize,
 		     int file_version, compress_op_t op);
@@ -203,7 +203,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
 			ctx->delta_flag = 3;
 		}
 	} else if (delta_flag == DELTA_EXTRA) {
-		ctx->delta_flag = 1;
+		ctx->delta_flag = 2;
 	}
 
 	if (!fixed_flag)
@@ -292,7 +292,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
 	uchar_t *buf1 = (uchar_t *)buf;
 	uint32_t length;
 	uint64_t cur_roll_checksum, cur_pos_checksum;
-	uint32_t *fplist;
+	uint32_t *ctx_heap;
 	rabin_blockentry_t **htab;
 	heap_t heap;
 	DEBUG_STAT_EN(uint32_t max_count);
@@ -341,9 +341,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
 		 * Initialize arrays for sketch computation. We re-use memory allocated
 		 * for the compressed chunk temporarily.
 		 */
-		ary_sz = 4 * ctx->rabin_poly_max_block_size;
-		fplist = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz);
-		if (ctx->delta_flag) memset(fplist, 0, ary_sz);
+		ary_sz = ctx->rabin_poly_max_block_size;
+		ctx_heap = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz);
 	}
 	memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
 
@@ -397,23 +396,6 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
 		cur_roll_checksum -= out[pushed_out];
 		cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
 
-		/*
-		 * Retain a list of all fingerprints in the block. We then compute
-		 * the K min values sketch from that list and generate a super sketch
-		 * by hashing over the K min values sketch. We only store the least
-		 * significant 32 bits of the fingerprint. This uses less memory,
-		 * requires smaller memset() calls and generates a sufficiently large
-		 * number of similarity matches without false positives - determined
-		 * by experimentation.
-		 * 
-		 * This is called minhashing and is used widely, for example in various
-		 * search engines to detect similar documents.
-		 */
-		if (ctx->delta_flag) {
-			fplist[j] = cur_pos_checksum & 0xFFFFFFFFUL;
-			j++;
-		}
-
 		/*
 		 * Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
 		 * We avoid a branch here by masking. This requires RAB_POLYNOMIAL_WIN_SIZE
@@ -432,25 +414,32 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
 			ctx->blocks[blknum]->offset = last_offset;
 			ctx->blocks[blknum]->index = blknum; // Need to store for sorting
 			ctx->blocks[blknum]->length = length;
-
 			DEBUG_STAT_EN(if (length >= ctx->rabin_poly_max_block_size) max_count++);
+
 			/*
 			 * Reset the heap structure and find the K min values if Delta Compression
 			 * is enabled. We use a min heap mechanism taken from the heap based priority
 			 * queue implementation in Python.
-			 * Here K = 60% or 40%. We are aiming to detect either 60% (default) or 40%
-			 * similarity on average.
+			 * Here K = similarity extent = 87% or 62% or 50%.
+			 * 
+			 * Once block contents are arranged in a min heap we compute the K min values
+			 * sketch by hashing over the heap till K%. We interpret the raw bytes as a
+			 * sequence of 64-bit integers.
+			 * This is called minhashing and is used widely, for example in various
+			 * search engines to detect similar documents.
 			 */
 			if (ctx->delta_flag) {
-				pc[1] = SIXTY_PCNT(j);
-				pc[2] = FIFTY_PCNT(j);
-				pc[3] = FORTY_PCNT(j);
+				memcpy(ctx_heap, buf1+last_offset, length);
+				length /= 8;
+				pc[1] = DELTA_NORMAL_PCT(length);
+				pc[2] = DELTA_EXTRA_PCT(length);
+				pc[3] = DELTA_EXTRA2_PCT(length);
 
 				reset_heap(&heap, pc[ctx->delta_flag]);
-				ksmallest((int32_t *)fplist, j, &heap);
+				ksmallest((int64_t *)ctx_heap, length, &heap);
+
 				ctx->blocks[blknum]->similarity_hash =
-					XXH32((const uchar_t *)fplist,  pc[ctx->delta_flag]*4, 0);
-				memset(fplist, 0, ary_sz);
+					XXH32((const uchar_t *)ctx_heap,  pc[ctx->delta_flag]*8, 0);
 			}
 			blknum++;
 			last_offset = i+1;
@@ -466,26 +455,30 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
 				sizeof (rabin_blockentry_t));
 		ctx->blocks[blknum]->offset = last_offset;
 		ctx->blocks[blknum]->index = blknum;
-		ctx->blocks[blknum]->length = *size - last_offset;
+		length = *size - last_offset;
+		ctx->blocks[blknum]->length = length;
 
 		if (ctx->delta_flag) {
 			uint64_t cur_sketch;
 			uint64_t pc[3];
 
-			if (j > 1) {
-				pc[1] = SIXTY_PCNT(j);
-				pc[2] = FIFTY_PCNT(j);
-				pc[3] = FORTY_PCNT(j);
+			if (length > ctx->rabin_poly_min_block_size) {
+				memcpy(ctx_heap, buf1+last_offset, length);
+				length /= 8;
+				pc[1] = DELTA_NORMAL_PCT(length);
+				pc[2] = DELTA_EXTRA_PCT(length);
+				pc[3] = DELTA_EXTRA2_PCT(length);
+
 				reset_heap(&heap, pc[ctx->delta_flag]);
-				ksmallest((int32_t *)fplist, j, &heap);
+				ksmallest((int64_t *)ctx_heap, length, &heap);
 				cur_sketch =
-				    XXH32((const uchar_t *)fplist,  pc[ctx->delta_flag]*4, 0);
+				    XXH32((const uchar_t *)ctx_heap,  pc[ctx->delta_flag]*8, 0);
+				ctx->blocks[blknum]->similarity_hash = cur_sketch;
 			} else {
-				if (j == 0) j = 1;
 				cur_sketch =
-				    XXH32((const uchar_t *)fplist, (j*4)/2, 0);
+				    XXH32((const uchar_t *)(buf1+last_offset), length, 0);
+				ctx->blocks[blknum]->similarity_hash = cur_sketch;
 			}
-			ctx->blocks[blknum]->similarity_hash = cur_sketch;
 		}
 		blknum++;
 		last_offset = *size;
diff --git a/utils/heapq.c b/utils/heapq.c
index 3676a6b..5ce8958 100644
--- a/utils/heapq.c
+++ b/utils/heapq.c
@@ -17,6 +17,7 @@
 #include <string.h>
 #include <sys/types.h>
 #include <stdint.h>
+#include <inttypes.h>
 #include <heapq.h>
 
 #ifndef NDEBUG
@@ -71,7 +72,7 @@ _siftupmax(heap_t *h, __TYPE spos, __TYPE epos)
     heap = h->ary;
 #ifdef ERROR_CHK
     if (spos >= endpos) {
-        fprintf(stderr, "_siftupmax: index out of range: %u, len: %u\n", spos, endpos);
+        fprintf(stderr, "_siftupmax: index out of range: %" PRId64 ", len: %" PRId64 "\n", spos, endpos);
         return -1;
     }
 #endif
@@ -118,7 +119,7 @@ _siftupmax_s(heap_t *h, __TYPE spos)
     heap = h->ary;
 #ifdef ERROR_CHK
     if (spos >= endpos) {
-        fprintf(stderr, "_siftupmax: index out of range: %u, len: %u\n", spos, endpos);
+        fprintf(stderr, "_siftupmax: index out of range: %" PRId64 ", len: %" PRId64 "\n", spos, endpos);
         return -1;
     }
 #endif
diff --git a/utils/heapq.h b/utils/heapq.h
index 155eeca..5b3e2f5 100644
--- a/utils/heapq.h
+++ b/utils/heapq.h
@@ -1,6 +1,6 @@
 #ifndef __HEAPQ_H_
 
-#define __TYPE int32_t
+#define __TYPE int64_t
 
 typedef struct {
     __TYPE *ary;
diff --git a/utils/utils.h b/utils/utils.h
index 47017a9..10fd6e3 100644
--- a/utils/utils.h
+++ b/utils/utils.h
@@ -57,7 +57,7 @@ extern "C" {
 #       endif
 #endif
 typedef unsigned long uintptr_t;
-typedef int64_t bsize_t;
+typedef int32_t bsize_t;
 
 #undef WORDS_BIGENDIAN
 #if BYTE_ORDER == BIG_ENDIAN