diff --git a/Makefile.in b/Makefile.in
index 23ecde1..f0c5e2b 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -167,9 +167,9 @@ DEBUG_RABIN_OPT = -O -fno-omit-frame-pointer
 DEBUG_CPPFLAGS = $(COMMON_CPPFLAGS)
 DEBUG_FPTR_FLAG = 
 
-RELEASE_LINK = g++ -m64 -pthread -msse3 @LIBBSCGEN_OPT@ @LTO_FLAG@
-RELEASE_COMPILE = gcc -m64 -msse3 -c @LTO_FLAG@
-RELEASE_COMPILE_cpp = g++ -m64 -msse3 -c @LTO_FLAG@
+RELEASE_LINK = g++ -m64 -pthread -msse3 @LIBBSCGEN_OPT@ @M64_FLAG@
+RELEASE_COMPILE = gcc -m64 -msse3 -c @M64_FLAG@
+RELEASE_COMPILE_cpp = g++ -m64 -msse3 -c @M64_FLAG@
 RELEASE_VEC_FLAGS = $(COMMON_VEC_FLAGS)
 RELEASE_LOOP_OPTFLAGS = $(COMMON_LOOP_OPTFLAGS)
 RELEASE_CPPFLAGS = $(COMMON_CPPFLAGS) -DNDEBUG
diff --git a/config b/config
index 9c07a8c..f74c1f1 100755
--- a/config
+++ b/config
@@ -50,7 +50,7 @@ yasm=yasm
 keccak_srcs=
 keccak_hdrs=
 keccak_srcs_asm=
-lto_flag=
+m64_flag=
 zlib_prefix=
 bzlib_prefix=
 
@@ -78,10 +78,6 @@ then
 	exit 1
 fi
 
-# Check GCC version and enable LTO flags if possible
-gcc -v 2>&1 | grep lto > /dev/null
-[ $? -eq 0 ] && lto_flag="-flto"
-
 # Check bitness of system/toolchain
 bitness=`./tst`
 if [ $bitness -lt 8 ]
@@ -99,8 +95,7 @@ then
 
 	# If m64 compilation succeeds we assume platform to be 64-bit capable but
 	# explicit flag is reqd.
-	# Instead of setting another variable lets cheat by plugging m64 into lto_flag!
-	lto_flag="${lto_flag} -m64"
+	m64_flag="-m64"
 fi
 rm -f tst tst.c
 
@@ -405,7 +400,7 @@ sha256asmobjsvar="SHA256ASM_OBJS"
 sha256objsvar="SHA256_OBJS"
 yasmvar="YASM"
 fptr_flag_var="FPTR_FLAG"
-lto_flag_var="LTO_FLAG"
+m64_flag_var="M64_FLAG"
 
 openssllibdirvar="OPENSSL_LIBDIR"
 opensslincdirvar="OPENSSL_INCDIR"
@@ -458,6 +453,6 @@ s#@${keccak_srcs_var}@#${keccak_srcs}#g
 s#@${keccak_hdrs_var}@#${keccak_hdrs}#g
 s#@${keccak_srcs_var}@#${keccak_srcs}#g
 s#@${keccak_srcs_asm_var}@#${keccak_srcs_asm}#g
-s#@${lto_flag_var}@#${lto_flag}#g
+s#@${m64_flag_var}@#${m64_flag}#g
 " > Makefile
 
diff --git a/delta2/delta2.c b/delta2/delta2.c
index 8fa82df..d93afc9 100644
--- a/delta2/delta2.c
+++ b/delta2/delta2.c
@@ -30,20 +30,19 @@
  * Bytes are packed into integers in big-endian format.
  *
  * After an optimal stride length has been identified the encoder
- * performs a delta run length encoding on the spans. Three types of
+ * performs a delta run length encoding on the spans. Two types of
  * objects are output by the encoder:
- * 1) A literal run of unmodified bytes. Header: 1 zero byte followed
- *    by a 64bit length in bytes.
- * 2) A literal run of transposed bytes containing sequences that are
- *    below threshold and the total span of those sequences is at least
- *    97%+ of the entire run.
- *    Header: 1 byte stride length with high bit set.
- *            64bit length of span in bytes.
- * 3) An encoded run length of a series in arithmetic progression.
- *    Header: 1 byte stride length (must be less than 128)
- *            64bit length of span in bytes
+ * 1) A literal run of unmodified bytes. Header:
+ *    64-bit encoded value of the following format
+ *    Most Significant Byte = 0
+ *    Remaining Bytes       = Length of literal span in bytes
+ * 2) An encoded run length of a series in arithmetic progression.
+ *    Header: 64bit encoded value
  *            64bit starting value of series
  *            64bit delta value
+ *    64-bit encoded value is of the following format
+ *    Most Significant Byte = Stride length
+ *    Remaining Bytes       = Number of bytes in the span
  */
 #include <stdio.h>
 #include <string.h>
@@ -55,25 +54,21 @@
 #define	MAIN_HDR	(sizeof (uint64_t))
 
 // Literal text header block:
-// 1-byte flag
-// 64bit length of run in bytes.
-#define	LIT_HDR		(1 + sizeof (uint64_t))
-#define	TRANSP_HDR	(LIT_HDR)
+// 64bit encoded value.
+#define	LIT_HDR		(sizeof (uint64_t))
 
 // Delta encoded header block:
-// 1-byte flag indicating stride length
-// 64bit length of span in bytes
+// 64bit encoded value
 // 64bit initial value
 // 64bit delta value
-#define	DELTA_HDR	(1 + (sizeof (uint64_t)) * 3)
+#define	DELTA_HDR	((sizeof (uint64_t)) * 3)
 
 // Minimum span length
 #define	MIN_THRESH	(50)
 // Maximum data length (16TB)
 #define	MAX_THRESH	(0x100000000000ULL)
-#define	TRANSP_THRESH	(100)
-#define	TRANSP_BIT	(128)
-#define	TRANSP_MASK	(127)
+#define	MSB_SETZERO_MASK	(0xffffffffffffffULL)
+#define	MSB_SHIFT	(56)
 
 /*
  * Delta2 algorithm processes data in chunks. The 4K size below is somewhat
@@ -86,8 +81,20 @@
  */
 #define	DELTA2_CHUNK	(4096)
 
+/*
+ * Byteswap macros. We optimize for little-endian, so values are stored
+ * and interpreted in little-endian order.
+ */
+#if BYTE_ORDER == BIG_ENDIAN
+#define	HTONLL	__bswap_64(x)
+#define	NTOHLL	__bswap_64(x)
+#else
+#define	HTONLL
+#define	NTOHLL
+#endif
+
 static int delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen,
-		int rle_thresh, int last_encode, int *transp_count, int *hdr_ovr);
+		int rle_thresh, int last_encode, int *hdr_ovr);
 
 int
 delta2_encode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int rle_thresh)
@@ -98,14 +105,13 @@ delta2_encode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int
 	}
 
 	if (*dstlen < DELTA2_CHUNK) {
-		int transp_count, hdr_ovr;
+		int hdr_ovr;
 		int rv;
 
-		transp_count = 0;
 		hdr_ovr = 0;
-		rv = delta2_encode_real(src, srclen, dst, dstlen, rle_thresh, 1, &transp_count, &hdr_ovr);
+		rv = delta2_encode_real(src, srclen, dst, dstlen, rle_thresh, 1, &hdr_ovr);
 		DEBUG_STAT_EN(fprintf(stderr, "DELTA2: srclen: %" PRIu64 ", dstlen: %" PRIu64 "\n", srclen, *dstlen));
-		DEBUG_STAT_EN(fprintf(stderr, "DELTA2: transpositions: %d, header overhead: %d\n", transp_count, hdr_ovr));
+		DEBUG_STAT_EN(fprintf(stderr, "DELTA2: header overhead: %d\n", hdr_ovr));
 	} else {
 		uchar_t *srcpos, *dstpos, *lastdst, *lastsrc, *dstend;
 		uint64_t slen, sz, dsz, pending;
@@ -117,11 +123,10 @@ delta2_encode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int
 		dstend = dst + *dstlen;
 		slen = srclen;
 		pending = 0;
-		*((uint64_t *)dstpos) = htonll(srclen);
+		*((uint64_t *)dstpos) = HTONLL(srclen);
 		dstpos += MAIN_HDR;
 		lastdst = dstpos;
 		lastsrc = srcpos;
-		transp_count = 0;
 		hdr_ovr = 0;
 
 		DEBUG_STAT_EN(strt = get_wtime_millis());
@@ -135,22 +140,21 @@ delta2_encode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int
 			}
 			dsz = sz;
 			rem = delta2_encode_real(srcpos, sz, dstpos, &dsz, rle_thresh, lenc,
-						 &transp_count, &hdr_ovr);
+						 &hdr_ovr);
 			if (rem == -1) {
 				if (pending == 0) {
 					lastdst = dstpos;
 					lastsrc = srcpos;
 					dstpos += LIT_HDR;
 				}
-				pending += sz;
-				srcpos += sz;
-				dstpos += sz;
-				slen -= sz;
+				pending += dsz;
+				srcpos += dsz;
+				dstpos += dsz;
+				slen -= dsz;
 			} else {
 				if (pending) {
-					*lastdst = 0;
-					lastdst++;
-					*((uint64_t *)lastdst) = htonll(pending);
+					pending &=  MSB_SETZERO_MASK;
+					*((uint64_t *)lastdst) = HTONLL(pending);
 					lastdst += sizeof (uint64_t);
 					memcpy(lastdst, lastsrc, pending);
 					pending = 0;
@@ -165,9 +169,8 @@ delta2_encode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int
 			}
 		}
 		if (pending) {
-			*lastdst = 0;
-			lastdst++;
-			*((uint64_t *)lastdst) = htonll(pending);
+			pending &=  MSB_SETZERO_MASK;
+			*((uint64_t *)lastdst) = HTONLL(pending);
 			lastdst += sizeof (uint64_t);
 			if (lastdst + pending > dstend) {
 				DEBUG_STAT_EN(fprintf(stderr, "No Delta\n"));
@@ -178,7 +181,7 @@ delta2_encode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int
 		*dstlen = dstpos - dst;
 		DEBUG_STAT_EN(en = get_wtime_millis());
 		DEBUG_STAT_EN(fprintf(stderr, "DELTA2: srclen: %" PRIu64 ", dstlen: %" PRIu64 "\n", srclen, *dstlen));
-		DEBUG_STAT_EN(fprintf(stderr, "DELTA2: transpositions: %d, header overhead: %d\n", transp_count, hdr_ovr));
+		DEBUG_STAT_EN(fprintf(stderr, "DELTA2: header overhead: %d\n", hdr_ovr));
 		DEBUG_STAT_EN(fprintf(stderr, "DELTA2: Processed at %.3f MB/s\n", get_mb_s(srclen, strt, en)));
 	}
 	return (0);
@@ -186,7 +189,7 @@ delta2_encode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int
 
 static int
 delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen,
-		   int rle_thresh, int last_encode, int *transp_count, int *hdr_ovr)
+		   int rle_thresh, int last_encode, int *hdr_ovr)
 {
 	uint64_t snum, gtot1, gtot2, tot;
 	uint64_t cnt, val, sval;
@@ -199,8 +202,8 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen
 		return (-1);
 	gtot1 = ULL_MAX;
 	stride = 0;
-	sval = 0;
 	val = 0;
+	tot = 0;
 	sz = sizeof (strides) / sizeof (strides[0]);
 
 	/*
@@ -210,16 +213,20 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen
 		int gt;
 
 		snum = 0;
-		gtot2 = MAIN_HDR + LIT_HDR;
+		gtot2 = LIT_HDR;
 		vl1 = 0;
 		vld1 = 0;
 		tot = 0;
 		pos = src;
 		st1 = strides[st];
+		sval = st1;
+		sval = ((sval << 3) - 1);
+		sval = (1ULL << sval);
+		sval |= (sval - 1);
 		for (cnt = 0; cnt < (srclen - sizeof (cnt)); cnt += st1) {
 			vl2 = *((uint64_t *)pos);
-			vl2 = htonll(vl2);
-			vl2 >>= ((sizeof (vl2) - st1) << 3);
+			vl2 = HTONLL(vl2);
+			vl2 &= sval;
 			vld2 = vl2 - vl1;
 			if (vld1 != vld2) {
 				if (snum > rle_thresh) {
@@ -241,30 +248,30 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen
 		if (snum > rle_thresh) {
 			gtot2 += DELTA_HDR;
 			/*
-			 * If this ended partially into another table reset next scan
+			 * If this ended into another table reset next scan
 			 * point to before the table.
 			 */
 			val = cnt - snum;
 		} else {
 			gtot2 += snum;
 			/*
-			 * If this ended partially into another table reset next scan
+			 * If this ended into another table reset next scan
 			 * point to before the table.
 			 */
-			if (snum >= st1 * 5)
+			if (snum >= (MIN_THRESH>>1))
 				val = cnt - snum;
 		}
 		if (gtot2 < gtot1) {
 			gtot1 = gtot2;
 			stride = st1;
-			sval = val;
+			tot = val;
 		}
 	}
 
 	if ( !(gtot1 < srclen && srclen - gtot1 > (DELTA_HDR + LIT_HDR + MAIN_HDR) && gtot1 < *dstlen) ) {
 		if (srclen >= DELTA2_CHUNK) {
-			if (sval > 0)
-				*dstlen = sval;
+			if (tot > 0)
+				*dstlen = tot;
 		}
 		return (-1);
 	}
@@ -278,70 +285,48 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen
 	gtot1 = 0;
 	pos = src;
 	pos2 = dst;
-	gtot2 = 0;
 
-	if (rle_thresh <= TRANSP_THRESH) {
-		tot = rle_thresh/2;
-	} else {
-		tot = TRANSP_THRESH;
-	}
 	vl2 = *((uint64_t *)pos);
-	vl2 = htonll(vl2);
-	vl2 >>= ((sizeof (vl2) - stride) << 3);
+	vl2 = HTONLL(vl2);
+	val = stride;
+	val = ((val << 3) - 1);
+	val = (1ULL << val);
+	val |= (val - 1);
+	vl2 &= val;
 	sval = vl2;
 
 	for (cnt = 0; cnt < (srclen - sizeof (cnt)); cnt += stride) {
-		val = *((uint64_t *)pos);
-		vl2 = htonll(val);
-		vl2 >>= ((sizeof (vl2) - stride) << 3);
+		vl2 = *((uint64_t *)pos);
+		vl2 = HTONLL(vl2);
+		vl2 &= val;
 		vld2 = vl2 - vl1;
 		if (vld1 != vld2) {
 			if (snum > rle_thresh) {
 				if (gtot1 > 0) {
-					/*
-					 * Encode previous literal run, if any. If the literal run
-					 * has enough (97%+) large sequences just below threshold,
-					 * do a matrix transpose on the range in the hope of achieving
-					 * a better compression ratio.
-					 */
-					if (gtot2 >= ((gtot1 >> 1) + (gtot1 >> 2) + (gtot1 >> 3) +
-					    (gtot1 >> 4) + (gtot1 >> 5))) {
-						*pos2 = stride | TRANSP_BIT;
-						pos2++;
-						*((uint64_t *)pos2) = htonll(gtot1);
-						pos2 += sizeof (uint64_t);
-						DEBUG_STAT_EN((*transp_count)++);
-						DEBUG_STAT_EN(*hdr_ovr += TRANSP_HDR);
-						transpose(pos - (gtot1+snum), pos2, gtot1, stride, ROW);
-					} else {
-						*pos2 = 0;
-						pos2++;
-						*((uint64_t *)pos2) = htonll(gtot1);
-						pos2 += sizeof (uint64_t);
-						DEBUG_STAT_EN(*hdr_ovr += LIT_HDR);
-						memcpy(pos2, pos - (gtot1+snum), gtot1);
-					}
+					gtot1 &= MSB_SETZERO_MASK;
+					*((uint64_t *)pos2) = HTONLL(gtot1);
+					pos2 += sizeof (uint64_t);
+					DEBUG_STAT_EN(*hdr_ovr += LIT_HDR);
+					memcpy(pos2, pos - (gtot1+snum), gtot1);
 					pos2 += gtot1;
 					gtot1 = 0;
-					gtot2 = 0;
 				}
 
 				/*
 				 * RLE Encode delta series.
 				 */
-				*pos2 = stride;
-				pos2++;
-				*((uint64_t *)pos2) = htonll(snum);
+				gtot2 = stride;
+				gtot2 <<= MSB_SHIFT;
+				gtot2 |= (snum & MSB_SETZERO_MASK);
+				*((uint64_t *)pos2) = HTONLL(gtot2);
 				pos2 += sizeof (uint64_t);
-				*((uint64_t *)pos2) = htonll(sval);
+				*((uint64_t *)pos2) = HTONLL(sval);
 				pos2 += sizeof (uint64_t);
-				*((uint64_t *)pos2) = htonll(vld1);
+				*((uint64_t *)pos2) = HTONLL(vld1);
 				pos2 += sizeof (uint64_t);
 				DEBUG_STAT_EN(*hdr_ovr += DELTA_HDR);
 			} else {
 				gtot1 += snum;
-				if (snum >= tot)
-					gtot2 += snum;
 			}
 			snum = 0;
 			sval = vl2;
@@ -355,30 +340,29 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen
 	if (snum > 0) {
 		if (snum > rle_thresh) {
 			if (gtot1 > 0) {
-				*pos2 = 0;
-				pos2++;
-				*((uint64_t *)pos2) = htonll(gtot1);
+				gtot1 &= MSB_SETZERO_MASK;
+				*((uint64_t *)pos2) = HTONLL(gtot1);
 				pos2 += sizeof (uint64_t);
 				DEBUG_STAT_EN(*hdr_ovr += LIT_HDR);
 				memcpy(pos2, pos - (gtot1+snum), gtot1);
 				pos2 += gtot1;
 				gtot1 = 0;
 			}
-			*pos2 = stride;
-			pos2++;
-			*((uint64_t *)pos2) = htonll(snum);
+			gtot2 = stride;
+			gtot2 <<= MSB_SHIFT;
+			gtot2 |= (snum & MSB_SETZERO_MASK);
+			*((uint64_t *)pos2) = HTONLL(gtot2);
 			pos2 += sizeof (uint64_t);
-			*((uint64_t *)pos2) = htonll(sval);
+			*((uint64_t *)pos2) = HTONLL(sval);
 			pos2 += sizeof (uint64_t);
-			*((uint64_t *)pos2) = htonll(vld1);
+			*((uint64_t *)pos2) = HTONLL(vld1);
 			pos2 += sizeof (uint64_t);
 			DEBUG_STAT_EN(*hdr_ovr += DELTA_HDR);
 
 		} else if (last_encode) {
 			gtot1 += snum;
-			*pos2 = 0;
-			pos2++;
-			*((uint64_t *)pos2) = htonll(gtot1);
+			gtot1 &= MSB_SETZERO_MASK;
+			*((uint64_t *)pos2) = HTONLL(gtot1);
 			pos2 += sizeof (uint64_t);
 			DEBUG_STAT_EN(*hdr_ovr += LIT_HDR);
 			memcpy(pos2, pos - gtot1, gtot1);
@@ -395,9 +379,8 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen
 			* Encode left over bytes, if any, at the end into a
 			* literal run.
 			*/
-			*pos2 = 0;
-			pos2++;
-			*((uint64_t *)pos2) = htonll(val);
+			val &= MSB_SETZERO_MASK;
+			*((uint64_t *)pos2) = HTONLL(val);
 			pos2 += sizeof (uint64_t);
 			for (cnt = 0; cnt < val; cnt++) {
 				*pos2 = *pos;
@@ -417,14 +400,16 @@ int
 delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
 {
 	uchar_t *pos, *pos1, *last;
-	uint64_t olen, val, sval, delta, rcnt, cnt, out;
-	uchar_t stride;
+	uint64_t olen, val, sval, delta, rcnt, cnt, out, vl;
+	uchar_t stride, flags;
+	DEBUG_STAT_EN(double strt, en);
 
 	pos = src;
 	pos1 = dst;
 
+	DEBUG_STAT_EN(strt = get_wtime_millis());
 	last = pos + srclen;
-	olen = ntohll(*((uint64_t *)pos));
+	olen = NTOHLL(*((uint64_t *)pos));
 	if (*dstlen < olen) {
 		fprintf(stderr, "DELTA2 Decode: Destination buffer too small.\n");
 		return (-1);
@@ -434,12 +419,15 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
 	pos += MAIN_HDR;
 
 	while (pos < last) {
-		if (*pos == 0) {
+		val = *((uint64_t *)pos);
+		val = NTOHLL(val);
+		flags = (val >> MSB_SHIFT) & 0xff;
+
+		if (flags == 0) {
 			/*
 			 * Copy over literal run of bytes.
 			 */
-			pos++;
-			rcnt = ntohll(*((uint64_t *)pos));
+			rcnt = val & MSB_SETZERO_MASK;
 			pos += sizeof (rcnt);
 			if (out + rcnt > *dstlen) {
 				fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
@@ -450,44 +438,31 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
 			pos1 += rcnt;
 			out += rcnt;
 
-		} else if (*pos & TRANSP_BIT) {
-			int stride;
-			/*
-			 * Copy over literal run of transposed bytes and inverse-transpose.
-			 */
-			stride = (*pos & TRANSP_MASK);
-			pos++;
-			rcnt = ntohll(*((uint64_t *)pos));
-			pos += sizeof (rcnt);
-			if (out + rcnt > *dstlen) {
-				fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
-				return (-1);
-			}
-			transpose(pos, pos1, rcnt, stride, COL);
-			pos += rcnt;
-			pos1 += rcnt;
-			out += rcnt;
 		} else {
-			stride = *pos;
-			pos++;
-			rcnt = ntohll(*((uint64_t *)pos));
+			stride = flags;
+			rcnt = val & MSB_SETZERO_MASK;
 			pos += sizeof (rcnt);
-			sval = ntohll(*((uint64_t *)pos));
+			sval = NTOHLL(*((uint64_t *)pos));
 			pos += sizeof (sval);
-			delta = ntohll(*((uint64_t *)pos));
+			delta = NTOHLL(*((uint64_t *)pos));
 			pos += sizeof (delta);
 			if (out + rcnt > *dstlen) {
 				fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
 				return (-1);
 			}
 
+			vl = stride;
+			vl = (vl << 3) - 1;
+			vl = (1ULL << vl);
+			vl |= (vl - 1);
+
 			/*
 			 * Recover original bytes from the arithmetic series using
 			 * length, starting value and delta.
 			 */
 			for (cnt = 0; cnt < rcnt/stride; cnt++) {
-				val = sval << ((sizeof (val) - stride) << 3);
-				*((uint64_t *)pos1) = ntohll(val);
+				val = (sval & vl);
+				*((uint64_t *)pos1) = NTOHLL(val);
 				out += stride;
 				sval += delta;
 				pos1 += stride;
@@ -495,5 +470,7 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
 		}
 	}
 	*dstlen = out;
+	DEBUG_STAT_EN(en = get_wtime_millis());
+	DEBUG_STAT_EN(fprintf(stderr, "DELTA2: Decoded at %.3f MB/s\n", get_mb_s(out, strt, en)));
 	return (0);
 }
diff --git a/main.c b/main.c
index a422933..7a4c3e1 100644
--- a/main.c
+++ b/main.c
@@ -208,9 +208,13 @@ preproc_compress(compress_func_ptr cmp_func, void *src, uint64_t srclen, void *d
 		type = PREPROC_TYPE_LZP;
 		hashsize = lzp_hash_size(level);
 		result = lzp_compress(src, dst, srclen, hashsize, LZP_DEFAULT_LZPMINLEN, 0);
-		if (result < 0 || result == srclen) return (-1);
-		srclen = result;
-		memcpy(src, dst, srclen);
+		if (result < 0 || result == srclen) {
+			if (!enable_delta2_encode)
+				return (-1);
+		} else {
+			srclen = result;
+			memcpy(src, dst, srclen);
+		}
 
 	} else if (!enable_delta2_encode)  {
 		/*