diff --git a/Makefile.in b/Makefile.in
index 9a15e44..f279b31 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -97,6 +97,10 @@ LIBBSCLIB = @LIBBSCLIB@
 LIBBSCGEN_OPT = -fopenmp
 LIBBSCCPPFLAGS = -I$(LIBBSCDIR)/libbsc -DENABLE_PC_LIBBSC
 
+TRANSP_HDRS = transpose/transpose.h
+TRANSP_SRCS = transpose/transpose.c
+TRANSP_OBJS = $(patsubst %.c,%.o,$(TRANSP_SRCS))
+
 KECCAK_SRC_COMMON = crypto/keccak/genKAT.c crypto/keccak/KeccakDuplex.c \
 	crypto/keccak/KeccakNISTInterface.c crypto/keccak/KeccakSponge.c
 KECCAK_SRC_OPT64 = $(KECCAK_SRC_COMMON) crypto/keccak/KeccakF-1600-opt64.c
@@ -120,7 +124,7 @@ KECCAK_OBJS_ASM = $(KECCAK_SRCS_ASM:.s=.o)
 
 BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~ utils/*~ crypto/sha2/*~ \
 	crypto/sha2/intel/*~ crypto/aes/*~ crypto/scrypt/*~ crypto/*~ rabin/global/*~ \
-	delta2/*~ crypto/keccak/*~
+	delta2/*~ crypto/keccak/*~ transpose/*~
 
 RM = rm -f
 RM_RF = rm -rf
@@ -128,14 +132,15 @@ COMMON_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT
 	-DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \
 	-I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I@OPENSSL_INCDIR@ \
 	-I./crypto/sha2 -I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ \
-	@LIBBZ2_INC@ @LIBZ_INC@ -I./crypto/keccak
+	@LIBBZ2_INC@ @LIBZ_INC@ -I./crypto/keccak -I./transpose
 COMMON_VEC_FLAGS = -ftree-vectorize
 COMMON_LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
 LDLIBS = -ldl -L@LIBBZ2_DIR@ -lbz2 -L@LIBZ_DIR@ -lz -lm @LIBBSCLFLAGS@ \
 	-L@OPENSSL_LIBDIR@ -lcrypto -lrt
 OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
 $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \
-$(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM)
+$(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
+$(TRANSP_OBJS)
 
 DEBUG_LINK = g++ -m64 -pthread -msse3 @LIBBSCGEN_OPT@
 DEBUG_COMPILE = gcc -m64 -g -msse3 -c
@@ -227,6 +232,9 @@ $(LIBBSCLIB):
 $(LIBBSCWRAPOBJ): $(LIBBSCWRAP) $(LIBBSCLIB)
 	$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
 
+$(TRANSP_OBJS): %.o: %.c $(TRANSP_HDRS)
+	$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $< -o $@
+
 $(MAINOBJS): $(MAINSRCS) $(MAINHDRS)
 	$(COMPILE) $(GEN_OPT) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
 
diff --git a/main.c b/main.c
index 84c0aa6..696a820 100644
--- a/main.c
+++ b/main.c
@@ -46,6 +46,7 @@
 #include <allocator.h>
 #include <rabin_dedup.h>
 #include <lzp.h>
+#include <transpose.h>
 
 /*
  * We use 5MB chunks by default.
@@ -452,13 +453,21 @@ redo:
 		rv = 0;
 		cmpbuf = cseg + RABIN_HDR_SIZE;
 		ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE;
-		if (dedupe_index_sz >= 90) {
+
+		if (dedupe_index_sz >= 90 && dedupe_index_sz > dedupe_index_sz_cmp) {
 			/* Index should be at least 90 bytes to have been compressed. */
 			rv = lzma_decompress(cmpbuf, dedupe_index_sz_cmp, ubuf,
 			    &dedupe_index_sz, tdat->rctx->level, 0, tdat->rctx->lzma_data);
 		} else {
 			memcpy(ubuf, cmpbuf, dedupe_index_sz);
 		}
+
+		/*
+		 * Undo the transpose applied to the index at compression time.
+		 */
+		transpose(ubuf, cmpbuf, dedupe_index_sz, sizeof (uint32_t), COL);
+		memcpy(ubuf, cmpbuf, dedupe_index_sz);
+
 	} else {
 		if (HDR & COMPRESSED) {
 			if (HDR & CHUNK_FLAG_PREPROC) {
@@ -1150,48 +1159,57 @@ redo:
 		index_size_cmp = dedupe_index_sz;
 
 		rv = 0;
+
+		/*
+		 * Do a matrix transpose of the index table with the hope of improving
+		 * compression ratio subsequently.
+		 */
+		transpose(tdat->uncompressed_chunk + RABIN_HDR_SIZE,
+		    compressed_chunk + RABIN_HDR_SIZE, dedupe_index_sz,
+		    sizeof (uint32_t), ROW);
+		memcpy(tdat->uncompressed_chunk + RABIN_HDR_SIZE,
+		    compressed_chunk + RABIN_HDR_SIZE, dedupe_index_sz);
+
 		if (dedupe_index_sz >= 90) {
 			/* Compress index if it is at least 90 bytes. */
 			rv = lzma_compress(tdat->uncompressed_chunk + RABIN_HDR_SIZE,
 			    dedupe_index_sz, compressed_chunk + RABIN_HDR_SIZE,
 			    &index_size_cmp, tdat->rctx->level, 255, tdat->rctx->lzma_data);
+
+			/*
+			 * If index compression fails or does not produce a smaller result,
+			 * store the index uncompressed. The compressed size then equals
+			 * the original size, which tells decompression to copy it as is.
+			 */
+			if (rv != 0 || index_size_cmp >= dedupe_index_sz) {
+				index_size_cmp = dedupe_index_sz;
+				goto plain_index;
+			}
 		} else {
+plain_index:
 			memcpy(compressed_chunk + RABIN_HDR_SIZE,
 			    tdat->uncompressed_chunk + RABIN_HDR_SIZE, dedupe_index_sz);
 		}
 
 		index_size_cmp += RABIN_HDR_SIZE;
 		dedupe_index_sz += RABIN_HDR_SIZE;
-		if (rv == 0) {
-			memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE);
-			/* Compress data chunk. */
-			if (lzp_preprocess) {
-				rv = preproc_compress(tdat->compress,
-				    tdat->uncompressed_chunk + dedupe_index_sz,
-				    _chunksize, compressed_chunk + index_size_cmp, &_chunksize,
-				    tdat->level, 0, tdat->data, tdat->props);
-			} else {
-				rv = tdat->compress(tdat->uncompressed_chunk + dedupe_index_sz,
-				    _chunksize, compressed_chunk + index_size_cmp, &_chunksize,
-				    tdat->level, 0, tdat->data);
-			}
-
-			/* Can't compress data just retain as-is. */
-			if (rv < 0)
-				memcpy(compressed_chunk + index_size_cmp,
-				    tdat->uncompressed_chunk + dedupe_index_sz, _chunksize);
-			/* Now update rabin header with the compressed sizes. */
-			update_dedupe_hdr(compressed_chunk, index_size_cmp - RABIN_HDR_SIZE,
-					 _chunksize);
+		memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE);
+		/* Compress data chunk. */
+		if (lzp_preprocess) {
+			rv = preproc_compress(tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz,
+			    _chunksize, compressed_chunk + index_size_cmp, &_chunksize,
+			    tdat->level, 0, tdat->data, tdat->props);
 		} else {
-			/* If rabin index compression fails, we just drop down to plain
-			 * compression and avoid dedup. Should be pretty rare case.
-			 */
-			tdat->rctx->valid = 0;
-			memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes);
-			tdat->rbytes = rbytes;
-			goto plain_compress;
+			rv = tdat->compress(tdat->uncompressed_chunk + dedupe_index_sz, _chunksize,
+			    compressed_chunk + index_size_cmp, &_chunksize, tdat->level, 0, tdat->data);
 		}
+
+		/* Can't compress data just retain as-is. */
+		if (rv < 0)
+			memcpy(compressed_chunk + index_size_cmp,
+			    tdat->uncompressed_chunk + dedupe_index_sz, _chunksize);
+		/* Now update rabin header with the compressed sizes. */
+		update_dedupe_hdr(compressed_chunk, index_size_cmp - RABIN_HDR_SIZE, _chunksize);
 		_chunksize += index_size_cmp;
 	} else {
 plain_compress:
diff --git a/transpose/transpose.c b/transpose/transpose.c
new file mode 100644
index 0000000..760400f
--- /dev/null
+++ b/transpose/transpose.c
@@ -0,0 +1,72 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * moinakg@belenix.org, http://moinakg.wordpress.com/
+ */
+
+#include "transpose.h"
+
+/*
+ * Perform a simple matrix transpose of the given buffer in "from",
+ * writing the result to "to". If the buffer contains tables of numbers
+ * or structured data a transpose can potentially help improve
+ * compression ratio by bringing repeating values in columns into row
+ * ordering.
+ *
+ * from   - input buffer of buflen bytes.
+ * to     - output buffer of buflen bytes; the transpose is not done in
+ *          place, so it must not overlap "from".
+ * buflen - number of bytes in both buffers.
+ * stride - width in bytes of one element/group.
+ * rc     - ROW treats the input as (buflen / stride) rows of stride
+ *          bytes; COL treats it as stride rows of (buflen / stride)
+ *          bytes, which reverses a ROW transpose of the same buffer.
+ *
+ * Only whole stride-sized groups are transposed. Any trailing
+ * (buflen % stride) bytes, or the entire buffer when stride is 0, are
+ * copied through unchanged so every byte of "to" is always written.
+ */
+void
+transpose(unsigned char *from, unsigned char *to, uint64_t buflen, uint64_t stride, rowcol_t rc)
+{
+	uint64_t rows, cols, i, j, k, l, groups;
+
+	/* A zero stride would divide by zero; treat it as nothing to transpose. */
+	groups = (stride == 0) ? 0 : buflen / stride;
+	if (rc == ROW) {
+		rows = groups;
+		cols = stride;
+	} else {
+		cols = groups;
+		rows = stride;
+	}
+
+	/* k walks the input row by row, l steps down one output column. */
+	k = 0;
+	for (j = 0; j < rows; j++) {
+		l = 0;
+		for (i = 0; i < cols; i++) {
+			to[j + l] = from[i + k];
+			l += rows;
+		}
+		k += cols;
+	}
+
+	/* Pass through the tail that does not fill a whole group. */
+	for (i = groups * stride; i < buflen; i++)
+		to[i] = from[i];
+}
diff --git a/transpose/transpose.h b/transpose/transpose.h
new file mode 100644
index 0000000..6b619dd
--- /dev/null
+++ b/transpose/transpose.h
@@ -0,0 +1,44 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * moinakg@belenix.org, http://moinakg.wordpress.com/
+ */
+
+#ifndef	_TRANSP_H
+#define	_TRANSP_H
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+	ROW = 0,	/* view input as (buflen / stride) rows of stride bytes */
+	COL = 1		/* view input as stride rows of (buflen / stride) bytes */
+} rowcol_t;	/* selects the matrix shape transpose() assumes for "from" */
+
+void transpose(unsigned char *from, unsigned char *to, uint64_t buflen,
+	       uint64_t stride, rowcol_t rc);	/* writes the transpose of "from" into "to" */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif