From bf149e880d9606b46f574bd1d88c8420a76306c8 Mon Sep 17 00:00:00 2001
From: Moinak Ghosh <moinakg@gmail.com>
Date: Thu, 23 Aug 2012 22:58:44 +0530
Subject: [PATCH] Add LZP Pre-Compression support ported from libbsc. Add
 generic pre-processing wrappers for future support of other pre-processors.
 Clean up computation of Rabin block sizes. Compute Rabin scratch space
 accurately to avoid RAM wastage.

---
 Makefile                 |  24 ++-
 README.md                |   8 +-
 lzp/lzp.c                | 454 +++++++++++++++++++++++++++++++++++++++
 lzp/lzp.h                |  89 ++++++++
 main.c                   | 159 ++++++++++++--
 pcompress.h              |   4 +
 rabin/rabin_polynomial.c |  58 ++---
 rabin/rabin_polynomial.h |  27 +--
 8 files changed, 750 insertions(+), 73 deletions(-)
 create mode 100644 lzp/lzp.c
 create mode 100644 lzp/lzp.h

diff --git a/Makefile b/Makefile
index b421cb0..cc9c4c9 100644
--- a/Makefile
+++ b/Makefile
@@ -58,14 +58,21 @@ CRCSRCS = lzma/crc64_fast.c lzma/crc64_table.c
 CRCHDRS = lzma/crc64_table_le.h lzma/crc64_table_be.h lzma/crc_macros.h
 CRCOBJS = $(CRCSRCS:.c=.o)
 
-BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~
+LZPSRCS = lzp/lzp.c
+LZPHDRS = lzp/lzp.h
+LZPOBJS = $(LZPSRCS:.c=.o)
+
+BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~
 
 RM = rm -f
 CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_PROPS \
-	-DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32
+	-DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \
+	-I./lzp
 VEC_FLAGS = -ftree-vectorize
 LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
 LDLIBS = -ldl -lbz2 $(ZLIB_DIR) -lz -lm
+OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
+$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS)
 
 ifdef DEBUG
 LINK = g++ -m64 -pthread -msse3
@@ -115,16 +122,15 @@ $(LZFXOBJS): $(LZFXSRCS) $(LZFXHDRS)
 $(LZ4OBJS): $(LZ4SRCS) $(LZ4HDRS)
 	$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
 
+$(LZPOBJS): $(LZPSRCS) $(LZPHDRS)
+	$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
+
 $(MAINOBJS): $(MAINSRCS) $(MAINHDRS)
 	$(COMPILE) $(GEN_OPT) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
 
-$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) \
-$(CRCOBJS) $(RABINOBJS) $(BSDIFFOBJS)
-	$(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) \
-		$(LZFXOBJS) $(LZ4OBJS) $(RABINOBJS) $(BSDIFFOBJS) \
-		$(LDLIBS)
+$(PROG): $(OBJS)
+	$(LINK) -o $@ $(OBJS) $(LDLIBS)
 
 clean:
-	$(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(LZFXOBJS) $(LZ4OBJS) \
-	$(RABINOBJS) $(BSDIFFOBJS) $(BAKFILES)
+	$(RM) $(PROG) $(OBJS) $(BAKFILES)
 
diff --git a/README.md b/README.md
index cc29002..a4ac126 100644
--- a/README.md
+++ b/README.md
@@ -70,8 +70,12 @@ Usage
        pcompress -E ... - This also implies '-D'.
 
     Number of threads can optionally be specified: -t <1 - 256 count>
-    Pass '-M' to display memory allocator statistics
-    Pass '-C' to display compression statistics
+    Other flags:
+       '-L' -     Enable LZP pre-compression. This improves compression ratio of all
+                  algorithms with some extra CPU and very low RAM overhead. Using
+                  delta encoding in conjunction with this may not always be beneficial.
+       '-M' -     Display memory allocator statistics
+       '-C' -     Display compression statistics
 
 Environment Variables
 =====================
diff --git a/lzp/lzp.c b/lzp/lzp.c
new file mode 100644
index 0000000..ecece04
--- /dev/null
+++ b/lzp/lzp.c
@@ -0,0 +1,454 @@
+/*-----------------------------------------------------------*/
+/* Block Sorting, Lossless Data Compression Library.         */
+/* Lempel Ziv Prediction                                     */
+/*-----------------------------------------------------------*/
+
+/*--
+
+This file is a part of bsc and/or libbsc, a program and a library for
+lossless, block-sorting data compression.
+
+Copyright (c) 2009-2012 Ilya Grebnov <ilya.grebnov@gmail.com>
+Copyright (c) 2012 Moinak Ghosh <moinakg@gmail.com>
+
+See file AUTHORS for a full list of contributors.
+
+The bsc and libbsc is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The bsc and libbsc is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the bsc and libbsc. If not, see http://www.gnu.org/licenses/.
+
+Please see the files COPYING and COPYING.LIB for full copyright information.
+
+See also the bsc and libbsc web site:
+  http://libbsc.com/ for more information.
+
+--*/
+
+/*
+ *  TODO: Port the parallel implementation.
+ */
+#undef LZP_OPENMP
+
+#include <stdlib.h>
+#include <memory.h>
+#include <string.h>
+#include <allocator.h>
+#include <sys/types.h>
+
+#include "lzp.h"
+
+#define LZP_LZP_MATCH_FLAG 	0xf2
+/* Pick how many independently encoded LZP blocks an n-byte buffer is cut into. */
+static
+inline int bsc_lzp_num_blocks(ssize_t n)
+{
+    if (n >= LZP_MAX_BLOCK)         return (n / LZP_MAX_BLOCK);
+    if (n >= 16 * 1024 * 1024)      return 8;
+    if (n >=  4 * 1024 * 1024)      return 4;
+    if (n >=       256 * 1024)      return 2;
+
+    return 1;
+}
+/* LZP-encode [input, inputEnd) into [output, outputEnd); returns bytes written or a negative LZP_* code. */
+static
+int bsc_lzp_encode_block(const unsigned char * input, const unsigned char * inputEnd, unsigned char * output, unsigned char * outputEnd, int hashSize, int minLen)
+{
+    int *lookup, i;
+    if (inputEnd - input < 16)
+    {
+        return LZP_NOT_COMPRESSIBLE;
+    }
+    /* lookup maps a hash of the 4-byte context to the input offset last seen right after that context. */
+    if (lookup = (int *)slab_calloc(NULL, (int)(1 << hashSize), sizeof(int)))
+    {
+        unsigned int            mask        = (int)(1 << hashSize) - 1;
+        const unsigned char *   inputStart  = input;
+        const unsigned char *   outputStart = output;
+        const unsigned char *   outputEOB   = outputEnd - 4;
+        /* The first 4 bytes are copied verbatim and seed the context. */
+        unsigned int context = 0;
+        for (i = 0; i < 4; ++i)
+        {
+            context = (context << 8) | (*output++ = *input++);
+        }
+        /* The main loop ends minLen + 8 bytes before inputEnd; the tail loop below handles the rest. */
+        const unsigned char * heuristic      = input;
+        const unsigned char * inputMinLenEnd = inputEnd - minLen - 8;
+        while ((input < inputMinLenEnd) && (output < outputEOB))
+        {
+            unsigned int index = ((context >> 15) ^ context ^ (context >> 3)) & mask;
+            int value = lookup[index]; lookup[index] = (int)(input - inputStart);
+            if (value > 0)
+            {
+                const unsigned char * reference = inputStart + value;
+                if ((*(unsigned int *)(input + minLen - 4) == *(unsigned int *)(reference + minLen - 4)) && (*(unsigned int *)(input) == *(unsigned int *)(reference)))
+                {
+                    if ((heuristic > input) && (*(unsigned int *)heuristic != *(unsigned int *)(reference + (heuristic - input))))
+                    {
+                        goto LZP_LZP_MATCH_NOT_FOUND;
+                    }
+                    /* Extend the candidate match 4 bytes at a time. */
+                    int len = 4;
+                    for (; input + len < inputMinLenEnd; len += 4)
+                    {
+                        if (*(unsigned int *)(input + len) != *(unsigned int *)(reference + len)) break;
+                    }
+                    if (len < minLen)
+                    {
+                        if (heuristic < input + len) heuristic = input + len;
+                        goto LZP_LZP_MATCH_NOT_FOUND;
+                    }
+                    /* Pick up to 3 further matching bytes that the 4-byte stride did not cover. */
+                    if (input[len] == reference[len]) len++;
+                    if (input[len] == reference[len]) len++;
+                    if (input[len] == reference[len]) len++;
+
+                    input += len; context = input[-1] | (input[-2] << 8) | (input[-3] << 16) | (input[-4] << 24);
+                    /* Emit the match flag, then (len - minLen) as a run of 254 bytes closed by one last byte. */
+                    *output++ = LZP_LZP_MATCH_FLAG;
+
+                    len -= minLen; while (len >= 254) { len -= 254; *output++ = 254; if (output >= outputEOB) break; }
+
+                    *output++ = (unsigned char)(len);
+                }
+                else
+                {
+		    unsigned char next;
+LZP_LZP_MATCH_NOT_FOUND:
+                    next = *output++ = *input++; context = (context << 8) | next;
+                    if (next == LZP_LZP_MATCH_FLAG) *output++ = 255;
+                }
+            }
+            else
+            {
+                context = (context << 8) | (*output++ = *input++);
+            }
+        }
+        /* Tail: literals only; a literal equal to the match flag is followed by 255 when its hash slot is in use. */
+        while ((input < inputEnd) && (output < outputEOB))
+        {
+            unsigned int index = ((context >> 15) ^ context ^ (context >> 3)) & mask;
+            int value = lookup[index]; lookup[index] = (int)(input - inputStart);
+            if (value > 0)
+            {
+                unsigned char next = *output++ = *input++; context = (context << 8) | next;
+                if (next == LZP_LZP_MATCH_FLAG) *output++ = 255;
+            }
+            else
+            {
+                context = (context << 8) | (*output++ = *input++);
+            }
+        }
+
+        slab_free(NULL, lookup);
+        /* Reaching outputEOB (outputEnd - 4) is reported as not compressible. */
+        return (output >= outputEOB) ? LZP_NOT_COMPRESSIBLE : (int)(output - outputStart);
+    }
+
+    return LZP_NOT_ENOUGH_MEMORY;
+}
+/* Decode one LZP block from [input, inputEnd) into output; returns bytes produced or a negative LZP_* code. */
+static
+int bsc_lzp_decode_block(const unsigned char * input, const unsigned char * inputEnd, unsigned char * output, int hashSize, int minLen)
+{
+    int *lookup, i;
+    if (inputEnd - input < 4)
+    {
+        return LZP_UNEXPECTED_EOB;
+    }
+    /* NOTE(review): no output size is passed in and length runs are not checked against inputEnd. */
+    if (lookup = (int *)slab_calloc(NULL, (int)(1 << hashSize), sizeof(int)))
+    {
+        unsigned int            mask        = (int)(1 << hashSize) - 1;
+        const unsigned char *   outputStart = output;
+        /* The first 4 bytes are literals and seed the context, as in the encoder. */
+        unsigned int context = 0;
+        for (i = 0; i < 4; ++i)
+        {
+            context = (context << 8) | (*output++ = *input++);
+        }
+        /* Same hash and table updates as the encoder, keyed on output offsets. */
+        while (input < inputEnd)
+        {
+            unsigned int index = ((context >> 15) ^ context ^ (context >> 3)) & mask;
+            int value = lookup[index]; lookup[index] = (int)(output - outputStart);
+            if (*input == LZP_LZP_MATCH_FLAG && value > 0)
+            {
+                input++;
+                if (*input != 255)
+                {
+                    int len = minLen; while (1) { len += *input; if (*input++ != 254) break; }
+                    /* len is minLen plus the summed length bytes; copy that many bytes from the predicted position. */
+                    const unsigned char * reference = outputStart + value;
+                          unsigned char * outputEnd = output + len;
+                    /* Gap under 4: copy 4 single bytes, then pull reference back so the gap is >= 4 and a multiple of the old gap. */
+                    if (output - reference < 4)
+                    {
+                        int offset[4] = {0, 3, 2, 3};
+
+                        *output++ = *reference++;
+                        *output++ = *reference++;
+                        *output++ = *reference++;
+                        *output++ = *reference++;
+
+                        reference -= offset[output - reference];
+                    }
+                    /* Word copies may write up to 3 bytes past outputEnd; output is reset to outputEnd afterwards. */
+                    while (output < outputEnd) { *(unsigned int *)output = *(unsigned int*)reference; output += 4; reference += 4; }
+
+                    output = outputEnd; context = output[-1] | (output[-2] << 8) | (output[-3] << 16) | (output[-4] << 24);
+                }
+                else
+                {
+                    input++; context = (context << 8) | (*output++ = LZP_LZP_MATCH_FLAG);
+                }
+            }
+            else
+            {
+                context = (context << 8) | (*output++ = *input++);
+            }
+        }
+
+        slab_free(NULL, lookup);
+
+        return (int)(output - outputStart);
+    }
+
+    return LZP_NOT_ENOUGH_MEMORY;
+}
+/* Encode n bytes as one block or as several independent blocks; returns total bytes written or a negative LZP_* code. */
+static
+ssize_t bsc_lzp_compress_serial(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen)
+{
+    if (bsc_lzp_num_blocks(n) == 1)
+    {
+        int result = bsc_lzp_encode_block(input, input + n, output + 1, output + n - 1, hashSize, minLen);
+        if (result >= LZP_NO_ERROR) result = (output[0] = 1, result + 1);
+
+        return result;
+    }
+    /* Multi-block layout: 1 count byte, then 8 bytes (input size, stored size) per block, then the payloads. */
+    int nBlocks   = bsc_lzp_num_blocks(n);
+    int chunkSize = n / nBlocks;
+    int blockId;
+    ssize_t outputPtr = 1 + 8 * nBlocks;
+
+    output[0] = nBlocks;
+    for (blockId = 0; blockId < nBlocks; ++blockId)
+    {
+        int inputStart  = blockId * chunkSize;
+        int inputSize   = blockId != nBlocks - 1 ? chunkSize : n - inputStart;
+        int outputSize  = inputSize; if (outputSize > n - outputPtr) outputSize = n - outputPtr;
+        /* Encode the chunk; if that fails, store it raw (stored size == input size) unless it no longer fits in n bytes. */
+        int result = bsc_lzp_encode_block(input + inputStart, input + inputStart + inputSize, output + outputPtr, output + outputPtr + outputSize, hashSize, minLen);
+        if (result < LZP_NO_ERROR)
+        {
+            if (outputPtr + inputSize >= n) return LZP_NOT_COMPRESSIBLE;
+            result = inputSize; memcpy(output + outputPtr, input + inputStart, inputSize);
+        }
+        /* Record this block's sizes in its 8-byte header slot. */
+        *(int *)(output + 1 + 8 * blockId + 0) = inputSize;
+        *(int *)(output + 1 + 8 * blockId + 4) = result;
+
+        outputPtr += result;
+    }
+
+    return outputPtr;
+}
+
+#ifdef LZP_OPENMP
+/* Parallel encoder carried over from libbsc; not compiled because LZP_OPENMP is #undef'd near the top of this file. */
+static
+int bsc_lzp_compress_parallel(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen)
+{
+    if (unsigned char * buffer = (unsigned char *)bsc_malloc(n * sizeof(unsigned char)))
+    {
+        int compressionResult[ALPHABET_SIZE];
+
+        int nBlocks   = bsc_lzp_num_blocks(n);
+        int result    = LZP_NO_ERROR;
+        int chunkSize = n / nBlocks;
+
+        int numThreads = omp_get_max_threads();
+        if (numThreads > nBlocks) numThreads = nBlocks;
+
+        output[0] = nBlocks;
+        #pragma omp parallel num_threads(numThreads) if(numThreads > 1)
+        {
+            if (omp_get_num_threads() == 1)
+            {
+                result = bsc_lzp_compress_serial(input, output, n, hashSize, minLen);
+            }
+            else
+            {
+                #pragma omp for schedule(dynamic)
+                for (int blockId = 0; blockId < nBlocks; ++blockId)
+                {
+                    int blockStart   = blockId * chunkSize;
+                    int blockSize    = blockId != nBlocks - 1 ? chunkSize : n - blockStart;
+
+                    compressionResult[blockId] = bsc_lzp_encode_block(input + blockStart, input + blockStart + blockSize, buffer + blockStart, buffer + blockStart + blockSize, hashSize, minLen);
+                    if (compressionResult[blockId] < LZP_NO_ERROR) compressionResult[blockId] = blockSize;
+
+                    *(int *)(output + 1 + 8 * blockId + 0) = blockSize;
+                    *(int *)(output + 1 + 8 * blockId + 4) = compressionResult[blockId];
+                }
+
+                #pragma omp single
+                {
+                    result = 1 + 8 * nBlocks;
+                    for (int blockId = 0; blockId < nBlocks; ++blockId)
+                    {
+                        result += compressionResult[blockId];
+                    }
+
+                    if (result >= n) result = LZP_NOT_COMPRESSIBLE;
+                }
+
+                if (result >= LZP_NO_ERROR)
+                {
+                    #pragma omp for schedule(dynamic)
+                    for (int blockId = 0; blockId < nBlocks; ++blockId)
+                    {
+                        int blockStart   = blockId * chunkSize;
+                        int blockSize    = blockId != nBlocks - 1 ? chunkSize : n - blockStart;
+
+                        int outputPtr = 1 + 8 * nBlocks;
+                        for (int p = 0; p < blockId; ++p) outputPtr += compressionResult[p];
+
+                        if (compressionResult[blockId] != blockSize)
+                        {
+                            memcpy(output + outputPtr, buffer + blockStart, compressionResult[blockId]);
+                        }
+                        else
+                        {
+                            memcpy(output + outputPtr, input + blockStart, compressionResult[blockId]);
+                        }
+                    }
+                }
+            }
+        }
+
+        bsc_free(buffer);
+
+        return result;
+    }
+    return LZP_NOT_ENOUGH_MEMORY;
+}
+
+#endif
+/* Public LZP encoder entry point; see lzp.h for the parameter contract. */
+ssize_t lzp_compress(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen, int features)
+{
+#ifdef LZP_OPENMP
+    /* Threading requested and more than one block to hand out: go parallel. */
+    int useParallel = (bsc_lzp_num_blocks(n) != 1) && (features & LZP_FEATURE_MULTITHREADING);
+    if (useParallel)
+    {
+        return bsc_lzp_compress_parallel(input, output, n, hashSize, minLen);
+    }
+#endif
+
+    /* Serial path: the only one built while LZP_OPENMP stays undefined. */
+    return bsc_lzp_compress_serial(input, output, n, hashSize, minLen);
+}
+/* Public LZP decoder: input[0] is the block count; returns the decoded size or a negative LZP_* code. */
+ssize_t lzp_decompress(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen, int features)
+{
+    int nBlocks = input[0];
+
+    if (nBlocks == 1)
+    {
+        return bsc_lzp_decode_block(input + 1, input + n, output, hashSize, minLen);
+    }
+    /* One result slot per block; the count is read from a single byte, so it never exceeds 255. */
+    int decompressionResult[ALPHABET_SIZE];
+
+#ifdef LZP_OPENMP
+
+    if (features & LZP_FEATURE_MULTITHREADING)
+    {
+        #pragma omp parallel for schedule(dynamic)
+        for (int blockId = 0; blockId < nBlocks; ++blockId)
+        {
+            int inputPtr = 0;  for (int p = 0; p < blockId; ++p) inputPtr  += *(int *)(input + 1 + 8 * p + 4);
+            int outputPtr = 0; for (int p = 0; p < blockId; ++p) outputPtr += *(int *)(input + 1 + 8 * p + 0);
+
+            inputPtr += 1 + 8 * nBlocks;
+
+            int inputSize  = *(int *)(input + 1 + 8 * blockId + 4);
+            int outputSize = *(int *)(input + 1 + 8 * blockId + 0);
+
+            if (inputSize != outputSize)
+            {
+                decompressionResult[blockId] = bsc_lzp_decode_block(input + inputPtr, input + inputPtr + inputSize, output + outputPtr, hashSize, minLen);
+            }
+            else
+            {
+                decompressionResult[blockId] = inputSize; memcpy(output + outputPtr, input + inputPtr, inputSize);
+            }
+        }
+    }
+    else
+
+#endif
+
+    {
+	int blockId, p;
+
+        for (blockId = 0; blockId < nBlocks; ++blockId)
+        {
+            int inputPtr = 0;  for (p = 0; p < blockId; ++p) inputPtr  += *(int *)(input + 1 + 8 * p + 4);
+            int outputPtr = 0; for (p = 0; p < blockId; ++p) outputPtr += *(int *)(input + 1 + 8 * p + 0);
+            /* Payloads begin after the count byte and the nBlocks 8-byte size slots. */
+            inputPtr += 1 + 8 * nBlocks;
+
+            int inputSize  = *(int *)(input + 1 + 8 * blockId + 4);
+            int outputSize = *(int *)(input + 1 + 8 * blockId + 0);
+            /* Equal sizes mean the block was stored raw rather than LZP-encoded. */
+            if (inputSize != outputSize)
+            {
+                decompressionResult[blockId] = bsc_lzp_decode_block(input + inputPtr, input + inputPtr + inputSize, output + outputPtr, hashSize, minLen);
+            }
+            else
+            {
+                decompressionResult[blockId] = inputSize; memcpy(output + outputPtr, input + inputPtr, inputSize);
+            }
+        }
+    }
+    /* Sum the per-block results; if any block failed, the last negative code is returned instead of the total. */
+    ssize_t dataSize = 0;
+    int result = LZP_NO_ERROR;
+    int blockId;
+    for (blockId = 0; blockId < nBlocks; ++blockId)
+    {
+        if (decompressionResult[blockId] < LZP_NO_ERROR) result = decompressionResult[blockId];
+        dataSize += decompressionResult[blockId];
+    }
+
+    return (result == LZP_NO_ERROR) ? dataSize : result;
+}
+
+int lzp_hash_size(int level) {
+    /* Lower levels get more hash bits on top of the default; the result is used as a shift count. */
+    int extraBits;
+
+    if (level <= 3)      extraBits = 5;
+    else if (level <= 5) extraBits = 4;
+    else if (level <= 7) extraBits = 3;
+    else                 extraBits = 2;
+
+    return (LZP_DEFAULT_LZPHASHSIZE + extraBits);
+}
+/*-----------------------------------------------------------*/
+/* End                                               lzp.cpp */
+/*-----------------------------------------------------------*/
diff --git a/lzp/lzp.h b/lzp/lzp.h
new file mode 100644
index 0000000..29a4b42
--- /dev/null
+++ b/lzp/lzp.h
@@ -0,0 +1,89 @@
+/*-----------------------------------------------------------*/
+/* Block Sorting, Lossless Data Compression Library.         */
+/* Interface to Lempel Ziv Prediction functions              */
+/*-----------------------------------------------------------*/
+
+/*--
+
+This file is a part of bsc and/or libbsc, a program and a library for
+lossless, block-sorting data compression.
+
+Copyright (c) 2009-2012 Ilya Grebnov <ilya.grebnov@gmail.com>
+Copyright (c) 2012 Moinak Ghosh <moinakg@gmail.com>
+
+See file AUTHORS for a full list of contributors.
+
+The bsc and libbsc is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The bsc and libbsc is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the bsc and libbsc. If not, see http://www.gnu.org/licenses/.
+
+Please see the files COPYING and COPYING.LIB for full copyright information.
+
+See also the bsc and libbsc web site:
+  http://libbsc.com/ for more information.
+
+--*/
+
+#ifndef _LZP_H
+#define _LZP_H
+
+#include <sys/types.h>	/* ssize_t, used by the prototypes below */
+
+#define LZP_NO_ERROR                0
+#define LZP_BAD_PARAMETER          -1
+#define LZP_NOT_ENOUGH_MEMORY      -2
+#define LZP_NOT_COMPRESSIBLE       -3
+#define LZP_NOT_SUPPORTED          -4
+#define LZP_UNEXPECTED_EOB         -5
+#define LZP_DATA_CORRUPT           -6
+
+#define LZP_DEFAULT_LZPHASHSIZE    16
+#define LZP_DEFAULT_LZPMINLEN      128
+#define	LZP_MAX_BLOCK              (2147483648LL)
+#define	ALPHABET_SIZE              (256)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+    /**
+    * Preprocess a memory block by LZP algorithm.
+    * @param input      - the input memory block of n bytes.
+    * @param output     - the output memory block of n bytes.
+    * @param n          - the length of the input/output memory blocks.
+    * @param hashSize   - the hash table size.
+    * @param minLen     - the minimum match length.
+    * @param features   - the set of additional features.
+    * @return The length of preprocessed memory block if no error occurred, error code otherwise.
+    */
+    ssize_t lzp_compress(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen, int features);
+
+    /**
+    * Reconstructs the original memory block after LZP algorithm.
+    * @param input      - the input memory block of n bytes.
+    * @param output     - the output memory block.
+    * @param n          - the length of the input memory block.
+    * @param hashSize   - the hash table size.
+    * @param minLen     - the minimum match length.
+    * @param features   - the set of additional features.
+    * @return The length of original memory block if no error occurred, error code otherwise.
+    */
+    ssize_t lzp_decompress(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen, int features);
+    /** Returns the hashSize value to pass to the functions above for a given compression level. */
+    int lzp_hash_size(int level);
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+/*-----------------------------------------------------------*/
+/* End                                                 lzp.h */
+/*-----------------------------------------------------------*/
diff --git a/main.c b/main.c
index 3c2213d..641bde0 100644
--- a/main.c
+++ b/main.c
@@ -45,6 +45,7 @@
 #include <pcompress.h>
 #include <allocator.h>
 #include <rabin_polynomial.h>
+#include <lzp.h>
 
 /*
  * We use 5MB chunks by default.
@@ -78,6 +79,7 @@ static int hide_cmp_stats = 1;
 static int enable_rabin_scan = 0;
 static int enable_delta_encode = 0;
 static int enable_rabin_split = 1;
+static int lzp_preprocess = 0;
 static unsigned int chunk_num;
 static uint64_t largest_chunk, smallest_chunk, avg_chunk;
 static const char *exec_name;
@@ -128,8 +130,11 @@ usage(void)
 	    "5) Perform Delta Encoding in addition to Exact Dedup:\n"
 	    "   %s -E ... - This also implies '-D'.\n"
 	    "6) Number of threads can optionally be specified: -t <1 - 256 count>\n"
-	    "7) Pass '-M' to display memory allocator statistics\n"
-	    "8) Pass '-C' to display compression statistics\n\n",
+	    "7) Other flags:\n"
+	    "   '-L'	- Enable LZP pre-compression. This improves compression ratio of all\n"
+	    "       	  algorithms with some extra CPU and very low RAM overhead.\n"
+	    "   '-M'	- Display memory allocator statistics\n"
+	    "   '-C'	- Display compression statistics\n\n",
 	    UTILITY_VERSION, exec_name, exec_name, exec_name, exec_name, exec_name, exec_name);
 }
 
@@ -148,6 +153,92 @@ show_compression_stats(uint64_t chunksize)
 	    bytes_to_size(avg_chunk), (double)avg_chunk/(double)chunksize*100);
 }
 
+/*
+ * Wrapper functions to pre-process the buffer and then call the main compression routine.
+ * At present only LZP pre-compression is used below. Some extra metadata is added:
+ *
+ * Byte 0: Flags indicating which pre-processor was used and whether the final
+ *         algorithm also compressed the buffer (PREPROC_COMPRESSED).
+ * Byte 1 - Byte 8: Size of buffer after pre-processing (only if PREPROC_COMPRESSED is set).
+ *
+ * A buffer may be only pre-processed and not compressed by the final algorithm if the
+ * final one fails to compress it. The reverse (compressed but not pre-processed) is not allowed.
+ */
+int
+preproc_compress(compress_func_ptr cmp_func, void *src, size_t srclen, void *dst,
+	size_t *dstlen, int level, uchar_t chdr, void *data)
+{
+	uchar_t *out = (uchar_t *)dst, type;
+	ssize_t rv, cmplen;
+
+	if (!lzp_preprocess) {
+		/*
+		 * Callers only get here with LZP enabled; guard anyway.
+		 */
+		fprintf(stderr, "Invalid preprocessing mode\n");
+		return (-1);
+	}
+
+	/* Stage 1: LZP into dst, then move the result back into src. */
+	type = PREPROC_TYPE_LZP;
+	rv = lzp_compress(src, dst, srclen, lzp_hash_size(level), LZP_DEFAULT_LZPMINLEN, 0);
+	if (rv < 0 || rv == srclen) return (-1);
+	srclen = rv;
+	memcpy(src, dst, srclen);
+
+	/* Stage 2: flag byte + 8-byte length, then the real compressor. */
+	*out = type;
+	*((int64_t *)(out + 1)) = htonll(srclen);
+	cmplen = srclen;
+	rv = cmp_func(src, srclen, out+9, &cmplen, level, chdr, data);
+	if (rv != 0 || cmplen >= srclen) {
+		/* Final stage did not help: emit flag byte + LZP output only. */
+		memcpy(out+1, src, srclen);
+		cmplen = srclen;
+		*dstlen = cmplen + 1;
+	} else {
+		*out |= PREPROC_COMPRESSED;
+		*dstlen = cmplen + 9;
+	}
+	return (0);
+}
+/*
+ * Reverse of preproc_compress(): undo the final compression stage if PREPROC_COMPRESSED
+ * is set, then undo LZP into dst. Uses src as scratch. Returns 0, or negative on failure.
+ */
+int
+preproc_decompress(compress_func_ptr dec_func, void *src, size_t srclen, void *dst,
+	size_t *dstlen, int level, uchar_t chdr, void *data)
+{
+	uchar_t *sorc = (uchar_t *)src, type;
+	ssize_t result;
+
+	type = *sorc++;
+	srclen--;
+	if (type & PREPROC_COMPRESSED) {
+		*dstlen = ntohll(*((int64_t *)(sorc)));
+		result = dec_func(sorc + 8, srclen - 8, dst, dstlen, level, chdr, data);
+		if (result < 0) return (result);
+		/* Stage the LZP stream at the start of src; dst becomes the final output. */
+		sorc = (uchar_t *)src;
+		memcpy(sorc, dst, *dstlen);
+		srclen = *dstlen;
+	}
+	if (!(type & PREPROC_TYPE_LZP)) {
+		fprintf(stderr, "Invalid preprocessing flags: %d\n", type);
+		return (-1);
+	}
+
+	/* sorc is src + 1 if only LZP was applied, src itself after the stage above. */
+	result = lzp_decompress(sorc, dst, srclen, lzp_hash_size(level), LZP_DEFAULT_LZPMINLEN, 0);
+	if (result < 0) {
+		fprintf(stderr, "LZP decompression failed.\n");
+		return (-1);
+	}
+	*dstlen = result;
+	return (0);
+}
+
 /*
  * This routine is called in multiple threads. Calls the decompression handler
  * as encoded in the file header. For adaptive mode the handler adapt_decompress()
@@ -214,8 +305,13 @@ redo:
 		cmpbuf = cseg + RABIN_HDR_SIZE + rabin_index_sz_cmp;
 		ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + rabin_index_sz;
 		if (HDR & COMPRESSED) {
-			rv = tdat->decompress(cmpbuf, rabin_data_sz_cmp, ubuf, &_chunksize,
-			tdat->level, HDR, tdat->data);
+			if (HDR & CHUNK_FLAG_PREPROC) {
+				rv = preproc_decompress(tdat->decompress, cmpbuf, rabin_data_sz_cmp,
+				    ubuf, &_chunksize, tdat->level, HDR, tdat->data);
+			} else {
+				rv = tdat->decompress(cmpbuf, rabin_data_sz_cmp, ubuf, &_chunksize,
+				    tdat->level, HDR, tdat->data);
+			}
 			if (rv == -1) {
 				tdat->len_cmp = 0;
 				fprintf(stderr, "ERROR: Chunk %d, decompression failed.\n", tdat->id);
@@ -237,8 +333,13 @@ redo:
 		}
 	} else {
 		if (HDR & COMPRESSED) {
-			rv = tdat->decompress(cseg, tdat->len_cmp, tdat->uncompressed_chunk,
-			&_chunksize, tdat->level, HDR, tdat->data);
+			if (HDR & CHUNK_FLAG_PREPROC) {
+				rv = preproc_decompress(tdat->decompress, cseg, tdat->len_cmp,
+				    tdat->uncompressed_chunk, &_chunksize, tdat->level, HDR, tdat->data);
+			} else {
+				rv = tdat->decompress(cseg, tdat->len_cmp, tdat->uncompressed_chunk,
+				    &_chunksize, tdat->level, HDR, tdat->data);
+			}
 		} else {
 			memcpy(tdat->uncompressed_chunk, cseg, _chunksize);
 		}
@@ -317,7 +418,7 @@ cont:
  * |     `---------------- 2 - Lzma (Adaptive Mode)
  * |                       3 - PPMD (Adaptive Mode)
  * |
- * `---------------------- 1 - Last Chunk flag
+ * `---------------------- 1 - Chunk size flag (if original chunk is of variable length)
  *
  * A file trailer to indicate end.
  * Zero Compressed length: 8 zero bytes.
@@ -459,7 +560,7 @@ start_decompress(const char *filename, const char *to_filename)
 			}
 		}
 		if (enable_rabin_scan) {
-			tdat->rctx = create_rabin_context(chunksize, compressed_chunksize,
+			tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, 0,
 			    algo, enable_delta_encode);
 			if (tdat->rctx == NULL) {
 				UNCOMP_BAIL;
@@ -685,7 +786,7 @@ redo:
 			/* Compress index if it is at least 90 bytes. */
 			rv = lzma_compress(tdat->uncompressed_chunk + RABIN_HDR_SIZE,
 			    rabin_index_sz, compressed_chunk + RABIN_HDR_SIZE,
-			    &index_size_cmp, tdat->rctx->level, 0, tdat->rctx->lzma_data);
+			    &index_size_cmp, tdat->rctx->level, 255, tdat->rctx->lzma_data);
 		} else {
 			memcpy(compressed_chunk + RABIN_HDR_SIZE,
 			    tdat->uncompressed_chunk + RABIN_HDR_SIZE, rabin_index_sz);
@@ -696,9 +797,16 @@ redo:
 		if (rv == 0) {
 			memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE);
 			/* Compress data chunk. */
-			rv = tdat->compress(tdat->uncompressed_chunk + rabin_index_sz,
-			    _chunksize, compressed_chunk + index_size_cmp, &_chunksize,
-		            tdat->level, 0, tdat->data);
+			if (lzp_preprocess) {
+				rv = preproc_compress(tdat->compress,
+				    tdat->uncompressed_chunk + rabin_index_sz,
+				    _chunksize, compressed_chunk + index_size_cmp, &_chunksize,
+				    tdat->level, 0, tdat->data);
+			} else {
+				rv = tdat->compress(tdat->uncompressed_chunk + rabin_index_sz,
+				    _chunksize, compressed_chunk + index_size_cmp, &_chunksize,
+				    tdat->level, 0, tdat->data);
+			}
 
 			/* Can't compress data just retain as-is. */
 			if (rv < 0)
@@ -720,8 +828,14 @@ redo:
 	} else {
 plain_compress:
 		_chunksize = tdat->rbytes;
-		rv = tdat->compress(tdat->uncompressed_chunk, tdat->rbytes,
-		    compressed_chunk, &_chunksize, tdat->level, 0, tdat->data);
+		if (lzp_preprocess) {
+			rv = preproc_compress(tdat->compress,
+			    tdat->uncompressed_chunk, tdat->rbytes,
+			    compressed_chunk, &_chunksize, tdat->level, 0, tdat->data);
+		} else {
+			rv = tdat->compress(tdat->uncompressed_chunk, tdat->rbytes,
+			    compressed_chunk, &_chunksize, tdat->level, 0, tdat->data);
+		}
 	}
 	/*
 	 * Sanity check to ensure compressed data is lesser than original.
@@ -742,6 +856,9 @@ plain_compress:
 	if (enable_rabin_scan && tdat->rctx->valid) {
 		type |= CHUNK_FLAG_DEDUP;
 	}
+	if (lzp_preprocess) {
+		type |= CHUNK_FLAG_PREPROC;
+	}
 	/*
 	 * Insert compressed chunk length and CRC64 checksum into
 	 * chunk header.
@@ -871,8 +988,8 @@ start_compress(const char *filename, uint64_t chunksize, int level)
 	if (enable_rabin_scan) {
 		flags |= FLAG_DEDUP;
 		/* Additional scratch space for dedup arrays. */
-		compressed_chunksize += (rabin_buf_extra(chunksize) -
-					(compressed_chunksize - chunksize));
+		compressed_chunksize += (rabin_buf_extra(chunksize, 0, algo,
+			enable_delta_encode) - (compressed_chunksize - chunksize));
 	}
 
 	err = 0;
@@ -992,7 +1109,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
 			}
 		}
 		if (enable_rabin_scan) {
-			tdat->rctx = create_rabin_context(chunksize, compressed_chunksize,
+			tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, 0,
 			    algo, enable_delta_encode);
 			if (tdat->rctx == NULL) {
 				COMP_BAIL;
@@ -1057,7 +1174,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
 	 * Read the first chunk into a spare buffer (a simple double-buffering).
 	 */
 	if (enable_rabin_split) {
-		rctx = create_rabin_context(chunksize, 0, algo, enable_delta_encode);
+		rctx = create_rabin_context(chunksize, 0, 0, algo, enable_delta_encode);
 		rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
 	} else {
 		rbytes = Read(uncompfd, cread_buf, chunksize);
@@ -1371,7 +1488,7 @@ main(int argc, char *argv[])
 	level = 6;
 	slab_init();
 
-	while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDEr")) != -1) {
+	while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDErL")) != -1) {
 		int ovr;
 
 		switch (opt) {
@@ -1432,6 +1549,10 @@ main(int argc, char *argv[])
 			enable_delta_encode = 1;
 			break;
 
+		    case 'L':
+			lzp_preprocess = 1;
+			break;
+
 		    case 'r':
 			enable_rabin_split = 0;
 			break;
diff --git a/pcompress.h b/pcompress.h
index 2842dbf..ef16d35 100644
--- a/pcompress.h
+++ b/pcompress.h
@@ -47,8 +47,12 @@ extern "C" {
 #define	BZIP2_A_NUM	16
 #define	LZMA_A_NUM	32
 #define	CHUNK_FLAG_DEDUP	2
+#define	CHUNK_FLAG_PREPROC	4	/* OR-ed into the chunk type flags when lzp_preprocess is set. */
 #define	COMP_EXTN	".pz"
 
+#define	PREPROC_TYPE_LZP	1
+#define	PREPROC_COMPRESSED	128
+
 /*
  * lower 3 bits in higher nibble indicate compression algorithm.
  */
diff --git a/rabin/rabin_polynomial.c b/rabin/rabin_polynomial.c
index eea9a9a..8fb6d37 100755
--- a/rabin/rabin_polynomial.c
+++ b/rabin/rabin_polynomial.c
@@ -94,21 +94,47 @@ static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
 uint64_t ir[256];
 static int inited = 0;
 
-uint32_t
-rabin_buf_extra(uint64_t chunksize)
+static uint32_t
+rabin_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
 {
-	return ((chunksize / RAB_POLYNOMIAL_MIN_BLOCK_SIZE2) * sizeof (uint32_t));
+	/* Minimum Rabin block size: the full 1 << (rab_blk_sz + RAB_BLK_MIN_BITS) or half of it. */
+	uint32_t min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
+	/* strncmp stops at algo's NUL; memcmp could read past a short name such as "lz4". */
+	if (((strncmp(algo, "lzma", 4) == 0 || strncmp(algo, "adapt", 5) == 0) &&
+	      chunksize <= LZMA_WINDOW_MAX) || delta_flag) {
+		if (strncmp(algo, "lzfx", 4) == 0 || strncmp(algo, "lz4", 3) == 0 ||
+		    strncmp(algo, "zlib", 4) == 0 || strncmp(algo, "none", 4) == 0) {
+			min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS - 1);
+		}
+	} else {
+		min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS - 1);
+	}
+	return (min_blk);
+}
+
+uint32_t
+rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
+{
+	/* Out-of-range block size selectors fall back to the default. */
+	int blk = (rab_blk_sz >= 1 && rab_blk_sz <= 5) ? rab_blk_sz : RAB_BLK_DEFAULT;
+	uint64_t nblks = chunksize / rabin_min_blksz(chunksize, blk, algo, delta_flag);
+
+	return (nblks * sizeof (uint32_t));
 }
 
 /*
  * Initialize the algorithm with the default params.
  */
 rabin_context_t *
-create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *algo, int delta_flag) {
+create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz,
+    const char *algo, int delta_flag) {
 	rabin_context_t *ctx;
 	unsigned char *current_window_data;
 	uint32_t i;
 
+	if (rab_blk_sz < 1 || rab_blk_sz > 5)
+		rab_blk_sz = RAB_BLK_DEFAULT;
+
 	/*
 	 * Pre-compute a table of irreducible polynomial evaluations for each
 	 * possible byte value.
@@ -157,28 +183,12 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al
 
 	ctx->rabin_break_patt = 0;
 	ctx->delta_flag = delta_flag;
-	if (((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) &&
-	      chunksize <= LZMA_WINDOW_MAX) || delta_flag) {
-		if (memcmp(algo, "lzfx", 4) == 0 || memcmp(algo, "lz4", 3) == 0 ||
-		    memcmp(algo, "zlib", 4) == 0 || memcmp(algo, "none", 4) == 0) {
-			ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE2;
-			ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK2;
-			ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE2;
-			if (delta_flag)
-				ctx->delta_flag = DELTA_LESS_FUZZY;
-		} else {
-			ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE;
-			ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK;
-			ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE;
-		}
-	} else {
-		ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE2;
-		ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK2;
-		ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE2;
-	}
-
+	ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
+	ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1;
+	ctx->rabin_poly_min_block_size = rabin_min_blksz(chunksize, rab_blk_sz, algo, delta_flag);
 	ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size;
 	ctx->blknum = chunksize / ctx->rabin_poly_min_block_size;
+
 	if (chunksize % ctx->rabin_poly_min_block_size)
 		ctx->blknum++;
 
diff --git a/rabin/rabin_polynomial.h b/rabin/rabin_polynomial.h
index 2873522..6e7f019 100644
--- a/rabin/rabin_polynomial.h
+++ b/rabin/rabin_polynomial.h
@@ -63,26 +63,14 @@
 
 //List of constants, mostly constraints and defaults for various parameters
 //to the Rabin Fingerprinting algorithm
-
 #define	RAB_POLYNOMIAL_CONST 2
-// 1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT = Average Rabin Chunk Size
-// So we are always looking at power of 2 chunk sizes to avoid doing a modulus
-//
-#define	RAB_POLYNOMIAL_AVG_BLOCK_SHIFT 12
-#define	RAB_POLYNOMIAL_AVG_BLOCK_SIZE (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT)
-#define	RAB_POLYNOMIAL_AVG_BLOCK_MASK (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1)
-#define	RAB_POLYNOMIAL_MIN_BLOCK_SIZE RAB_POLYNOMIAL_AVG_BLOCK_SIZE
-#define	RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024)
-
-#define	RAB_POLYNOMIAL_AVG_BLOCK_SHIFT2 12
-#define	RAB_POLYNOMIAL_AVG_BLOCK_SIZE2 (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT)
-#define	RAB_POLYNOMIAL_AVG_BLOCK_MASK2 (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1)
-#define	RAB_POLYNOMIAL_MIN_BLOCK_SIZE2 2048
-
+#define	RAB_BLK_DEFAULT 1	// rab_blk_sz substituted when the caller's value is outside 1..5
+#define	RAB_BLK_MIN_BITS 11	// average Rabin block size is 1 << (rab_blk_sz + RAB_BLK_MIN_BITS)
 #define LZMA_WINDOW_MAX (128L * 1024L * 1024L)
 #define	RAB_POLYNOMIAL_WIN_SIZE 16
 #define	RAB_POLYNOMIAL_MIN_WIN_SIZE 8
 #define	RAB_POLYNOMIAL_MAX_WIN_SIZE 64
+#define	RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024)
 
 // Minimum practical chunk size when doing dedup
 #define	RAB_MIN_CHUNK_SIZE (1048576L)
@@ -166,8 +154,8 @@ typedef struct {
 	int level, delta_flag;
 } rabin_context_t;
 
-extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize,
-	const char *algo, int delta_flag);
+extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, 
+	int rab_blk_sz, const char *algo, int delta_flag); // rab_blk_sz outside 1..5 selects RAB_BLK_DEFAULT
 extern void destroy_rabin_context(rabin_context_t *ctx);
 extern unsigned int rabin_dedup(rabin_context_t *ctx, unsigned char *buf, 
 	ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
@@ -178,6 +166,7 @@ extern void rabin_parse_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_i
 extern void rabin_update_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp,
 			     ssize_t rabin_data_sz_cmp);
 extern void reset_rabin_context(rabin_context_t *ctx);
-extern uint32_t rabin_buf_extra(uint64_t chunksize);
+extern uint32_t rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
+	int delta_flag); // rab_blk_sz outside 1..5 selects RAB_BLK_DEFAULT
 
-#endif /* _RABIN_0POLY_H_ */
\ No newline at end of file
+#endif /* _RABIN_POLY_H_ */