From bf149e880d9606b46f574bd1d88c8420a76306c8 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Thu, 23 Aug 2012 22:58:44 +0530 Subject: [PATCH] Add LZP Pre-Compression support ported from libbsc. Add generic pre-processing wrappers for future support of other pre-processors. Clean up computation of Rabin block sizes. Compute Rabin scratch space accurately to avoid RAM wastage. --- Makefile | 24 ++- README.md | 8 +- lzp/lzp.c | 454 +++++++++++++++++++++++++++++++++++++++ lzp/lzp.h | 89 ++++++++ main.c | 159 ++++++++++++-- pcompress.h | 4 + rabin/rabin_polynomial.c | 58 ++--- rabin/rabin_polynomial.h | 27 +-- 8 files changed, 750 insertions(+), 73 deletions(-) create mode 100644 lzp/lzp.c create mode 100644 lzp/lzp.h diff --git a/Makefile b/Makefile index b421cb0..cc9c4c9 100644 --- a/Makefile +++ b/Makefile @@ -58,14 +58,21 @@ CRCSRCS = lzma/crc64_fast.c lzma/crc64_table.c CRCHDRS = lzma/crc64_table_le.h lzma/crc64_table_be.h lzma/crc_macros.h CRCOBJS = $(CRCSRCS:.c=.o) -BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ +LZPSRCS = lzp/lzp.c +LZPHDRS = lzp/lzp.h +LZPOBJS = $(LZPSRCS:.c=.o) + +BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~ RM = rm -f CPPFLAGS = -I. 
-I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_PROPS \ - -DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 + -DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \ + -I./lzp VEC_FLAGS = -ftree-vectorize LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block LDLIBS = -ldl -lbz2 $(ZLIB_DIR) -lz -lm +OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \ +$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) ifdef DEBUG LINK = g++ -m64 -pthread -msse3 @@ -115,16 +122,15 @@ $(LZFXOBJS): $(LZFXSRCS) $(LZFXHDRS) $(LZ4OBJS): $(LZ4SRCS) $(LZ4HDRS) $(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ +$(LZPOBJS): $(LZPSRCS) $(LZPHDRS) + $(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ + $(MAINOBJS): $(MAINSRCS) $(MAINHDRS) $(COMPILE) $(GEN_OPT) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ -$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) \ -$(CRCOBJS) $(RABINOBJS) $(BSDIFFOBJS) - $(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) \ - $(LZFXOBJS) $(LZ4OBJS) $(RABINOBJS) $(BSDIFFOBJS) \ - $(LDLIBS) +$(PROG): $(OBJS) + $(LINK) -o $@ $(OBJS) $(LDLIBS) clean: - $(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(LZFXOBJS) $(LZ4OBJS) \ - $(RABINOBJS) $(BSDIFFOBJS) $(BAKFILES) + $(RM) $(PROG) $(OBJS) $(BAKFILES) diff --git a/README.md b/README.md index cc29002..a4ac126 100644 --- a/README.md +++ b/README.md @@ -70,8 +70,12 @@ Usage pcompress -E ... - This also implies '-D'. Number of threads can optionally be specified: -t <1 - 256 count> - Pass '-M' to display memory allocator statistics - Pass '-C' to display compression statistics + Other flags: + '-L' - Enable LZP pre-compression. This improves compression ratio of all + algorithms with some extra CPU and very low RAM overhead. Using + delta encoding in conjunction with this may not always be beneficial. 
+ '-M' - Display memory allocator statistics + '-C' - Display compression statistics Environment Variables ===================== diff --git a/lzp/lzp.c b/lzp/lzp.c new file mode 100644 index 0000000..ecece04 --- /dev/null +++ b/lzp/lzp.c @@ -0,0 +1,454 @@ +/*-----------------------------------------------------------*/ +/* Block Sorting, Lossless Data Compression Library. */ +/* Lempel Ziv Prediction */ +/*-----------------------------------------------------------*/ + +/*-- + +This file is a part of bsc and/or libbsc, a program and a library for +lossless, block-sorting data compression. + +Copyright (c) 2009-2012 Ilya Grebnov +Copyright (c) 2012 Moinak Ghosh + +See file AUTHORS for a full list of contributors. + +The bsc and libbsc is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The bsc and libbsc is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the bsc and libbsc. If not, see http://www.gnu.org/licenses/. + +Please see the files COPYING and COPYING.LIB for full copyright information. + +See also the bsc and libbsc web site: + http://libbsc.com/ for more information. + +--*/ + +/* + * TODO: Port the parallel implementation. 
+ */ +#undef LZP_OPENMP + +#include +#include +#include +#include +#include + +#include "lzp.h" + +#define LZP_LZP_MATCH_FLAG 0xf2 + +static +inline int bsc_lzp_num_blocks(ssize_t n) +{ + if (n < 256 * 1024) return 1; + if (n < 4 * 1024 * 1024) return 2; + if (n < 16 * 1024 * 1024) return 4; + if (n < LZP_MAX_BLOCK) return 8; + + return (n / LZP_MAX_BLOCK); +} + +static +int bsc_lzp_encode_block(const unsigned char * input, const unsigned char * inputEnd, unsigned char * output, unsigned char * outputEnd, int hashSize, int minLen) +{ + int *lookup, i; + if (inputEnd - input < 16) + { + return LZP_NOT_COMPRESSIBLE; + } + + if (lookup = (int *)slab_calloc(NULL, (int)(1 << hashSize), sizeof(int))) + { + unsigned int mask = (int)(1 << hashSize) - 1; + const unsigned char * inputStart = input; + const unsigned char * outputStart = output; + const unsigned char * outputEOB = outputEnd - 4; + + unsigned int context = 0; + for (i = 0; i < 4; ++i) + { + context = (context << 8) | (*output++ = *input++); + } + + const unsigned char * heuristic = input; + const unsigned char * inputMinLenEnd = inputEnd - minLen - 8; + while ((input < inputMinLenEnd) && (output < outputEOB)) + { + unsigned int index = ((context >> 15) ^ context ^ (context >> 3)) & mask; + int value = lookup[index]; lookup[index] = (int)(input - inputStart); + if (value > 0) + { + const unsigned char * reference = inputStart + value; + if ((*(unsigned int *)(input + minLen - 4) == *(unsigned int *)(reference + minLen - 4)) && (*(unsigned int *)(input) == *(unsigned int *)(reference))) + { + if ((heuristic > input) && (*(unsigned int *)heuristic != *(unsigned int *)(reference + (heuristic - input)))) + { + goto LZP_LZP_MATCH_NOT_FOUND; + } + + int len = 4; + for (; input + len < inputMinLenEnd; len += 4) + { + if (*(unsigned int *)(input + len) != *(unsigned int *)(reference + len)) break; + } + if (len < minLen) + { + if (heuristic < input + len) heuristic = input + len; + goto LZP_LZP_MATCH_NOT_FOUND; + } + + 
if (input[len] == reference[len]) len++; + if (input[len] == reference[len]) len++; + if (input[len] == reference[len]) len++; + + input += len; context = input[-1] | (input[-2] << 8) | (input[-3] << 16) | (input[-4] << 24); + + *output++ = LZP_LZP_MATCH_FLAG; + + len -= minLen; while (len >= 254) { len -= 254; *output++ = 254; if (output >= outputEOB) break; } + + *output++ = (unsigned char)(len); + } + else + { + unsigned char next; +LZP_LZP_MATCH_NOT_FOUND: + next = *output++ = *input++; context = (context << 8) | next; + if (next == LZP_LZP_MATCH_FLAG) *output++ = 255; + } + } + else + { + context = (context << 8) | (*output++ = *input++); + } + } + + while ((input < inputEnd) && (output < outputEOB)) + { + unsigned int index = ((context >> 15) ^ context ^ (context >> 3)) & mask; + int value = lookup[index]; lookup[index] = (int)(input - inputStart); + if (value > 0) + { + unsigned char next = *output++ = *input++; context = (context << 8) | next; + if (next == LZP_LZP_MATCH_FLAG) *output++ = 255; + } + else + { + context = (context << 8) | (*output++ = *input++); + } + } + + slab_free(NULL, lookup); + + return (output >= outputEOB) ? 
LZP_NOT_COMPRESSIBLE : (int)(output - outputStart); + } + + return LZP_NOT_ENOUGH_MEMORY; +} + +static +int bsc_lzp_decode_block(const unsigned char * input, const unsigned char * inputEnd, unsigned char * output, int hashSize, int minLen) +{ + int *lookup, i; + if (inputEnd - input < 4) + { + return LZP_UNEXPECTED_EOB; + } + + if (lookup = (int *)slab_calloc(NULL, (int)(1 << hashSize), sizeof(int))) + { + unsigned int mask = (int)(1 << hashSize) - 1; + const unsigned char * outputStart = output; + + unsigned int context = 0; + for (i = 0; i < 4; ++i) + { + context = (context << 8) | (*output++ = *input++); + } + + while (input < inputEnd) + { + unsigned int index = ((context >> 15) ^ context ^ (context >> 3)) & mask; + int value = lookup[index]; lookup[index] = (int)(output - outputStart); + if (*input == LZP_LZP_MATCH_FLAG && value > 0) + { + input++; + if (*input != 255) + { + int len = minLen; while (1) { len += *input; if (*input++ != 254) break; } + + const unsigned char * reference = outputStart + value; + unsigned char * outputEnd = output + len; + + if (output - reference < 4) + { + int offset[4] = {0, 3, 2, 3}; + + *output++ = *reference++; + *output++ = *reference++; + *output++ = *reference++; + *output++ = *reference++; + + reference -= offset[output - reference]; + } + + while (output < outputEnd) { *(unsigned int *)output = *(unsigned int*)reference; output += 4; reference += 4; } + + output = outputEnd; context = output[-1] | (output[-2] << 8) | (output[-3] << 16) | (output[-4] << 24); + } + else + { + input++; context = (context << 8) | (*output++ = LZP_LZP_MATCH_FLAG); + } + } + else + { + context = (context << 8) | (*output++ = *input++); + } + } + + slab_free(NULL, lookup); + + return (int)(output - outputStart); + } + + return LZP_NOT_ENOUGH_MEMORY; +} + +static +ssize_t bsc_lzp_compress_serial(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen) +{ + if (bsc_lzp_num_blocks(n) == 1) + { + int result = 
bsc_lzp_encode_block(input, input + n, output + 1, output + n - 1, hashSize, minLen); + if (result >= LZP_NO_ERROR) result = (output[0] = 1, result + 1); + + return result; + } + + int nBlocks = bsc_lzp_num_blocks(n); + int chunkSize = n / nBlocks; + int blockId; + ssize_t outputPtr = 1 + 8 * nBlocks; + + output[0] = nBlocks; + for (blockId = 0; blockId < nBlocks; ++blockId) + { + int inputStart = blockId * chunkSize; + int inputSize = blockId != nBlocks - 1 ? chunkSize : n - inputStart; + int outputSize = inputSize; if (outputSize > n - outputPtr) outputSize = n - outputPtr; + + int result = bsc_lzp_encode_block(input + inputStart, input + inputStart + inputSize, output + outputPtr, output + outputPtr + outputSize, hashSize, minLen); + if (result < LZP_NO_ERROR) + { + if (outputPtr + inputSize >= n) return LZP_NOT_COMPRESSIBLE; + result = inputSize; memcpy(output + outputPtr, input + inputStart, inputSize); + } + + *(int *)(output + 1 + 8 * blockId + 0) = inputSize; + *(int *)(output + 1 + 8 * blockId + 4) = result; + + outputPtr += result; + } + + return outputPtr; +} + +#ifdef LZP_OPENMP + +static +int bsc_lzp_compress_parallel(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen) +{ + if (unsigned char * buffer = (unsigned char *)bsc_malloc(n * sizeof(unsigned char))) + { + int compressionResult[ALPHABET_SIZE]; + + int nBlocks = bsc_lzp_num_blocks(n); + int result = LZP_NO_ERROR; + int chunkSize = n / nBlocks; + + int numThreads = omp_get_max_threads(); + if (numThreads > nBlocks) numThreads = nBlocks; + + output[0] = nBlocks; + #pragma omp parallel num_threads(numThreads) if(numThreads > 1) + { + if (omp_get_num_threads() == 1) + { + result = bsc_lzp_compress_serial(input, output, n, hashSize, minLen); + } + else + { + #pragma omp for schedule(dynamic) + for (int blockId = 0; blockId < nBlocks; ++blockId) + { + int blockStart = blockId * chunkSize; + int blockSize = blockId != nBlocks - 1 ? 
chunkSize : n - blockStart; + + compressionResult[blockId] = bsc_lzp_encode_block(input + blockStart, input + blockStart + blockSize, buffer + blockStart, buffer + blockStart + blockSize, hashSize, minLen); + if (compressionResult[blockId] < LZP_NO_ERROR) compressionResult[blockId] = blockSize; + + *(int *)(output + 1 + 8 * blockId + 0) = blockSize; + *(int *)(output + 1 + 8 * blockId + 4) = compressionResult[blockId]; + } + + #pragma omp single + { + result = 1 + 8 * nBlocks; + for (int blockId = 0; blockId < nBlocks; ++blockId) + { + result += compressionResult[blockId]; + } + + if (result >= n) result = LZP_NOT_COMPRESSIBLE; + } + + if (result >= LZP_NO_ERROR) + { + #pragma omp for schedule(dynamic) + for (int blockId = 0; blockId < nBlocks; ++blockId) + { + int blockStart = blockId * chunkSize; + int blockSize = blockId != nBlocks - 1 ? chunkSize : n - blockStart; + + int outputPtr = 1 + 8 * nBlocks; + for (int p = 0; p < blockId; ++p) outputPtr += compressionResult[p]; + + if (compressionResult[blockId] != blockSize) + { + memcpy(output + outputPtr, buffer + blockStart, compressionResult[blockId]); + } + else + { + memcpy(output + outputPtr, input + blockStart, compressionResult[blockId]); + } + } + } + } + } + + bsc_free(buffer); + + return result; + } + return LZP_NOT_ENOUGH_MEMORY; +} + +#endif + +ssize_t lzp_compress(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen, int features) +{ + +#ifdef LZP_OPENMP + + if ((bsc_lzp_num_blocks(n) != 1) && (features & LZP_FEATURE_MULTITHREADING)) + { + return bsc_lzp_compress_parallel(input, output, n, hashSize, minLen); + } + +#endif + + return bsc_lzp_compress_serial(input, output, n, hashSize, minLen); +} + +ssize_t lzp_decompress(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen, int features) +{ + int nBlocks = input[0]; + + if (nBlocks == 1) + { + return bsc_lzp_decode_block(input + 1, input + n, output, hashSize, minLen); + } + + int 
decompressionResult[ALPHABET_SIZE]; + +#ifdef LZP_OPENMP + + if (features & LZP_FEATURE_MULTITHREADING) + { + #pragma omp parallel for schedule(dynamic) + for (int blockId = 0; blockId < nBlocks; ++blockId) + { + int inputPtr = 0; for (int p = 0; p < blockId; ++p) inputPtr += *(int *)(input + 1 + 8 * p + 4); + int outputPtr = 0; for (int p = 0; p < blockId; ++p) outputPtr += *(int *)(input + 1 + 8 * p + 0); + + inputPtr += 1 + 8 * nBlocks; + + int inputSize = *(int *)(input + 1 + 8 * blockId + 4); + int outputSize = *(int *)(input + 1 + 8 * blockId + 0); + + if (inputSize != outputSize) + { + decompressionResult[blockId] = bsc_lzp_decode_block(input + inputPtr, input + inputPtr + inputSize, output + outputPtr, hashSize, minLen); + } + else + { + decompressionResult[blockId] = inputSize; memcpy(output + outputPtr, input + inputPtr, inputSize); + } + } + } + else + +#endif + + { + int blockId, p; + + for (blockId = 0; blockId < nBlocks; ++blockId) + { + int inputPtr = 0; for (p = 0; p < blockId; ++p) inputPtr += *(int *)(input + 1 + 8 * p + 4); + int outputPtr = 0; for (p = 0; p < blockId; ++p) outputPtr += *(int *)(input + 1 + 8 * p + 0); + + inputPtr += 1 + 8 * nBlocks; + + int inputSize = *(int *)(input + 1 + 8 * blockId + 4); + int outputSize = *(int *)(input + 1 + 8 * blockId + 0); + + if (inputSize != outputSize) + { + decompressionResult[blockId] = bsc_lzp_decode_block(input + inputPtr, input + inputPtr + inputSize, output + outputPtr, hashSize, minLen); + } + else + { + decompressionResult[blockId] = inputSize; memcpy(output + outputPtr, input + inputPtr, inputSize); + } + } + } + + ssize_t dataSize = 0; + int result = LZP_NO_ERROR; + int blockId; + for (blockId = 0; blockId < nBlocks; ++blockId) + { + if (decompressionResult[blockId] < LZP_NO_ERROR) result = decompressionResult[blockId]; + dataSize += decompressionResult[blockId]; + } + + return (result == LZP_NO_ERROR) ? 
dataSize : result; +} + +int lzp_hash_size(int level) { + if (level > 7) { + return (LZP_DEFAULT_LZPHASHSIZE + 2); + } else if (level > 5) { + return (LZP_DEFAULT_LZPHASHSIZE + 3); + } else if (level > 3) { + return (LZP_DEFAULT_LZPHASHSIZE + 4); + } else { + return (LZP_DEFAULT_LZPHASHSIZE + 5); + } +} +/*-----------------------------------------------------------*/ +/* End lzp.cpp */ +/*-----------------------------------------------------------*/ diff --git a/lzp/lzp.h b/lzp/lzp.h new file mode 100644 index 0000000..29a4b42 --- /dev/null +++ b/lzp/lzp.h @@ -0,0 +1,89 @@ +/*-----------------------------------------------------------*/ +/* Block Sorting, Lossless Data Compression Library. */ +/* Interface to Lempel Ziv Prediction functions */ +/*-----------------------------------------------------------*/ + +/*-- + +This file is a part of bsc and/or libbsc, a program and a library for +lossless, block-sorting data compression. + +Copyright (c) 2009-2012 Ilya Grebnov +Copyright (c) 2012 Moinak Ghosh + +See file AUTHORS for a full list of contributors. + +The bsc and libbsc is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The bsc and libbsc is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the bsc and libbsc. If not, see http://www.gnu.org/licenses/. + +Please see the files COPYING and COPYING.LIB for full copyright information. + +See also the bsc and libbsc web site: + http://libbsc.com/ for more information. 
+ +--*/ + +#ifndef _LZP_H +#define _LZP_H + +#define LZP_NO_ERROR 0 +#define LZP_BAD_PARAMETER -1 +#define LZP_NOT_ENOUGH_MEMORY -2 +#define LZP_NOT_COMPRESSIBLE -3 +#define LZP_NOT_SUPPORTED -4 +#define LZP_UNEXPECTED_EOB -5 +#define LZP_DATA_CORRUPT -6 + +#define LZP_DEFAULT_LZPHASHSIZE 16 +#define LZP_DEFAULT_LZPMINLEN 128 +#define LZP_MAX_BLOCK (2147483648LL) +#define ALPHABET_SIZE (256) + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * Preprocess a memory block by LZP algorithm. + * @param input - the input memory block of n bytes. + * @param output - the output memory block of n bytes. + * @param n - the length of the input/output memory blocks. + * @param hashSize - the hash table size. + * @param minLen - the minimum match length. + * @param features - the set of additional features. + * @return The length of preprocessed memory block if no error occurred, error code otherwise. + */ + ssize_t lzp_compress(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen, int features); + + /** + * Reconstructs the original memory block after LZP algorithm. + * @param input - the input memory block of n bytes. + * @param output - the output memory block. + * @param n - the length of the input memory block. + * @param hashSize - the hash table size. + * @param minLen - the minimum match length. + * @param features - the set of additional features. + * @return The length of original memory block if no error occurred, error code otherwise. 
+ */ + ssize_t lzp_decompress(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen, int features); + + int lzp_hash_size(int level); +#ifdef __cplusplus +} +#endif + +#endif + +/*-----------------------------------------------------------*/ +/* End lzp.h */ +/*-----------------------------------------------------------*/ diff --git a/main.c b/main.c index 3c2213d..641bde0 100644 --- a/main.c +++ b/main.c @@ -45,6 +45,7 @@ #include #include #include +#include /* * We use 5MB chunks by default. @@ -78,6 +79,7 @@ static int hide_cmp_stats = 1; static int enable_rabin_scan = 0; static int enable_delta_encode = 0; static int enable_rabin_split = 1; +static int lzp_preprocess = 0; static unsigned int chunk_num; static uint64_t largest_chunk, smallest_chunk, avg_chunk; static const char *exec_name; @@ -128,8 +130,11 @@ usage(void) "5) Perform Delta Encoding in addition to Exact Dedup:\n" " %s -E ... - This also implies '-D'.\n" "6) Number of threads can optionally be specified: -t <1 - 256 count>\n" - "7) Pass '-M' to display memory allocator statistics\n" - "8) Pass '-C' to display compression statistics\n\n", + "7) Other flags:\n" + " '-L' - Enable LZP pre-compression. This improves compression ratio of all\n" + " algorithms with some extra CPU and very low RAM overhead.\n" + " '-M' - Display memory allocator statistics\n" + " '-C' - Display compression statistics\n\n", UTILITY_VERSION, exec_name, exec_name, exec_name, exec_name, exec_name, exec_name); } @@ -148,6 +153,92 @@ show_compression_stats(uint64_t chunksize) bytes_to_size(avg_chunk), (double)avg_chunk/(double)chunksize*100); } +/* + * Wrapper functions to pre-process the buffer and then call the main compression routine. + * At present only LZP pre-compression is used below. Some extra metadata is added: + * + * Byte 0: A flag to indicate which pre-processor was used. 
+ * Byte 1 - Byte 8: Size of buffer after pre-processing (present only when Byte 0
+ *                  has PREPROC_COMPRESSED set; otherwise data starts at Byte 1)
+ *
+ * A buffer may be pre-processed only, if the final algorithm fails to shrink it.
+ * Compression without pre-processing is not allowed; preproc_decompress() reverses both.
+ */
+int
+preproc_compress(compress_func_ptr cmp_func, void *src, size_t srclen, void *dst,
+	size_t *dstlen, int level, uchar_t chdr, void *data)
+{
+	uchar_t *dest = (uchar_t *)dst, type = 0;
+	ssize_t result, _dstlen;
+
+	if (lzp_preprocess) {
+		int hashsize;
+
+		type = PREPROC_TYPE_LZP;
+		hashsize = lzp_hash_size(level);
+		result = lzp_compress(src, dst, srclen, hashsize, LZP_DEFAULT_LZPMINLEN, 0);
+		if (result < 0 || result == srclen) return (-1);
+		srclen = result;
+		memcpy(src, dst, srclen);
+	} else {
+		/*
+		 * Execution won't come here but just in case ...
+		 */
+		fprintf(stderr, "Invalid preprocessing mode\n");
+		return (-1);
+	}
+
+	*dest = type;
+	*((int64_t *)(dest + 1)) = htonll(srclen);
+	_dstlen = srclen;
+	result = cmp_func(src, srclen, dest+9, &_dstlen, level, chdr, data);
+	if (result == 0 && _dstlen < srclen) {
+		*dest |= PREPROC_COMPRESSED;
+		*dstlen = _dstlen + 9;
+	} else {
+		memcpy(dest+1, src, srclen);
+		_dstlen = srclen;
+		*dstlen = _dstlen + 1;
+	}
+	return (0);
+}
+
+int
+preproc_decompress(compress_func_ptr dec_func, void *src, size_t srclen, void *dst,
+	size_t *dstlen, int level, uchar_t chdr, void *data)
+{
+	uchar_t *sorc = (uchar_t *)src, type;
+	ssize_t result;
+
+	type = *sorc++;
+	srclen--;
+	if (type & PREPROC_COMPRESSED) {
+		*dstlen = ntohll(*((int64_t *)(sorc)));
+		sorc += 8;
+		srclen -= 8;
+		result = dec_func(sorc, srclen, dst, dstlen, level, chdr, data);
+		if (result < 0) return (result);
+		memcpy(src, dst, *dstlen);
+		sorc = (uchar_t *)src;
+		srclen = *dstlen;
+	}
+
+	/* sorc: start of the pre-processed data (past Byte 0, or src after the copy above). */
+	if (type & PREPROC_TYPE_LZP) {
+		int hashsize = lzp_hash_size(level);
+		result = lzp_decompress(sorc, dst, srclen, hashsize, LZP_DEFAULT_LZPMINLEN, 0);
+		if (result < 0) {
+			fprintf(stderr, "LZP decompression failed.\n");
+			return (-1);
+		}
+		*dstlen = result;
+	} else {
+		fprintf(stderr, "Invalid preprocessing flags: %d\n", type);
+		return (-1);
+	}
+	return (0);
+}
+
 /*
  * This routine is called in multiple threads. Calls the decompression handler
  * as encoded in the file header. For adaptive mode the handler adapt_decompress()
@@ -214,8 +305,13 @@ redo:
 		cmpbuf = cseg + RABIN_HDR_SIZE + rabin_index_sz_cmp;
 		ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + rabin_index_sz;
 		if (HDR & COMPRESSED) {
-			rv = tdat->decompress(cmpbuf, rabin_data_sz_cmp, ubuf, &_chunksize,
-			    tdat->level, HDR, tdat->data);
+			if (HDR & CHUNK_FLAG_PREPROC) {
+				rv = preproc_decompress(tdat->decompress, cmpbuf, rabin_data_sz_cmp,
+				    ubuf, &_chunksize, tdat->level, HDR, tdat->data);
+			} else {
+				rv = tdat->decompress(cmpbuf, rabin_data_sz_cmp, ubuf, &_chunksize,
+				    tdat->level, HDR, tdat->data);
+			}
 			if (rv == -1) {
 				tdat->len_cmp = 0;
 				fprintf(stderr, "ERROR: Chunk %d, decompression failed.\n", tdat->id);
@@ -237,8 +333,13 @@ redo:
 		}
 	} else {
 		if (HDR & COMPRESSED) {
-			rv = tdat->decompress(cseg, tdat->len_cmp, tdat->uncompressed_chunk,
-			    &_chunksize, tdat->level, HDR, tdat->data);
+			if (HDR & CHUNK_FLAG_PREPROC) {
+				rv = preproc_decompress(tdat->decompress, cseg, tdat->len_cmp,
+				    tdat->uncompressed_chunk, &_chunksize, tdat->level, HDR, tdat->data);
+			} else {
+				rv = tdat->decompress(cseg, tdat->len_cmp, tdat->uncompressed_chunk,
+				    &_chunksize, tdat->level, HDR, tdat->data);
+			}
 		} else {
 			memcpy(tdat->uncompressed_chunk, cseg, _chunksize);
 		}
@@ -317,7 +418,7 @@ cont:
  * | `---------------- 2 - Lzma (Adaptive Mode)
  * | 3 - PPMD (Adaptive Mode)
  * |
- * `---------------------- 1 - Last Chunk flag
+ * `---------------------- 1 - Chunk size flag (if original chunk is of variable length)
  *
  * A file trailer to indicate end.
  * Zero Compressed length: 8 zero bytes.
@@ -459,7 +560,7 @@ start_decompress(const char *filename, const char *to_filename) } } if (enable_rabin_scan) { - tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, + tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, 0, algo, enable_delta_encode); if (tdat->rctx == NULL) { UNCOMP_BAIL; @@ -685,7 +786,7 @@ redo: /* Compress index if it is at least 90 bytes. */ rv = lzma_compress(tdat->uncompressed_chunk + RABIN_HDR_SIZE, rabin_index_sz, compressed_chunk + RABIN_HDR_SIZE, - &index_size_cmp, tdat->rctx->level, 0, tdat->rctx->lzma_data); + &index_size_cmp, tdat->rctx->level, 255, tdat->rctx->lzma_data); } else { memcpy(compressed_chunk + RABIN_HDR_SIZE, tdat->uncompressed_chunk + RABIN_HDR_SIZE, rabin_index_sz); @@ -696,9 +797,16 @@ redo: if (rv == 0) { memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE); /* Compress data chunk. */ - rv = tdat->compress(tdat->uncompressed_chunk + rabin_index_sz, - _chunksize, compressed_chunk + index_size_cmp, &_chunksize, - tdat->level, 0, tdat->data); + if (lzp_preprocess) { + rv = preproc_compress(tdat->compress, + tdat->uncompressed_chunk + rabin_index_sz, + _chunksize, compressed_chunk + index_size_cmp, &_chunksize, + tdat->level, 0, tdat->data); + } else { + rv = tdat->compress(tdat->uncompressed_chunk + rabin_index_sz, + _chunksize, compressed_chunk + index_size_cmp, &_chunksize, + tdat->level, 0, tdat->data); + } /* Can't compress data just retain as-is. 
*/ if (rv < 0) @@ -720,8 +828,14 @@ redo: } else { plain_compress: _chunksize = tdat->rbytes; - rv = tdat->compress(tdat->uncompressed_chunk, tdat->rbytes, - compressed_chunk, &_chunksize, tdat->level, 0, tdat->data); + if (lzp_preprocess) { + rv = preproc_compress(tdat->compress, + tdat->uncompressed_chunk, tdat->rbytes, + compressed_chunk, &_chunksize, tdat->level, 0, tdat->data); + } else { + rv = tdat->compress(tdat->uncompressed_chunk, tdat->rbytes, + compressed_chunk, &_chunksize, tdat->level, 0, tdat->data); + } } /* * Sanity check to ensure compressed data is lesser than original. @@ -742,6 +856,9 @@ plain_compress: if (enable_rabin_scan && tdat->rctx->valid) { type |= CHUNK_FLAG_DEDUP; } + if (lzp_preprocess) { + type |= CHUNK_FLAG_PREPROC; + } /* * Insert compressed chunk length and CRC64 checksum into * chunk header. @@ -871,8 +988,8 @@ start_compress(const char *filename, uint64_t chunksize, int level) if (enable_rabin_scan) { flags |= FLAG_DEDUP; /* Additional scratch space for dedup arrays. */ - compressed_chunksize += (rabin_buf_extra(chunksize) - - (compressed_chunksize - chunksize)); + compressed_chunksize += (rabin_buf_extra(chunksize, 0, algo, + enable_delta_encode) - (compressed_chunksize - chunksize)); } err = 0; @@ -992,7 +1109,7 @@ start_compress(const char *filename, uint64_t chunksize, int level) } } if (enable_rabin_scan) { - tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, + tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, 0, algo, enable_delta_encode); if (tdat->rctx == NULL) { COMP_BAIL; @@ -1057,7 +1174,7 @@ start_compress(const char *filename, uint64_t chunksize, int level) * Read the first chunk into a spare buffer (a simple double-buffering). 
*/ if (enable_rabin_split) { - rctx = create_rabin_context(chunksize, 0, algo, enable_delta_encode); + rctx = create_rabin_context(chunksize, 0, 0, algo, enable_delta_encode); rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx); } else { rbytes = Read(uncompfd, cread_buf, chunksize); @@ -1371,7 +1488,7 @@ main(int argc, char *argv[]) level = 6; slab_init(); - while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDEr")) != -1) { + while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDErL")) != -1) { int ovr; switch (opt) { @@ -1432,6 +1549,10 @@ main(int argc, char *argv[]) enable_delta_encode = 1; break; + case 'L': + lzp_preprocess = 1; + break; + case 'r': enable_rabin_split = 0; break; diff --git a/pcompress.h b/pcompress.h index 2842dbf..ef16d35 100644 --- a/pcompress.h +++ b/pcompress.h @@ -47,8 +47,12 @@ extern "C" { #define BZIP2_A_NUM 16 #define LZMA_A_NUM 32 #define CHUNK_FLAG_DEDUP 2 +#define CHUNK_FLAG_PREPROC 4 #define COMP_EXTN ".pz" +#define PREPROC_TYPE_LZP 1 +#define PREPROC_COMPRESSED 128 + /* * lower 3 bits in higher nibble indicate compression algorithm. 
*/ diff --git a/rabin/rabin_polynomial.c b/rabin/rabin_polynomial.c index eea9a9a..8fb6d37 100755 --- a/rabin/rabin_polynomial.c +++ b/rabin/rabin_polynomial.c @@ -94,21 +94,47 @@ static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; uint64_t ir[256]; static int inited = 0; -uint32_t -rabin_buf_extra(uint64_t chunksize) +static uint32_t +rabin_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag) { - return ((chunksize / RAB_POLYNOMIAL_MIN_BLOCK_SIZE2) * sizeof (uint32_t)); + uint32_t min_blk; + + min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS); + if (((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) && + chunksize <= LZMA_WINDOW_MAX) || delta_flag) { + if (memcmp(algo, "lzfx", 4) == 0 || memcmp(algo, "lz4", 3) == 0 || + memcmp(algo, "zlib", 4) == 0 || memcmp(algo, "none", 4) == 0) { + min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS - 1); + } + } else { + min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS - 1); + } + return (min_blk); +} + +uint32_t +rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag) +{ + if (rab_blk_sz < 1 || rab_blk_sz > 5) + rab_blk_sz = RAB_BLK_DEFAULT; + + return ((chunksize / rabin_min_blksz(chunksize, rab_blk_sz, algo, delta_flag)) + * sizeof (uint32_t)); } /* * Initialize the algorithm with the default params. */ rabin_context_t * -create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *algo, int delta_flag) { +create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz, + const char *algo, int delta_flag) { rabin_context_t *ctx; unsigned char *current_window_data; uint32_t i; + if (rab_blk_sz < 1 || rab_blk_sz > 5) + rab_blk_sz = RAB_BLK_DEFAULT; + /* * Pre-compute a table of irreducible polynomial evaluations for each * possible byte value. 
@@ -157,28 +183,12 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al ctx->rabin_break_patt = 0; ctx->delta_flag = delta_flag; - if (((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) && - chunksize <= LZMA_WINDOW_MAX) || delta_flag) { - if (memcmp(algo, "lzfx", 4) == 0 || memcmp(algo, "lz4", 3) == 0 || - memcmp(algo, "zlib", 4) == 0 || memcmp(algo, "none", 4) == 0) { - ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE2; - ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK2; - ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE2; - if (delta_flag) - ctx->delta_flag = DELTA_LESS_FUZZY; - } else { - ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE; - ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK; - ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE; - } - } else { - ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE2; - ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK2; - ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE2; - } - + ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS); + ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1; + ctx->rabin_poly_min_block_size = rabin_min_blksz(chunksize, rab_blk_sz, algo, delta_flag); ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size; ctx->blknum = chunksize / ctx->rabin_poly_min_block_size; + if (chunksize % ctx->rabin_poly_min_block_size) ctx->blknum++; diff --git a/rabin/rabin_polynomial.h b/rabin/rabin_polynomial.h index 2873522..6e7f019 100644 --- a/rabin/rabin_polynomial.h +++ b/rabin/rabin_polynomial.h @@ -63,26 +63,14 @@ //List of constants, mostly constraints and defaults for various parameters //to the Rabin Fingerprinting algorithm - #define RAB_POLYNOMIAL_CONST 2 -// 1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT = Average Rabin Chunk Size -// So we are always looking at power of 2 chunk sizes to avoid doing a modulus -// 
-#define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT 12 -#define RAB_POLYNOMIAL_AVG_BLOCK_SIZE (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT) -#define RAB_POLYNOMIAL_AVG_BLOCK_MASK (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1) -#define RAB_POLYNOMIAL_MIN_BLOCK_SIZE RAB_POLYNOMIAL_AVG_BLOCK_SIZE -#define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024) - -#define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT2 12 -#define RAB_POLYNOMIAL_AVG_BLOCK_SIZE2 (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT) -#define RAB_POLYNOMIAL_AVG_BLOCK_MASK2 (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1) -#define RAB_POLYNOMIAL_MIN_BLOCK_SIZE2 2048 - +#define RAB_BLK_DEFAULT 1 +#define RAB_BLK_MIN_BITS 11 #define LZMA_WINDOW_MAX (128L * 1024L * 1024L) #define RAB_POLYNOMIAL_WIN_SIZE 16 #define RAB_POLYNOMIAL_MIN_WIN_SIZE 8 #define RAB_POLYNOMIAL_MAX_WIN_SIZE 64 +#define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024) // Minimum practical chunk size when doing dedup #define RAB_MIN_CHUNK_SIZE (1048576L) @@ -166,8 +154,8 @@ typedef struct { int level, delta_flag; } rabin_context_t; -extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, - const char *algo, int delta_flag); +extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, + int rab_blk_sz, const char *algo, int delta_flag); extern void destroy_rabin_context(rabin_context_t *ctx); extern unsigned int rabin_dedup(rabin_context_t *ctx, unsigned char *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos); @@ -178,6 +166,7 @@ extern void rabin_parse_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_i extern void rabin_update_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp, ssize_t rabin_data_sz_cmp); extern void reset_rabin_context(rabin_context_t *ctx); -extern uint32_t rabin_buf_extra(uint64_t chunksize); +extern uint32_t rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, + int delta_flag); -#endif /* _RABIN_0POLY_H_ */ \ No newline at end of file +#endif /* _RABIN_POLY_H_ */