Add LZP Pre-Compression support ported from libbsc.
Add generic pre-processing wrappers for future support of other pre-processors. Clean up computation of Rabin block sizes. Compute Rabin scratch space accurately to avoid RAM wastage.
This commit is contained in:
parent
3851c9c6cc
commit
bf149e880d
8 changed files with 750 additions and 73 deletions
24
Makefile
24
Makefile
|
@ -58,14 +58,21 @@ CRCSRCS = lzma/crc64_fast.c lzma/crc64_table.c
|
|||
CRCHDRS = lzma/crc64_table_le.h lzma/crc64_table_be.h lzma/crc_macros.h
|
||||
CRCOBJS = $(CRCSRCS:.c=.o)
|
||||
|
||||
BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~
|
||||
LZPSRCS = lzp/lzp.c
|
||||
LZPHDRS = lzp/lzp.h
|
||||
LZPOBJS = $(LZPSRCS:.c=.o)
|
||||
|
||||
BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~
|
||||
|
||||
RM = rm -f
|
||||
CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_PROPS \
|
||||
-DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32
|
||||
-DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \
|
||||
-I./lzp
|
||||
VEC_FLAGS = -ftree-vectorize
|
||||
LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
|
||||
LDLIBS = -ldl -lbz2 $(ZLIB_DIR) -lz -lm
|
||||
OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
|
||||
$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS)
|
||||
|
||||
ifdef DEBUG
|
||||
LINK = g++ -m64 -pthread -msse3
|
||||
|
@ -115,16 +122,15 @@ $(LZFXOBJS): $(LZFXSRCS) $(LZFXHDRS)
|
|||
$(LZ4OBJS): $(LZ4SRCS) $(LZ4HDRS)
|
||||
$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
||||
|
||||
$(LZPOBJS): $(LZPSRCS) $(LZPHDRS)
|
||||
$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
||||
|
||||
$(MAINOBJS): $(MAINSRCS) $(MAINHDRS)
|
||||
$(COMPILE) $(GEN_OPT) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
||||
|
||||
$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) \
|
||||
$(CRCOBJS) $(RABINOBJS) $(BSDIFFOBJS)
|
||||
$(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) \
|
||||
$(LZFXOBJS) $(LZ4OBJS) $(RABINOBJS) $(BSDIFFOBJS) \
|
||||
$(LDLIBS)
|
||||
$(PROG): $(OBJS)
|
||||
$(LINK) -o $@ $(OBJS) $(LDLIBS)
|
||||
|
||||
clean:
|
||||
$(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(LZFXOBJS) $(LZ4OBJS) \
|
||||
$(RABINOBJS) $(BSDIFFOBJS) $(BAKFILES)
|
||||
$(RM) $(PROG) $(OBJS) $(BAKFILES)
|
||||
|
||||
|
|
|
@ -70,8 +70,12 @@ Usage
|
|||
pcompress -E ... - This also implies '-D'.
|
||||
|
||||
Number of threads can optionally be specified: -t <1 - 256 count>
|
||||
Pass '-M' to display memory allocator statistics
|
||||
Pass '-C' to display compression statistics
|
||||
Other flags:
|
||||
'-L' - Enable LZP pre-compression. This improves compression ratio of all
|
||||
algorithms with some extra CPU and very low RAM overhead. Using
|
||||
delta encoding in conjunction with this may not always be beneficial.
|
||||
'-M' - Display memory allocator statistics
|
||||
'-C' - Display compression statistics
|
||||
|
||||
Environment Variables
|
||||
=====================
|
||||
|
|
454
lzp/lzp.c
Normal file
454
lzp/lzp.c
Normal file
|
@ -0,0 +1,454 @@
|
|||
/*-----------------------------------------------------------*/
|
||||
/* Block Sorting, Lossless Data Compression Library. */
|
||||
/* Lempel Ziv Prediction */
|
||||
/*-----------------------------------------------------------*/
|
||||
|
||||
/*--
|
||||
|
||||
This file is a part of bsc and/or libbsc, a program and a library for
|
||||
lossless, block-sorting data compression.
|
||||
|
||||
Copyright (c) 2009-2012 Ilya Grebnov <ilya.grebnov@gmail.com>
|
||||
Copyright (c) 2012 Moinak Ghosh <moinakg@gmail.com>
|
||||
|
||||
See file AUTHORS for a full list of contributors.
|
||||
|
||||
The bsc and libbsc is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as published by
|
||||
the Free Software Foundation; either version 3 of the License, or (at your
|
||||
option) any later version.
|
||||
|
||||
The bsc and libbsc is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with the bsc and libbsc. If not, see http://www.gnu.org/licenses/.
|
||||
|
||||
Please see the files COPYING and COPYING.LIB for full copyright information.
|
||||
|
||||
See also the bsc and libbsc web site:
|
||||
http://libbsc.com/ for more information.
|
||||
|
||||
--*/
|
||||
|
||||
/*
|
||||
* TODO: Port the parallel implementation.
|
||||
*/
|
||||
#undef LZP_OPENMP
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <memory.h>
|
||||
#include <string.h>
|
||||
#include <allocator.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "lzp.h"
|
||||
|
||||
#define LZP_LZP_MATCH_FLAG 0xf2
|
||||
|
||||
static
|
||||
inline int bsc_lzp_num_blocks(ssize_t n)
|
||||
{
|
||||
if (n < 256 * 1024) return 1;
|
||||
if (n < 4 * 1024 * 1024) return 2;
|
||||
if (n < 16 * 1024 * 1024) return 4;
|
||||
if (n < LZP_MAX_BLOCK) return 8;
|
||||
|
||||
return (n / LZP_MAX_BLOCK);
|
||||
}
|
||||
|
||||
static
|
||||
int bsc_lzp_encode_block(const unsigned char * input, const unsigned char * inputEnd, unsigned char * output, unsigned char * outputEnd, int hashSize, int minLen)
|
||||
{
|
||||
int *lookup, i;
|
||||
if (inputEnd - input < 16)
|
||||
{
|
||||
return LZP_NOT_COMPRESSIBLE;
|
||||
}
|
||||
|
||||
if (lookup = (int *)slab_calloc(NULL, (int)(1 << hashSize), sizeof(int)))
|
||||
{
|
||||
unsigned int mask = (int)(1 << hashSize) - 1;
|
||||
const unsigned char * inputStart = input;
|
||||
const unsigned char * outputStart = output;
|
||||
const unsigned char * outputEOB = outputEnd - 4;
|
||||
|
||||
unsigned int context = 0;
|
||||
for (i = 0; i < 4; ++i)
|
||||
{
|
||||
context = (context << 8) | (*output++ = *input++);
|
||||
}
|
||||
|
||||
const unsigned char * heuristic = input;
|
||||
const unsigned char * inputMinLenEnd = inputEnd - minLen - 8;
|
||||
while ((input < inputMinLenEnd) && (output < outputEOB))
|
||||
{
|
||||
unsigned int index = ((context >> 15) ^ context ^ (context >> 3)) & mask;
|
||||
int value = lookup[index]; lookup[index] = (int)(input - inputStart);
|
||||
if (value > 0)
|
||||
{
|
||||
const unsigned char * reference = inputStart + value;
|
||||
if ((*(unsigned int *)(input + minLen - 4) == *(unsigned int *)(reference + minLen - 4)) && (*(unsigned int *)(input) == *(unsigned int *)(reference)))
|
||||
{
|
||||
if ((heuristic > input) && (*(unsigned int *)heuristic != *(unsigned int *)(reference + (heuristic - input))))
|
||||
{
|
||||
goto LZP_LZP_MATCH_NOT_FOUND;
|
||||
}
|
||||
|
||||
int len = 4;
|
||||
for (; input + len < inputMinLenEnd; len += 4)
|
||||
{
|
||||
if (*(unsigned int *)(input + len) != *(unsigned int *)(reference + len)) break;
|
||||
}
|
||||
if (len < minLen)
|
||||
{
|
||||
if (heuristic < input + len) heuristic = input + len;
|
||||
goto LZP_LZP_MATCH_NOT_FOUND;
|
||||
}
|
||||
|
||||
if (input[len] == reference[len]) len++;
|
||||
if (input[len] == reference[len]) len++;
|
||||
if (input[len] == reference[len]) len++;
|
||||
|
||||
input += len; context = input[-1] | (input[-2] << 8) | (input[-3] << 16) | (input[-4] << 24);
|
||||
|
||||
*output++ = LZP_LZP_MATCH_FLAG;
|
||||
|
||||
len -= minLen; while (len >= 254) { len -= 254; *output++ = 254; if (output >= outputEOB) break; }
|
||||
|
||||
*output++ = (unsigned char)(len);
|
||||
}
|
||||
else
|
||||
{
|
||||
unsigned char next;
|
||||
LZP_LZP_MATCH_NOT_FOUND:
|
||||
next = *output++ = *input++; context = (context << 8) | next;
|
||||
if (next == LZP_LZP_MATCH_FLAG) *output++ = 255;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
context = (context << 8) | (*output++ = *input++);
|
||||
}
|
||||
}
|
||||
|
||||
while ((input < inputEnd) && (output < outputEOB))
|
||||
{
|
||||
unsigned int index = ((context >> 15) ^ context ^ (context >> 3)) & mask;
|
||||
int value = lookup[index]; lookup[index] = (int)(input - inputStart);
|
||||
if (value > 0)
|
||||
{
|
||||
unsigned char next = *output++ = *input++; context = (context << 8) | next;
|
||||
if (next == LZP_LZP_MATCH_FLAG) *output++ = 255;
|
||||
}
|
||||
else
|
||||
{
|
||||
context = (context << 8) | (*output++ = *input++);
|
||||
}
|
||||
}
|
||||
|
||||
slab_free(NULL, lookup);
|
||||
|
||||
return (output >= outputEOB) ? LZP_NOT_COMPRESSIBLE : (int)(output - outputStart);
|
||||
}
|
||||
|
||||
return LZP_NOT_ENOUGH_MEMORY;
|
||||
}
|
||||
|
||||
static
|
||||
int bsc_lzp_decode_block(const unsigned char * input, const unsigned char * inputEnd, unsigned char * output, int hashSize, int minLen)
|
||||
{
|
||||
int *lookup, i;
|
||||
if (inputEnd - input < 4)
|
||||
{
|
||||
return LZP_UNEXPECTED_EOB;
|
||||
}
|
||||
|
||||
if (lookup = (int *)slab_calloc(NULL, (int)(1 << hashSize), sizeof(int)))
|
||||
{
|
||||
unsigned int mask = (int)(1 << hashSize) - 1;
|
||||
const unsigned char * outputStart = output;
|
||||
|
||||
unsigned int context = 0;
|
||||
for (i = 0; i < 4; ++i)
|
||||
{
|
||||
context = (context << 8) | (*output++ = *input++);
|
||||
}
|
||||
|
||||
while (input < inputEnd)
|
||||
{
|
||||
unsigned int index = ((context >> 15) ^ context ^ (context >> 3)) & mask;
|
||||
int value = lookup[index]; lookup[index] = (int)(output - outputStart);
|
||||
if (*input == LZP_LZP_MATCH_FLAG && value > 0)
|
||||
{
|
||||
input++;
|
||||
if (*input != 255)
|
||||
{
|
||||
int len = minLen; while (1) { len += *input; if (*input++ != 254) break; }
|
||||
|
||||
const unsigned char * reference = outputStart + value;
|
||||
unsigned char * outputEnd = output + len;
|
||||
|
||||
if (output - reference < 4)
|
||||
{
|
||||
int offset[4] = {0, 3, 2, 3};
|
||||
|
||||
*output++ = *reference++;
|
||||
*output++ = *reference++;
|
||||
*output++ = *reference++;
|
||||
*output++ = *reference++;
|
||||
|
||||
reference -= offset[output - reference];
|
||||
}
|
||||
|
||||
while (output < outputEnd) { *(unsigned int *)output = *(unsigned int*)reference; output += 4; reference += 4; }
|
||||
|
||||
output = outputEnd; context = output[-1] | (output[-2] << 8) | (output[-3] << 16) | (output[-4] << 24);
|
||||
}
|
||||
else
|
||||
{
|
||||
input++; context = (context << 8) | (*output++ = LZP_LZP_MATCH_FLAG);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
context = (context << 8) | (*output++ = *input++);
|
||||
}
|
||||
}
|
||||
|
||||
slab_free(NULL, lookup);
|
||||
|
||||
return (int)(output - outputStart);
|
||||
}
|
||||
|
||||
return LZP_NOT_ENOUGH_MEMORY;
|
||||
}
|
||||
|
||||
static
|
||||
ssize_t bsc_lzp_compress_serial(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen)
|
||||
{
|
||||
if (bsc_lzp_num_blocks(n) == 1)
|
||||
{
|
||||
int result = bsc_lzp_encode_block(input, input + n, output + 1, output + n - 1, hashSize, minLen);
|
||||
if (result >= LZP_NO_ERROR) result = (output[0] = 1, result + 1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
int nBlocks = bsc_lzp_num_blocks(n);
|
||||
int chunkSize = n / nBlocks;
|
||||
int blockId;
|
||||
ssize_t outputPtr = 1 + 8 * nBlocks;
|
||||
|
||||
output[0] = nBlocks;
|
||||
for (blockId = 0; blockId < nBlocks; ++blockId)
|
||||
{
|
||||
int inputStart = blockId * chunkSize;
|
||||
int inputSize = blockId != nBlocks - 1 ? chunkSize : n - inputStart;
|
||||
int outputSize = inputSize; if (outputSize > n - outputPtr) outputSize = n - outputPtr;
|
||||
|
||||
int result = bsc_lzp_encode_block(input + inputStart, input + inputStart + inputSize, output + outputPtr, output + outputPtr + outputSize, hashSize, minLen);
|
||||
if (result < LZP_NO_ERROR)
|
||||
{
|
||||
if (outputPtr + inputSize >= n) return LZP_NOT_COMPRESSIBLE;
|
||||
result = inputSize; memcpy(output + outputPtr, input + inputStart, inputSize);
|
||||
}
|
||||
|
||||
*(int *)(output + 1 + 8 * blockId + 0) = inputSize;
|
||||
*(int *)(output + 1 + 8 * blockId + 4) = result;
|
||||
|
||||
outputPtr += result;
|
||||
}
|
||||
|
||||
return outputPtr;
|
||||
}
|
||||
|
||||
#ifdef LZP_OPENMP

#include <omp.h>

/*
 * OpenMP variant of bsc_lzp_compress_serial(); only built when LZP_OPENMP is
 * defined.
 *
 * Every block is encoded into a scratch buffer of n bytes at the same offset
 * it has in the input, the 8-byte size records are filled in, and once all
 * sizes are known the payloads are packed into output after the header.
 * Blocks that fail to encode are recorded with stored size == original size
 * and copied from input instead. If the parallel region ends up with a
 * single thread the serial routine is used.
 *
 * Returns the total number of bytes written, LZP_NOT_COMPRESSIBLE if that
 * total is not smaller than n, or LZP_NOT_ENOUGH_MEMORY if the scratch
 * buffer cannot be allocated.
 */
static
int bsc_lzp_compress_parallel(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen)
{
    int compressionResult[ALPHABET_SIZE];
    unsigned char *buffer;
    int nBlocks;
    int chunkSize;
    int numThreads;
    int result = LZP_NO_ERROR;

    buffer = (unsigned char *)malloc(n * sizeof(unsigned char));
    if (buffer == NULL)
    {
        return LZP_NOT_ENOUGH_MEMORY;
    }

    nBlocks = bsc_lzp_num_blocks(n);
    chunkSize = n / nBlocks;

    numThreads = omp_get_max_threads();
    if (numThreads > nBlocks) numThreads = nBlocks;

    output[0] = nBlocks;
    #pragma omp parallel num_threads(numThreads) if(numThreads > 1)
    {
        /* Declared inside the region, so each thread has its own copy. */
        int blockId, p;

        if (omp_get_num_threads() == 1)
        {
            result = bsc_lzp_compress_serial(input, output, n, hashSize, minLen);
        }
        else
        {
            #pragma omp for schedule(dynamic)
            for (blockId = 0; blockId < nBlocks; ++blockId)
            {
                int blockStart = blockId * chunkSize;
                int blockSize = blockId != nBlocks - 1 ? chunkSize : n - blockStart;

                compressionResult[blockId] = bsc_lzp_encode_block(input + blockStart, input + blockStart + blockSize, buffer + blockStart, buffer + blockStart + blockSize, hashSize, minLen);
                if (compressionResult[blockId] < LZP_NO_ERROR) compressionResult[blockId] = blockSize;

                /* The records start at output + 1, so they are written bytewise. */
                memcpy(output + 1 + 8 * blockId + 0, &blockSize, sizeof(int));
                memcpy(output + 1 + 8 * blockId + 4, &compressionResult[blockId], sizeof(int));
            }

            #pragma omp single
            {
                int total = 1 + 8 * nBlocks;

                for (blockId = 0; blockId < nBlocks; ++blockId)
                {
                    total += compressionResult[blockId];
                }

                result = (total >= n) ? LZP_NOT_COMPRESSIBLE : total;
            }

            if (result >= LZP_NO_ERROR)
            {
                #pragma omp for schedule(dynamic)
                for (blockId = 0; blockId < nBlocks; ++blockId)
                {
                    int blockStart = blockId * chunkSize;
                    int blockSize = blockId != nBlocks - 1 ? chunkSize : n - blockStart;
                    int outputPtr = 1 + 8 * nBlocks;

                    for (p = 0; p < blockId; ++p) outputPtr += compressionResult[p];

                    if (compressionResult[blockId] != blockSize)
                    {
                        memcpy(output + outputPtr, buffer + blockStart, compressionResult[blockId]);
                    }
                    else
                    {
                        memcpy(output + outputPtr, input + blockStart, compressionResult[blockId]);
                    }
                }
            }
        }
    }

    free(buffer);

    return result;
}

#endif
|
||||
|
||||
/*
 * Public entry point: LZP-preprocess n bytes of input into output.
 *
 * When built with LZP_OPENMP, the parallel encoder is used if the buffer
 * splits into more than one block and LZP_FEATURE_MULTITHREADING is set in
 * features; in every other case the serial encoder runs.
 *
 * Returns whatever the selected encoder returns.
 */
ssize_t lzp_compress(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen, int features)
{
    ssize_t outLen;

#ifdef LZP_OPENMP
    if ((bsc_lzp_num_blocks(n) == 1) || !(features & LZP_FEATURE_MULTITHREADING))
    {
        outLen = bsc_lzp_compress_serial(input, output, n, hashSize, minLen);
    }
    else
    {
        outLen = bsc_lzp_compress_parallel(input, output, n, hashSize, minLen);
    }
#else
    outLen = bsc_lzp_compress_serial(input, output, n, hashSize, minLen);
#endif

    return outLen;
}
|
||||
|
||||
ssize_t lzp_decompress(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen, int features)
|
||||
{
|
||||
int nBlocks = input[0];
|
||||
|
||||
if (nBlocks == 1)
|
||||
{
|
||||
return bsc_lzp_decode_block(input + 1, input + n, output, hashSize, minLen);
|
||||
}
|
||||
|
||||
int decompressionResult[ALPHABET_SIZE];
|
||||
|
||||
#ifdef LZP_OPENMP
|
||||
|
||||
if (features & LZP_FEATURE_MULTITHREADING)
|
||||
{
|
||||
#pragma omp parallel for schedule(dynamic)
|
||||
for (int blockId = 0; blockId < nBlocks; ++blockId)
|
||||
{
|
||||
int inputPtr = 0; for (int p = 0; p < blockId; ++p) inputPtr += *(int *)(input + 1 + 8 * p + 4);
|
||||
int outputPtr = 0; for (int p = 0; p < blockId; ++p) outputPtr += *(int *)(input + 1 + 8 * p + 0);
|
||||
|
||||
inputPtr += 1 + 8 * nBlocks;
|
||||
|
||||
int inputSize = *(int *)(input + 1 + 8 * blockId + 4);
|
||||
int outputSize = *(int *)(input + 1 + 8 * blockId + 0);
|
||||
|
||||
if (inputSize != outputSize)
|
||||
{
|
||||
decompressionResult[blockId] = bsc_lzp_decode_block(input + inputPtr, input + inputPtr + inputSize, output + outputPtr, hashSize, minLen);
|
||||
}
|
||||
else
|
||||
{
|
||||
decompressionResult[blockId] = inputSize; memcpy(output + outputPtr, input + inputPtr, inputSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
|
||||
#endif
|
||||
|
||||
{
|
||||
int blockId, p;
|
||||
|
||||
for (blockId = 0; blockId < nBlocks; ++blockId)
|
||||
{
|
||||
int inputPtr = 0; for (p = 0; p < blockId; ++p) inputPtr += *(int *)(input + 1 + 8 * p + 4);
|
||||
int outputPtr = 0; for (p = 0; p < blockId; ++p) outputPtr += *(int *)(input + 1 + 8 * p + 0);
|
||||
|
||||
inputPtr += 1 + 8 * nBlocks;
|
||||
|
||||
int inputSize = *(int *)(input + 1 + 8 * blockId + 4);
|
||||
int outputSize = *(int *)(input + 1 + 8 * blockId + 0);
|
||||
|
||||
if (inputSize != outputSize)
|
||||
{
|
||||
decompressionResult[blockId] = bsc_lzp_decode_block(input + inputPtr, input + inputPtr + inputSize, output + outputPtr, hashSize, minLen);
|
||||
}
|
||||
else
|
||||
{
|
||||
decompressionResult[blockId] = inputSize; memcpy(output + outputPtr, input + inputPtr, inputSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ssize_t dataSize = 0;
|
||||
int result = LZP_NO_ERROR;
|
||||
int blockId;
|
||||
for (blockId = 0; blockId < nBlocks; ++blockId)
|
||||
{
|
||||
if (decompressionResult[blockId] < LZP_NO_ERROR) result = decompressionResult[blockId];
|
||||
dataSize += decompressionResult[blockId];
|
||||
}
|
||||
|
||||
return (result == LZP_NO_ERROR) ? dataSize : result;
|
||||
}
|
||||
|
||||
int lzp_hash_size(int level) {
|
||||
if (level > 7) {
|
||||
return (LZP_DEFAULT_LZPHASHSIZE + 2);
|
||||
} else if (level > 5) {
|
||||
return (LZP_DEFAULT_LZPHASHSIZE + 3);
|
||||
} else if (level > 3) {
|
||||
return (LZP_DEFAULT_LZPHASHSIZE + 4);
|
||||
} else {
|
||||
return (LZP_DEFAULT_LZPHASHSIZE + 5);
|
||||
}
|
||||
}
|
||||
/*-----------------------------------------------------------*/
|
||||
/* End lzp.c                                                 */
|
||||
/*-----------------------------------------------------------*/
|
89
lzp/lzp.h
Normal file
89
lzp/lzp.h
Normal file
|
@ -0,0 +1,89 @@
|
|||
/*-----------------------------------------------------------*/
|
||||
/* Block Sorting, Lossless Data Compression Library. */
|
||||
/* Interface to Lempel Ziv Prediction functions */
|
||||
/*-----------------------------------------------------------*/
|
||||
|
||||
/*--
|
||||
|
||||
This file is a part of bsc and/or libbsc, a program and a library for
|
||||
lossless, block-sorting data compression.
|
||||
|
||||
Copyright (c) 2009-2012 Ilya Grebnov <ilya.grebnov@gmail.com>
|
||||
Copyright (c) 2012 Moinak Ghosh <moinakg@gmail.com>
|
||||
|
||||
See file AUTHORS for a full list of contributors.
|
||||
|
||||
The bsc and libbsc is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as published by
|
||||
the Free Software Foundation; either version 3 of the License, or (at your
|
||||
option) any later version.
|
||||
|
||||
The bsc and libbsc is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with the bsc and libbsc. If not, see http://www.gnu.org/licenses/.
|
||||
|
||||
Please see the files COPYING and COPYING.LIB for full copyright information.
|
||||
|
||||
See also the bsc and libbsc web site:
|
||||
http://libbsc.com/ for more information.
|
||||
|
||||
--*/
|
||||
|
||||
#ifndef _LZP_H
#define _LZP_H

#include <sys/types.h>	/* ssize_t, used in the prototypes below */

/*
 * Status codes. Negative values are errors; they are parenthesised so the
 * macros expand safely inside larger expressions.
 */
#define LZP_NO_ERROR                0
#define LZP_BAD_PARAMETER           (-1)
#define LZP_NOT_ENOUGH_MEMORY       (-2)
#define LZP_NOT_COMPRESSIBLE        (-3)
#define LZP_NOT_SUPPORTED           (-4)
#define LZP_UNEXPECTED_EOB          (-5)
#define LZP_DATA_CORRUPT            (-6)

/* Tuning defaults and limits. */
#define LZP_DEFAULT_LZPHASHSIZE     16
#define LZP_DEFAULT_LZPMINLEN       128
#define LZP_MAX_BLOCK               (2147483648LL)
#define ALPHABET_SIZE               (256)

#ifdef __cplusplus
extern "C" {
#endif

    /**
    * Preprocess a memory block by LZP algorithm.
    * @param input      - the input memory block of n bytes.
    * @param output     - the output memory block of n bytes.
    * @param n          - the length of the input/output memory blocks.
    * @param hashSize   - the hash table size, as a power-of-two exponent.
    * @param minLen     - the minimum match length.
    * @param features   - the set of additional features.
    * @return The length of preprocessed memory block if no error occurred, error code otherwise.
    */
    ssize_t lzp_compress(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen, int features);

    /**
    * Reconstructs the original memory block after LZP algorithm.
    * @param input      - the input memory block of n bytes.
    * @param output     - the output memory block.
    * @param n          - the length of the input memory block.
    * @param hashSize   - the hash table size, as a power-of-two exponent.
    * @param minLen     - the minimum match length.
    * @param features   - the set of additional features.
    * @return The length of original memory block if no error occurred, error code otherwise.
    */
    ssize_t lzp_decompress(const unsigned char * input, unsigned char * output, ssize_t n, int hashSize, int minLen, int features);

    /**
    * Select the hash table size exponent to use for a compression level.
    * @param level      - the compression level.
    * @return The hashSize value to pass to lzp_compress / lzp_decompress.
    */
    int lzp_hash_size(int level);

#ifdef __cplusplus
}
#endif

#endif
|
||||
|
||||
/*-----------------------------------------------------------*/
|
||||
/* End lzp.h */
|
||||
/*-----------------------------------------------------------*/
|
159
main.c
159
main.c
|
@ -45,6 +45,7 @@
|
|||
#include <pcompress.h>
|
||||
#include <allocator.h>
|
||||
#include <rabin_polynomial.h>
|
||||
#include <lzp.h>
|
||||
|
||||
/*
|
||||
* We use 5MB chunks by default.
|
||||
|
@ -78,6 +79,7 @@ static int hide_cmp_stats = 1;
|
|||
static int enable_rabin_scan = 0;
|
||||
static int enable_delta_encode = 0;
|
||||
static int enable_rabin_split = 1;
|
||||
static int lzp_preprocess = 0;
|
||||
static unsigned int chunk_num;
|
||||
static uint64_t largest_chunk, smallest_chunk, avg_chunk;
|
||||
static const char *exec_name;
|
||||
|
@ -128,8 +130,11 @@ usage(void)
|
|||
"5) Perform Delta Encoding in addition to Exact Dedup:\n"
|
||||
" %s -E ... - This also implies '-D'.\n"
|
||||
"6) Number of threads can optionally be specified: -t <1 - 256 count>\n"
|
||||
"7) Pass '-M' to display memory allocator statistics\n"
|
||||
"8) Pass '-C' to display compression statistics\n\n",
|
||||
"7) Other flags:\n"
|
||||
" '-L' - Enable LZP pre-compression. This improves compression ratio of all\n"
|
||||
" algorithms with some extra CPU and very low RAM overhead.\n"
|
||||
" '-M' - Display memory allocator statistics\n"
|
||||
" '-C' - Display compression statistics\n\n",
|
||||
UTILITY_VERSION, exec_name, exec_name, exec_name, exec_name, exec_name, exec_name);
|
||||
}
|
||||
|
||||
|
@ -148,6 +153,92 @@ show_compression_stats(uint64_t chunksize)
|
|||
bytes_to_size(avg_chunk), (double)avg_chunk/(double)chunksize*100);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wrapper functions to pre-process the buffer and then call the main compression routine.
|
||||
* At present only LZP pre-compression is used below. Some extra metadata is added:
|
||||
*
|
||||
* Byte 0: A flag to indicate which pre-processor was used.
|
||||
* Byte 1 - Byte 8: Size of buffer after pre-processing
|
||||
*
|
||||
* It is possible for a buffer to be only pre-processed and not compressed by the final
|
||||
* algorithm if the final one fails to compress for some reason. However the vice versa
|
||||
* is not allowed.
|
||||
*/
|
||||
int
|
||||
preproc_compress(compress_func_ptr cmp_func, void *src, size_t srclen, void *dst,
|
||||
size_t *dstlen, int level, uchar_t chdr, void *data)
|
||||
{
|
||||
uchar_t *dest = (uchar_t *)dst, type = 0;
|
||||
ssize_t result, _dstlen;
|
||||
|
||||
if (lzp_preprocess) {
|
||||
int hashsize;
|
||||
|
||||
type = PREPROC_TYPE_LZP;
|
||||
hashsize = lzp_hash_size(level);
|
||||
result = lzp_compress(src, dst, srclen, hashsize, LZP_DEFAULT_LZPMINLEN, 0);
|
||||
if (result < 0 || result == srclen) return (-1);
|
||||
srclen = result;
|
||||
memcpy(src, dst, srclen);
|
||||
} else {
|
||||
/*
|
||||
* Execution won't come here but just in case ...
|
||||
*/
|
||||
fprintf(stderr, "Invalid preprocessing mode\n");
|
||||
return (-1);
|
||||
}
|
||||
|
||||
*dest = type;
|
||||
*((int64_t *)(dest + 1)) = htonll(srclen);
|
||||
_dstlen = srclen;
|
||||
result = cmp_func(src, srclen, dest+9, &_dstlen, level, chdr, data);
|
||||
if (result == 0 && _dstlen < srclen) {
|
||||
*dest |= PREPROC_COMPRESSED;
|
||||
*dstlen = _dstlen + 9;
|
||||
} else {
|
||||
memcpy(dest+1, src, srclen);
|
||||
_dstlen = srclen;
|
||||
*dstlen = _dstlen + 1;
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
int
|
||||
preproc_decompress(compress_func_ptr dec_func, void *src, size_t srclen, void *dst,
|
||||
size_t *dstlen, int level, uchar_t chdr, void *data)
|
||||
{
|
||||
uchar_t *sorc = (uchar_t *)src, type;
|
||||
ssize_t result;
|
||||
|
||||
type = *sorc;
|
||||
sorc++;
|
||||
srclen--;
|
||||
if (type & PREPROC_COMPRESSED) {
|
||||
*dstlen = ntohll(*((int64_t *)(sorc)));
|
||||
sorc += 8;
|
||||
srclen -= 8;
|
||||
result = dec_func(sorc, srclen, dst, dstlen, level, chdr, data);
|
||||
if (result < 0) return (result);
|
||||
memcpy(src, dst, *dstlen);
|
||||
srclen = *dstlen;
|
||||
}
|
||||
|
||||
if (type & PREPROC_TYPE_LZP) {
|
||||
int hashsize;
|
||||
hashsize = lzp_hash_size(level);
|
||||
result = lzp_decompress(src, dst, srclen, hashsize, LZP_DEFAULT_LZPMINLEN, 0);
|
||||
if (result < 0) {
|
||||
fprintf(stderr, "LZP decompression failed.\n");
|
||||
return (-1);
|
||||
}
|
||||
*dstlen = result;
|
||||
} else {
|
||||
fprintf(stderr, "Invalid preprocessing flags: %d\n", type);
|
||||
return (-1);
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* This routine is called in multiple threads. Calls the decompression handler
|
||||
* as encoded in the file header. For adaptive mode the handler adapt_decompress()
|
||||
|
@ -214,8 +305,13 @@ redo:
|
|||
cmpbuf = cseg + RABIN_HDR_SIZE + rabin_index_sz_cmp;
|
||||
ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + rabin_index_sz;
|
||||
if (HDR & COMPRESSED) {
|
||||
rv = tdat->decompress(cmpbuf, rabin_data_sz_cmp, ubuf, &_chunksize,
|
||||
tdat->level, HDR, tdat->data);
|
||||
if (HDR & CHUNK_FLAG_PREPROC) {
|
||||
rv = preproc_decompress(tdat->decompress, cmpbuf, rabin_data_sz_cmp,
|
||||
ubuf, &_chunksize, tdat->level, HDR, tdat->data);
|
||||
} else {
|
||||
rv = tdat->decompress(cmpbuf, rabin_data_sz_cmp, ubuf, &_chunksize,
|
||||
tdat->level, HDR, tdat->data);
|
||||
}
|
||||
if (rv == -1) {
|
||||
tdat->len_cmp = 0;
|
||||
fprintf(stderr, "ERROR: Chunk %d, decompression failed.\n", tdat->id);
|
||||
|
@ -237,8 +333,13 @@ redo:
|
|||
}
|
||||
} else {
|
||||
if (HDR & COMPRESSED) {
|
||||
rv = tdat->decompress(cseg, tdat->len_cmp, tdat->uncompressed_chunk,
|
||||
&_chunksize, tdat->level, HDR, tdat->data);
|
||||
if (HDR & CHUNK_FLAG_PREPROC) {
|
||||
rv = preproc_decompress(tdat->decompress, cseg, tdat->len_cmp,
|
||||
tdat->uncompressed_chunk, &_chunksize, tdat->level, HDR, tdat->data);
|
||||
} else {
|
||||
rv = tdat->decompress(cseg, tdat->len_cmp, tdat->uncompressed_chunk,
|
||||
&_chunksize, tdat->level, HDR, tdat->data);
|
||||
}
|
||||
} else {
|
||||
memcpy(tdat->uncompressed_chunk, cseg, _chunksize);
|
||||
}
|
||||
|
@ -317,7 +418,7 @@ cont:
|
|||
* | `---------------- 2 - Lzma (Adaptive Mode)
|
||||
* | 3 - PPMD (Adaptive Mode)
|
||||
* |
|
||||
* `---------------------- 1 - Last Chunk flag
|
||||
* `---------------------- 1 - Chunk size flag (if original chunk is of variable length)
|
||||
*
|
||||
* A file trailer to indicate end.
|
||||
* Zero Compressed length: 8 zero bytes.
|
||||
|
@ -459,7 +560,7 @@ start_decompress(const char *filename, const char *to_filename)
|
|||
}
|
||||
}
|
||||
if (enable_rabin_scan) {
|
||||
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize,
|
||||
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, 0,
|
||||
algo, enable_delta_encode);
|
||||
if (tdat->rctx == NULL) {
|
||||
UNCOMP_BAIL;
|
||||
|
@ -685,7 +786,7 @@ redo:
|
|||
/* Compress index if it is at least 90 bytes. */
|
||||
rv = lzma_compress(tdat->uncompressed_chunk + RABIN_HDR_SIZE,
|
||||
rabin_index_sz, compressed_chunk + RABIN_HDR_SIZE,
|
||||
&index_size_cmp, tdat->rctx->level, 0, tdat->rctx->lzma_data);
|
||||
&index_size_cmp, tdat->rctx->level, 255, tdat->rctx->lzma_data);
|
||||
} else {
|
||||
memcpy(compressed_chunk + RABIN_HDR_SIZE,
|
||||
tdat->uncompressed_chunk + RABIN_HDR_SIZE, rabin_index_sz);
|
||||
|
@ -696,9 +797,16 @@ redo:
|
|||
if (rv == 0) {
|
||||
memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE);
|
||||
/* Compress data chunk. */
|
||||
rv = tdat->compress(tdat->uncompressed_chunk + rabin_index_sz,
|
||||
_chunksize, compressed_chunk + index_size_cmp, &_chunksize,
|
||||
tdat->level, 0, tdat->data);
|
||||
if (lzp_preprocess) {
|
||||
rv = preproc_compress(tdat->compress,
|
||||
tdat->uncompressed_chunk + rabin_index_sz,
|
||||
_chunksize, compressed_chunk + index_size_cmp, &_chunksize,
|
||||
tdat->level, 0, tdat->data);
|
||||
} else {
|
||||
rv = tdat->compress(tdat->uncompressed_chunk + rabin_index_sz,
|
||||
_chunksize, compressed_chunk + index_size_cmp, &_chunksize,
|
||||
tdat->level, 0, tdat->data);
|
||||
}
|
||||
|
||||
/* Can't compress data just retain as-is. */
|
||||
if (rv < 0)
|
||||
|
@ -720,8 +828,14 @@ redo:
|
|||
} else {
|
||||
plain_compress:
|
||||
_chunksize = tdat->rbytes;
|
||||
rv = tdat->compress(tdat->uncompressed_chunk, tdat->rbytes,
|
||||
compressed_chunk, &_chunksize, tdat->level, 0, tdat->data);
|
||||
if (lzp_preprocess) {
|
||||
rv = preproc_compress(tdat->compress,
|
||||
tdat->uncompressed_chunk, tdat->rbytes,
|
||||
compressed_chunk, &_chunksize, tdat->level, 0, tdat->data);
|
||||
} else {
|
||||
rv = tdat->compress(tdat->uncompressed_chunk, tdat->rbytes,
|
||||
compressed_chunk, &_chunksize, tdat->level, 0, tdat->data);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Sanity check to ensure compressed data is lesser than original.
|
||||
|
@ -742,6 +856,9 @@ plain_compress:
|
|||
if (enable_rabin_scan && tdat->rctx->valid) {
|
||||
type |= CHUNK_FLAG_DEDUP;
|
||||
}
|
||||
if (lzp_preprocess) {
|
||||
type |= CHUNK_FLAG_PREPROC;
|
||||
}
|
||||
/*
|
||||
* Insert compressed chunk length and CRC64 checksum into
|
||||
* chunk header.
|
||||
|
@ -871,8 +988,8 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
if (enable_rabin_scan) {
|
||||
flags |= FLAG_DEDUP;
|
||||
/* Additional scratch space for dedup arrays. */
|
||||
compressed_chunksize += (rabin_buf_extra(chunksize) -
|
||||
(compressed_chunksize - chunksize));
|
||||
compressed_chunksize += (rabin_buf_extra(chunksize, 0, algo,
|
||||
enable_delta_encode) - (compressed_chunksize - chunksize));
|
||||
}
|
||||
|
||||
err = 0;
|
||||
|
@ -992,7 +1109,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
}
|
||||
}
|
||||
if (enable_rabin_scan) {
|
||||
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize,
|
||||
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, 0,
|
||||
algo, enable_delta_encode);
|
||||
if (tdat->rctx == NULL) {
|
||||
COMP_BAIL;
|
||||
|
@ -1057,7 +1174,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
* Read the first chunk into a spare buffer (a simple double-buffering).
|
||||
*/
|
||||
if (enable_rabin_split) {
|
||||
rctx = create_rabin_context(chunksize, 0, algo, enable_delta_encode);
|
||||
rctx = create_rabin_context(chunksize, 0, 0, algo, enable_delta_encode);
|
||||
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
|
||||
} else {
|
||||
rbytes = Read(uncompfd, cread_buf, chunksize);
|
||||
|
@ -1371,7 +1488,7 @@ main(int argc, char *argv[])
|
|||
level = 6;
|
||||
slab_init();
|
||||
|
||||
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDEr")) != -1) {
|
||||
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDErL")) != -1) {
|
||||
int ovr;
|
||||
|
||||
switch (opt) {
|
||||
|
@ -1432,6 +1549,10 @@ main(int argc, char *argv[])
|
|||
enable_delta_encode = 1;
|
||||
break;
|
||||
|
||||
case 'L':
|
||||
lzp_preprocess = 1;
|
||||
break;
|
||||
|
||||
case 'r':
|
||||
enable_rabin_split = 0;
|
||||
break;
|
||||
|
|
|
@ -47,8 +47,12 @@ extern "C" {
|
|||
#define BZIP2_A_NUM 16
|
||||
#define LZMA_A_NUM 32
|
||||
#define CHUNK_FLAG_DEDUP 2
|
||||
#define CHUNK_FLAG_PREPROC 4
|
||||
#define COMP_EXTN ".pz"
|
||||
|
||||
#define PREPROC_TYPE_LZP 1
|
||||
#define PREPROC_COMPRESSED 128
|
||||
|
||||
/*
|
||||
* lower 3 bits in higher nibble indicate compression algorithm.
|
||||
*/
|
||||
|
|
|
@ -94,21 +94,47 @@ static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
|
|||
uint64_t ir[256];
|
||||
static int inited = 0;
|
||||
|
||||
uint32_t
|
||||
rabin_buf_extra(uint64_t chunksize)
|
||||
static uint32_t
|
||||
rabin_min_blksz(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
|
||||
{
|
||||
return ((chunksize / RAB_POLYNOMIAL_MIN_BLOCK_SIZE2) * sizeof (uint32_t));
|
||||
uint32_t min_blk;
|
||||
|
||||
min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
|
||||
if (((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) &&
|
||||
chunksize <= LZMA_WINDOW_MAX) || delta_flag) {
|
||||
if (memcmp(algo, "lzfx", 4) == 0 || memcmp(algo, "lz4", 3) == 0 ||
|
||||
memcmp(algo, "zlib", 4) == 0 || memcmp(algo, "none", 4) == 0) {
|
||||
min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS - 1);
|
||||
}
|
||||
} else {
|
||||
min_blk = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS - 1);
|
||||
}
|
||||
return (min_blk);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
|
||||
{
|
||||
if (rab_blk_sz < 1 || rab_blk_sz > 5)
|
||||
rab_blk_sz = RAB_BLK_DEFAULT;
|
||||
|
||||
return ((chunksize / rabin_min_blksz(chunksize, rab_blk_sz, algo, delta_flag))
|
||||
* sizeof (uint32_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the algorithm with the default params.
|
||||
*/
|
||||
rabin_context_t *
|
||||
create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *algo, int delta_flag) {
|
||||
create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz,
|
||||
const char *algo, int delta_flag) {
|
||||
rabin_context_t *ctx;
|
||||
unsigned char *current_window_data;
|
||||
uint32_t i;
|
||||
|
||||
if (rab_blk_sz < 1 || rab_blk_sz > 5)
|
||||
rab_blk_sz = RAB_BLK_DEFAULT;
|
||||
|
||||
/*
|
||||
* Pre-compute a table of irreducible polynomial evaluations for each
|
||||
* possible byte value.
|
||||
|
@ -157,28 +183,12 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al
|
|||
|
||||
ctx->rabin_break_patt = 0;
|
||||
ctx->delta_flag = delta_flag;
|
||||
if (((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) &&
|
||||
chunksize <= LZMA_WINDOW_MAX) || delta_flag) {
|
||||
if (memcmp(algo, "lzfx", 4) == 0 || memcmp(algo, "lz4", 3) == 0 ||
|
||||
memcmp(algo, "zlib", 4) == 0 || memcmp(algo, "none", 4) == 0) {
|
||||
ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE2;
|
||||
ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK2;
|
||||
ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE2;
|
||||
if (delta_flag)
|
||||
ctx->delta_flag = DELTA_LESS_FUZZY;
|
||||
} else {
|
||||
ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE;
|
||||
ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK;
|
||||
ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE;
|
||||
}
|
||||
} else {
|
||||
ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE2;
|
||||
ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK2;
|
||||
ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE2;
|
||||
}
|
||||
|
||||
ctx->rabin_poly_avg_block_size = 1 << (rab_blk_sz + RAB_BLK_MIN_BITS);
|
||||
ctx->rabin_avg_block_mask = ctx->rabin_poly_avg_block_size - 1;
|
||||
ctx->rabin_poly_min_block_size = rabin_min_blksz(chunksize, rab_blk_sz, algo, delta_flag);
|
||||
ctx->fp_mask = ctx->rabin_avg_block_mask | ctx->rabin_poly_avg_block_size;
|
||||
ctx->blknum = chunksize / ctx->rabin_poly_min_block_size;
|
||||
|
||||
if (chunksize % ctx->rabin_poly_min_block_size)
|
||||
ctx->blknum++;
|
||||
|
||||
|
|
|
@ -63,26 +63,14 @@
|
|||
|
||||
//List of constants, mostly constraints and defaults for various parameters
|
||||
//to the Rabin Fingerprinting algorithm
|
||||
|
||||
#define RAB_POLYNOMIAL_CONST 2
|
||||
// 1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT = Average Rabin Chunk Size
|
||||
// So we are always looking at power of 2 chunk sizes to avoid doing a modulus
|
||||
//
|
||||
#define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT 12
|
||||
#define RAB_POLYNOMIAL_AVG_BLOCK_SIZE (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT)
|
||||
#define RAB_POLYNOMIAL_AVG_BLOCK_MASK (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1)
|
||||
#define RAB_POLYNOMIAL_MIN_BLOCK_SIZE RAB_POLYNOMIAL_AVG_BLOCK_SIZE
|
||||
#define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024)
|
||||
|
||||
#define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT2 12
|
||||
#define RAB_POLYNOMIAL_AVG_BLOCK_SIZE2 (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT)
|
||||
#define RAB_POLYNOMIAL_AVG_BLOCK_MASK2 (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1)
|
||||
#define RAB_POLYNOMIAL_MIN_BLOCK_SIZE2 2048
|
||||
|
||||
#define RAB_BLK_DEFAULT 1
|
||||
#define RAB_BLK_MIN_BITS 11
|
||||
#define LZMA_WINDOW_MAX (128L * 1024L * 1024L)
|
||||
#define RAB_POLYNOMIAL_WIN_SIZE 16
|
||||
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 8
|
||||
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 64
|
||||
#define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024)
|
||||
|
||||
// Minimum practical chunk size when doing dedup
|
||||
#define RAB_MIN_CHUNK_SIZE (1048576L)
|
||||
|
@ -167,7 +155,7 @@ typedef struct {
|
|||
} rabin_context_t;
|
||||
|
||||
extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize,
|
||||
const char *algo, int delta_flag);
|
||||
int rab_blk_sz, const char *algo, int delta_flag);
|
||||
extern void destroy_rabin_context(rabin_context_t *ctx);
|
||||
extern unsigned int rabin_dedup(rabin_context_t *ctx, unsigned char *buf,
|
||||
ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
|
||||
|
@ -178,6 +166,7 @@ extern void rabin_parse_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_i
|
|||
extern void rabin_update_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp,
|
||||
ssize_t rabin_data_sz_cmp);
|
||||
extern void reset_rabin_context(rabin_context_t *ctx);
|
||||
extern uint32_t rabin_buf_extra(uint64_t chunksize);
|
||||
extern uint32_t rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
|
||||
int delta_flag);
|
||||
|
||||
#endif /* _RABIN_0POLY_H_ */
|
||||
#endif /* _RABIN_POLY_H_ */
|
||||
|
|
Loading…
Reference in a new issue