pcompress/lzp/lzp.c
Moinak Ghosh 6b756eb165 Fix Delta2 handling for small buffers.
Fix LZP handling during preprocessing.
Fix type flag handling during preprocessing.
Update test cases to use configurable list of test corpus files.
Update some more int64_t datatypes to uint64_t
Add a gitignore.
2013-01-02 22:56:21 +05:30

481 lines
16 KiB
C

/*-----------------------------------------------------------*/
/* Block Sorting, Lossless Data Compression Library. */
/* Lempel Ziv Prediction */
/*-----------------------------------------------------------*/
/*--
This file is a part of bsc and/or libbsc, a program and a library for
lossless, block-sorting data compression.
Copyright (c) 2009-2012 Ilya Grebnov <ilya.grebnov@gmail.com>
Copyright (c) 2012 Moinak Ghosh <moinakg@gmail.com>
See file AUTHORS for a full list of contributors.
The bsc and libbsc is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.
The bsc and libbsc is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the bsc and libbsc. If not, see http://www.gnu.org/licenses/.
Please see the files COPYING and COPYING.LIB for full copyright information.
See also the bsc and libbsc web site:
http://libbsc.com/ for more information.
--*/
/*
* TODO: Port the parallel implementation.
*/
#undef LZP_OPENMP
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS 1
#endif
#include <stdlib.h>
#include <memory.h>
#include <string.h>
#include <allocator.h>
#include <sys/types.h>
#include <stdio.h>
#include <utils.h>
#include "lzp.h"
#define LZP_MATCH_FLAG 0xf2
static
inline int bsc_lzp_num_blocks(int64_t n)
{
int nb;
if (n < 256LL * 1024LL) return 1;
if (n < 4LL * 1024LL * 1024LL) return 2;
if (n < 16LL * 1024LL * 1024LL) return 4;
if (n < LZP_MAX_BLOCK) return 8;
nb = n / LZP_MAX_BLOCK;
if (n % LZP_MAX_BLOCK) nb++;
return (nb);
}
static
int bsc_lzp_encode_block(const unsigned char * input, const unsigned char * inputEnd, unsigned char * output, unsigned char * outputEnd, int hashSize, int minLen)
{
int *lookup, i;
if (inputEnd - input < 16)
{
return LZP_NOT_COMPRESSIBLE;
}
lookup = (int *)slab_calloc(NULL, (int)(1 << hashSize), sizeof(int));
if (lookup)
{
unsigned int mask = (int)(1 << hashSize) - 1;
const unsigned char * inputStart = input;
const unsigned char * outputStart = output;
const unsigned char * outputEOB = outputEnd - 4;
unsigned int context = 0;
for (i = 0; i < 4; ++i)
{
context = (context << 8) | (*output++ = *input++);
}
const unsigned char * heuristic = input;
const unsigned char * inputMinLenEnd = inputEnd - minLen - 8;
while ((input < inputMinLenEnd) && (output < outputEOB))
{
unsigned int index = ((context >> 15) ^ context ^ (context >> 3)) & mask;
int value = lookup[index]; lookup[index] = (int)(input - inputStart);
if (value > 0)
{
const unsigned char * reference = inputStart + value;
if ((*(unsigned int *)(input + minLen - 4) == *(unsigned int *)(reference + minLen - 4)) && (*(unsigned int *)(input) == *(unsigned int *)(reference)))
{
if ((heuristic > input) && (*(unsigned int *)heuristic != *(unsigned int *)(reference + (heuristic - input))))
{
goto LZP_MATCH_NOT_FOUND;
}
int len = 4;
for (; input + len < inputMinLenEnd; len += 4)
{
if (*(unsigned int *)(input + len) != *(unsigned int *)(reference + len)) break;
}
if (len < minLen)
{
if (heuristic < input + len) heuristic = input + len;
goto LZP_MATCH_NOT_FOUND;
}
if (input[len] == reference[len]) len++;
if (input[len] == reference[len]) len++;
if (input[len] == reference[len]) len++;
input += len; context = input[-1] | (input[-2] << 8) | (input[-3] << 16) | (input[-4] << 24);
*output++ = LZP_MATCH_FLAG;
len -= minLen; while (len >= 254) { len -= 254; *output++ = 254; if (output >= outputEOB) break; }
*output++ = (unsigned char)(len);
}
else
{
unsigned char next;
LZP_MATCH_NOT_FOUND:
next = *output++ = *input++; context = (context << 8) | next;
if (next == LZP_MATCH_FLAG) *output++ = 255;
}
}
else
{
context = (context << 8) | (*output++ = *input++);
}
}
while ((input < inputEnd) && (output < outputEOB))
{
unsigned int index = ((context >> 15) ^ context ^ (context >> 3)) & mask;
int value = lookup[index]; lookup[index] = (int)(input - inputStart);
if (value > 0)
{
unsigned char next = *output++ = *input++; context = (context << 8) | next;
if (next == LZP_MATCH_FLAG) *output++ = 255;
}
else
{
context = (context << 8) | (*output++ = *input++);
}
}
slab_free(NULL, lookup);
return (output >= outputEOB) ? LZP_NOT_COMPRESSIBLE : (int)(output - outputStart);
}
return LZP_NOT_ENOUGH_MEMORY;
}
static
int bsc_lzp_decode_block(const unsigned char * input, const unsigned char * inputEnd, unsigned char * output, int hashSize, int minLen)
{
int *lookup, i;
if (inputEnd - input < 4)
{
return LZP_UNEXPECTED_EOB;
}
lookup = (int *)slab_calloc(NULL, (int)(1 << hashSize), sizeof(int));
if (lookup)
{
unsigned int mask = (int)(1 << hashSize) - 1;
const unsigned char * outputStart = output;
unsigned int context = 0;
for (i = 0; i < 4; ++i)
{
context = (context << 8) | (*output++ = *input++);
}
while (input < inputEnd)
{
unsigned int index = ((context >> 15) ^ context ^ (context >> 3)) & mask;
int value = lookup[index]; lookup[index] = (int)(output - outputStart);
if (*input == LZP_MATCH_FLAG && value > 0)
{
input++;
if (*input != 255)
{
int len = minLen; while (1) { len += *input; if (*input++ != 254) break; }
const unsigned char * reference = outputStart + value;
unsigned char * outputEnd = output + len;
if (output - reference < 4)
{
int offset[4] = {0, 3, 2, 3};
*output++ = *reference++;
*output++ = *reference++;
*output++ = *reference++;
*output++ = *reference++;
reference -= offset[output - reference];
}
while (output < outputEnd) { *(unsigned int *)output = *(unsigned int*)reference; output += 4; reference += 4; }
output = outputEnd; context = output[-1] | (output[-2] << 8) | (output[-3] << 16) | (output[-4] << 24);
}
else
{
input++; context = (context << 8) | (*output++ = LZP_MATCH_FLAG);
}
}
else
{
context = (context << 8) | (*output++ = *input++);
}
}
slab_free(NULL, lookup);
return (int)(output - outputStart);
}
return LZP_NOT_ENOUGH_MEMORY;
}
static
int64_t bsc_lzp_compress_serial(const unsigned char * input, unsigned char * output, int64_t n, int hashSize, int minLen)
{
if (bsc_lzp_num_blocks(n) == 1)
{
int result = bsc_lzp_encode_block(input, input + n, output + 1, output + n - 1, hashSize, minLen);
if (result >= LZP_NO_ERROR) result = (output[0] = 1, result + 1);
return result;
}
int nBlocks = bsc_lzp_num_blocks(n);
int chunkSize;
int blockId;
int64_t outputPtr = 1 + 8 * nBlocks;
DEBUG_STAT_EN(double strt, en);
DEBUG_STAT_EN(strt = get_wtime_millis());
if (n > LZP_MAX_BLOCK)
chunkSize = LZP_MAX_BLOCK;
else
chunkSize = n / nBlocks;
output[0] = nBlocks;
for (blockId = 0; blockId < nBlocks; ++blockId)
{
int64_t inputStart = blockId * chunkSize;
int inputSize = blockId != nBlocks - 1 ? chunkSize : n - inputStart;
int outputSize = inputSize; if (outputSize > n - outputPtr) outputSize = n - outputPtr;
int result = bsc_lzp_encode_block(input + inputStart, input + inputStart + inputSize, output + outputPtr, output + outputPtr + outputSize, hashSize, minLen);
if (result < LZP_NO_ERROR)
{
if (outputPtr + inputSize >= n) return LZP_NOT_COMPRESSIBLE;
result = inputSize; memcpy(output + outputPtr, input + inputStart, inputSize);
}
*(int *)(output + 1 + 8 * blockId + 0) = inputSize;
*(int *)(output + 1 + 8 * blockId + 4) = result;
outputPtr += result;
}
DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "LZP: Insize: %" PRId64 ", Outsize: %" PRId64 "\n", n, outputPtr));
DEBUG_STAT_EN(fprintf(stderr, "LZP: Processed at %.3f MB/s\n", get_mb_s(n, strt, en)));
return outputPtr;
}
#ifdef LZP_OPENMP
static
int bsc_lzp_compress_parallel(const unsigned char * input, unsigned char * output, int64_t n, int hashSize, int minLen)
{
if (unsigned char * buffer = (unsigned char *)bsc_malloc(n * sizeof(unsigned char)))
{
int compressionResult[ALPHABET_SIZE];
int nBlocks = bsc_lzp_num_blocks(n);
int result = LZP_NO_ERROR;
int chunkSize = n / nBlocks;
int numThreads = omp_get_max_threads();
if (numThreads > nBlocks) numThreads = nBlocks;
output[0] = nBlocks;
#pragma omp parallel num_threads(numThreads) if(numThreads > 1)
{
if (omp_get_num_threads() == 1)
{
result = bsc_lzp_compress_serial(input, output, n, hashSize, minLen);
}
else
{
#pragma omp for schedule(dynamic)
for (int blockId = 0; blockId < nBlocks; ++blockId)
{
int blockStart = blockId * chunkSize;
int blockSize = blockId != nBlocks - 1 ? chunkSize : n - blockStart;
compressionResult[blockId] = bsc_lzp_encode_block(input + blockStart, input + blockStart + blockSize, buffer + blockStart, buffer + blockStart + blockSize, hashSize, minLen);
if (compressionResult[blockId] < LZP_NO_ERROR) compressionResult[blockId] = blockSize;
*(int *)(output + 1 + 8 * blockId + 0) = blockSize;
*(int *)(output + 1 + 8 * blockId + 4) = compressionResult[blockId];
}
#pragma omp single
{
result = 1 + 8 * nBlocks;
for (int blockId = 0; blockId < nBlocks; ++blockId)
{
result += compressionResult[blockId];
}
if (result >= n) result = LZP_NOT_COMPRESSIBLE;
}
if (result >= LZP_NO_ERROR)
{
#pragma omp for schedule(dynamic)
for (int blockId = 0; blockId < nBlocks; ++blockId)
{
int blockStart = blockId * chunkSize;
int blockSize = blockId != nBlocks - 1 ? chunkSize : n - blockStart;
int outputPtr = 1 + 8 * nBlocks;
for (int p = 0; p < blockId; ++p) outputPtr += compressionResult[p];
if (compressionResult[blockId] != blockSize)
{
memcpy(output + outputPtr, buffer + blockStart, compressionResult[blockId]);
}
else
{
memcpy(output + outputPtr, input + blockStart, compressionResult[blockId]);
}
}
}
}
}
bsc_free(buffer);
return result;
}
return LZP_NOT_ENOUGH_MEMORY;
}
#endif
int64_t lzp_compress(const unsigned char * input, unsigned char * output, int64_t n, int hashSize, int minLen, int features)
{
#ifdef LZP_OPENMP
if ((bsc_lzp_num_blocks(n) != 1) && (features & LZP_FEATURE_MULTITHREADING))
{
return bsc_lzp_compress_parallel(input, output, n, hashSize, minLen);
}
#endif
return bsc_lzp_compress_serial(input, output, n, hashSize, minLen);
}
int64_t lzp_decompress(const unsigned char * input, unsigned char * output, int64_t n, int hashSize, int minLen, int features)
{
int nBlocks = input[0];
if (nBlocks == 1)
{
return bsc_lzp_decode_block(input + 1, input + n, output, hashSize, minLen);
}
int decompressionResult[ALPHABET_SIZE];
#ifdef LZP_OPENMP
if (features & LZP_FEATURE_MULTITHREADING)
{
#pragma omp parallel for schedule(dynamic)
for (int blockId = 0; blockId < nBlocks; ++blockId)
{
int inputPtr = 0; for (int p = 0; p < blockId; ++p) inputPtr += *(int *)(input + 1 + 8 * p + 4);
int outputPtr = 0; for (int p = 0; p < blockId; ++p) outputPtr += *(int *)(input + 1 + 8 * p + 0);
inputPtr += 1 + 8 * nBlocks;
int inputSize = *(int *)(input + 1 + 8 * blockId + 4);
int outputSize = *(int *)(input + 1 + 8 * blockId + 0);
if (inputSize != outputSize)
{
decompressionResult[blockId] = bsc_lzp_decode_block(input + inputPtr, input + inputPtr + inputSize, output + outputPtr, hashSize, minLen);
}
else
{
decompressionResult[blockId] = inputSize; memcpy(output + outputPtr, input + inputPtr, inputSize);
}
}
}
else
#endif
{
int blockId, p;
for (blockId = 0; blockId < nBlocks; ++blockId)
{
int64_t inputPtr = 0; for (p = 0; p < blockId; ++p) inputPtr += *(int *)(input + 1 + 8 * p + 4);
int64_t outputPtr = 0; for (p = 0; p < blockId; ++p) outputPtr += *(int *)(input + 1 + 8 * p + 0);
inputPtr += 1 + 8 * nBlocks;
int inputSize = *(int *)(input + 1 + 8 * blockId + 4);
int outputSize = *(int *)(input + 1 + 8 * blockId + 0);
if (inputSize != outputSize)
{
decompressionResult[blockId] = bsc_lzp_decode_block(input + inputPtr, input + inputPtr + inputSize, output + outputPtr, hashSize, minLen);
}
else
{
decompressionResult[blockId] = inputSize; memcpy(output + outputPtr, input + inputPtr, inputSize);
}
}
}
int64_t dataSize = 0;
int result = LZP_NO_ERROR;
int blockId;
for (blockId = 0; blockId < nBlocks; ++blockId)
{
if (decompressionResult[blockId] < LZP_NO_ERROR) result = decompressionResult[blockId];
dataSize += decompressionResult[blockId];
}
return (result == LZP_NO_ERROR) ? dataSize : result;
}
/*
* Counter-intuitively we use a larger hash (with better LZP compression) for lower global
* compression levels. So that LZP preprocessing plays along nicely with the primary
* compression algorithm being used and actually provides a benefit.
*/
int lzp_hash_size(int level) {
if (level > 7) {
return (LZP_DEFAULT_LZPHASHSIZE + 2);
} else if (level > 5) {
return (LZP_DEFAULT_LZPHASHSIZE + 3);
} else if (level > 3) {
return (LZP_DEFAULT_LZPHASHSIZE + 4);
} else {
return (LZP_DEFAULT_LZPHASHSIZE + 5);
}
}
/*-----------------------------------------------------------*/
/* End lzp.cpp */
/*-----------------------------------------------------------*/