Make Delta2 encoding independent of LZP.

Tweak Delta2 parameters.
Update README and test cases.
This commit is contained in:
Moinak Ghosh 2012-12-15 22:03:23 +05:30
parent b01d255f6c
commit ef0191729e
8 changed files with 30 additions and 18 deletions

View file

@ -102,10 +102,12 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
delta encoding in conjunction with this may not always be beneficial.
However Adaptive Delta Encoding is beneficial along with this.
'-P' - Enable Adaptive Delta Encoding. This implies '-L' as well. It improves
compression ratio further at the cost of more CPU overhead. Delta
Encoding is combined with Run-Length encoding and Matrix transpose
of certain kinds of data to improve subsequent compression results.
'-P' - Enable Adaptive Delta Encoding. It can improve compression ratio further
for data containing tables of numerical values, especially if those are
in an arithmetic series. In this implementation, basic Delta Encoding is
combined with Run-Length encoding and Matrix transpose.
NOTE - If data has mixed textual and numeric table components, then both -L and
-P can be used together.
'-S' <cksum>
- Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and

View file

@ -282,8 +282,10 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
last = pos + srclen;
olen = ntohll(*((uint64_t *)pos));
if (*dstlen < (olen + 8))
if (*dstlen < olen) {
fprintf(stderr, "DELTA2 Decode: Destination buffer too small.\n");
return (-1);
}
out = 0;
pos += MAIN_HDR;
@ -297,6 +299,7 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
rcnt = ntohll(*((uint64_t *)pos));
pos += sizeof (rcnt);
if (out + rcnt > *dstlen) {
fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
return (-1);
}
memcpy(pos1, pos, rcnt);
@ -314,6 +317,7 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
rcnt = ntohll(*((uint64_t *)pos));
pos += sizeof (rcnt);
if (out + rcnt > *dstlen) {
fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
return (-1);
}
transpose(pos, pos1, rcnt, stride, COL);
@ -330,6 +334,7 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
delta = ntohll(*((uint64_t *)pos));
pos += sizeof (delta);
if (out + rcnt > *dstlen) {
fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
return (-1);
}

View file

@ -56,7 +56,7 @@ lz4_props(algo_props_t *data, int level, int64_t chunksize) {
data->compress_mt_capable = 0;
data->decompress_mt_capable = 0;
data->buf_extra = lz4_buf_extra(chunksize);
data->delta2_span = 50;
data->delta2_span = 100;
}
int

21
main.c
View file

@ -150,8 +150,11 @@ usage(void)
"7) Other flags:\n"
" '-L' - Enable LZP pre-compression. This improves compression ratio of all\n"
" algorithms with some extra CPU and very low RAM overhead.\n"
" '-P' - Enable Adaptive Delta Encoding. This implies '-L' as well. It improves\n"
" compresion ratio further at the cost of more CPU overhead.\n"
" '-P' - Enable Adaptive Delta Encoding. It can improve compresion ratio for\n"
" data containing tables of numerical values especially if those are in\n"
" an arithmetic series.\n"
" NOTE - If data has mixed textual and numeric table components then both -L and\n"
" -P can be used together.\n"
" '-S' <cksum>\n"
" - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and\n"
" SHA512. Default one is SKEIN256.\n"
@ -207,7 +210,8 @@ preproc_compress(compress_func_ptr cmp_func, void *src, uint64_t srclen, void *d
if (result < 0 || result == srclen) return (-1);
srclen = result;
memcpy(src, dst, srclen);
} else {
} else if (!enable_delta2_encode) {
/*
* Execution won't come here but just in case ...
* Even Delta2 encoding below enables LZP.
@ -282,7 +286,9 @@ preproc_decompress(compress_func_ptr dec_func, void *src, uint64_t srclen, void
return (-1);
}
*dstlen = result;
} else {
}
if (!(type & (PREPROC_COMPRESSED | PREPROC_TYPE_DELTA2 | PREPROC_TYPE_LZP))) {
fprintf(stderr, "Invalid preprocessing flags: %d\n", type);
return (-1);
}
@ -1196,7 +1202,7 @@ plain_index:
dedupe_index_sz += RABIN_HDR_SIZE;
memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE);
/* Compress data chunk. */
if (lzp_preprocess) {
if (lzp_preprocess || enable_delta2_encode) {
rv = preproc_compress(tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz,
_chunksize, compressed_chunk + index_size_cmp, &_chunksize,
tdat->level, 0, tdat->data, tdat->props);
@ -1215,7 +1221,7 @@ plain_index:
} else {
plain_compress:
_chunksize = tdat->rbytes;
if (lzp_preprocess) {
if (lzp_preprocess || enable_delta2_encode) {
rv = preproc_compress(tdat->compress,
tdat->uncompressed_chunk, tdat->rbytes,
compressed_chunk, &_chunksize, tdat->level, 0, tdat->data,
@ -1270,7 +1276,7 @@ plain_compress:
if ((enable_rabin_scan || enable_fixed_scan) && tdat->rctx->valid) {
type |= CHUNK_FLAG_DEDUP;
}
if (lzp_preprocess) {
if (lzp_preprocess || enable_delta2_encode) {
type |= CHUNK_FLAG_PREPROC;
}
@ -2165,7 +2171,6 @@ main(int argc, char *argv[])
break;
case 'P':
lzp_preprocess = 1;
enable_delta2_encode = 1;
break;

View file

@ -12,7 +12,7 @@ for algo in lzfx lz4 adapt
do
for tf in combined.dat comb_d.dat
do
for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -E -P"
for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P"
do
for seg in 2m 100m
do

View file

@ -12,7 +12,7 @@ for algo in lzfx adapt2
do
for tf in comb_d.dat
do
for feat in "-e" "-e -L -S SHA256" "-D -e -S SHA512" "-D -EE -L -e -S SKEIN512" "-e -S CRC64" "-e -P" "-e -P -S KECCAK256" "-D -e -L -S KECCAK512"
for feat in "-e" "-e -L -S SHA256" "-D -e -S SHA512" "-D -EE -L -e -S SKEIN512" "-e -S CRC64" "-e -P" "-e -L -P -S KECCAK256" "-D -e -L -S KECCAK512"
do
for seg in 2m 100m
do

View file

@ -12,7 +12,7 @@ for algo in lzfx lz4 adapt adapt2
do
for tf in combined.dat comb_d.dat
do
for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L"
for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P"
do
for seg in 2m 100m
do

View file

@ -91,7 +91,7 @@ zlib_stats(int show)
/*
 * Fill in the zlib algorithm properties. The lines shown only assign
 * delta2_span; level and chunksize are not referenced here.
 * NOTE(review): this text comes from a diff rendering, so the two assignments
 * are presumably the removed (50) and added (100) versions of one line; read
 * as plain C, the second store overwrites the first and 100 is the final value.
 */
void
zlib_props(algo_props_t *data, int level, int64_t chunksize) {
data->delta2_span = 50;
data->delta2_span = 100;
}
int