Make Delta2 encoding independent of LZP.

Tweak Delta2 parameters.
Update README and test cases.
This commit is contained in:
Moinak Ghosh 2012-12-15 22:03:23 +05:30
parent b01d255f6c
commit ef0191729e
8 changed files with 30 additions and 18 deletions

View file

@ -102,10 +102,12 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
delta encoding in conjunction with this may not always be beneficial. delta encoding in conjunction with this may not always be beneficial.
However Adaptive Delta Encoding is beneficial along with this. However Adaptive Delta Encoding is beneficial along with this.
'-P' - Enable Adaptive Delta Encoding. This implies '-L' as well. It improves '-P' - Enable Adaptive Delta Encoding. It can improve compression ratio further
compression ratio further at the cost of more CPU overhead. Delta for data containing tables of numerical values especially if those are
Encoding is combined with Run-Length encoding and Matrix transpose in an arithmetic series. In this implementation basic Delta Encoding is
of certain kinds of data to improve subsequent compression results. combined with Run-Length encoding and Matrix transpose
NOTE - If data has mixed textual and numeric table components then both -L and
-P can be used together.
'-S' <cksum> '-S' <cksum>
- Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and

View file

@ -282,8 +282,10 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
last = pos + srclen; last = pos + srclen;
olen = ntohll(*((uint64_t *)pos)); olen = ntohll(*((uint64_t *)pos));
if (*dstlen < (olen + 8)) if (*dstlen < olen) {
fprintf(stderr, "DELTA2 Decode: Destination buffer too small.\n");
return (-1); return (-1);
}
out = 0; out = 0;
pos += MAIN_HDR; pos += MAIN_HDR;
@ -297,6 +299,7 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
rcnt = ntohll(*((uint64_t *)pos)); rcnt = ntohll(*((uint64_t *)pos));
pos += sizeof (rcnt); pos += sizeof (rcnt);
if (out + rcnt > *dstlen) { if (out + rcnt > *dstlen) {
fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
return (-1); return (-1);
} }
memcpy(pos1, pos, rcnt); memcpy(pos1, pos, rcnt);
@ -314,6 +317,7 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
rcnt = ntohll(*((uint64_t *)pos)); rcnt = ntohll(*((uint64_t *)pos));
pos += sizeof (rcnt); pos += sizeof (rcnt);
if (out + rcnt > *dstlen) { if (out + rcnt > *dstlen) {
fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
return (-1); return (-1);
} }
transpose(pos, pos1, rcnt, stride, COL); transpose(pos, pos1, rcnt, stride, COL);
@ -330,6 +334,7 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
delta = ntohll(*((uint64_t *)pos)); delta = ntohll(*((uint64_t *)pos));
pos += sizeof (delta); pos += sizeof (delta);
if (out + rcnt > *dstlen) { if (out + rcnt > *dstlen) {
fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
return (-1); return (-1);
} }

View file

@ -56,7 +56,7 @@ lz4_props(algo_props_t *data, int level, int64_t chunksize) {
data->compress_mt_capable = 0; data->compress_mt_capable = 0;
data->decompress_mt_capable = 0; data->decompress_mt_capable = 0;
data->buf_extra = lz4_buf_extra(chunksize); data->buf_extra = lz4_buf_extra(chunksize);
data->delta2_span = 50; data->delta2_span = 100;
} }
int int

21
main.c
View file

@ -150,8 +150,11 @@ usage(void)
"7) Other flags:\n" "7) Other flags:\n"
" '-L' - Enable LZP pre-compression. This improves compression ratio of all\n" " '-L' - Enable LZP pre-compression. This improves compression ratio of all\n"
" algorithms with some extra CPU and very low RAM overhead.\n" " algorithms with some extra CPU and very low RAM overhead.\n"
" '-P' - Enable Adaptive Delta Encoding. This implies '-L' as well. It improves\n" " '-P' - Enable Adaptive Delta Encoding. It can improve compresion ratio for\n"
" compresion ratio further at the cost of more CPU overhead.\n" " data containing tables of numerical values especially if those are in\n"
" an arithmetic series.\n"
" NOTE - If data has mixed textual and numeric table components then both -L and\n"
" -P can be used together.\n"
" '-S' <cksum>\n" " '-S' <cksum>\n"
" - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and\n" " - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and\n"
" SHA512. Default one is SKEIN256.\n" " SHA512. Default one is SKEIN256.\n"
@ -207,7 +210,8 @@ preproc_compress(compress_func_ptr cmp_func, void *src, uint64_t srclen, void *d
if (result < 0 || result == srclen) return (-1); if (result < 0 || result == srclen) return (-1);
srclen = result; srclen = result;
memcpy(src, dst, srclen); memcpy(src, dst, srclen);
} else {
} else if (!enable_delta2_encode) {
/* /*
* Execution won't come here but just in case ... * Execution won't come here but just in case ...
* Even Delta2 encoding below enables LZP. * Even Delta2 encoding below enables LZP.
@ -282,7 +286,9 @@ preproc_decompress(compress_func_ptr dec_func, void *src, uint64_t srclen, void
return (-1); return (-1);
} }
*dstlen = result; *dstlen = result;
} else { }
if (!(type & (PREPROC_COMPRESSED | PREPROC_TYPE_DELTA2 | PREPROC_TYPE_LZP))) {
fprintf(stderr, "Invalid preprocessing flags: %d\n", type); fprintf(stderr, "Invalid preprocessing flags: %d\n", type);
return (-1); return (-1);
} }
@ -1196,7 +1202,7 @@ plain_index:
dedupe_index_sz += RABIN_HDR_SIZE; dedupe_index_sz += RABIN_HDR_SIZE;
memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE); memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE);
/* Compress data chunk. */ /* Compress data chunk. */
if (lzp_preprocess) { if (lzp_preprocess || enable_delta2_encode) {
rv = preproc_compress(tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz, rv = preproc_compress(tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz,
_chunksize, compressed_chunk + index_size_cmp, &_chunksize, _chunksize, compressed_chunk + index_size_cmp, &_chunksize,
tdat->level, 0, tdat->data, tdat->props); tdat->level, 0, tdat->data, tdat->props);
@ -1215,7 +1221,7 @@ plain_index:
} else { } else {
plain_compress: plain_compress:
_chunksize = tdat->rbytes; _chunksize = tdat->rbytes;
if (lzp_preprocess) { if (lzp_preprocess || enable_delta2_encode) {
rv = preproc_compress(tdat->compress, rv = preproc_compress(tdat->compress,
tdat->uncompressed_chunk, tdat->rbytes, tdat->uncompressed_chunk, tdat->rbytes,
compressed_chunk, &_chunksize, tdat->level, 0, tdat->data, compressed_chunk, &_chunksize, tdat->level, 0, tdat->data,
@ -1270,7 +1276,7 @@ plain_compress:
if ((enable_rabin_scan || enable_fixed_scan) && tdat->rctx->valid) { if ((enable_rabin_scan || enable_fixed_scan) && tdat->rctx->valid) {
type |= CHUNK_FLAG_DEDUP; type |= CHUNK_FLAG_DEDUP;
} }
if (lzp_preprocess) { if (lzp_preprocess || enable_delta2_encode) {
type |= CHUNK_FLAG_PREPROC; type |= CHUNK_FLAG_PREPROC;
} }
@ -2165,7 +2171,6 @@ main(int argc, char *argv[])
break; break;
case 'P': case 'P':
lzp_preprocess = 1;
enable_delta2_encode = 1; enable_delta2_encode = 1;
break; break;

View file

@ -12,7 +12,7 @@ for algo in lzfx lz4 adapt
do do
for tf in combined.dat comb_d.dat for tf in combined.dat comb_d.dat
do do
for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -E -P" for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P"
do do
for seg in 2m 100m for seg in 2m 100m
do do

View file

@ -12,7 +12,7 @@ for algo in lzfx adapt2
do do
for tf in comb_d.dat for tf in comb_d.dat
do do
for feat in "-e" "-e -L -S SHA256" "-D -e -S SHA512" "-D -EE -L -e -S SKEIN512" "-e -S CRC64" "-e -P" "-e -P -S KECCAK256" "-D -e -L -S KECCAK512" for feat in "-e" "-e -L -S SHA256" "-D -e -S SHA512" "-D -EE -L -e -S SKEIN512" "-e -S CRC64" "-e -P" "-e -L -P -S KECCAK256" "-D -e -L -S KECCAK512"
do do
for seg in 2m 100m for seg in 2m 100m
do do

View file

@ -12,7 +12,7 @@ for algo in lzfx lz4 adapt adapt2
do do
for tf in combined.dat comb_d.dat for tf in combined.dat comb_d.dat
do do
for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P"
do do
for seg in 2m 100m for seg in 2m 100m
do do

View file

@ -91,7 +91,7 @@ zlib_stats(int show)
void void
zlib_props(algo_props_t *data, int level, int64_t chunksize) { zlib_props(algo_props_t *data, int level, int64_t chunksize) {
data->delta2_span = 50; data->delta2_span = 100;
} }
int int