Make Delta2 encoding independent of LZP.
Tweak Delta2 parameters. Update README and test cases.
This commit is contained in:
parent
b01d255f6c
commit
ef0191729e
8 changed files with 30 additions and 18 deletions
10
README.md
10
README.md
|
@ -102,10 +102,12 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
|
||||||
delta encoding in conjunction with this may not always be beneficial.
|
delta encoding in conjunction with this may not always be beneficial.
|
||||||
However Adaptive Delta Encoding is beneficial along with this.
|
However Adaptive Delta Encoding is beneficial along with this.
|
||||||
|
|
||||||
'-P' - Enable Adaptive Delta Encoding. This implies '-L' as well. It improves
|
'-P' - Enable Adaptive Delta Encoding. It can improve compression ratio further
|
||||||
compression ratio further at the cost of more CPU overhead. Delta
|
for data containing tables of numerical values especially if those are
|
||||||
Encoding is combined with Run-Length encoding and Matrix transpose
|
in an arithmetic series. In this implementation basic Delta Encoding is
|
||||||
of certain kinds of data to improve subsequent compression results.
|
combined with Run-Length encoding and Matrix transpose
|
||||||
|
NOTE - If data has mixed textual and numeric table components then both -L and
|
||||||
|
-P can be used together.
|
||||||
|
|
||||||
'-S' <cksum>
|
'-S' <cksum>
|
||||||
- Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and
|
- Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and
|
||||||
|
|
|
@ -282,8 +282,10 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
|
||||||
|
|
||||||
last = pos + srclen;
|
last = pos + srclen;
|
||||||
olen = ntohll(*((uint64_t *)pos));
|
olen = ntohll(*((uint64_t *)pos));
|
||||||
if (*dstlen < (olen + 8))
|
if (*dstlen < olen) {
|
||||||
|
fprintf(stderr, "DELTA2 Decode: Destination buffer too small.\n");
|
||||||
return (-1);
|
return (-1);
|
||||||
|
}
|
||||||
|
|
||||||
out = 0;
|
out = 0;
|
||||||
pos += MAIN_HDR;
|
pos += MAIN_HDR;
|
||||||
|
@ -297,6 +299,7 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
|
||||||
rcnt = ntohll(*((uint64_t *)pos));
|
rcnt = ntohll(*((uint64_t *)pos));
|
||||||
pos += sizeof (rcnt);
|
pos += sizeof (rcnt);
|
||||||
if (out + rcnt > *dstlen) {
|
if (out + rcnt > *dstlen) {
|
||||||
|
fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
|
||||||
return (-1);
|
return (-1);
|
||||||
}
|
}
|
||||||
memcpy(pos1, pos, rcnt);
|
memcpy(pos1, pos, rcnt);
|
||||||
|
@ -314,6 +317,7 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
|
||||||
rcnt = ntohll(*((uint64_t *)pos));
|
rcnt = ntohll(*((uint64_t *)pos));
|
||||||
pos += sizeof (rcnt);
|
pos += sizeof (rcnt);
|
||||||
if (out + rcnt > *dstlen) {
|
if (out + rcnt > *dstlen) {
|
||||||
|
fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
|
||||||
return (-1);
|
return (-1);
|
||||||
}
|
}
|
||||||
transpose(pos, pos1, rcnt, stride, COL);
|
transpose(pos, pos1, rcnt, stride, COL);
|
||||||
|
@ -330,6 +334,7 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
|
||||||
delta = ntohll(*((uint64_t *)pos));
|
delta = ntohll(*((uint64_t *)pos));
|
||||||
pos += sizeof (delta);
|
pos += sizeof (delta);
|
||||||
if (out + rcnt > *dstlen) {
|
if (out + rcnt > *dstlen) {
|
||||||
|
fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
|
||||||
return (-1);
|
return (-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -56,7 +56,7 @@ lz4_props(algo_props_t *data, int level, int64_t chunksize) {
|
||||||
data->compress_mt_capable = 0;
|
data->compress_mt_capable = 0;
|
||||||
data->decompress_mt_capable = 0;
|
data->decompress_mt_capable = 0;
|
||||||
data->buf_extra = lz4_buf_extra(chunksize);
|
data->buf_extra = lz4_buf_extra(chunksize);
|
||||||
data->delta2_span = 50;
|
data->delta2_span = 100;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
|
|
21
main.c
21
main.c
|
@ -150,8 +150,11 @@ usage(void)
|
||||||
"7) Other flags:\n"
|
"7) Other flags:\n"
|
||||||
" '-L' - Enable LZP pre-compression. This improves compression ratio of all\n"
|
" '-L' - Enable LZP pre-compression. This improves compression ratio of all\n"
|
||||||
" algorithms with some extra CPU and very low RAM overhead.\n"
|
" algorithms with some extra CPU and very low RAM overhead.\n"
|
||||||
" '-P' - Enable Adaptive Delta Encoding. This implies '-L' as well. It improves\n"
|
" '-P' - Enable Adaptive Delta Encoding. It can improve compresion ratio for\n"
|
||||||
" compresion ratio further at the cost of more CPU overhead.\n"
|
" data containing tables of numerical values especially if those are in\n"
|
||||||
|
" an arithmetic series.\n"
|
||||||
|
" NOTE - If data has mixed textual and numeric table components then both -L and\n"
|
||||||
|
" -P can be used together.\n"
|
||||||
" '-S' <cksum>\n"
|
" '-S' <cksum>\n"
|
||||||
" - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and\n"
|
" - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and\n"
|
||||||
" SHA512. Default one is SKEIN256.\n"
|
" SHA512. Default one is SKEIN256.\n"
|
||||||
|
@ -207,7 +210,8 @@ preproc_compress(compress_func_ptr cmp_func, void *src, uint64_t srclen, void *d
|
||||||
if (result < 0 || result == srclen) return (-1);
|
if (result < 0 || result == srclen) return (-1);
|
||||||
srclen = result;
|
srclen = result;
|
||||||
memcpy(src, dst, srclen);
|
memcpy(src, dst, srclen);
|
||||||
} else {
|
|
||||||
|
} else if (!enable_delta2_encode) {
|
||||||
/*
|
/*
|
||||||
* Execution won't come here but just in case ...
|
* Execution won't come here but just in case ...
|
||||||
* Even Delta2 encoding below enables LZP.
|
* Even Delta2 encoding below enables LZP.
|
||||||
|
@ -282,7 +286,9 @@ preproc_decompress(compress_func_ptr dec_func, void *src, uint64_t srclen, void
|
||||||
return (-1);
|
return (-1);
|
||||||
}
|
}
|
||||||
*dstlen = result;
|
*dstlen = result;
|
||||||
} else {
|
}
|
||||||
|
|
||||||
|
if (!(type & (PREPROC_COMPRESSED | PREPROC_TYPE_DELTA2 | PREPROC_TYPE_LZP))) {
|
||||||
fprintf(stderr, "Invalid preprocessing flags: %d\n", type);
|
fprintf(stderr, "Invalid preprocessing flags: %d\n", type);
|
||||||
return (-1);
|
return (-1);
|
||||||
}
|
}
|
||||||
|
@ -1196,7 +1202,7 @@ plain_index:
|
||||||
dedupe_index_sz += RABIN_HDR_SIZE;
|
dedupe_index_sz += RABIN_HDR_SIZE;
|
||||||
memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE);
|
memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE);
|
||||||
/* Compress data chunk. */
|
/* Compress data chunk. */
|
||||||
if (lzp_preprocess) {
|
if (lzp_preprocess || enable_delta2_encode) {
|
||||||
rv = preproc_compress(tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz,
|
rv = preproc_compress(tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz,
|
||||||
_chunksize, compressed_chunk + index_size_cmp, &_chunksize,
|
_chunksize, compressed_chunk + index_size_cmp, &_chunksize,
|
||||||
tdat->level, 0, tdat->data, tdat->props);
|
tdat->level, 0, tdat->data, tdat->props);
|
||||||
|
@ -1215,7 +1221,7 @@ plain_index:
|
||||||
} else {
|
} else {
|
||||||
plain_compress:
|
plain_compress:
|
||||||
_chunksize = tdat->rbytes;
|
_chunksize = tdat->rbytes;
|
||||||
if (lzp_preprocess) {
|
if (lzp_preprocess || enable_delta2_encode) {
|
||||||
rv = preproc_compress(tdat->compress,
|
rv = preproc_compress(tdat->compress,
|
||||||
tdat->uncompressed_chunk, tdat->rbytes,
|
tdat->uncompressed_chunk, tdat->rbytes,
|
||||||
compressed_chunk, &_chunksize, tdat->level, 0, tdat->data,
|
compressed_chunk, &_chunksize, tdat->level, 0, tdat->data,
|
||||||
|
@ -1270,7 +1276,7 @@ plain_compress:
|
||||||
if ((enable_rabin_scan || enable_fixed_scan) && tdat->rctx->valid) {
|
if ((enable_rabin_scan || enable_fixed_scan) && tdat->rctx->valid) {
|
||||||
type |= CHUNK_FLAG_DEDUP;
|
type |= CHUNK_FLAG_DEDUP;
|
||||||
}
|
}
|
||||||
if (lzp_preprocess) {
|
if (lzp_preprocess || enable_delta2_encode) {
|
||||||
type |= CHUNK_FLAG_PREPROC;
|
type |= CHUNK_FLAG_PREPROC;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2165,7 +2171,6 @@ main(int argc, char *argv[])
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'P':
|
case 'P':
|
||||||
lzp_preprocess = 1;
|
|
||||||
enable_delta2_encode = 1;
|
enable_delta2_encode = 1;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,7 @@ for algo in lzfx lz4 adapt
|
||||||
do
|
do
|
||||||
for tf in combined.dat comb_d.dat
|
for tf in combined.dat comb_d.dat
|
||||||
do
|
do
|
||||||
for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -E -P"
|
for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P"
|
||||||
do
|
do
|
||||||
for seg in 2m 100m
|
for seg in 2m 100m
|
||||||
do
|
do
|
||||||
|
|
|
@ -12,7 +12,7 @@ for algo in lzfx adapt2
|
||||||
do
|
do
|
||||||
for tf in comb_d.dat
|
for tf in comb_d.dat
|
||||||
do
|
do
|
||||||
for feat in "-e" "-e -L -S SHA256" "-D -e -S SHA512" "-D -EE -L -e -S SKEIN512" "-e -S CRC64" "-e -P" "-e -P -S KECCAK256" "-D -e -L -S KECCAK512"
|
for feat in "-e" "-e -L -S SHA256" "-D -e -S SHA512" "-D -EE -L -e -S SKEIN512" "-e -S CRC64" "-e -P" "-e -L -P -S KECCAK256" "-D -e -L -S KECCAK512"
|
||||||
do
|
do
|
||||||
for seg in 2m 100m
|
for seg in 2m 100m
|
||||||
do
|
do
|
||||||
|
|
|
@ -12,7 +12,7 @@ for algo in lzfx lz4 adapt adapt2
|
||||||
do
|
do
|
||||||
for tf in combined.dat comb_d.dat
|
for tf in combined.dat comb_d.dat
|
||||||
do
|
do
|
||||||
for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L"
|
for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P"
|
||||||
do
|
do
|
||||||
for seg in 2m 100m
|
for seg in 2m 100m
|
||||||
do
|
do
|
||||||
|
|
|
@ -91,7 +91,7 @@ zlib_stats(int show)
|
||||||
|
|
||||||
void
|
void
|
||||||
zlib_props(algo_props_t *data, int level, int64_t chunksize) {
|
zlib_props(algo_props_t *data, int level, int64_t chunksize) {
|
||||||
data->delta2_span = 50;
|
data->delta2_span = 100;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
|
|
Loading…
Reference in a new issue