diff --git a/README.md b/README.md index 7126cd8..0b7b61b 100644 --- a/README.md +++ b/README.md @@ -102,10 +102,12 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library delta encoding in conjunction with this may not always be beneficial. However Adaptive Delta Encoding is beneficial along with this. - '-P' - Enable Adaptive Delta Encoding. This implies '-L' as well. It improves - compresion ratio further at the cost of more CPU overhead. Delta - Encoding is combined with Run-Length encoding and Matrix transpose - of certain kinds of data to improve subsequent compression results. + '-P' - Enable Adaptive Delta Encoding. It can improve compression ratio further + for data containing tables of numerical values especially if those are + in an arithmetic series. In this implementation basic Delta Encoding is + combined with Run-Length encoding and Matrix transpose. + NOTE - If data has mixed textual and numeric table components then both -L and + -P can be used together. '-S' - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and diff --git a/delta2/delta2.c b/delta2/delta2.c index fcc1196..f185821 100644 --- a/delta2/delta2.c +++ b/delta2/delta2.c @@ -282,8 +282,10 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen) last = pos + srclen; olen = ntohll(*((uint64_t *)pos)); - if (*dstlen < (olen + 8)) + if (*dstlen < olen) { + fprintf(stderr, "DELTA2 Decode: Destination buffer too small.\n"); return (-1); + } out = 0; pos += MAIN_HDR; @@ -297,6 +299,7 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen) rcnt = ntohll(*((uint64_t *)pos)); pos += sizeof (rcnt); if (out + rcnt > *dstlen) { + fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. 
Corrupt data.\n"); return (-1); } memcpy(pos1, pos, rcnt); @@ -314,6 +317,7 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen) rcnt = ntohll(*((uint64_t *)pos)); pos += sizeof (rcnt); if (out + rcnt > *dstlen) { + fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n"); return (-1); } transpose(pos, pos1, rcnt, stride, COL); @@ -330,6 +334,7 @@ delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen) delta = ntohll(*((uint64_t *)pos)); pos += sizeof (delta); if (out + rcnt > *dstlen) { + fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n"); return (-1); } diff --git a/lz4_compress.c b/lz4_compress.c index 96c0541..7904019 100644 --- a/lz4_compress.c +++ b/lz4_compress.c @@ -56,7 +56,7 @@ lz4_props(algo_props_t *data, int level, int64_t chunksize) { data->compress_mt_capable = 0; data->decompress_mt_capable = 0; data->buf_extra = lz4_buf_extra(chunksize); - data->delta2_span = 50; + data->delta2_span = 100; } int diff --git a/main.c b/main.c index 27272e8..10488db 100644 --- a/main.c +++ b/main.c @@ -150,8 +150,11 @@ usage(void) "7) Other flags:\n" " '-L' - Enable LZP pre-compression. This improves compression ratio of all\n" " algorithms with some extra CPU and very low RAM overhead.\n" - " '-P' - Enable Adaptive Delta Encoding. This implies '-L' as well. It improves\n" - " compresion ratio further at the cost of more CPU overhead.\n" + " '-P' - Enable Adaptive Delta Encoding. It can improve compresion ratio for\n" + " data containing tables of numerical values especially if those are in\n" + " an arithmetic series.\n" + " NOTE - If data has mixed textual and numeric table components then both -L and\n" + " -P can be used together.\n" " '-S' \n" " - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and\n" " SHA512. 
Default one is SKEIN256.\n" @@ -207,7 +210,8 @@ preproc_compress(compress_func_ptr cmp_func, void *src, uint64_t srclen, void *d if (result < 0 || result == srclen) return (-1); srclen = result; memcpy(src, dst, srclen); - } else { + + } else if (!enable_delta2_encode) { /* * Execution won't come here but just in case ... * Even Delta2 encoding below enables LZP. @@ -282,7 +286,9 @@ preproc_decompress(compress_func_ptr dec_func, void *src, uint64_t srclen, void return (-1); } *dstlen = result; - } else { + } + + if (!(type & (PREPROC_COMPRESSED | PREPROC_TYPE_DELTA2 | PREPROC_TYPE_LZP))) { fprintf(stderr, "Invalid preprocessing flags: %d\n", type); return (-1); } @@ -1196,7 +1202,7 @@ plain_index: dedupe_index_sz += RABIN_HDR_SIZE; memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE); /* Compress data chunk. */ - if (lzp_preprocess) { + if (lzp_preprocess || enable_delta2_encode) { rv = preproc_compress(tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz, _chunksize, compressed_chunk + index_size_cmp, &_chunksize, tdat->level, 0, tdat->data, tdat->props); @@ -1215,7 +1221,7 @@ plain_index: } else { plain_compress: _chunksize = tdat->rbytes; - if (lzp_preprocess) { + if (lzp_preprocess || enable_delta2_encode) { rv = preproc_compress(tdat->compress, tdat->uncompressed_chunk, tdat->rbytes, compressed_chunk, &_chunksize, tdat->level, 0, tdat->data, @@ -1270,7 +1276,7 @@ plain_compress: if ((enable_rabin_scan || enable_fixed_scan) && tdat->rctx->valid) { type |= CHUNK_FLAG_DEDUP; } - if (lzp_preprocess) { + if (lzp_preprocess || enable_delta2_encode) { type |= CHUNK_FLAG_PREPROC; } @@ -2165,7 +2171,6 @@ main(int argc, char *argv[]) break; case 'P': - lzp_preprocess = 1; enable_delta2_encode = 1; break; diff --git a/test/t4.tst b/test/t4.tst index 4cfb701..41b019a 100644 --- a/test/t4.tst +++ b/test/t4.tst @@ -12,7 +12,7 @@ for algo in lzfx lz4 adapt do for tf in combined.dat comb_d.dat do - for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D 
-B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -E -P" + for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P" do for seg in 2m 100m do diff --git a/test/t5.tst b/test/t5.tst index d68f0e9..d3bc7f5 100644 --- a/test/t5.tst +++ b/test/t5.tst @@ -12,7 +12,7 @@ for algo in lzfx adapt2 do for tf in comb_d.dat do - for feat in "-e" "-e -L -S SHA256" "-D -e -S SHA512" "-D -EE -L -e -S SKEIN512" "-e -S CRC64" "-e -P" "-e -P -S KECCAK256" "-D -e -L -S KECCAK512" + for feat in "-e" "-e -L -S SHA256" "-D -e -S SHA512" "-D -EE -L -e -S SKEIN512" "-e -S CRC64" "-e -P" "-e -L -P -S KECCAK256" "-D -e -L -S KECCAK512" do for seg in 2m 100m do diff --git a/test/t8.tst b/test/t8.tst index c05e2d3..efdc467 100644 --- a/test/t8.tst +++ b/test/t8.tst @@ -12,7 +12,7 @@ for algo in lzfx lz4 adapt adapt2 do for tf in combined.dat comb_d.dat do - for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" + for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P" do for seg in 2m 100m do diff --git a/zlib_compress.c b/zlib_compress.c index a4aa89a..76b5215 100644 --- a/zlib_compress.c +++ b/zlib_compress.c @@ -91,7 +91,7 @@ zlib_stats(int show) void zlib_props(algo_props_t *data, int level, int64_t chunksize) { - data->delta2_span = 50; + data->delta2_span = 100; } int