Update adaptive mode heuristic to use thresholds based on the algorithms in each mode.
Remove incorrect check in PPMd decompression code. More refactoring of variable names.
This commit is contained in:
parent
449dc35675
commit
8f8af7ed6b
6 changed files with 44 additions and 44 deletions
10
README.md
10
README.md
|
@ -52,11 +52,11 @@ Usage
|
||||||
Bzip2 (See: libbsc.com).
|
Bzip2 (See: libbsc.com).
|
||||||
|
|
||||||
adapt - Adaptive mode where ppmd or bzip2 will be used per chunk,
|
adapt - Adaptive mode where ppmd or bzip2 will be used per chunk,
|
||||||
depending on which one produces better compression. This mode
|
depending on heuristics. If at least 50% of the input data is
|
||||||
is obviously fairly slow and requires lots of memory.
|
7-bit text then PPMd will be used, otherwise Bzip2.
|
||||||
adapt2 - Adaptive mode which includes ppmd and lzma. This requires
|
adapt2 - Adaptive mode which includes ppmd and lzma. If at least 80% of
|
||||||
more memory than adapt mode, is slower and potentially gives
|
the input data is 7-bit text then PPMd will be used, otherwise
|
||||||
the best compression.
|
LZMA. It uses significantly more memory than adapt.
|
||||||
none - No compression. This is only meaningful with -D and -E so Dedupe
|
none - No compression. This is only meaningful with -D and -E so Dedupe
|
||||||
can be done for post-processing with an external utility.
|
can be done for post-processing with an external utility.
|
||||||
<chunk_size> - This can be in bytes or can use the following suffixes:
|
<chunk_size> - This can be in bytes or can use the following suffixes:
|
||||||
|
|
|
@ -35,6 +35,9 @@
|
||||||
#include <pcompress.h>
|
#include <pcompress.h>
|
||||||
#include <allocator.h>
|
#include <allocator.h>
|
||||||
|
|
||||||
|
#define FIFTY_PCT(x) (((x)/10) * 5)
|
||||||
|
#define TWENTY_PCT(x) (((x)/10) * 2)
|
||||||
|
|
||||||
static unsigned int lzma_count = 0;
|
static unsigned int lzma_count = 0;
|
||||||
static unsigned int bzip2_count = 0;
|
static unsigned int bzip2_count = 0;
|
||||||
static unsigned int ppmd_count = 0;
|
static unsigned int ppmd_count = 0;
|
||||||
|
@ -141,34 +144,34 @@ adapt_compress(void *src, size_t srclen, void *dst,
|
||||||
{
|
{
|
||||||
struct adapt_data *adat = (struct adapt_data *)(data);
|
struct adapt_data *adat = (struct adapt_data *)(data);
|
||||||
uchar_t *src1 = (uchar_t *)src;
|
uchar_t *src1 = (uchar_t *)src;
|
||||||
size_t i, bincount;
|
size_t i, tot8b;
|
||||||
int rv;
|
int rv;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Count number of 8-bit binary bytes in source.
|
* Count number of 8-bit binary bytes in source.
|
||||||
*/
|
*/
|
||||||
bincount = 0;
|
tot8b = 0;
|
||||||
for (i = 0; i < srclen; i++)
|
for (i = 0; i < srclen; i++)
|
||||||
bincount += (src1[i] >> 7);
|
tot8b += (src1[i] >> 7);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Use PPMd if at least 70% of source is 7-bit textual bytes, otherwise
|
* Use PPMd if enough of the source is 7-bit textual bytes (8-bit bytes at most 50% in adapt mode 1, at most 20% in adapt mode 2), otherwise
|
||||||
* use Bzip2 or LZMA.
|
* use Bzip2 or LZMA.
|
||||||
*/
|
*/
|
||||||
if (bincount > (srclen / 10 * 3)) {
|
if (adat->adapt_mode == 2 && tot8b > TWENTY_PCT(srclen)) {
|
||||||
if (adat->adapt_mode == 2) {
|
|
||||||
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
|
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
|
||||||
if (rv < 0)
|
if (rv < 0)
|
||||||
return (rv);
|
return (rv);
|
||||||
rv = COMPRESS_LZMA;
|
rv = COMPRESS_LZMA;
|
||||||
lzma_count++;
|
lzma_count++;
|
||||||
} else {
|
|
||||||
|
} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
|
||||||
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
|
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
|
||||||
if (rv < 0)
|
if (rv < 0)
|
||||||
return (rv);
|
return (rv);
|
||||||
rv = COMPRESS_BZIP2;
|
rv = COMPRESS_BZIP2;
|
||||||
bzip2_count++;
|
bzip2_count++;
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
|
rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
|
||||||
if (rv < 0)
|
if (rv < 0)
|
||||||
|
|
12
main.c
12
main.c
|
@ -265,7 +265,7 @@ perform_decompress(void *dat)
|
||||||
{
|
{
|
||||||
struct cmp_data *tdat = (struct cmp_data *)dat;
|
struct cmp_data *tdat = (struct cmp_data *)dat;
|
||||||
ssize_t _chunksize;
|
ssize_t _chunksize;
|
||||||
ssize_t dedupe_index_sz, rabin_data_sz, dedupe_index_sz_cmp, rabin_data_sz_cmp;
|
ssize_t dedupe_index_sz, dedupe_data_sz, dedupe_index_sz_cmp, dedupe_data_sz_cmp;
|
||||||
int type, rv;
|
int type, rv;
|
||||||
unsigned int blknum;
|
unsigned int blknum;
|
||||||
uchar_t checksum[CKSUM_MAX_BYTES];
|
uchar_t checksum[CKSUM_MAX_BYTES];
|
||||||
|
@ -305,9 +305,9 @@ redo:
|
||||||
if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) {
|
if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) {
|
||||||
uchar_t *cmpbuf, *ubuf;
|
uchar_t *cmpbuf, *ubuf;
|
||||||
|
|
||||||
/* Extract various sizes from rabin header. */
|
/* Extract various sizes from dedupe header. */
|
||||||
parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &rabin_data_sz,
|
parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &dedupe_data_sz,
|
||||||
&dedupe_index_sz_cmp, &rabin_data_sz_cmp, &_chunksize);
|
&dedupe_index_sz_cmp, &dedupe_data_sz_cmp, &_chunksize);
|
||||||
memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE);
|
memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -320,10 +320,10 @@ redo:
|
||||||
ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + dedupe_index_sz;
|
ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + dedupe_index_sz;
|
||||||
if (HDR & COMPRESSED) {
|
if (HDR & COMPRESSED) {
|
||||||
if (HDR & CHUNK_FLAG_PREPROC) {
|
if (HDR & CHUNK_FLAG_PREPROC) {
|
||||||
rv = preproc_decompress(tdat->decompress, cmpbuf, rabin_data_sz_cmp,
|
rv = preproc_decompress(tdat->decompress, cmpbuf, dedupe_data_sz_cmp,
|
||||||
ubuf, &_chunksize, tdat->level, HDR, tdat->data);
|
ubuf, &_chunksize, tdat->level, HDR, tdat->data);
|
||||||
} else {
|
} else {
|
||||||
rv = tdat->decompress(cmpbuf, rabin_data_sz_cmp, ubuf, &_chunksize,
|
rv = tdat->decompress(cmpbuf, dedupe_data_sz_cmp, ubuf, &_chunksize,
|
||||||
tdat->level, HDR, tdat->data);
|
tdat->level, HDR, tdat->data);
|
||||||
}
|
}
|
||||||
if (rv == -1) {
|
if (rv == -1) {
|
||||||
|
|
|
@ -130,9 +130,6 @@ ppmd_decompress(void *src, size_t srclen, void *dst,
|
||||||
size_t i;
|
size_t i;
|
||||||
int res;
|
int res;
|
||||||
|
|
||||||
if (*((char *)_src) < 2)
|
|
||||||
return (-1);
|
|
||||||
|
|
||||||
_ppmd->buf = (Byte *)_src;
|
_ppmd->buf = (Byte *)_src;
|
||||||
_ppmd->bufLen = srclen;
|
_ppmd->bufLen = srclen;
|
||||||
_ppmd->bufUsed = 0;
|
_ppmd->bufUsed = 0;
|
||||||
|
|
|
@ -707,20 +707,20 @@ cont:
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_sz_cmp)
|
update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t dedupe_data_sz_cmp)
|
||||||
{
|
{
|
||||||
ssize_t *entries;
|
ssize_t *entries;
|
||||||
|
|
||||||
buf += sizeof (uint32_t);
|
buf += sizeof (uint32_t);
|
||||||
entries = (ssize_t *)buf;
|
entries = (ssize_t *)buf;
|
||||||
entries[1] = htonll(dedupe_index_sz_cmp);
|
entries[1] = htonll(dedupe_index_sz_cmp);
|
||||||
entries[3] = htonll(rabin_data_sz_cmp);
|
entries[3] = htonll(dedupe_data_sz_cmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
|
parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
|
||||||
ssize_t *rabin_data_sz, ssize_t *dedupe_index_sz_cmp,
|
ssize_t *dedupe_data_sz, ssize_t *dedupe_index_sz_cmp,
|
||||||
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size)
|
ssize_t *dedupe_data_sz_cmp, ssize_t *deduped_size)
|
||||||
{
|
{
|
||||||
ssize_t *entries;
|
ssize_t *entries;
|
||||||
|
|
||||||
|
@ -728,11 +728,11 @@ parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
|
||||||
buf += sizeof (uint32_t);
|
buf += sizeof (uint32_t);
|
||||||
|
|
||||||
entries = (ssize_t *)buf;
|
entries = (ssize_t *)buf;
|
||||||
*rabin_data_sz = ntohll(entries[0]);
|
*dedupe_data_sz = ntohll(entries[0]);
|
||||||
*dedupe_index_sz = (ssize_t)(*blknum) * RABIN_ENTRY_SIZE;
|
*dedupe_index_sz = (ssize_t)(*blknum) * RABIN_ENTRY_SIZE;
|
||||||
*dedupe_index_sz_cmp = ntohll(entries[1]);
|
*dedupe_index_sz_cmp = ntohll(entries[1]);
|
||||||
*rabin_deduped_size = ntohll(entries[2]);
|
*deduped_size = ntohll(entries[2]);
|
||||||
*rabin_data_sz_cmp = ntohll(entries[3]);
|
*dedupe_data_sz_cmp = ntohll(entries[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
|
@ -165,11 +165,11 @@ extern void destroy_dedupe_context(dedupe_context_t *ctx);
|
||||||
extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf,
|
extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf,
|
||||||
ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
|
ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
|
||||||
extern void dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size);
|
extern void dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size);
|
||||||
extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_index_sz,
|
extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *dedupe_index_sz,
|
||||||
ssize_t *rabin_data_sz, ssize_t *rabin_index_sz_cmp,
|
ssize_t *dedupe_data_sz, ssize_t *rabin_index_sz_cmp,
|
||||||
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size);
|
ssize_t *dedupe_data_sz_cmp, ssize_t *deduped_size);
|
||||||
extern void update_dedupe_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp,
|
extern void update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp,
|
||||||
ssize_t rabin_data_sz_cmp);
|
ssize_t dedupe_data_sz_cmp);
|
||||||
extern void reset_dedupe_context(dedupe_context_t *ctx);
|
extern void reset_dedupe_context(dedupe_context_t *ctx);
|
||||||
extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
|
extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
|
||||||
int delta_flag);
|
int delta_flag);
|
||||||
|
|
Loading…
Reference in a new issue