Update adaptive mode heuristic based on algorithms.
Remove incorrect check in PPMd decompression code. More refactoring of variable names.
This commit is contained in:
parent
449dc35675
commit
8f8af7ed6b
6 changed files with 44 additions and 44 deletions
10
README.md
10
README.md
|
@ -52,11 +52,11 @@ Usage
|
|||
Bzip2 (See: libbsc.com).
|
||||
|
||||
adapt - Adaptive mode where ppmd or bzip2 will be used per chunk,
|
||||
depending on which one produces better compression. This mode
|
||||
is obviously fairly slow and requires lots of memory.
|
||||
adapt2 - Adaptive mode which includes ppmd and lzma. This requires
|
||||
more memory than adapt mode, is slower and potentially gives
|
||||
the best compression.
|
||||
depending on heuristics. If at least 50% of the input data is
|
||||
7-bit text then PPMd will be used, otherwise Bzip2.
|
||||
adapt2 - Adaptive mode which includes ppmd and lzma. If at least 80% of
|
||||
the input data is 7-bit text then PPMd will be used, otherwise
|
||||
LZMA. It uses significantly more memory than adapt.
|
||||
none - No compression. This is only meaningful with -D and -E so Dedupe
|
||||
can be done for post-processing with an external utility.
|
||||
<chunk_size> - This can be in bytes or can use the following suffixes:
|
||||
|
|
|
@ -35,6 +35,9 @@
|
|||
#include <pcompress.h>
|
||||
#include <allocator.h>
|
||||
|
||||
#define FIFTY_PCT(x) (((x)/10) * 5)
|
||||
#define TWENTY_PCT(x) (((x)/10) * 2)
|
||||
|
||||
static unsigned int lzma_count = 0;
|
||||
static unsigned int bzip2_count = 0;
|
||||
static unsigned int ppmd_count = 0;
|
||||
|
@ -141,34 +144,34 @@ adapt_compress(void *src, size_t srclen, void *dst,
|
|||
{
|
||||
struct adapt_data *adat = (struct adapt_data *)(data);
|
||||
uchar_t *src1 = (uchar_t *)src;
|
||||
size_t i, bincount;
|
||||
size_t i, tot8b;
|
||||
int rv;
|
||||
|
||||
/*
|
||||
* Count number of 8-bit binary bytes in source.
|
||||
*/
|
||||
bincount = 0;
|
||||
tot8b = 0;
|
||||
for (i = 0; i < srclen; i++)
|
||||
bincount += (src1[i] >> 7);
|
||||
tot8b += (src1[i] >> 7);
|
||||
|
||||
/*
|
||||
* Use PPMd if at least 70% of source is 7-bit textual bytes, otherwise
|
||||
* Use PPMd if 8-bit bytes make up no more than ~50% (adapt) or ~20% (adapt2) of the source, otherwise
|
||||
* use Bzip2 or LZMA.
|
||||
*/
|
||||
if (bincount > (srclen / 10 * 3)) {
|
||||
if (adat->adapt_mode == 2) {
|
||||
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
|
||||
if (rv < 0)
|
||||
return (rv);
|
||||
rv = COMPRESS_LZMA;
|
||||
lzma_count++;
|
||||
} else {
|
||||
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
|
||||
if (rv < 0)
|
||||
return (rv);
|
||||
rv = COMPRESS_BZIP2;
|
||||
bzip2_count++;
|
||||
}
|
||||
if (adat->adapt_mode == 2 && tot8b > TWENTY_PCT(srclen)) {
|
||||
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
|
||||
if (rv < 0)
|
||||
return (rv);
|
||||
rv = COMPRESS_LZMA;
|
||||
lzma_count++;
|
||||
|
||||
} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
|
||||
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
|
||||
if (rv < 0)
|
||||
return (rv);
|
||||
rv = COMPRESS_BZIP2;
|
||||
bzip2_count++;
|
||||
|
||||
} else {
|
||||
rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
|
||||
if (rv < 0)
|
||||
|
|
12
main.c
12
main.c
|
@ -265,7 +265,7 @@ perform_decompress(void *dat)
|
|||
{
|
||||
struct cmp_data *tdat = (struct cmp_data *)dat;
|
||||
ssize_t _chunksize;
|
||||
ssize_t dedupe_index_sz, rabin_data_sz, dedupe_index_sz_cmp, rabin_data_sz_cmp;
|
||||
ssize_t dedupe_index_sz, dedupe_data_sz, dedupe_index_sz_cmp, dedupe_data_sz_cmp;
|
||||
int type, rv;
|
||||
unsigned int blknum;
|
||||
uchar_t checksum[CKSUM_MAX_BYTES];
|
||||
|
@ -305,9 +305,9 @@ redo:
|
|||
if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) {
|
||||
uchar_t *cmpbuf, *ubuf;
|
||||
|
||||
/* Extract various sizes from rabin header. */
|
||||
parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &rabin_data_sz,
|
||||
&dedupe_index_sz_cmp, &rabin_data_sz_cmp, &_chunksize);
|
||||
/* Extract various sizes from dedupe header. */
|
||||
parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &dedupe_data_sz,
|
||||
&dedupe_index_sz_cmp, &dedupe_data_sz_cmp, &_chunksize);
|
||||
memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE);
|
||||
|
||||
/*
|
||||
|
@ -320,10 +320,10 @@ redo:
|
|||
ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + dedupe_index_sz;
|
||||
if (HDR & COMPRESSED) {
|
||||
if (HDR & CHUNK_FLAG_PREPROC) {
|
||||
rv = preproc_decompress(tdat->decompress, cmpbuf, rabin_data_sz_cmp,
|
||||
rv = preproc_decompress(tdat->decompress, cmpbuf, dedupe_data_sz_cmp,
|
||||
ubuf, &_chunksize, tdat->level, HDR, tdat->data);
|
||||
} else {
|
||||
rv = tdat->decompress(cmpbuf, rabin_data_sz_cmp, ubuf, &_chunksize,
|
||||
rv = tdat->decompress(cmpbuf, dedupe_data_sz_cmp, ubuf, &_chunksize,
|
||||
tdat->level, HDR, tdat->data);
|
||||
}
|
||||
if (rv == -1) {
|
||||
|
|
|
@ -130,9 +130,6 @@ ppmd_decompress(void *src, size_t srclen, void *dst,
|
|||
size_t i;
|
||||
int res;
|
||||
|
||||
if (*((char *)_src) < 2)
|
||||
return (-1);
|
||||
|
||||
_ppmd->buf = (Byte *)_src;
|
||||
_ppmd->bufLen = srclen;
|
||||
_ppmd->bufUsed = 0;
|
||||
|
|
|
@ -707,20 +707,20 @@ cont:
|
|||
}
|
||||
|
||||
void
|
||||
update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_sz_cmp)
|
||||
update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t dedupe_data_sz_cmp)
|
||||
{
|
||||
ssize_t *entries;
|
||||
|
||||
buf += sizeof (uint32_t);
|
||||
entries = (ssize_t *)buf;
|
||||
entries[1] = htonll(dedupe_index_sz_cmp);
|
||||
entries[3] = htonll(rabin_data_sz_cmp);
|
||||
entries[3] = htonll(dedupe_data_sz_cmp);
|
||||
}
|
||||
|
||||
void
|
||||
parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
|
||||
ssize_t *rabin_data_sz, ssize_t *dedupe_index_sz_cmp,
|
||||
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size)
|
||||
ssize_t *dedupe_data_sz, ssize_t *dedupe_index_sz_cmp,
|
||||
ssize_t *dedupe_data_sz_cmp, ssize_t *deduped_size)
|
||||
{
|
||||
ssize_t *entries;
|
||||
|
||||
|
@ -728,11 +728,11 @@ parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
|
|||
buf += sizeof (uint32_t);
|
||||
|
||||
entries = (ssize_t *)buf;
|
||||
*rabin_data_sz = ntohll(entries[0]);
|
||||
*dedupe_data_sz = ntohll(entries[0]);
|
||||
*dedupe_index_sz = (ssize_t)(*blknum) * RABIN_ENTRY_SIZE;
|
||||
*dedupe_index_sz_cmp = ntohll(entries[1]);
|
||||
*rabin_deduped_size = ntohll(entries[2]);
|
||||
*rabin_data_sz_cmp = ntohll(entries[3]);
|
||||
*deduped_size = ntohll(entries[2]);
|
||||
*dedupe_data_sz_cmp = ntohll(entries[3]);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -165,11 +165,11 @@ extern void destroy_dedupe_context(dedupe_context_t *ctx);
|
|||
extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf,
|
||||
ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
|
||||
extern void dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size);
|
||||
extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_index_sz,
|
||||
ssize_t *rabin_data_sz, ssize_t *rabin_index_sz_cmp,
|
||||
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size);
|
||||
extern void update_dedupe_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp,
|
||||
ssize_t rabin_data_sz_cmp);
|
||||
extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *dedupe_index_sz,
|
||||
ssize_t *dedupe_data_sz, ssize_t *rabin_index_sz_cmp,
|
||||
ssize_t *dedupe_data_sz_cmp, ssize_t *deduped_size);
|
||||
extern void update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp,
|
||||
ssize_t dedupe_data_sz_cmp);
|
||||
extern void reset_dedupe_context(dedupe_context_t *ctx);
|
||||
extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
|
||||
int delta_flag);
|
||||
|
|
Loading…
Reference in a new issue