Update adaptive mode heuristic based on algorithms.

Remove incorrect check in PPMd decompression code.
More refactoring of variable names.
This commit is contained in:
Moinak Ghosh 2012-09-27 22:29:08 +05:30
parent 449dc35675
commit 8f8af7ed6b
6 changed files with 44 additions and 44 deletions

View file

@ -52,11 +52,11 @@ Usage
Bzip2 (See: libbsc.com).
adapt - Adaptive mode where ppmd or bzip2 will be used per chunk,
depending on which one produces better compression. This mode
is obviously fairly slow and requires lots of memory.
adapt2 - Adaptive mode which includes ppmd and lzma. This requires
more memory than adapt mode, is slower and potentially gives
the best compression.
depending on heuristics. If at least 50% of the input data is
7-bit text then PPMd will be used, otherwise Bzip2.
adapt2 - Adaptive mode which includes ppmd and lzma. If at least 80% of
the input data is 7-bit text then PPMd will be used, otherwise
LZMA. It uses significantly more memory than adapt.
none - No compression. This is only meaningful with -D and -E so Dedupe
can be done for post-processing with an external utility.
<chunk_size> - This can be in bytes or can use the following suffixes:

View file

@ -35,6 +35,9 @@
#include <pcompress.h>
#include <allocator.h>
/*
 * Integer approximations of 50% and 20% of x. The division by 10 is done
 * first, so the result is rounded down to a multiple of 5 (or 2) and is 0
 * for any x below 10.
 */
#define FIFTY_PCT(x) (5 * ((x) / 10))
#define TWENTY_PCT(x) (2 * ((x) / 10))
static unsigned int lzma_count = 0;
static unsigned int bzip2_count = 0;
static unsigned int ppmd_count = 0;
@ -141,34 +144,34 @@ adapt_compress(void *src, size_t srclen, void *dst,
{
struct adapt_data *adat = (struct adapt_data *)(data);
uchar_t *src1 = (uchar_t *)src;
size_t i, bincount;
size_t i, tot8b;
int rv;
/*
* Count number of 8-bit binary bytes in source.
*/
bincount = 0;
tot8b = 0;
for (i = 0; i < srclen; i++)
bincount += (src1[i] >> 7);
tot8b += (src1[i] >> 7);
/*
* Use PPMd if at least 70% of source is 7-bit textual bytes, otherwise
* Use PPMd when enough of the source is 7-bit textual bytes. Otherwise use
* LZMA (adapt_mode 2, more than 20% 8-bit bytes) or Bzip2 (adapt_mode 1,
* more than 50% 8-bit bytes).
*/
if (bincount > (srclen / 10 * 3)) {
if (adat->adapt_mode == 2) {
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
if (rv < 0)
return (rv);
rv = COMPRESS_LZMA;
lzma_count++;
} else {
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
if (rv < 0)
return (rv);
rv = COMPRESS_BZIP2;
bzip2_count++;
}
if (adat->adapt_mode == 2 && tot8b > TWENTY_PCT(srclen)) {
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
if (rv < 0)
return (rv);
rv = COMPRESS_LZMA;
lzma_count++;
} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
if (rv < 0)
return (rv);
rv = COMPRESS_BZIP2;
bzip2_count++;
} else {
rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
if (rv < 0)

12
main.c
View file

@ -265,7 +265,7 @@ perform_decompress(void *dat)
{
struct cmp_data *tdat = (struct cmp_data *)dat;
ssize_t _chunksize;
ssize_t dedupe_index_sz, rabin_data_sz, dedupe_index_sz_cmp, rabin_data_sz_cmp;
ssize_t dedupe_index_sz, dedupe_data_sz, dedupe_index_sz_cmp, dedupe_data_sz_cmp;
int type, rv;
unsigned int blknum;
uchar_t checksum[CKSUM_MAX_BYTES];
@ -305,9 +305,9 @@ redo:
if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) {
uchar_t *cmpbuf, *ubuf;
/* Extract various sizes from rabin header. */
parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &rabin_data_sz,
&dedupe_index_sz_cmp, &rabin_data_sz_cmp, &_chunksize);
/* Extract various sizes from dedupe header. */
parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &dedupe_data_sz,
&dedupe_index_sz_cmp, &dedupe_data_sz_cmp, &_chunksize);
memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE);
/*
@ -320,10 +320,10 @@ redo:
ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + dedupe_index_sz;
if (HDR & COMPRESSED) {
if (HDR & CHUNK_FLAG_PREPROC) {
rv = preproc_decompress(tdat->decompress, cmpbuf, rabin_data_sz_cmp,
rv = preproc_decompress(tdat->decompress, cmpbuf, dedupe_data_sz_cmp,
ubuf, &_chunksize, tdat->level, HDR, tdat->data);
} else {
rv = tdat->decompress(cmpbuf, rabin_data_sz_cmp, ubuf, &_chunksize,
rv = tdat->decompress(cmpbuf, dedupe_data_sz_cmp, ubuf, &_chunksize,
tdat->level, HDR, tdat->data);
}
if (rv == -1) {

View file

@ -130,9 +130,6 @@ ppmd_decompress(void *src, size_t srclen, void *dst,
size_t i;
int res;
if (*((char *)_src) < 2)
return (-1);
_ppmd->buf = (Byte *)_src;
_ppmd->bufLen = srclen;
_ppmd->bufUsed = 0;

View file

@ -707,20 +707,20 @@ cont:
}
void
update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_sz_cmp)
update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t dedupe_data_sz_cmp)
{
ssize_t *entries;
buf += sizeof (uint32_t);
entries = (ssize_t *)buf;
entries[1] = htonll(dedupe_index_sz_cmp);
entries[3] = htonll(rabin_data_sz_cmp);
entries[3] = htonll(dedupe_data_sz_cmp);
}
void
parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
ssize_t *rabin_data_sz, ssize_t *dedupe_index_sz_cmp,
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size)
ssize_t *dedupe_data_sz, ssize_t *dedupe_index_sz_cmp,
ssize_t *dedupe_data_sz_cmp, ssize_t *deduped_size)
{
ssize_t *entries;
@ -728,11 +728,11 @@ parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
buf += sizeof (uint32_t);
entries = (ssize_t *)buf;
*rabin_data_sz = ntohll(entries[0]);
*dedupe_data_sz = ntohll(entries[0]);
*dedupe_index_sz = (ssize_t)(*blknum) * RABIN_ENTRY_SIZE;
*dedupe_index_sz_cmp = ntohll(entries[1]);
*rabin_deduped_size = ntohll(entries[2]);
*rabin_data_sz_cmp = ntohll(entries[3]);
*deduped_size = ntohll(entries[2]);
*dedupe_data_sz_cmp = ntohll(entries[3]);
}
void

View file

@ -165,11 +165,11 @@ extern void destroy_dedupe_context(dedupe_context_t *ctx);
extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf,
ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
extern void dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size);
extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_index_sz,
ssize_t *rabin_data_sz, ssize_t *rabin_index_sz_cmp,
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size);
extern void update_dedupe_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp,
ssize_t rabin_data_sz_cmp);
/*
 * Read the size fields stored in the dedupe header at buf. The stored
 * values are converted from network byte order before being returned:
 *   dedupe_data_sz      - header entry 0
 *   dedupe_index_sz_cmp - header entry 1
 *   deduped_size        - header entry 2
 *   dedupe_data_sz_cmp  - header entry 3
 * dedupe_index_sz is not stored; it is computed as
 * (*blknum) * RABIN_ENTRY_SIZE.
 */
extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *dedupe_index_sz,
	ssize_t *dedupe_data_sz, ssize_t *dedupe_index_sz_cmp,
	ssize_t *dedupe_data_sz_cmp, ssize_t *deduped_size);
/*
 * Store the compressed index size and compressed data size into the dedupe
 * header at buf (header entries 1 and 3 respectively). Both values are
 * written in network byte order.
 */
extern void update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp,
ssize_t dedupe_data_sz_cmp);
extern void reset_dedupe_context(dedupe_context_t *ctx);
extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
int delta_flag);