Update adaptive mode heuristic to choose the algorithm from the share of 7-bit text in each chunk.

Remove incorrect check in PPMd decompression code.
More refactoring of variable names (several rabin_* variables and parameters renamed to dedupe_*).
This commit is contained in:
Moinak Ghosh 2012-09-27 22:29:08 +05:30
parent 449dc35675
commit 8f8af7ed6b
6 changed files with 44 additions and 44 deletions

View file

@ -52,11 +52,11 @@ Usage
Bzip2 (See: libbsc.com). Bzip2 (See: libbsc.com).
adapt - Adaptive mode where ppmd or bzip2 will be used per chunk, adapt - Adaptive mode where ppmd or bzip2 will be used per chunk,
depending on which one produces better compression. This mode depending on heuristics. If at least 50% of the input data is
is obviously fairly slow and requires lots of memory. 7-bit text then PPMd will be used otherwise Bzip2.
adapt2 - Adaptive mode which includes ppmd and lzma. This requires adapt2 - Adaptive mode which includes ppmd and lzma. If at least 80% of
more memory than adapt mode, is slower and potentially gives the input data is 7-bit text then PPMd will be used otherwise
the best compression. LZMA. It has significantly more memory usage than adapt.
none - No compression. This is only meaningful with -D and -E so Dedupe none - No compression. This is only meaningful with -D and -E so Dedupe
can be done for post-processing with an external utility. can be done for post-processing with an external utility.
<chunk_size> - This can be in bytes or can use the following suffixes: <chunk_size> - This can be in bytes or can use the following suffixes:

View file

@ -35,6 +35,9 @@
#include <pcompress.h> #include <pcompress.h>
#include <allocator.h> #include <allocator.h>
#define FIFTY_PCT(x) (((x)/10) * 5)
#define TWENTY_PCT(x) (((x)/10) * 2)
static unsigned int lzma_count = 0; static unsigned int lzma_count = 0;
static unsigned int bzip2_count = 0; static unsigned int bzip2_count = 0;
static unsigned int ppmd_count = 0; static unsigned int ppmd_count = 0;
@ -141,34 +144,34 @@ adapt_compress(void *src, size_t srclen, void *dst,
{ {
struct adapt_data *adat = (struct adapt_data *)(data); struct adapt_data *adat = (struct adapt_data *)(data);
uchar_t *src1 = (uchar_t *)src; uchar_t *src1 = (uchar_t *)src;
size_t i, bincount; size_t i, tot8b;
int rv; int rv;
/* /*
* Count number of 8-bit binary bytes in source. * Count number of 8-bit binary bytes in source.
*/ */
bincount = 0; tot8b = 0;
for (i = 0; i < srclen; i++) for (i = 0; i < srclen; i++)
bincount += (src1[i] >> 7); tot8b += (src1[i] >> 7);
/* /*
* Use PPMd if at least 70% of source is 7-bit textual bytes, otherwise * Use PPMd if some percentage of source is 7-bit textual bytes, otherwise
* use Bzip2 or LZMA. * use Bzip2 or LZMA.
*/ */
if (bincount > (srclen / 10 * 3)) { if (adat->adapt_mode == 2 && tot8b > TWENTY_PCT(srclen)) {
if (adat->adapt_mode == 2) {
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data); rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
if (rv < 0) if (rv < 0)
return (rv); return (rv);
rv = COMPRESS_LZMA; rv = COMPRESS_LZMA;
lzma_count++; lzma_count++;
} else {
} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL); rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
if (rv < 0) if (rv < 0)
return (rv); return (rv);
rv = COMPRESS_BZIP2; rv = COMPRESS_BZIP2;
bzip2_count++; bzip2_count++;
}
} else { } else {
rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data); rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
if (rv < 0) if (rv < 0)

12
main.c
View file

@ -265,7 +265,7 @@ perform_decompress(void *dat)
{ {
struct cmp_data *tdat = (struct cmp_data *)dat; struct cmp_data *tdat = (struct cmp_data *)dat;
ssize_t _chunksize; ssize_t _chunksize;
ssize_t dedupe_index_sz, rabin_data_sz, dedupe_index_sz_cmp, rabin_data_sz_cmp; ssize_t dedupe_index_sz, dedupe_data_sz, dedupe_index_sz_cmp, dedupe_data_sz_cmp;
int type, rv; int type, rv;
unsigned int blknum; unsigned int blknum;
uchar_t checksum[CKSUM_MAX_BYTES]; uchar_t checksum[CKSUM_MAX_BYTES];
@ -305,9 +305,9 @@ redo:
if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) { if ((enable_rabin_scan || enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) {
uchar_t *cmpbuf, *ubuf; uchar_t *cmpbuf, *ubuf;
/* Extract various sizes from rabin header. */ /* Extract various sizes from dedupe header. */
parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &rabin_data_sz, parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &dedupe_data_sz,
&dedupe_index_sz_cmp, &rabin_data_sz_cmp, &_chunksize); &dedupe_index_sz_cmp, &dedupe_data_sz_cmp, &_chunksize);
memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE); memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE);
/* /*
@ -320,10 +320,10 @@ redo:
ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + dedupe_index_sz; ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + dedupe_index_sz;
if (HDR & COMPRESSED) { if (HDR & COMPRESSED) {
if (HDR & CHUNK_FLAG_PREPROC) { if (HDR & CHUNK_FLAG_PREPROC) {
rv = preproc_decompress(tdat->decompress, cmpbuf, rabin_data_sz_cmp, rv = preproc_decompress(tdat->decompress, cmpbuf, dedupe_data_sz_cmp,
ubuf, &_chunksize, tdat->level, HDR, tdat->data); ubuf, &_chunksize, tdat->level, HDR, tdat->data);
} else { } else {
rv = tdat->decompress(cmpbuf, rabin_data_sz_cmp, ubuf, &_chunksize, rv = tdat->decompress(cmpbuf, dedupe_data_sz_cmp, ubuf, &_chunksize,
tdat->level, HDR, tdat->data); tdat->level, HDR, tdat->data);
} }
if (rv == -1) { if (rv == -1) {

View file

@ -130,9 +130,6 @@ ppmd_decompress(void *src, size_t srclen, void *dst,
size_t i; size_t i;
int res; int res;
if (*((char *)_src) < 2)
return (-1);
_ppmd->buf = (Byte *)_src; _ppmd->buf = (Byte *)_src;
_ppmd->bufLen = srclen; _ppmd->bufLen = srclen;
_ppmd->bufUsed = 0; _ppmd->bufUsed = 0;

View file

@ -707,20 +707,20 @@ cont:
} }
void void
update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t rabin_data_sz_cmp) update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp, ssize_t dedupe_data_sz_cmp)
{ {
ssize_t *entries; ssize_t *entries;
buf += sizeof (uint32_t); buf += sizeof (uint32_t);
entries = (ssize_t *)buf; entries = (ssize_t *)buf;
entries[1] = htonll(dedupe_index_sz_cmp); entries[1] = htonll(dedupe_index_sz_cmp);
entries[3] = htonll(rabin_data_sz_cmp); entries[3] = htonll(dedupe_data_sz_cmp);
} }
void void
parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz, parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
ssize_t *rabin_data_sz, ssize_t *dedupe_index_sz_cmp, ssize_t *dedupe_data_sz, ssize_t *dedupe_index_sz_cmp,
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size) ssize_t *dedupe_data_sz_cmp, ssize_t *deduped_size)
{ {
ssize_t *entries; ssize_t *entries;
@ -728,11 +728,11 @@ parse_dedupe_hdr(uchar_t *buf, uint32_t *blknum, ssize_t *dedupe_index_sz,
buf += sizeof (uint32_t); buf += sizeof (uint32_t);
entries = (ssize_t *)buf; entries = (ssize_t *)buf;
*rabin_data_sz = ntohll(entries[0]); *dedupe_data_sz = ntohll(entries[0]);
*dedupe_index_sz = (ssize_t)(*blknum) * RABIN_ENTRY_SIZE; *dedupe_index_sz = (ssize_t)(*blknum) * RABIN_ENTRY_SIZE;
*dedupe_index_sz_cmp = ntohll(entries[1]); *dedupe_index_sz_cmp = ntohll(entries[1]);
*rabin_deduped_size = ntohll(entries[2]); *deduped_size = ntohll(entries[2]);
*rabin_data_sz_cmp = ntohll(entries[3]); *dedupe_data_sz_cmp = ntohll(entries[3]);
} }
void void

View file

@ -165,11 +165,11 @@ extern void destroy_dedupe_context(dedupe_context_t *ctx);
extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf, extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf,
ssize_t *size, ssize_t offset, ssize_t *rabin_pos); ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
extern void dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size); extern void dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size);
extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_index_sz, extern void parse_dedupe_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *dedupe_index_sz,
ssize_t *rabin_data_sz, ssize_t *rabin_index_sz_cmp, ssize_t *dedupe_data_sz, ssize_t *rabin_index_sz_cmp,
ssize_t *rabin_data_sz_cmp, ssize_t *rabin_deduped_size); ssize_t *dedupe_data_sz_cmp, ssize_t *deduped_size);
extern void update_dedupe_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp, extern void update_dedupe_hdr(uchar_t *buf, ssize_t dedupe_index_sz_cmp,
ssize_t rabin_data_sz_cmp); ssize_t dedupe_data_sz_cmp);
extern void reset_dedupe_context(dedupe_context_t *ctx); extern void reset_dedupe_context(dedupe_context_t *ctx);
extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
int delta_flag); int delta_flag);