Add measurements for Chunking properties.

This commit is contained in:
Moinak Ghosh 2013-06-20 22:08:07 +05:30
parent 6432c76b4b
commit 916f31d62b
3 changed files with 59 additions and 19 deletions

View file

@ -1103,12 +1103,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, const char *to_filename)
} }
} }
nprocs = sysconf(_SC_NPROCESSORS_ONLN); nprocs = 1;
if (pctx->nthreads > 0 && pctx->nthreads < nprocs)
nprocs = pctx->nthreads;
else
pctx->nthreads = nprocs;
set_threadcounts(&props, &(pctx->nthreads), nprocs, DECOMPRESS_THREADS); set_threadcounts(&props, &(pctx->nthreads), nprocs, DECOMPRESS_THREADS);
if (props.is_single_chunk) if (props.is_single_chunk)
pctx->nthreads = 1; pctx->nthreads = 1;
@ -1662,7 +1657,8 @@ repeat:
pctx->avg_chunk += tdat->len_cmp; pctx->avg_chunk += tdat->len_cmp;
} }
wbytes = Write(w->wfd, tdat->cmp_seg, tdat->len_cmp); //wbytes = Write(w->wfd, tdat->cmp_seg, tdat->len_cmp);
wbytes = tdat->len_cmp;
if (unlikely(wbytes != tdat->len_cmp)) { if (unlikely(wbytes != tdat->len_cmp)) {
perror("Chunk Write: "); perror("Chunk Write: ");
do_cancel: do_cancel:
@ -1785,12 +1781,7 @@ start_compress(pc_ctx_t *pctx, const char *filename, uint64_t chunksize, int lev
thread = 0; thread = 0;
single_chunk = 0; single_chunk = 0;
rctx = NULL; rctx = NULL;
nprocs = 1;
nprocs = sysconf(_SC_NPROCESSORS_ONLN);
if (pctx->nthreads > 0 && pctx->nthreads < nprocs)
nprocs = pctx->nthreads;
else
pctx->nthreads = nprocs;
/* A host of sanity checks. */ /* A host of sanity checks. */
if (!pctx->pipe_mode) { if (!pctx->pipe_mode) {
@ -2387,6 +2378,7 @@ comp_done:
if (uncompfd != -1) close(uncompfd); if (uncompfd != -1) close(uncompfd);
} }
dump_frequencies();
if (!pctx->hide_cmp_stats) show_compression_stats(pctx); if (!pctx->hide_cmp_stats) show_compression_stats(pctx);
pctx->_stats_func(!pctx->hide_cmp_stats); pctx->_stats_func(!pctx->hide_cmp_stats);
@ -2726,6 +2718,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
optind = 0; optind = 0;
pthread_mutex_unlock(&opt_parse); pthread_mutex_unlock(&opt_parse);
pctx->nthreads = 1;
if ((pctx->do_compress && pctx->do_uncompress) || (!pctx->do_compress && !pctx->do_uncompress)) { if ((pctx->do_compress && pctx->do_uncompress) || (!pctx->do_compress && !pctx->do_uncompress)) {
return (2); return (2);
} }

View file

@ -114,6 +114,11 @@ uint64_t ir[256], out[256];
static int inited = 0; static int inited = 0;
archive_config_t *arc = NULL; archive_config_t *arc = NULL;
/*
 * Chunking measurement state, printed by dump_frequencies().
 * NOTE(review): these are non-static globals updated from dedupe_compress()
 * with no locking visible here - confirm they are only touched by one thread.
 */
/* Histogram indexed by chunk length: freqs[len] counts the chunks of that length. */
uint64_t freqs[RAB_POLYNOMIAL_MAX_BLOCK_SIZE+1];
/* Running count of chunks produced by dedupe_compress(). */
uint64_t tot_chunks = 0;
/* Running sum of the lengths of those chunks. */
uint64_t tot_size = 0;
/* Accumulated chunking time, summed from get_wtime_millis() differences. */
double tot_time = 0;
static uint32_t static uint32_t
dedupe_min_blksz(int rab_blk_sz) dedupe_min_blksz(int rab_blk_sz)
{ {
@ -132,6 +137,34 @@ dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta
return ((chunksize / dedupe_min_blksz(rab_blk_sz)) * sizeof (uint32_t)); return ((chunksize / dedupe_min_blksz(rab_blk_sz)) * sizeof (uint32_t));
} }
void
dump_frequencies()
{
int i, j;
uint64_t tot;
double tot_c, tot_s, bytes_sec;
printf("\nChunk Frequency Distribution\n");
printf("====================================\n");
for (i = 1; i <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE;) {
tot = 0;
for (j = 0; j < 4096; j++) tot += freqs[i++];
if (tot > 0)
printf("%3d KB: %" PRIu64 "\n", i/1024, tot);
}
printf("====================================\n");
printf("Number of chunks : %" PRIu64 "\n", tot_chunks);
tot_c = tot_chunks;
tot_s = tot_size;
printf("Average chunk size: %.2F Bytes\n", tot_s / tot_c);
bytes_sec = tot_s / tot_time * 1000;
printf("Average chunking speed: %.3f MB/s\n", BYTES_TO_MB(bytes_sec));
printf("====================================\n");
}
/* /*
* Helper function to let caller size the the user specific compression chunk/segment * Helper function to let caller size the the user specific compression chunk/segment
* to align with deduplication requirements. * to align with deduplication requirements.
@ -185,6 +218,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
int term, pow, j; int term, pow, j;
uint64_t val, poly_pow; uint64_t val, poly_pow;
memset(freqs, 0, sizeof (freqs));
poly_pow = 1; poly_pow = 1;
for (j = 0; j < RAB_POLYNOMIAL_WIN_SIZE; j++) { for (j = 0; j < RAB_POLYNOMIAL_WIN_SIZE; j++) {
poly_pow = (poly_pow * RAB_POLYNOMIAL_CONST) & POLY_MASK; poly_pow = (poly_pow * RAB_POLYNOMIAL_CONST) & POLY_MASK;
@ -478,9 +512,9 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
uint32_t *ctx_heap; uint32_t *ctx_heap;
rabin_blockentry_t **htab; rabin_blockentry_t **htab;
MinHeap heap; MinHeap heap;
DEBUG_STAT_EN(uint32_t max_count); DEBUG_STAT_EN(uint32_t max_count = 0);
DEBUG_STAT_EN(max_count = 0); DEBUG_STAT_EN(double en);
DEBUG_STAT_EN(double strt, en_1, en); double strt, en_1;
length = offset; length = offset;
last_offset = 0; last_offset = 0;
@ -489,7 +523,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
ctx->valid = 0; ctx->valid = 0;
cur_roll_checksum = 0; cur_roll_checksum = 0;
if (*size < ctx->rabin_poly_avg_block_size) return (0); if (*size < ctx->rabin_poly_avg_block_size) return (0);
DEBUG_STAT_EN(strt = get_wtime_millis()); strt = get_wtime_millis();
if (ctx->dedupe_flag == RABIN_DEDUPE_FIXED) { if (ctx->dedupe_flag == RABIN_DEDUPE_FIXED) {
blknum = *size / ctx->rabin_poly_avg_block_size; blknum = *size / ctx->rabin_poly_avg_block_size;
@ -516,7 +550,12 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
ctx->blocks[i]->hash = XXH32(buf1+last_offset, length, 0); ctx->blocks[i]->hash = XXH32(buf1+last_offset, length, 0);
ctx->blocks[i]->similarity_hash = ctx->blocks[i]->hash; ctx->blocks[i]->similarity_hash = ctx->blocks[i]->hash;
last_offset += length; last_offset += length;
tot_chunks++;
tot_size += length;
} }
en_1 = get_wtime_millis();
tot_time += en_1 - strt;
for (i=0; i<blknum; i++) freqs[ctx->blocks[i]->length]++;
goto process_blocks; goto process_blocks;
} }
@ -659,6 +698,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
ctx->g_blocks[blknum].length = length; ctx->g_blocks[blknum].length = length;
ctx->g_blocks[blknum].offset = last_offset; ctx->g_blocks[blknum].offset = last_offset;
} }
tot_chunks++;
tot_size += length;
DEBUG_STAT_EN(if (length >= ctx->rabin_poly_max_block_size) ++max_count); DEBUG_STAT_EN(if (length >= ctx->rabin_poly_max_block_size) ++max_count);
/* /*
@ -708,6 +749,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
ctx->g_blocks[blknum].offset = last_offset; ctx->g_blocks[blknum].offset = last_offset;
} }
tot_chunks++;
tot_size += length;
if (ctx->delta_flag) { if (ctx->delta_flag) {
uint64_t cur_sketch; uint64_t cur_sketch;
uint64_t pc[4]; uint64_t pc[4];
@ -735,7 +778,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
process_blocks: process_blocks:
// If we found at least a few chunks, perform dedup. // If we found at least a few chunks, perform dedup.
DEBUG_STAT_EN(en_1 = get_wtime_millis()); en_1 = get_wtime_millis();
tot_time += en_1 - strt;
DEBUG_STAT_EN(fprintf(stderr, "Original size: %" PRId64 ", blknum: %u\n", *size, blknum)); DEBUG_STAT_EN(fprintf(stderr, "Original size: %" PRId64 ", blknum: %u\n", *size, blknum));
DEBUG_STAT_EN(fprintf(stderr, "Number of maxlen blocks: %u\n", max_count)); DEBUG_STAT_EN(fprintf(stderr, "Number of maxlen blocks: %u\n", max_count));
if (blknum <=2 && ctx->arc) { if (blknum <=2 && ctx->arc) {
@ -773,6 +817,7 @@ process_blocks:
ctx->g_blocks[i].length, 0, 0); ctx->g_blocks[i].length, 0, 0);
} }
for (i=0; i<blknum; i++) freqs[ctx->g_blocks[i].length]++;
/* /*
* Index table within this segment. * Index table within this segment.
*/ */
@ -1288,6 +1333,7 @@ next_ent:
} }
} }
for (i=0; i<blknum; i++) freqs[ctx->blocks[i]->length]++;
ary_sz = (blknum << 1) * sizeof (rabin_blockentry_t *); ary_sz = (blknum << 1) * sizeof (rabin_blockentry_t *);
htab = (rabin_blockentry_t **)(ctx->cbuf + ctx->real_chunksize - ary_sz); htab = (rabin_blockentry_t **)(ctx->cbuf + ctx->real_chunksize - ary_sz);
memset(htab, 0, ary_sz); memset(htab, 0, ary_sz);

View file

@ -80,7 +80,7 @@
#define RAB_POLYNOMIAL_WIN_SIZE 16 #define RAB_POLYNOMIAL_WIN_SIZE 16
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 8 #define RAB_POLYNOMIAL_MIN_WIN_SIZE 8
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 64 #define RAB_POLYNOMIAL_MAX_WIN_SIZE 64
#define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024) #define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (64 * 1024)
#define RAB_BLK_MASK (((1 << RAB_BLK_MIN_BITS) - 1) >> 1) #define RAB_BLK_MASK (((1 << RAB_BLK_MIN_BITS) - 1) >> 1)
#define RAB_BLK_AVG_SZ(x) (1 << ((x) + RAB_BLK_MIN_BITS)) #define RAB_BLK_AVG_SZ(x) (1 << ((x) + RAB_BLK_MIN_BITS))
@ -206,5 +206,6 @@ extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char
extern int global_dedupe_bufadjust(uint32_t rab_blk_sz, uint64_t *user_chunk_sz, int pct_interval, extern int global_dedupe_bufadjust(uint32_t rab_blk_sz, uint64_t *user_chunk_sz, int pct_interval,
const char *algo, cksum_t ck, cksum_t ck_sim, size_t file_sz, const char *algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
size_t memlimit, int nthreads, int pipe_mode); size_t memlimit, int nthreads, int pipe_mode);
/* Prints the chunk length histogram and chunking totals gathered during dedupe to stdout. */
extern void dump_frequencies();
#endif /* _RABIN_POLY_H_ */ #endif /* _RABIN_POLY_H_ */