Add measurements for Chunking properties.
This commit is contained in:
parent
6432c76b4b
commit
916f31d62b
3 changed files with 59 additions and 19 deletions
19
pcompress.c
19
pcompress.c
|
@ -1103,12 +1103,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, const char *to_filename)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
nprocs = sysconf(_SC_NPROCESSORS_ONLN);
|
nprocs = 1;
|
||||||
if (pctx->nthreads > 0 && pctx->nthreads < nprocs)
|
|
||||||
nprocs = pctx->nthreads;
|
|
||||||
else
|
|
||||||
pctx->nthreads = nprocs;
|
|
||||||
|
|
||||||
set_threadcounts(&props, &(pctx->nthreads), nprocs, DECOMPRESS_THREADS);
|
set_threadcounts(&props, &(pctx->nthreads), nprocs, DECOMPRESS_THREADS);
|
||||||
if (props.is_single_chunk)
|
if (props.is_single_chunk)
|
||||||
pctx->nthreads = 1;
|
pctx->nthreads = 1;
|
||||||
|
@ -1662,7 +1657,8 @@ repeat:
|
||||||
pctx->avg_chunk += tdat->len_cmp;
|
pctx->avg_chunk += tdat->len_cmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
wbytes = Write(w->wfd, tdat->cmp_seg, tdat->len_cmp);
|
//wbytes = Write(w->wfd, tdat->cmp_seg, tdat->len_cmp);
|
||||||
|
wbytes = tdat->len_cmp;
|
||||||
if (unlikely(wbytes != tdat->len_cmp)) {
|
if (unlikely(wbytes != tdat->len_cmp)) {
|
||||||
perror("Chunk Write: ");
|
perror("Chunk Write: ");
|
||||||
do_cancel:
|
do_cancel:
|
||||||
|
@ -1785,12 +1781,7 @@ start_compress(pc_ctx_t *pctx, const char *filename, uint64_t chunksize, int lev
|
||||||
thread = 0;
|
thread = 0;
|
||||||
single_chunk = 0;
|
single_chunk = 0;
|
||||||
rctx = NULL;
|
rctx = NULL;
|
||||||
|
nprocs = 1;
|
||||||
nprocs = sysconf(_SC_NPROCESSORS_ONLN);
|
|
||||||
if (pctx->nthreads > 0 && pctx->nthreads < nprocs)
|
|
||||||
nprocs = pctx->nthreads;
|
|
||||||
else
|
|
||||||
pctx->nthreads = nprocs;
|
|
||||||
|
|
||||||
/* A host of sanity checks. */
|
/* A host of sanity checks. */
|
||||||
if (!pctx->pipe_mode) {
|
if (!pctx->pipe_mode) {
|
||||||
|
@ -2387,6 +2378,7 @@ comp_done:
|
||||||
if (uncompfd != -1) close(uncompfd);
|
if (uncompfd != -1) close(uncompfd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
dump_frequencies();
|
||||||
if (!pctx->hide_cmp_stats) show_compression_stats(pctx);
|
if (!pctx->hide_cmp_stats) show_compression_stats(pctx);
|
||||||
pctx->_stats_func(!pctx->hide_cmp_stats);
|
pctx->_stats_func(!pctx->hide_cmp_stats);
|
||||||
|
|
||||||
|
@ -2726,6 +2718,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
|
||||||
optind = 0;
|
optind = 0;
|
||||||
pthread_mutex_unlock(&opt_parse);
|
pthread_mutex_unlock(&opt_parse);
|
||||||
|
|
||||||
|
pctx->nthreads = 1;
|
||||||
if ((pctx->do_compress && pctx->do_uncompress) || (!pctx->do_compress && !pctx->do_uncompress)) {
|
if ((pctx->do_compress && pctx->do_uncompress) || (!pctx->do_compress && !pctx->do_uncompress)) {
|
||||||
return (2);
|
return (2);
|
||||||
}
|
}
|
||||||
|
|
|
@ -114,6 +114,11 @@ uint64_t ir[256], out[256];
|
||||||
static int inited = 0;
|
static int inited = 0;
|
||||||
archive_config_t *arc = NULL;
|
archive_config_t *arc = NULL;
|
||||||
|
|
||||||
|
uint64_t freqs[RAB_POLYNOMIAL_MAX_BLOCK_SIZE+1];
|
||||||
|
uint64_t tot_chunks = 0;
|
||||||
|
uint64_t tot_size = 0;
|
||||||
|
double tot_time = 0;
|
||||||
|
|
||||||
static uint32_t
|
static uint32_t
|
||||||
dedupe_min_blksz(int rab_blk_sz)
|
dedupe_min_blksz(int rab_blk_sz)
|
||||||
{
|
{
|
||||||
|
@ -132,6 +137,34 @@ dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta
|
||||||
return ((chunksize / dedupe_min_blksz(rab_blk_sz)) * sizeof (uint32_t));
|
return ((chunksize / dedupe_min_blksz(rab_blk_sz)) * sizeof (uint32_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
dump_frequencies()
|
||||||
|
{
|
||||||
|
int i, j;
|
||||||
|
uint64_t tot;
|
||||||
|
double tot_c, tot_s, bytes_sec;
|
||||||
|
|
||||||
|
printf("\nChunk Frequency Distribution\n");
|
||||||
|
printf("====================================\n");
|
||||||
|
|
||||||
|
for (i = 1; i <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE;) {
|
||||||
|
tot = 0;
|
||||||
|
for (j = 0; j < 4096; j++) tot += freqs[i++];
|
||||||
|
if (tot > 0)
|
||||||
|
printf("%3d KB: %" PRIu64 "\n", i/1024, tot);
|
||||||
|
}
|
||||||
|
printf("====================================\n");
|
||||||
|
printf("Number of chunks : %" PRIu64 "\n", tot_chunks);
|
||||||
|
tot_c = tot_chunks;
|
||||||
|
tot_s = tot_size;
|
||||||
|
printf("Average chunk size: %.2F Bytes\n", tot_s / tot_c);
|
||||||
|
|
||||||
|
bytes_sec = tot_s / tot_time * 1000;
|
||||||
|
printf("Average chunking speed: %.3f MB/s\n", BYTES_TO_MB(bytes_sec));
|
||||||
|
|
||||||
|
printf("====================================\n");
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Helper function to let caller size the the user specific compression chunk/segment
|
* Helper function to let caller size the the user specific compression chunk/segment
|
||||||
* to align with deduplication requirements.
|
* to align with deduplication requirements.
|
||||||
|
@ -185,6 +218,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
|
||||||
int term, pow, j;
|
int term, pow, j;
|
||||||
uint64_t val, poly_pow;
|
uint64_t val, poly_pow;
|
||||||
|
|
||||||
|
memset(freqs, 0, sizeof (freqs));
|
||||||
poly_pow = 1;
|
poly_pow = 1;
|
||||||
for (j = 0; j < RAB_POLYNOMIAL_WIN_SIZE; j++) {
|
for (j = 0; j < RAB_POLYNOMIAL_WIN_SIZE; j++) {
|
||||||
poly_pow = (poly_pow * RAB_POLYNOMIAL_CONST) & POLY_MASK;
|
poly_pow = (poly_pow * RAB_POLYNOMIAL_CONST) & POLY_MASK;
|
||||||
|
@ -478,9 +512,9 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
||||||
uint32_t *ctx_heap;
|
uint32_t *ctx_heap;
|
||||||
rabin_blockentry_t **htab;
|
rabin_blockentry_t **htab;
|
||||||
MinHeap heap;
|
MinHeap heap;
|
||||||
DEBUG_STAT_EN(uint32_t max_count);
|
DEBUG_STAT_EN(uint32_t max_count = 0);
|
||||||
DEBUG_STAT_EN(max_count = 0);
|
DEBUG_STAT_EN(double en);
|
||||||
DEBUG_STAT_EN(double strt, en_1, en);
|
double strt, en_1;
|
||||||
|
|
||||||
length = offset;
|
length = offset;
|
||||||
last_offset = 0;
|
last_offset = 0;
|
||||||
|
@ -489,7 +523,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
||||||
ctx->valid = 0;
|
ctx->valid = 0;
|
||||||
cur_roll_checksum = 0;
|
cur_roll_checksum = 0;
|
||||||
if (*size < ctx->rabin_poly_avg_block_size) return (0);
|
if (*size < ctx->rabin_poly_avg_block_size) return (0);
|
||||||
DEBUG_STAT_EN(strt = get_wtime_millis());
|
strt = get_wtime_millis();
|
||||||
|
|
||||||
if (ctx->dedupe_flag == RABIN_DEDUPE_FIXED) {
|
if (ctx->dedupe_flag == RABIN_DEDUPE_FIXED) {
|
||||||
blknum = *size / ctx->rabin_poly_avg_block_size;
|
blknum = *size / ctx->rabin_poly_avg_block_size;
|
||||||
|
@ -516,7 +550,12 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
||||||
ctx->blocks[i]->hash = XXH32(buf1+last_offset, length, 0);
|
ctx->blocks[i]->hash = XXH32(buf1+last_offset, length, 0);
|
||||||
ctx->blocks[i]->similarity_hash = ctx->blocks[i]->hash;
|
ctx->blocks[i]->similarity_hash = ctx->blocks[i]->hash;
|
||||||
last_offset += length;
|
last_offset += length;
|
||||||
|
tot_chunks++;
|
||||||
|
tot_size += length;
|
||||||
}
|
}
|
||||||
|
en_1 = get_wtime_millis();
|
||||||
|
tot_time += en_1 - strt;
|
||||||
|
for (i=0; i<blknum; i++) freqs[ctx->blocks[i]->length]++;
|
||||||
goto process_blocks;
|
goto process_blocks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -659,6 +698,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
||||||
ctx->g_blocks[blknum].length = length;
|
ctx->g_blocks[blknum].length = length;
|
||||||
ctx->g_blocks[blknum].offset = last_offset;
|
ctx->g_blocks[blknum].offset = last_offset;
|
||||||
}
|
}
|
||||||
|
tot_chunks++;
|
||||||
|
tot_size += length;
|
||||||
DEBUG_STAT_EN(if (length >= ctx->rabin_poly_max_block_size) ++max_count);
|
DEBUG_STAT_EN(if (length >= ctx->rabin_poly_max_block_size) ++max_count);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -708,6 +749,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
||||||
ctx->g_blocks[blknum].offset = last_offset;
|
ctx->g_blocks[blknum].offset = last_offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tot_chunks++;
|
||||||
|
tot_size += length;
|
||||||
if (ctx->delta_flag) {
|
if (ctx->delta_flag) {
|
||||||
uint64_t cur_sketch;
|
uint64_t cur_sketch;
|
||||||
uint64_t pc[4];
|
uint64_t pc[4];
|
||||||
|
@ -735,7 +778,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
||||||
|
|
||||||
process_blocks:
|
process_blocks:
|
||||||
// If we found at least a few chunks, perform dedup.
|
// If we found at least a few chunks, perform dedup.
|
||||||
DEBUG_STAT_EN(en_1 = get_wtime_millis());
|
en_1 = get_wtime_millis();
|
||||||
|
tot_time += en_1 - strt;
|
||||||
DEBUG_STAT_EN(fprintf(stderr, "Original size: %" PRId64 ", blknum: %u\n", *size, blknum));
|
DEBUG_STAT_EN(fprintf(stderr, "Original size: %" PRId64 ", blknum: %u\n", *size, blknum));
|
||||||
DEBUG_STAT_EN(fprintf(stderr, "Number of maxlen blocks: %u\n", max_count));
|
DEBUG_STAT_EN(fprintf(stderr, "Number of maxlen blocks: %u\n", max_count));
|
||||||
if (blknum <=2 && ctx->arc) {
|
if (blknum <=2 && ctx->arc) {
|
||||||
|
@ -773,6 +817,7 @@ process_blocks:
|
||||||
ctx->g_blocks[i].length, 0, 0);
|
ctx->g_blocks[i].length, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (i=0; i<blknum; i++) freqs[ctx->g_blocks[i].length]++;
|
||||||
/*
|
/*
|
||||||
* Index table within this segment.
|
* Index table within this segment.
|
||||||
*/
|
*/
|
||||||
|
@ -1288,6 +1333,7 @@ next_ent:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (i=0; i<blknum; i++) freqs[ctx->blocks[i]->length]++;
|
||||||
ary_sz = (blknum << 1) * sizeof (rabin_blockentry_t *);
|
ary_sz = (blknum << 1) * sizeof (rabin_blockentry_t *);
|
||||||
htab = (rabin_blockentry_t **)(ctx->cbuf + ctx->real_chunksize - ary_sz);
|
htab = (rabin_blockentry_t **)(ctx->cbuf + ctx->real_chunksize - ary_sz);
|
||||||
memset(htab, 0, ary_sz);
|
memset(htab, 0, ary_sz);
|
||||||
|
|
|
@ -80,7 +80,7 @@
|
||||||
#define RAB_POLYNOMIAL_WIN_SIZE 16
|
#define RAB_POLYNOMIAL_WIN_SIZE 16
|
||||||
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 8
|
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 8
|
||||||
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 64
|
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 64
|
||||||
#define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (128 * 1024)
|
#define RAB_POLYNOMIAL_MAX_BLOCK_SIZE (64 * 1024)
|
||||||
#define RAB_BLK_MASK (((1 << RAB_BLK_MIN_BITS) - 1) >> 1)
|
#define RAB_BLK_MASK (((1 << RAB_BLK_MIN_BITS) - 1) >> 1)
|
||||||
#define RAB_BLK_AVG_SZ(x) (1 << ((x) + RAB_BLK_MIN_BITS))
|
#define RAB_BLK_AVG_SZ(x) (1 << ((x) + RAB_BLK_MIN_BITS))
|
||||||
|
|
||||||
|
@ -206,5 +206,6 @@ extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char
|
||||||
extern int global_dedupe_bufadjust(uint32_t rab_blk_sz, uint64_t *user_chunk_sz, int pct_interval,
|
extern int global_dedupe_bufadjust(uint32_t rab_blk_sz, uint64_t *user_chunk_sz, int pct_interval,
|
||||||
const char *algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
|
const char *algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
|
||||||
size_t memlimit, int nthreads, int pipe_mode);
|
size_t memlimit, int nthreads, int pipe_mode);
|
||||||
|
extern void dump_frequencies();
|
||||||
|
|
||||||
#endif /* _RABIN_POLY_H_ */
|
#endif /* _RABIN_POLY_H_ */
|
||||||
|
|
Loading…
Reference in a new issue