From e732e86b918299b2215a772ef09dc486f0397792 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Sun, 23 Jun 2013 21:30:32 +0530 Subject: [PATCH] New option to capture statistics for all rolling hash breakpoints. --- pcompress.c | 8 ++++- pcompress.h | 1 + rabin/rabin_dedup.c | 83 +++++++++++++++++++++++++++++---------------- rabin/rabin_dedup.h | 1 + 4 files changed, 63 insertions(+), 30 deletions(-) diff --git a/pcompress.c b/pcompress.c index 46536c1..d3aee22 100644 --- a/pcompress.c +++ b/pcompress.c @@ -2048,6 +2048,7 @@ start_compress(pc_ctx_t *pctx, const char *filename, uint64_t chunksize, int lev COMP_BAIL; } + tdat->rctx->full_chunking = pctx->full_chunking; tdat->rctx->index_sem = &(tdat->index_sem); tdat->rctx->id = i; } @@ -2173,6 +2174,7 @@ start_compress(pc_ctx_t *pctx, const char *filename, uint64_t chunksize, int lev if (pctx->enable_rabin_split) { rctx = create_dedupe_context(chunksize, 0, 0, pctx->algo, &props, pctx->enable_delta_encode, pctx->enable_fixed_scan, VERSION, COMPRESS, 0, NULL, pctx->pipe_mode, nprocs); + rctx->full_chunking = 0; rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx); } else { rbytes = Read(uncompfd, cread_buf, chunksize); @@ -2571,7 +2573,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) pctx->chunksize = DEFAULT_CHUNKSIZE; pthread_mutex_lock(&opt_parse); - while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDGEe:w:rLPS:B:Fk:")) != -1) { + while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDGEe:w:rLPS:B:Fk:f")) != -1) { int ovr; int64_t chunksize; @@ -2708,6 +2710,10 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) } break; + case 'f': + pctx->full_chunking = 1; + break; + case '?': default: return (2); diff --git a/pcompress.h b/pcompress.h index 8d70101..18b90a9 100644 --- a/pcompress.h +++ b/pcompress.h @@ -197,6 +197,7 @@ typedef struct pc_ctx { int enable_fixed_scan; int lzp_preprocess; int encrypt_type; + int full_chunking; unsigned int chunk_num; uint64_t largest_chunk, smallest_chunk, avg_chunk; uint64_t chunksize; diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index d76c845..737bbd8 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -114,10 +114,11 @@ uint64_t ir[256], out[256]; static int inited = 0; archive_config_t *arc = NULL; -uint64_t freqs[RAB_POLYNOMIAL_MAX_BLOCK_SIZE+1]; -uint64_t tot_chunks = 0, min_chunk; -uint64_t tot_size = 0, non_hashed_size = 0; -double tot_time = 0; +static uint64_t freqs[RAB_POLYNOMIAL_MAX_BLOCK_SIZE+1]; +static uint64_t tot_chunks = 0, min_chunk; +static uint64_t tot_size = 0, non_hashed_size = 0; +static double tot_time = 0; +static int full_chunking = 0; static uint32_t dedupe_min_blksz(int rab_blk_sz) @@ -140,7 +141,7 @@ dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta void dump_frequencies() { - int i, j; + int i, j, limit; uint64_t tot; double tot_c, tot_s, bytes_sec; @@ -149,11 +150,16 @@ dump_frequencies() printf("Min chunk size: %" PRIu64 "\n", min_chunk); + if (full_chunking) + limit = 1024; + else + limit = 4096; for (i = 1; i <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE;) { tot = 0; - for (j = 0; j < 4096; j++) tot += freqs[i++]; - if (tot > 0) + for (j = 0; j < limit; j++) tot += freqs[i++]; + if (tot > 0) { printf("%3d KB: %" PRIu64 "\n", i/1024, tot); + } } printf("====================================\n"); printf("Number of chunks : %" PRIu64 "\n", tot_chunks); @@ -530,6 +536,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of window_pos = 0; ctx->valid = 0; cur_roll_checksum = 0; + full_chunking = ctx->full_chunking; if (*size < ctx->rabin_poly_avg_block_size) return (0); strt = get_wtime_millis(); @@ -586,6 +593,9 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of ary_sz += ctx->rabin_poly_max_block_size; ctx_heap = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz); } + if (ctx->full_chunking) { + ctx->rabin_poly_min_block_size = 1; + } #ifndef SSE_MODE memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE); #else @@ -648,7 +658,11 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of * Start our sliding window at a fixed number of bytes before the min window size. * It is pointless to slide the window over the whole length of the chunk. */ - offset = ctx->rabin_poly_min_block_size - RAB_WINDOW_SLIDE_OFFSET; + if (ctx->full_chunking) { + offset = 0; + } else { + offset = ctx->rabin_poly_min_block_size - RAB_WINDOW_SLIDE_OFFSET; + } length = offset; non_hashed_size += offset; for (i=offset; irabin_avg_block_mask) == ctx->rabin_break_patt || length >= ctx->rabin_poly_max_block_size) { - if (!(ctx->arc)) { - if (ctx->blocks[blknum] == 0) - ctx->blocks[blknum] = (rabin_blockentry_t *)slab_alloc(NULL, - sizeof (rabin_blockentry_t)); - ctx->blocks[blknum]->offset = last_offset; - ctx->blocks[blknum]->index = blknum; // Need to store for sorting - ctx->blocks[blknum]->length = length; + if (!(ctx->full_chunking)) { + if (!(ctx->arc)) { + if (ctx->blocks[blknum] == 0) + ctx->blocks[blknum] = (rabin_blockentry_t *)slab_alloc(NULL, + sizeof (rabin_blockentry_t)); + ctx->blocks[blknum]->offset = last_offset; + ctx->blocks[blknum]->index = blknum; // Need to store for sorting + ctx->blocks[blknum]->length = length; + } else { + ctx->g_blocks[blknum].length = length; + ctx->g_blocks[blknum].offset = last_offset; + } } else { - ctx->g_blocks[blknum].length = length; - ctx->g_blocks[blknum].offset = last_offset; + freqs[length]++; } tot_chunks++; tot_size += length; @@ -739,8 +757,10 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of last_offset = i+1; length = 0; if (*size - last_offset <= ctx->rabin_poly_min_block_size) break; - length = ctx->rabin_poly_min_block_size - RAB_WINDOW_SLIDE_OFFSET; - i = i + length; + if (ctx->full_chunking == 0) { + length = ctx->rabin_poly_min_block_size - RAB_WINDOW_SLIDE_OFFSET; + i = i + length; + } non_hashed_size += length; } } @@ -749,16 +769,20 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of if (last_offset < *size) { length = *size - last_offset; non_hashed_size += length; - if (!(ctx->arc)) { - if (ctx->blocks[blknum] == 0) - ctx->blocks[blknum] = (rabin_blockentry_t *)slab_alloc(NULL, - sizeof (rabin_blockentry_t)); - ctx->blocks[blknum]->offset = last_offset; - ctx->blocks[blknum]->index = blknum; - ctx->blocks[blknum]->length = length; + if (!(ctx->full_chunking)) { + if (!(ctx->arc)) { + if (ctx->blocks[blknum] == 0) + ctx->blocks[blknum] = (rabin_blockentry_t *)slab_alloc(NULL, + sizeof (rabin_blockentry_t)); + ctx->blocks[blknum]->offset = last_offset; + ctx->blocks[blknum]->index = blknum; + ctx->blocks[blknum]->length = length; + } else { + ctx->g_blocks[blknum].length = length; + ctx->g_blocks[blknum].offset = last_offset; + } } else { - ctx->g_blocks[blknum].length = length; - ctx->g_blocks[blknum].offset = last_offset; + freqs[length]++; } tot_chunks++; @@ -794,6 +818,7 @@ process_blocks: tot_time += en_1 - strt; DEBUG_STAT_EN(fprintf(stderr, "Original size: %" PRId64 ", blknum: %u\n", *size, blknum)); DEBUG_STAT_EN(fprintf(stderr, "Number of maxlen blocks: %u\n", max_count)); + if (ctx->full_chunking) blknum = 0; if (blknum <=2 && ctx->arc) { sem_wait(ctx->index_sem); sem_post(ctx->index_sem_next); @@ -829,12 +854,12 @@ process_blocks: ctx->g_blocks[i].length, 0, 0); } - for (i=0; ig_blocks[i].length]++; /* * Index table within this segment. */ g_dedupe_idx = ctx->cbuf + RABIN_HDR_SIZE; dedupe_index_sz = 0; + for (i=0; ig_blocks[i].length]++; /* * First entry in table is the original file offset where this diff --git a/rabin/rabin_dedup.h b/rabin/rabin_dedup.h index e24d75e..40ef528 100644 --- a/rabin/rabin_dedup.h +++ b/rabin/rabin_dedup.h @@ -178,6 +178,7 @@ typedef struct { short valid; void *lzma_data; int level, delta_flag, dedupe_flag, deltac_min_distance; + int full_chunking; uint64_t file_offset; // For global dedupe archive_config_t *arc; sem_t *index_sem;